91 lines
2.5 KiB
Python
91 lines
2.5 KiB
Python
"""
Find date expressions in a string.
"""
|
|
|
|
import re
from datetime import datetime, timezone
from typing import Optional
|
|
|
|
# Regex building blocks. Each date-component pattern contributes exactly
# one capturing group.
p_day = r'(0?[1-9]|[12][0-9]|3[01])'  # 1-31, optional leading zero
p_month = r'(0?[1-9]|1[0-2])'  # 1-12, optional leading zero
p_year = r'(20\d\d|19\d\d)'  # 1900-2099
sep = r'\D{1,2}'  # one or two non-digit separator characters
# Optional time of day: up to four non-digits followed by HH:MM.
# The patterns in ``format_re`` now use ``p_end`` instead; this name is
# kept unchanged in case other code refers to it.
p_t = r'(\D{0,4}([01][0-9]|2[0-3]):([0-5][0-9]))?'

# Leading boundary: the date must not directly follow a digit. This is a
# zero-width lookbehind, so the delimiter in front of a date is not consumed
# and can also serve as the delimiter behind the previous date. Without
# this, the second of two dates separated by a single character (e.g.
# "2020-01-01 2021-02-03") was never found. The (always empty) capturing
# group only keeps the group numbering stable.
p_start = r'((?<!\d))'
# Trailing boundary: the date must not be directly followed by a digit
# (zero-width). An optional time "HH:MM", separated from the date by one
# to five non-digits, is consumed only when it is actually present.
# Contributes four capturing groups; the last two are hour and minute.
p_end = r'(?!\d)(\D(\D{0,4}([01][0-9]|2[0-3]):([0-5][0-9])))?'

# Maps a format name to a pair ``(compiled regex, slots)``.
# ``slots`` are indices into the tuples returned by ``regex.findall()``,
# given in the order (year, month, day, hour, minute). Hour and minute are
# empty strings when the text contains no time.
format_re = {
    'iso': (
        re.compile(f'{p_start}{p_year}{sep}{p_month}{sep}{p_day}{p_end}'),
        (1, 2, 3, 6, 7),
    ),
    'dmy': (
        re.compile(f'{p_start}{p_day}{sep}{p_month}{sep}{p_year}{p_end}'),
        (3, 2, 1, 6, 7),
    ),
    'mdy': (
        re.compile(f'{p_start}{p_month}{sep}{p_day}{sep}{p_year}{p_end}'),
        (3, 1, 2, 6, 7),
    ),
}
|
|
|
|
|
|
# Date formats (keys of ``format_re``) to try for a given language code.
# The ``None`` entry lists every format and is what ``extract_dates`` falls
# back to when the language is missing or unknown.
lang_format = {
    'de': ('iso', 'dmy'),
    'en': ('iso', 'mdy'),
    None: ('iso', 'dmy', 'mdy'),
}
|
|
|
|
|
|
def extract_latest_date(text: str, lang: str = None) -> Optional[datetime]:
    """
    Return the most recent date found in *text*, or None if there is none.

    Candidate dates are collected by extract_dates() using the formats
    associated with *lang*; that function keeps only dates which are not
    in the future.
    """
    found = extract_dates(text, lang=lang)
    if not found:
        return None
    return max(found)
|
|
|
|
|
|
def extract_dates(text: str, lang: Optional[str] = None) -> list[datetime]:
    """
    Extract dates from a string, optionally limiting formats to a language.

    The formats tried are ``lang_format[lang]``; if *lang* is not a key of
    ``lang_format``, the formats listed under ``None`` are used. Each format
    is matched independently, so the same piece of text may yield more
    than one date (e.g. once read as day-month-year and once as
    month-day-year).

    Matches that do not form a valid calendar date (such as 31.02.2020)
    are skipped, as are dates later than the current UTC time.

    Returns a list of naive datetime objects, grouped by format in the
    order the formats were tried (not sorted). Hour and minute are 0
    when the text contains no time.
    """
    # Naive UTC "now", evaluated once so that all candidates are compared
    # against the same instant.
    now = datetime.now(timezone.utc).replace(tzinfo=None)
    dates = []
    for fmt in lang_format.get(lang, lang_format[None]):
        pattern, slots = format_re[fmt]
        for match in pattern.findall(text):
            try:
                date = datetime(
                    int(match[slots[0]]),
                    int(match[slots[1]]),
                    int(match[slots[2]]),
                    # the time groups are empty strings if no time matched
                    int(match[slots[3]] or 0),
                    int(match[slots[4]] or 0),
                )
            except ValueError:
                # The regex matched, but the values are not a real date.
                continue
            if date <= now:
                dates.append(date)
    return dates
|
|
|
|
|
|
## from htmldate import find_date
|
|
|
|
# def extract_last_pub(html):
|
|
# """
|
|
# Return an estimate for the time of last content publication from html.
|
|
# """
|
|
# # https://stackoverflow.com/questions/57833080/how-to-fix-unicode-strings-with-encoding-declaration-are-not-supported
|
|
# lxml_tree = lxml_html.fromstring(bytes(html, encoding='utf8'))
|
|
# # publication date (from startpage)
|
|
# try:
|
|
# date_string = find_date(lxml_tree)
|
|
# pd = date.fromisoformat(date_string)
|
|
# last_pub = datetime(pd.year, pd.month, pd.day, 12, 0, 0)
|
|
# except:
|
|
# last_pub = None
|
|
# return last_pub
|