atextcrawler/src/atextcrawler/utils/date_finder.py

91 lines
2.5 KiB
Python

"""
Find date expressions in a string.
"""
import re
from datetime import datetime, timezone
from typing import Optional
# Building blocks of the date patterns. p_day, p_month and p_year each
# contribute exactly one capture group; p_t contributes two (hour, minute).
p_day = r'(0?[1-9]|[12][0-9]|3[01])'
p_month = r'(0?[1-9]|1[0-2])'
p_year = r'(20\d\d|19\d\d)'  # only years 1900-2099 are recognized
sep = r'\D{1,2}'  # one or two non-digit characters between date components

# Zero-width guards: a date must not be glued to other digits. Lookarounds
# are used instead of consuming the neighbouring characters, because
# findall() returns non-overlapping matches: a consumed delimiter would hide
# a second date that follows after a single character (e.g. in
# '2020-01-01 2020-02-02' or in newline-separated lists).
p_before = r'(?<!\d)'
p_after = r'(?!\d)'

# Optional time of day HH:MM, separated from the date by 1-5 non-digits.
p_t = r'(?:\D{1,5}([01][0-9]|2[0-3]):([0-5][0-9]))?'

# Maps a format name to a pair (compiled pattern, slots). *slots* lists the
# positions of year, month, day, hour and minute (in this order) within the
# tuples returned by ``pattern.findall()``.
format_re = {
    'iso': (
        re.compile(
            f'{p_before}{p_year}{sep}{p_month}{sep}{p_day}{p_after}{p_t}'
        ),
        (0, 1, 2, 3, 4),
    ),
    'dmy': (
        re.compile(
            f'{p_before}{p_day}{sep}{p_month}{sep}{p_year}{p_after}{p_t}'
        ),
        (2, 1, 0, 3, 4),
    ),
    'mdy': (
        re.compile(
            f'{p_before}{p_month}{sep}{p_day}{sep}{p_year}{p_after}{p_t}'
        ),
        (2, 0, 1, 3, 4),
    ),
}

# Date formats to try per language code; the None entry is the fallback
# used for unknown or unspecified languages.
lang_format = {
    'de': ('iso', 'dmy'),
    'en': ('iso', 'mdy'),
    None: ('iso', 'dmy', 'mdy'),
}
def extract_latest_date(text: str, lang: str = None) -> Optional[datetime]:
    """
    Return the most recent date found in *text*, or None if there is none.

    Candidates are collected by `extract_dates` using the date formats
    associated with *lang*; that function discards dates in the future.
    """
    return max(extract_dates(text, lang=lang), default=None)
def extract_dates(text: str, lang: Optional[str] = None) -> list[datetime]:
    """
    Extract dates from a string, optionally limiting formats to a language.

    Every format listed for *lang* in `lang_format` is tried; if *lang* has
    no entry there, all known formats are tried. Matches that do not form
    a valid calendar date, and dates after the current UTC time, are
    skipped.

    The result is a list of naive datetimes, ordered by format and then by
    position in *text*. A date matching several formats appears once per
    format.
    """
    # Naive UTC timestamp, taken once so that all candidates are compared
    # against the same instant.
    now = datetime.now(timezone.utc).replace(tzinfo=None)
    dates = []
    for fmt in lang_format.get(lang, lang_format[None]):
        re_, slots = format_re[fmt]
        for match in re_.findall(text):
            # findall() yields '' for the optional time groups when no
            # time was present; treat that as 0.
            parts = [int(match[slot] or 0) for slot in slots]
            try:
                date = datetime(*parts)
            except ValueError:
                # Matched the pattern but is not a real date
                # (e.g. 31.02.2020).
                continue
            if date <= now:
                dates.append(date)
    return dates
## from htmldate import find_date
# def extract_last_pub(html):
# """
# Return an estimate for the time of last content publication from html.
# """
# # https://stackoverflow.com/questions/57833080/how-to-fix-unicode-strings-with-encoding-declaration-are-not-supported
# lxml_tree = lxml_html.fromstring(bytes(html, encoding='utf8'))
# # publication date (from startpage)
# try:
# date_string = find_date(lxml_tree)
# pd = date.fromisoformat(date_string)
# last_pub = datetime(pd.year, pd.month, pd.day, 12, 0, 0)
# except:
# last_pub = None
# return last_pub