atextcrawler/src/atextcrawler/utils/lang.py

45 lines
949 B
Python

"""
Utility functions related to languages.
"""
from pathlib import Path
from typing import Optional
import gcld3
# Path of the 'assets' directory, a sibling of the directory that
# contains this file.
asset_path = Path(__file__).parent.parent / 'assets'

# Lines of the 'iso_639-1' asset file (surrounding whitespace of the whole
# file removed first); expected to hold one ISO 639-1 code per line.
iso_639_1_codes = (asset_path / 'iso_639-1').read_text().strip().split('\n')

# Module-wide gcld3 language identifier, created once at import time and
# used by extract_content_language().
lang_detector = gcld3.NNetLanguageIdentifier(
    min_num_bytes=0,
    max_num_bytes=1000,
)
def clean_lang(lang: Optional[str]) -> Optional[str]:
    """
    Normalize a language code string to an ISO 639-1 code or None.

    Only the first two characters of `lang` are considered, lowercased.
    They are returned if listed in `iso_639_1_codes`; otherwise, and
    also when `lang` is None, the result is None.
    """
    if lang is None:
        return None
    candidate = lang[:2].lower()
    return candidate if candidate in iso_639_1_codes else None
def extract_content_language(text: str) -> Optional[str]:
    """
    Extract the language from a text.

    Return the two-letter primary language subtag reported by the
    module-level `lang_detector`, or None if

    * `text` has fewer than 10 characters,
    * the detector does not flag its result as reliable, or
    * the detected language has no two-letter primary subtag.
    """
    if len(text) < 10:
        return None
    lang_det = lang_detector.FindLanguage(text=text)
    if not lang_det.is_reliable:
        return None
    # The detector reports BCP-47 style tags: a script suffix may follow
    # the language (e.g. 'zh-Latn') and some languages only have a
    # three-letter code (e.g. 'fil'). Blindly cutting the tag to two
    # characters would turn 'fil' into 'fi', i.e. a different language,
    # so only a genuine two-letter primary subtag is accepted.
    primary = lang_det.language.split('-', 1)[0]
    if len(primary) != 2:
        return None
    return primary