45 lines
949 B
Python
45 lines
949 B
Python
"""
|
|
Utility functions related to languages.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import gcld3
|
|
|
|
# 'assets' directory located next to the directory that contains this module.
asset_path = Path(__file__).parent.parent / 'assets'

# Known ISO 639-1 codes, read as newline-separated entries from the asset
# file. The encoding is given explicitly so that reading the file does not
# depend on the platform's locale default (the codes are presumably plain
# ASCII, for which UTF-8 decoding is equivalent).
with open(asset_path / 'iso_639-1', 'r', encoding='utf-8') as f:
    iso_639_1_codes = f.read().strip().split('\n')

# Module-wide gcld3 language identifier, created once at import time and
# reused by the functions below.
lang_detector = gcld3.NNetLanguageIdentifier(
    min_num_bytes=0, max_num_bytes=1000
)
|
|
|
|
|
|
def clean_lang(lang: Optional[str]) -> Optional[str]:
    """
    Normalize a language code string to a known ISO 639-1 code.

    The first two characters of `lang` are lowercased and looked up in
    `iso_639_1_codes`. The lowercased two-letter code is returned on a
    match; None is returned when there is no match or when `lang` is None.
    """
    if lang is not None:
        # Only the leading two characters are considered, so a longer tag
        # such as 'en-US' is reduced to 'en' before the lookup.
        candidate = lang[:2].lower()
        if candidate in iso_639_1_codes:
            return candidate
    return None
|
|
|
|
|
|
def extract_content_language(text: str) -> Optional[str]:
    """
    Extract the language from a text.

    Runs the module-level gcld3 detector on `text` and returns the
    two-letter primary language subtag of the detected language.

    Returns None when:
    - the text is shorter than 10 characters,
    - the detector does not flag its result as reliable, or
    - the detected language has no two-letter primary subtag.
    """
    # Very short texts are not submitted for detection at all.
    if len(text) < 10:
        return None

    lang_det = lang_detector.FindLanguage(text=text)
    if not lang_det.is_reliable:
        return None

    # gcld3 reports tags such as 'en', 'zh-Latn' and also three-letter codes
    # like 'fil', 'haw', 'ceb' or 'und'. Blindly slicing the first two
    # characters would turn those into unrelated languages ('fil' -> 'fi',
    # i.e. Filipino reported as Finnish), so take the primary subtag and only
    # accept it when it really is a two-letter code.
    primary = lang_det.language.partition('-')[0]
    return primary if len(primary) == 2 else None
|