240 lines
7.2 KiB
Python
240 lines
7.2 KiB
Python
import re
|
||
from datetime import date, datetime
|
||
from typing import Any
|
||
|
||
|
||
# Norwegian month names and their common abbreviations, keyed in lower case
# and mapped to the calendar month number (1-12). Lookups are expected to
# lower-case the matched text first.
MONTH_MAP: dict[str, int] = {
    "januar": 1, "jan": 1,
    "februar": 2, "feb": 2,
    "mars": 3, "mar": 3,
    "april": 4, "apr": 4,
    "mai": 5,  # already three letters, so there is no separate abbreviation
    "juni": 6, "jun": 6,
    "juli": 7, "jul": 7,
    "august": 8, "aug": 8,
    "september": 9, "sep": 9, "sept": 9,
    "oktober": 10, "okt": 10,
    "november": 11, "nov": 11,
    "desember": 12, "des": 12,
}
|
||
|
||
|
||
def normalize_whitespace(value: str) -> str:
    """Collapse every run of whitespace to a single space and trim the ends.

    Falsy input (``None``, ``""``, ``0``) yields an empty string; any other
    value is converted with ``str()`` before being cleaned.
    """
    text = str(value or "")
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
|
||
|
||
|
||
def _to_date(year: int, month: int, day: int) -> date | None:
|
||
try:
|
||
return date(year, month, day)
|
||
except ValueError:
|
||
return None
|
||
|
||
|
||
def _infer_year(month: int, day: int, explicit_year: int | None, today: date) -> int:
    """Choose the year for a day/month pair that may have been written without one.

    Args:
        month: Calendar month number.
        day: Day of the month.
        explicit_year: Year taken from the text, if any; a truthy value wins outright.
        today: Reference date used for the inference.

    Returns:
        ``explicit_year`` when given. Otherwise the current year, unless the
        date in the current year lies more than 7 days before ``today``, in
        which case the following year is returned. A day/month that does not
        exist in the current year also yields the current year.
    """
    if explicit_year:
        return explicit_year

    this_year = today.year
    occurrence = _to_date(this_year, month, day)
    if occurrence is None:
        return this_year

    # A date up to a week old is still treated as belonging to this year;
    # anything older is assumed to refer to next year's occurrence.
    already_passed = (today - occurrence).days > 7
    return this_year + 1 if already_passed else this_year
|
||
|
||
|
||
def _parse_numeric_date(raw: str) -> date | None:
    """Parse the first ``D.M.Y`` or ``D/M/Y`` date found in ``raw``.

    Day and month may have one or two digits, the year two to four. Years
    below 100 are shifted into the 2000s. Returns ``None`` when nothing
    matches or the matched numbers are not a valid calendar date.
    """
    found = re.search(r"\b(\d{1,2})[./](\d{1,2})[./](\d{2,4})\b", raw)
    if found is None:
        return None

    day, month, year = (int(part) for part in found.groups())
    full_year = year + 2000 if year < 100 else year
    return _to_date(full_year, month, day)
|
||
|
||
|
||
def _parse_textual_dates(raw: str, today: date) -> list[date]:
    """Collect every ``<day> <month name> [20xx]`` date in ``raw``, in order of appearance.

    Month names come from ``MONTH_MAP``. When no year follows the month, the
    year is chosen by ``_infer_year`` relative to ``today``. Matches that do
    not form a valid calendar date are skipped.
    """
    # Longest names first so "september" is preferred over "sep"/"sept".
    month_names = sorted(MONTH_MAP.keys(), key=len, reverse=True)
    finder = re.compile(
        r"\b(\d{1,2})\.?\s*(" + "|".join(month_names) + r")\b(?:\s+(20\d{2}))?",
        re.IGNORECASE,
    )

    found: list[date] = []
    for hit in finder.finditer(raw):
        day_text, month_text, year_text = hit.groups()
        month_number = MONTH_MAP.get(month_text.lower())
        if not month_number:
            continue

        day_number = int(day_text)
        stated_year = int(year_text) if year_text else None
        resolved_year = _infer_year(month_number, day_number, stated_year, today)
        parsed = _to_date(resolved_year, month_number, day_number)
        if parsed:
            found.append(parsed)
    return found
|
||
|
||
|
||
def _parse_cross_month_range(normalized: str, today: date) -> tuple[date, date] | None:
    """Parse a range whose two endpoints each name a month, e.g. ``28. februar - 2. mars``.

    The year is decided once and shared by both endpoints: an explicit year
    on either side is honoured, otherwise the year is inferred from the END
    of the range. The start falls in the previous year only when its month
    number is greater than the end's (a december -> januar range).

    Returns ``None`` when the text has no such range, when another textual
    date precedes it, or when either endpoint is not a valid calendar date,
    so the caller can fall back to its generic handling.
    """
    months = "|".join(sorted(MONTH_MAP.keys(), key=len, reverse=True))
    endpoint = r"(\d{1,2})\.?\s*(" + months + r")\b(?:\s+(20\d{2}))?"
    span = re.search(
        r"\b" + endpoint + r"\s*(?:-|–|—|til)\s*" + endpoint,
        normalized,
        re.IGNORECASE,
    )
    if span is None:
        return None
    # An earlier stand-alone date would be the range start in the generic
    # fallback; leave such labels to that code path.
    if _parse_textual_dates(normalized[: span.start()], today):
        return None

    start_day, start_name, start_year_text, end_day, end_name, end_year_text = span.groups()
    start_month = MONTH_MAP.get(start_name.lower())
    end_month = MONTH_MAP.get(end_name.lower())
    if not start_month or not end_month:
        return None

    wraps_year = start_month > end_month
    start_explicit = int(start_year_text) if start_year_text else None
    end_explicit = int(end_year_text) if end_year_text else None

    if end_explicit is not None:
        end_year = end_explicit
    elif start_explicit is not None:
        end_year = start_explicit + 1 if wraps_year else start_explicit
    else:
        end_year = _infer_year(end_month, int(end_day), None, today)

    if start_explicit is not None:
        start_year = start_explicit
    else:
        start_year = end_year - 1 if wraps_year else end_year

    start = _to_date(start_year, start_month, int(start_day))
    end = _to_date(end_year, end_month, int(end_day))
    if start is None or end is None:
        return None
    return start, end


def parse_course_date_range(raw: str, today: date | None = None) -> tuple[date | None, date | None]:
    """Extract a ``(start, end)`` date pair from a free-text course date label.

    Formats are tried in this order; the first that applies wins:

    1. The whole label is an ISO-8601 date/datetime -> that single day.
    2. Numeric dates (``D.M.Y`` / ``D/M/Y``) -> the first two found, or one
       date used for both ends.
    3. A day range inside one month (``10-12 mars``, ``10. til 12. mars 2026``).
    4. A range with a month name on both ends (``28. februar - 2. mars``),
       with the year resolved once for the whole range.
    5. Any other textual dates -> the first two found, or one date used for
       both ends.

    Args:
        raw: The label text; whitespace is normalised before parsing.
        today: Reference date for inferring a missing year. Defaults to the
            current date.

    Returns:
        ``(start, end)``; ``(None, None)`` when nothing could be parsed.
    """
    reference_today = today or date.today()
    collapsed = normalize_whitespace(raw)
    if not collapsed:
        return None, None
    normalized = collapsed.lower()

    # Try ISO on the original-case text: lowercasing could turn an upper-case
    # designator such as a trailing "Z" into a form fromisoformat may reject.
    try:
        iso_day = datetime.fromisoformat(collapsed).date()
    except ValueError:
        iso_day = None
    if iso_day:
        return iso_day, iso_day

    numeric_dates = re.findall(r"\b\d{1,2}[./]\d{1,2}[./]\d{2,4}\b", normalized)
    if len(numeric_dates) >= 2:
        start = _parse_numeric_date(numeric_dates[0])
        end = _parse_numeric_date(numeric_dates[1])
        return start, end or start
    if len(numeric_dates) == 1:
        single = _parse_numeric_date(numeric_dates[0])
        return single, single

    range_match = re.search(
        r"\b(\d{1,2})\s*\.?\s*(?:-|–|—|til)\s*(\d{1,2})\.?\s*(" + "|".join(sorted(MONTH_MAP.keys(), key=len, reverse=True)) + r")\b(?:\s+(20\d{2}))?",
        normalized,
        re.IGNORECASE,
    )
    if range_match:
        start_day = int(range_match.group(1))
        end_day = int(range_match.group(2))
        month = MONTH_MAP.get(range_match.group(3).lower())
        explicit_year = int(range_match.group(4)) if range_match.group(4) else None
        if month:
            # The end day drives the year so a range that is still running
            # is not pushed into next year.
            year = _infer_year(month, end_day, explicit_year, reference_today)
            start = _to_date(year, month, start_day)
            end = _to_date(year, month, end_day)
            return start, end or start

    # Inferring each endpoint's year separately can put the start a year
    # after the end, so cross-month ranges are resolved as a unit first.
    cross_month = _parse_cross_month_range(normalized, reference_today)
    if cross_month is not None:
        return cross_month

    textual_dates = _parse_textual_dates(normalized, reference_today)
    if len(textual_dates) >= 2:
        return textual_dates[0], textual_dates[1]
    if len(textual_dates) == 1:
        return textual_dates[0], textual_dates[0]

    return None, None
|
||
|
||
|
||
def _coerce_iso_date(value: Any) -> date | None:
    """Return the calendar date of an ISO-8601 value, or ``None`` if it is empty or unparsable."""
    if not value:
        return None
    try:
        return datetime.fromisoformat(str(value)).date()
    except ValueError:
        return None


def normalize_vtg_course_rows(rows: Any, today: date | None = None) -> list[dict[str, Any]]:
    """Clean raw course rows and return them sorted by date.

    Non-dict entries and rows without a label (``dato`` or ``display_label``)
    are dropped. Explicit ``start_date`` / ``end_date`` values are parsed as
    ISO dates; only when neither yields a date is the label itself parsed
    with ``parse_course_date_range``.

    Args:
        rows: Expected to be a list of dicts; anything else yields ``[]``.
        today: Reference date forwarded to ``parse_course_date_range`` for
            labels written without a year. Defaults to the current date.

    Returns:
        A new list of dicts with keys ``dato``, ``status``, ``start_date``,
        ``end_date`` (ISO strings or ``None``) and ``sort_order`` (the row's
        index in the input). Rows are ordered by start date, falling back to
        end date; undated rows go last and ties keep input order.
    """
    if not isinstance(rows, list):
        return []

    normalized_rows: list[dict[str, Any]] = []
    for index, row in enumerate(rows):
        if not isinstance(row, dict):
            continue
        display_label = normalize_whitespace(str(row.get("dato") or row.get("display_label") or ""))
        if not display_label:
            continue
        # A missing or blank status falls back to "Ledig".
        status = normalize_whitespace(str(row.get("status") or "Ledig")) or "Ledig"

        start_date = _coerce_iso_date(row.get("start_date"))
        end_date = _coerce_iso_date(row.get("end_date"))
        if not start_date and not end_date:
            start_date, end_date = parse_course_date_range(display_label, today)

        normalized_rows.append(
            {
                "dato": display_label,
                "status": status,
                "start_date": start_date.isoformat() if start_date else None,
                "end_date": end_date.isoformat() if end_date else None,
                "sort_order": index,
            }
        )

    # ISO strings sort chronologically; the sentinel pushes undated rows last.
    normalized_rows.sort(
        key=lambda item: (
            item["start_date"] or item["end_date"] or "9999-12-31",
            item["sort_order"],
            item["dato"],
        )
    )
    return normalized_rows
|
||
|
||
|
||
def is_upcoming_course(row: dict[str, Any], today: date | None = None) -> bool:
|
||
reference_today = today or date.today()
|
||
end_value = row.get("end_date") or row.get("start_date")
|
||
if not end_value:
|
||
return True
|
||
try:
|
||
end_date = datetime.fromisoformat(str(end_value)).date()
|
||
except ValueError:
|
||
return True
|
||
return end_date >= reference_today
|
||
|
||
|
||
def filter_upcoming_courses(rows: Any, today: date | None = None) -> list[dict[str, Any]]:
    """Normalise ``rows`` and keep only the courses that have not ended.

    Args:
        rows: Raw course rows, passed through ``normalize_vtg_course_rows``.
        today: Reference date for the "has it ended" check; defaults to the
            current date. It is resolved once so every row is compared
            against the same day. Year inference inside row normalisation is
            not affected by this argument.

    Returns:
        The normalised rows for which ``is_upcoming_course`` is true, in
        normalised order.
    """
    reference_today = today or date.today()
    return [
        row
        for row in normalize_vtg_course_rows(rows)
        if is_upcoming_course(row, reference_today)
    ]
|
||
|
||
|
||
def get_invalid_vtg_course_labels(rows: Any) -> list[str]:
    """List the labels of rows for which no date can be determined.

    A row is reported when neither its explicit ``start_date`` / ``end_date``
    parses as an ISO date nor its label (``dato`` or ``display_label``) can
    be parsed by ``parse_course_date_range``. Non-dict entries and rows
    without a label are ignored. Anything other than a list yields ``[]``.

    Returns:
        The whitespace-normalised labels of the undatable rows, in input order.
    """
    if not isinstance(rows, list):
        return []

    rejected: list[str] = []
    for entry in rows:
        if not isinstance(entry, dict):
            continue

        label = normalize_whitespace(str(entry.get("dato") or entry.get("display_label") or ""))
        if not label:
            continue

        # Parse both explicit fields the same way; bad values become None.
        explicit_dates: list[date | None] = []
        for supplied in (entry.get("start_date"), entry.get("end_date")):
            parsed = None
            if supplied:
                try:
                    parsed = datetime.fromisoformat(str(supplied)).date()
                except ValueError:
                    parsed = None
            explicit_dates.append(parsed)

        if any(explicit_dates):
            # At least one usable explicit date: the row is valid.
            continue

        label_start, label_end = parse_course_date_range(label)
        if not label_start and not label_end:
            rejected.append(label)

    return rejected
|