Nye-TeeOff/backend/scrape_utils.py

92 lines
2.5 KiB
Python
Raw Permalink Normal View History

2026-04-12 10:11:23 +02:00
import json
from typing import Any, Awaitable, Callable
# Type of an async progress callback: it is called with a single payload dict
# (string keys, arbitrary values) and returns an awaitable that yields None.
ProgressCallback = Callable[[dict[str, Any]], Awaitable[None]]
2026-05-04 15:30:29 +02:00
def exclude_discontinued_facilities_clause(facility_table: str = "facilities") -> str:
    """Build a SQL ``AND (...)`` fragment that filters out discontinued facilities.

    A facility row satisfies the fragment when either no row in ``courses``
    references it, or at least one of its courses has a status other than
    ``'nedlagt'`` (a NULL status is treated as ``'ukjent'``).

    Args:
        facility_table: Table name or alias whose ``id`` column the two
            correlated subqueries compare against. It is interpolated into
            the SQL text verbatim, so it must be a trusted identifier and
            never user-supplied input.

    Returns:
        The SQL fragment. It begins with ``AND``, so it is presumably meant
        to be appended to an existing ``WHERE`` condition.
    """
    facility_key = f"{facility_table}.id"
    clause = f"""
AND (
NOT EXISTS (
SELECT 1
FROM courses course_filter
WHERE course_filter.facility_id = {facility_key}
)
OR EXISTS (
SELECT 1
FROM courses course_filter
WHERE course_filter.facility_id = {facility_key}
AND COALESCE(course_filter.status, 'ukjent') <> 'nedlagt'
)
)
"""
    return clause
2026-04-12 10:11:23 +02:00
async def emit_progress(progress_callback: ProgressCallback | None, **payload: Any) -> None:
    """Send a progress payload to the callback, if one was supplied.

    Args:
        progress_callback: Async callback to notify, or ``None`` to do nothing.
        **payload: Keyword arguments collected into one dict, which is passed
            to the callback as its only argument.
    """
    if progress_callback is not None:
        await progress_callback(payload)
def make_progress_event(
*,
facility_id: int | None,
facility_name: str,
outcome: str,
message: str,
processed: int,
total: int,
) -> dict[str, Any]:
return {
"facility_id": facility_id,
"facility_name": facility_name,
"outcome": outcome,
"message": message,
"processed": processed,
"total": total,
}
def parse_llm_json(raw_response: str) -> Any:
    """Extract and decode a JSON value from a raw model response.

    The response may be bare JSON, JSON wrapped in a Markdown code fence, or
    JSON surrounded by prose. Several candidate substrings are tried in turn
    and the first one that decodes is returned:

    1. the whole (unfenced) text,
    2. the span from the first ``{`` to the last ``}``,
    3. the span from the first ``[`` to the last ``]``.

    Candidates 2 and 3 swap places when the array span encloses the object
    span, so that ``Result: [{"a": 1}]`` yields the list rather than only
    its inner object.

    If a candidate decodes to a string that itself looks like a JSON object
    or array (double-encoded JSON), that inner value is decoded and returned.

    Args:
        raw_response: Raw text from the model; ``None`` or an empty string is
            treated as an empty response.

    Returns:
        The decoded JSON value (dict, list, str, number, bool or ``None``).

    Raises:
        ValueError: If the response is empty or whitespace only, or if no
            candidate can be decoded as JSON.
    """
    text = (raw_response or "").strip()
    if not text:
        raise ValueError("Tomt svar fra modellen.")

    if text.startswith("```"):
        # Drop the opening fence line (which may carry a language tag) and,
        # if present, the closing fence line.
        body_lines = text.splitlines()[1:]
        if body_lines and body_lines[-1].strip().startswith("```"):
            body_lines = body_lines[:-1]
        unfenced = "\n".join(body_lines).strip()
        # A fence written on one line (```json {...}```) has its payload on
        # the opening line; dropping that line would discard everything, so
        # fall back to stripping only the backticks.
        text = unfenced or text.strip("`").strip()

    obj_start, obj_end = text.find("{"), text.rfind("}")
    arr_start, arr_end = text.find("["), text.rfind("]")

    spans: list[tuple[int, int]] = []
    if obj_start != -1 and obj_end > obj_start:
        spans.append((obj_start, obj_end))
    if arr_start != -1 and arr_end > arr_start:
        spans.append((arr_start, arr_end))
    # When the array span encloses the object span, the array is the outer
    # structure; trying the object first would silently drop the list.
    if len(spans) == 2 and arr_start < obj_start and arr_end > obj_end:
        spans.reverse()

    candidates = [text, *(text[start:end + 1] for start, end in spans)]

    seen: set[str] = set()
    for candidate in candidates:
        candidate = candidate.strip()
        if not candidate or candidate in seen:
            continue
        seen.add(candidate)
        try:
            parsed = json.loads(candidate)
            if isinstance(parsed, str):
                # Double-encoded JSON: the payload is a JSON string whose
                # content is itself a JSON object or array.
                nested = parsed.strip()
                if nested.startswith(("{", "[")):
                    parsed = json.loads(nested)
        except json.JSONDecodeError:
            continue
        return parsed

    raise ValueError("Klarte ikke å tolke gyldig JSON fra modellsvaret.")