"""
TEE OFF - MEMBERSHIP SCRAPER WITH GEMINI AI (MULTI-URL VERSION)
---------------------------------------------------------------------------
Visits the given membership URLs (several comma-separated URLs are supported),
extracts their text, and uses Gemini to add up prices and find the 'Standard'
and 'Rimeligste' (cheapest) membership.
---------------------------------------------------------------------------
"""
import asyncio
import asyncpg
import os
import json
import argparse
from bs4 import BeautifulSoup
from playwright . async_api import async_playwright
import google . generativeai as genai
from dotenv import load_dotenv
2026-04-16 09:58:08 +02:00
from env_config import get_database_url
2026-05-04 15:30:29 +02:00
from scrape_utils import (
ProgressCallback ,
emit_progress ,
exclude_discontinued_facilities_clause ,
make_progress_event ,
parse_llm_json ,
)
2026-03-08 10:26:56 +01:00
# Read the .env file first so the lookups below can see its values.
load_dotenv()

# Database connection string, as resolved by the project's env_config helper.
DB_URL = get_database_url()

# The Gemini key is mandatory: a missing or empty value stops the module at
# import time with a ValueError.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if GEMINI_API_KEY is None or GEMINI_API_KEY == "":
    raise ValueError("🚨 GEMINI_API_KEY mangler i .env filen!")

# Shared Gemini client; the model name is fixed here for the whole module.
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel('gemini-2.5-flash')
2026-03-08 10:26:56 +01:00
2026-03-12 13:39:10 +01:00
async def fetch_page_text(url: str, browser) -> str:
    """Load ONE web page with Playwright and return its visible text.

    Args:
        url: Address to load. Surrounding whitespace is stripped; values that
            do not start with "http" are ignored.
        browser: Object exposing an awaitable ``new_page()`` (a Playwright
            browser in this module).

    Returns:
        The page text with script/style/nav/footer/header elements removed,
        or an empty string when the URL is ignored or anything fails.
    """
    url = url.strip()
    if not url.startswith("http"):
        return ""

    print(f"🌐 Laster inn: {url}")
    try:
        page = await browser.new_page()
        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=15000)
            html_content = await page.content()
        finally:
            # Close the page even when navigation fails or times out;
            # otherwise every failing URL leaves an open page behind in the
            # shared browser for the rest of the run.
            await page.close()

        soup = BeautifulSoup(html_content, 'html.parser')
        # Drop elements that only add noise (code, styling, site chrome).
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.extract()

        return soup.get_text(separator=' ', strip=True)
    except Exception as e:
        # Best effort: one broken page must not stop the whole scrape.
        print(f"❌ Feil ved lasting av {url}: {e}")
        return ""
2026-04-12 10:11:23 +02:00
def analyze_with_gemini(text: str, club_name: str) -> dict | None:
    """Send the combined page text to Gemini and return the extracted prices.

    Args:
        text: Page text to analyse; it is embedded verbatim in the prompt.
        club_name: Club name inserted into the prompt.

    Returns:
        The parsed reply when it is a dict, otherwise None. None is also
        returned when the request or the parsing raises.
    """
    print(f"🧠 Sender {len(text)} tegn til Gemini for analyse...")

    prompt = f"""
    Du er en ekspert på norske golfklubber. Din oppgave er å lese teksten hentet fra nettsidene til "{club_name}" og finne to spesifikke priser.

    VIKTIG REGEL OM NORSK GOLF:
    Mange steder er "Klubbkontingent/Medlemskap" og "Spillerett/Årskort" to forskjellige ting.
    For å få spille ubegrenset (Fritt spill) MÅ man betale BEGGE DELER. Hvis du ser at prisene for kontingent og spillerett er oppgitt hver for seg, SKAL DU SUMMERE disse to summene og bruke totalen som "Standard pris".

    ALDERSPREMISS FOR BEGGE PRISER:
    Vi forutsetter at personen som skal ha medlemskap er en VOKSEN GOLFER PÅ MINST 35 ÅR. Du må ALDRI velge priser som gjelder for barn, junior, ung voksen (f.eks. 20-29 år), student eller senior/pensjonist.

    DEFINISJONER DU MÅ FØLGE STRENGT:
    1. "Standard medlemskap": Hva er TOTALPRISEN (inkludert evt. spillerett/årskort) for en voksen person (35+ år) for å spille SÅ MYE VEDKOMMENDE ØNSKER (Fritt spill) i år?
    2. "Rimeligste alternativ": Det absolutt billigste alternativet FOR EN VOKSEN PERSON (35+ år) som gir medlemskap i klubben (golfkortet), forutsatt at man betaler greenfee for hver runde. (Ofte kalt Greenfeemedlem, Postkassemedlem, Fjernmedlem, eller kun "Klubbkontingent for voksne" uten spillerett).

    TEKST FRA NETTSIDEN(E):
    {text}

    OPPGAVE:
    Returner KUN et gyldig JSON-objekt med følgende struktur:
    {{
        "foreslatt_standard_navn": "Navn (eks: Hovedmedlem Voksen inkl. spillerett)",
        "foreslatt_standard_pris": 1234,
        "foreslatt_standard_kommentar": "Kort kommentar (eks: Måtte summere kontingent på 900 og årskort på 5000)",
        "foreslatt_rimeligste_navn": "Navn (eks: Greenfeemedlemskap Voksen)",
        "foreslatt_rimeligste_pris": 500,
        "ai_begrunnelse": "Kort forklaring på utregningen din."
    }}
    Merk: Prisene SKAL være tall (integer), ikke tekst. Sett til null hvis du ikke finner det.
    """

    try:
        reply = model.generate_content(prompt)
        payload = parse_llm_json(reply.text)
    except Exception as e:
        print(f"❌ AI-analyse feilet: {e}")
        return None

    # Anything other than a JSON object (list, string, None, ...) is rejected.
    if isinstance(payload, dict):
        return payload
    return None
2026-04-12 10:11:23 +02:00
async def run_scraper(facility_ids=None, progress_callback: ProgressCallback | None = None):
    """Scrape membership pages and store an AI-generated price draft per facility.

    For every selected facility the comma-separated ``medlemskap_url`` values
    are fetched, their text is combined and sent to Gemini, and the resulting
    dict is written to ``facilities.membership_draft`` as JSON.

    Args:
        facility_ids: Optional iterable of integer facility ids to restrict the
            run to. When empty or None, every facility with a membership URL
            is processed.
        progress_callback: Forwarded unchanged to ``emit_progress`` together
            with counters and one event per step.

    Returns:
        A dict with the keys ``processed_facilities``, ``analyzed_facilities``,
        ``saved_drafts``, ``skipped_facilities`` and ``failed_facilities``.

    Raises:
        ValueError: If an entry of ``facility_ids`` cannot be converted to int.
    """
    print("🚀 Starter Medlemskaps-skraperen (Støtter multi-URL)...")

    conn = await asyncpg.connect(DB_URL)
    facilities = []
    total_facilities = 0
    analyzed_count = 0
    saved_count = 0
    skipped_count = 0
    failed_count = 0

    async def report_outcome(index, fac_id, name, outcome, message):
        """Emit the 'facility finished' progress update with the current counters."""
        await emit_progress(
            progress_callback,
            progress_completed=index,
            progress_ok=saved_count,
            progress_failed=failed_count,
            progress_skipped=skipped_count,
            current_facility_id=fac_id,
            current_facility_name=name,
            event=make_progress_event(
                facility_id=fac_id,
                facility_name=name,
                outcome=outcome,
                message=message,
                processed=index,
                total=total_facilities,
            ),
        )

    try:
        query = (
            "SELECT id, name, medlemskap_url FROM facilities "
            "WHERE medlemskap_url IS NOT NULL AND medlemskap_url != '' "
            f"{exclude_discontinued_facilities_clause('facilities')}"
        )
        query_args = []
        if facility_ids:
            # Bind the ids as a parameter instead of formatting them into the
            # SQL text; int() rejects anything that is not a plain integer id.
            query += " AND id = ANY($1::bigint[])"
            query_args.append([int(fid) for fid in facility_ids])

        facilities = await conn.fetch(query, *query_args)
        total_facilities = len(facilities)
        print(f"📋 Fant {total_facilities} anlegg å skrape.")
        await emit_progress(
            progress_callback,
            progress_total=total_facilities,
            progress_completed=0,
            progress_ok=0,
            progress_failed=0,
            progress_skipped=0,
            event=make_progress_event(
                facility_id=None,
                facility_name="Medlemskap",
                outcome="info",
                message=f"Starter medlemskapsskraping for {total_facilities} anlegg.",
                processed=0,
                total=total_facilities,
            ),
        )

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                for index, facility in enumerate(facilities, start=1):
                    fac_id = facility['id']
                    name = facility['name']
                    urls_raw = facility['medlemskap_url']

                    print(f"\n▶️ Behandler: {name} (ID: {fac_id})")
                    await emit_progress(
                        progress_callback,
                        current_facility_id=fac_id,
                        current_facility_name=name,
                        event=make_progress_event(
                            facility_id=fac_id,
                            facility_name=name,
                            outcome="info",
                            message="Starter henting av medlemskapssider.",
                            processed=index - 1,
                            total=total_facilities,
                        ),
                    )

                    # The column may hold several URLs separated by commas.
                    urls = [u.strip() for u in urls_raw.split(',')]

                    try:
                        sections = []
                        for idx, url in enumerate(urls, 1):
                            page_text = await fetch_page_text(url, browser)
                            if page_text:
                                sections.append(
                                    f"\n\n--- TEKST FRA SIDE {idx} ({url}) ---\n{page_text}"
                                )
                        combined_text = "".join(sections)

                        if len(combined_text) < 50:
                            print("⚠️ Fant for lite tekst, hopper over.")
                            skipped_count += 1
                            await report_outcome(
                                index,
                                fac_id,
                                name,
                                "warning",
                                "Hoppet over fordi det ble funnet for lite tekst på medlemskapssidene.",
                            )
                            continue

                        # analyze_with_gemini is synchronous; run it in a
                        # worker thread so the event loop is not blocked
                        # while waiting for the API. The text is capped at
                        # 25000 characters.
                        draft_data = await asyncio.to_thread(
                            analyze_with_gemini, combined_text[:25000], name
                        )
                        if not draft_data:
                            failed_count += 1
                            await report_outcome(
                                index,
                                fac_id,
                                name,
                                "error",
                                "AI-analysen ga ikke et gyldig medlemskapsutkast.",
                            )
                            continue

                        analyzed_count += 1
                        standard_price = draft_data.get('foreslatt_standard_pris')
                        cheapest_price = draft_data.get('foreslatt_rimeligste_pris')
                        print(
                            f"✅ AI foreslår: Standard: {standard_price}"
                            f" | Rimeligste: {cheapest_price}"
                        )

                        await conn.execute(
                            """
                            UPDATE facilities
                            SET membership_draft = $1::jsonb
                            WHERE id = $2
                            """,
                            json.dumps(draft_data),
                            fac_id,
                        )
                        print("💾 Utkast lagret i databasen!")
                        saved_count += 1
                        await report_outcome(
                            index,
                            fac_id,
                            name,
                            "success",
                            (
                                f"Utkast lagret. Standard: {standard_price or 'ukjent'}"
                                f" | Rimeligste: {cheapest_price or 'ukjent'}"
                            ),
                        )
                    except Exception as e:
                        failed_count += 1
                        print(f"❌ Uventet feil for {name}: {e}")
                        # str(e) can be empty (e.g. a bare TimeoutError), in
                        # which case splitlines() is [] and indexing it would
                        # raise inside this handler and abort the whole run.
                        first_line = next(iter(str(e).splitlines()), type(e).__name__)
                        await report_outcome(
                            index,
                            fac_id,
                            name,
                            "error",
                            f"Feilet under behandling: {first_line}",
                        )
            finally:
                # Close the browser even if a progress callback raised.
                await browser.close()

    finally:
        await conn.close()
        print("\n🏁 Skraping fullført.")

    return {
        "processed_facilities": len(facilities),
        "analyzed_facilities": analyzed_count,
        "saved_drafts": saved_count,
        "skipped_facilities": skipped_count,
        "failed_facilities": failed_count,
    }
2026-03-08 10:26:56 +01:00
def parse_facility_ids(raw: str) -> list[int]:
    """Parse a comma-separated list of facility ids such as "1,5,12".

    Whitespace around entries is ignored and empty entries (for example from
    a trailing comma) are skipped.

    Raises:
        ValueError: If an entry is not an integer.
    """
    ids = []
    for part in raw.split(","):
        token = part.strip()
        if not token:
            continue
        try:
            ids.append(int(token))
        except ValueError:
            raise ValueError(f"Ugyldig facility ID: {token!r}") from None
    return ids


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Skrap medlemskapspriser via AI.")
    parser.add_argument("--ids", type=str, help="Kommaseparert liste med facility IDs (eks: 1,5,12)")
    args = parser.parse_args()

    ids_to_scrape = None
    if args.ids:
        try:
            ids_to_scrape = parse_facility_ids(args.ids)
        except ValueError as exc:
            # Report bad input as a normal usage error instead of a traceback.
            parser.error(str(exc))
        if not ids_to_scrape:
            # An empty list would make run_scraper process every facility,
            # which is not what an explicit --ids argument asks for.
            parser.error("--ids inneholder ingen gyldige IDer")

    asyncio.run(run_scraper(ids_to_scrape))