"""Synchronise Golfamore participation data into the ``facilities`` table.

The script downloads Golfamore's Norwegian course sitemap, matches every
facility in the database against the club links found there (by normalised
name) and stores, per facility, whether it is part of Golfamore together with
the validity text scraped from the club page.
"""
import asyncio
import json
import os
import re
from typing import Dict, List, Optional

import asyncpg
import httpx
from bs4 import BeautifulSoup

# Database connection string. ``TEEOFF_DB_URL`` overrides the built-in default
# so credentials do not have to live in the source file.
DB_URL = os.environ.get(
    "TEEOFF_DB_URL",
    "postgresql://teeoff_admin:teeoff_secret_password@db:5432/teeoff",
)

# Golfamore publishes one sitemap per country; this is the Norwegian one.
_SITEMAP_URL = "https://www.golfamore.com/sitemaps/courses-no.xml"

# Club page links inside the sitemap (dots escaped so they only match dots).
_CLUB_LINK_RE = re.compile(r'(https://www\.golfamore\.com/no/golfklubb/.*?/)')

# Validity text stored when a club page has no recognisable rules section.
_DEFAULT_VALIDITY = "Gjelder alle dager"


def clean_name(text: Optional[str]) -> str:
    """Normalise a club name into a lowercase a-z key used for matching.

    Lower-cases the name, removes the substrings "golfklubb", "gk" and " & ",
    then drops every character that is not in ``a-z``.

    Args:
        text: Raw club name, or a falsy value.

    Returns:
        The normalised key, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""
    # NOTE(review): non-ASCII letters (e.g. æ/ø/å) are dropped as well, and
    # "gk" is removed anywhere in the name, not only as a separate word.
    # Both sides of the comparison go through this function, so the result is
    # at least consistent — confirm against Golfamore's slug convention.
    lowered = (
        text.lower()
        .replace("golfklubb", "")
        .replace("gk", "")
        .replace(" & ", "")
        .strip()
    )
    return re.sub(r'[^a-z]', '', lowered)


async def get_golfamore_links(client: httpx.AsyncClient) -> List[str]:
    """Fetch all Norwegian club links from the Golfamore sitemap.

    Args:
        client: HTTP client used for the request.

    Returns:
        A sorted list of unique club URLs. The list is empty when the sitemap
        could not be fetched or the response status was not 200.
    """
    print("🕵️ Henter komplett liste fra Golfamore...")
    try:
        resp = await client.get(_SITEMAP_URL)
        if resp.status_code == 200:
            # Sorted so that link order (and therefore partial-match results)
            # is the same on every run; a plain set has no stable order.
            return sorted(set(_CLUB_LINK_RE.findall(resp.text)))
        print(f"❌ Kunne ikke hente sitemap: HTTP {resp.status_code}")
    except Exception as e:
        # Best effort: the caller decides what to do with an empty result.
        print(f"❌ Kunne ikke hente sitemap: {e}")
    return []


def _build_link_map(links: List[str]) -> Dict[str, str]:
    """Map the normalised slug of every club link to the link itself."""
    link_map: Dict[str, str] = {}
    for link in links:
        # Links end with "/", so the slug is the second-to-last path part.
        key = clean_name(link.split('/')[-2].replace('-', ' '))
        # An empty key is a substring of every name and would hijack all
        # partial matches, so such links are skipped.
        if key:
            link_map[key] = link
    return link_map


def _find_match(fac_clean: str, link_map: Dict[str, str]) -> Optional[str]:
    """Return the club URL for a normalised facility name, if any.

    An exact key match wins. Otherwise, for names longer than four
    characters, the first link whose key contains the name (or is contained
    in it) is used, e.g. "arendal" within "arendalogomegn".
    """
    exact = link_map.get(fac_clean)
    if exact or len(fac_clean) <= 4:
        return exact
    for slug, url in link_map.items():
        if fac_clean in slug or slug in fac_clean:
            return url
    return None


async def _fetch_validity(client: httpx.AsyncClient, url: str) -> str:
    """Download a club page and extract the Golfamore validity text.

    Args:
        client: HTTP client used for the request.
        url: Club page URL.

    Returns:
        The whitespace-normalised rules text, or the default validity text
        when the page contains no recognisable rules section.

    Raises:
        RuntimeError: If the response status is not 200, so an error page is
            never parsed as if it were the club page.
    """
    resp = await client.get(url)
    if resp.status_code != 200:
        raise RuntimeError(f"HTTP {resp.status_code} for {url}")

    soup = BeautifulSoup(resp.text, 'html.parser')
    # Try the known container classes first, then fall back to any text node
    # that mentions when Golfamore applies.
    rules_section = (
        soup.find('div', {'class': 'course-rules'})
        or soup.find('div', {'class': 'course-info__rules'})
        or soup.find(string=re.compile(r'Golfamore gjelder', re.I))
    )
    if not rules_section:
        return _DEFAULT_VALIDITY

    # A ``string=`` hit is a text node (a ``str`` subclass) rather than a
    # tag; use it directly instead of relying on it having ``get_text``.
    if isinstance(rules_section, str):
        raw = str(rules_section)
    else:
        raw = rules_section.get_text(separator=' ')
    return re.sub(r'\s+', ' ', raw).strip()


async def scrape_golfamore():
    """Update the Golfamore flag and details for every facility.

    For each row in ``facilities``:

    * match found and details read: ``golfamore = true`` and
      ``golfamore_data`` set to a JSON object with ``validity`` and
      ``source_url``;
    * match found but details could not be read or stored:
      ``golfamore = true`` only;
    * no match: ``golfamore = false`` and ``golfamore_data = '{}'``.

    If the sitemap yields no links at all, the database is left untouched so
    a temporary outage does not reset every facility.
    """
    print("\n******************************************")
    print("🚀 STARTER GOLFAMORE-SYNKRONISERING v1.0")
    print("******************************************\n")

    conn = await asyncpg.connect(DB_URL)
    try:
        facilities = await conn.fetch("SELECT id, name FROM facilities")

        async with httpx.AsyncClient(
            timeout=20.0, headers={'User-Agent': 'Mozilla/5.0'}
        ) as client:
            ga_links = await get_golfamore_links(client)
            if not ga_links:
                print("⚠️ Ingen Golfamore-lenker funnet – avbryter uten å endre databasen.")
                return

            link_map = _build_link_map(ga_links)
            matches_found = 0

            for fac in facilities:
                fac_id = fac['id']
                fac_name = fac['name']
                match_url = _find_match(clean_name(fac_name), link_map)

                if not match_url:
                    # Not listed on Golfamore: clear the flag and the data.
                    await conn.execute(
                        "UPDATE facilities SET golfamore = false, golfamore_data = '{}' WHERE id = $1",
                        fac_id,
                    )
                    continue

                details_saved = False
                validity = _DEFAULT_VALIDITY
                try:
                    validity = await _fetch_validity(client, match_url)
                    ga_data = {
                        "validity": validity,
                        "source_url": match_url,
                    }
                    await conn.execute(
                        """
                        UPDATE facilities
                        SET golfamore = true, golfamore_data = $1
                        WHERE id = $2
                        """,
                        json.dumps(ga_data),
                        fac_id,
                    )
                    details_saved = True
                except Exception as e:
                    # Best effort per facility: report the problem and fall
                    # through to the flag-only update below.
                    print(f"⚠️ Kunne ikke lese detaljer for {fac_name}: {e}")

                if details_saved:
                    print(f"✅ MATCH: {fac_name} ({validity[:50]}...)")
                    matches_found += 1
                else:
                    # The details are unavailable, but the club is listed on
                    # Golfamore, so at least mark it as active.
                    await conn.execute(
                        "UPDATE facilities SET golfamore = true WHERE id = $1",
                        fac_id,
                    )
    finally:
        # Always release the connection, also when something above raised.
        await conn.close()

    print(f"\n🎉 Ferdig! {matches_found} baner er nå bekreftet hos Golfamore.")


if __name__ == "__main__":
    asyncio.run(scrape_golfamore())