Nye-TeeOff/backend/scrape_golfamore.py
2026-02-26 09:20:51 +01:00

100 lines
4.3 KiB
Python

import asyncio
import json
import os
import re
import unicodedata

import asyncpg
import httpx
from bs4 import BeautifulSoup
# PostgreSQL connection string used by the scraper.
# The TEEOFF_DB_URL environment variable, when set, overrides the built-in
# default so that real credentials do not have to live in source control.
# The default is kept unchanged for backward compatibility with existing setups.
DB_URL = os.environ.get(
    "TEEOFF_DB_URL",
    "postgresql://teeoff_admin:teeoff_secret_password@db:5432/teeoff",
)
# Lower-case letters that Unicode NFKD normalisation does not decompose and
# that would otherwise be dropped by the final ASCII filter.
# NOTE(review): "ae"/"o" follow the usual slug convention; confirm that
# Golfamore's URL slugs transliterate the same way.
_LETTER_FOLD = str.maketrans({"æ": "ae", "ø": "o"})


def clean_name(text):
    """Normalise a club name (or URL slug) into a key used for matching.

    The text is lower-cased, non-ASCII letters are transliterated to ASCII
    (e.g. "ø" -> "o", "å" -> "a", "é" -> "e") instead of being silently
    deleted, the generic words "golfklubb" and the stand-alone abbreviation
    "gk" are removed, and finally everything that is not a-z is stripped.

    Args:
        text: The raw name; may be None or empty.

    Returns:
        A string containing only the characters a-z ("" for empty input).
    """
    if not text:
        return ""

    lowered = text.lower().translate(_LETTER_FOLD)
    # NFKD splits accented letters into base letter + combining mark; encoding
    # to ASCII with "ignore" then drops the marks and keeps the base letter.
    ascii_text = (
        unicodedata.normalize("NFKD", lowered)
        .encode("ascii", "ignore")
        .decode("ascii")
    )

    without_suffix = ascii_text.replace("golfklubb", "")
    # Only remove "gk" when it is a separate word, so that names which merely
    # contain the letters (e.g. "Ringkollen") are left intact.
    without_suffix = re.sub(r"\bgk\b", "", without_suffix)

    return re.sub(r"[^a-z]", "", without_suffix)
async def get_golfamore_links(client):
    """Fetch the URLs of all Norwegian club pages listed in Golfamore's sitemap.

    Args:
        client: An ``httpx.AsyncClient``-like object whose awaitable
            ``get(url)`` returns a response exposing ``status_code`` and
            ``text``.

    Returns:
        A sorted, de-duplicated list of club page URLs. An empty list is
        returned when the sitemap cannot be fetched or is not answered with
        HTTP 200; the reason is printed in both cases.
    """
    print("🕵️ Henter komplett liste fra Golfamore...")
    try:
        # Golfamore publishes a separate sitemap per country.
        resp = await client.get("https://www.golfamore.com/sitemaps/courses-no.xml")
        if resp.status_code != 200:
            print(f"❌ Kunne ikke hente sitemap: HTTP {resp.status_code}")
            return []
        # Dots are escaped and the slug part may not contain "<", so a match
        # can never run across several <loc> elements.
        links = re.findall(
            r'<loc>(https://www\.golfamore\.com/no/golfklubb/[^<]+?/)</loc>',
            resp.text,
        )
        # Sorted so the result (and any first-hit matching done on it by the
        # caller) is the same on every run.
        return sorted(set(links))
    except Exception as e:
        # Best-effort: report the failure and let the caller see "no links".
        print(f"❌ Kunne ikke hente sitemap: {e}")
    return []
def _find_match_url(fac_clean, link_map):
    """Return the Golfamore URL for a cleaned facility name, or None."""
    if not fac_clean:
        return None

    exact = link_map.get(fac_clean)
    if exact:
        return exact

    # Fall back to a partial match (e.g. "Arendal" inside "Arendal og Omegn").
    # Whichever string is the *contained* one must be longer than 4
    # characters, otherwise very short names/slugs match almost anything.
    for slug, url in link_map.items():
        if (len(fac_clean) > 4 and fac_clean in slug) or (
            len(slug) > 4 and slug in fac_clean
        ):
            return url
    return None


def _extract_validity(html):
    """Extract the text describing when the Golfamore card is valid."""
    soup = BeautifulSoup(html, 'html.parser')

    # Try the dedicated "rules" containers first, then any text node that
    # mentions "Golfamore gjelder".
    rules_node = (
        soup.find('div', {'class': 'course-rules'})
        or soup.find('div', {'class': 'course-info__rules'})
        or soup.find(string=re.compile(r'Golfamore gjelder', re.I))
    )
    if not rules_node:
        return "Gjelder alle dager"  # Default when the page states no rules.

    # A string search returns a text node (a ``str`` subclass) rather than a
    # tag; use it directly so this also works where text nodes lack get_text().
    if isinstance(rules_node, str):
        raw = str(rules_node)
    else:
        raw = rules_node.get_text(separator=' ')
    return re.sub(r'\s+', ' ', raw).strip()


async def scrape_golfamore():
    """Synchronise the ``facilities`` table with Golfamore's Norwegian clubs.

    For every row in ``facilities`` the cleaned name is matched against the
    club URLs from Golfamore's sitemap. Matched facilities get
    ``golfamore = true`` plus a JSON blob (``validity`` text and
    ``source_url``) in ``golfamore_data``; if the club page cannot be read the
    flag is still set but the data is left untouched. Unmatched facilities
    are reset to ``golfamore = false`` with empty data.

    If no links can be retrieved at all, the run is aborted without touching
    the database, so a temporary sitemap failure cannot wipe existing data.
    The database connection is always closed, also on errors.
    """
    print("\n******************************************")
    print("🚀 STARTER GOLFAMORE-SYNKRONISERING v1.0")
    print("******************************************\n")

    matches_found = 0
    conn = await asyncpg.connect(DB_URL)
    try:
        facilities = await conn.fetch("SELECT id, name FROM facilities")

        async with httpx.AsyncClient(timeout=20.0, headers={'User-Agent': 'Mozilla/5.0'}) as client:
            ga_links = await get_golfamore_links(client)
            if not ga_links:
                # Without this guard every facility would fall into the
                # "not on Golfamore" branch below and lose its stored data.
                print("⚠️ Ingen Golfamore-lenker funnet – avbryter uten å endre databasen.")
                return

            # Map the cleaned slug taken from each URL to the URL itself.
            # Links are sorted so partial matching is deterministic, and
            # slugs that clean to "" are skipped because the empty string is
            # a substring of every facility name.
            link_map = {}
            for link in sorted(ga_links):
                slug_key = clean_name(link.split('/')[-2].replace('-', ' '))
                if slug_key:
                    link_map[slug_key] = link

            for fac in facilities:
                fac_id = fac['id']
                fac_name = fac['name']
                match_url = _find_match_url(clean_name(fac_name), link_map)

                if not match_url:
                    # Not listed on Golfamore: clear the flag and the data.
                    await conn.execute(
                        "UPDATE facilities SET golfamore = false, golfamore_data = '{}' WHERE id = $1",
                        fac_id,
                    )
                    continue

                try:
                    # Visit the club page to find the terms of use.
                    f_resp = await client.get(match_url)
                    if f_resp.status_code != 200:
                        # Do not parse an error/redirect page as if it were
                        # the club page.
                        raise RuntimeError(f"HTTP {f_resp.status_code}")
                    validity = _extract_validity(f_resp.text)

                    ga_data = {
                        "validity": validity,
                        "source_url": match_url
                    }

                    # Update the database.
                    await conn.execute("""
                        UPDATE facilities
                        SET golfamore = true, golfamore_data = $1
                        WHERE id = $2
                    """, json.dumps(ga_data), fac_id)

                    print(f"✅ MATCH: {fac_name} ({validity[:50]}...)")
                    matches_found += 1
                except Exception as exc:
                    # If the details cannot be read, at least mark the
                    # facility as active, and say why the details are missing.
                    print(f"⚠️ Kunne ikke lese detaljer for {fac_name}: {exc}")
                    await conn.execute("UPDATE facilities SET golfamore = true WHERE id = $1", fac_id)
    finally:
        await conn.close()

    print(f"\n🎉 Ferdig! {matches_found} baner er nå bekreftet hos Golfamore.")
if __name__ == "__main__":
    # Script entry point: run one full synchronisation on a fresh event loop.
    sync_job = scrape_golfamore()
    asyncio.run(sync_job)