"""Scrape NSG (seniorgolf.no) loyalty-card pages and store them per facility.

For every row in the ``facilities`` table the script tries to find the
matching loyalty-card page on seniorgolf.no, extracts the "Starttider",
"Greenfee" and "Booking" sections from it, and writes them as a JSON string
into ``facilities.nsg_data``.
"""

import asyncio
import json
import re
from urllib.parse import urljoin

import asyncpg
import httpx
from bs4 import BeautifulSoup

# NOTE(review): credentials are hard-coded in source; consider loading the DSN
# from the environment or a secrets store instead.
DB_URL = "postgresql://teeoff_admin:teeoff_secret_password@db:5432/teeoff"

# Phrases that mark the start of the NSG page footer/menu; everything from the
# first occurrence of any of them onwards is discarded.
_GARBAGE_PHRASES = (
    "Klubbens hjemmeside",
    "Resultatlister i Golfbox",
    "Livescoring",
    "Scoreinntasting",
    "Lagserie",
    "Turneringer",
    "Innmelding",
)

# Loyalty-card page URLs inside the sitemap. The slug part excludes '/', '<',
# '>', quotes and whitespace so the match cannot run into XML markup (e.g. the
# bare index URL followed by "</loc>").
_SITEMAP_LINK_RE = re.compile(
    r'https://seniorgolf\.no/lojalitetskort/[^/\s<>"\']+/'
)

# Section extractors: capture the text after a heading up to the next known
# heading (or the end of the text).
_STARTTIDER_RE = re.compile(
    r"Starttider:?\s*(.*?)(?=Greenfee|Booking|Adresse|Kontakt|$)", re.S | re.I
)
_GREENFEE_RE = re.compile(
    r"Greenfee:?\s*(.*?)(?=Booking|Adresse|Kontakt|$)", re.S | re.I
)
_BOOKING_RE = re.compile(
    r"Booking:?\s*(.*?)(?=Adresse|Kontakt|$)", re.S | re.I
)

# Value stored when a section cannot be found on the page.
_PLACEHOLDER = "Se nettside"


def clean_name(text):
    """Normalise a club name or URL slug into a comparison key.

    Lower-cases the text, removes the tokens "golfklubb", "gk", "par3golf"
    and " & ", and finally drops every character that is not ``a``-``z``.

    Args:
        text: Raw name; may be ``None`` or empty.

    Returns:
        The normalised key, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""
    lowered = text.lower()
    for token in ("golfklubb", "gk", "par3golf", " & "):
        lowered = lowered.replace(token, "")
    return re.sub(r'[^a-z]', '', lowered.strip())


def clean_nsg_content(text):
    """Cut off the site menu and collapse whitespace in scraped text.

    Args:
        text: Raw text captured from an NSG page; may be ``None`` or empty.

    Returns:
        The text up to the first footer/menu phrase, with line breaks and
        runs of whitespace collapsed to single spaces and the ends stripped.
        ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""
    # Drop everything that looks like the NSG bottom menu.
    for phrase in _GARBAGE_PHRASES:
        text = text.split(phrase)[0]
    # Tidy up line breaks and repeated spaces.
    text = text.replace('\r', '').replace('\n', ' ')
    return re.sub(r'\s+', ' ', text).strip()


async def get_nsg_links(client):
    """Collect the URLs of all NSG loyalty-card pages.

    The sitemap is tried first; if it yields any links they are returned
    immediately. Otherwise links are gathered from the HTML overview page.

    Args:
        client: An ``httpx.AsyncClient`` used for the requests.

    Returns:
        A sorted, de-duplicated list of absolute URLs (possibly empty).
        Sorting keeps the later fuzzy matching reproducible between runs.
    """
    links = set()
    urls = [
        "https://seniorgolf.no/lojalitetskort-sitemap.xml",
        "https://seniorgolf.no/fordelskortet/",
    ]
    for url in urls:
        try:
            resp = await client.get(url)
            if resp.status_code != 200:
                continue
            if ".xml" in url:
                found = _SITEMAP_LINK_RE.findall(resp.text)
                if found:
                    return sorted(set(found))
            else:
                soup = BeautifulSoup(resp.text, 'html.parser')
                # Resolve relative hrefs so they can be fetched later.
                links.update(
                    urljoin(url, anchor['href'])
                    for anchor in soup.select('a[href*="/lojalitetskort/"]')
                )
        except Exception as exc:
            # Best effort: a failing source must not stop the other one, but
            # cancellation/KeyboardInterrupt must still propagate.
            print(f"⚠️ Kunne ikke hente {url}: {exc}")
    return sorted(links)


def _build_link_map(links):
    """Map the cleaned slug of each link to its URL, skipping empty keys."""
    link_map = {}
    for link in links:
        # Last path segment, whether or not the URL ends with a slash.
        slug = link.rstrip('/').rsplit('/', 1)[-1]
        key = clean_name(slug.replace('-', ' '))
        if key:
            link_map[key] = link
    return link_map


def _find_link(fac_key, link_map):
    """Return the URL matching ``fac_key`` (exact first, then substring)."""
    # An empty key is a substring of everything and would match any link.
    if not fac_key:
        return None
    exact = link_map.get(fac_key)
    if exact:
        return exact
    for slug, url in link_map.items():
        if fac_key in slug or slug in fac_key:
            return url
    return None


def _extract_section(pattern, text):
    """Return the cleaned first group of ``pattern`` in ``text`` or a placeholder."""
    match = pattern.search(text)
    return clean_nsg_content(match.group(1)) if match else _PLACEHOLDER


def _parse_nsg_page(html, url):
    """Build the ``nsg_data`` dict for one loyalty-card page."""
    soup = BeautifulSoup(html, 'html.parser')
    # Prefer the main content container over the whole page to avoid menus.
    main_content = soup.find('div', {'class': 'entry-content'}) or soup
    text = main_content.get_text()
    return {
        "url": url,
        "starttider": _extract_section(_STARTTIDER_RE, text),
        "greenfee": _extract_section(_GREENFEE_RE, text),
        "booking": _extract_section(_BOOKING_RE, text),
    }


async def scrape_nsg():
    """Match facilities to NSG pages and persist the scraped data.

    Reads ``id`` and ``name`` of every facility, looks up the corresponding
    loyalty-card page, and updates ``facilities.nsg_data`` with a JSON string
    for each facility whose page could be fetched. Failures for a single
    facility are reported and skipped; the database connection is always
    closed.
    """
    print("🚀 Starter NSG VASKEMASKIN v3.8...")
    conn = await asyncpg.connect(DB_URL)
    matches_found = 0
    try:
        facilities = await conn.fetch("SELECT id, name FROM facilities")
        async with httpx.AsyncClient(
            timeout=20.0, headers={'User-Agent': 'Mozilla/5.0'}
        ) as client:
            link_map = _build_link_map(await get_nsg_links(client))
            for fac in facilities:
                match_url = _find_link(clean_name(fac['name']), link_map)
                if not match_url:
                    continue
                try:
                    f_resp = await client.get(match_url)
                    if f_resp.status_code != 200:
                        # Do not overwrite stored data with placeholders
                        # parsed from an error page.
                        print(
                            f"⚠️ Hoppet over {fac['name']}: "
                            f"HTTP {f_resp.status_code}"
                        )
                        continue
                    nsg_data = _parse_nsg_page(f_resp.text, match_url)
                    await conn.execute(
                        "UPDATE facilities SET nsg_data = $1 WHERE id = $2",
                        json.dumps(nsg_data),
                        fac['id'],
                    )
                    print(f"✅ Vasket & Lagret: {fac['name']}")
                    matches_found += 1
                except Exception as exc:
                    # Best effort per facility: report and move on, without
                    # swallowing cancellation or KeyboardInterrupt.
                    print(f"⚠️ Hoppet over {fac['name']}: {exc}")
    finally:
        await conn.close()
    print(f"\n🎉 Vask ferdig! {matches_found} baner er nå 100% klare.")


if __name__ == "__main__":
    asyncio.run(scrape_nsg())