Nye-TeeOff/backend/scrape_status.py

174 lines
7.6 KiB
Python
Raw Normal View History

2026-02-28 09:20:56 +01:00
import asyncio
import os
import asyncpg
import smtplib
2026-03-02 19:39:40 +01:00
import re
2026-02-28 09:20:56 +01:00
from datetime import datetime
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from playwright.async_api import async_playwright
try:
from playwright_stealth import stealth_async as apply_stealth
except ImportError:
from playwright_stealth import stealth as apply_stealth
from dotenv import load_dotenv
load_dotenv()
# PostgreSQL connection string passed to asyncpg.connect(). It can be
# overridden with the TEEOFF_DB_URL environment variable (also picked up from
# .env via load_dotenv above) so credentials do not have to live in source
# control; the literal below is kept as the backward-compatible fallback.
DB_URL = os.getenv(
    "TEEOFF_DB_URL",
    "postgresql://teeoff_admin:teeoff_secret_password@db:5432/teeoff",
)
def clean_text(text):
    """Normalise text for loose comparison.

    Removes every character that is not an ASCII letter, a digit or one of
    the Norwegian letters æ/ø/å (either case), then lower-cases the result.
    """
    only_alnum = re.sub(r'[^a-zA-Z0-9æøåÆØÅ]', '', text)
    return only_alnum.lower()
def interpret_status(text, keyword=None):
    """Classify scraped page text into a course status code.

    Args:
        text: Raw text read from the status element. ``None`` is treated as
            an empty string.
        keyword: Optional course-specific search term. When given, it must
            occur in ``text`` (compared after ``clean_text`` normalisation),
            and only the text following it is classified.

    Returns:
        ``"NOT_FOUND"`` if ``keyword`` is given but absent from ``text``;
        otherwise one of ``"stengt"``, ``"aapen_med_vintergreener"``,
        ``"aapner_snart"``, ``"aapen"`` or ``"ukjent"``. The checks run in
        that order, so a "closed" indicator wins over an "open" one.
    """
    t_raw = (text or "").lower()
    if keyword:
        k_clean = clean_text(keyword)
        if k_clean not in clean_text(t_raw):
            return "NOT_FOUND"
        # Only look at the 150 characters right after the keyword
        # (presumably so a neighbouring course's status is not picked up).
        parts = re.split(re.escape(keyword), t_raw, flags=re.IGNORECASE)
        if len(parts) > 1:
            t_raw = parts[1][:150]
        else:
            # The keyword matched only after normalisation (e.g. different
            # spacing or punctuation): fall back to the tail of the text.
            t_raw = t_raw[-200:]

    closed_words = ("stengt", "lukket", "frost", "snø", "closed", "stenger")
    # "is" (ice) must be matched as a whole word; as a plain substring it
    # also hits ordinary words such as "priser", "vises" or "disse" and
    # would wrongly mark an open course as closed.
    if any(word in t_raw for word in closed_words) or re.search(r"\bis(?:en)?\b", t_raw):
        return "stengt"
    if any(word in t_raw for word in ("vintergreen", "vintergrønn", "vinter")):
        return "aapen_med_vintergreener"
    if any(word in t_raw for word in ("snart", "åpner kl")):
        return "aapner_snart"
    if any(word in t_raw for word in ("åpen", "åpent", "aapen", "open")):
        return "aapen"
    return "ukjent"
2026-03-02 19:39:40 +01:00
def send_report(changes, warnings, successes):
    """Email a plain-text status report summarising one scraping run.

    Args:
        changes: Lines describing courses whose status was updated.
        warnings: Lines describing problems met while scraping.
        successes: Lines describing courses checked without a status change.

    Nothing is sent when all three lists are empty. SMTP settings are read
    from the SMTP_SERVER, SMTP_PORT, SMTP_USER, SMTP_PASS and EMAIL_TO
    environment variables. Any error raised while connecting or sending is
    printed instead of being propagated.
    """
    if not (changes or warnings or successes):
        return

    today = datetime.now().strftime('%d.%m.%Y')
    subject = f"TeeOff Banestatus Rapport - {today}"

    # Assemble the body section by section; empty sections are left out.
    sections = ["BANESTATUS RAPPORT\n", "=" * 30, "\n\n"]
    if changes:
        sections.append("✅ OPPDATERINGER:\n" + "\n".join(changes) + "\n\n")
    if warnings:
        sections.append("⚠️ MERKNADER / ADVARSLER:\n" + "\n".join(warnings) + "\n\n")
    if successes:
        sections.append("🆗 VELLYKKEDE SJEKKER (INGEN ENDRING):\n" + "\n".join(successes) + "\n")
    body = "".join(sections)

    msg = MIMEMultipart()
    headers = (
        ('From', os.getenv("SMTP_USER")),
        ('To', os.getenv("EMAIL_TO")),
        ('Subject', subject),
    )
    for header_name, header_value in headers:
        msg[header_name] = header_value
    msg.attach(MIMEText(body, 'plain'))

    try:
        # Host/port lookup stays inside the try so that a missing or invalid
        # SMTP_PORT is reported like any other mail error.
        host = os.getenv("SMTP_SERVER")
        port = int(os.getenv("SMTP_PORT"))
        with smtplib.SMTP_SSL(host, port) as server:
            server.login(os.getenv("SMTP_USER"), os.getenv("SMTP_PASS"))
            server.send_message(msg)
        print("✅ Rapport sendt på e-post.")
    except Exception as e:
        print(f"❌ E-post feil: {e}")
2026-02-28 09:20:56 +01:00
async def _read_status_text(page, f, warnings):
    """Read the raw status text for one facility from an already loaded page.

    Args:
        page: Playwright page that has navigated to the facility's status URL.
        f: Facility row; ``scrape_method`` selects the strategy and
            ``scrape_status_selector`` holds the selector(s) to use.
        warnings: List that receives a message when nothing can be read.

    Returns:
        The inner text of the located element, or ``None`` after a warning
        has been appended (unknown method, malformed selector, or element
        not found).
    """
    method = f.get('scrape_method') or 'css_selector'
    selector = f['scrape_status_selector']

    if method == 'css_selector':
        element = page.locator(selector).first
        if await element.count() == 0:
            warnings.append(f"{f['name']}: Fant ikke CSS-elementet '{selector}'")
            return None
        return await element.inner_text()

    if method == 'iframe_golfbox':
        frame = page.frame_locator('iframe[src*="golfbox"]')
        element = frame.locator(selector).first
        if await element.count() == 0:
            warnings.append(f"{f['name']}: Fant ikke elementet '{selector}' i iframen")
            return None
        return await element.inner_text()

    if method == 'click_then_css':
        # Expected selector format: "button_selector||text_selector"
        parts = selector.split('||')
        if len(parts) != 2:
            warnings.append(f"{f['name']}: Ugyldig selector for click_then_css (mangler ||)")
            return None
        btn_selector, text_selector = parts
        # 1. Find and click the button
        btn = page.locator(btn_selector).first
        if await btn.count() == 0:
            warnings.append(f"{f['name']}: Fant ikke knappen å klikke på: '{btn_selector}'")
            return None
        await btn.click()
        # 2. Wait 2 seconds so the animation (side panel) can finish
        await asyncio.sleep(2)
        # 3. Read the text
        element = page.locator(text_selector).first
        if await element.count() == 0:
            warnings.append(f"{f['name']}: Fant ikke tekstboksen '{text_selector}' etter klikk")
            return None
        return await element.inner_text()

    warnings.append(f"⚠️ {f['name']}: Ukjent skrapemetode i databasen: '{method}'")
    return None


async def run_daily_scraping():
    """Scrape every configured facility page and update course statuses.

    For each row in ``facilities`` that has a ``scrape_status_url``, the page
    is opened in headless Chromium, the status text is read according to the
    facility's ``scrape_method``, and each of the facility's courses is
    classified with ``interpret_status``. Changed statuses are written to
    ``courses``; changes, warnings and unchanged checks are collected and
    passed to ``send_report`` at the end.

    Errors while handling a single facility are recorded as warnings and do
    not stop the run. Errors outside that handling (e.g. the initial query or
    the browser launch) propagate to the caller, but the browser and the
    database connection are closed first.
    """
    print(f"🚀 Starter sjekk {datetime.now().strftime('%H:%M:%S')}...")
    conn = await asyncpg.connect(DB_URL)
    changes, warnings, successes = [], [], []
    try:
        facilities = await conn.fetch("SELECT id, name, scrape_status_url, scrape_status_selector, scrape_method FROM facilities WHERE scrape_status_url IS NOT NULL")

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context()
                for f in facilities:
                    page = await context.new_page()
                    try:
                        await apply_stealth(page)
                    except Exception:
                        # Stealth is best-effort: scraping continues without it.
                        pass
                    try:
                        print(f"🔍 Besøker {f['name']}...")
                        # Switched from networkidle to domcontentloaded to avoid the Arendal timeout
                        await page.goto(f['scrape_status_url'], timeout=60000, wait_until="domcontentloaded")
                        await asyncio.sleep(3)  # Give JavaScript 3 seconds to build the page

                        full_text = await _read_status_text(page, f, warnings)
                        if full_text is None:
                            continue

                        # Record that this facility's status page was read today.
                        await conn.execute("UPDATE facilities SET status_updated_at = CURRENT_DATE WHERE id = $1", f['id'])
                        courses = await conn.fetch("SELECT id, name, status, scrape_keyword FROM courses WHERE facility_id = $1", f['id'])
                        for c in courses:
                            new_status = interpret_status(full_text, c['scrape_keyword'])
                            if new_status == "NOT_FOUND":
                                warnings.append(f"{f['name']} ({c['name']}): Fant ikke søkeordet '{c['scrape_keyword']}' i teksten på siden.")
                                continue
                            old_status = c['status'] or "ukjent"
                            # An "ukjent" result never overwrites a stored status.
                            if new_status != old_status and new_status != "ukjent":
                                await conn.execute("UPDATE courses SET status = $1 WHERE id = $2", new_status, c['id'])
                                changes.append(f"🔹 {f['name']} ({c['name']}): {old_status.upper()} -> {new_status.upper()}")
                                print(f"✅ Oppdatert status for {f['name']} - {c['name']}")
                            else:
                                successes.append(f"{f['name']} ({c['name']}): {new_status.upper()}")
                                print(f" - {c['name']}: Ingen endring ({new_status.upper()})")
                    except Exception as e:
                        # Keep only the first line of the error message to avoid
                        # huge, ugly text in the email.
                        err_msg = str(e).split('\n')[0]
                        warnings.append(f"🔥 {f['name']}: Feil under skraping: {err_msg}")
                    finally:
                        await page.close()
            finally:
                await browser.close()
    finally:
        await conn.close()

    send_report(changes, warnings, successes)
    print("🏁 Ferdig.")
# Run a single scraping pass when this file is executed directly as a script.
if __name__ == "__main__":
    asyncio.run(run_daily_scraping())