130 lines
No EOL
5.5 KiB
Python
130 lines
No EOL
5.5 KiB
Python
import asyncio
|
|
import os
|
|
import asyncpg
|
|
import smtplib
|
|
import re # Ny import for tekst-vasking
|
|
from datetime import datetime
|
|
from email.mime.text import MIMEText
|
|
from email.mime.multipart import MIMEMultipart
|
|
from playwright.async_api import async_playwright
|
|
try:
|
|
from playwright_stealth import stealth_async as apply_stealth
|
|
except ImportError:
|
|
from playwright_stealth import stealth as apply_stealth
|
|
|
|
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
|
from dotenv import load_dotenv
|
|
|
|
load_dotenv()
|
|
|
|
# Connection string for the TeeOff PostgreSQL database.
# Read from the DB_URL environment variable (load_dotenv() above has already
# merged any .env file into the environment) so that credentials do not have
# to live in source control. The previous hard-coded value is kept as the
# default so existing deployments keep working unchanged.
DB_URL = os.getenv(
    "DB_URL",
    "postgresql://teeoff_admin:teeoff_secret_password@db:5432/teeoff",
)
|
|
|
|
def clean_text(text):
    """Strip special characters and normalise text for comparison.

    Every character that is not an ASCII letter, a digit or one of the
    Norwegian letters æ/ø/å (either case) is removed, and what remains is
    lower-cased.
    """
    only_alnum = re.sub(r'[^a-zA-Z0-9æøåÆØÅ]', '', text)
    return only_alnum.lower()
|
|
|
|
def interpret_status(text, keyword=None):
    """Classify scraped status text into a course status code.

    Args:
        text: Raw text scraped from the facility's status element.
        keyword: Optional per-course search word. When given, only the text
            following the keyword is classified.

    Returns:
        "NOT_FOUND" if a keyword was given but does not occur in the text
        (compared after stripping punctuation and whitespace), otherwise one
        of "stengt", "aapen_med_vintergreener", "aapner_snart", "aapen" or
        "ukjent". The checks run in that order, so a closed marker wins over
        an open marker.
    """
    t_raw = text.lower()

    if keyword:
        # Fuzzy match: strip hyphens/whitespace from both the text and the
        # keyword before comparing.
        k_clean = clean_text(keyword)

        # If the keyword is missing even in cleaned form, give up.
        if k_clean not in clean_text(t_raw):
            return "NOT_FOUND"

        # Otherwise try to isolate the text right after the literal keyword.
        parts = re.split(re.escape(keyword), t_raw, flags=re.IGNORECASE)
        if len(parts) > 1:
            # Text between the first and the next occurrence, capped at 150 chars.
            t_raw = parts[1][:150]
        else:
            # Fallback when the literal keyword is broken up in the text
            # (e.g. by markup): classify the tail of the text instead.
            t_raw = t_raw[-200:]

    # These words are matched as substrings on purpose so that compounds
    # such as "vinterstengt" are still recognised.
    closed_words = ("stengt", "lukket", "frost", "snø", "closed", "stenger")
    # "is" (ice) must be matched as a whole word: as a substring it also hits
    # unrelated words like "pris" or "vises" and wrongly reports "stengt".
    # NOTE: the English verb "is" still matches; the word list mixes languages.
    if any(word in t_raw for word in closed_words) or re.search(r"\bis\b", t_raw):
        return "stengt"
    if any(word in t_raw for word in ("vintergreen", "vintergrønn", "vinter")):
        return "aapen_med_vintergreener"
    if any(word in t_raw for word in ("snart", "åpner kl")):
        return "aapner_snart"
    if any(word in t_raw for word in ("åpen", "åpent", "aapen", "open")):
        return "aapen"
    return "ukjent"
|
|
|
|
def send_report(changes, warnings):
    """E-mail a plain-text summary of status changes and warnings.

    Does nothing when both lists are empty. SMTP settings are read from the
    environment variables SMTP_SERVER, SMTP_PORT, SMTP_USER, SMTP_PASS and
    EMAIL_TO. Delivery is best-effort: any failure while connecting or
    sending is printed instead of raised.

    Args:
        changes: Lines describing status updates that were applied.
        warnings: Lines describing problems met while scraping.
    """
    if not (changes or warnings):
        return

    today = datetime.now().strftime('%d.%m.%Y')
    subject = f"TeeOff Banestatus Rapport - {today}"

    # Assemble the body piece by piece: header first, then optional sections.
    sections = ["BANESTATUS RAPPORT\n", "=" * 30, "\n\n"]
    if changes:
        sections.append("✅ OPPDATERINGER:\n" + "\n".join(changes) + "\n\n")
    if warnings:
        sections.append("⚠️ MERKNADER / ADVARSLER:\n" + "\n".join(warnings) + "\n")
    body = "".join(sections)

    msg = MIMEMultipart()
    msg['From'] = os.getenv("SMTP_USER")
    msg['To'] = os.getenv("EMAIL_TO")
    msg['Subject'] = subject
    msg.attach(MIMEText(body, 'plain'))

    try:
        # The port conversion stays inside the try so a missing SMTP_PORT is
        # reported like any other delivery failure.
        host = os.getenv("SMTP_SERVER")
        port = int(os.getenv("SMTP_PORT"))
        with smtplib.SMTP_SSL(host, port) as server:
            server.login(os.getenv("SMTP_USER"), os.getenv("SMTP_PASS"))
            server.send_message(msg)
        print("✅ Rapport sendt på e-post.")
    except Exception as e:
        print(f"❌ E-post feil: {e}")
|
|
|
|
async def run_daily_scraping():
    """Scrape each facility's status page and sync course statuses to the DB.

    For every facility with a scrape URL, the page is opened in headless
    Chromium, the configured selector's text is read, and each of the
    facility's courses is classified with interpret_status(). A course row is
    updated only when the new status differs from the stored one and is not
    "ukjent". Applied changes and warnings are collected and handed to
    send_report() at the end.

    Errors for a single facility are recorded as warnings and do not stop the
    run. Errors outside the per-facility handling (DB connect/query, browser
    launch, opening a page) propagate to the caller, but the browser and the
    database connection are closed first.
    """
    print(f"🚀 Starter sjekk {datetime.now().strftime('%H:%M:%S')}...")
    conn = await asyncpg.connect(DB_URL)
    changes, warnings = [], []

    # try/finally so the DB connection is released even if scraping fails.
    try:
        facilities = await conn.fetch("SELECT id, name, scrape_status_url, scrape_status_selector FROM facilities WHERE scrape_status_url IS NOT NULL")

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            # Same for the browser: always close it, whatever happens below.
            try:
                context = await browser.new_context()

                for f in facilities:
                    page = await context.new_page()
                    try:
                        await apply_stealth(page)
                    except Exception:
                        # Stealth is best-effort; scraping continues without
                        # it. Only Exception is caught so task cancellation
                        # (a BaseException) is not swallowed.
                        pass

                    try:
                        print(f"🔍 Besøker {f['name']}...")
                        await page.goto(f['scrape_status_url'], timeout=60000, wait_until="networkidle")

                        # Give late-loading content a moment to land.
                        await asyncio.sleep(3)

                        element = await page.query_selector(f['scrape_status_selector'])
                        if not element:
                            warnings.append(f"❌ {f['name']}: Fant ikke elementet '{f['scrape_status_selector']}'")
                            continue

                        full_text = await element.inner_text()
                        await conn.execute("UPDATE facilities SET status_updated_at = CURRENT_DATE WHERE id = $1", f['id'])

                        courses = await conn.fetch("SELECT id, name, status, scrape_keyword FROM courses WHERE facility_id = $1", f['id'])
                        for c in courses:
                            new_status = interpret_status(full_text, c['scrape_keyword'])

                            if new_status == "NOT_FOUND":
                                warnings.append(f"❓ {f['name']} ({c['name']}): Fant ikke søkeordet '{c['scrape_keyword']}' på siden.")
                                continue

                            old_status = c['status'] or "ukjent"
                            # Never overwrite a known status with "ukjent".
                            if new_status != old_status and new_status != "ukjent":
                                await conn.execute("UPDATE courses SET status = $1 WHERE id = $2", new_status, c['id'])
                                changes.append(f"🔹 {f['name']} ({c['name']}): {old_status.upper()} ➔ {new_status.upper()}")
                                print(f"✅ Oppdatert status for {f['name']} - {c['name']}")
                            else:
                                print(f" - {c['name']}: Ingen endring ({new_status.upper()})")

                    except Exception as e:
                        # One failing facility must not abort the whole run.
                        warnings.append(f"🔥 {f['name']}: Feil: {str(e)[:100]}")
                    finally:
                        await page.close()
            finally:
                await browser.close()
    finally:
        await conn.close()

    send_report(changes, warnings)
    print("🏁 Ferdig.")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: perform a single scraping pass and exit.
    asyncio.run(run_daily_scraping())