#!/usr/bin/env python3
"""
Crawler headless che simula un browser reale con Playwright per navigare e scaricare
contenuti di un sito (HTML, testo, immagini opzionali) seguendo link anche generati da JS
quando possibile.

Caratteristiche principali:
- Usa Chromium/Firefox/WebKit via Playwright con user agent reale
- Rispettoso di robots.txt (opzione per ignorarlo se consenti esplicitamente)
- Scansione BFS con limite di profondità e numero pagine
- Dominio vincolato (stesso host) o permissivo (allowlist multipla)
- Estrazione link da <a>, mappe <map>/<area>, e data-attributes comuni
- Simulazione umana: attese random, scroll graduali per innescare lazy-loading
- Download immagini opzionale con filtri per tipo e dimensione
- Salvataggio HTML, testo pulito, metadati, e JSON per ogni pagina
- Log strutturati

Requisiti:
  pip install playwright beautifulsoup4 lxml tldextract tqdm
  playwright install

Esempio d'uso:
  python crawler_playwright.py \
    --start https://www.centrosarca.it/ \
    --out ./output_sarca \
    --max-depth 3 --max-pages 500 \
    --same-domain \
    --include-images --image-types jpg,png,webp --image-min-kb 10 --image-max-kb 2048

Nota etica/legale:
- Verifica sempre i Termini d'Uso del sito e rispetta robots.txt (default: rispettato).
- Evita di aggirare misure attive (captcha, paywall). Questo script non aggira protezioni.
- Usa una velocità e frequenza ragionevoli.
"""

import asyncio
import argparse
import hashlib
import json
import os
import random
import re
import time
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional, Set, Dict, List, Tuple

import tldextract
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urldefrag
import urllib.robotparser as robotparser

from playwright.async_api import async_playwright, BrowserContext, Page
from tqdm.asyncio import tqdm_asyncio

# ------------------------------ Utility ---------------------------------

def slugify_path(url: str) -> str:
    """Build a filesystem-friendly identifier for *url*.

    Keeps a readable host/path/query prefix and appends a short SHA-1
    digest of the full URL so distinct URLs never collide on disk.
    """
    parts = urlparse(url)
    path_part = parts.path or "/"
    query_part = f"?{parts.query}" if parts.query else ""
    readable = (parts.netloc + path_part + query_part).rstrip("/")
    digest = hashlib.sha1(url.encode("utf-8")).hexdigest()[:10]
    # Collapse every run of disallowed characters into a single dash.
    cleaned = re.sub(r"[^a-zA-Z0-9/_\-\.]+", "-", readable)
    if cleaned.endswith("-"):
        cleaned = cleaned[:-1]
    return f"{cleaned or 'root'}-{digest}"

def normalize_url(base: str, href: str) -> Optional[str]:
    """Resolve *href* against *base* into an absolute URL.

    Returns None for empty values and for mailto:/tel:/javascript:
    links.  The fragment is dropped and trailing slashes are removed
    (but a bare "scheme://" is never truncated).
    """
    if not href:
        return None
    candidate = href.strip()
    # Skip non-navigable schemes.
    if candidate.startswith(("mailto:", "tel:")) or candidate.lower().startswith("javascript:"):
        return None
    # Join with the base document and drop any #fragment.
    resolved, _ = urldefrag(urljoin(base, candidate))
    # Strip trailing slashes unless the URL is just "scheme://".
    if resolved.endswith("/") and len(resolved) > len(urlparse(resolved).scheme) + 3:
        resolved = resolved.rstrip("/")
    return resolved

@dataclass
class CrawlConfig:
    """Crawl settings plus output-directory bootstrap.

    Instantiating the config also creates the on-disk output layout
    (pages/, images/, logs/) under *out*, so writers never have to
    check for missing directories.
    """
    start: str                                 # start URL
    out: Path                                  # output root directory
    same_domain: bool = True                   # restrict to the start host
    allow_domains: Optional[List[str]] = None  # extra allowed hosts
    max_depth: int = 2                         # BFS depth limit
    max_pages: int = 200                       # total page budget
    headless: bool = True
    obey_robots: bool = True
    include_images: bool = False
    # None means "use the default extension set" (filled in __post_init__);
    # the annotation must be Optional since None is the default.
    image_types: Optional[Set[str]] = None
    image_min_kb: int = 0
    image_max_kb: int = 10_000
    concurrency: int = 4                       # max concurrent page visits
    min_delay: float = 0.5                     # random wait lower bound (s)
    max_delay: float = 2.0                     # random wait upper bound (s)

    def __post_init__(self):
        if self.image_types is None:
            self.image_types = {"jpg", "jpeg", "png", "webp", "gif"}
        # Side effect: prepare the output tree up front.
        self.out.mkdir(parents=True, exist_ok=True)
        (self.out / "pages").mkdir(exist_ok=True)
        (self.out / "images").mkdir(exist_ok=True)
        (self.out / "logs").mkdir(exist_ok=True)

# ------------------------------ Robots ---------------------------------

class Robots:
    """robots.txt gate backed by urllib's RobotFileParser.

    The file is fetched through the Playwright context so it uses the
    same network stack (cookies, proxy, UA) as the crawl itself.
    """

    def __init__(self, user_agent: str, root_url: str):
        self.user_agent = user_agent
        self.root = urlparse(root_url)
        self.parser = robotparser.RobotFileParser()
        self.loaded = False
        # Set when robots.txt could not be fetched/parsed: treat the
        # site as fully allowed ("fail open").
        self.fail_open = False

    async def load(self, context: "BrowserContext"):
        """Fetch and parse robots.txt for the start host.

        A missing or unreachable robots.txt must mean "allow all":
        RobotFileParser.can_fetch() reports False for every URL when it
        was never fed any rules, which would silently block the whole
        crawl, so failures set fail_open instead of leaving the parser
        empty.
        """
        robots_url = f"{self.root.scheme}://{self.root.netloc}/robots.txt"
        try:
            resp = await context.request.get(robots_url, timeout=15000)
            if resp.ok:
                text = await resp.text()
                self.parser.parse(text.splitlines())
            else:
                self.fail_open = True  # no robots.txt -> everything allowed
        except Exception:
            self.fail_open = True  # network failure -> fail open
        self.loaded = True

    def allowed(self, url: str) -> bool:
        """Return True when *url* may be fetched (fail-open on errors)."""
        if not self.loaded or self.fail_open:
            return True
        try:
            return self.parser.can_fetch(self.user_agent, url)
        except Exception:
            return True

# ------------------------------ Estrazione ------------------------------

def extract_links(html: str, base_url: str) -> Set[str]:
    """Collect candidate URLs from anchors, image-map areas, data-href
    attributes, and raw http(s) URLs embedded in inline scripts."""
    soup = BeautifulSoup(html, "lxml")
    found: Set[str] = set()

    def _add(raw_href) -> None:
        resolved = normalize_url(base_url, raw_href)
        if resolved:
            found.add(resolved)

    # <a href> anchors and <area href> elements of HTML image maps.
    for tag in soup.select("a[href], area[href]"):
        _add(tag.get("href"))

    # data-href is a common convention on JS-driven sites.
    for tag in soup.find_all(attrs={"data-href": True}):
        _add(tag.get("data-href"))

    # Crude, conservative scan of inline scripts for absolute URLs
    # (added verbatim, without normalization).
    url_pattern = re.compile(r"https?://[\w\-\.\:/#%\?=&]+")
    for script in soup.find_all("script"):
        for match in url_pattern.finditer(script.string or ""):
            found.add(match.group(0))

    return found

async def human_like_scroll(page: "Page", max_steps: int = 50):
    """Scroll the page downward in small random increments.

    Mimics human reading behaviour and triggers lazy-loaded content.
    *max_steps* bounds the loop: on infinite-scroll pages the document
    height keeps growing after every scroll, so the original unbounded
    `while pos < height` could spin forever.
    """
    height = await page.evaluate("() => document.body.scrollHeight")
    pos = 0
    step = random.randint(300, 700)
    steps = 0
    while pos < height and steps < max_steps:
        pos += step
        await page.mouse.wheel(0, step)
        await asyncio.sleep(random.uniform(0.1, 0.4))
        # Re-measure: lazy-loading may have extended the document.
        height = await page.evaluate("() => document.body.scrollHeight")
        steps += 1

# ------------------------------ Salvataggio ----------------------------

async def save_page(output_dir: Path, url: str, html: str, text: str, meta: Dict):
    """Persist one crawled page as <slug>.html / .txt / .json under pages/.

    The extension is appended to the file name explicitly instead of
    using Path.with_suffix(): slugs preserve dots, so with_suffix()
    would treat everything after the last dot as a suffix and replace
    it (e.g. ".../index.html-<hash>" -> ".../index.html"), dropping the
    collision-avoiding hash and overwriting unrelated pages.
    """
    slug = slugify_path(url)
    base = output_dir / "pages" / slug
    base.parent.mkdir(parents=True, exist_ok=True)
    (base.parent / (base.name + ".html")).write_text(html, encoding='utf-8')
    (base.parent / (base.name + ".txt")).write_text(text, encoding='utf-8')
    meta_json = json.dumps(meta, ensure_ascii=False, indent=2)
    (base.parent / (base.name + ".json")).write_text(meta_json, encoding='utf-8')

async def download_image(context: "BrowserContext", out_dir: Path, url: str, cfg: CrawlConfig) -> Optional[str]:
    """Download *url* into out_dir/images/ when it passes all filters.

    Filters: the URL extension must be in cfg.image_types, the response
    must carry an image/* content type, and the payload size must fall
    within [image_min_kb, image_max_kb].  Returns the saved path as a
    string, or None when the image is skipped or the download fails.
    """
    # Cheap pre-filter on the URL extension before touching the network.
    ext = urlparse(url).path.split('.')[-1].lower()
    if ext not in cfg.image_types:
        return None
    try:
        resp = await context.request.get(url, timeout=20000)
        if not resp.ok:
            return None
        ctype = resp.headers.get("content-type", "")
        if not ctype.startswith("image/"):
            return None
        b = await resp.body()
        kb = len(b) / 1024
        if kb < cfg.image_min_kb or kb > cfg.image_max_kb:
            return None
        name = slugify_path(url) + f".{ext}"
        path = out_dir / "images" / name
        # Slugs keep "/" characters, so the target may be nested below
        # images/; create the intermediate directories, otherwise
        # write_bytes raises and the broad except silently drops every
        # image with a nested path.
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_bytes(b)
        return str(path)
    except Exception:
        return None

# ------------------------------ Crawler --------------------------------

class Crawler:
    """Breadth-first crawler driven by a Playwright Chromium context."""

    def __init__(self, cfg: CrawlConfig):
        self.cfg = cfg
        self.visited: Set[str] = set()                      # URLs already scheduled
        self.queue: List[Tuple[str, int]] = [(cfg.start, 0)]  # (url, depth) frontier
        self.allowed_domains = self._build_allowed_domains()
        self.robots: Optional[Robots] = None
        self.extracted_data: List[Dict] = []                # per-page index entries

    def _build_allowed_domains(self) -> Set[str]:
        """Start host (when same_domain) plus any explicit allowlist."""
        allowed: Set[str] = set()
        start_host = urlparse(self.cfg.start).netloc
        if self.cfg.same_domain:
            allowed.add(start_host)
        if self.cfg.allow_domains:
            allowed.update(self.cfg.allow_domains)
        return allowed

    def _is_allowed_domain(self, url: str) -> bool:
        """True when url's registrable domain matches an allowed one.

        Subdomains of the same registrable domain are accepted.
        """
        host = urlparse(url).netloc
        if not self.allowed_domains:
            return True
        ext_host = tldextract.extract(host)
        for dom in self.allowed_domains:
            ext_dom = tldextract.extract(dom)
            if (ext_host.domain == ext_dom.domain and ext_host.suffix == ext_dom.suffix):
                return True
        return False

    async def run(self):
        """Launch the browser and crawl the frontier breadth-first.

        The crawl proceeds in waves: each wave drains the current queue
        into concurrent visit tasks, awaits them, then re-reads the
        queue those visits extended.  (A single non-awaiting drain of
        the queue would stop after the start page, because workers only
        enqueue new links after the drain loop has already exited.)
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=self.cfg.headless, args=["--disable-blink-features=AutomationControlled"])  # realistic
            context = await browser.new_context(
                user_agent=None,  # let Playwright pick a realistic UA
                viewport={"width": 1366, "height": 900},
                java_script_enabled=True,
                bypass_csp=True,
            )
            # BrowserContext exposes no user_agent() accessor in the
            # Playwright API; read the effective UA from a throwaway page.
            ua_page = await context.new_page()
            try:
                user_agent = await ua_page.evaluate("() => navigator.userAgent")
            finally:
                await ua_page.close()
            self.robots = Robots(user_agent, self.cfg.start)
            if self.cfg.obey_robots:
                await self.robots.load(context)

            sem = asyncio.Semaphore(self.cfg.concurrency)

            async def worker(url: str, depth: int):
                async with sem:
                    await self._visit(context, url, depth)

            pbar_total = min(self.cfg.max_pages, 10_000)
            with tqdm_asyncio(total=pbar_total, desc="Pagine") as pbar:
                while self.queue and len(self.visited) < self.cfg.max_pages:
                    # Drain the current frontier into one wave of tasks.
                    tasks = []
                    while self.queue and len(self.visited) < self.cfg.max_pages:
                        url, depth = self.queue.pop(0)
                        if url in self.visited:
                            continue
                        if depth > self.cfg.max_depth:
                            continue
                        if not self._is_allowed_domain(url):
                            continue
                        if self.cfg.obey_robots and not self.robots.allowed(url):
                            continue
                        self.visited.add(url)
                        tasks.append(asyncio.create_task(worker(url, depth)))
                        pbar.update(1)
                    if not tasks:
                        break  # every queued URL was filtered out
                    # Wait for the wave; workers extend self.queue with
                    # newly discovered links for the next iteration.
                    await tqdm_asyncio.gather(*tasks)

            await browser.close()

            # Write the aggregate crawl index.
            (self.cfg.out / "crawl_index.json").write_text(
                json.dumps(self.extracted_data, ensure_ascii=False, indent=2), encoding='utf-8'
            )

    async def _visit(self, context: BrowserContext, url: str, depth: int):
        """Load one page, persist it, and enqueue discovered links."""
        page = await context.new_page()
        try:
            # Random pacing between page loads (politeness / human-like).
            await asyncio.sleep(random.uniform(self.cfg.min_delay, self.cfg.max_delay))
            resp = await page.goto(url, wait_until="networkidle", timeout=45000)
            if not resp or not (200 <= resp.status < 400):
                return  # the finally block closes the page

            # Scroll to trigger lazy-loading before capturing content.
            await human_like_scroll(page)
            await asyncio.sleep(random.uniform(0.2, 0.6))

            html = await page.content()
            text = await page.evaluate("() => document.body.innerText")
            title = await page.title()

            meta = {
                "url": url,
                "depth": depth,
                "status": resp.status if resp else None,
                "title": title,
                "timestamp": int(time.time()),
            }

            await save_page(self.cfg.out, url, html, text, meta)

            # Extract and schedule new links from the rendered HTML.
            links = extract_links(html, url)
            next_links: List[Tuple[str, int]] = []
            for u in links:
                if u not in self.visited and self._is_allowed_domain(u):
                    next_links.append((u, depth + 1))

            # Light-touch discovery of JS-only navigation: click visible
            # link/button candidates and record where any popup lands.
            js_links = await self._discover_js_navigation(page)
            for u in js_links:
                if u not in self.visited and self._is_allowed_domain(u):
                    next_links.append((u, depth + 1))

            # Optionally download the images visible on the page.
            images_saved = []
            if self.cfg.include_images:
                imgs = await page.eval_on_selector_all("img", "els => els.map(e => e.currentSrc || e.src || '')")
                imgs = [normalize_url(url, src) for src in imgs if src]
                imgs = [u for u in imgs if u]
                for img_url in set(imgs):
                    path = await download_image(context, self.cfg.out, img_url, self.cfg)
                    if path:
                        images_saved.append(path)

            # Record this page in the aggregate index.
            self.extracted_data.append({
                **meta,
                "out_paths": {
                    "html": slugify_path(url) + ".html",
                    "text": slugify_path(url) + ".txt",
                    "meta": slugify_path(url) + ".json",
                },
                "discovered_links": sorted(list(links))[:200],
                "images_saved": images_saved,
            })

            # Extend the global BFS frontier (consumed by run()).
            self.queue.extend(next_links)

        except Exception as e:
            # Close the handle deterministically instead of leaking it.
            with (self.cfg.out / "logs" / "errors.log").open("a", encoding="utf-8") as log:
                log.write(f"{url}\t{repr(e)}\n")
        finally:
            await page.close()

    async def _discover_js_navigation(self, page: Page) -> Set[str]:
        """Try to trigger JS-dependent navigations without risky actions.

        Limits itself to clicks on visible link/button-like elements,
        performed as Control-clicks so any navigation opens in a popup
        that is inspected for its destination URL and then closed,
        leaving the current page's state untouched.
        """
        urls: Set[str] = set()
        # Candidate elements that commonly navigate.
        candidates = await page.query_selector_all("a, button, [role=button], [role=link]")
        for el in candidates[:20]:  # conservative cap for performance
            try:
                box = await el.bounding_box()
                if not box or box["width"] < 24 or box["height"] < 16:
                    continue
                # Must be visible to be clickable by a human.
                if not await el.is_visible():
                    continue

                # Short timeout: most candidates never open a popup, and
                # the default 30s wait per element would stall the crawl.
                async with page.expect_popup(timeout=3000) as popup_info:
                    # Ctrl-click opens in a new tab where possible, avoiding
                    # side effects on the current page (Playwright maps the
                    # modifier appropriately on macOS).
                    await el.click(modifiers=["Control"])
                new_page = await popup_info.value
                try:
                    await new_page.wait_for_load_state("domcontentloaded", timeout=8000)
                    dest = new_page.url
                    if dest and dest != page.url:
                        urls.add(dest)
                finally:
                    await new_page.close()
            except Exception:
                # Skip problematic elements silently.
                continue
        return urls

# ------------------------------ Main -----------------------------------

def parse_args(argv: Optional[List[str]] = None) -> CrawlConfig:
    """Build a CrawlConfig from command-line arguments.

    Args:
        argv: optional explicit argument list (defaults to sys.argv[1:]);
            mainly useful for testing.

    Returns:
        A fully-initialized CrawlConfig (its construction also creates
        the output directory tree).
    """
    ap = argparse.ArgumentParser(description="Crawler Playwright che simula navigazione browser")
    ap.add_argument('--start', required=True, help='URL di partenza')
    ap.add_argument('--out', required=True, help='Directory di output')
    ap.add_argument('--same-domain', action='store_true', help='Limita al dominio di partenza (consigliato)')
    ap.add_argument('--allow-domains', default='', help='Domini aggiuntivi consentiti, separati da virgola')
    ap.add_argument('--max-depth', type=int, default=2, help='Profondità massima (default 2)')
    ap.add_argument('--max-pages', type=int, default=200, help='Numero massimo di pagine (default 200)')
    # default=True so headless really is the default, as the help text
    # promises; a plain store_true would default to False and silently
    # launch a visible browser when neither flag is given.
    ap.add_argument('--headless', action='store_true', default=True, help='Esegui browser headless (default)')
    ap.add_argument('--headed', dest='headless', action='store_false', help='Mostra il browser durante il crawl')
    ap.add_argument('--ignore-robots', action='store_true', help='Ignora robots.txt (non consigliato)')
    ap.add_argument('--include-images', action='store_true', help='Scarica immagini collegate e visibili')
    ap.add_argument('--image-types', default='jpg,jpeg,png,webp,gif', help='Tipi immagine consentiti (estensioni)')
    ap.add_argument('--image-min-kb', type=int, default=0, help='Dimensione minima immagine in KB')
    ap.add_argument('--image-max-kb', type=int, default=10_000, help='Dimensione massima immagine in KB')
    ap.add_argument('--concurrency', type=int, default=4, help='Numero massimo di pagine in parallelo')
    ap.add_argument('--delay-range', default='0.5,2.0', help='Intervallo attese random tra azioni (sec) es. 0.5,2.5')

    args = ap.parse_args(argv)

    allow_domains = [d.strip() for d in args.allow_domains.split(',') if d.strip()] or None
    # Accept "1.5" as shorthand for "1.5,1.5"; extra values are ignored.
    delays = [float(x) for x in args.delay_range.split(',') if x.strip()][:2]
    if len(delays) == 1:
        delays.append(delays[0])
    min_d, max_d = delays

    cfg = CrawlConfig(
        start=args.start,
        out=Path(args.out),
        same_domain=args.same_domain,
        allow_domains=allow_domains,
        max_depth=args.max_depth,
        max_pages=args.max_pages,
        headless=args.headless,
        obey_robots=not args.ignore_robots,
        include_images=args.include_images,
        image_types=set(x.strip().lower() for x in args.image_types.split(',')),
        image_min_kb=args.image_min_kb,
        image_max_kb=args.image_max_kb,
        concurrency=max(1, args.concurrency),
        min_delay=min_d,
        max_delay=max_d,
    )
    return cfg

async def main_async(cfg: CrawlConfig):
    """Entry coroutine: build the crawler and run it to completion."""
    await Crawler(cfg).run()

if __name__ == '__main__':
    config = parse_args()
    try:
        asyncio.run(main_async(config))
    except KeyboardInterrupt:
        # Graceful exit on Ctrl-C.
        print("Interrotto dall'utente.")
