#!/usr/bin/env python3
"""
Twitter Feed Processor (INCREMENTAL-SAFE)

Splits raw feed into tweets with/without images and downloads images incrementally.

Key guarantees:
- Never wipes existing image_description if already present in with_images.csv
- Never re-downloads images if a matching file already exists in images/
- Rebuilds manifest.json deterministically from the current raw_feed.csv + images folder state
- Safe to run multiple times per day

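Usage:
  twitter-process [DATE | ARCHIVE_DIR]

  DATE is YYYY-MM-DD and defaults to today; if the argument is an existing
  directory, it is used directly as the archive directory.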
"""

import sys
from pathlib import Path

# Add venv to path BEFORE other imports
# The venv's site-packages directory is placed first on sys.path, presumably so
# the third-party imports below (requests, urllib3, filetype) resolve from it
# without the venv being activated.
# NOTE(review): the "python3.12" segment is hard-coded — confirm it matches the
# interpreter version the venv was created with.
venv_path = Path(__file__).parent.parent / "venv" / "lib" / "python3.12" / "site-packages"
sys.path.insert(0, str(venv_path))

import os
import csv
import json
import argparse
import re
from datetime import date
from typing import Dict, List, Optional, Tuple

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import filetype

# Parent of the directory containing this script; default archive paths are
# resolved relative to it.
BASE_DIR = Path(__file__).parent.parent


def create_session_with_retries():
    """Return a ``requests.Session`` that retries GET requests.

    Up to 3 retries with a backoff factor of 1 are made for HTTP 429, 500,
    502, 503 and 504 responses. The same retrying adapter is mounted for both
    ``http://`` and ``https://``, and every request carries the pipeline's
    User-Agent header.
    """
    retry_policy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=["GET"],
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    http = requests.Session()
    for scheme in ("http://", "https://"):
        http.mount(scheme, retrying_adapter)
    http.headers.update({"User-Agent": "twitter-pipeline/1.0"})
    return http


def sanitize_filename(s: str) -> str:
    """Turn a username into a short token that is safe inside a file name.

    ``@`` characters are removed and surrounding whitespace is trimmed; every
    remaining character that is not a word character or ``-`` becomes ``_``.
    The result is cut to 30 characters. ``None`` or a value that ends up empty
    yields ``"unknown"``.
    """
    text = s if s else ""
    text = text.replace("@", "").strip()
    safe = re.sub(r"[^\w\-]", "_", text)
    shortened = safe[:30]
    if not shortened:
        return "unknown"
    return shortened


def detect_extension(content: bytes) -> str:
    """Return a dotted file extension for ``content``.

    The type is guessed by ``filetype.guess``; when it reports no match the
    extension falls back to ``".jpg"``.
    """
    guessed = filetype.guess(content)
    return f".{guessed.extension}" if guessed else ".jpg"


def download_image(session: requests.Session, url: str, filepath_base: Path) -> Tuple[bool, Optional[Path], Optional[str]]:
    """Download ``url`` and store it next to ``filepath_base`` with a detected extension.

    The payload is first written to a hidden temporary file and then renamed
    into place, so the final image path only ever holds a complete download.
    This matters because later runs skip any image whose file already exists.

    Args:
        session: HTTP session used for the GET request (30 second timeout).
        url: Image URL to fetch.
        filepath_base: Destination path without extension; the extension is
            chosen from the downloaded bytes via ``detect_extension``.

    Returns:
        ``(True, final_path, None)`` on success, or
        ``(False, None, "<ExceptionType>: <message>")`` on any failure.
        Exceptions derived from ``Exception`` are reported via the return
        value rather than raised.
    """
    tmp_path: Optional[Path] = None
    try:
        response = session.get(url, timeout=30)
        response.raise_for_status()
        content = response.content
        if not content:
            # A 0-byte file would be treated as "already downloaded" forever.
            raise ValueError("empty response body")

        ext = detect_extension(content)
        filepath = filepath_base.with_suffix(ext)

        # The leading dot keeps a leftover temp file from matching a
        # "<base>.*" lookup for existing images.
        tmp_path = filepath.with_name(f".{filepath.name}.part")
        with open(tmp_path, "wb") as f:
            f.write(content)
        tmp_path.replace(filepath)

        return True, filepath, None
    except Exception as e:
        # Boundary handler: the caller records the failure and moves on.
        if tmp_path is not None:
            try:
                tmp_path.unlink(missing_ok=True)
            except OSError:
                # Best-effort cleanup; the original error is what gets reported.
                pass
        return False, None, f"{type(e).__name__}: {e}"


def write_csv(path: Path, fieldnames: List[str], rows: List[Dict[str, str]]) -> None:
    """Write ``rows`` to ``path`` as a UTF-8 CSV file, replacing it atomically.

    The data is written to a temporary sibling file and renamed over ``path``
    only after every row has been written. If writing fails, the previous
    contents of ``path`` are left untouched and the temporary file is removed.

    Args:
        path: Destination file; parent directories are created if missing.
        fieldnames: Column order, also written as the header row.
        rows: One dict per CSV row.

    Raises:
        ValueError: If a row contains a key not listed in ``fieldnames``
            (``csv.DictWriter`` default behaviour).
        OSError: If the file cannot be written or replaced.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = path.with_name(f"{path.name}.tmp")
    try:
        with open(tmp_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_MINIMAL)
            writer.writeheader()
            writer.writerows(rows)
        tmp_path.replace(path)
    finally:
        # After a successful replace the temp file no longer exists, so this
        # only removes something when the write failed part-way.
        tmp_path.unlink(missing_ok=True)


def read_csv_rows(path: Path) -> List[Dict[str, str]]:
    """Load every row of a CSV file as a dict keyed by the header row.

    The file is decoded as ``utf-8-sig``, so a leading byte-order mark is
    dropped. A path that does not exist yields an empty list.
    """
    if path.exists():
        with path.open("r", newline="", encoding="utf-8-sig") as handle:
            return list(csv.DictReader(handle))
    return []


def build_existing_with_images_map(with_images_path: Path) -> Dict[str, Dict[str, str]]:
    """Index the rows of an existing with_images CSV by tweet id.

    Returns a mapping ``tweet_id -> row``. Rows whose stripped ``tweet_id`` is
    not made up of digits are left out; when an id appears more than once the
    last row wins. A missing file produces an empty mapping.
    """
    keyed_rows = (
        ((record.get("tweet_id") or "").strip(), record)
        for record in read_csv_rows(with_images_path)
    )
    return {key: record for key, record in keyed_rows if key.isdigit()}


def find_existing_image_file(images_dir: Path, filename_base: str) -> Optional[str]:
    """Find an already-downloaded image for ``filename_base``, whatever its extension.

    Given a base like ``'user-tweetid-123-image1'``, looks in ``images_dir``
    for a regular file named ``<filename_base>.<anything>``. The base is
    matched literally, so characters such as ``[`` or ``*`` in it are not
    interpreted as wildcards.

    Returns:
        The file name (not the full path) of the alphabetically first match,
        or ``None`` if there is no match or ``images_dir`` is not a directory.
    """
    if not images_dir.is_dir():
        return None
    prefix = f"{filename_base}."
    candidates = (
        entry.name
        for entry in images_dir.iterdir()
        # Name test first so only matching entries are stat'ed.
        if entry.name.startswith(prefix) and entry.is_file()
    )
    return min(candidates, default=None)


def _manifest_entry(file_name: str, tweet_id: str, row: Dict[str, str], image_num: int, url: str) -> Dict[str, object]:
    """Build the manifest.json record for one image file of one tweet."""
    return {
        "file": file_name,
        "tweet_id": tweet_id,
        "username": row.get("username", ""),
        "image_num": image_num,
        "url": url,
    }


def _collect_tweet_images(
    session: "requests.Session",
    images_dir: Path,
    row: Dict[str, str],
    tweet_id: str,
) -> Tuple[List[str], List[Dict[str, object]], List[Dict[str, str]]]:
    """Resolve each image URL of one tweet to a local file, downloading only missing ones.

    Returns ``(file_names, manifest_entries, failures)`` for this tweet.
    """
    # URLs are "|"-separated. Each one is stripped so stray whitespace around
    # a separator never ends up in the HTTP request or the manifest.
    urls = [u.strip() for u in (row.get("image_urls") or "").split("|") if u.strip()]
    username = sanitize_filename(row.get("username", "unknown"))

    file_names: List[str] = []
    entries: List[Dict[str, object]] = []
    failures: List[Dict[str, str]] = []

    for idx, url in enumerate(urls, 1):
        filename_base = f"{username}-tweetid-{tweet_id}-image{idx}"

        # If file already exists, do not download again
        existing_file = find_existing_image_file(images_dir, filename_base)
        if existing_file:
            file_names.append(existing_file)
            entries.append(_manifest_entry(existing_file, tweet_id, row, idx, url))
            continue

        # Download if missing
        print(f"  Downloading {filename_base}...")
        success, final_path, error = download_image(session, url, images_dir / filename_base)
        if success and final_path:
            file_names.append(final_path.name)
            entries.append(_manifest_entry(final_path.name, tweet_id, row, idx, url))
        else:
            print(f"    Failed: {error}")
            failures.append({"tweet_id": tweet_id, "url": url, "error": error or "unknown"})

    return file_names, entries, failures


def process_feed(archive_dir: Path) -> None:
    """Split ``raw_feed.csv`` into image/no-image CSVs and fetch missing images.

    Reads ``archive_dir / "raw_feed.csv"`` and writes, in the same directory:

    - ``no_images.csv``: tweets whose ``image_count`` is missing, unparsable or
      not positive;
    - ``with_images.csv``: the remaining tweets, with ``downloaded_images`` set
      to the ``|``-joined local file names and ``image_description`` carried
      over from the previous ``with_images.csv`` when that tweet was present;
    - ``images/``: one file per image URL, downloaded only if no file with the
      same base name exists yet;
    - ``manifest.json``: counts plus the per-image and per-failure records.

    Rows whose ``tweet_id`` is not all digits are skipped with a message.

    Args:
        archive_dir: Directory holding ``raw_feed.csv``; outputs go here too.

    Raises:
        SystemExit: With status 1 when ``raw_feed.csv`` does not exist.
    """
    raw_file = archive_dir / "raw_feed.csv"
    if not raw_file.exists():
        print(f"Error: {raw_file} not found")
        sys.exit(1)

    images_dir = archive_dir / "images"
    images_dir.mkdir(parents=True, exist_ok=True)

    # Existing with_images.csv map (to preserve image_description)
    with_images_file = archive_dir / "with_images.csv"
    existing_map = build_existing_with_images_map(with_images_file)

    no_images: List[Dict[str, str]] = []
    with_images: List[Dict[str, str]] = []
    manifest_images: List[Dict[str, object]] = []
    download_failures: List[Dict[str, str]] = []

    rows = read_csv_rows(raw_file)
    print(f"Processing {len(rows)} tweets...")

    base_fields = [
        "tweet_id", "tweet_url", "name", "username", "content",
        "likes", "retweets", "replies", "image_count", "image_urls", "has_video",
        "created_at"
    ]
    no_images_fields = base_fields + ["media_note"]
    with_images_fields = base_fields + ["downloaded_images", "image_description", "media_note"]

    session = create_session_with_retries()
    try:
        for row in rows:
            tweet_id = (row.get("tweet_id") or "").strip()
            if not tweet_id.isdigit():
                print(f"  Skipping invalid tweet_id: {tweet_id}")
                continue

            # Parse image_count safely: anything unparsable counts as "no images".
            try:
                image_count = int(row.get("image_count") or 0)
            except (TypeError, ValueError):
                image_count = 0

            has_video = row.get("has_video") == "True"

            if image_count <= 0:
                row["media_note"] = "[Video - not downloaded]" if has_video else ""
                no_images.append(row)
                continue

            file_names, entries, failures = _collect_tweet_images(session, images_dir, row, tweet_id)
            manifest_images.extend(entries)
            download_failures.extend(failures)

            row["downloaded_images"] = "|".join(file_names)

            # Preserve existing image_description if present
            prev = existing_map.get(tweet_id, {})
            # May be empty; the original comment says a later "apply script" fills it in.
            row["image_description"] = (prev.get("image_description") or "").strip()

            row["media_note"] = "[Also has video - not downloaded]" if has_video else ""
            with_images.append(row)
    finally:
        # Release pooled connections even if processing aborts part-way.
        session.close()

    # Write CSVs always
    write_csv(archive_dir / "no_images.csv", no_images_fields, no_images)
    write_csv(with_images_file, with_images_fields, with_images)

    manifest = {
        "date": archive_dir.name,
        "total_tweets": len(rows),
        "tweets_without_images": len(no_images),
        "tweets_with_images": len(with_images),
        "total_images": len(manifest_images),
        "download_failures": len(download_failures),
        "images": manifest_images,
        "failures": download_failures
    }

    with open(archive_dir / "manifest.json", "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)

    print(f"Done! Output in {archive_dir}")
    print(f"  - no_images.csv: {len(no_images)}")
    print(f"  - with_images.csv: {len(with_images)}")
    print(f"  - images listed in manifest: {len(manifest_images)}")
    if download_failures:
        print(f"  - failures: {len(download_failures)}")


def main():
    """Command-line entry point: pick the archive directory and process it.

    The optional positional argument is used as the archive directory itself
    when it names an existing directory; otherwise it is treated as a folder
    name under ``BASE_DIR/PHASES/PHASE-01-INGEST/tweets/archive``. With no
    argument, today's ISO date is used as that folder name.
    """
    cli = argparse.ArgumentParser(description="Process Twitter feed (incremental-safe)")
    cli.add_argument("date", nargs="?", default=None, help="Date (YYYY-MM-DD) or path to archive dir")
    target = cli.parse_args().date

    if target and os.path.isdir(target):
        archive_dir = Path(target)
    else:
        archive_root = BASE_DIR / "PHASES" / "PHASE-01-INGEST" / "tweets" / "archive"
        archive_dir = archive_root / (target or date.today().isoformat())

    print("Twitter Feed Processor (incremental)")
    print(f"  Archive: {archive_dir}")
    process_feed(archive_dir)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
