#!/usr/bin/env python3
"""
Twitter Feed Scraper CLI

Usage:
    twitter-scraper feed <batches> [--delay MIN-MAX] [--output FILE]

Examples:
    twitter-scraper feed 5 --delay 30-60
    twitter-scraper feed 10 --delay 45-90 --output custom.csv
"""

import sys
import os
import asyncio
import argparse
import csv
import random
import time
from datetime import date, timezone
from pathlib import Path

# Make the project root (the parent of this script's directory) importable
# by putting it at the front of sys.path.
sys.path.insert(0, str(Path(__file__).parent.parent))

# Not a real venv activation: this only prepends the project venv's
# site-packages directory to sys.path so packages installed there can be
# imported. NOTE(review): the interpreter version ("python3.12") is
# hard-coded in the path; a venv created with another Python version will
# not be picked up -- confirm this matches the deployment environment.
venv_path = Path(__file__).parent.parent / "venv" / "lib" / "python3.12" / "site-packages"
sys.path.insert(0, str(venv_path))

from twikit import Client
from twikit.errors import TooManyRequests

# Project root: the parent of the directory that contains this script.
# Also used by main() to build the default output path.
BASE_DIR = Path(__file__).parent.parent
# Cookie file handed to the twikit client's load_cookies() in scrape_feed().
COOKIES_FILE = BASE_DIR / "PHASES" / "PHASE-01-INGEST" / "cookies.json"


def parse_delay(delay_str):
    """Parse a delay specification into a ``(min, max)`` tuple of seconds.

    Two forms are accepted:

    * a single integer such as ``"45"``, which yields equal bounds, and
    * a ``MIN-MAX`` range such as ``"30-60"``.

    Whitespace around the numbers is ignored. Reversed bounds
    (``"60-30"``) are swapped so the result is always safe to pass to
    ``random.randint``.

    Args:
        delay_str: The delay text, typically the ``--delay`` CLI value.

    Returns:
        A ``(min_seconds, max_seconds)`` tuple with ``min <= max``.

    Raises:
        ValueError: If the text is not ``N`` or ``MIN-MAX`` made of
            non-negative integers (e.g. ``""``, ``"abc"``, ``"10-20-30"``).
    """
    text = str(delay_str).strip()
    # partition() splits on the first dash only, so extra dashes end up in
    # the upper-bound text and are rejected by int() instead of being
    # silently ignored.
    low_text, dash, high_text = text.partition('-')
    try:
        low = int(low_text)
        high = int(high_text) if dash else low
    except ValueError:
        raise ValueError(
            f"invalid delay {delay_str!r}: expected N or MIN-MAX in seconds"
        ) from None

    if low < 0 or high < 0:
        raise ValueError(
            f"invalid delay {delay_str!r}: values must be non-negative"
        )

    # Normalise reversed ranges so callers can rely on min <= max.
    return min(low, high), max(low, high)


async def call_with_rate_limit_handling(awaitable_factory, max_retries=3):
    """
    Await ``awaitable_factory()`` and retry it when the call is rate limited.

    A fresh awaitable is requested from the factory on every attempt. When
    the call raises TooManyRequests, the coroutine sleeps until the
    exception's ``rate_limit_reset`` time plus 10-30s of jitter; when that
    attribute is missing, falsy, or not in the future, it sleeps 15 minutes
    plus 10-60s of jitter. The exception is re-raised once the final
    attempt fails. Returns the awaited result, or None when ``max_retries``
    allows no attempt at all.
    """
    for retry_index in range(max_retries):
        attempt_no = retry_index + 1
        try:
            outcome = await awaitable_factory()
        except TooManyRequests as exc:
            # Out of attempts: let the caller deal with the rate limit.
            if attempt_no == max_retries:
                raise

            reset_at = getattr(exc, 'rate_limit_reset', None)
            current = int(time.time())

            # Prefer the server-provided reset time; otherwise fall back to
            # a fixed 15 minute pause. Jitter is added in both cases.
            pause = (
                reset_at - current + random.randint(10, 30)
                if reset_at and reset_at > current
                else 15 * 60 + random.randint(10, 60)
            )

            print(f"  Rate limited! Waiting {pause}s until reset (attempt {attempt_no}/{max_retries})...")
            await asyncio.sleep(pause)
        else:
            return outcome

    return None


# Column order of the CSV written by scrape_feed().
_CSV_FIELDS = (
    'tweet_id', 'tweet_url', 'name', 'username', 'content',
    'likes', 'retweets', 'replies', 'image_count', 'image_urls', 'has_video',
    'created_at',
)


def _tweet_to_row(tweet):
    """Flatten one tweet object into the dict written as a CSV row."""
    # Only the first four media entries are inspected (up to 4 images).
    image_urls = []
    has_video = False
    for media in (tweet.media or [])[:4]:
        media_type = type(media).__name__
        if media_type == 'Photo':
            image_urls.append(media.media_url)
        elif media_type in ('Video', 'AnimatedGif'):
            has_video = True

    # Prefer the parsed datetime (normalised to UTC, ISO 8601); fall back to
    # whatever raw ``created_at`` value the tweet carries.
    dt = getattr(tweet, "created_at_datetime", None)
    if dt:
        try:
            created_at = dt.astimezone(timezone.utc).isoformat()
        except Exception:
            created_at = str(dt)
    else:
        created_at = str(getattr(tweet, "created_at", "") or "")

    # NOTE(review): ``tweet.text`` is presumably always a str; the ``or ''``
    # only guards against a missing text aborting the whole scrape.
    content = (tweet.text or '').replace('\n', ' ').replace('\r', '')

    return {
        'tweet_id': tweet.id,
        'tweet_url': f"https://x.com/{tweet.user.screen_name}/status/{tweet.id}",
        'name': tweet.user.name,
        'username': f"@{tweet.user.screen_name}",
        'content': content,
        'likes': tweet.favorite_count,
        'retweets': tweet.retweet_count,
        'replies': tweet.reply_count,
        'image_count': len(image_urls),
        'image_urls': '|'.join(image_urls) if image_urls else '',
        'has_video': has_video,
        'created_at': created_at,
    }


async def scrape_feed(batches, delay_min, delay_max, output_file):
    """Scrape the For You timeline and save the tweets to a CSV file.

    Fetches up to ``batches`` pages through the twikit client (the first via
    ``get_timeline``, later ones via ``next()`` on the previous result),
    skips tweet ids already seen, and sleeps a random number of seconds
    between pages. Fetching stops early on a rate limit that survives the
    retries or on any other error; whatever was collected up to that point
    is still written.

    Args:
        batches: Number of timeline pages to request.
        delay_min: Lower bound, in seconds, of the pause between pages.
        delay_max: Upper bound, in seconds, of the pause between pages.
        output_file: Path of the CSV file to write; missing parent
            directories are created.

    Returns:
        ``output_file``, unchanged.
    """
    # random.randint() raises ValueError on a reversed range. That call sits
    # outside the per-batch try block, so without this swap every tweet
    # collected so far would be lost before the CSV is written.
    if delay_min > delay_max:
        delay_min, delay_max = delay_max, delay_min

    client = Client('en-US')
    client.load_cookies(str(COOKIES_FILE))

    all_tweets = []
    seen_ids = set()
    tweets = None

    for i in range(batches):
        print(f"Batch {i+1}/{batches}: Fetching tweets...")

        try:
            if tweets is None:
                tweets = await call_with_rate_limit_handling(
                    lambda: client.get_timeline(count=20)
                )
            else:
                # Bind the current page as a default argument so the lambda
                # pages from this result even though ``tweets`` is rebound.
                tweets = await call_with_rate_limit_handling(
                    lambda t=tweets: t.next()
                )

            if tweets is None:
                print("  Failed to fetch after retries, stopping.")
                break

            batch_count = 0
            for tweet in tweets:
                if tweet.id in seen_ids:
                    continue
                seen_ids.add(tweet.id)
                batch_count += 1
                all_tweets.append(_tweet_to_row(tweet))

            print(f"  Got {batch_count} new tweets. Total: {len(all_tweets)}")

        except TooManyRequests:
            print("  Rate limit exceeded after retries, stopping.")
            break
        except Exception as e:
            # Best-effort boundary: report the error and keep what we have.
            print(f"  Error: {type(e).__name__}: {e}")
            break

        # No pause after the final batch.
        if i < batches - 1:
            delay = random.randint(delay_min, delay_max)
            print(f"  Waiting {delay}s...")
            await asyncio.sleep(delay)

    # Only create directories when the path has a directory component;
    # os.makedirs('') would raise for a bare file name.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=_CSV_FIELDS)
        writer.writeheader()
        writer.writerows(all_tweets)

    print(f"\nDone! Saved {len(all_tweets)} tweets to {output_file}")
    return output_file


def main():
    """Command-line entry point.

    Parses ``sys.argv``. For the ``feed`` command it resolves the delay
    range and the output path (defaulting to a dated ``raw_feed.csv`` under
    the project's archive directory), prints a summary of the run and
    starts the scrape. Without a command, the help text is printed.

    An invalid ``--delay`` value is reported through argparse (usage
    message, exit status 2) instead of surfacing as a traceback.
    """
    parser = argparse.ArgumentParser(description='Twitter Feed Scraper')
    subparsers = parser.add_subparsers(dest='command', help='Commands')

    # Feed command
    feed_parser = subparsers.add_parser('feed', help='Scrape For You timeline')
    feed_parser.add_argument('batches', type=int, help='Number of batches to fetch')
    feed_parser.add_argument('--delay', default='30-60', help='Delay range in seconds (e.g., 30-60)')
    feed_parser.add_argument('--output', help='Output file path (default: auto-generated)')

    args = parser.parse_args()

    if args.command != 'feed':
        parser.print_help()
        return

    try:
        delay_min, delay_max = parse_delay(args.delay)
    except ValueError as exc:
        # error() prints the feed usage line and exits with status 2.
        feed_parser.error(f"invalid --delay value {args.delay!r}: {exc}")

    # Default output path: <project>/PHASES/PHASE-01-INGEST/tweets/archive/<today>/raw_feed.csv
    if args.output:
        output_file = args.output
    else:
        today = date.today().isoformat()
        output_file = str(
            BASE_DIR
            / "PHASES"
            / "PHASE-01-INGEST"
            / "tweets"
            / "archive"
            / today
            / "raw_feed.csv"
        )

    print("Twitter Feed Scraper")
    print(f"  Batches: {args.batches}")
    print(f"  Delay: {delay_min}-{delay_max}s (randomized)")
    print(f"  Output: {output_file}")
    print()

    asyncio.run(scrape_feed(args.batches, delay_min, delay_max, output_file))


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
