#!/usr/bin/env python3
"""
Twitter Feed Scraper CLI

Usage:
    twitter-scraper feed BATCHES [--delay MIN-MAX] [--output FILE]

Examples:
    twitter-scraper feed 5 --delay 30-60
    twitter-scraper feed 10 --delay 45-90 --output custom.csv
"""

import sys
import os
import asyncio
import argparse
import csv
import random
import time
from collections.abc import Awaitable, Callable
from datetime import date, timezone
from pathlib import Path
from typing import Any

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

# Make the project venv's packages importable. This does not activate the
# venv; it only prepends its site-packages directory to sys.path.
# NOTE(review): the interpreter version in this path is hard-coded to 3.12.
venv_path = Path(__file__).parent.parent / "venv" / "lib" / "python3.12" / "site-packages"
sys.path.insert(0, str(venv_path))

from twikit import Client
from twikit.errors import TooManyRequests

BASE_DIR = Path(__file__).parent.parent
COOKIES_FILE = BASE_DIR / "PHASES" / "PHASE-01-INGEST" / "cookies.json"

# Column order of the output CSV; must match the keys built by _tweet_to_row.
CSV_FIELDS = [
    'tweet_id', 'tweet_url', 'name', 'username', 'content',
    'likes', 'retweets', 'replies', 'image_count', 'image_urls',
    'has_video', 'created_at',
]


def parse_delay(delay_str: str) -> tuple[int, int]:
    """Parse a delay string like '30-60' (or a single '45') into (min, max).

    A reversed range such as '60-30' is normalised to (30, 60) so the result
    is always safe to pass to ``random.randint``.

    Raises:
        ValueError: if the string is not ``SECONDS`` or ``MIN-MAX`` made of
            non-negative integers.
    """
    low_text, sep, high_text = delay_str.strip().partition('-')
    try:
        low = int(low_text)
        # Without a '-' separator the single value is both bounds.
        high = int(high_text) if sep else low
    except ValueError:
        raise ValueError(
            f"invalid delay {delay_str!r}: expected SECONDS or MIN-MAX"
        ) from None
    if low < 0 or high < 0:
        raise ValueError(f"invalid delay {delay_str!r}: values must be non-negative")
    if low > high:
        low, high = high, low
    return low, high


async def call_with_rate_limit_handling(
    awaitable_factory: Callable[[], Awaitable[Any]],
    max_retries: int = 3,
) -> Any:
    """Await ``awaitable_factory()``, retrying when twikit reports a rate limit.

    On ``TooManyRequests`` the coroutine sleeps until the exception's
    ``rate_limit_reset`` time plus jitter (or about 15 minutes when no usable
    reset time is present) and then calls the factory again.

    Args:
        awaitable_factory: zero-argument callable returning a fresh awaitable
            for every attempt.
        max_retries: total number of attempts. Values below 1 are treated as
            1 so the call is always made at least once.

    Returns:
        Whatever the awaited call returns.

    Raises:
        TooManyRequests: if the final attempt is still rate limited.
    """
    max_retries = max(1, max_retries)
    for attempt in range(1, max_retries + 1):
        try:
            return await awaitable_factory()
        except TooManyRequests as e:
            if attempt == max_retries:
                raise

            # Calculate wait time from rate_limit_reset
            now = int(time.time())
            reset = getattr(e, 'rate_limit_reset', None)
            if reset and reset > now:
                wait = reset - now + random.randint(10, 30)  # Add jitter
            else:
                wait = 15 * 60 + random.randint(10, 60)  # Default 15 min + jitter

            print(f" Rate limited! Waiting {wait}s until reset (attempt {attempt}/{max_retries})...")
            await asyncio.sleep(wait)


def _created_at_utc(tweet) -> str:
    """Return the tweet's creation time as a string, preferring UTC ISO-8601."""
    dt = getattr(tweet, "created_at_datetime", None)
    if not dt:
        return str(getattr(tweet, "created_at", "") or "")
    try:
        return dt.astimezone(timezone.utc).isoformat()
    except Exception:
        # Best effort: fall back to the object's own string form.
        return str(dt)


def _tweet_to_row(tweet) -> dict[str, Any]:
    """Flatten one tweet object into a dict keyed by CSV_FIELDS."""
    # Only the first four media items are inspected.
    image_urls = []
    has_video = False
    if tweet.media:
        for media in tweet.media[:4]:
            media_type = type(media).__name__
            if media_type == 'Photo':
                image_urls.append(media.media_url)
            elif media_type in ('Video', 'AnimatedGif'):
                has_video = True

    return {
        'tweet_id': tweet.id,
        'tweet_url': f"https://x.com/{tweet.user.screen_name}/status/{tweet.id}",
        'name': tweet.user.name,
        'username': f"@{tweet.user.screen_name}",
        # Keep each tweet on a single CSV line.
        'content': tweet.text.replace('\n', ' ').replace('\r', ''),
        'likes': tweet.favorite_count,
        'retweets': tweet.retweet_count,
        'replies': tweet.reply_count,
        'image_count': len(image_urls),
        'image_urls': '|'.join(image_urls),
        'has_video': has_video,
        'created_at': _created_at_utc(tweet),
    }


def _write_csv(rows: list[dict[str, Any]], output_file: str) -> None:
    """Write ``rows`` to ``output_file`` as UTF-8 CSV with a header row."""
    # Only create a directory when the path has a directory component.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=CSV_FIELDS)
        writer.writeheader()
        writer.writerows(rows)


async def scrape_feed(batches: int, delay_min: int, delay_max: int, output_file: str) -> str:
    """Scrape the For You timeline and save the tweets to a CSV file.

    Fetches up to ``batches`` pages, sleeping a random ``delay_min`` to
    ``delay_max`` seconds between pages. Tweets whose id was already seen are
    skipped. Any error stops fetching, but everything collected so far is
    still written.

    Args:
        batches: number of timeline pages to request.
        delay_min: lower bound of the pause between pages, in seconds.
        delay_max: upper bound of the pause between pages, in seconds.
        output_file: destination CSV path; parent directories are created.

    Returns:
        ``output_file``, unchanged.
    """
    client = Client('en-US')
    client.load_cookies(str(COOKIES_FILE))

    all_tweets = []
    seen_ids = set()
    tweets = None

    for i in range(batches):
        print(f"Batch {i+1}/{batches}: Fetching tweets...")

        try:
            if tweets is None:
                tweets = await call_with_rate_limit_handling(
                    lambda: client.get_timeline(count=20)
                )
            else:
                # Bind the current page as a default so the lambda does not
                # depend on the later rebinding of ``tweets``.
                tweets = await call_with_rate_limit_handling(
                    lambda t=tweets: t.next()
                )

            if tweets is None:
                print(" Failed to fetch after retries, stopping.")
                break

            batch_count = 0
            for tweet in tweets:
                if tweet.id in seen_ids:
                    continue
                seen_ids.add(tweet.id)
                batch_count += 1
                all_tweets.append(_tweet_to_row(tweet))

            print(f" Got {batch_count} new tweets. Total: {len(all_tweets)}")

        except TooManyRequests:
            print(" Rate limit exceeded after retries, stopping.")
            break
        except Exception as e:
            print(f" Error: {type(e).__name__}: {e}")
            break

        # No pause after the final batch.
        if i < batches - 1:
            delay = random.randint(delay_min, delay_max)
            print(f" Waiting {delay}s...")
            await asyncio.sleep(delay)

    _write_csv(all_tweets, output_file)

    print(f"\nDone! Saved {len(all_tweets)} tweets to {output_file}")
    return output_file


def main() -> None:
    """Parse command-line arguments and run the requested command."""
    parser = argparse.ArgumentParser(description='Twitter Feed Scraper')
    subparsers = parser.add_subparsers(dest='command', help='Commands')

    # Feed command
    feed_parser = subparsers.add_parser('feed', help='Scrape For You timeline')
    feed_parser.add_argument('batches', type=int, help='Number of batches to fetch')
    feed_parser.add_argument('--delay', default='30-60', help='Delay range in seconds (e.g., 30-60)')
    feed_parser.add_argument('--output', help='Output file path (default: auto-generated)')

    args = parser.parse_args()

    if args.command == 'feed':
        try:
            delay_min, delay_max = parse_delay(args.delay)
        except ValueError as exc:
            # Report a bad --delay as a usage error rather than a traceback.
            feed_parser.error(f"argument --delay: {exc}")

        # Default output path
        if args.output:
            output_file = args.output
        else:
            today = date.today().isoformat()
            output_file = str(
                BASE_DIR / "PHASES" / "PHASE-01-INGEST" / "tweets" / "archive" / today / "raw_feed.csv"
            )

        print("Twitter Feed Scraper")
        print(f" Batches: {args.batches}")
        print(f" Delay: {delay_min}-{delay_max}s (randomized)")
        print(f" Output: {output_file}")
        print()

        asyncio.run(scrape_feed(args.batches, delay_min, delay_max, output_file))
    else:
        parser.print_help()


if __name__ == '__main__':
    main()