diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..6313b56 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +* text=auto eol=lf diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9fdb553 --- /dev/null +++ b/.gitignore @@ -0,0 +1,63 @@ +# OS / Editor +.DS_Store +.vscode/ + +# Python +__pycache__/ +*.py[cod] +*.pyo +*.pyd +*.so +*.pkl +*.pickle +.pytest_cache/ +.mypy_cache/ +.coverage +coverage.xml + +# Environments +.env +.env.* +.venv/ +venv/ + +# Project outputs (large or generated) +data/ +!data/.gitkeep +models/ +!models/.gitkeep +checkpoints/ +runs/ + +# Sessions / secrets / sqlite +*.session +*.sqlite* +*.db +*.log + +# Notebooks +.ipynb_checkpoints/ + +# Caches and locks +.cache/ +*.lock + +# Python +__pycache__/ +*.pyc +*.pyo +*.pyd +.venv/ +venv/ +.env +*.env + +# Telethon session files +*.session +*.session-journal + +# Jupyter +.ipynb_checkpoints/ + +# macOS +.DS_Store diff --git a/README.md b/README.md new file mode 100644 index 0000000..db72e4d --- /dev/null +++ b/README.md @@ -0,0 +1,205 @@ +# Telegram analytics toolkit + +Scrape public Telegram channel posts, fetch replies and forwards, and generate rich analytics reports with tagging, sentiment, matchday overlays, and plots. Use VADER, a local transformers model, or a local GPT (Ollama) backend for sentiment. + +Highlights: +- Fast replies scraping with concurrency, resume/append, and rate-limit visibility +- Forwards scanning with chunked, concurrent search +- Analyzer: tagging from YAML keywords; sentiment via VADER, transformers, or local GPT; emoji-aware modes; combined posts+replies metrics; and matchday cross-analysis +- Plots: daily activity with in-plot match labels, daily volume vs sentiment (new), heatmaps, and per-tag (team) sentiment shares +- Local learning: fine-tune and evaluate a transformers classifier and use it in analysis + +Full command reference is in `docs/COMMANDS.md`. 
+ +## Quick start + +1) Configure secrets in `.env` (script will prompt if absent): +``` +TELEGRAM_API_ID=123456 +TELEGRAM_API_HASH=your_api_hash +# Optional +TELEGRAM_SESSION_NAME=telegram +TELEGRAM_2FA_PASSWORD=your_2fa_password +FOOTBALL_DATA_API_TOKEN=your_token +``` + +2) Run any command via the wrapper (creates venv and installs deps automatically): + +```zsh +# Fetch messages to CSV +./run_scraper.sh scrape -c https://t.me/Premier_League_Update -o data/premier_league_update.csv --start-date 2025-08-15 --end-date 2025-10-15 + +# Fetch replies fast +./run_scraper.sh replies -c https://t.me/Premier_League_Update --from-csv data/premier_league_update.csv -o data/premier_league_replies.csv --min-replies 1 --concurrency 15 --resume --append + +# Analyze with tags, fixtures, emoji handling and plots +./run_scraper.sh analyze -i data/premier_league_update.csv --replies-csv data/premier_league_replies.csv --fixtures-csv data/premier_league_schedule_2025-08-15_to_2025-10-15.csv --tags-config config/tags.yaml --write-augmented-csv --write-combined-csv --emoji-mode keep --emoji-boost --save-plots +``` + +3) Use transformers sentiment instead of VADER: + +```zsh +# Off-the-shelf fine-tuned sentiment head +./run_scraper.sh analyze -i data/premier_league_update.csv --replies-csv data/premier_league_replies.csv \ + --sentiment-backend transformers \ + --transformers-model distilbert-base-uncased-finetuned-sst-2-english \ + --export-transformers-details \ + --write-augmented-csv --write-combined-csv --save-plots +``` + +4) Use a local GPT backend (Ollama) for sentiment (JSON labels+confidence mapped to a compound score): + +```zsh +# Ensure Ollama is running locally and the model is available (e.g., llama3) +./run_scraper.sh analyze -i data/premier_league_update.csv --replies-csv data/premier_league_replies.csv \ + --sentiment-backend gpt \ + --gpt-model llama3 \ + --gpt-base-url http://localhost:11434 \ + --write-augmented-csv --write-combined-csv --save-plots +``` + +## 
Aliases + +Convenient zsh functions live in `scripts/aliases.zsh`: + +- `fast_replies` — resume+append replies with concurrency +- `chunked_forwards` — concurrent forwards scan +- `analyze_combined` — posts+replies+fixtures with tags +- `analyze_emoji` — emoji-aware analyze with boost +- `analyze_transformers` — analyze with transformers and export details +- `apply_labels_and_analyze` — merge a labeled CSV into posts/replies and run analyzer (reuses sentiment_label) +- `plot_labeled` — QA plots from a labeled CSV (class distribution, confidence, lengths) +- `train_transformers` — fine-tune a model on a labeled CSV +- `eval_transformers` — evaluate a fine-tuned model + +Source them: +```zsh +source scripts/aliases.zsh +``` + +## Local transformers (optional) + +Train a classifier: +```zsh +./.venv/bin/python -m src.train_sentiment \ + --train-csv data/labeled_sentiment.csv \ + --text-col message \ + --label-col label \ + --model-name distilbert-base-uncased \ + --output-dir models/sentiment-distilbert \ + --epochs 3 --batch-size 16 +``` + +Evaluate it: +```zsh +./.venv/bin/python -m src.eval_sentiment \ + --csv data/labeled_holdout.csv \ + --text-col message \ + --label-col label \ + --model models/sentiment-distilbert +``` + +Use it in analyze: +```zsh +./run_scraper.sh analyze -i data/premier_league_update.csv --replies-csv data/premier_league_replies.csv \ + --sentiment-backend transformers \ + --transformers-model models/sentiment-distilbert \ + --export-transformers-details \ + --write-augmented-csv --write-combined-csv --save-plots +``` + +Notes: +- GPU/Apple Silicon (MPS) is auto-detected; CPU is the fallback. +- Torch pinning in `requirements.txt` uses conditional versions for smooth installs across Python versions. + +## Plots produced (when --save-plots is used) + +- `daily_activity_stacked.png` — stacked bar chart of posts vs replies per day. 
+ - Dynamic sizing: `--plot-width-scale`, `--plot-max-width`, `--plot-height` + - Top-N highlights: `--activity-top-n` (labels show total and posts+replies breakdown) + - Match labels inside the plot using team abbreviations; control density with: + - `--labels-max-per-day`, `--labels-per-line`, `--labels-stagger-rows`, `--labels-band-y`, `--labels-annotate-mode` +- `daily_volume_and_sentiment.png` — total volume (posts+replies) per day as bars (left Y) and positive%/negative% as lines (right Y). Uses `sentiment_label` when present, otherwise `sentiment_compound` thresholds. +- `posts_heatmap_hour_dow.png` — heatmap of posts activity by hour and day-of-week. +- `sentiment_by_tag_posts.png` — stacked shares of pos/neu/neg by team tag (tags starting with `club_`), with dynamic width. +- Matchday rollups (when fixtures are provided): + - `matchday_sentiment_overall.csv` — per-fixture-day aggregates for posts (and replies when provided) + - `matchday_sentiment_overall.png` — mean sentiment time series on matchdays (posts, replies) + - `matchday_posts_volume_vs_sentiment.png` — scatter of posts volume vs mean sentiment on matchdays +- Diagnostics: + - `match_labels_debug.csv` — per-day list of rendered match labels (helps tune label density) + +Tip: The analyzer adapts plot width to the number of days; for very long ranges, raise `--plot-max-width`. + +## Plot sizing and label flags (analyze) + +- `--plot-width-scale` (default 0.8): inches per day for the daily charts width. +- `--plot-max-width` (default 104): cap on width in inches. +- `--plot-height` (default 6.5): figure height in inches. +- `--activity-top-n` (default 5): highlight top-N activity days; 0 disables. +- Match label controls: + - `--labels-max-per-day` (default 3): cap labels per day (+N more). + - `--labels-per-line` (default 2): labels per line in the band. + - `--labels-band-y` (default 0.96): vertical position of the band (axes coords). 
+ - `--labels-stagger-rows` (default 2): stagger rows to reduce collisions. + - `--labels-annotate-mode` (ticks|all|ticks+top): which x positions get labels. + +## Automatic labeling (no manual annotation) + +If you don't want to label data by hand, generate a labeled training set automatically and train a local model. + +Label with VADER (fast) or a pretrained transformers model (higher quality): + +```zsh +# Load aliases +source scripts/aliases.zsh + +# VADER: keeps only confident predictions by default +auto_label_vader + +# Or Transformers: CardiffNLP 3-class sentiment (keeps confident only) +auto_label_transformers + +# Output: data/labeled_sentiment.csv (message, label, confidence, ...) +``` + +Then fine-tune a classifier on the generated labels and use it in analysis: + +```zsh +# Train on the auto-labeled CSV +train_transformers + +# Analyze using your fine-tuned model +./run_scraper.sh analyze -i data/premier_league_update.csv \ + --replies-csv data/premier_league_replies.csv \ + --fixtures-csv data/premier_league_schedule_2025-08-15_to_2025-10-15.csv \ + --tags-config config/tags.yaml \ + --sentiment-backend transformers \ + --transformers-model models/sentiment-distilbert \ + --export-transformers-details \ + --write-augmented-csv --write-combined-csv --save-plots +``` + +Advanced knobs (optional): +- VADER thresholds: `--vader-pos 0.05 --vader-neg -0.05 --vader-margin 0.2` +- Transformers acceptance: `--min-prob 0.6 --min-margin 0.2` +- Keep all predictions (not just confident): remove `--only-confident` + +## Local GPT backend (Ollama) + +You can use a local GPT model for sentiment. The analyzer requests strict JSON `{label, confidence}` and maps it to a compound score. If the GPT call fails for any rows, it gracefully falls back to VADER for those rows. 
+ +Example: +```zsh +./run_scraper.sh analyze -i data/premier_league_update.csv \ + --replies-csv data/premier_league_replies.csv \ + --fixtures-csv data/premier_league_schedule_2025-08-15_to_2025-10-15.csv \ + --tags-config config/tags.yaml \ + --sentiment-backend gpt \ + --gpt-model llama3 \ + --gpt-base-url http://localhost:11434 \ + --write-augmented-csv --write-combined-csv --save-plots +``` + +## License +MIT (adjust as needed) \ No newline at end of file diff --git a/config/tags.yaml b/config/tags.yaml new file mode 100644 index 0000000..b31ff9f --- /dev/null +++ b/config/tags.yaml @@ -0,0 +1,103 @@ +# Keyword tag configuration +# Each tag has a list of case-insensitive substrings or regex patterns (prefix with re:) +# Messages matching ANY pattern for a tag are labeled with that tag. + +score_update: + - "FT" + - "full time" + - "final score" + - "HT" + - "half time" + - "kick-off" + - "kick off" + +transfer: + - "transfer" + - "signs" + - "signed" + - "loan" + - "contract" + - "deal" + +injury: + - "injury" + - "injured" + - "out for" + - "ruled out" + +match_highlight: + - "goal" + - "scores" + - "assist" + - "penalty" + - "VAR" + - "red card" + - "yellow card" + +club_arsenal: + - "Arsenal" +club_manchester_city: + - "Manchester City" +club_manchester_united: + - "Manchester United" +club_chelsea: + - "Chelsea" +club_liverpool: + - "Liverpool" +club_tottenham: + - "Tottenham" +club_newcastle: + - "Newcastle" +club_west_ham: + - "West Ham" +club_brighton: + - "Brighton" +club_aston_villa: + - "Aston Villa" +club_everton: + - "Everton" +club_crystal_palace: + - "Crystal Palace" + - "Palace" +club_bournemouth: + - "Bournemouth" + - "AFC Bournemouth" +club_brentford: + - "Brentford" +club_fulham: + - "Fulham" +club_nottingham_forest: + - "Nottingham Forest" + - "Forest" +club_wolves: + - "Wolves" + - "Wolverhampton" +club_burnley: + - "Burnley" +club_southampton: + - "Southampton" + - "Saints" +club_leicester_city: + - "Leicester" + - "Leicester City" 
+club_leeds_united: + - "Leeds" + - "Leeds United" +club_sheffield_united: + - "Sheffield United" + - "Sheff Utd" +club_west_bromwich_albion: + - "West Brom" + - "West Bromwich" +club_ipswich_town: + - "Ipswich" + - "Ipswich Town" +club_portsmouth: + - "Portsmouth" + - "Pompey" +club_hull_city: + - "Hull" + - "Hull City" +club_middlesbrough: + - "Middlesbrough" + - "Boro" diff --git a/docs/COMMANDS.md b/docs/COMMANDS.md new file mode 100644 index 0000000..704333e --- /dev/null +++ b/docs/COMMANDS.md @@ -0,0 +1,743 @@ +# Project command reference + +This file lists all supported commands and practical permutations for `./run_scraper.sh`, with short comments and tips. It mirrors the actual CLI flags in the code. + +- Shell: zsh (macOS) — commands below are ready to paste. +- Env: A `.venv` is created automatically; dependencies installed from `requirements.txt`. +- Secrets: Create `.env` with TELEGRAM_API_ID and TELEGRAM_API_HASH; for fixtures also set FOOTBALL_DATA_API_TOKEN. +- 2FA: If you use Telegram two-step verification, set TELEGRAM_2FA_PASSWORD in `.env` (the shell wrapper doesn’t accept a flag for this). +- Sessions: Telethon uses a SQLite session file (default `telegram.session`). When running multiple tools in parallel, use distinct `--session-name` values. + +## Common conventions + +- Channels + - Use either handle or URL: `-c @name` or `-c https://t.me/name`. + - For replies, the channel must match the posts’ source in your CSV `url` column. +- Output behavior + - scrape/replies/forwards overwrite unless you pass `--append`. + - analyze always overwrites its outputs. +- Rate-limits + - Replies/forwards log `[rate-limit]` if Telegram asks you to wait. Reduce `--concurrency` if frequent. +- Parallel runs + - Add `--session-name ` per process to avoid “database is locked”. Prefer sessions outside iCloud Drive. 
+ +--- + +## Scrape (posts/messages) + +Minimal (overwrite output): +```zsh +./run_scraper.sh scrape -c @SomeChannel -o data/messages.csv +``` + +With date range and limit: +```zsh +./run_scraper.sh scrape \ + -c https://t.me/SomeChannel \ + -o data/messages.jsonl \ + --start-date 2025-01-01 \ + --end-date 2025-03-31 \ + --limit 500 +``` + +Legacy offset date (deprecated; prefer --start-date): +```zsh +./run_scraper.sh scrape -c @SomeChannel -o data/messages.csv --offset-date 2025-01-01 +``` + +Append to existing file and pass phone on first login: +```zsh +./run_scraper.sh scrape \ + -c @SomeChannel \ + -o data/messages.csv \ + --append \ + --phone +15551234567 +``` + +Use a custom session (useful in parallel): +```zsh +./run_scraper.sh scrape -c @SomeChannel -o data/messages.csv --session-name telegram_scrape +``` + +Notes: +- Output format inferred by extension: `.csv` or `.jsonl`/`.ndjson`. +- Two-step verification: set TELEGRAM_2FA_PASSWORD in `.env` (no CLI flag in the shell wrapper). + +### All valid forms (scrape) + +Use one of the following combinations. Replace placeholders with your values. 
+ +- Base variables: + - CH = @handle or https://t.me/handle + - OUT = path to .csv or .jsonl + - Optional value flags: [--limit N] [--session-name NAME] [--phone NUMBER] + +- Date filter permutations (4) × Append flag (2) × Limit presence (2) = 16 forms + +1) No dates, no append, no limit + ./run_scraper.sh scrape -c CH -o OUT +2) No dates, no append, with limit + ./run_scraper.sh scrape -c CH -o OUT --limit N +3) No dates, with append, no limit + ./run_scraper.sh scrape -c CH -o OUT --append +4) No dates, with append, with limit + ./run_scraper.sh scrape -c CH -o OUT --append --limit N +5) Start only, no append, no limit + ./run_scraper.sh scrape -c CH -o OUT --start-date YYYY-MM-DD +6) Start only, no append, with limit + ./run_scraper.sh scrape -c CH -o OUT --start-date YYYY-MM-DD --limit N +7) Start only, with append, no limit + ./run_scraper.sh scrape -c CH -o OUT --start-date YYYY-MM-DD --append +8) Start only, with append, with limit + ./run_scraper.sh scrape -c CH -o OUT --start-date YYYY-MM-DD --append --limit N +9) End only, no append, no limit + ./run_scraper.sh scrape -c CH -o OUT --end-date YYYY-MM-DD +10) End only, no append, with limit + ./run_scraper.sh scrape -c CH -o OUT --end-date YYYY-MM-DD --limit N +11) End only, with append, no limit + ./run_scraper.sh scrape -c CH -o OUT --end-date YYYY-MM-DD --append +12) End only, with append, with limit + ./run_scraper.sh scrape -c CH -o OUT --end-date YYYY-MM-DD --append --limit N +13) Start and end, no append, no limit + ./run_scraper.sh scrape -c CH -o OUT --start-date YYYY-MM-DD --end-date YYYY-MM-DD +14) Start and end, no append, with limit + ./run_scraper.sh scrape -c CH -o OUT --start-date YYYY-MM-DD --end-date YYYY-MM-DD --limit N +15) Start and end, with append, no limit + ./run_scraper.sh scrape -c CH -o OUT --start-date YYYY-MM-DD --end-date YYYY-MM-DD --append +16) Start and end, with append, with limit + ./run_scraper.sh scrape -c CH -o OUT --start-date YYYY-MM-DD --end-date YYYY-MM-DD 
--append --limit N + +Optional add-ons valid for any form above: +- Append [--session-name NAME] and/or [--phone NUMBER] +- Deprecated alternative to start-date: add [--offset-date YYYY-MM-DD] + +--- + +## Replies (fetch replies to posts) + +From a posts CSV (fast path; skip posts with 0 replies in CSV): +```zsh +./run_scraper.sh replies \ + -c https://t.me/SourceChannel \ + --from-csv data/messages.csv \ + -o data/replies.csv \ + --min-replies 1 \ + --concurrency 15 \ + --resume \ + --append +``` + +Using explicit message IDs: +```zsh +./run_scraper.sh replies \ + -c @SourceChannel \ + --ids "123,456,789" \ + -o data/replies.csv \ + --concurrency 5 \ + --append +``` + +IDs from a file (one per line) using zsh substitution: +```zsh +IDS=$(tr '\n' ',' < parent_ids.txt | sed 's/,$//') +./run_scraper.sh replies -c @SourceChannel --ids "$IDS" -o data/replies.csv --concurrency 8 --append +``` + +Parallel-safe session name: +```zsh +./run_scraper.sh replies -c @SourceChannel --from-csv data/messages.csv -o data/replies.csv --concurrency 12 --resume --append --session-name telegram_replies +``` + +What the flags do: +- `--from-csv PATH` reads parent IDs from a CSV with an `id` column (optionally filtered by `--min-replies`). +- `--ids` provides a comma-separated list of parent IDs. +- `--concurrency K` processes K parent IDs in parallel (default 5). +- `--resume` dedupes by `(parent_id,id)` pairs already present in the output. +- `--append` appends to output instead of overwriting. + +Notes: +- The channel (`-c`) must match the posts’ source in your CSV URLs (the tool warns on mismatch). +- First login may require `--phone` (interactive prompt). For 2FA, set TELEGRAM_2FA_PASSWORD in `.env`. + +### All valid forms (replies) + +- Base variables: + - CH = @handle or https://t.me/handle + - OUT = path to .csv + - Source: exactly one of S1 or S2 + - S1: --ids "id1,id2,..." 
+ - S2: --from-csv PATH [--min-replies N] + - Optional: [--concurrency K] [--session-name NAME] [--phone NUMBER] + - Binary: [--append], [--resume] + +- Enumerated binary permutations for each source (4 per source = 8 total): + +S1 + no append + no resume + ./run_scraper.sh replies -c CH --ids "IDLIST" -o OUT +S1 + no append + resume + ./run_scraper.sh replies -c CH --ids "IDLIST" -o OUT --resume +S1 + append + no resume + ./run_scraper.sh replies -c CH --ids "IDLIST" -o OUT --append +S1 + append + resume + ./run_scraper.sh replies -c CH --ids "IDLIST" -o OUT --append --resume + +S2 + no append + no resume + ./run_scraper.sh replies -c CH --from-csv PATH -o OUT +S2 + no append + resume + ./run_scraper.sh replies -c CH --from-csv PATH -o OUT --resume +S2 + append + no resume + ./run_scraper.sh replies -c CH --from-csv PATH -o OUT --append +S2 + append + resume + ./run_scraper.sh replies -c CH --from-csv PATH -o OUT --append --resume + +Optional add-ons valid for any form above: +- Add [--concurrency K] to tune speed; recommended 8–20 +- With S2 you may add [--min-replies N] to prioritize parents with replies +- Add [--session-name NAME] and/or [--phone NUMBER] + +--- + +## Forwards (same-channel forwards referencing posts) + +Typical concurrent scan (best-effort; often zero results): +```zsh +./run_scraper.sh forwards \ + -c https://t.me/SourceChannel \ + --from-csv data/messages.csv \ + -o data/forwards.csv \ + --scan-limit 20000 \ + --concurrency 10 \ + --chunk-size 1500 +``` + +With date filters (applied to scanned messages): +```zsh +./run_scraper.sh forwards \ + -c @SourceChannel \ + --from-csv data/messages.csv \ + -o data/forwards.csv \ + --start-date 2025-01-01 \ + --end-date 2025-03-31 \ + --scan-limit 10000 \ + --concurrency 8 \ + --chunk-size 1000 +``` + +Using explicit message IDs: +```zsh +./run_scraper.sh forwards -c @SourceChannel --ids "100,200,300" -o data/forwards.csv --scan-limit 8000 --concurrency 6 --chunk-size 1000 +``` + +Sequential mode (no 
chunking) by omitting --scan-limit: +```zsh +./run_scraper.sh forwards -c @SourceChannel --from-csv data/messages.csv -o data/forwards.csv +``` + +What the flags do: +- `--scan-limit N`: enables chunked, concurrent scanning of ~N recent message IDs. +- `--concurrency K`: number of id-chunks to scan in parallel (requires `--scan-limit`). +- `--chunk-size M`: approx. IDs per chunk (trade-off between balance/overhead). Start with 1000–2000. +- `--append`: append instead of overwrite. + +Notes: +- This only finds forwards within the same channel that reference your parent IDs (self-forwards). Many channels will yield zero. +- Global cross-channel forward discovery is not supported here (can be added as a separate mode). +- Without `--scan-limit`, the tool scans sequentially from newest backwards and logs progress every ~1000 messages. + +### All valid forms (forwards) + +- Base variables: + - CH = @handle or https://t.me/handle + - OUT = path to .csv + - Source: exactly one of S1 or S2 + - S1: --ids "id1,id2,..." 
+ - S2: --from-csv PATH + - Modes: + - M1: Sequential scan (omit --scan-limit) + - M2: Chunked concurrent scan (requires --scan-limit N; accepts --concurrency K and --chunk-size M) + - Optional date filters for both modes: [--start-date D] [--end-date D] + - Binary: [--append] + - Optional: [--session-name NAME] [--phone NUMBER] + +- Enumerated permutations by mode, source, and append (2 modes × 2 sources × 2 append = 8 forms): + +M1 + S1 + no append + ./run_scraper.sh forwards -c CH --ids "IDLIST" -o OUT [--start-date D] [--end-date D] +M1 + S1 + append + ./run_scraper.sh forwards -c CH --ids "IDLIST" -o OUT --append [--start-date D] [--end-date D] +M1 + S2 + no append + ./run_scraper.sh forwards -c CH --from-csv PATH -o OUT [--start-date D] [--end-date D] +M1 + S2 + append + ./run_scraper.sh forwards -c CH --from-csv PATH -o OUT --append [--start-date D] [--end-date D] + +M2 + S1 + no append + ./run_scraper.sh forwards -c CH --ids "IDLIST" -o OUT --scan-limit N [--concurrency K] [--chunk-size M] [--start-date D] [--end-date D] +M2 + S1 + append + ./run_scraper.sh forwards -c CH --ids "IDLIST" -o OUT --scan-limit N --append [--concurrency K] [--chunk-size M] [--start-date D] [--end-date D] +M2 + S2 + no append + ./run_scraper.sh forwards -c CH --from-csv PATH -o OUT --scan-limit N [--concurrency K] [--chunk-size M] [--start-date D] [--end-date D] +M2 + S2 + append + ./run_scraper.sh forwards -c CH --from-csv PATH -o OUT --scan-limit N --append [--concurrency K] [--chunk-size M] [--start-date D] [--end-date D] + +Optional add-ons valid for any form above: +- Add [--session-name NAME] and/or [--phone NUMBER] + +--- + +## Analyze (reports and tagging) + +Posts-only report + tagged CSV: +```zsh +./run_scraper.sh analyze \ + -i data/messages.csv \ + --channel @SourceChannel \ + --tags-config config/tags.yaml \ + --fixtures-csv data/fixtures.csv \ + --write-augmented-csv +``` +Outputs: +- `data/messages_report.md` +- `data/messages_tagged.csv` + +Replies-only report + 
tagged CSV: +```zsh +./run_scraper.sh analyze \ + -i data/replies.csv \ + --channel "Replies - @SourceChannel" \ + --tags-config config/tags.yaml \ + --write-augmented-csv +``` +Outputs: +- `data/replies_report.md` +- `data/replies_tagged.csv` + +Combined (posts report augmented with replies): +```zsh +./run_scraper.sh analyze \ + -i data/messages.csv \ + --channel @SourceChannel \ + --tags-config config/tags.yaml \ + --replies-csv data/replies.csv \ + --fixtures-csv data/fixtures.csv \ + --write-augmented-csv \ + --write-combined-csv \ + --emoji-mode keep \ + --emoji-boost \ + --save-plots +``` +Adds to posts dataset: +- `sentiment_compound` for posts (VADER) +- `replies_sentiment_mean` (avg reply sentiment per post) +- `replies_count_scraped` and `replies_top_tags` (rollup from replies) + +Report sections include: +- Summary, top posts by views/forwards/replies +- Temporal distributions +- Per-tag engagement +- Per-tag sentiment (posts) +- Replies per-tag summary +- Per-tag sentiment (replies) + - Combined sentiment (posts + replies) + - Matchday cross-analysis (when `--fixtures-csv` is provided): + - Posts: on vs off matchdays (counts and sentiment shares) + - Posts engagement vs matchday (replies per post: total, mean, median, share of posts with replies) + - Replies: on vs off matchdays (counts and sentiment shares) + - Replies by parent matchday and by reply date are both shown; parent-based classification is recommended for engagement. + +Notes: +- Analyze overwrites outputs; use `-o` to customize report filename if needed. +- Emoji handling: add `--emoji-mode keep|demojize|strip` (default keep). Optionally `--emoji-boost` to gently tilt scores when clearly positive/negative emojis are present. + - Add `--write-combined-csv` to emit a unified CSV of posts+replies with a `content_type` column. 
+ +### All valid forms (analyze) + +- Base variables: + - IN = input CSV (posts or replies) + - Optional outputs/labels: [-o REPORT.md] [--channel @handle] + - Optional configs/data: [--tags-config config/tags.yaml] [--replies-csv REPLIES.csv] [--fixtures-csv FIXTURES.csv] + - Binary: [--write-augmented-csv] + +- Core permutations across replies-csv, fixtures-csv, write-augmented-csv (2×2×2 = 8 forms): + +1) No replies, no fixtures, no aug + ./run_scraper.sh analyze -i IN +2) No replies, no fixtures, with aug + ./run_scraper.sh analyze -i IN --write-augmented-csv +3) No replies, with fixtures, no aug + ./run_scraper.sh analyze -i IN --fixtures-csv FIXTURES.csv +4) No replies, with fixtures, with aug + ./run_scraper.sh analyze -i IN --fixtures-csv FIXTURES.csv --write-augmented-csv +5) With replies, no fixtures, no aug + ./run_scraper.sh analyze -i IN --replies-csv REPLIES.csv +6) With replies, no fixtures, with aug + ./run_scraper.sh analyze -i IN --replies-csv REPLIES.csv --write-augmented-csv +7) With replies, with fixtures, no aug + ./run_scraper.sh analyze -i IN --replies-csv REPLIES.csv --fixtures-csv FIXTURES.csv +8) With replies, with fixtures, with aug + ./run_scraper.sh analyze -i IN --replies-csv REPLIES.csv --fixtures-csv FIXTURES.csv --write-augmented-csv + +Optional add-ons valid for any form above: +- Append [-o REPORT.md] to control output filename +- Append [--channel @handle] for title +- Append [--tags-config config/tags.yaml] to enable tagging and per-tag summaries +- Append [--emoji-mode keep|demojize|strip] and optionally [--emoji-boost] +- Append [--write-combined-csv] to produce a merged posts+replies CSV + - Append [--save-plots] to emit plots to the data folder + - Append [--sentiment-backend transformers] and [--transformers-model ] to use a local HF model instead of VADER + - Append [--export-transformers-details] to include `sentiment_label` and `sentiment_probs` in augmented/combined CSVs + - Append [--sentiment-backend gpt] and 
optionally [--gpt-model MODEL] [--gpt-base-url URL] [--gpt-batch-size K] to use a local GPT (Ollama) backend + - Plot sizing and label controls (daily charts): + - [--plot-width-scale FLOAT] [--plot-max-width INCHES] [--plot-height INCHES] + - [--activity-top-n N] + - [--labels-max-per-day N] [--labels-per-line N] [--labels-band-y FLOAT] [--labels-stagger-rows N] [--labels-annotate-mode ticks|all|ticks+top] + +When fixtures are provided (`--fixtures-csv`): +- The report adds a "## Matchday cross-analysis" section with on vs off matchday tables. +- Plots include: + - daily_activity_stacked.png with match labels inside the chart + - daily_volume_and_sentiment.png (bars: volume; lines: pos%/neg%) + - matchday_sentiment_overall.png (time series on fixture days) + - matchday_posts_volume_vs_sentiment.png (scatter) +- The combined CSV (with `--write-combined-csv`) includes `is_matchday` and, for replies, `parent_is_matchday` when available. +- Replies are classified two ways: by reply date (`is_matchday` on the reply row) and by their parent post (`parent_is_matchday`). The latter better reflects matchday-driven engagement. + +Emoji and plots examples: +```zsh +# Keep emojis (default) and boost for strong positive/negative emojis +./run_scraper.sh analyze -i data/messages.csv --emoji-mode keep --emoji-boost --save-plots + +# Demojize to :smiling_face: tokens (helps some tokenizers), with boost +./run_scraper.sh analyze -i data/messages.csv --emoji-mode demojize --emoji-boost + +# Strip emojis entirely (if they add noise) +./run_scraper.sh analyze -i data/messages.csv --emoji-mode strip --save-plots + +# Use a transformers model for sentiment (will auto-download on first use unless a local path is provided). 
+# Tip: for an off-the-shelf sentiment head, try a fine-tuned model like SST-2: +./run_scraper.sh analyze -i data/messages.csv --replies-csv data/replies.csv \ +  --sentiment-backend transformers \ +  --transformers-model distilbert-base-uncased-finetuned-sst-2-english +``` + +## Local GPT backend (Ollama) + +Use a local GPT model that returns JSON {label, confidence} per message; the analyzer maps this to a compound score and falls back to VADER on errors. + +```zsh +./run_scraper.sh analyze -i data/messages.csv --replies-csv data/replies.csv \ +  --sentiment-backend gpt \ +  --gpt-model llama3 \ +  --gpt-base-url http://localhost:11434 \ +  --write-augmented-csv --write-combined-csv --save-plots +``` + +--- + +## Train a local transformers sentiment model + +Prepare a labeled CSV with at least two columns: `message` and `label` (e.g., neg/neu/pos or 0/1/2). + +Don’t have one yet? Create a labeling set from your existing posts/replies: + +```zsh +# Generate a CSV to annotate by hand (adds a blank 'label' column) +./.venv/bin/python -m src.make_labeling_set \ +  --posts-csv data/premier_league_update.csv \ +  --replies-csv data/premier_league_replies.csv \ +  --sample-size 1000 \ +  -o data/labeled_sentiment.csv + +# Or via alias (after sourcing scripts/aliases.zsh) +make_label_set "$POSTS_CSV" "$REPLIES_CSV" data/labeled_sentiment.csv 1000 +``` + +Then fine-tune: + +```zsh +# Ensure the venv exists (run any ./run_scraper.sh command once), then: +./.venv/bin/python -m src.train_sentiment \ +  --train-csv data/labeled_sentiment.csv \ +  --text-col message \ +  --label-col label \ +  --model-name distilbert-base-uncased \ +  --output-dir models/sentiment-distilbert \ +  --epochs 3 --batch-size 16 +``` + +Use it in analyze: + +```zsh +./run_scraper.sh analyze -i data/messages.csv --replies-csv data/replies.csv \ +  --sentiment-backend transformers \ +  --transformers-model models/sentiment-distilbert +``` + +Export details (labels, probabilities) into CSVs: + +```zsh +./run_scraper.sh 
analyze -i data/messages.csv --replies-csv data/replies.csv \ + --sentiment-backend transformers \ + --transformers-model models/sentiment-distilbert \ + --export-transformers-details \ + --write-augmented-csv --write-combined-csv +``` + +Notes: +- The analyzer maps model class probabilities to a VADER-like compound score in [-1, 1] for compatibility with the rest of the report. +- If the model has id2label including 'neg','neu','pos' labels, the mapping is more accurate; otherwise it defaults to pos - neg. +- GPU/Apple Silicon (MPS) will be used automatically if available. + +Torch install note (macOS): +- `requirements.txt` uses conditional pins: `torch==2.3.1` for Python < 3.13 and `torch>=2.7.1` for Python ≥ 3.13. This keeps installs smooth on macOS. If you hit install issues, let us know. + +## Evaluate a fine-tuned model + +```zsh +./.venv/bin/python -m src.eval_sentiment \ + --csv data/labeled_holdout.csv \ + --text-col message \ + --label-col label \ + --model models/sentiment-distilbert +``` +Prints accuracy, macro-precision/recall/F1, and a classification report. + +## Fixtures (Premier League schedule via football-data.org) + +Fetch fixtures between dates: +```zsh +./run_scraper.sh fixtures \ + --start-date 2025-08-15 \ + --end-date 2025-10-15 \ + -o data/fixtures.csv +``` + +Notes: +- Requires `FOOTBALL_DATA_API_TOKEN` in `.env`. +- Output may be `.csv` or `.json` (by extension). 
+ +### All valid forms (fixtures) + +- Base variables: + - SD = start date YYYY-MM-DD + - ED = end date YYYY-MM-DD + - OUT = output .csv or .json + +Form: + ./run_scraper.sh fixtures --start-date SD --end-date ED -o OUT + +--- + +## Advanced recipes + +Parallel replies + forwards with separate sessions: +```zsh +# Terminal 1 – replies +./run_scraper.sh replies \ + -c https://t.me/SourceChannel \ + --from-csv data/messages.csv \ + -o data/replies.csv \ + --min-replies 1 \ + --concurrency 15 \ + --resume \ + --append \ + --session-name "$HOME/.local/share/telethon_sessions/telegram_replies" + +# Terminal 2 – forwards +./run_scraper.sh forwards \ + -c https://t.me/SourceChannel \ + --from-csv data/messages.csv \ + -o data/forwards.csv \ + --scan-limit 20000 \ + --concurrency 10 \ + --chunk-size 1500 \ + --session-name "$HOME/.local/share/telethon_sessions/telegram_forwards" +``` + +Tuning for rate limits: +- If `[rate-limit]` logs are frequent, reduce `--concurrency` (start -3 to -5) and keep `--chunk-size` around 1000–2000. +- For replies, prioritize with `--min-replies 1` to avoid parents with zero replies. + +Safety: +- Use `--append` with replies and `--resume` to avoid truncating and to dedupe. +- Forwards and scrape don’t dedupe; prefer writing to a new file or dedupe after. + +--- + +## Environment setup quick-start + +Create `.env` (script will prompt if missing): +``` +TELEGRAM_API_ID=123456 +TELEGRAM_API_HASH=your_api_hash +# Optional defaults +TELEGRAM_SESSION_NAME=telegram +TELEGRAM_2FA_PASSWORD=your_2fa_password +FOOTBALL_DATA_API_TOKEN=your_token +``` + +First run will prompt for phone and code (and 2FA if enabled). + +--- + +## Troubleshooting + +- Empty replies file + - Ensure `-c` matches the channel in your posts CSV URLs. + - Use `--append` so the file isn’t truncated before writing. +- “database is locked” + - Use unique `--session-name` per parallel process; store sessions outside iCloud Drive. +- Forwards empty + - Same-channel forwards are rare. 
This tool only finds self-forwards (not cross-channel). +- Analyze errors + - Ensure CSVs have expected columns. Posts: `id,date,message,...`; Replies: `parent_id,id,date,message,...`. +- Exit code 1 when starting + - Check the last log lines. Common causes: missing TELEGRAM_API_ID/HASH in `.env`, wrong channel handle vs CSV URLs, session file locked by another process (use distinct `--session-name`), or a bad output path. + +--- + +## Quick aliases for daily runs (zsh) ⚡ + +Paste this section into your current shell or your `~/.zshrc` to get convenient Make-like commands. + +### Project defaults (edit as needed) + +```zsh +# Channel and files +export CH="https://t.me/Premier_League_Update" +export POSTS_CSV="data/premier_league_update.csv" +export REPLIES_CSV="data/premier_league_replies.csv" +export FORWARDS_CSV="data/premier_league_forwards.csv" +export TAGS_CFG="config/tags.yaml" +export FIXTURES_CSV="data/premier_league_schedule_2025-08-15_to_2025-10-15.csv" + +# Sessions directory outside iCloud (avoid sqlite locks) +export SESSION_DIR="$HOME/.local/share/telethon_sessions" +mkdir -p "$SESSION_DIR" +``` + +### Aliases (zsh functions) + +```zsh +# Fast replies: resume+append, prioritizes parents with replies, tuned concurrency +fast_replies() { + local ch="${1:-$CH}" + local posts="${2:-$POSTS_CSV}" + local out="${3:-$REPLIES_CSV}" + local conc="${4:-15}" + local sess="${5:-$SESSION_DIR/telegram_replies}" + ./run_scraper.sh replies \ + -c "$ch" \ + --from-csv "$posts" \ + -o "$out" \ + --min-replies 1 \ + --concurrency "$conc" \ + --resume \ + --append \ + --session-name "$sess" +} + +# Chunked forwards: concurrent chunk scan with progress logs +chunked_forwards() { + local ch="${1:-$CH}" + local posts="${2:-$POSTS_CSV}" + local out="${3:-$FORWARDS_CSV}" + local scan="${4:-20000}" + local conc="${5:-10}" + local chunk="${6:-1500}" + local sess="${7:-$SESSION_DIR/telegram_forwards}" + ./run_scraper.sh forwards \ + -c "$ch" \ + --from-csv "$posts" \ + -o "$out" 
\ + --scan-limit "$scan" \ + --concurrency "$conc" \ + --chunk-size "$chunk" \ + --append \ + --session-name "$sess" +} + +# Combined analyze: posts + replies + fixtures with tags; writes augmented CSVs +analyze_combined() { + local posts="${1:-$POSTS_CSV}" + local replies="${2:-$REPLIES_CSV}" + local tags="${3:-$TAGS_CFG}" + local fixtures="${4:-$FIXTURES_CSV}" + local ch="${5:-$CH}" + ./run_scraper.sh analyze \ + -i "$posts" \ + --channel "$ch" \ + --tags-config "$tags" \ + --replies-csv "$replies" \ + --fixtures-csv "$fixtures" \ + --write-augmented-csv \ + --write-combined-csv +} + +# Emoji-aware analyze with sensible defaults (keep + boost) +analyze_emoji() { + local posts="${1:-$POSTS_CSV}" + local replies="${2:-$REPLIES_CSV}" + local tags="${3:-$TAGS_CFG}" + local fixtures="${4:-$FIXTURES_CSV}" + local ch="${5:-$CH}" + local mode="${6:-keep}" # keep | demojize | strip + ./run_scraper.sh analyze \ + -i "$posts" \ + --channel "$ch" \ + --tags-config "$tags" \ + --replies-csv "$replies" \ + --fixtures-csv "$fixtures" \ + --write-augmented-csv \ + --write-combined-csv \ + --emoji-mode "$mode" \ + --emoji-boost +} + +# One-shot daily pipeline: fast replies then combined analyze +run_daily() { + local ch="${1:-$CH}" + local posts="${2:-$POSTS_CSV}" + local replies="${3:-$REPLIES_CSV}" + local conc="${4:-15}" + fast_replies "$ch" "$posts" "$replies" "$conc" "$SESSION_DIR/telegram_replies" + analyze_emoji "$posts" "$replies" "$TAGS_CFG" "$FIXTURES_CSV" "$ch" keep +} + +# One-shot daily pipeline with forwards in parallel +run_daily_with_forwards() { + local ch="${1:-$CH}" + local posts="${2:-$POSTS_CSV}" + local replies="${3:-$REPLIES_CSV}" + local forwards="${4:-$FORWARDS_CSV}" + local rep_conc="${5:-15}" + local f_scan="${6:-20000}" + local f_conc="${7:-10}" + local f_chunk="${8:-1500}" + local sess_r="${9:-$SESSION_DIR/telegram_replies}" + local sess_f="${10:-$SESSION_DIR/telegram_forwards}" + + # Launch replies and forwards in parallel with separate sessions + 
local pid_r pid_f + fast_replies "$ch" "$posts" "$replies" "$rep_conc" "$sess_r" & pid_r=$! + chunked_forwards "$ch" "$posts" "$forwards" "$f_scan" "$f_conc" "$f_chunk" "$sess_f" & pid_f=$! + + # Wait for completion and then analyze with emoji handling + wait $pid_r + wait $pid_f + analyze_emoji "$posts" "$replies" "$TAGS_CFG" "$FIXTURES_CSV" "$ch" keep +} +``` + +### Usage + +```zsh +# Use project defaults +fast_replies +chunked_forwards +analyze_combined + +# Override on the fly (channel, files, or tuning) +fast_replies "https://t.me/AnotherChannel" data/other_posts.csv data/other_replies.csv 12 +chunked_forwards "$CH" "$POSTS_CSV" data/alt_forwards.csv 30000 12 2000 +analyze_combined data/other_posts.csv data/other_replies.csv "$TAGS_CFG" "$FIXTURES_CSV" "$CH" +``` diff --git a/docs/SESSION_HISTORY.md b/docs/SESSION_HISTORY.md new file mode 100644 index 0000000..bf6776f --- /dev/null +++ b/docs/SESSION_HISTORY.md @@ -0,0 +1,50 @@ +# Session history (Oct 25, 2025) + +This document captures the key decisions, features added, and workflows established in the current development session so that future runs have quick context. + +## Highlights +- Added a new plot: `daily_volume_and_sentiment.png` showing bars for total volume (posts+replies) and lines for positive% and negative% per day. +- Improved daily activity chart with in-plot match labels (team abbreviations), density controls, and dynamic width/height. +- Implemented matchday sentiment rollups and plots: `matchday_sentiment_overall.csv/.png`, `matchday_posts_volume_vs_sentiment.png`. 
+- Integrated multiple sentiment backends: + - VADER (default) + - Transformers (local model at `models/sentiment-distilbert`) + - Local GPT via Ollama (JSON {label, confidence} mapped to compound) with graceful fallback to VADER +- Labeled data workflow: + - `src/apply_labels.py` merges labels back into posts/replies as `sentiment_label` + - Analyzer reuses `sentiment_label` when present + - `src/plot_labeled.py` provides QA plots +- Convenience: created `run_all` alias to run from scratch (scrape → replies → fixtures → analyze) non-interactively. + +## Key files and outputs +- Code + - `src/analyze_csv.py` — analyzer with plots and matchday integration (now with module docstring) + - `src/gpt_sentiment.py`, `src/transformer_sentiment.py`, `src/auto_label_sentiment.py`, `src/apply_labels.py`, `src/plot_labeled.py` + - `scripts/aliases.zsh` — includes `run_all`, `apply_labels_and_analyze`, and more +- Outputs (examples) + - `data/daily_activity_stacked.png` + - `data/daily_volume_and_sentiment.png` + - `data/posts_heatmap_hour_dow.png` + - `data/sentiment_by_tag_posts.png` + - `data/matchday_sentiment_overall.csv/.png` + - `data/matchday_posts_volume_vs_sentiment.png` + +## Important flags (analyze) +- Sizing: `--plot-width-scale`, `--plot-max-width`, `--plot-height` +- Labels: `--activity-top-n`, `--labels-max-per-day`, `--labels-per-line`, `--labels-stagger-rows`, `--labels-band-y`, `--labels-annotate-mode` +- Sentiment backends: `--sentiment-backend vader|transformers|gpt`, plus `--transformers-model` or `--gpt-model`/`--gpt-base-url` +- Emoji: `--emoji-mode keep|demojize|strip` and `--emoji-boost` + +## Aliases summary +- `run_all [CH] [START] [END] [POSTS] [REPLIES] [FIXTURES] [TAGS] [SESS_SCRAPE] [SESS_REPLIES] [CONC] [BACKEND] [MODEL] [GPT_MODEL] [GPT_URL]` + - Full pipeline non-interactive, defaults set in `scripts/aliases.zsh` +- `apply_labels_and_analyze [LABELED_CSV] [POSTS_IN] [REPLIES_IN] [POSTS_OUT] [REPLIES_OUT]` +- `analyze_transformers`, 
`analyze_emoji`, `analyze_combined`, `fast_replies`, `chunked_forwards`, `plot_labeled` + +## Old vs New outputs +- We maintain side-by-side outputs under `data/old` and `data/new` when running legacy vs labeled pipelines. + +## Next ideas +- Per-club matchday sentiment breakdowns (fixture-level small multiples) +- Side-by-side montage generation for old vs new plots + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..598fcee --- /dev/null +++ b/requirements.txt @@ -0,0 +1,19 @@ +numpy +pandas +scikit-learn +matplotlib +seaborn +jupyter +telethon +python-dotenv +tabulate +requests +pyyaml +vaderSentiment +emoji>=2.8.0 +transformers>=4.44.0 +# Torch pinning: use 2.3.1 on Python <3.13 (known-good on macOS), and a compatible newer torch on Python >=3.13 +torch==2.3.1; python_version < "3.13" +torch>=2.7.1; python_version >= "3.13" +datasets>=2.20.0 +accelerate>=0.26.0 \ No newline at end of file diff --git a/run_scraper.sh b/run_scraper.sh new file mode 100755 index 0000000..9e08ab9 --- /dev/null +++ b/run_scraper.sh @@ -0,0 +1,278 @@ +#!/usr/bin/env zsh + + +# A convenience script to set up venv, install deps, create/load .env, and run tools: +# - Telegram scraper: scrape | replies | forwards +# - Analyzer: analyze (report + sentiment + tags) +# - Fixtures: fixtures (Premier League schedule) +set -euo pipefail + +# Change to script directory (handles spaces in path) +cd "${0:A:h}" + +PROJECT_ROOT=$(pwd) +PYTHON="${PROJECT_ROOT}/.venv/bin/python" +PIP="${PROJECT_ROOT}/.venv/bin/pip" +REQUIREMENTS_FILE="${PROJECT_ROOT}/requirements.txt" +SCRAPER_MODULE="src.telegram_scraper" +ANALYZE_MODULE="src.analyze_csv" +FIXTURES_MODULE="src.fetch_schedule" + +usage() { + cat <<'EOF' +Usage: + ./run_scraper.sh scrape -c -o [--limit N] [--start-date YYYY-MM-DD] [--end-date YYYY-MM-DD] [--phone ] [--append] + ./run_scraper.sh replies -c (--ids "1,2,3" | --from-csv ) -o [--append] [--min-replies N] [--concurrency K] [--resume] + ./run_scraper.sh 
forwards -c <channel> (--ids "1,2,3" | --from-csv <posts.csv>) -o <output.csv> [--start-date YYYY-MM-DD] [--end-date YYYY-MM-DD] [--scan-limit N] [--append] [--concurrency K] [--chunk-size M]
+  ./run_scraper.sh analyze -i <input.csv> [-o <report.md>] [--channel @handle] [--tags-config config/tags.yaml] [--replies-csv <replies.csv>] [--fixtures-csv <fixtures.csv>] [--write-augmented-csv] [--write-combined-csv] [--emoji-mode keep|demojize|strip] [--emoji-boost] [--save-plots] [--sentiment-backend vader|transformers] [--transformers-model <name-or-path>] [--export-transformers-details]
+    [--plot-width-scale <float>] [--plot-max-width <inches>] [--plot-height <inches>] [--activity-top-n <N>] \
+    [--labels-max-per-day <N>] [--labels-per-line <N>] [--labels-band-y <float>] [--labels-stagger-rows <N>] [--labels-annotate-mode ticks|all|ticks+top]
+    [--sentiment-backend gpt] [--gpt-model <name>] [--gpt-base-url <url>] [--gpt-batch-size <K>]
+  ./run_scraper.sh fixtures --start-date YYYY-MM-DD --end-date YYYY-MM-DD -o <output.csv|.json>
+
+Examples:
+  ./run_scraper.sh scrape -c @python -o data.jsonl --limit 200
+  ./run_scraper.sh scrape -c https://t.me/python -o data.csv --start-date 2025-01-01 --end-date 2025-03-31
+  ./run_scraper.sh replies -c @python --from-csv data/messages.csv -o data/replies.csv
+  ./run_scraper.sh forwards -c @python --from-csv data/messages.csv -o data/forwards.csv --start-date 2025-01-01 --end-date 2025-03-31 --scan-limit 20000
+  ./run_scraper.sh analyze -i data/messages.csv --channel @python --tags-config config/tags.yaml --replies-csv data/replies.csv --fixtures-csv data/fixtures.csv --write-augmented-csv
+  ./run_scraper.sh analyze -i data/messages.csv --sentiment-backend transformers --transformers-model distilbert-base-uncased --export-transformers-details --write-augmented-csv --write-combined-csv
+  ./run_scraper.sh fixtures --start-date 2025-08-15 --end-date 2025-10-15 -o data/pl_fixtures.csv
+
+Notes:
+- If .env is missing, you'll be prompted to create it when needed (Telegram or fixtures commands).
+- First Telegram login will prompt for phone, code, and optionally 2FA password. 
+EOF +} + +# Subcommand parsing +if [[ $# -lt 1 ]]; then + usage; exit 1 +fi +COMMAND="$1"; shift || true + +# Common and per-command args +CHANNEL=""; OUTPUT=""; LIMIT=""; OFFSET_DATE=""; PHONE=""; START_DATE=""; END_DATE=""; APPEND=false; SESSION_NAME="" +IDS=""; FROM_CSV=""; SCAN_LIMIT="" +INPUT_CSV=""; REPORT_OUT=""; CHANNEL_NAME=""; TAGS_CONFIG=""; REPLIES_CSV=""; FIXTURES_CSV=""; WRITE_AUG=false; WRITE_COMBINED=false; EMOJI_MODE=""; EMOJI_BOOST=false; SAVE_PLOTS=false; SENTIMENT_BACKEND=""; TRANSFORMERS_MODEL=""; EXPORT_TRANSFORMERS_DETAILS=false; PLOT_WIDTH_SCALE=""; PLOT_MAX_WIDTH=""; PLOT_HEIGHT=""; ACTIVITY_TOP_N=""; LABELS_MAX_PER_DAY=""; LABELS_PER_LINE=""; LABELS_BAND_Y=""; LABELS_STAGGER_ROWS=""; LABELS_ANNOTATE_MODE=""; GPT_MODEL=""; GPT_BASE_URL=""; GPT_BATCH_SIZE="" + +case "$COMMAND" in + scrape|replies|forwards) + while [[ $# -gt 0 ]]; do + case "$1" in + -c|--channel) CHANNEL="$2"; shift 2;; + -o|--output) OUTPUT="$2"; shift 2;; + --session-name) SESSION_NAME="$2"; shift 2;; + --limit) LIMIT="$2"; shift 2;; + --offset-date) OFFSET_DATE="$2"; shift 2;; + --start-date) START_DATE="$2"; shift 2;; + --end-date) END_DATE="$2"; shift 2;; + --scan-limit) SCAN_LIMIT="$2"; shift 2;; + --ids) IDS="$2"; shift 2;; + --from-csv) FROM_CSV="$2"; shift 2;; + --phone) PHONE="$2"; shift 2;; + --append) APPEND=true; shift;; + --min-replies) MIN_REPLIES="$2"; shift 2;; + --concurrency) CONCURRENCY="$2"; shift 2;; + --chunk-size) CHUNK_SIZE="$2"; shift 2;; + --resume) RESUME=true; shift;; + -h|--help) usage; exit 0;; + *) echo "Unknown arg: $1"; usage; exit 1;; + esac + done + ;; + analyze) + while [[ $# -gt 0 ]]; do + case "$1" in + -i|--input) INPUT_CSV="$2"; shift 2;; + -o|--output) REPORT_OUT="$2"; shift 2;; + --channel) CHANNEL_NAME="$2"; shift 2;; + --tags-config) TAGS_CONFIG="$2"; shift 2;; + --replies-csv) REPLIES_CSV="$2"; shift 2;; + --fixtures-csv) FIXTURES_CSV="$2"; shift 2;; + --write-augmented-csv) WRITE_AUG=true; shift;; + --write-combined-csv) 
WRITE_COMBINED=true; shift;; + --emoji-mode) EMOJI_MODE="$2"; shift 2;; + --emoji-boost) EMOJI_BOOST=true; shift;; + --save-plots) SAVE_PLOTS=true; shift;; + --sentiment-backend) SENTIMENT_BACKEND="$2"; shift 2;; + --transformers-model) TRANSFORMERS_MODEL="$2"; shift 2;; + --export-transformers-details) EXPORT_TRANSFORMERS_DETAILS=true; shift;; + --gpt-model) GPT_MODEL="$2"; shift 2;; + --gpt-base-url) GPT_BASE_URL="$2"; shift 2;; + --gpt-batch-size) GPT_BATCH_SIZE="$2"; shift 2;; + --plot-width-scale) PLOT_WIDTH_SCALE="$2"; shift 2;; + --plot-max-width) PLOT_MAX_WIDTH="$2"; shift 2;; + --plot-height) PLOT_HEIGHT="$2"; shift 2;; + --activity-top-n) ACTIVITY_TOP_N="$2"; shift 2;; + --labels-max-per-day) LABELS_MAX_PER_DAY="$2"; shift 2;; + --labels-per-line) LABELS_PER_LINE="$2"; shift 2;; + --labels-band-y) LABELS_BAND_Y="$2"; shift 2;; + --labels-stagger-rows) LABELS_STAGGER_ROWS="$2"; shift 2;; + --labels-annotate-mode) LABELS_ANNOTATE_MODE="$2"; shift 2;; + -h|--help) usage; exit 0;; + *) echo "Unknown arg: $1"; usage; exit 1;; + esac + done + # Defaults: always use local fine-tuned transformers model if not specified + if [[ -z "$SENTIMENT_BACKEND" ]]; then SENTIMENT_BACKEND="transformers"; fi + if [[ -z "$TRANSFORMERS_MODEL" ]]; then TRANSFORMERS_MODEL="models/sentiment-distilbert"; fi + ;; + fixtures) + while [[ $# -gt 0 ]]; do + case "$1" in + --start-date) START_DATE="$2"; shift 2;; + --end-date) END_DATE="$2"; shift 2;; + -o|--output) OUTPUT="$2"; shift 2;; + -h|--help) usage; exit 0;; + *) echo "Unknown arg: $1"; usage; exit 1;; + esac + done + ;; + -h|--help) + usage; exit 0;; + *) + echo "Unknown command: $COMMAND"; usage; exit 1;; +esac + +# Required args validation +if [[ "$COMMAND" == "scrape" ]]; then + if [[ -z "$CHANNEL" || -z "$OUTPUT" ]]; then echo "Error: scrape needs --channel and --output"; usage; exit 1; fi +elif [[ "$COMMAND" == "replies" || "$COMMAND" == "forwards" ]]; then + if [[ -z "$CHANNEL" || -z "$OUTPUT" ]]; then echo "Error: 
$COMMAND needs --channel and --output"; usage; exit 1; fi + if [[ -z "$IDS" && -z "$FROM_CSV" ]]; then echo "Error: $COMMAND needs --ids or --from-csv"; usage; exit 1; fi +elif [[ "$COMMAND" == "analyze" ]]; then + if [[ -z "$INPUT_CSV" ]]; then echo "Error: analyze needs --input"; usage; exit 1; fi +elif [[ "$COMMAND" == "fixtures" ]]; then + if [[ -z "$START_DATE" || -z "$END_DATE" || -z "$OUTPUT" ]]; then echo "Error: fixtures needs --start-date, --end-date, and --output"; usage; exit 1; fi +fi + +echo "[1/4] Ensuring virtual environment..." +if [[ ! -x "$PYTHON" ]]; then + echo "Creating virtual environment at .venv" + python3 -m venv .venv +fi + +echo "Activating virtual environment" +source .venv/bin/activate + +echo "[2/4] Installing dependencies" +"$PIP" install -q --upgrade pip +"$PIP" install -q -r "$REQUIREMENTS_FILE" + +echo "[3/4] Environment setup" +NEEDS_TELEGRAM=false +NEEDS_FIXTURES_TOKEN=false +if [[ "$COMMAND" == "scrape" || "$COMMAND" == "replies" || "$COMMAND" == "forwards" ]]; then NEEDS_TELEGRAM=true; fi +if [[ "$COMMAND" == "fixtures" ]]; then NEEDS_FIXTURES_TOKEN=true; fi + +if [[ "$NEEDS_TELEGRAM" == true || "$NEEDS_FIXTURES_TOKEN" == true ]]; then + if [[ ! -f .env ]]; then + echo ".env not found. Let's create one now." 
+ if [[ "$NEEDS_TELEGRAM" == true ]]; then + print -n "Enter TELEGRAM_API_ID (from my.telegram.org): " + read -r TELEGRAM_API_ID + print -n "Enter TELEGRAM_API_HASH (from my.telegram.org): " + read -r TELEGRAM_API_HASH + : ${TELEGRAM_SESSION_NAME:=telegram} + fi + cat > .env < replies -> fixtures -> analyze +# Requirements: +# - .env has TELEGRAM_API_ID and TELEGRAM_API_HASH (and TELEGRAM_2FA_PASSWORD if 2FA is enabled) +# - CH/POSTS_CSV/REPLIES_CSV/FIXTURES_CSV/TAGS_CFG are set (defaults are defined above) +# - Provide optional start/end dates; defaults use FIXTURES_START_DATE/FIXTURES_END_DATE +# - Choose sentiment backend via arg 11: vader | transformers | gpt (default: transformers) +run_all() { + local ch="${1:-$CH}" + local start="${2:-$FIXTURES_START_DATE}" + local end="${3:-$FIXTURES_END_DATE}" + local posts="${4:-$POSTS_CSV}" + local replies="${5:-$REPLIES_CSV}" + local fixtures="${6:-$FIXTURES_CSV}" + local tags="${7:-$TAGS_CFG}" + local sess_scrape="${8:-$SESSION_DIR/telegram_scrape}" + local sess_replies="${9:-$SESSION_DIR/telegram_replies}" + local rep_conc="${10:-15}" + local backend="${11:-transformers}" # vader | transformers | gpt + local model="${12:-models/sentiment-distilbert}" + local gpt_model="${13:-llama3}" + local gpt_url="${14:-http://localhost:11434}" + + # 1) Scrape posts (overwrite) + ./run_scraper.sh scrape \ + -c "$ch" \ + -o "$posts" \ + --start-date "$start" \ + --end-date "$end" \ + --session-name "$sess_scrape" + + # 2) Fetch replies (resume+append safe) + ./run_scraper.sh replies \ + -c "$ch" \ + --from-csv "$posts" \ + -o "$replies" \ + --min-replies 1 \ + --concurrency "$rep_conc" \ + --resume \ + --append \ + --session-name "$sess_replies" + + # 3) Fetch fixtures for the same period + ./run_scraper.sh fixtures \ + --start-date "$start" \ + --end-date "$end" \ + -o "$fixtures" + + # 4) Analyze with plots (non-interactive) + local args=( + -i "$posts" + --tags-config "$tags" + --replies-csv "$replies" + --fixtures-csv 
"$fixtures" + --write-augmented-csv + --write-combined-csv + --emoji-mode keep + --emoji-boost + --save-plots + --plot-width-scale 0.8 + --plot-max-width 120 + --plot-height 8 + --activity-top-n 8 + --labels-stagger-rows 3 + ) + if [[ "$backend" == "transformers" ]]; then + args+=( --sentiment-backend transformers --transformers-model "$model" --export-transformers-details ) + elif [[ "$backend" == "gpt" ]]; then + args+=( --sentiment-backend gpt --gpt-model "$gpt_model" --gpt-base-url "$gpt_url" ) + else + args+=( --sentiment-backend vader ) + fi + + ./run_scraper.sh analyze "${args[@]}" +} diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..82789f2 --- /dev/null +++ b/src/__init__.py @@ -0,0 +1 @@ +# This file is intentionally left blank. \ No newline at end of file diff --git a/src/analyze_csv.py b/src/analyze_csv.py new file mode 100644 index 0000000..ce58e72 --- /dev/null +++ b/src/analyze_csv.py @@ -0,0 +1,1313 @@ +""" +analyze_csv +============ + +Generates a Markdown report and optional plots from a Telegram posts CSV (and an optional replies CSV). 
+ +Features +-------- +- Tagging from YAML keywords (config/tags.yaml) +- Sentiment via VADER (default), a local transformers model, or a local GPT (Ollama) +- Emoji-aware preprocessing with optional positivity/negativity boost +- Optional fixtures join to mark matchdays; compact team abbreviation labels inside daily charts +- Combined posts+replies augmented outputs and a merged CSV + +Key CLI flags +------------- +- --sentiment-backend vader|transformers|gpt +- --transformers-model NAME_OR_PATH +- --gpt-model NAME --gpt-base-url URL --gpt-batch-size K +- --emoji-mode keep|demojize|strip [--emoji-boost] +- --plot-width-scale FLOAT --plot-max-width INCHES --plot-height INCHES +- --activity-top-n N +- --labels-max-per-day N --labels-per-line N --labels-stagger-rows N --labels-band-y FLOAT --labels-annotate-mode ticks|all|ticks+top + +Plots (when --save-plots) +------------------------- +- posts_heatmap_hour_dow.png +- sentiment_by_tag_posts.png +- daily_activity_stacked.png +- daily_volume_and_sentiment.png (bars: volume; lines: positive% and negative%) +- matchday_sentiment_overall.png +- matchday_posts_volume_vs_sentiment.png +""" + +import argparse +import os +import re +from datetime import datetime +from typing import List, Optional, Tuple + +import pandas as pd +import yaml +from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer +import emoji as _emoji + + +def load_csv(path: str) -> pd.DataFrame: + df = pd.read_csv(path) + # Normalize columns we expect from the scraper + # Columns: id,date,message,sender_id,views,forwards,replies,url + # Parse date to datetime (naive) + if 'date' in df.columns: + df['date'] = pd.to_datetime(df['date'], errors='coerce') + for col in ['views', 'forwards', 'replies', 'sender_id']: + if col in df.columns: + df[col] = pd.to_numeric(df[col], errors='coerce') + # Basic cleaning + if 'message' in df.columns: + df['message'] = df['message'].fillna('') + return df + + +def summarize(df: pd.DataFrame) -> dict: + total = 
len(df) + with_text = int((df.get('message', pd.Series(dtype=str)) != '').sum()) if 'message' in df else 0 + no_text = total - with_text + views_mean = float(df['views'].mean()) if 'views' in df and not df['views'].empty else 0.0 + views_median = float(df['views'].median()) if 'views' in df and not df['views'].empty else 0.0 + forwards_mean = float(df['forwards'].mean()) if 'forwards' in df and not df['forwards'].empty else 0.0 + replies_mean = float(df['replies'].mean()) if 'replies' in df and not df['replies'].empty else 0.0 + + first_date = df['date'].min() if 'date' in df else None + last_date = df['date'].max() if 'date' in df else None + + return { + 'total_messages': total, + 'with_text': with_text, + 'no_text': no_text, + 'views_mean': views_mean, + 'views_median': views_median, + 'forwards_mean': forwards_mean, + 'replies_mean': replies_mean, + 'first_date': first_date, + 'last_date': last_date, + } + + +def top_messages(df: pd.DataFrame, by: str, k: int = 10) -> pd.DataFrame: + if by not in df.columns: + return pd.DataFrame() + return df.sort_values(by=by, ascending=False).head(k)[['id', 'date', 'message', by, 'url']] + + +def temporal_distributions(df: pd.DataFrame) -> dict: + if 'date' not in df: + return {} + out = {} + d = df.dropna(subset=['date']).copy() + d['day'] = d['date'].dt.date + d['hour'] = d['date'].dt.hour + out['per_day'] = d.groupby('day').size().reset_index(name='count') + out['per_hour'] = d.groupby('hour').size().reset_index(name='count') + return out + + +def write_markdown_report( + df: pd.DataFrame, + out_path: str, + channel: Optional[str] = None, + replies_df: Optional[pd.DataFrame] = None, +): + summ = summarize(df) + tops_views = top_messages(df, 'views', 10) + tops_forwards = top_messages(df, 'forwards', 10) + tops_replies = top_messages(df, 'replies', 10) + temps = temporal_distributions(df) + + lines = [] + title = f"Telegram Channel Report{f' - {channel}' if channel else ''}" + lines.append(f"# {title}") + lines.append("") 
+ lines.append("## Summary") + lines.append("") + lines.append(f"- Total messages: {summ['total_messages']}") + lines.append(f"- With text: {summ['with_text']}") + lines.append(f"- Without text: {summ['no_text']}") + lines.append(f"- Views (mean/median): {summ['views_mean']:.1f} / {summ['views_median']:.1f}") + lines.append(f"- Forwards (mean): {summ['forwards_mean']:.2f}") + lines.append(f"- Replies (mean): {summ['replies_mean']:.2f}") + if summ['first_date'] is not None and summ['last_date'] is not None: + lines.append(f"- Date range: {summ['first_date']} — {summ['last_date']}") + + # Sentiment summary if available + if 'sentiment_compound' in df.columns: + lines.append("\n### Sentiment summary") + sent = df['sentiment_compound'].dropna() + if not sent.empty: + lines.append(f"- Mean compound: {sent.mean():.3f}") + lines.append(f"- Median compound: {sent.median():.3f}") + pos_share = (sent > 0.05).mean() + neg_share = (sent < -0.05).mean() + neu_share = max(0.0, 1.0 - pos_share - neg_share) + lines.append(f"- Share positive (compound > 0.05): {pos_share:.2%}") + lines.append(f"- Share neutral (|compound| ≤ 0.05): {neu_share:.2%}") + lines.append(f"- Share negative (compound < -0.05): {neg_share:.2%}") + + def table(df_small: pd.DataFrame, caption: str) -> None: + if df_small is None or df_small.empty: + lines.append(f"\n### {caption}\n\n_No data_\n") + return + lines.append(f"\n### {caption}\n") + # Limit message preview to first 120 chars + df_disp = df_small.copy() + if 'message' in df_disp.columns: + df_disp['message'] = df_disp['message'].astype(str).str.replace("\n", " ").str.slice(0, 120) + lines.append(df_disp.to_markdown(index=False)) + + table(tops_views, "Top 10 posts by views") + table(tops_forwards, "Top 10 posts by forwards") + table(tops_replies, "Top 10 posts by replies (channel field)") + + # If we computed scraped reply counts, include that ranking + if 'replies_count_scraped' in df.columns: + cols = ['id', 'date', 'message', 
'replies_count_scraped'] + if 'replies_top_tags' in df.columns: + cols.append('replies_top_tags') + if 'url' in df.columns: + cols.append('url') + top_scraped = df.sort_values('replies_count_scraped', ascending=False).head(10)[cols] + lines.append("\n### Top 10 posts by scraped reply count") + df_disp = top_scraped.copy() + if 'message' in df_disp.columns: + df_disp['message'] = df_disp['message'].astype(str).str.replace("\n", " ").str.slice(0, 120) + lines.append(df_disp.to_markdown(index=False)) + + # Temporal distributions + if temps: + lines.append("\n## Temporal distribution") + if 'per_day' in temps and not temps['per_day'].empty: + lines.append("\n### Messages per day") + lines.append(temps['per_day'].to_markdown(index=False)) + if 'per_hour' in temps and not temps['per_hour'].empty: + lines.append("\n### Messages per hour (0-23)") + lines.append(temps['per_hour'].to_markdown(index=False)) + + # Per-tag engagement (if tags exist) + if 'tags' in df.columns: + tagged = df.copy() + # Normalize tags column to list + tagged['tags'] = tagged['tags'].apply(lambda x: x if isinstance(x, list) else ([] if pd.isna(x) else [x])) + exploded = tagged.explode('tags') + exploded = exploded[exploded['tags'].notna() & (exploded['tags'] != '')] + if not exploded.empty: + grp = ( + exploded.groupby('tags') + .agg( + count=('id', 'count'), + views_mean=('views', 'mean'), + views_median=('views', 'median'), + replies_mean=('replies', 'mean'), + forwards_mean=('forwards', 'mean'), + sentiment_mean=('sentiment_compound', 'mean') if 'sentiment_compound' in exploded.columns else ('id','count') + ) + .reset_index() + .sort_values(['count', 'views_mean'], ascending=[False, False]) + ) + lines.append("\n## Per-tag engagement") + lines.append(grp.to_markdown(index=False)) + + # Per-tag sentiment breakdown for posts + if 'sentiment_compound' in exploded.columns: + s = exploded[['tags', 'sentiment_compound']].dropna() + if not s.empty: + s['is_pos'] = s['sentiment_compound'] > 0.05 + 
s['is_neg'] = s['sentiment_compound'] < -0.05 + sgrp = ( + s.groupby('tags') + .agg( + n=('sentiment_compound', 'count'), + mean=('sentiment_compound', 'mean'), + median=('sentiment_compound', 'median'), + pos_share=('is_pos', 'mean'), + neg_share=('is_neg', 'mean'), + ) + .reset_index() + .sort_values(['n', 'mean'], ascending=[False, False]) + ) + # Derive neutral share as residual + sgrp['neu_share'] = (1 - sgrp['pos_share'] - sgrp['neg_share']).clip(lower=0) + # Reorder columns for readability + cols = ['tags', 'n', 'mean', 'median', 'pos_share', 'neu_share', 'neg_share'] + sgrp = sgrp[[c for c in cols if c in sgrp.columns]] + lines.append("\n### Per-tag sentiment (posts)") + lines.append(sgrp.to_markdown(index=False)) + + # Replies per-tag summary (if provided and tagged) + if replies_df is not None and 'tags' in replies_df.columns: + rtagged = replies_df.copy() + rtagged['tags'] = rtagged['tags'].apply(lambda x: x if isinstance(x, list) else ([] if pd.isna(x) else [x])) + rexpl = rtagged.explode('tags') + rexpl = rexpl[rexpl['tags'].notna() & (rexpl['tags'] != '')] + if not rexpl.empty: + rgrp = ( + rexpl.groupby('tags') + .agg( + replies_count=('id', 'count'), + replies_sentiment_mean=('sentiment_compound', 'mean') if 'sentiment_compound' in rexpl.columns else ('id','count'), + ) + .reset_index() + .sort_values(['replies_count'], ascending=[False]) + ) + lines.append("\n## Replies per-tag summary") + lines.append(rgrp.to_markdown(index=False)) + + # Per-tag sentiment breakdown for replies + if 'sentiment_compound' in rexpl.columns: + rs = rexpl[['tags', 'sentiment_compound']].dropna() + if not rs.empty: + rs['is_pos'] = rs['sentiment_compound'] > 0.05 + rs['is_neg'] = rs['sentiment_compound'] < -0.05 + rsgrp = ( + rs.groupby('tags') + .agg( + n=('sentiment_compound', 'count'), + mean=('sentiment_compound', 'mean'), + median=('sentiment_compound', 'median'), + pos_share=('is_pos', 'mean'), + neg_share=('is_neg', 'mean'), + ) + .reset_index() + 
.sort_values(['n', 'mean'], ascending=[False, False]) + ) + rsgrp['neu_share'] = (1 - rsgrp['pos_share'] - rsgrp['neg_share']).clip(lower=0) + cols = ['tags', 'n', 'mean', 'median', 'pos_share', 'neu_share', 'neg_share'] + rsgrp = rsgrp[[c for c in cols if c in rsgrp.columns]] + lines.append("\n### Per-tag sentiment (replies)") + lines.append(rsgrp.to_markdown(index=False)) + + # Combined sentiment (posts + replies) if replies are provided + if 'sentiment_compound' in df.columns and replies_df is not None and 'sentiment_compound' in replies_df.columns: + combined_cols = ['sentiment_compound'] + if 'tags' in df.columns or ('tags' in replies_df.columns): + combined_cols.append('tags') + posts_part = df[['sentiment_compound'] + (['tags'] if 'tags' in df.columns else [])].copy() + posts_part['content_type'] = 'post' + reps_part = replies_df[['sentiment_compound'] + (['tags'] if 'tags' in replies_df.columns else [])].copy() + reps_part['content_type'] = 'reply' + combined = pd.concat([posts_part, reps_part], ignore_index=True) + + lines.append("\n## Combined sentiment (posts + replies)") + sent_all = combined['sentiment_compound'].dropna() + if not sent_all.empty: + lines.append(f"- Mean compound: {sent_all.mean():.3f}") + lines.append(f"- Median compound: {sent_all.median():.3f}") + pos_share = (sent_all > 0.05).mean() + neg_share = (sent_all < -0.05).mean() + neu_share = max(0.0, 1.0 - pos_share - neg_share) + lines.append(f"- Share positive (compound > 0.05): {pos_share:.2%}") + lines.append(f"- Share neutral (|compound| ≤ 0.05): {neu_share:.2%}") + lines.append(f"- Share negative (compound < -0.05): {neg_share:.2%}") + + # Per-tag combined sentiment if tags exist + if 'tags' in combined.columns: + ctag = combined.copy() + ctag['tags'] = ctag['tags'].apply(lambda x: x if isinstance(x, list) else ([] if pd.isna(x) else [x])) + cexpl = ctag.explode('tags') + cexpl = cexpl[cexpl['tags'].notna() & (cexpl['tags'] != '')] + if not cexpl.empty: + cexpl['is_pos'] = 
cexpl['sentiment_compound'] > 0.05 + cexpl['is_neg'] = cexpl['sentiment_compound'] < -0.05 + cgrp = ( + cexpl.groupby('tags') + .agg( + n=('sentiment_compound', 'count'), + mean=('sentiment_compound', 'mean'), + median=('sentiment_compound', 'median'), + pos_share=('is_pos', 'mean'), + neg_share=('is_neg', 'mean'), + ) + .reset_index() + .sort_values(['n', 'mean'], ascending=[False, False]) + ) + cgrp['neu_share'] = (1 - cgrp['pos_share'] - cgrp['neg_share']).clip(lower=0) + cols = ['tags', 'n', 'mean', 'median', 'pos_share', 'neu_share', 'neg_share'] + cgrp = cgrp[[c for c in cols if c in cgrp.columns]] + lines.append("\n### Per-tag sentiment (combined posts + replies)") + lines.append(cgrp.to_markdown(index=False)) + + # Matchday cross-analysis: on vs off matchdays for posts and replies + def _matchday_table(d: pd.DataFrame, col: str = 'is_matchday') -> Optional[pd.DataFrame]: + if d is None or d.empty or col not in d.columns: + return None + t = d.copy() + t = t.dropna(subset=[col]) + if t.empty: + return None + # Sentiment shares if sentiment available + has_sent = 'sentiment_compound' in t.columns and t['sentiment_compound'].notna().any() + if has_sent: + t['is_pos'] = t['sentiment_compound'] > 0.05 + t['is_neg'] = t['sentiment_compound'] < -0.05 + agg = { + 'id': 'count' + } + if has_sent: + agg.update({ + 'sentiment_compound': 'mean', + 'is_pos': 'mean', + 'is_neg': 'mean', + }) + g = t.groupby(col).agg(agg).rename(columns={'id': 'count'}) + if has_sent: + g = g.rename(columns={'sentiment_compound': 'sentiment_mean', 'is_pos': 'pos_share', 'is_neg': 'neg_share'}) + g['neu_share'] = (1 - g['pos_share'] - g['neg_share']).clip(lower=0) + # Reorder + g = g[['count', 'sentiment_mean', 'pos_share', 'neu_share', 'neg_share']] + return g.reset_index() + + posts_md_tbl = _matchday_table(df) + replies_md_tbl_parent = _matchday_table(replies_df, col='parent_is_matchday') if (replies_df is not None and 'parent_is_matchday' in replies_df.columns) else None + 
replies_md_tbl_reply = _matchday_table(replies_df, col='is_matchday') if (replies_df is not None and 'is_matchday' in replies_df.columns) else None + if posts_md_tbl is not None or replies_md_tbl_parent is not None or replies_md_tbl_reply is not None: + lines.append("\n## Matchday cross-analysis") + if posts_md_tbl is not None: + lines.append("\n### Posts: on vs off matchdays") + lines.append(posts_md_tbl.to_markdown(index=False)) + # If per-post replies are available, show engagement breakdown + if 'replies_count_scraped' in df.columns: + tmp = df.copy() + tmp['replies_count_scraped'] = pd.to_numeric(tmp['replies_count_scraped'], errors='coerce').fillna(0) + eng = ( + tmp.groupby('is_matchday') + .agg( + posts=('id','count'), + posts_with_replies=('replies_count_scraped', lambda s: (s>0).mean()), + replies_total=('replies_count_scraped','sum'), + replies_mean_per_post=('replies_count_scraped','mean'), + replies_median_per_post=('replies_count_scraped','median'), + ) + .reset_index() + ) + lines.append("\n### Posts engagement vs matchday (replies per post)") + lines.append(eng.to_markdown(index=False)) + if replies_md_tbl_parent is not None: + lines.append("\n### Replies (by parent matchday): on vs off matchdays") + lines.append(replies_md_tbl_parent.to_markdown(index=False)) + if replies_md_tbl_reply is not None: + lines.append("\n### Replies (by reply date): on vs off matchdays") + lines.append(replies_md_tbl_reply.to_markdown(index=False)) + + with open(out_path, 'w', encoding='utf-8') as f: + f.write("\n".join(lines)) + + +def main(): + parser = argparse.ArgumentParser(description="Analyze Telegram CSV and generate a Markdown report") + parser.add_argument('csv', help='Path to CSV file exported by the scraper') + parser.add_argument('-o', '--output', default=None, help='Output Markdown path (default: alongside CSV with .md)') + parser.add_argument('--channel', default=None, help='Optional channel name for the report title') + 
parser.add_argument('--tags-config', default=None, help='Path to YAML config for keyword tags (e.g., config/tags.yaml)') + parser.add_argument('--replies-csv', default=None, help='Optional CSV of replies with parent_id and sentiment_compound to aggregate per message') + parser.add_argument('--fixtures-csv', default=None, help='Optional fixtures CSV to derive a matchday flag (matches on date)') + parser.add_argument('--write-augmented-csv', action='store_true', help='Also write a CSV with computed fields (sentiment, tags) alongside the input') + parser.add_argument('--write-combined-csv', action='store_true', help='If replies are provided, also write a merged posts+replies CSV with a content_type column') + parser.add_argument('--save-plots', action='store_true', help='Also save common plots (daily sentiment, posts heatmap, sentiment-by-tag) next to the report') + parser.add_argument('--emoji-mode', choices=['keep', 'demojize', 'strip'], default='keep', help='How to treat emojis before sentiment: keep (default), demojize to :keywords:, or strip emojis') + parser.add_argument('--emoji-boost', action='store_true', help='If set with keep/demojize, gently boost VADER for clearly positive/negative emojis') + parser.add_argument('--sentiment-backend', choices=['vader', 'transformers', 'gpt'], default='vader', help='Choose sentiment engine: vader (default), transformers, or gpt (local via Ollama)') + parser.add_argument('--transformers-model', default='distilbert-base-uncased', help='HF model name or local path for transformers backend') + parser.add_argument('--export-transformers-details', action='store_true', help='When using transformers backend, also export predicted label and raw class probabilities') + # GPT local model knobs (Ollama) + parser.add_argument('--gpt-model', default='llama3', help='Local GPT model name (Ollama)') + parser.add_argument('--gpt-base-url', default='http://localhost:11434', help='Base URL for local GPT server (Ollama)') + 
parser.add_argument('--gpt-batch-size', type=int, default=8, help='Batch size for GPT requests') + # Plot sizing controls + parser.add_argument('--plot-width-scale', type=float, default=0.8, + help='Scale factor (inches per day) for dynamic plot width of daily activity chart. Default doubled from 0.4 to 0.8.') + parser.add_argument('--plot-max-width', type=float, default=104.0, + help='Maximum figure width (inches) clamp for daily activity chart. Default doubled from 52 to 104. Override to a larger value if needed.') + parser.add_argument('--plot-height', type=float, default=6.5, + help='Figure height (inches) for bar charts. Default 6.5 inches (taller than previous 5).') + parser.add_argument('--activity-top-n', type=int, default=5, + help='Number of top-activity days to highlight and annotate. Use 0 to disable highlighting.') + # Match label rendering controls + parser.add_argument('--labels-max-per-day', type=int, default=3, + help='Maximum number of match labels to show per day before collapsing into +N more.') + parser.add_argument('--labels-per-line', type=int, default=2, + help='Number of match labels per line when stacking within the label band.') + parser.add_argument('--labels-band-y', type=float, default=0.96, + help='Vertical position of the labels band in axes coordinates (inside the axes; 1.0 is top).') + parser.add_argument('--labels-stagger-rows', type=int, default=2, + help='Number of staggered rows in the label band to reduce neighbor collisions (1-3 recommended).') + parser.add_argument('--labels-annotate-mode', choices=['ticks','all','ticks+top'], default='ticks+top', + help='Which days to annotate with match labels: only ticked days, all days, or ticked days plus top-N highlighted days (default).') + args = parser.parse_args() + + df = load_csv(args.csv) + replies_df: Optional[pd.DataFrame] = None + + # Optional tagging step + if args.tags_config and os.path.exists(args.tags_config): + with open(args.tags_config, 'r', encoding='utf-8') as f: + 
cfg = yaml.safe_load(f) or {} + + # Compile patterns list: List[(tag, List[(pattern, is_regex)])] + patterns: List[Tuple[str, List[Tuple[str, bool]]]] = [] + for tag, arr in (cfg.items() if isinstance(cfg, dict) else []): + compiled: List[Tuple[str, bool]] = [] + for pat in (arr or []): + if isinstance(pat, str) and pat.startswith('re:'): + compiled.append((pat[3:], True)) + else: + compiled.append((str(pat), False)) + patterns.append((tag, compiled)) + + def tag_message(text: str) -> List[str]: + t = text or '' + tags: List[str] = [] + for tag, pats in patterns: + for pat, is_re in pats: + if is_re: + if re.search(pat, t, flags=re.IGNORECASE): + tags.append(tag) + break + else: + if pat.lower() in t.lower(): + tags.append(tag) + break + return tags + + if 'message' in df.columns: + df['tags'] = df['message'].apply(tag_message) + + # If replies CSV provided, apply tags to replies as well + if args.replies_csv and os.path.exists(args.replies_csv): + replies_df = pd.read_csv(args.replies_csv) + if 'message' in replies_df.columns: + replies_df['message'] = replies_df['message'].fillna('') + replies_df['tags'] = replies_df['message'].apply(tag_message) + + # Sentiment scoring + analyzer = SentimentIntensityAnalyzer() + tmodel = None + gpt = None + if args.sentiment_backend == 'transformers': + try: + from .transformer_sentiment import TransformerSentiment + tmodel = TransformerSentiment(args.transformers_model) + print(f"[transformers] Using model: {args.transformers_model} on {tmodel.device}") + except Exception as e: + print(f"[transformers] Falling back to VADER due to error: {e}") + args.sentiment_backend = 'vader' + elif args.sentiment_backend == 'gpt': + try: + from .gpt_sentiment import GPTSentiment + except Exception: + from gpt_sentiment import GPTSentiment + try: + gpt = GPTSentiment(base_url=args.gpt_base_url, model=args.gpt_model) + # Light connectivity probe: do a tiny call that should fail gracefully without raising here + print(f"[gpt] Using local GPT 
model: {args.gpt_model} at {args.gpt_base_url}") + except Exception as e: + print(f"[gpt] Falling back to VADER (init error): {e}") + args.sentiment_backend = 'vader' + + def _strip_emojis(text: str) -> str: + # Remove all emoji code points + return _emoji.replace_emoji(text or '', replace='') + + def _demojize(text: str) -> str: + return _emoji.demojize(text or '', delimiters=(":", ":")) + + # Simple emoji valence hints for boosting + POS_EMOJI_HINTS = {"😀", "😃", "😄", "😁", "😆", "😊", "🙂", "😍", "🥳", "👍", "🔥", "👏", "💯", "😺", "🤩", "🙌", "🫶", "⚽️", "🏆"} + NEG_EMOJI_HINTS = {"😞", "😟", "😠", "😡", "😢", "😭", "👎", "💔", "🤬", "🤢", "😫", "😩"} + + def _emoji_valence_boost(text: str, base: float) -> float: + if not args.emoji_boost: + return base + # Look at original text to preserve emoji presence regardless of preprocessing + pos_hits = any(ch in POS_EMOJI_HINTS for ch in text) + neg_hits = any(ch in NEG_EMOJI_HINTS for ch in text) + boost = 0.0 + if pos_hits and not neg_hits: + boost = 0.05 + elif neg_hits and not pos_hits: + boost = -0.05 + # Clamp to VADER range [-1, 1] + return max(-1.0, min(1.0, base + boost)) + + def _prep_for_sentiment(text: str) -> str: + if args.emoji_mode == 'strip': + return _strip_emojis(text or '') + if args.emoji_mode == 'demojize': + return _demojize(text or '') + return text or '' + + if 'message' in df.columns: + def _score_msg(t: str) -> float: + raw = t or '' + if args.sentiment_backend == 'transformers' and tmodel is not None: + # Use transformer model in batches later + return None # placeholder, fill after batch + if args.sentiment_backend == 'gpt' and gpt is not None: + return None + proc = _prep_for_sentiment(raw) + score = analyzer.polarity_scores(proc).get('compound') + return _emoji_valence_boost(raw, score) + df['sentiment_compound'] = df['message'].apply(_score_msg) + # Ensure replies have sentiment if present and missing + if replies_df is not None: + if 'message' in replies_df.columns and 'sentiment_compound' not in 
replies_df.columns: + def _score_rep(t: str) -> float: + raw = t or '' + if args.sentiment_backend == 'transformers' and tmodel is not None: + return None + if args.sentiment_backend == 'gpt' and gpt is not None: + return None + proc = _prep_for_sentiment(raw) + score = analyzer.polarity_scores(proc).get('compound') + return _emoji_valence_boost(raw, score) + replies_df['sentiment_compound'] = replies_df['message'].apply(_score_rep) + + # If transformers backend was selected, fill in sentiment_compound in batches + if args.sentiment_backend == 'transformers' and tmodel is not None: + if 'message' in df.columns: + mask = df['sentiment_compound'].isna() + texts = df.loc[mask, 'message'].astype(str).tolist() + if texts: + preds = tmodel.predict_compound_batch(texts, batch_size=32) + df.loc[mask, 'sentiment_compound'] = preds + if args.export_transformers_details: + # Re-run to get probabilities and labels + from .transformer_sentiment import TransformerSentiment + probs, labels = tmodel.predict_probs_and_labels(texts, batch_size=32) + df.loc[mask, 'sentiment_label'] = labels + df.loc[mask, 'sentiment_probs'] = [','.join(f"{p:.6f}" for p in row) for row in probs] + if replies_df is not None and 'message' in replies_df.columns and 'sentiment_compound' in replies_df.columns: + rmask = replies_df['sentiment_compound'].isna() + rtexts = replies_df.loc[rmask, 'message'].astype(str).tolist() + if rtexts: + rpreds = tmodel.predict_compound_batch(rtexts, batch_size=64) + replies_df.loc[rmask, 'sentiment_compound'] = rpreds + if args.export_transformers_details: + probs, labels = tmodel.predict_probs_and_labels(rtexts, batch_size=64) + replies_df.loc[rmask, 'sentiment_label'] = labels + replies_df.loc[rmask, 'sentiment_probs'] = [','.join(f"{p:.6f}" for p in row) for row in probs] + elif args.sentiment_backend == 'gpt' and gpt is not None: + def _vader_compounds_for(texts: List[str]) -> List[float]: + out_vals: List[float] = [] + for raw in texts: + proc = 
_prep_for_sentiment(raw) + sc = analyzer.polarity_scores(proc).get('compound') + out_vals.append(_emoji_valence_boost(raw, sc)) + return out_vals + # Fill posts sentiment via local GPT + if 'message' in df.columns: + mask = df['sentiment_compound'].isna() + texts = df.loc[mask, 'message'].astype(str).tolist() + if texts: + try: + preds = gpt.predict_compound_batch(texts, batch_size=int(getattr(args, 'gpt_batch_size', 8))) + df.loc[mask, 'sentiment_compound'] = preds + except Exception as e: + print(f"[gpt] Prediction error; falling back to VADER for remaining rows: {e}") + preds = _vader_compounds_for(texts) + df.loc[mask, 'sentiment_compound'] = preds + if replies_df is not None and 'message' in replies_df.columns and 'sentiment_compound' in replies_df.columns: + rmask = replies_df['sentiment_compound'].isna() + rtexts = replies_df.loc[rmask, 'message'].astype(str).tolist() + if rtexts: + try: + rpreds = gpt.predict_compound_batch(rtexts, batch_size=int(getattr(args, 'gpt_batch_size', 8))) + replies_df.loc[rmask, 'sentiment_compound'] = rpreds + except Exception as e: + print(f"[gpt] Replies prediction error; falling back to VADER for remaining rows: {e}") + rpreds = _vader_compounds_for(rtexts) + replies_df.loc[rmask, 'sentiment_compound'] = rpreds + + # Optional: aggregate replies sentiment per parent and join + if replies_df is not None and 'parent_id' in replies_df.columns and 'sentiment_compound' in replies_df.columns: + agg = replies_df.groupby('parent_id')['sentiment_compound'].mean().reset_index().rename(columns={'sentiment_compound':'replies_sentiment_mean'}) + if 'id' in df.columns: + df = df.merge(agg, how='left', left_on='id', right_on='parent_id').drop(columns=['parent_id']) + + # Optional: matchday flag by joining on date with fixtures (same day) for posts and replies + fixtures_present = bool(args.fixtures_csv and os.path.exists(args.fixtures_csv)) + matchdays = None + fixtures_by_day = None # map: date -> ["Home vs Away" or "Home X-Y Away"] + if 
fixtures_present: + fix = pd.read_csv(args.fixtures_csv) + if 'utcDate' in fix.columns: + fix['utcDate'] = pd.to_datetime(fix['utcDate'], errors='coerce') + fix['match_day'] = fix['utcDate'].dt.date + matchdays = fix[['match_day']].dropna().drop_duplicates() + # Build per-day match labels + try: + # Map full club names to standard PL 3-letter abbreviations + PL_ABBR = { + 'arsenal': 'ARS', + 'astonvilla': 'AVL', + 'bournemouth': 'BOU', + 'brentford': 'BRE', + 'brightonandhovealbion': 'BHA', + 'chelsea': 'CHE', + 'crystalpalace': 'CRY', + 'everton': 'EVE', + 'fulham': 'FUL', + 'ipswichtown': 'IPS', + 'leicestercity': 'LEI', + 'liverpool': 'LIV', + 'manchestercity': 'MCI', + 'manchesterunited': 'MUN', + 'newcastleunited': 'NEW', + 'nottinghamforest': 'NFO', + 'southampton': 'SOU', + 'tottenhamhotspur': 'TOT', + 'westhamunited': 'WHU', + 'wolverhamptonwanderers': 'WOL', + } + + def _canon_team_key(name: str) -> str: + s = str(name or '') + s = s.lower().replace('&', 'and') + # keep letters and spaces only + import re as _re + s = ''.join(ch if ch.isalpha() or ch.isspace() else ' ' for ch in s) + # collapse whitespace + s = ' '.join(s.split()) + # remove standalone fc/afc tokens + tokens = [t for t in s.split(' ') if t not in ('fc', 'afc')] + return ''.join(tokens) + + def _abbr_team(name: str) -> str: + key = _canon_team_key(name) + if key in PL_ABBR: + return PL_ABBR[key] + # Fallback: build a 3-letter code from initials or first letters + import re as _re + toks = _re.findall(r"[A-Za-z]+", str(name or '')) + toks = [t for t in toks if t.lower() not in ('fc', 'afc')] + if toks: + initials = ''.join(t[0] for t in toks).upper() + if len(initials) >= 3: + return initials[:3] + joined = ''.join(toks).upper() + return (joined + 'XXX')[:3] + return str(name or '')[:3].upper() + cols = [c for c in ['match_day','homeTeam','awayTeam','homeScore','awayScore'] if c in fix.columns] + lab_df = fix[cols].dropna(subset=['match_day']).copy() + def _mk_label(row): + # Only team 
abbreviations, no scores + ht = _abbr_team(row.get('homeTeam', '')) + at = _abbr_team(row.get('awayTeam', '')) + # Use a short separator to keep labels compact + return f"{ht}–{at}" + lab_df['label'] = lab_df.apply(_mk_label, axis=1) + fixtures_by_day = lab_df.groupby('match_day')['label'].apply(list).to_dict() + except Exception: + fixtures_by_day = None + if matchdays is not None: + if 'date' in df.columns: + df['post_day'] = pd.to_datetime(df['date'], errors='coerce').dt.date + df = df.merge(matchdays, how='left', left_on='post_day', right_on='match_day') + df['is_matchday'] = df['match_day'].notna() + df = df.drop(columns=['match_day', 'post_day']) + if replies_df is not None and 'date' in replies_df.columns: + replies_df['reply_day'] = pd.to_datetime(replies_df['date'], errors='coerce').dt.date + replies_df = replies_df.merge(matchdays, how='left', left_on='reply_day', right_on='match_day') + replies_df['is_matchday'] = replies_df['match_day'].notna() + replies_df = replies_df.drop(columns=['match_day', 'reply_day']) + # Also derive parent-based matchday classification for replies if possible + if 'parent_id' in replies_df.columns and 'id' in df.columns and 'is_matchday' in df.columns: + parent_map = df[['id', 'is_matchday']].rename(columns={'id': 'parent_id', 'is_matchday': 'parent_is_matchday'}) + replies_df = replies_df.merge(parent_map, how='left', on='parent_id') + # Diagnostics + try: + posts_md = int(df['is_matchday'].sum()) if 'is_matchday' in df.columns else 0 + replies_md = int(replies_df['is_matchday'].sum()) if (replies_df is not None and 'is_matchday' in replies_df.columns) else 0 + parent_md = int(replies_df['parent_is_matchday'].sum()) if (replies_df is not None and 'parent_is_matchday' in replies_df.columns) else 0 + print(f"[fixtures] Matchday join: posts matchday rows={posts_md}; replies by reply-date matchday rows={replies_md}; replies by parent matchday rows={parent_md}") + except Exception: + pass + + # Per-parent reply tag rollup: 
replies_count_scraped and replies_top_tags + if replies_df is not None and 'parent_id' in replies_df.columns: + # Replies count per parent + rcount = replies_df.groupby('parent_id').agg(replies_count_scraped=('id', 'count')).reset_index() + if 'id' in df.columns: + df = df.merge(rcount, how='left', left_on='id', right_on='parent_id').drop(columns=['parent_id']) + # Top tags per parent (if tagged) + if 'tags' in replies_df.columns: + rtagged = replies_df.copy() + rtagged['tags'] = rtagged['tags'].apply(lambda x: x if isinstance(x, list) else ([] if pd.isna(x) else [x])) + rexpl = rtagged.explode('tags') + rexpl = rexpl[rexpl['tags'].notna() & (rexpl['tags'] != '')] + if not rexpl.empty: + tag_counts = rexpl.groupby(['parent_id', 'tags']).size().reset_index(name='count') + # Build top-3 tag string per parent + def top3(group: pd.DataFrame) -> str: + g = group.sort_values('count', ascending=False).head(3) + return '|'.join(f"{row['tags']}({int(row['count'])})" for _, row in g.iterrows()) + top_tags = tag_counts.groupby('parent_id').apply(top3).reset_index(name='replies_top_tags') + if 'id' in df.columns: + df = df.merge(top_tags, how='left', left_on='id', right_on='parent_id').drop(columns=['parent_id']) + + out = args.output + if out is None: + base, _ = os.path.splitext(args.csv) + out = base + '_report.md' + write_markdown_report(df, out_path=out, channel=args.channel, replies_df=replies_df) + print(f"Report written to {out}") + + if args.write_augmented_csv: + base, ext = os.path.splitext(args.csv) + aug = base + '_tagged.csv' + # Serialize tags list to a semicolon-separated string for CSV + if 'tags' in df.columns: + df_out = df.copy() + df_out['tags'] = df_out['tags'].apply(lambda xs: ';'.join(xs) if isinstance(xs, list) else '') + else: + df_out = df + df_out.to_csv(aug, index=False) + print(f"Augmented CSV written to {aug}") + + # Also write a tagged replies CSV if provided + if replies_df is not None: + rbase, rext = os.path.splitext(args.replies_csv) + raug 
= rbase + '_tagged.csv' + r_out = replies_df.copy() + if 'tags' in r_out.columns: + r_out['tags'] = r_out['tags'].apply(lambda xs: ';'.join(xs) if isinstance(xs, list) else '') + r_out.to_csv(raug, index=False) + print(f"Replies augmented CSV written to {raug}") + + # Optional: write combined posts+replies CSV + if args.write_combined_csv and replies_df is not None: + # Normalize posts columns + p = df.copy() + p['content_type'] = 'post' + # Ensure shared sentiment/tags columns exist + if 'sentiment_compound' not in p.columns and 'message' in p.columns: + analyzer = SentimentIntensityAnalyzer() + p['sentiment_compound'] = p['message'].apply(lambda t: analyzer.polarity_scores(t or '').get('compound')) + # Harmonize tag serialization to list before final serialization + if 'tags' in p.columns: + p_tags = p['tags'] + else: + p['tags'] = [[] for _ in range(len(p))] + + # Normalize replies columns + r = replies_df.copy() + r['content_type'] = 'reply' + # For replies, the post id is parent_id; ensure a common column 'parent_id' exists + if 'parent_id' not in r.columns and 'id' in r.columns: + r['parent_id'] = None + + # Select a union of reasonable columns + sel_cols = [] + for c in ['content_type', 'id', 'parent_id', 'date', 'message', 'sender_id', 'views', 'forwards', 'replies', 'sentiment_compound', 'sentiment_label', 'sentiment_probs', 'url', 'tags', 'is_matchday', 'parent_is_matchday']: + if c in p.columns or c in r.columns: + sel_cols.append(c) + p_sel = p.reindex(columns=sel_cols) + r_sel = r.reindex(columns=sel_cols) + + combined_df = pd.concat([p_sel, r_sel], ignore_index=True) + # Serialize tags for CSV + if 'tags' in combined_df.columns: + combined_df['tags'] = combined_df['tags'].apply(lambda xs: ';'.join(xs) if isinstance(xs, list) else ('' if pd.isna(xs) else str(xs))) + + base, _ = os.path.splitext(args.csv) + comb_path = base + '_combined.csv' + combined_df.to_csv(comb_path, index=False) + print(f"Combined posts+replies CSV written to {comb_path}") + + # 
Optional: save plots + if args.save_plots: + try: + import matplotlib.pyplot as plt + import seaborn as sns + except Exception as e: + print(f"[plots] Skipping plots; matplotlib/seaborn not available: {e}") + else: + out_dir = os.path.dirname(out) or "." + + # Removed: Daily average sentiment (combined posts + replies) + + # 2) Posts heatmap by day-of-week and hour + try: + if 'date' in df.columns and not df.empty: + t = df.dropna(subset=['date']).copy() + if not t.empty: + t['date'] = pd.to_datetime(t['date'], errors='coerce') + t = t.dropna(subset=['date']) + if not t.empty: + t['dow'] = t['date'].dt.day_name() + t['hour'] = t['date'].dt.hour + pivot = t.pivot_table(index='dow', columns='hour', values='id', aggfunc='count').fillna(0) + order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"] + pivot = pivot.reindex(order) + plt.figure(figsize=(10,5)) + sns.heatmap(pivot, cmap='Blues') + plt.title('Posts heatmap by day-of-week and hour') + plt.xlabel('Hour'); plt.ylabel('Day of week') + plt.tight_layout() + plt.savefig(os.path.join(out_dir, 'posts_heatmap_hour_dow.png'), dpi=150) + plt.close() + print(f"[plots] Saved {os.path.join(out_dir, 'posts_heatmap_hour_dow.png')}") + except Exception as e: + print(f"[plots] Failed posts heatmap: {e}") + + # 3) Sentiment shares by tag (posts) stacked bars + try: + if 'tags' in df.columns and ('sentiment_compound' in df.columns or 'sentiment_label' in df.columns): + p = df.copy() + p['tags'] = p['tags'].apply(lambda s: s if isinstance(s, list) else ([] if pd.isna(s) else [s])) + e = p.explode('tags') + # Keep rows with a tag and either a sentiment label or compound value + e = e[(e['tags'].notna()) & (e['tags']!='')] + if 'sentiment_label' in e.columns: + e = e[e['sentiment_label'].notna()] + else: + e = e[e['sentiment_compound'].notna()] + # Filter to team tags only (those starting with 'club_') + e = e[e['tags'].astype(str).str.startswith('club_')] + if not e.empty: + if 'sentiment_label' in 
e.columns: + # Use model-predicted labels when available + lab = e['sentiment_label'].astype(str).str.lower() + e['pos'] = lab.str.contains('pos|positive').astype(int) + e['neg'] = lab.str.contains('neg|negative').astype(int) + e['neu'] = (~(e['pos'].astype(bool) | e['neg'].astype(bool))).astype(int) + else: + # Fallback to compound thresholds + e['pos'] = (e['sentiment_compound'] > 0.05).astype(int) + e['neg'] = (e['sentiment_compound'] < -0.05).astype(int) + e['neu'] = 1 - e['pos'] - e['neg'] + # Group by team tag and compute average shares, include all teams (no top-N cap) + g = e.groupby('tags')[['pos','neu','neg']].mean().sort_values('pos', ascending=False) + # Dynamic width based on number of teams; reuse plot flags + n_teams = len(g.index) + fig_w = max(16, min(float(args.plot_max_width), float(args.plot_width_scale) * n_teams)) + try: + print(f"[plots] sentiment_by_tag_posts: teams={n_teams}, width_in={fig_w:.2f}, scale={float(args.plot_width_scale):.2f}, max={float(args.plot_max_width):.2f}") + except Exception: + pass + fig, ax = plt.subplots(figsize=(fig_w, float(args.plot_height))) + g[['pos','neu','neg']].plot(kind='bar', stacked=True, color=['#2ca02c','#aaaaaa','#d62728'], ax=ax) + ax.set_title('Sentiment shares by team (posts)') + ax.set_ylabel('Share') + # Improve label readability for many teams + for label in ax.get_xticklabels(): + label.set_rotation(45) + label.set_ha('right') + plt.tight_layout() + plt.savefig(os.path.join(out_dir, 'sentiment_by_tag_posts.png'), dpi=150) + plt.close() + print(f"[plots] Saved {os.path.join(out_dir, 'sentiment_by_tag_posts.png')}") + except Exception as e: + print(f"[plots] Failed sentiment-by-tag plot: {e}") + + # Removed: Replies daily average sentiment plot + + # 5) Combined activity: stacked counts by content_type per day + try: + if 'date' in df.columns: + posts_activity = df[['id','date']].dropna().copy() + posts_activity['date'] = pd.to_datetime(posts_activity['date'], errors='coerce') + posts_activity = 
posts_activity.dropna(subset=['date']) + posts_activity['day'] = posts_activity['date'].dt.date + posts_activity['content_type'] = 'post' + combined_act = posts_activity + if replies_df is not None and 'date' in replies_df.columns: + replies_activity = replies_df[['id','date']].dropna().copy() + replies_activity['date'] = pd.to_datetime(replies_activity['date'], errors='coerce') + replies_activity = replies_activity.dropna(subset=['date']) + replies_activity['day'] = replies_activity['date'].dt.date + replies_activity['content_type'] = 'reply' + combined_act = pd.concat([posts_activity, replies_activity], ignore_index=True) + if not combined_act.empty: + pv = combined_act.pivot_table(index='day', columns='content_type', values='id', aggfunc='count').fillna(0) + totals = pv.sum(axis=1) + num_days = len(pv.index) + # Determine top-N days to highlight (0 disables) + req_top_n = int(args.activity_top_n) if hasattr(args, 'activity_top_n') else 5 + top_n = max(0, min(num_days, req_top_n)) + top_days = list(totals.nlargest(top_n).index) if top_n > 0 else [] + # Improve readability for long ranges: scale width and thin x-ticks + # Reuse num_days defined above + # Make the figure wider for better x-axis readability using CLI-tunable params. + # Dynamic width scaled by the number of days, clamped to [16, plot_max_width]. 
+ fig_w = max(16, min(float(args.plot_max_width), float(args.plot_width_scale) * num_days)) + # Debug print to help users verify width computation + try: + print(f"[plots] daily_activity_stacked: days={num_days}, width_in={fig_w:.2f}, scale={float(args.plot_width_scale):.2f}, max={float(args.plot_max_width):.2f}") + except Exception: + pass + fig, ax = plt.subplots(figsize=(fig_w, float(args.plot_height))) + pv.plot(kind='bar', stacked=True, color={'post':'#9467bd','reply':'#8c564b'}, ax=ax) + ax.set_title('Daily activity (posts vs replies)') + ax.set_xlabel('Day'); ax.set_ylabel('Count') + labels_in_band = False + show_pos = None + show_pos_set = set() + # Thin tick labels to ~12 evenly spaced labels for large ranges + try: + import numpy as _np + # Base tick positions (0..num_days-1) and labels + base_idx = list(range(num_days)) + # Positions of top days + highlight_pos = [pv.index.get_loc(d) for d in top_days] + if num_days > 20: + desired = 12 + step = max(1, int(_np.ceil(num_days / desired))) + show_pos = list(range(0, num_days, step)) + # Ensure highlight positions are included + show_pos = sorted(set(show_pos + highlight_pos)) + ax.set_xticks(show_pos) + labels_all = [f"{d} ({d.strftime('%a')})" if hasattr(d, 'strftime') else str(d) for d in pv.index] + show_labels = [labels_all[i] for i in show_pos] + ax.set_xticklabels(show_labels, rotation=45, ha='right') + show_pos_set = set(show_pos) + else: + # Set all labels with day names + labels = [f"{d} ({d.strftime('%a')})" if hasattr(d, 'strftime') else str(d) for d in pv.index] + ax.set_xticks(base_idx) + ax.set_xticklabels(labels) + for label in ax.get_xticklabels(): + label.set_rotation(45) + label.set_ha('right') + show_pos = base_idx + show_pos_set = set(base_idx) + # Color highlighted tick labels and annotate totals + # After setting ticks/labels, get back the positions we set + current_ticks = ax.get_xticks() + tick_to_pos = {i: i for i in current_ticks} + # Map current tick order to positions for styling 
+ for tick_label, xpos in zip(ax.get_xticklabels(), current_ticks): + pos_int = int(round(xpos)) + if pos_int in highlight_pos: + tick_label.set_color('crimson') + tick_label.set_fontweight('bold') + # Annotate total above the stacked bar + y = float(totals.iloc[pos_int]) + # Compute breakdown for this day + try: + p_val = float(pv.iloc[pos_int]['post']) if 'post' in pv.columns else 0.0 + except Exception: + p_val = 0.0 + try: + r_val = float(pv.iloc[pos_int]['reply']) if 'reply' in pv.columns else 0.0 + except Exception: + r_val = 0.0 + lbl = f"{int(y)} ({int(p_val)}+{int(r_val)})" + ax.text(pos_int, y, lbl, color='crimson', fontsize=8, fontweight='bold', ha='center', va='bottom') + except Exception: + pass + # If fixtures are available, annotate games per day above bars + try: + if fixtures_by_day is not None and len(fixtures_by_day) > 0: + # Reserve a fixed band above the bars for match labels (axes coordinates) + from matplotlib import transforms as _mtrans + # Diagnostics: see how many pivot days have fixtures + try: + keys = list(fixtures_by_day.keys()) + matched_days = sum(1 for d in pv.index if d in fixtures_by_day) + print(f"[plots] fixtures days={len(keys)}; pivot days={len(pv.index)}; matched days={matched_days}") + except Exception: + pass + annotated_days = 0 + # Fixed band just above the axes (y in axes coords) + trans_xdata_yaxes = ax.get_xaxis_transform() + y_band = float(getattr(args, 'labels_band_y', 0.96)) + rows = max(1, int(getattr(args, 'labels_stagger_rows', 2))) + rows = min(rows, 4) + offset_step = 0.055 # vertical offset between stagger rows (in axes coords) + # Write a small debug CSV of expected labels + try: + dbg_path = os.path.join(out_dir, 'match_labels_debug.csv') + _rows = [] + for d, labs in fixtures_by_day.items(): + _rows.append({'day': str(d), 'labels': ' | '.join(str(x) for x in labs)}) + pd.DataFrame(_rows).to_csv(dbg_path, index=False) + print(f"[plots] wrote {dbg_path} with {len(_rows)} days") + except Exception: + pass + # 
Determine which positions to annotate based on mode + mode = getattr(args, 'labels_annotate_mode', 'ticks+top') + pos_all = set(range(num_days)) + pos_ticks = set(show_pos or []) + pos_top = set(highlight_pos) + if mode == 'all': + annotate_positions = pos_all + elif mode == 'ticks': + annotate_positions = pos_ticks + else: # ticks+top (default) + annotate_positions = pos_ticks | pos_top + + max_per_day = max(1, int(getattr(args, 'labels_max_per_day', 3))) + per_line = max(1, int(getattr(args, 'labels_per_line', 2))) + + def _chunk(xs, n): + return [xs[i:i+n] for i in range(0, len(xs), n)] + + for i, day in enumerate(pv.index): + if i not in annotate_positions: + continue + labels = fixtures_by_day.get(day) + if not labels: + continue + labs_all = [str(x) for x in labels] + if len(labs_all) > max_per_day: + extra = len(labs_all) - max_per_day + labs = labs_all[:max_per_day] + [f"+{extra} more"] + else: + labs = labs_all + # Build multi-line text: per_line entries per row + lines = [' • '.join(chunk) for chunk in _chunk(labs, per_line)] + text = '\n'.join(lines) + # Stagger vertically by index to reduce neighbor collisions + row_id = i % rows + # Stagger downward inside the axes, away from the title + y = y_band - (row_id * offset_step) + # Keep within the axes area + y = max(0.02, min(0.98, y)) + # Center above the bar; small bbox for readability + ax.text(i, y, text, + fontsize=7, ha='center', va='bottom', rotation=0, + clip_on=False, zorder=5, color='forestgreen', transform=trans_xdata_yaxes, + bbox=dict(facecolor='white', edgecolor='none', alpha=0.6, pad=1.5)) + annotated_days += 1 + if annotated_days > 0: + # Leave extra headroom above axes for the label band + try: + labels_in_band = True + # Add y-margin so tallest bars don't collide with labels + base_margin = 0.10 + extra = (rows - 1) * 0.03 + ax.margins(y=min(0.30, base_margin + extra)) + print(f"[plots] match labels annotated (inside band): days={annotated_days}; mode={mode}; max/day={max_per_day}; 
per_line={per_line}; rows={rows}; y_band={y_band:.2f}") + except Exception: + pass + except Exception as e: + print(f"[plots] match labels annotation skipped: {e}") + # First tighten layout, then reserve top margin if label band is used + plt.tight_layout() + try: + # If labels are placed inside (y_band < 1), no need to push the title + pass + except Exception: + pass + plt.savefig(os.path.join(out_dir, 'daily_activity_stacked.png'), dpi=150) + plt.close() + print(f"[plots] Saved {os.path.join(out_dir, 'daily_activity_stacked.png')}") + except Exception as e: + print(f"[plots] Failed daily activity stacked: {e}") + + # 5b) Daily volume (posts+replies) with positive/negative sentiment shares (twin y-axes) + try: + if 'date' in df.columns: + # Build per-day combined data with sentiment flags + parts = [] + # Posts + p = df[['id','date']].copy() + p['date'] = pd.to_datetime(p['date'], errors='coerce') + p = p.dropna(subset=['date']) + if not p.empty: + if 'sentiment_label' in df.columns and df['sentiment_label'].notna().any(): + lab = df.loc[p.index, 'sentiment_label'].astype(str).str.lower() + p['is_pos'] = lab.str.contains('pos|positive', regex=True, na=False) + p['is_neg'] = lab.str.contains('neg|negative', regex=True, na=False) + else: + # Fallback to compound thresholds + if 'sentiment_compound' in df.columns: + sc = pd.to_numeric(df.loc[p.index, 'sentiment_compound'], errors='coerce') + p['is_pos'] = sc > 0.05 + p['is_neg'] = sc < -0.05 + else: + p['is_pos'] = False + p['is_neg'] = False + p['day'] = p['date'].dt.date + parts.append(p[['day','is_pos','is_neg']]) + # Replies + if replies_df is not None and 'date' in replies_df.columns: + r = replies_df[['id','date']].copy() + r['date'] = pd.to_datetime(r['date'], errors='coerce') + r = r.dropna(subset=['date']) + if not r.empty: + if 'sentiment_label' in replies_df.columns and replies_df['sentiment_label'].notna().any(): + lab = replies_df.loc[r.index, 'sentiment_label'].astype(str).str.lower() + r['is_pos'] = 
lab.str.contains('pos|positive', regex=True, na=False) + r['is_neg'] = lab.str.contains('neg|negative', regex=True, na=False) + else: + if 'sentiment_compound' in replies_df.columns: + sc = pd.to_numeric(replies_df.loc[r.index, 'sentiment_compound'], errors='coerce') + r['is_pos'] = sc > 0.05 + r['is_neg'] = sc < -0.05 + else: + r['is_pos'] = False + r['is_neg'] = False + r['day'] = r['date'].dt.date + parts.append(r[['day','is_pos','is_neg']]) + if parts: + all_rows = pd.concat(parts, ignore_index=True) + grp = ( + all_rows.groupby('day') + .agg( + volume_total=('is_pos','count'), + pos_share=('is_pos','mean'), + neg_share=('is_neg','mean') + ) + .sort_index() + ) + if not grp.empty: + num_days = len(grp.index) + fig_w = max(16, min(float(args.plot_max_width), float(args.plot_width_scale) * num_days)) + import matplotlib.pyplot as _plt + from matplotlib.ticker import PercentFormatter as _PercentFormatter + try: + print(f"[plots] daily_volume_and_sentiment: days={num_days}, width_in={fig_w:.2f}") + except Exception: + pass + fig, ax1 = _plt.subplots(figsize=(fig_w, float(args.plot_height))) + x = range(num_days) + # Bars: total volume (posts+replies) + ax1.bar(x, grp['volume_total'], color='#6baed6', alpha=0.8, label='Volume (posts+replies)') + ax1.set_xlabel('Day') + ax1.set_ylabel('Volume', color='#335') + ax1.tick_params(axis='y', labelcolor='#335') + # Format x-ticks with dates + xticklabels = [f"{d} ({d.strftime('%a')})" if hasattr(d, 'strftime') else str(d) for d in grp.index] + ax1.set_xticks(list(x)) + ax1.set_xticklabels(xticklabels, rotation=45, ha='right') + # Lines: positive and negative sentiment shares + ax2 = ax1.twinx() + ax2.plot(x, grp['pos_share'].fillna(0), color='#2ca02c', marker='o', linewidth=1.5, label='Positive %') + ax2.plot(x, grp['neg_share'].fillna(0), color='#d62728', marker='o', linewidth=1.5, label='Negative %') + ax2.set_ylim(0, 1) + ax2.yaxis.set_major_formatter(_PercentFormatter(xmax=1.0)) + ax2.set_ylabel('Sentiment share', 
color='#333') + ax2.tick_params(axis='y', labelcolor='#333') + # Build a combined legend + lines1, labels1 = ax1.get_legend_handles_labels() + lines2, labels2 = ax2.get_legend_handles_labels() + ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper left') + _plt.title('Daily volume vs positive/negative sentiment') + _plt.tight_layout() + outp = os.path.join(out_dir, 'daily_volume_and_sentiment.png') + _plt.savefig(outp, dpi=150) + _plt.close() + print(f"[plots] Saved {outp}") + except Exception as e: + print(f"[plots] Failed daily volume and sentiment plot: {e}") + + # 6) Tag co-occurrence heatmap (posts, top 15 tags) + try: + if 'tags' in df.columns: + # Prepare list of tag lists per message + tags_series = df['tags'].apply(lambda s: s if isinstance(s, list) else ([] if pd.isna(s) else [s])) + # Frequency of tags + from collections import Counter + freq = Counter() + for ts in tags_series: + freq.update(set([t for t in ts if t])) + # Removed: tag co-occurrence heatmap + except Exception as e: + print(f"[plots] Failed tag co-occurrence heatmap: {e}") + + # Removed: matchday boxplots (posts and replies) + + # 7) Overall matchday sentiment (posts and replies) + try: + if fixtures_present and 'date' in df.columns: + # Prepare posts per-day sentiment + pd_posts = df.copy() + pd_posts['date'] = pd.to_datetime(pd_posts['date'], errors='coerce') + pd_posts = pd_posts.dropna(subset=['date']) + if not pd_posts.empty and 'sentiment_compound' in pd_posts.columns: + pd_posts['day'] = pd_posts['date'].dt.date + g_posts = pd_posts.groupby('day').agg( + posts_n=('id','count'), + posts_mean=('sentiment_compound','mean') + ) + # Optional label-based shares + if 'sentiment_label' in pd_posts.columns: + lab = pd_posts[['day','sentiment_label']].dropna() + lab_s = lab['sentiment_label'].astype(str).str.lower() + lab['pos'] = lab_s.str.contains('pos|positive') + lab['neg'] = lab_s.str.contains('neg|negative') + s_posts = lab.groupby('day').agg(posts_pos_share=('pos','mean'), 
posts_neg_share=('neg','mean')) + g_posts = g_posts.join(s_posts, how='left') + else: + g_posts = None + + # Prepare replies per-day sentiment if available + g_replies = None + if replies_df is not None and 'date' in replies_df.columns and 'sentiment_compound' in replies_df.columns: + pd_rep = replies_df.copy() + pd_rep['date'] = pd.to_datetime(pd_rep['date'], errors='coerce') + pd_rep = pd_rep.dropna(subset=['date']) + if not pd_rep.empty: + pd_rep['day'] = pd_rep['date'].dt.date + g_replies = pd_rep.groupby('day').agg( + replies_n=('id','count'), + replies_mean=('sentiment_compound','mean') + ) + if 'sentiment_label' in pd_rep.columns: + lab = pd_rep[['day','sentiment_label']].dropna() + lab_s = lab['sentiment_label'].astype(str).str.lower() + lab['pos'] = lab_s.str.contains('pos|positive') + lab['neg'] = lab_s.str.contains('neg|negative') + s_rep = lab.groupby('day').agg(replies_pos_share=('pos','mean'), replies_neg_share=('neg','mean')) + g_replies = g_replies.join(s_rep, how='left') + + # Build fixtures day index + fix_days = None + try: + # re-use 'fix' if available; else build from fixtures_by_day keys + if 'fix' in locals() and isinstance(fix, pd.DataFrame) and 'utcDate' in fix.columns: + ftmp = fix.copy() + ftmp['utcDate'] = pd.to_datetime(ftmp['utcDate'], errors='coerce') + fix_days = ftmp.dropna(subset=['utcDate'])['utcDate'].dt.date.drop_duplicates().sort_values() + elif fixtures_by_day is not None: + fix_days = pd.Series(sorted(list(fixtures_by_day.keys()))) + except Exception: + pass + + if fix_days is not None: + # Join per-day aggregates on fixture days only + idx = pd.Index(fix_days, name='day') + agg = pd.DataFrame(index=idx) + if g_posts is not None: + agg = agg.join(g_posts, how='left') + if g_replies is not None: + agg = agg.join(g_replies, how='left') + out_csv = os.path.join(out_dir, 'matchday_sentiment_overall.csv') + agg.reset_index().to_csv(out_csv, index=False) + print(f"[plots] Wrote {out_csv}") + + # Plot time series of mean compound 
import argparse
import os

import pandas as pd


def read_csv(path: str) -> pd.DataFrame:
    """Read *path* into a DataFrame, exiting with a clear message if absent."""
    if not os.path.exists(path):
        raise SystemExit(f"CSV not found: {path}")
    return pd.read_csv(path)


def main():
    """Merge a labeled-sentiment CSV onto posts and replies CSVs by message id.

    Writes ``<input>_with_labels.csv`` files (or the explicit --posts-out /
    --replies-out paths) carrying a ``sentiment_label`` column, plus
    ``confidence`` when present in the labeled CSV.
    """
    p = argparse.ArgumentParser(description='Apply labeled sentiments to posts/replies CSVs for analysis plots.')
    p.add_argument('--labeled-csv', required=True, help='Path to labeled_sentiment.csv (must include id and label columns)')
    p.add_argument('--posts-csv', required=True, help='Original posts CSV')
    p.add_argument('--replies-csv', required=True, help='Original replies CSV')
    p.add_argument('--posts-out', default=None, help='Output posts CSV path (default: with _with_labels suffix)')
    p.add_argument('--replies-out', default=None, help='Output replies CSV path (default: with _with_labels suffix)')
    args = p.parse_args()

    labeled = read_csv(args.labeled_csv)
    if 'id' not in labeled.columns:
        raise SystemExit('labeled CSV must include an id column to merge on')
    # Normalize the label column name to sentiment_label.
    lab_col = 'label' if 'label' in labeled.columns else ('sentiment_label' if 'sentiment_label' in labeled.columns else None)
    if lab_col is None:
        raise SystemExit("labeled CSV must include a 'label' or 'sentiment_label' column")
    labeled = labeled[['id', lab_col] + (['confidence'] if 'confidence' in labeled.columns else [])].copy()
    labeled = labeled.rename(columns={lab_col: 'sentiment_label'})
    # A labeled CSV can accidentally contain the same id twice (e.g. several
    # labeling runs appended to one file); keep the first occurrence so the
    # validate='m:1' merges below cannot raise pandas.errors.MergeError.
    labeled = labeled.drop_duplicates(subset=['id'], keep='first')

    posts = read_csv(args.posts_csv)
    replies = read_csv(args.replies_csv)

    if 'id' not in posts.columns or 'id' not in replies.columns:
        raise SystemExit('posts/replies CSVs must include id columns')

    posts_out = args.posts_out or os.path.splitext(args.posts_csv)[0] + '_with_labels.csv'
    replies_out = args.replies_out or os.path.splitext(args.replies_csv)[0] + '_with_labels.csv'

    # validate='m:1' asserts ids are unique on the labeled side (true after dedup).
    posts_merged = posts.merge(labeled, how='left', on='id', validate='m:1')
    replies_merged = replies.merge(labeled, how='left', on='id', validate='m:1')

    posts_merged.to_csv(posts_out, index=False)
    replies_merged.to_csv(replies_out, index=False)
    print(f"Wrote posts with labels -> {posts_out} (rows={len(posts_merged)})")
    print(f"Wrote replies with labels -> {replies_out} (rows={len(replies_merged)})")


if __name__ == '__main__':
    main()
import argparse
import os

import pandas as pd


def parse_tags_column(series: pd.Series) -> pd.Series:
    """Parse the augmented-CSV ``tags`` column into Python lists.

    Accepts values that are already lists, NaN/None (-> []), and delimited
    strings — semicolon-delimited as written by analyze, comma tolerated.
    """
    def _to_list(x):
        if isinstance(x, list):
            return x
        if pd.isna(x):
            return []
        s = str(x)
        # Expect semicolon-delimited from augmented CSV, but also accept comma
        if ';' in s:
            return [t.strip() for t in s.split(';') if t.strip()]
        if ',' in s:
            return [t.strip() for t in s.split(',') if t.strip()]
        return [s] if s else []
    return series.apply(_to_list)


def main():
    """Audit per-team (club_*) sentiment shares and export sample rows for inspection."""
    parser = argparse.ArgumentParser(description='Audit sentiment per team tag and export samples for inspection.')
    parser.add_argument('--csv', default='data/premier_league_update_tagged.csv', help='Tagged posts CSV (augmented by analyze)')
    parser.add_argument('--team', default='club_manchester_united', help='Team tag to export samples for (e.g., club_manchester_united)')
    parser.add_argument('--out-dir', default='data', help='Directory to write audit outputs')
    parser.add_argument('--samples', type=int, default=25, help='Number of samples to export for the specified team')
    parser.add_argument('--with-vader', action='store_true', help='Also compute VADER-based sentiment shares as a sanity check')
    args = parser.parse_args()

    if not os.path.exists(args.csv):
        raise SystemExit(f"CSV not found: {args.csv}. Run analyze with --write-augmented-csv first.")

    df = pd.read_csv(args.csv)
    if 'message' not in df.columns:
        raise SystemExit('CSV missing message column')
    if 'sentiment_compound' not in df.columns:
        raise SystemExit('CSV missing sentiment_compound column')
    if 'tags' not in df.columns:
        raise SystemExit('CSV missing tags column')

    df = df.copy()
    df['tags'] = parse_tags_column(df['tags'])
    # Filter to team tags (prefix club_)
    e = df.explode('tags')
    e = e[e['tags'].notna() & (e['tags'] != '')]
    e = e[e['tags'].astype(str).str.startswith('club_')]
    # A re-exported CSV can carry sentiment_compound as strings, which would
    # make the >/< threshold comparisons below raise; coerce to numeric first
    # (non-parsable values become NaN and are dropped).
    e = e.assign(sentiment_compound=pd.to_numeric(e['sentiment_compound'], errors='coerce'))
    e = e.dropna(subset=['sentiment_compound'])
    if e.empty:
        print('No team-tagged rows found.')
        return

    # Shares
    e = e.copy()
    e['is_pos'] = e['sentiment_compound'] > 0.05
    e['is_neg'] = e['sentiment_compound'] < -0.05
    grp = (
        e.groupby('tags')
        .agg(
            n=('sentiment_compound', 'count'),
            mean=('sentiment_compound', 'mean'),
            median=('sentiment_compound', 'median'),
            pos_share=('is_pos', 'mean'),
            neg_share=('is_neg', 'mean'),
        )
        .reset_index()
    )
    grp['neu_share'] = (1 - grp['pos_share'] - grp['neg_share']).clip(lower=0)
    grp = grp.sort_values(['n', 'mean'], ascending=[False, False])

    if args.with_vader:
        # Lazy third-party import: keeps this module importable (e.g. for
        # parse_tags_column) without vaderSentiment installed.
        from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
        # Compute VADER shares on the underlying messages per team
        analyzer = SentimentIntensityAnalyzer()

        def _vader_sentiment_share(sub: pd.DataFrame):
            # Per-team pos/neg/neu shares recomputed directly with VADER.
            if sub.empty:
                return pd.Series({'pos_share_vader': 0.0, 'neg_share_vader': 0.0, 'neu_share_vader': 0.0})
            scores = sub['message'].astype(str).apply(lambda t: analyzer.polarity_scores(t or '')['compound'])
            pos = (scores > 0.05).mean()
            neg = (scores < -0.05).mean()
            neu = max(0.0, 1.0 - pos - neg)
            return pd.Series({'pos_share_vader': pos, 'neg_share_vader': neg, 'neu_share_vader': neu})

        vader_grp = e.groupby('tags').apply(_vader_sentiment_share).reset_index()
        grp = grp.merge(vader_grp, on='tags', how='left')

    os.makedirs(args.out_dir, exist_ok=True)
    out_summary = os.path.join(args.out_dir, 'team_sentiment_audit.csv')
    grp.to_csv(out_summary, index=False)
    print(f"Wrote summary: {out_summary}")

    # Export samples for selected team
    te = e[e['tags'] == args.team].copy()
    if te.empty:
        print(f"No rows for team tag: {args.team}")
        return
    # Sort by sentiment descending to inspect highly positive claims
    te = te.sort_values('sentiment_compound', ascending=False)
    cols = [c for c in ['id', 'date', 'message', 'sentiment_compound', 'url'] if c in te.columns]
    samples_path = os.path.join(args.out_dir, f"{args.team}_samples.csv")
    te[cols].head(args.samples).to_csv(samples_path, index=False)
    print(f"Wrote samples: {samples_path} ({min(args.samples, len(te))} rows)")


if __name__ == '__main__':
    main()
'team_sentiment_audit.csv') + grp.to_csv(out_summary, index=False) + print(f"Wrote summary: {out_summary}") + + # Export samples for selected team + te = e[e['tags'] == args.team].copy() + if te.empty: + print(f"No rows for team tag: {args.team}") + return + # Sort by sentiment descending to inspect highly positive claims + te = te.sort_values('sentiment_compound', ascending=False) + cols = [c for c in ['id', 'date', 'message', 'sentiment_compound', 'url'] if c in te.columns] + samples_path = os.path.join(args.out_dir, f"{args.team}_samples.csv") + te[cols].head(args.samples).to_csv(samples_path, index=False) + print(f"Wrote samples: {samples_path} ({min(args.samples, len(te))} rows)") + + +if __name__ == '__main__': + main() diff --git a/src/auto_label_sentiment.py b/src/auto_label_sentiment.py new file mode 100644 index 0000000..a6fa5ef --- /dev/null +++ b/src/auto_label_sentiment.py @@ -0,0 +1,218 @@ +import argparse +import os +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer + +try: + # Allow both package and direct script execution + from .make_labeling_set import load_messages as _load_messages +except Exception: + from make_labeling_set import load_messages as _load_messages + + +def _combine_inputs(posts_csv: Optional[str], replies_csv: Optional[str], text_col: str = 'message', min_length: int = 3) -> pd.DataFrame: + frames: List[pd.DataFrame] = [] + if posts_csv: + frames.append(_load_messages(posts_csv, text_col=text_col)) + if replies_csv: + # include parent_id if present for replies + frames.append(_load_messages(replies_csv, text_col=text_col, extra_cols=['parent_id'])) + if not frames: + raise SystemExit('No input provided. 
Use --input-csv or --posts-csv/--replies-csv') + df = pd.concat(frames, ignore_index=True) + df['message'] = df['message'].fillna('').astype(str) + df = df[df['message'].str.len() >= min_length] + df = df.drop_duplicates(subset=['message']).reset_index(drop=True) + return df + + +def _map_label_str_to_int(labels: List[str]) -> List[int]: + mapping = {'neg': 0, 'negative': 0, 'neu': 1, 'neutral': 1, 'pos': 2, 'positive': 2} + out: List[int] = [] + for lab in labels: + lab_l = (lab or '').lower() + if lab_l in mapping: + out.append(mapping[lab_l]) + else: + # fallback: try to parse integer + try: + out.append(int(lab)) + except Exception: + out.append(1) # default to neutral + return out + + +def _vader_label(compound: float, pos_th: float, neg_th: float) -> str: + if compound >= pos_th: + return 'pos' + if compound <= neg_th: + return 'neg' + return 'neu' + + +def _auto_label_vader(texts: List[str], pos_th: float, neg_th: float, min_margin: float) -> Tuple[List[str], List[float]]: + analyzer = SentimentIntensityAnalyzer() + labels: List[str] = [] + confs: List[float] = [] + for t in texts: + s = analyzer.polarity_scores(t or '') + comp = float(s.get('compound', 0.0)) + lab = _vader_label(comp, pos_th, neg_th) + # Confidence heuristic: distance from neutral band edges + if lab == 'pos': + conf = max(0.0, comp - pos_th) + elif lab == 'neg': + conf = max(0.0, abs(comp - neg_th)) + else: + # closer to 0 is more neutral; confidence inversely related to |compound| + conf = max(0.0, (pos_th - abs(comp))) + labels.append(lab) + confs.append(conf) + # Normalize confidence roughly to [0,1] by clipping with a reasonable scale + confs = [min(1.0, c / max(1e-6, min_margin)) for c in confs] + return labels, confs + + +def _auto_label_transformers(texts: List[str], model_name_or_path: str, batch_size: int, min_prob: float, min_margin: float) -> Tuple[List[str], List[float]]: + try: + from .transformer_sentiment import TransformerSentiment + except Exception: + from 
transformer_sentiment import TransformerSentiment + + clf = TransformerSentiment(model_name_or_path) + probs_all, labels_all = clf.predict_probs_and_labels(texts, batch_size=batch_size) + confs: List[float] = [] + for row in probs_all: + row = np.array(row, dtype=float) + if row.size == 0: + confs.append(0.0) + continue + top2 = np.sort(row)[-2:] if row.size >= 2 else np.array([0.0, row.max()]) + max_p = float(row.max()) + margin = float(top2[-1] - top2[-2]) if row.size >= 2 else max_p + # Confidence must satisfy both conditions + conf = min(max(0.0, (max_p - min_prob) / max(1e-6, 1 - min_prob)), max(0.0, margin / max(1e-6, min_margin))) + confs.append(conf) + # Map arbitrary id2label names to canonical 'neg/neu/pos' when obvious; else keep as-is + canonical = [] + for lab in labels_all: + ll = (lab or '').lower() + if 'neg' in ll: + canonical.append('neg') + elif 'neu' in ll or 'neutral' in ll: + canonical.append('neu') + elif 'pos' in ll or 'positive' in ll: + canonical.append('pos') + else: + canonical.append(lab) + return canonical, confs + + +def main(): + parser = argparse.ArgumentParser(description='Automatically label sentiment without manual annotation.') + src = parser.add_mutually_exclusive_group(required=True) + src.add_argument('--input-csv', help='Single CSV containing a text column (default: message)') + src.add_argument('--posts-csv', help='Posts CSV to include') + parser.add_argument('--replies-csv', help='Replies CSV to include (combined with posts if provided)') + parser.add_argument('--text-col', default='message', help='Text column name in input CSV(s)') + parser.add_argument('-o', '--output', default='data/labeled_sentiment.csv', help='Output labeled CSV path') + parser.add_argument('--limit', type=int, default=None, help='Optional cap on number of rows') + parser.add_argument('--min-length', type=int, default=3, help='Minimum text length to consider') + + parser.add_argument('--backend', choices=['vader', 'transformers', 'gpt'], 
default='vader', help='Labeling backend: vader, transformers, or gpt (local via Ollama)') + # VADER knobs + parser.add_argument('--vader-pos', type=float, default=0.05, help='VADER positive threshold (compound >=)') + parser.add_argument('--vader-neg', type=float, default=-0.05, help='VADER negative threshold (compound <=)') + parser.add_argument('--vader-margin', type=float, default=0.2, help='Confidence scaling for VADER distance') + # Transformers knobs + parser.add_argument('--transformers-model', default='cardiffnlp/twitter-roberta-base-sentiment-latest', help='HF model for 3-class sentiment') + parser.add_argument('--batch-size', type=int, default=64) + parser.add_argument('--min-prob', type=float, default=0.6, help='Min top class probability to accept') + parser.add_argument('--min-margin', type=float, default=0.2, help='Min prob gap between top-1 and top-2 to accept') + + # GPT knobs + parser.add_argument('--gpt-model', default='llama3', help='Local GPT model name (Ollama)') + parser.add_argument('--gpt-base-url', default='http://localhost:11434', help='Base URL for local GPT server (Ollama)') + parser.add_argument('--gpt-batch-size', type=int, default=8) + + parser.add_argument('--label-format', choices=['str', 'int'], default='str', help="Output labels as strings ('neg/neu/pos') or integers (0/1/2)") + parser.add_argument('--only-confident', action='store_true', help='Drop rows that do not meet confidence thresholds') + + args = parser.parse_args() + + # Load inputs + if args.input_csv: + if not os.path.exists(args.input_csv): + raise SystemExit(f"Input CSV not found: {args.input_csv}") + df = pd.read_csv(args.input_csv) + if args.text_col not in df.columns: + raise SystemExit(f"Text column '{args.text_col}' not in {args.input_csv}") + df = df.copy() + df['message'] = df[args.text_col].astype(str) + base_cols = [c for c in ['id', 'date', 'message', 'url'] if c in df.columns] + df = df[base_cols if base_cols else ['message']] + df = 
df[df['message'].str.len() >= args.min_length] + df = df.drop_duplicates(subset=['message']).reset_index(drop=True) + else: + df = _combine_inputs(args.posts_csv, args.replies_csv, text_col=args.text_col, min_length=args.min_length) + + if args.limit and len(df) > args.limit: + df = df.head(args.limit) + + texts = df['message'].astype(str).tolist() + + # Predict labels + confidence + if args.backend == 'vader': + labels, conf = _auto_label_vader(texts, pos_th=args.vader_pos, neg_th=args.vader_neg, min_margin=args.vader_margin) + # For VADER, define acceptance: confident if outside neutral band by at least margin, or inside band with closeness to 0 below threshold + accept = [] + analyzer = SentimentIntensityAnalyzer() + for t in texts: + comp = analyzer.polarity_scores(t or '').get('compound') + if comp is None: + accept.append(False) + continue + comp = float(comp) + if comp >= args.vader_pos + args.vader_margin or comp <= args.vader_neg - args.vader_margin: + accept.append(True) + else: + # inside or near band -> consider less confident + accept.append(False) + elif args.backend == 'transformers': + labels, conf = _auto_label_transformers(texts, args.transformers_model, args.batch_size, args.min_prob, args.min_margin) + accept = [((c >= 1.0)) or ((c >= 0.5)) for c in conf] # normalize conf ~[0,1]; accept medium-high confidence + else: + # GPT backend via Ollama: expect label+confidence + try: + from .gpt_sentiment import GPTSentiment + except Exception: + from gpt_sentiment import GPTSentiment + clf = GPTSentiment(base_url=args.gpt_base_url, model=args.gpt_model) + labels, conf = clf.predict_label_conf_batch(texts, batch_size=args.gpt_batch_size) + # Accept medium-high confidence; simple threshold like transformers path + accept = [c >= 0.5 for c in conf] + + out = df.copy() + out.insert(1, 'label', labels) + out['confidence'] = conf + + if args.only_confident: + out = out[np.array(accept, dtype=bool)] + out = out.reset_index(drop=True) + + if args.label_format 
== 'int': + out['label'] = _map_label_str_to_int(out['label'].astype(str).tolist()) + + os.makedirs(os.path.dirname(args.output) or '.', exist_ok=True) + out.to_csv(args.output, index=False) + kept = len(out) + print(f"Wrote {kept} labeled rows to {args.output} using backend={args.backend}") + if args.only_confident: + print("Note: only confident predictions were kept. You can remove --only-confident to include all rows.") + + +if __name__ == '__main__': + main() diff --git a/src/eval_sentiment.py b/src/eval_sentiment.py new file mode 100644 index 0000000..d2589e1 --- /dev/null +++ b/src/eval_sentiment.py @@ -0,0 +1,48 @@ +import argparse +import numpy as np +import pandas as pd +from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report + +try: + from .transformer_sentiment import TransformerSentiment +except ImportError: + # Allow running as a script via -m src.eval_sentiment + from transformer_sentiment import TransformerSentiment + + +def main(): + parser = argparse.ArgumentParser(description='Evaluate a fine-tuned transformers sentiment model on a labeled CSV') + parser.add_argument('--csv', required=True, help='Labeled CSV path with message and label columns') + parser.add_argument('--text-col', default='message') + parser.add_argument('--label-col', default='label') + parser.add_argument('--model', required=True, help='Model name or local path') + parser.add_argument('--batch-size', type=int, default=64) + args = parser.parse_args() + + df = pd.read_csv(args.csv) + df = df[[args.text_col, args.label_col]].dropna().copy() + texts = df[args.text_col].astype(str).tolist() + true_labels = df[args.label_col].astype(str).tolist() + + clf = TransformerSentiment(args.model) + _, pred_labels = clf.predict_probs_and_labels(texts, batch_size=args.batch_size) + + y_true = np.array(true_labels) + y_pred = np.array(pred_labels) + + # If labels differ from model id2label names, normalize to strings for comparison + acc = 
accuracy_score(y_true, y_pred) + f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0) + prec_macro = precision_score(y_true, y_pred, average='macro', zero_division=0) + rec_macro = recall_score(y_true, y_pred, average='macro', zero_division=0) + + print('Accuracy:', f"{acc:.4f}") + print('F1 (macro):', f"{f1_macro:.4f}") + print('Precision (macro):', f"{prec_macro:.4f}") + print('Recall (macro):', f"{rec_macro:.4f}") + print('\nClassification report:') + print(classification_report(y_true, y_pred, zero_division=0)) + + +if __name__ == '__main__': + main() diff --git a/src/fetch_schedule.py b/src/fetch_schedule.py new file mode 100644 index 0000000..1ebe651 --- /dev/null +++ b/src/fetch_schedule.py @@ -0,0 +1,131 @@ +import argparse +import csv +import os +from datetime import datetime +from typing import Any, Dict, List, Optional + +import requests +from dotenv import load_dotenv + +API_BASE = "https://api.football-data.org/v4" +COMPETITION_CODE = "PL" # Premier League + + +def iso_date(d: str) -> str: + # Accept YYYY-MM-DD and return ISO date + try: + return datetime.fromisoformat(d).date().isoformat() + except Exception as e: + raise argparse.ArgumentTypeError(f"Invalid date: {d}. 
Use YYYY-MM-DD") from e + + +def fetch_matches(start_date: str, end_date: str, token: str) -> Dict[str, Any]: + url = f"{API_BASE}/competitions/{COMPETITION_CODE}/matches" + headers = {"X-Auth-Token": token} + params = { + "dateFrom": start_date, + "dateTo": end_date, + } + r = requests.get(url, headers=headers, params=params, timeout=30) + r.raise_for_status() + return r.json() + + +def normalize_match(m: Dict[str, Any]) -> Dict[str, Any]: + utc_date = m.get("utcDate") + # Convert to date/time strings + kick_iso = None + if utc_date: + try: + kick_iso = datetime.fromisoformat(utc_date.replace("Z", "+00:00")).isoformat() + except Exception: + kick_iso = utc_date + score = m.get("score", {}) + full_time = score.get("fullTime", {}) + + return { + "id": m.get("id"), + "status": m.get("status"), + "matchday": m.get("matchday"), + "utcDate": kick_iso, + "homeTeam": (m.get("homeTeam") or {}).get("name"), + "awayTeam": (m.get("awayTeam") or {}).get("name"), + "homeScore": full_time.get("home"), + "awayScore": full_time.get("away"), + "referees": ", ".join([r.get("name", "") for r in m.get("referees", []) if r.get("name")]), + "venue": m.get("area", {}).get("name"), + "competition": (m.get("competition") or {}).get("name"), + "stage": m.get("stage"), + "group": m.get("group"), + "link": m.get("id") and f"https://www.football-data.org/match/{m['id']}" or None, + } + + +def save_csv(matches: List[Dict[str, Any]], out_path: str) -> None: + if not matches: + # Write header only + fields = [ + "id", + "status", + "matchday", + "utcDate", + "homeTeam", + "awayTeam", + "homeScore", + "awayScore", + "referees", + "venue", + "competition", + "stage", + "group", + "link", + ] + with open(out_path, "w", newline="", encoding="utf-8") as f: + writer = csv.DictWriter(f, fieldnames=fields) + writer.writeheader() + return + fields = list(matches[0].keys()) + with open(out_path, "w", newline="", encoding="utf-8") as f: + writer = csv.DictWriter(f, fieldnames=fields) + writer.writeheader() 
def save_json(matches: List[Dict[str, Any]], out_path: str) -> None:
    """Serialize matches to pretty-printed UTF-8 JSON at *out_path*."""
    import json

    payload = json.dumps(matches, ensure_ascii=False, indent=2)
    with open(out_path, "w", encoding="utf-8") as fh:
        fh.write(payload)


def main():
    """CLI entry point: fetch fixtures for a date range and write CSV/JSON."""
    parser = argparse.ArgumentParser(
        description="Fetch Premier League fixtures in a date range and save to CSV/JSON"
    )
    parser.add_argument("--start-date", required=True, type=iso_date, help="YYYY-MM-DD (inclusive)")
    parser.add_argument("--end-date", required=True, type=iso_date, help="YYYY-MM-DD (inclusive)")
    parser.add_argument("-o", "--output", required=True, help="Output file path (.csv or .json)")
    args = parser.parse_args()

    load_dotenv()
    token = os.getenv("FOOTBALL_DATA_API_TOKEN")
    if not token:
        raise SystemExit("Missing FOOTBALL_DATA_API_TOKEN in environment (.env)")

    payload = fetch_matches(args.start_date, args.end_date, token)
    matches = [normalize_match(raw) for raw in payload.get("matches", [])]

    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)

    # Dispatch on the output extension; anything else is rejected.
    savers = {".csv": save_csv, ".json": save_json}
    ext = os.path.splitext(args.output)[1].lower()
    try:
        saver = savers[ext]
    except KeyError:
        raise SystemExit("Output must end with .csv or .json") from None
    saver(matches, args.output)

    print(f"Saved {len(matches)} matches to {args.output}")


if __name__ == "__main__":
    main()
class GPTSentiment:
    """
    Minimal client for a local GPT model served by Ollama.

    Expects the model to respond with a strict JSON object like:
    {"label": "neg|neu|pos", "confidence": 0.0..1.0}

    Endpoint used: POST {base_url}/api/generate with payload:
    {"model": , "prompt": , "stream": false, "format": "json"}
    """

    def __init__(self, base_url: str = "http://localhost:11434", model: str = "llama3", timeout: int = 30):
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.timeout = timeout

    def _build_prompt(self, text: str) -> str:
        # Keep the instruction terse and deterministic; request strict JSON.
        return (
            "You are a strict JSON generator for sentiment analysis. "
            "Classify the INPUT text as one of: neg, neu, pos. "
            "Return ONLY a JSON object with keys 'label' and 'confidence' (0..1). "
            "No markdown, no prose.\n\n"
            f"INPUT: {text}"
        )

    def _call(self, prompt: str) -> dict:
        """POST the prompt to Ollama and parse the model's JSON response.

        Raises:
            requests.HTTPError: On a non-2xx response.
            json.JSONDecodeError: If the response is not recoverable JSON.
        """
        url = f"{self.base_url}/api/generate"
        payload = {
            "model": self.model,
            "prompt": prompt,
            "stream": False,
            "format": "json",
        }
        r = requests.post(url, json=payload, timeout=self.timeout)
        r.raise_for_status()
        data = r.json()
        # Ollama returns the model's response under 'response'
        raw = data.get("response", "").strip()
        try:
            return json.loads(raw)
        except Exception:
            # BUGFIX: also strip a language tag on the opening fence
            # (e.g. ```json), which the plain removeprefix("```") missed.
            cleaned = raw.strip().removeprefix("```").removesuffix("```").strip()
            if cleaned.lower().startswith("json"):
                cleaned = cleaned[4:].lstrip()
            return json.loads(cleaned)

    @staticmethod
    def _canonical_label(s: str) -> str:
        """Normalize a free-form model label to 'neg' / 'neu' / 'pos'."""
        s = (s or "").strip().lower()
        if "neg" in s:
            return "neg"
        if "neu" in s or "neutral" in s:
            return "neu"
        if "pos" in s or "positive" in s:
            return "pos"
        return s or "neu"

    @staticmethod
    def _compound_from_label_conf(label: str, confidence: float) -> float:
        """Map (label, confidence) to a signed VADER-style compound in [-1, 1]."""
        label = GPTSentiment._canonical_label(label)
        c = max(0.0, min(1.0, float(confidence or 0.0)))
        if label == "pos":
            return c
        if label == "neg":
            return -c
        return 0.0

    def predict_label_conf_batch(self, texts: List[str], batch_size: int = 8) -> Tuple[List[str], List[float]]:
        """Classify each text; returns parallel lists of labels and confidences.

        Requests are issued one text at a time (the local endpoint takes a
        single prompt); `batch_size` only controls chunked traversal and is
        kept for interface compatibility. Any failure degrades to ('neu', 0.0).
        """
        labels: List[str] = []
        confs: List[float] = []
        for start in range(0, len(texts), batch_size):
            for text in texts[start:start + batch_size]:
                try:
                    obj = self._call(self._build_prompt(text))
                    lab = self._canonical_label(obj.get("label", ""))
                    conf = float(obj.get("confidence", 0.0))
                except Exception:
                    lab, conf = "neu", 0.0
                labels.append(lab)
                confs.append(conf)
        return labels, confs

    def predict_compound_batch(self, texts: List[str], batch_size: int = 8) -> List[float]:
        """Map each text to a compound sentiment score in [-1, 1]."""
        labels, confs = self.predict_label_conf_batch(texts, batch_size=batch_size)
        return [self._compound_from_label_conf(lab, conf) for lab, conf in zip(labels, confs)]
def load_messages(csv_path: str, text_col: str = 'message', extra_cols=None) -> pd.DataFrame:
    """Load id/text/date (plus optional extra columns) from a CSV.

    The text column is renamed to 'message'. Returns an empty frame when the
    file is missing or lacks the text column, so callers can concat blindly.
    """
    if not os.path.exists(csv_path):
        return pd.DataFrame()
    df = pd.read_csv(csv_path)
    if text_col not in df.columns:
        return pd.DataFrame()
    wanted = ['id', text_col, 'date'] + [c for c in (extra_cols or []) if c in df.columns]
    present = [c for c in wanted if c in df.columns]
    out = df[present].copy()
    return out.rename(columns={text_col: 'message'})


def main():
    """Build a human-labeling CSV by combining, cleaning, and sampling messages."""
    parser = argparse.ArgumentParser(description='Create a labeling CSV from posts and/or replies.')
    parser.add_argument('--posts-csv', required=False, help='Posts CSV path (e.g., data/..._update.csv)')
    parser.add_argument('--replies-csv', required=False, help='Replies CSV path')
    parser.add_argument('-o', '--output', default='data/labeled_sentiment.csv', help='Output CSV for labeling')
    parser.add_argument('--sample-size', type=int, default=1000, help='Total rows to include (after combining)')
    parser.add_argument('--min-length', type=int, default=3, help='Minimum message length to include')
    parser.add_argument('--shuffle', action='store_true', help='Shuffle before sampling (default true)')
    parser.add_argument('--no-shuffle', dest='shuffle', action='store_false')
    parser.set_defaults(shuffle=True)
    args = parser.parse_args()

    sources = []
    if args.posts_csv:
        sources.append(load_messages(args.posts_csv))
    if args.replies_csv:
        # Replies keep their parent_id when the column exists.
        sources.append(load_messages(args.replies_csv, extra_cols=['parent_id']))
    if not sources:
        raise SystemExit('No input CSVs provided. Use --posts-csv and/or --replies-csv.')

    df = pd.concat(sources, ignore_index=True)
    # Clean: coerce text, enforce a minimum length, drop duplicate messages.
    df['message'] = df['message'].fillna('').astype(str)
    df = df[df['message'].str.len() >= args.min_length]
    df = df.drop_duplicates(subset=['message']).reset_index(drop=True)

    if args.shuffle:
        # Fixed seed so repeated runs produce the same labeling set.
        df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)
    if args.sample_size and len(df) > args.sample_size:
        df = df.head(args.sample_size)

    # Blank column for the human annotator to fill in.
    df.insert(1, 'label', '')

    os.makedirs(os.path.dirname(args.output) or '.', exist_ok=True)
    df.to_csv(args.output, index=False)
    print(f"Wrote labeling CSV with {len(df)} rows to {args.output}")


if __name__ == '__main__':
    main()
def safe_read(path: str) -> pd.DataFrame:
    """Load the labeled CSV, validating presence and coercing column types.

    Raises SystemExit when the file is missing or has no 'label' column;
    normalizes 'message' to str, 'confidence' to numeric, 'date' to datetime.
    """
    if not os.path.exists(path):
        raise SystemExit(f"Input labeled CSV not found: {path}")
    df = pd.read_csv(path)
    if 'label' not in df.columns:
        raise SystemExit("Expected a 'label' column in the labeled CSV")

    # Per-column coercions, applied only when the column exists.
    coercions = {
        'message': lambda s: s.fillna('').astype(str),
        'confidence': lambda s: pd.to_numeric(s, errors='coerce'),
        'date': lambda s: pd.to_datetime(s, errors='coerce'),
    }
    for col, coerce in coercions.items():
        if col in df.columns:
            df[col] = coerce(df[col])
    return df


def ensure_out_dir(out_dir: str) -> str:
    """Create *out_dir* if needed and hand it back for chaining."""
    os.makedirs(out_dir, exist_ok=True)
    return out_dir
def plot_all(df: pd.DataFrame, out_dir: str) -> None:
    """Render the full set of labeled-data plots into *out_dir*.

    Each figure is saved as a PNG; a failing plot is reported and skipped so
    one bad column never aborts the rest.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set_style('whitegrid')

    out_dir = ensure_out_dir(out_dir)
    label_order = ['neg', 'neu', 'pos']
    label_colors = ['#d62728', '#aaaaaa', '#2ca02c']

    def _save(filename: str) -> None:
        # Shared tail for every figure: layout, save, close, report.
        plt.tight_layout()
        path = os.path.join(out_dir, filename)
        plt.savefig(path, dpi=150)
        plt.close()
        print(f"[plots] Saved {path}")

    # 1) Class distribution
    try:
        plt.figure(figsize=(6, 4))
        counts = (df['label'].astype(str).str.lower().value_counts()
                  .reindex(label_order)
                  .fillna(0)
                  .rename_axis('label').reset_index(name='count')
                  .set_index('label'))
        counts.plot(kind='bar', legend=False, color=label_colors)
        plt.title('Labeled class distribution')
        plt.ylabel('Count')
        _save('labeled_class_distribution.png')
    except Exception as e:
        print(f"[plots] Skipped class distribution: {e}")

    # 2) Confidence histogram (overall)
    if 'confidence' in df.columns and df['confidence'].notna().any():
        try:
            plt.figure(figsize=(6, 4))
            sns.histplot(df['confidence'].dropna(), bins=30, color='#1f77b4')
            plt.title('Confidence distribution (overall)')
            plt.xlabel('Confidence')
            plt.ylabel('Frequency')
            _save('labeled_confidence_hist.png')
        except Exception as e:
            print(f"[plots] Skipped confidence histogram: {e}")

    # 3) Confidence by label (boxplot)
    try:
        plt.figure(figsize=(6, 4))
        conf = df[['label', 'confidence']].dropna()
        conf['label'] = conf['label'].astype(str).str.lower()
        sns.boxplot(data=conf, x='label', y='confidence', order=label_order, palette=label_colors)
        plt.title('Confidence by label')
        plt.xlabel('Label')
        plt.ylabel('Confidence')
        _save('labeled_confidence_by_label.png')
    except Exception as e:
        print(f"[plots] Skipped confidence by label: {e}")

    # 4) Message length by label
    if 'message' in df.columns:
        try:
            lengths = df[['label', 'message']].copy()
            lengths['label'] = lengths['label'].astype(str).str.lower()
            lengths['len'] = lengths['message'].astype(str).str.len()
            plt.figure(figsize=(6, 4))
            sns.boxplot(data=lengths, x='label', y='len', order=label_order, palette=label_colors)
            plt.title('Message length by label')
            plt.xlabel('Label')
            plt.ylabel('Length (chars)')
            _save('labeled_length_by_label.png')
        except Exception as e:
            print(f"[plots] Skipped length by label: {e}")

    # 5) Daily counts per label (if date present)
    if 'date' in df.columns and df['date'].notna().any():
        try:
            daily = df[['date', 'label']].dropna().copy()
            daily['day'] = pd.to_datetime(daily['date'], errors='coerce').dt.date
            daily['label'] = daily['label'].astype(str).str.lower()
            pv = daily.pivot_table(index='day', columns='label', values='date', aggfunc='count').fillna(0)
            # Ensure a consistent column order even when a class is absent.
            for col in label_order:
                if col not in pv.columns:
                    pv[col] = 0
            pv = pv[label_order]
            plt.figure(figsize=(10, 4))
            pv.plot(kind='bar', stacked=True, color=label_colors)
            plt.title('Daily labeled counts (stacked)')
            plt.xlabel('Day')
            plt.ylabel('Count')
            _save('labeled_daily_counts.png')
        except Exception as e:
            print(f"[plots] Skipped daily counts: {e}")
def main():
    """CLI entry point: read the labeled CSV and emit every plot."""
    parser = argparse.ArgumentParser(description='Plot graphs from labeled sentiment data.')
    parser.add_argument('-i', '--input', default='data/labeled_sentiment.csv', help='Path to labeled CSV')
    parser.add_argument('-o', '--out-dir', default='data', help='Output directory for plots')
    args = parser.parse_args()

    plot_all(safe_read(args.input), args.out_dir)


if __name__ == '__main__':
    main()
@dataclass
class ScrapedMessage:
    """Flat record for one channel post, ready for CSV/JSONL export."""
    id: int
    date: Optional[str]  # ISO format
    message: Optional[str]
    sender_id: Optional[int]
    views: Optional[int]
    forwards: Optional[int]
    replies: Optional[int]
    url: Optional[str]


def to_iso(dt: datetime) -> str:
    """Serialize a datetime as naive ISO-8601 (tzinfo stripped)."""
    naive = dt.replace(tzinfo=None)
    return naive.isoformat()


async def iter_messages(
    client: TelegramClient,
    entity: str,
    limit: Optional[int] = None,
    offset_date: Optional[datetime] = None,
) -> AsyncIterator[Message]:
    """Thin async passthrough over Telethon's message iterator."""
    async for msg in client.iter_messages(entity, limit=limit, offset_date=offset_date):
        yield msg


def message_to_record(msg: Message, channel_username: str) -> ScrapedMessage:
    """Project a Telethon message onto the flat ScrapedMessage record."""
    if hasattr(msg, 'sender_id'):
        # Peer ids may be wrapped objects exposing `.value`.
        sender = getattr(msg.sender_id, 'value', msg.sender_id)
    else:
        sender = None
    reply_info = getattr(msg, 'replies', None)
    return ScrapedMessage(
        id=msg.id,
        date=to_iso(msg.date) if msg.date else None,
        message=msg.message,
        sender_id=sender,
        views=getattr(msg, 'views', None),
        forwards=getattr(msg, 'forwards', None),
        replies=reply_info.replies if reply_info else None,
        url=f"https://t.me/{channel_username}/{msg.id}" if channel_username else None,
    )


async def ensure_login(client: TelegramClient, phone: Optional[str] = None, twofa_password: Optional[str] = None):
    """Connect and complete interactive login (code + optional 2FA) if needed."""
    await client.connect()
    if await client.is_user_authorized():
        return
    if not phone:
        phone = input("Enter your phone number (with country code): ")
    await client.send_code_request(phone)
    code = input("Enter the login code you received: ")
    try:
        await client.sign_in(phone=phone, code=code)
    except SessionPasswordNeededError:
        pwd = twofa_password if twofa_password is not None else input("Two-step verification enabled. Enter your password: ")
        await client.sign_in(password=pwd)


async def scrape_channel(
    channel: str,
    output: str,
    limit: Optional[int] = None,
    offset_date: Optional[str] = None,  # deprecated in favor of start_date
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    append: bool = False,
    session_name: str = "telegram",
    phone: Optional[str] = None,
    twofa_password: Optional[str] = None,
):
    """Download a channel's history into a CSV or JSONL file.

    Messages are iterated newest-first and filtered to the inclusive
    [start_date, end_date] window; iteration stops early once a message
    older than start_date is seen. Returns the number of records written.
    """
    load_dotenv()
    api_id = os.getenv("TELEGRAM_API_ID")
    api_hash = os.getenv("TELEGRAM_API_HASH")
    session_name = os.getenv("TELEGRAM_SESSION_NAME", session_name)

    if not api_id or not api_hash:
        raise RuntimeError("Missing TELEGRAM_API_ID/TELEGRAM_API_HASH in environment. See .env.example")

    # Some providers store api_id as string; Telethon expects int
    try:
        api_id_int = int(api_id)
    except Exception as e:
        raise RuntimeError("TELEGRAM_API_ID must be an integer") from e

    client = TelegramClient(session_name, api_id_int, api_hash)

    # Resolve the date window; --offset-date is honored as a legacy alias.
    window_start = None
    window_end = None
    if start_date:
        window_start = datetime.fromisoformat(start_date)
    elif offset_date:
        window_start = datetime.fromisoformat(offset_date)
    if end_date:
        window_end = datetime.fromisoformat(end_date)

    await ensure_login(client, phone=phone, twofa_password=twofa_password)

    ext = os.path.splitext(output)[1].lower()
    is_jsonl = ext in (".jsonl", ".ndjson")
    is_csv = ext == ".csv"
    if not (is_jsonl or is_csv):
        raise ValueError("Output file must end with .jsonl or .csv")

    def _header_needed(path: str) -> bool:
        # Skip the header only when appending to a non-empty existing file.
        if not append:
            return True
        try:
            return not (os.path.exists(path) and os.path.getsize(path) > 0)
        except Exception:
            return True

    csv_file = None
    csv_writer = None
    jsonl_file = None
    mode = "a" if append else "w"
    if is_csv:
        import csv

        csv_file = open(output, mode, newline="", encoding="utf-8")
        csv_writer = csv.DictWriter(
            csv_file,
            fieldnames=["id", "date", "message", "sender_id", "views", "forwards", "replies", "url"],
        )
        if _header_needed(output):
            csv_writer.writeheader()
    else:
        jsonl_file = open(output, mode, encoding="utf-8")

    written = 0
    try:
        async for msg in iter_messages(client, channel, limit=None, offset_date=None):
            # Telethon returns tz-aware datetimes; normalize for comparison.
            msg_dt = msg.date.replace(tzinfo=None) if msg.date is not None else None

            if window_start and msg_dt and msg_dt < window_start:
                # Newest-first iteration: everything from here on is older.
                break
            if window_end and msg_dt and msg_dt > window_end:
                continue

            rec = asdict(message_to_record(msg, channel_username=channel.lstrip("@")))
            if jsonl_file is not None:
                jsonl_file.write(json.dumps(rec, ensure_ascii=False) + "\n")
            else:
                csv_writer.writerow(rec)  # type: ignore
            written += 1
            if limit is not None and written >= limit:
                break
    finally:
        for fh in (csv_file, jsonl_file):
            if fh:
                fh.close()
        await client.disconnect()

    return written
async def fetch_replies(
    channel: str,
    parent_ids: Sequence[int],
    output_csv: str,
    append: bool = False,
    session_name: str = "telegram",
    phone: Optional[str] = None,
    twofa_password: Optional[str] = None,
    concurrency: int = 5,
    existing_pairs: Optional[Set[Tuple[int, int]]] = None,
):
    """Fetch replies for the given parent message ids and stream them to CSV.

    In-channel replies are tried first; for broadcast channels with a linked
    discussion group the code falls back to GetDiscussionMessageRequest. Each
    row carries a VADER compound sentiment score. FloodWaits are retried (up
    to 3 attempts per scan) and summarized at the end. With `existing_pairs`
    provided (resume mode), already-seen (parent_id, id) pairs are skipped.
    """
    load_dotenv()
    api_id = os.getenv("TELEGRAM_API_ID")
    api_hash = os.getenv("TELEGRAM_API_HASH")
    session_name = os.getenv("TELEGRAM_SESSION_NAME", session_name)

    if not api_id or not api_hash:
        raise RuntimeError("Missing TELEGRAM_API_ID/TELEGRAM_API_HASH in environment. See .env.example")
    client = TelegramClient(session_name, int(api_id), api_hash)
    await ensure_login(client, phone=phone, twofa_password=twofa_password)

    import csv

    # Rate limiting counters (mutated from nested coroutines via `nonlocal`).
    flood_hits = 0
    flood_wait_seconds = 0

    analyzer = SentimentIntensityAnalyzer()
    os.makedirs(os.path.dirname(output_csv) or ".", exist_ok=True)
    mode = "a" if append else "w"
    with open(output_csv, mode, newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=["parent_id", "id", "date", "message", "sender_id", "sentiment_compound", "url"],
        )
        # Write header only if not appending or file empty
        need_header = True
        try:
            if append and os.path.exists(output_csv) and os.path.getsize(output_csv) > 0:
                need_header = False
        except Exception:
            pass
        if need_header:
            writer.writeheader()

        write_lock = asyncio.Lock()
        sem = asyncio.Semaphore(max(1, int(concurrency)))

        async def handle_parent(pid: int) -> List[dict]:
            # BUGFIX: without `nonlocal`, the `flood_hits += 1` below made
            # these names function-local and raised UnboundLocalError the
            # first time a FloodWaitError was handled.
            nonlocal flood_hits, flood_wait_seconds
            rows: List[dict] = []

            # First try replies within the same channel (works for groups/supergroups)
            attempts = 0
            while attempts < 3:
                try:
                    async for reply in client.iter_messages(channel, reply_to=pid):
                        dt = reply.date.replace(tzinfo=None) if reply.date else None
                        url = f"https://t.me/{channel.lstrip('@')}/{reply.id}" if reply.id else None
                        text = reply.message or ""
                        sent = analyzer.polarity_scores(text).get("compound")
                        rows.append(
                            {
                                "parent_id": pid,
                                "id": reply.id,
                                "date": to_iso(dt) if dt else None,
                                "message": text,
                                "sender_id": getattr(reply, "sender_id", None),
                                "sentiment_compound": sent,
                                "url": url,
                            }
                        )
                    break
                except FloodWaitError as e:
                    secs = int(getattr(e, 'seconds', 5))
                    flood_hits += 1
                    flood_wait_seconds += secs
                    print(f"[rate-limit] FloodWait while scanning replies in-channel for parent {pid}; waiting {secs}s", flush=True)
                    await asyncio.sleep(secs + 1)
                    attempts += 1
                    continue
                except MsgIdInvalidError:
                    # Likely a channel with a linked discussion group; fall back below
                    rows.clear()
                    break
                except Exception:
                    break

            if rows:
                return rows

            # Fallback: for channels with comments in a linked discussion group
            try:
                res = await client(GetDiscussionMessageRequest(peer=channel, msg_id=pid))
            except Exception:
                # No discussion thread found or not accessible
                return rows

            # Identify the discussion chat and the root message id in that chat
            disc_chat = res.chats[0] if getattr(res, "chats", None) else None

            disc_root_id = None
            for m in getattr(res, "messages", []) or []:
                try:
                    peer_id = getattr(m, "peer_id", None)
                    if not peer_id or not disc_chat:
                        continue
                    ch_id = getattr(peer_id, "channel_id", None) or getattr(peer_id, "chat_id", None)
                    if ch_id == getattr(disc_chat, "id", None):
                        disc_root_id = m.id
                        break
                except Exception:
                    continue

            if not disc_chat or not disc_root_id:
                return rows

            group_username = getattr(disc_chat, "username", None)
            attempts = 0
            while attempts < 3:
                try:
                    async for reply in client.iter_messages(disc_chat, reply_to=disc_root_id):
                        dt = reply.date.replace(tzinfo=None) if reply.date else None
                        text = reply.message or ""
                        sent = analyzer.polarity_scores(text).get("compound")
                        # Construct URL only if the discussion group has a public username
                        url = None
                        if group_username and reply.id:
                            url = f"https://t.me/{group_username}/{reply.id}"
                        rows.append(
                            {
                                "parent_id": pid,
                                "id": reply.id,
                                "date": to_iso(dt) if dt else None,
                                "message": text,
                                "sender_id": getattr(reply, "sender_id", None),
                                "sentiment_compound": sent,
                                "url": url,
                            }
                        )
                    break
                except FloodWaitError as e:
                    secs = int(getattr(e, 'seconds', 5))
                    flood_hits += 1
                    flood_wait_seconds += secs
                    print(f"[rate-limit] FloodWait while scanning discussion group for parent {pid}; waiting {secs}s", flush=True)
                    await asyncio.sleep(secs + 1)
                    attempts += 1
                    continue
                except Exception:
                    break
            return rows

        total_written = 0
        processed = 0
        total = len(parent_ids) if hasattr(parent_ids, '__len__') else None

        async def worker(pid: int):
            nonlocal total_written, processed
            async with sem:
                rows = await handle_parent(int(pid))
                async with write_lock:
                    if rows:
                        # Dedupe against existing pairs if provided (resume mode)
                        if existing_pairs is not None:
                            filtered: List[dict] = []
                            for r in rows:
                                try:
                                    key = (int(r.get("parent_id")), int(r.get("id")))
                                except Exception:
                                    continue
                                if key in existing_pairs:
                                    continue
                                existing_pairs.add(key)
                                filtered.append(r)
                            rows = filtered
                        if rows:
                            writer.writerows(rows)
                            total_written += len(rows)
                    processed += 1
                    if processed % 10 == 0 or (rows and len(rows) > 0):
                        if total is not None:
                            print(f"[replies] processed {processed}/{total} parents; last parent {pid} wrote {len(rows)} replies; total replies {total_written}", flush=True)
                        else:
                            print(f"[replies] processed {processed} parents; last parent {pid} wrote {len(rows)} replies; total replies {total_written}", flush=True)

        tasks = [asyncio.create_task(worker(pid)) for pid in parent_ids]
        await asyncio.gather(*tasks)

    await client.disconnect()
    if flood_hits:
        print(f"[rate-limit] Summary: {flood_hits} FloodWait events; total waited ~{flood_wait_seconds}s", flush=True)
async def fetch_forwards(
    channel: str,
    parent_ids: Set[int],
    output_csv: str,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    scan_limit: Optional[int] = None,
    append: bool = False,
    session_name: str = "telegram",
    phone: Optional[str] = None,
    twofa_password: Optional[str] = None,
    concurrency: int = 5,
    chunk_size: int = 1000,
):
    """Best-effort: find forwarded messages within the SAME channel that reference the given parent_ids.

    Telegram API does not provide a global reverse-lookup of forwards across all
    channels; we therefore scan this channel's history and collect messages with
    fwd_from.channel_post matching a parent id.

    Without `scan_limit` the history is scanned sequentially (unbounded); with
    it, the newest `scan_limit` ids are split into `chunk_size` ranges scanned
    concurrently, bounded by `concurrency`.
    """
    load_dotenv()
    api_id = os.getenv("TELEGRAM_API_ID")
    api_hash = os.getenv("TELEGRAM_API_HASH")
    session_name = os.getenv("TELEGRAM_SESSION_NAME", session_name)
    if not api_id or not api_hash:
        raise RuntimeError("Missing TELEGRAM_API_ID/TELEGRAM_API_HASH in environment. See .env.example")
    client = TelegramClient(session_name, int(api_id), api_hash)
    await ensure_login(client, phone=phone, twofa_password=twofa_password)

    import csv  # cleanup: previously imported twice

    # Rate limiting counters, defined once at function scope for both
    # scan strategies (cleanup: were initialized twice and summarized via a
    # fragile `'flood_hits' in locals()` check).
    flood_hits = 0
    flood_wait_seconds = 0

    analyzer = SentimentIntensityAnalyzer()
    os.makedirs(os.path.dirname(output_csv) or ".", exist_ok=True)
    mode = "a" if append else "w"
    write_lock = asyncio.Lock()
    with open(output_csv, mode, newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=["parent_id", "id", "date", "message", "sender_id", "sentiment_compound", "url"],
        )
        need_header = True
        try:
            if append and os.path.exists(output_csv) and os.path.getsize(output_csv) > 0:
                need_header = False
        except Exception:
            pass
        if need_header:
            writer.writeheader()

        parsed_start = datetime.fromisoformat(start_date) if start_date else None
        parsed_end = datetime.fromisoformat(end_date) if end_date else None

        def match_row(msg, ch_post, dt) -> dict:
            # Build one output row for a matched forward (shared by both paths).
            text = msg.message or ""
            return {
                "parent_id": int(ch_post),
                "id": msg.id,
                "date": to_iso(dt) if dt else None,
                "message": text,
                "sender_id": getattr(msg, "sender_id", None),
                "sentiment_compound": analyzer.polarity_scores(text).get("compound"),
                "url": f"https://t.me/{channel.lstrip('@')}/{msg.id}" if msg.id else None,
            }

        # If no scan_limit provided, fall back to sequential scan to avoid unbounded concurrency
        if scan_limit is None:
            scanned = 0
            matched = 0
            async for msg in client.iter_messages(channel, limit=None):
                dt = msg.date.replace(tzinfo=None) if msg.date else None
                if parsed_start and dt and dt < parsed_start:
                    break
                if parsed_end and dt and dt > parsed_end:
                    continue
                fwd = getattr(msg, "fwd_from", None)
                if not fwd:
                    continue
                ch_post = getattr(fwd, "channel_post", None)
                if ch_post and int(ch_post) in parent_ids:
                    writer.writerow(match_row(msg, ch_post, dt))
                    matched += 1
                scanned += 1
                if scanned % 1000 == 0:
                    print(f"[forwards] scanned ~{scanned} messages; total forwards {matched}", flush=True)
        else:
            # Concurrent chunked scanning by id ranges
            sem = asyncio.Semaphore(max(1, int(concurrency)))
            progress_lock = asyncio.Lock()
            matched_total = 0
            completed_chunks = 0

            # Determine latest message id
            latest_msg = await client.get_messages(channel, limit=1)
            try:
                latest_id = getattr(latest_msg, 'id', None) or (latest_msg[0].id if latest_msg else None)
            except Exception:
                latest_id = None
            if not latest_id:
                await client.disconnect()
                return

            total_chunks = max(1, (int(scan_limit) + int(chunk_size) - 1) // int(chunk_size))

            async def process_chunk(idx: int):
                nonlocal flood_hits, flood_wait_seconds, matched_total, completed_chunks
                max_id = latest_id - idx * int(chunk_size)
                min_id = max(0, max_id - int(chunk_size))
                attempts = 0
                local_matches = 0
                while attempts < 3:
                    try:
                        async with sem:
                            async for msg in client.iter_messages(channel, min_id=min_id, max_id=max_id):
                                dt = msg.date.replace(tzinfo=None) if msg.date else None
                                if parsed_start and dt and dt < parsed_start:
                                    # This range reached before start; skip remaining in this chunk
                                    break
                                if parsed_end and dt and dt > parsed_end:
                                    continue
                                fwd = getattr(msg, "fwd_from", None)
                                if not fwd:
                                    continue
                                ch_post = getattr(fwd, "channel_post", None)
                                if ch_post and int(ch_post) in parent_ids:
                                    row = match_row(msg, ch_post, dt)
                                    async with write_lock:
                                        writer.writerow(row)
                                    local_matches += 1
                        break
                    except FloodWaitError as e:
                        secs = int(getattr(e, 'seconds', 5))
                        flood_hits += 1
                        flood_wait_seconds += secs
                        print(f"[rate-limit] FloodWait while scanning ids {min_id}-{max_id}; waiting {secs}s", flush=True)
                        await asyncio.sleep(secs + 1)
                        attempts += 1
                        continue
                    except Exception:
                        # best-effort; skip this chunk
                        break
                async with progress_lock:
                    matched_total += local_matches
                    completed_chunks += 1
                    print(
                        f"[forwards] chunks {completed_chunks}/{total_chunks}; last {min_id}-{max_id} wrote {local_matches} forwards; total forwards {matched_total}",
                        flush=True,
                    )

            tasks = [asyncio.create_task(process_chunk(i)) for i in range(total_chunks)]
            await asyncio.gather(*tasks)

    await client.disconnect()
    # The sequential path never increments the counters, so this summary
    # fires only after concurrent scanning hit rate limits.
    if flood_hits:
        print(f"[rate-limit] Summary: {flood_hits} FloodWait events; total waited ~{flood_wait_seconds}s", flush=True)
@python, https://t.me/python") + p_scrape.add_argument("--output", "-o", required=True, help="Output file (.jsonl or .csv)") + p_scrape.add_argument("--limit", type=int, default=None, help="Max number of messages to save after filtering") + p_scrape.add_argument("--offset-date", dest="offset_date", default=None, help="Deprecated: use --start-date instead. ISO date (inclusive)") + p_scrape.add_argument("--start-date", dest="start_date", default=None, help="ISO start date (inclusive)") + p_scrape.add_argument("--end-date", dest="end_date", default=None, help="ISO end date (inclusive)") + p_scrape.add_argument("--append", action="store_true", help="Append to the output file instead of overwriting") + p_scrape.add_argument("--session-name", default=os.getenv("TELEGRAM_SESSION_NAME", "telegram")) + p_scrape.add_argument("--phone", default=None) + p_scrape.add_argument("--twofa-password", default=os.getenv("TELEGRAM_2FA_PASSWORD")) + + # Subcommand: fetch replies for specific message ids + p_rep = sub.add_parser("replies", help="Fetch replies for given message IDs and save to CSV") + p_rep.add_argument("channel", help="Channel username or t.me link") + src = p_rep.add_mutually_exclusive_group(required=True) + src.add_argument("--ids", help="Comma-separated parent message IDs") + src.add_argument("--from-csv", dest="from_csv", help="Path to CSV with an 'id' column to use as parent IDs") + p_rep.add_argument("--output", "-o", required=True, help="Output CSV path (e.g., data/replies_channel.csv)") + p_rep.add_argument("--append", action="store_true", help="Append to the output file instead of overwriting") + p_rep.add_argument("--session-name", default=os.getenv("TELEGRAM_SESSION_NAME", "telegram")) + p_rep.add_argument("--phone", default=None) + p_rep.add_argument("--twofa-password", default=os.getenv("TELEGRAM_2FA_PASSWORD")) + p_rep.add_argument("--concurrency", type=int, default=5, help="Number of parent IDs to process in parallel (default 5)") + 
p_rep.add_argument("--min-replies", type=int, default=None, help="When using --from-csv, only process parents with replies >= this value") + p_rep.add_argument("--resume", action="store_true", help="Resume mode: skip parent_id,id pairs already present in the output CSV") + + # Subcommand: fetch forwards (same-channel forwards referencing parent ids) + p_fwd = sub.add_parser("forwards", help="Best-effort: find forwards within the same channel for given parent IDs") + p_fwd.add_argument("channel", help="Channel username or t.me link") + src2 = p_fwd.add_mutually_exclusive_group(required=True) + src2.add_argument("--ids", help="Comma-separated parent message IDs") + src2.add_argument("--from-csv", dest="from_csv", help="Path to CSV with an 'id' column to use as parent IDs") + p_fwd.add_argument("--output", "-o", required=True, help="Output CSV path (e.g., data/forwards_channel.csv)") + p_fwd.add_argument("--start-date", dest="start_date", default=None) + p_fwd.add_argument("--end-date", dest="end_date", default=None) + p_fwd.add_argument("--scan-limit", dest="scan_limit", type=int, default=None, help="Max messages to scan in channel history") + p_fwd.add_argument("--concurrency", type=int, default=5, help="Number of id-chunks to scan in parallel (requires --scan-limit)") + p_fwd.add_argument("--chunk-size", dest="chunk_size", type=int, default=1000, help="Approx. 
messages per chunk (ids)") + p_fwd.add_argument("--append", action="store_true", help="Append to the output file instead of overwriting") + p_fwd.add_argument("--session-name", default=os.getenv("TELEGRAM_SESSION_NAME", "telegram")) + p_fwd.add_argument("--phone", default=None) + p_fwd.add_argument("--twofa-password", default=os.getenv("TELEGRAM_2FA_PASSWORD")) + + args = parser.parse_args() + + # Normalize channel + channel = getattr(args, "channel", None) + if channel and channel.startswith("https://t.me/"): + channel = channel.replace("https://t.me/", "@") + + def _normalize_handle(ch: Optional[str]) -> Optional[str]: + if not ch: + return ch + # Expect inputs like '@name' or 'name'; return lowercase without leading '@' + return ch.lstrip('@').lower() + + def _extract_handle_from_url(url: str) -> Optional[str]: + try: + if not url: + return None + # Accept forms like https://t.me/Name/123 or http(s)://t.me/c// + # Only public usernames (not /c/ links) can be compared reliably + if "/t.me/" in url: + # crude parse without urlparse to avoid dependency + after = url.split("t.me/")[-1] + parts = after.split('/') + if parts and parts[0] and parts[0] != 'c': + return parts[0] + except Exception: + return None + return None + + if args.command == "scrape": + written = asyncio.run( + scrape_channel( + channel=channel, + output=args.output, + limit=args.limit, + offset_date=args.offset_date, + start_date=args.start_date, + end_date=args.end_date, + append=getattr(args, "append", False), + session_name=args.session_name, + phone=args.phone, + twofa_password=args.twofa_password, + ) + ) + print(f"Wrote {written} messages to {args.output}") + elif args.command == "replies": + # If using --from-csv, try to infer channel from URLs and warn on mismatch + try: + if getattr(args, 'from_csv', None): + import pandas as _pd # local import to keep startup light + # Read a small sample of URL column to detect handle + sample = _pd.read_csv(args.from_csv, usecols=['url'], nrows=20) + 
url_handles = [ + _extract_handle_from_url(str(u)) for u in sample['url'].dropna().tolist() if isinstance(u, (str,)) + ] + inferred = next((h for h in url_handles if h), None) + provided = _normalize_handle(channel) + if inferred and provided and _normalize_handle(inferred) != provided: + print( + f"[warning] CSV appears to be from @{_normalize_handle(inferred)} but you passed -c @{provided}. " + f"Replies may be empty. Consider using -c https://t.me/{inferred}", + flush=True, + ) + except Exception: + # Best-effort only; ignore any issues reading/inspecting CSV + pass + parent_ids: Set[int] + if getattr(args, "ids", None): + parent_ids = {int(x.strip()) for x in args.ids.split(",") if x.strip()} + else: + import pandas as pd # local import + usecols = ['id'] + if args.min_replies is not None: + usecols.append('replies') + df = pd.read_csv(args.from_csv, usecols=usecols) + if args.min_replies is not None and 'replies' in df.columns: + df = df[df['replies'].fillna(0).astype(int) >= int(args.min_replies)] + parent_ids = set(int(x) for x in df['id'].dropna().astype(int).tolist()) + existing_pairs = None + if args.resume and os.path.exists(args.output): + try: + import csv as _csv + existing_pairs = set() + with open(args.output, "r", encoding="utf-8") as _f: + reader = _csv.DictReader(_f) + for row in reader: + try: + existing_pairs.add((int(row.get("parent_id")), int(row.get("id")))) + except Exception: + continue + except Exception: + existing_pairs = None + + asyncio.run( + fetch_replies( + channel=channel, + parent_ids=sorted(parent_ids), + output_csv=args.output, + append=getattr(args, "append", False), + session_name=args.session_name, + phone=args.phone, + twofa_password=args.twofa_password, + concurrency=max(1, int(getattr(args, 'concurrency', 5))), + existing_pairs=existing_pairs, + ) + ) + print(f"Saved replies to {args.output}") + elif args.command == "forwards": + parent_ids: Set[int] + if getattr(args, "ids", None): + parent_ids = {int(x.strip()) for x in 
args.ids.split(",") if x.strip()} + else: + import pandas as pd + df = pd.read_csv(args.from_csv) + parent_ids = set(int(x) for x in df['id'].dropna().astype(int).tolist()) + asyncio.run( + fetch_forwards( + channel=channel, + parent_ids=parent_ids, + output_csv=args.output, + start_date=args.start_date, + end_date=args.end_date, + scan_limit=args.scan_limit, + concurrency=max(1, int(getattr(args, 'concurrency', 5))), + chunk_size=max(1, int(getattr(args, 'chunk_size', 1000))), + append=getattr(args, "append", False), + session_name=args.session_name, + phone=args.phone, + twofa_password=args.twofa_password, + ) + ) + print(f"Saved forwards to {args.output}") + + +if __name__ == "__main__": + main() diff --git a/src/train_sentiment.py b/src/train_sentiment.py new file mode 100644 index 0000000..950f159 --- /dev/null +++ b/src/train_sentiment.py @@ -0,0 +1,135 @@ +import argparse +import os +from typing import Optional + +import pandas as pd +from datasets import Dataset, ClassLabel +from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer +import inspect +import numpy as np +from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score + + +def build_dataset(df: pd.DataFrame, text_col: str, label_col: str, label_mapping: Optional[dict] = None) -> Dataset: + d = df[[text_col, label_col]].dropna().copy() + # Normalize and drop empty labels + d[label_col] = d[label_col].astype(str).str.strip() + d = d[d[label_col] != ''] + if d.empty: + raise SystemExit("No labeled rows found. 
def build_dataset(df: pd.DataFrame, text_col: str, label_col: str, label_mapping: Optional[dict] = None) -> "tuple[Dataset, dict, dict]":
    """Turn a labeled dataframe into a HF ``Dataset`` with a ClassLabel target.

    Labels are normalized to stripped strings; empty labels are dropped.
    String labels are factorized via pandas categories; numeric labels are
    assumed to already be encoded as 0..N-1.

    Returns:
        (dataset, label2id, id2label) — the dataset's label column is a
        ClassLabel feature, which `train_test_split(stratify_by_column=...)`
        requires.

    Raises:
        SystemExit: when no usable labeled rows remain.
    """
    d = df[[text_col, label_col]].dropna().copy()
    # Normalize and drop empty labels
    d[label_col] = d[label_col].astype(str).str.strip()
    d = d[d[label_col] != '']
    if d.empty:
        raise SystemExit("No labeled rows found. Please fill the 'label' column in your CSV (e.g., neg/neu/pos or 0/1/2).")
    if label_mapping:
        d[label_col] = d[label_col].map(label_mapping)
    # If labels are strings, factorize them
    if d[label_col].dtype == object:
        d[label_col] = d[label_col].astype('category')
        label2id = {k: int(v) for v, k in enumerate(d[label_col].cat.categories)}
        id2label = {v: k for k, v in label2id.items()}
        d[label_col] = d[label_col].cat.codes
    else:
        # Assume numeric 0..N-1
        classes = sorted(d[label_col].unique().tolist())
        label2id = {str(c): int(c) for c in classes}
        id2label = {int(c): str(c) for c in classes}
    hf = Dataset.from_pandas(d.reset_index(drop=True))
    # Cast the integer codes directly to ClassLabel. The previous
    # class_encode_column() pass re-encoded by sorting the *stringified*
    # values ('10' < '2'), which could scramble the code<->name alignment
    # for >= 10 classes before the features dict was overwritten;
    # cast_column keeps our codes as-is and attaches the correct names.
    names = [id2label[i] for i in range(len(id2label))]
    hf = hf.cast_column(label_col, ClassLabel(num_classes=len(names), names=names))
    return hf, label2id, id2label


def tokenize_fn(examples, tokenizer, text_col):
    """Tokenize one batch of examples; padding is deferred to the collator."""
    return tokenizer(examples[text_col], truncation=True, padding=False)


def compute_metrics(eval_pred):
    """Accuracy plus macro-averaged precision/recall/F1 for Trainer evaluation."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        'accuracy': accuracy_score(labels, preds),
        'precision_macro': precision_score(labels, preds, average='macro', zero_division=0),
        'recall_macro': recall_score(labels, preds, average='macro', zero_division=0),
        'f1_macro': f1_score(labels, preds, average='macro', zero_division=0),
    }
def main():
    """Fine-tune a sequence-classification model on a labeled CSV.

    Reads --train-csv, builds a ClassLabel dataset via build_dataset(),
    optionally carves off a stratified eval split, tokenizes with the base
    model's tokenizer, trains with transformers' Trainer, and saves both the
    model and the tokenizer to --output-dir.
    """
    parser = argparse.ArgumentParser(description='Fine-tune a transformers model for sentiment.')
    parser.add_argument('--train-csv', required=True, help='Path to labeled CSV')
    parser.add_argument('--text-col', default='message', help='Text column name')
    parser.add_argument('--label-col', default='label', help='Label column name (e.g., pos/neu/neg or 2/1/0)')
    parser.add_argument('--model-name', default='distilbert-base-uncased', help='Base model name or path')
    parser.add_argument('--output-dir', default='models/sentiment-distilbert', help='Where to save the fine-tuned model')
    parser.add_argument('--epochs', type=int, default=3)
    parser.add_argument('--batch-size', type=int, default=16)
    parser.add_argument('--lr', type=float, default=5e-5)
    parser.add_argument('--eval-split', type=float, default=0.1, help='Fraction of data for eval')
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    df = pd.read_csv(args.train_csv)
    ds, label2id, id2label = build_dataset(df, args.text_col, args.label_col)
    if args.eval_split > 0:
        # Stratified split requires the label column to be a ClassLabel feature,
        # which build_dataset() guarantees.
        ds = ds.train_test_split(test_size=args.eval_split, seed=42, stratify_by_column=args.label_col)
        train_ds, eval_ds = ds['train'], ds['test']
    else:
        # No eval set: train on everything, skip evaluation entirely below.
        train_ds, eval_ds = ds, None

    num_labels = len(id2label)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name,
        num_labels=num_labels,
        id2label=id2label,
        label2id={k: int(v) for k, v in label2id.items()},
    )

    tokenized_train = train_ds.map(lambda x: tokenize_fn(x, tokenizer, args.text_col), batched=True)
    tokenized_eval = eval_ds.map(lambda x: tokenize_fn(x, tokenizer, args.text_col), batched=True) if (eval_ds is not None) else None

    # Build TrainingArguments with compatibility across transformers versions
    base_kwargs = {
        'output_dir': args.output_dir,
        'per_device_train_batch_size': args.batch_size,
        'per_device_eval_batch_size': args.batch_size,
        'num_train_epochs': args.epochs,
        'learning_rate': args.lr,
        'fp16': False,
        'logging_steps': 50,
    }
    eval_kwargs = {}
    if tokenized_eval is not None:
        # Set both evaluation_strategy and eval_strategy for compatibility across
        # transformers versions (the argument was renamed); the signature filter
        # below keeps only whichever one this installed version supports.
        eval_kwargs.update({
            'evaluation_strategy': 'epoch',
            'eval_strategy': 'epoch',
            'save_strategy': 'epoch',
            'load_best_model_at_end': True,
            'metric_for_best_model': 'f1_macro',
            'greater_is_better': True,
        })

    # Filter kwargs to only include parameters supported by this transformers version
    sig = inspect.signature(TrainingArguments.__init__)
    allowed = set(sig.parameters.keys())
    def _filter(d: dict) -> dict:
        # Drop any kwarg this TrainingArguments version does not accept.
        return {k: v for k, v in d.items() if k in allowed}

    training_args = TrainingArguments(**_filter(base_kwargs), **_filter(eval_kwargs))

    # NOTE(review): newer transformers deprecates Trainer(tokenizer=...) in
    # favor of processing_class= — confirm against the pinned version.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics if tokenized_eval else None,
    )

    trainer.train()
    trainer.save_model(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)
    print(f"Model saved to {args.output_dir}")
class TransformerSentiment:
    """Sentiment scorer backed by a local transformers classification model.

    Maps class probabilities to a VADER-style "compound" score in [-1, 1]
    (p_pos - p_neg) so downstream analytics can treat either backend uniformly.
    """

    def __init__(self, model_name_or_path: str, device: Optional[str] = None, max_length: int = 256):
        """Load tokenizer + model, pick a device (cuda > mps > cpu), set eval mode.

        Args:
            model_name_or_path: HF hub name or local path of a fine-tuned head.
            device: explicit 'cuda'/'mps'/'cpu'; auto-detected when None.
            max_length: truncation length used for every batch.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)
        self.max_length = max_length
        if device is None:
            if torch.cuda.is_available():
                device = 'cuda'
            elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
                device = 'mps'
            else:
                device = 'cpu'
        self.device = device
        self.model.to(self.device)
        self.model.eval()

        # Expect labels roughly like {0:'neg',1:'neu',2:'pos'} or similar
        self.id2label = self.model.config.id2label if hasattr(self.model.config, 'id2label') else {0: '0', 1: '1', 2: '2'}

    def _label_name(self, idx: int) -> str:
        """Resolve a class index to its label name, tolerating str-keyed maps.

        Configs deserialized from JSON can carry id2label keyed by '0','1',...
        instead of ints; the previous int-only lookup then fell back to '0',
        '1', ... and defeated the neg/pos detection below.
        """
        return self.id2label.get(idx, self.id2label.get(str(idx), str(idx)))

    def _compound_from_probs(self, probs: np.ndarray) -> float:
        """Map class probabilities to a [-1, 1] compound-like score (pos - neg)."""
        labels = [self._label_name(i).lower() for i in range(len(probs))]
        try:
            neg_idx = labels.index('neg') if 'neg' in labels else labels.index('negative')
        except ValueError:
            neg_idx = 0  # conventional fallback: first class is negative
        try:
            pos_idx = labels.index('pos') if 'pos' in labels else labels.index('positive')
        except ValueError:
            pos_idx = len(probs) - 1  # conventional fallback: last class is positive
        p_neg = float(probs[neg_idx])
        p_pos = float(probs[pos_idx])
        # A simple skew: pos - neg; clamp within [-1, 1]
        return max(-1.0, min(1.0, p_pos - p_neg))

    def _encode(self, batch: List[str]):
        """Tokenize a batch and move all tensors to the model's device."""
        enc = self.tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {k: v.to(self.device) for k, v in enc.items()}

    @torch.no_grad()
    def predict_compound_batch(self, texts: List[str], batch_size: int = 32) -> List[float]:
        """Score texts in mini-batches; returns one compound score per input."""
        out: List[float] = []
        for i in range(0, len(texts), batch_size):
            logits = self.model(**self._encode(texts[i:i + batch_size])).logits
            probs = torch.softmax(logits, dim=-1).cpu().numpy()
            for row in probs:
                out.append(self._compound_from_probs(row))
        return out

    @torch.no_grad()
    def predict_probs_and_labels(self, texts: List[str], batch_size: int = 32):
        """Return (per-text probability rows, per-text predicted label names)."""
        probs_all = []
        labels_all: List[str] = []
        for i in range(0, len(texts), batch_size):
            logits = self.model(**self._encode(texts[i:i + batch_size])).logits
            probs = torch.softmax(logits, dim=-1).cpu().numpy()
            preds = probs.argmax(axis=-1)
            for j, row in enumerate(probs):
                probs_all.append(row)
                labels_all.append(self._label_name(int(preds[j])))
        return probs_all, labels_all