diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..6313b56 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +* text=auto eol=lf diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9fdb553 --- /dev/null +++ b/.gitignore @@ -0,0 +1,63 @@ +# OS / Editor +.DS_Store +.vscode/ + +# Python +__pycache__/ +*.py[cod] +*.pyo +*.pyd +*.so +*.pkl +*.pickle +.pytest_cache/ +.mypy_cache/ +.coverage +coverage.xml + +# Environments +.env +.env.* +.venv/ +venv/ + +# Project outputs (large or generated) +data/ +!data/.gitkeep +models/ +!models/.gitkeep +checkpoints/ +runs/ + +# Sessions / secrets / sqlite +*.session +*.sqlite* +*.db +*.log + +# Notebooks +.ipynb_checkpoints/ + +# Caches and locks +.cache/ +*.lock + +# Python +__pycache__/ +*.pyc +*.pyo +*.pyd +.venv/ +venv/ +.env +*.env + +# Telethon session files +*.session +*.session-journal + +# Jupyter +.ipynb_checkpoints/ + +# macOS +.DS_Store diff --git a/README.md b/README.md new file mode 100644 index 0000000..db72e4d --- /dev/null +++ b/README.md @@ -0,0 +1,205 @@ +# Telegram analytics toolkit + +Scrape public Telegram channel posts, fetch replies and forwards, and generate rich analytics reports with tagging, sentiment, matchday overlays, and plots. Use VADER, a local transformers model, or a local GPT (Ollama) backend for sentiment. + +Highlights: +- Fast replies scraping with concurrency, resume/append, and rate-limit visibility +- Forwards scanning with chunked, concurrent search +- Analyzer: tagging from YAML keywords; sentiment via VADER, transformers, or local GPT; emoji-aware modes; combined posts+replies metrics; and matchday cross-analysis +- Plots: daily activity with in-plot match labels, daily volume vs sentiment (new), heatmaps, and per-tag (team) sentiment shares +- Local learning: fine-tune and evaluate a transformers classifier and use it in analysis + +Full command reference is in `docs/COMMANDS.md`. 
+ +## Quick start + +1) Configure secrets in `.env` (script will prompt if absent): +``` +TELEGRAM_API_ID=123456 +TELEGRAM_API_HASH=your_api_hash +# Optional +TELEGRAM_SESSION_NAME=telegram +TELEGRAM_2FA_PASSWORD=your_2fa_password +FOOTBALL_DATA_API_TOKEN=your_token +``` + +2) Run any command via the wrapper (creates venv and installs deps automatically): + +```zsh +# Fetch messages to CSV +./run_scraper.sh scrape -c https://t.me/Premier_League_Update -o data/premier_league_update.csv --start-date 2025-08-15 --end-date 2025-10-15 + +# Fetch replies fast +./run_scraper.sh replies -c https://t.me/Premier_League_Update --from-csv data/premier_league_update.csv -o data/premier_league_replies.csv --min-replies 1 --concurrency 15 --resume --append + +# Analyze with tags, fixtures, emoji handling and plots +./run_scraper.sh analyze -i data/premier_league_update.csv --replies-csv data/premier_league_replies.csv --fixtures-csv data/premier_league_schedule_2025-08-15_to_2025-10-15.csv --tags-config config/tags.yaml --write-augmented-csv --write-combined-csv --emoji-mode keep --emoji-boost --save-plots +``` + +3) Use transformers sentiment instead of VADER: + +```zsh +# Off-the-shelf fine-tuned sentiment head +./run_scraper.sh analyze -i data/premier_league_update.csv --replies-csv data/premier_league_replies.csv \ + --sentiment-backend transformers \ + --transformers-model distilbert-base-uncased-finetuned-sst-2-english \ + --export-transformers-details \ + --write-augmented-csv --write-combined-csv --save-plots +``` + +4) Use a local GPT backend (Ollama) for sentiment (JSON labels+confidence mapped to a compound score): + +```zsh +# Ensure Ollama is running locally and the model is available (e.g., llama3) +./run_scraper.sh analyze -i data/premier_league_update.csv --replies-csv data/premier_league_replies.csv \ + --sentiment-backend gpt \ + --gpt-model llama3 \ + --gpt-base-url http://localhost:11434 \ + --write-augmented-csv --write-combined-csv --save-plots +``` + +## 
Aliases + +Convenient zsh functions live in `scripts/aliases.zsh`: + +- `fast_replies` — resume+append replies with concurrency +- `chunked_forwards` — concurrent forwards scan +- `analyze_combined` — posts+replies+fixtures with tags +- `analyze_emoji` — emoji-aware analyze with boost +- `analyze_transformers` — analyze with transformers and export details +- `apply_labels_and_analyze` — merge a labeled CSV into posts/replies and run analyzer (reuses sentiment_label) +- `plot_labeled` — QA plots from a labeled CSV (class distribution, confidence, lengths) +- `train_transformers` — fine-tune a model on a labeled CSV +- `eval_transformers` — evaluate a fine-tuned model + +Source them: +```zsh +source scripts/aliases.zsh +``` + +## Local transformers (optional) + +Train a classifier: +```zsh +./.venv/bin/python -m src.train_sentiment \ + --train-csv data/labeled_sentiment.csv \ + --text-col message \ + --label-col label \ + --model-name distilbert-base-uncased \ + --output-dir models/sentiment-distilbert \ + --epochs 3 --batch-size 16 +``` + +Evaluate it: +```zsh +./.venv/bin/python -m src.eval_sentiment \ + --csv data/labeled_holdout.csv \ + --text-col message \ + --label-col label \ + --model models/sentiment-distilbert +``` + +Use it in analyze: +```zsh +./run_scraper.sh analyze -i data/premier_league_update.csv --replies-csv data/premier_league_replies.csv \ + --sentiment-backend transformers \ + --transformers-model models/sentiment-distilbert \ + --export-transformers-details \ + --write-augmented-csv --write-combined-csv --save-plots +``` + +Notes: +- GPU/Apple Silicon (MPS) is auto-detected; CPU is the fallback. +- Torch pinning in `requirements.txt` uses conditional versions for smooth installs across Python versions. + +## Plots produced (when --save-plots is used) + +- `daily_activity_stacked.png` — stacked bar chart of posts vs replies per day. 
+ - Dynamic sizing: `--plot-width-scale`, `--plot-max-width`, `--plot-height` + - Top-N highlights: `--activity-top-n` (labels show total and posts+replies breakdown) + - Match labels inside the plot using team abbreviations; control density with: + - `--labels-max-per-day`, `--labels-per-line`, `--labels-stagger-rows`, `--labels-band-y`, `--labels-annotate-mode` +- `daily_volume_and_sentiment.png` — total volume (posts+replies) per day as bars (left Y) and positive%/negative% as lines (right Y). Uses `sentiment_label` when present, otherwise `sentiment_compound` thresholds. +- `posts_heatmap_hour_dow.png` — heatmap of posts activity by hour and day-of-week. +- `sentiment_by_tag_posts.png` — stacked shares of pos/neu/neg by team tag (tags starting with `club_`), with dynamic width. +- Matchday rollups (when fixtures are provided): + - `matchday_sentiment_overall.csv` — per-fixture-day aggregates for posts (and replies when provided) + - `matchday_sentiment_overall.png` — mean sentiment time series on matchdays (posts, replies) + - `matchday_posts_volume_vs_sentiment.png` — scatter of posts volume vs mean sentiment on matchdays +- Diagnostics: + - `match_labels_debug.csv` — per-day list of rendered match labels (helps tune label density) + +Tip: The analyzer adapts plot width to the number of days; for very long ranges, raise `--plot-max-width`. + +## Plot sizing and label flags (analyze) + +- `--plot-width-scale` (default 0.8): inches per day for the daily charts width. +- `--plot-max-width` (default 104): cap on width in inches. +- `--plot-height` (default 6.5): figure height in inches. +- `--activity-top-n` (default 5): highlight top-N activity days; 0 disables. +- Match label controls: + - `--labels-max-per-day` (default 3): cap labels per day (+N more). + - `--labels-per-line` (default 2): labels per line in the band. + - `--labels-band-y` (default 0.96): vertical position of the band (axes coords). 
+ - `--labels-stagger-rows` (default 2): stagger rows to reduce collisions. + - `--labels-annotate-mode` (ticks|all|ticks+top): which x positions get labels. + +## Automatic labeling (no manual annotation) + +If you don't want to label data by hand, generate a labeled training set automatically and train a local model. + +Label with VADER (fast) or a pretrained transformers model (higher quality): + +```zsh +# Load aliases +source scripts/aliases.zsh + +# VADER: keeps only confident predictions by default +auto_label_vader + +# Or Transformers: CardiffNLP 3-class sentiment (keeps confident only) +auto_label_transformers + +# Output: data/labeled_sentiment.csv (message, label, confidence, ...) +``` + +Then fine-tune a classifier on the generated labels and use it in analysis: + +```zsh +# Train on the auto-labeled CSV +train_transformers + +# Analyze using your fine-tuned model +./run_scraper.sh analyze -i data/premier_league_update.csv \ + --replies-csv data/premier_league_replies.csv \ + --fixtures-csv data/premier_league_schedule_2025-08-15_to_2025-10-15.csv \ + --tags-config config/tags.yaml \ + --sentiment-backend transformers \ + --transformers-model models/sentiment-distilbert \ + --export-transformers-details \ + --write-augmented-csv --write-combined-csv --save-plots +``` + +Advanced knobs (optional): +- VADER thresholds: `--vader-pos 0.05 --vader-neg -0.05 --vader-margin 0.2` +- Transformers acceptance: `--min-prob 0.6 --min-margin 0.2` +- Keep all predictions (not just confident): remove `--only-confident` + +## Local GPT backend (Ollama) + +You can use a local GPT model for sentiment. The analyzer requests strict JSON `{label, confidence}` and maps it to a compound score. If the GPT call fails for any rows, it gracefully falls back to VADER for those rows. 
+ +Example: +```zsh +./run_scraper.sh analyze -i data/premier_league_update.csv \ + --replies-csv data/premier_league_replies.csv \ + --fixtures-csv data/premier_league_schedule_2025-08-15_to_2025-10-15.csv \ + --tags-config config/tags.yaml \ + --sentiment-backend gpt \ + --gpt-model llama3 \ + --gpt-base-url http://localhost:11434 \ + --write-augmented-csv --write-combined-csv --save-plots +``` + +## License +MIT (adjust as needed) \ No newline at end of file diff --git a/config/tags.yaml b/config/tags.yaml new file mode 100644 index 0000000..b31ff9f --- /dev/null +++ b/config/tags.yaml @@ -0,0 +1,103 @@ +# Keyword tag configuration +# Each tag has a list of case-insensitive substrings or regex patterns (prefix with re:) +# Messages matching ANY pattern for a tag are labeled with that tag. + +score_update: + - "FT" + - "full time" + - "final score" + - "HT" + - "half time" + - "kick-off" + - "kick off" + +transfer: + - "transfer" + - "signs" + - "signed" + - "loan" + - "contract" + - "deal" + +injury: + - "injury" + - "injured" + - "out for" + - "ruled out" + +match_highlight: + - "goal" + - "scores" + - "assist" + - "penalty" + - "VAR" + - "red card" + - "yellow card" + +club_arsenal: + - "Arsenal" +club_manchester_city: + - "Manchester City" +club_manchester_united: + - "Manchester United" +club_chelsea: + - "Chelsea" +club_liverpool: + - "Liverpool" +club_tottenham: + - "Tottenham" +club_newcastle: + - "Newcastle" +club_west_ham: + - "West Ham" +club_brighton: + - "Brighton" +club_aston_villa: + - "Aston Villa" +club_everton: + - "Everton" +club_crystal_palace: + - "Crystal Palace" + - "Palace" +club_bournemouth: + - "Bournemouth" + - "AFC Bournemouth" +club_brentford: + - "Brentford" +club_fulham: + - "Fulham" +club_nottingham_forest: + - "Nottingham Forest" + - "Forest" +club_wolves: + - "Wolves" + - "Wolverhampton" +club_burnley: + - "Burnley" +club_southampton: + - "Southampton" + - "Saints" +club_leicester_city: + - "Leicester" + - "Leicester City" 
+club_leeds_united: + - "Leeds" + - "Leeds United" +club_sheffield_united: + - "Sheffield United" + - "Sheff Utd" +club_west_bromwich_albion: + - "West Brom" + - "West Bromwich" +club_ipswich_town: + - "Ipswich" + - "Ipswich Town" +club_portsmouth: + - "Portsmouth" + - "Pompey" +club_hull_city: + - "Hull" + - "Hull City" +club_middlesbrough: + - "Middlesbrough" + - "Boro" diff --git a/docs/COMMANDS.md b/docs/COMMANDS.md new file mode 100644 index 0000000..704333e --- /dev/null +++ b/docs/COMMANDS.md @@ -0,0 +1,743 @@ +# Project command reference + +This file lists all supported commands and practical permutations for `./run_scraper.sh`, with short comments and tips. It mirrors the actual CLI flags in the code. + +- Shell: zsh (macOS) — commands below are ready to paste. +- Env: A `.venv` is created automatically; dependencies installed from `requirements.txt`. +- Secrets: Create `.env` with TELEGRAM_API_ID and TELEGRAM_API_HASH; for fixtures also set FOOTBALL_DATA_API_TOKEN. +- 2FA: If you use Telegram two-step verification, set TELEGRAM_2FA_PASSWORD in `.env` (the shell wrapper doesn’t accept a flag for this). +- Sessions: Telethon uses a SQLite session file (default `telegram.session`). When running multiple tools in parallel, use distinct `--session-name` values. + +## Common conventions + +- Channels + - Use either handle or URL: `-c @name` or `-c https://t.me/name`. + - For replies, the channel must match the posts’ source in your CSV `url` column. +- Output behavior + - scrape/replies/forwards overwrite unless you pass `--append`. + - analyze always overwrites its outputs. +- Rate-limits + - Replies/forwards log `[rate-limit]` if Telegram asks you to wait. Reduce `--concurrency` if frequent. +- Parallel runs + - Add `--session-name ` per process to avoid “database is locked”. Prefer sessions outside iCloud Drive. 
+ +--- + +## Scrape (posts/messages) + +Minimal (overwrite output): +```zsh +./run_scraper.sh scrape -c @SomeChannel -o data/messages.csv +``` + +With date range and limit: +```zsh +./run_scraper.sh scrape \ + -c https://t.me/SomeChannel \ + -o data/messages.jsonl \ + --start-date 2025-01-01 \ + --end-date 2025-03-31 \ + --limit 500 +``` + +Legacy offset date (deprecated; prefer --start-date): +```zsh +./run_scraper.sh scrape -c @SomeChannel -o data/messages.csv --offset-date 2025-01-01 +``` + +Append to existing file and pass phone on first login: +```zsh +./run_scraper.sh scrape \ + -c @SomeChannel \ + -o data/messages.csv \ + --append \ + --phone +15551234567 +``` + +Use a custom session (useful in parallel): +```zsh +./run_scraper.sh scrape -c @SomeChannel -o data/messages.csv --session-name telegram_scrape +``` + +Notes: +- Output format inferred by extension: `.csv` or `.jsonl`/`.ndjson`. +- Two-step verification: set TELEGRAM_2FA_PASSWORD in `.env` (no CLI flag in the shell wrapper). + +### All valid forms (scrape) + +Use one of the following combinations. Replace placeholders with your values. 
+ +- Base variables: + - CH = @handle or https://t.me/handle + - OUT = path to .csv or .jsonl + - Optional value flags: [--limit N] [--session-name NAME] [--phone NUMBER] + +- Date filter permutations (4) × Append flag (2) × Limit presence (2) = 16 forms + +1) No dates, no append, no limit + ./run_scraper.sh scrape -c CH -o OUT +2) No dates, no append, with limit + ./run_scraper.sh scrape -c CH -o OUT --limit N +3) No dates, with append, no limit + ./run_scraper.sh scrape -c CH -o OUT --append +4) No dates, with append, with limit + ./run_scraper.sh scrape -c CH -o OUT --append --limit N +5) Start only, no append, no limit + ./run_scraper.sh scrape -c CH -o OUT --start-date YYYY-MM-DD +6) Start only, no append, with limit + ./run_scraper.sh scrape -c CH -o OUT --start-date YYYY-MM-DD --limit N +7) Start only, with append, no limit + ./run_scraper.sh scrape -c CH -o OUT --start-date YYYY-MM-DD --append +8) Start only, with append, with limit + ./run_scraper.sh scrape -c CH -o OUT --start-date YYYY-MM-DD --append --limit N +9) End only, no append, no limit + ./run_scraper.sh scrape -c CH -o OUT --end-date YYYY-MM-DD +10) End only, no append, with limit + ./run_scraper.sh scrape -c CH -o OUT --end-date YYYY-MM-DD --limit N +11) End only, with append, no limit + ./run_scraper.sh scrape -c CH -o OUT --end-date YYYY-MM-DD --append +12) End only, with append, with limit + ./run_scraper.sh scrape -c CH -o OUT --end-date YYYY-MM-DD --append --limit N +13) Start and end, no append, no limit + ./run_scraper.sh scrape -c CH -o OUT --start-date YYYY-MM-DD --end-date YYYY-MM-DD +14) Start and end, no append, with limit + ./run_scraper.sh scrape -c CH -o OUT --start-date YYYY-MM-DD --end-date YYYY-MM-DD --limit N +15) Start and end, with append, no limit + ./run_scraper.sh scrape -c CH -o OUT --start-date YYYY-MM-DD --end-date YYYY-MM-DD --append +16) Start and end, with append, with limit + ./run_scraper.sh scrape -c CH -o OUT --start-date YYYY-MM-DD --end-date YYYY-MM-DD 
--append --limit N + +Optional add-ons valid for any form above: +- Append [--session-name NAME] and/or [--phone NUMBER] +- Deprecated alternative to start-date: add [--offset-date YYYY-MM-DD] + +--- + +## Replies (fetch replies to posts) + +From a posts CSV (fast path; skip posts with 0 replies in CSV): +```zsh +./run_scraper.sh replies \ + -c https://t.me/SourceChannel \ + --from-csv data/messages.csv \ + -o data/replies.csv \ + --min-replies 1 \ + --concurrency 15 \ + --resume \ + --append +``` + +Using explicit message IDs: +```zsh +./run_scraper.sh replies \ + -c @SourceChannel \ + --ids "123,456,789" \ + -o data/replies.csv \ + --concurrency 5 \ + --append +``` + +IDs from a file (one per line) using zsh substitution: +```zsh +IDS=$(tr '\n' ',' < parent_ids.txt | sed 's/,$//') +./run_scraper.sh replies -c @SourceChannel --ids "$IDS" -o data/replies.csv --concurrency 8 --append +``` + +Parallel-safe session name: +```zsh +./run_scraper.sh replies -c @SourceChannel --from-csv data/messages.csv -o data/replies.csv --concurrency 12 --resume --append --session-name telegram_replies +``` + +What the flags do: +- `--from-csv PATH` reads parent IDs from a CSV with an `id` column (optionally filtered by `--min-replies`). +- `--ids` provides a comma-separated list of parent IDs. +- `--concurrency K` processes K parent IDs in parallel (default 5). +- `--resume` dedupes by `(parent_id,id)` pairs already present in the output. +- `--append` appends to output instead of overwriting. + +Notes: +- The channel (`-c`) must match the posts’ source in your CSV URLs (the tool warns on mismatch). +- First login may require `--phone` (interactive prompt). For 2FA, set TELEGRAM_2FA_PASSWORD in `.env`. + +### All valid forms (replies) + +- Base variables: + - CH = @handle or https://t.me/handle + - OUT = path to .csv + - Source: exactly one of S1 or S2 + - S1: --ids "id1,id2,..." 
+ - S2: --from-csv PATH [--min-replies N] + - Optional: [--concurrency K] [--session-name NAME] [--phone NUMBER] + - Binary: [--append], [--resume] + +- Enumerated binary permutations for each source (4 per source = 8 total): + +S1 + no append + no resume + ./run_scraper.sh replies -c CH --ids "IDLIST" -o OUT +S1 + no append + resume + ./run_scraper.sh replies -c CH --ids "IDLIST" -o OUT --resume +S1 + append + no resume + ./run_scraper.sh replies -c CH --ids "IDLIST" -o OUT --append +S1 + append + resume + ./run_scraper.sh replies -c CH --ids "IDLIST" -o OUT --append --resume + +S2 + no append + no resume + ./run_scraper.sh replies -c CH --from-csv PATH -o OUT +S2 + no append + resume + ./run_scraper.sh replies -c CH --from-csv PATH -o OUT --resume +S2 + append + no resume + ./run_scraper.sh replies -c CH --from-csv PATH -o OUT --append +S2 + append + resume + ./run_scraper.sh replies -c CH --from-csv PATH -o OUT --append --resume + +Optional add-ons valid for any form above: +- Add [--concurrency K] to tune speed; recommended 8–20 +- With S2 you may add [--min-replies N] to prioritize parents with replies +- Add [--session-name NAME] and/or [--phone NUMBER] + +--- + +## Forwards (same-channel forwards referencing posts) + +Typical concurrent scan (best-effort; often zero results): +```zsh +./run_scraper.sh forwards \ + -c https://t.me/SourceChannel \ + --from-csv data/messages.csv \ + -o data/forwards.csv \ + --scan-limit 20000 \ + --concurrency 10 \ + --chunk-size 1500 +``` + +With date filters (applied to scanned messages): +```zsh +./run_scraper.sh forwards \ + -c @SourceChannel \ + --from-csv data/messages.csv \ + -o data/forwards.csv \ + --start-date 2025-01-01 \ + --end-date 2025-03-31 \ + --scan-limit 10000 \ + --concurrency 8 \ + --chunk-size 1000 +``` + +Using explicit message IDs: +```zsh +./run_scraper.sh forwards -c @SourceChannel --ids "100,200,300" -o data/forwards.csv --scan-limit 8000 --concurrency 6 --chunk-size 1000 +``` + +Sequential mode (no 
chunking) by omitting --scan-limit: +```zsh +./run_scraper.sh forwards -c @SourceChannel --from-csv data/messages.csv -o data/forwards.csv +``` + +What the flags do: +- `--scan-limit N`: enables chunked, concurrent scanning of ~N recent message IDs. +- `--concurrency K`: number of id-chunks to scan in parallel (requires `--scan-limit`). +- `--chunk-size M`: approx. IDs per chunk (trade-off between balance/overhead). Start with 1000–2000. +- `--append`: append instead of overwrite. + +Notes: +- This only finds forwards within the same channel that reference your parent IDs (self-forwards). Many channels will yield zero. +- Global cross-channel forward discovery is not supported here (can be added as a separate mode). +- Without `--scan-limit`, the tool scans sequentially from newest backwards and logs progress every ~1000 messages. + +### All valid forms (forwards) + +- Base variables: + - CH = @handle or https://t.me/handle + - OUT = path to .csv + - Source: exactly one of S1 or S2 + - S1: --ids "id1,id2,..." 
+ - S2: --from-csv PATH + - Modes: + - M1: Sequential scan (omit --scan-limit) + - M2: Chunked concurrent scan (requires --scan-limit N; accepts --concurrency K and --chunk-size M) + - Optional date filters for both modes: [--start-date D] [--end-date D] + - Binary: [--append] + - Optional: [--session-name NAME] [--phone NUMBER] + +- Enumerated permutations by mode, source, and append (2 modes × 2 sources × 2 append = 8 forms): + +M1 + S1 + no append + ./run_scraper.sh forwards -c CH --ids "IDLIST" -o OUT [--start-date D] [--end-date D] +M1 + S1 + append + ./run_scraper.sh forwards -c CH --ids "IDLIST" -o OUT --append [--start-date D] [--end-date D] +M1 + S2 + no append + ./run_scraper.sh forwards -c CH --from-csv PATH -o OUT [--start-date D] [--end-date D] +M1 + S2 + append + ./run_scraper.sh forwards -c CH --from-csv PATH -o OUT --append [--start-date D] [--end-date D] + +M2 + S1 + no append + ./run_scraper.sh forwards -c CH --ids "IDLIST" -o OUT --scan-limit N [--concurrency K] [--chunk-size M] [--start-date D] [--end-date D] +M2 + S1 + append + ./run_scraper.sh forwards -c CH --ids "IDLIST" -o OUT --scan-limit N --append [--concurrency K] [--chunk-size M] [--start-date D] [--end-date D] +M2 + S2 + no append + ./run_scraper.sh forwards -c CH --from-csv PATH -o OUT --scan-limit N [--concurrency K] [--chunk-size M] [--start-date D] [--end-date D] +M2 + S2 + append + ./run_scraper.sh forwards -c CH --from-csv PATH -o OUT --scan-limit N --append [--concurrency K] [--chunk-size M] [--start-date D] [--end-date D] + +Optional add-ons valid for any form above: +- Add [--session-name NAME] and/or [--phone NUMBER] + +--- + +## Analyze (reports and tagging) + +Posts-only report + tagged CSV: +```zsh +./run_scraper.sh analyze \ + -i data/messages.csv \ + --channel @SourceChannel \ + --tags-config config/tags.yaml \ + --fixtures-csv data/fixtures.csv \ + --write-augmented-csv +``` +Outputs: +- `data/messages_report.md` +- `data/messages_tagged.csv` + +Replies-only report + 
tagged CSV: +```zsh +./run_scraper.sh analyze \ + -i data/replies.csv \ + --channel "Replies - @SourceChannel" \ + --tags-config config/tags.yaml \ + --write-augmented-csv +``` +Outputs: +- `data/replies_report.md` +- `data/replies_tagged.csv` + +Combined (posts report augmented with replies): +```zsh +./run_scraper.sh analyze \ + -i data/messages.csv \ + --channel @SourceChannel \ + --tags-config config/tags.yaml \ + --replies-csv data/replies.csv \ + --fixtures-csv data/fixtures.csv \ + --write-augmented-csv \ + --write-combined-csv \ + --emoji-mode keep \ + --emoji-boost \ + --save-plots +``` +Adds to posts dataset: +- `sentiment_compound` for posts (VADER) +- `replies_sentiment_mean` (avg reply sentiment per post) +- `replies_count_scraped` and `replies_top_tags` (rollup from replies) + +Report sections include: +- Summary, top posts by views/forwards/replies +- Temporal distributions +- Per-tag engagement +- Per-tag sentiment (posts) +- Replies per-tag summary +- Per-tag sentiment (replies) + - Combined sentiment (posts + replies) + - Matchday cross-analysis (when `--fixtures-csv` is provided): + - Posts: on vs off matchdays (counts and sentiment shares) + - Posts engagement vs matchday (replies per post: total, mean, median, share of posts with replies) + - Replies: on vs off matchdays (counts and sentiment shares) + - Replies by parent matchday and by reply date are both shown; parent-based classification is recommended for engagement. + +Notes: +- Analyze overwrites outputs; use `-o` to customize report filename if needed. +- Emoji handling: add `--emoji-mode keep|demojize|strip` (default keep). Optionally `--emoji-boost` to gently tilt scores when clearly positive/negative emojis are present. + - Add `--write-combined-csv` to emit a unified CSV of posts+replies with a `content_type` column. 
+ +### All valid forms (analyze) + +- Base variables: + - IN = input CSV (posts or replies) + - Optional outputs/labels: [-o REPORT.md] [--channel @handle] + - Optional configs/data: [--tags-config config/tags.yaml] [--replies-csv REPLIES.csv] [--fixtures-csv FIXTURES.csv] + - Binary: [--write-augmented-csv] + +- Core permutations across replies-csv, fixtures-csv, write-augmented-csv (2×2×2 = 8 forms): + +1) No replies, no fixtures, no aug + ./run_scraper.sh analyze -i IN +2) No replies, no fixtures, with aug + ./run_scraper.sh analyze -i IN --write-augmented-csv +3) No replies, with fixtures, no aug + ./run_scraper.sh analyze -i IN --fixtures-csv FIXTURES.csv +4) No replies, with fixtures, with aug + ./run_scraper.sh analyze -i IN --fixtures-csv FIXTURES.csv --write-augmented-csv +5) With replies, no fixtures, no aug + ./run_scraper.sh analyze -i IN --replies-csv REPLIES.csv +6) With replies, no fixtures, with aug + ./run_scraper.sh analyze -i IN --replies-csv REPLIES.csv --write-augmented-csv +7) With replies, with fixtures, no aug + ./run_scraper.sh analyze -i IN --replies-csv REPLIES.csv --fixtures-csv FIXTURES.csv +8) With replies, with fixtures, with aug + ./run_scraper.sh analyze -i IN --replies-csv REPLIES.csv --fixtures-csv FIXTURES.csv --write-augmented-csv + +Optional add-ons valid for any form above: +- Append [-o REPORT.md] to control output filename +- Append [--channel @handle] for title +- Append [--tags-config config/tags.yaml] to enable tagging and per-tag summaries +- Append [--emoji-mode keep|demojize|strip] and optionally [--emoji-boost] +- Append [--write-combined-csv] to produce a merged posts+replies CSV + - Append [--save-plots] to emit plots to the data folder + - Append [--sentiment-backend transformers] and [--transformers-model ] to use a local HF model instead of VADER + - Append [--export-transformers-details] to include `sentiment_label` and `sentiment_probs` in augmented/combined CSVs + - Append [--sentiment-backend gpt] and 
optionally [--gpt-model MODEL] [--gpt-base-url URL] [--gpt-batch-size K] to use a local GPT (Ollama) backend + - Plot sizing and label controls (daily charts): + - [--plot-width-scale FLOAT] [--plot-max-width INCHES] [--plot-height INCHES] + - [--activity-top-n N] + - [--labels-max-per-day N] [--labels-per-line N] [--labels-band-y FLOAT] [--labels-stagger-rows N] [--labels-annotate-mode ticks|all|ticks+top] + +When fixtures are provided (`--fixtures-csv`): +- The report adds a "## Matchday cross-analysis" section with on vs off matchday tables. +- Plots include: + - daily_activity_stacked.png with match labels inside the chart + - daily_volume_and_sentiment.png (bars: volume; lines: pos%/neg%) + - matchday_sentiment_overall.png (time series on fixture days) + - matchday_posts_volume_vs_sentiment.png (scatter) +- The combined CSV (with `--write-combined-csv`) includes `is_matchday` and, for replies, `parent_is_matchday` when available. +- Replies are classified two ways: by reply date (`is_matchday` on the reply row) and by their parent post (`parent_is_matchday`). The latter better reflects matchday-driven engagement. + +Emoji and plots examples: +```zsh +# Keep emojis (default) and boost for strong positive/negative emojis +./run_scraper.sh analyze -i data/messages.csv --emoji-mode keep --emoji-boost --save-plots + +# Demojize to :smiling_face: tokens (helps some tokenizers), with boost +./run_scraper.sh analyze -i data/messages.csv --emoji-mode demojize --emoji-boost + +# Strip emojis entirely (if they add noise) +./run_scraper.sh analyze -i data/messages.csv --emoji-mode strip --save-plots + +# Use a transformers model for sentiment (will auto-download on first use unless a local path is provided). 
+# Tip: for an off-the-shelf sentiment head, try a fine-tuned model like SST-2: +./run_scraper.sh analyze -i data/messages.csv --replies-csv data/replies.csv \ +  --sentiment-backend transformers \ +  --transformers-model distilbert-base-uncased-finetuned-sst-2-english +``` + +## Local GPT backend (Ollama) + +Use a local GPT model that returns JSON {label, confidence} per message; the analyzer maps this to a compound score and falls back to VADER on errors. + +```zsh +./run_scraper.sh analyze -i data/messages.csv --replies-csv data/replies.csv \ +  --sentiment-backend gpt \ +  --gpt-model llama3 \ +  --gpt-base-url http://localhost:11434 \ +  --write-augmented-csv --write-combined-csv --save-plots +``` + +--- + +## Train a local transformers sentiment model + +Prepare a labeled CSV with at least two columns: `message` and `label` (e.g., neg/neu/pos or 0/1/2). + +Don’t have one yet? Create a labeling set from your existing posts/replies: + +```zsh +# Generate a CSV to annotate by hand (adds a blank 'label' column) +./.venv/bin/python -m src.make_labeling_set \ +  --posts-csv data/premier_league_update.csv \ +  --replies-csv data/premier_league_replies.csv \ +  --sample-size 1000 \ +  -o data/labeled_sentiment.csv + +# Or via alias (after sourcing scripts/aliases.zsh) +make_label_set "$POSTS_CSV" "$REPLIES_CSV" data/labeled_sentiment.csv 1000 +``` + +Then fine-tune: + +```zsh +# Ensure the venv exists (run any ./run_scraper.sh command once), then: +./.venv/bin/python -m src.train_sentiment \ +  --train-csv data/labeled_sentiment.csv \ +  --text-col message \ +  --label-col label \ +  --model-name distilbert-base-uncased \ +  --output-dir models/sentiment-distilbert \ +  --epochs 3 --batch-size 16 +``` + +Use it in analyze: + +```zsh +./run_scraper.sh analyze -i data/messages.csv --replies-csv data/replies.csv \ +  --sentiment-backend transformers \ +  --transformers-model models/sentiment-distilbert +``` + +Export details (labels, probabilities) into CSVs: + +```zsh +./run_scraper.sh 
analyze -i data/messages.csv --replies-csv data/replies.csv \ + --sentiment-backend transformers \ + --transformers-model models/sentiment-distilbert \ + --export-transformers-details \ + --write-augmented-csv --write-combined-csv +``` + +Notes: +- The analyzer maps model class probabilities to a VADER-like compound score in [-1, 1] for compatibility with the rest of the report. +- If the model has id2label including 'neg','neu','pos' labels, the mapping is more accurate; otherwise it defaults to pos - neg. +- GPU/Apple Silicon (MPS) will be used automatically if available. + +Torch install note (macOS): +- `requirements.txt` uses conditional pins: `torch==2.3.1` for Python < 3.13 and `torch>=2.7.1` for Python ≥ 3.13. This keeps installs smooth on macOS. If you hit install issues, let us know. + +## Evaluate a fine-tuned model + +```zsh +./.venv/bin/python -m src.eval_sentiment \ + --csv data/labeled_holdout.csv \ + --text-col message \ + --label-col label \ + --model models/sentiment-distilbert +``` +Prints accuracy, macro-precision/recall/F1, and a classification report. + +## Fixtures (Premier League schedule via football-data.org) + +Fetch fixtures between dates: +```zsh +./run_scraper.sh fixtures \ + --start-date 2025-08-15 \ + --end-date 2025-10-15 \ + -o data/fixtures.csv +``` + +Notes: +- Requires `FOOTBALL_DATA_API_TOKEN` in `.env`. +- Output may be `.csv` or `.json` (by extension). 
+ +### All valid forms (fixtures) + +- Base variables: + - SD = start date YYYY-MM-DD + - ED = end date YYYY-MM-DD + - OUT = output .csv or .json + +Form: + ./run_scraper.sh fixtures --start-date SD --end-date ED -o OUT + +--- + +## Advanced recipes + +Parallel replies + forwards with separate sessions: +```zsh +# Terminal 1 – replies +./run_scraper.sh replies \ + -c https://t.me/SourceChannel \ + --from-csv data/messages.csv \ + -o data/replies.csv \ + --min-replies 1 \ + --concurrency 15 \ + --resume \ + --append \ + --session-name "$HOME/.local/share/telethon_sessions/telegram_replies" + +# Terminal 2 – forwards +./run_scraper.sh forwards \ + -c https://t.me/SourceChannel \ + --from-csv data/messages.csv \ + -o data/forwards.csv \ + --scan-limit 20000 \ + --concurrency 10 \ + --chunk-size 1500 \ + --session-name "$HOME/.local/share/telethon_sessions/telegram_forwards" +``` + +Tuning for rate limits: +- If `[rate-limit]` logs are frequent, reduce `--concurrency` (start -3 to -5) and keep `--chunk-size` around 1000–2000. +- For replies, prioritize with `--min-replies 1` to avoid parents with zero replies. + +Safety: +- Use `--append` with replies and `--resume` to avoid truncating and to dedupe. +- Forwards and scrape don’t dedupe; prefer writing to a new file or dedupe after. + +--- + +## Environment setup quick-start + +Create `.env` (script will prompt if missing): +``` +TELEGRAM_API_ID=123456 +TELEGRAM_API_HASH=your_api_hash +# Optional defaults +TELEGRAM_SESSION_NAME=telegram +TELEGRAM_2FA_PASSWORD=your_2fa_password +FOOTBALL_DATA_API_TOKEN=your_token +``` + +First run will prompt for phone and code (and 2FA if enabled). + +--- + +## Troubleshooting + +- Empty replies file + - Ensure `-c` matches the channel in your posts CSV URLs. + - Use `--append` so the file isn’t truncated before writing. +- “database is locked” + - Use unique `--session-name` per parallel process; store sessions outside iCloud Drive. +- Forwards empty + - Same-channel forwards are rare. 
This tool only finds self-forwards (not cross-channel). +- Analyze errors + - Ensure CSVs have expected columns. Posts: `id,date,message,...`; Replies: `parent_id,id,date,message,...`. +- Exit code 1 when starting + - Check the last log lines. Common causes: missing TELEGRAM_API_ID/HASH in `.env`, wrong channel handle vs CSV URLs, session file locked by another process (use distinct `--session-name`), or a bad output path. + +--- + +## Quick aliases for daily runs (zsh) ⚡ + +Paste this section into your current shell or your `~/.zshrc` to get convenient Make-like commands. + +### Project defaults (edit as needed) + +```zsh +# Channel and files +export CH="https://t.me/Premier_League_Update" +export POSTS_CSV="data/premier_league_update.csv" +export REPLIES_CSV="data/premier_league_replies.csv" +export FORWARDS_CSV="data/premier_league_forwards.csv" +export TAGS_CFG="config/tags.yaml" +export FIXTURES_CSV="data/premier_league_schedule_2025-08-15_to_2025-10-15.csv" + +# Sessions directory outside iCloud (avoid sqlite locks) +export SESSION_DIR="$HOME/.local/share/telethon_sessions" +mkdir -p "$SESSION_DIR" +``` + +### Aliases (zsh functions) + +```zsh +# Fast replies: resume+append, prioritizes parents with replies, tuned concurrency +fast_replies() { + local ch="${1:-$CH}" + local posts="${2:-$POSTS_CSV}" + local out="${3:-$REPLIES_CSV}" + local conc="${4:-15}" + local sess="${5:-$SESSION_DIR/telegram_replies}" + ./run_scraper.sh replies \ + -c "$ch" \ + --from-csv "$posts" \ + -o "$out" \ + --min-replies 1 \ + --concurrency "$conc" \ + --resume \ + --append \ + --session-name "$sess" +} + +# Chunked forwards: concurrent chunk scan with progress logs +chunked_forwards() { + local ch="${1:-$CH}" + local posts="${2:-$POSTS_CSV}" + local out="${3:-$FORWARDS_CSV}" + local scan="${4:-20000}" + local conc="${5:-10}" + local chunk="${6:-1500}" + local sess="${7:-$SESSION_DIR/telegram_forwards}" + ./run_scraper.sh forwards \ + -c "$ch" \ + --from-csv "$posts" \ + -o "$out" 
\ + --scan-limit "$scan" \ + --concurrency "$conc" \ + --chunk-size "$chunk" \ + --append \ + --session-name "$sess" +} + +# Combined analyze: posts + replies + fixtures with tags; writes augmented CSVs +analyze_combined() { + local posts="${1:-$POSTS_CSV}" + local replies="${2:-$REPLIES_CSV}" + local tags="${3:-$TAGS_CFG}" + local fixtures="${4:-$FIXTURES_CSV}" + local ch="${5:-$CH}" + ./run_scraper.sh analyze \ + -i "$posts" \ + --channel "$ch" \ + --tags-config "$tags" \ + --replies-csv "$replies" \ + --fixtures-csv "$fixtures" \ + --write-augmented-csv \ + --write-combined-csv +} + +# Emoji-aware analyze with sensible defaults (keep + boost) +analyze_emoji() { + local posts="${1:-$POSTS_CSV}" + local replies="${2:-$REPLIES_CSV}" + local tags="${3:-$TAGS_CFG}" + local fixtures="${4:-$FIXTURES_CSV}" + local ch="${5:-$CH}" + local mode="${6:-keep}" # keep | demojize | strip + ./run_scraper.sh analyze \ + -i "$posts" \ + --channel "$ch" \ + --tags-config "$tags" \ + --replies-csv "$replies" \ + --fixtures-csv "$fixtures" \ + --write-augmented-csv \ + --write-combined-csv \ + --emoji-mode "$mode" \ + --emoji-boost +} + +# One-shot daily pipeline: fast replies then combined analyze +run_daily() { + local ch="${1:-$CH}" + local posts="${2:-$POSTS_CSV}" + local replies="${3:-$REPLIES_CSV}" + local conc="${4:-15}" + fast_replies "$ch" "$posts" "$replies" "$conc" "$SESSION_DIR/telegram_replies" + analyze_emoji "$posts" "$replies" "$TAGS_CFG" "$FIXTURES_CSV" "$ch" keep +} + +# One-shot daily pipeline with forwards in parallel +run_daily_with_forwards() { + local ch="${1:-$CH}" + local posts="${2:-$POSTS_CSV}" + local replies="${3:-$REPLIES_CSV}" + local forwards="${4:-$FORWARDS_CSV}" + local rep_conc="${5:-15}" + local f_scan="${6:-20000}" + local f_conc="${7:-10}" + local f_chunk="${8:-1500}" + local sess_r="${9:-$SESSION_DIR/telegram_replies}" + local sess_f="${10:-$SESSION_DIR/telegram_forwards}" + + # Launch replies and forwards in parallel with separate sessions + 
local pid_r pid_f + fast_replies "$ch" "$posts" "$replies" "$rep_conc" "$sess_r" & pid_r=$! + chunked_forwards "$ch" "$posts" "$forwards" "$f_scan" "$f_conc" "$f_chunk" "$sess_f" & pid_f=$! + + # Wait for completion and then analyze with emoji handling + wait $pid_r + wait $pid_f + analyze_emoji "$posts" "$replies" "$TAGS_CFG" "$FIXTURES_CSV" "$ch" keep +} +``` + +### Usage + +```zsh +# Use project defaults +fast_replies +chunked_forwards +analyze_combined + +# Override on the fly (channel, files, or tuning) +fast_replies "https://t.me/AnotherChannel" data/other_posts.csv data/other_replies.csv 12 +chunked_forwards "$CH" "$POSTS_CSV" data/alt_forwards.csv 30000 12 2000 +analyze_combined data/other_posts.csv data/other_replies.csv "$TAGS_CFG" "$FIXTURES_CSV" "$CH" +``` diff --git a/docs/SESSION_HISTORY.md b/docs/SESSION_HISTORY.md new file mode 100644 index 0000000..bf6776f --- /dev/null +++ b/docs/SESSION_HISTORY.md @@ -0,0 +1,50 @@ +# Session history (Oct 25, 2025) + +This document captures the key decisions, features added, and workflows established in the current development session so that future runs have quick context. + +## Highlights +- Added a new plot: `daily_volume_and_sentiment.png` showing bars for total volume (posts+replies) and lines for positive% and negative% per day. +- Improved daily activity chart with in-plot match labels (team abbreviations), density controls, and dynamic width/height. +- Implemented matchday sentiment rollups and plots: `matchday_sentiment_overall.csv/.png`, `matchday_posts_volume_vs_sentiment.png`. 
+- Integrated multiple sentiment backends: + - VADER (default) + - Transformers (local model at `models/sentiment-distilbert`) + - Local GPT via Ollama (JSON {label, confidence} mapped to compound) with graceful fallback to VADER +- Labeled data workflow: + - `src/apply_labels.py` merges labels back into posts/replies as `sentiment_label` + - Analyzer reuses `sentiment_label` when present + - `src/plot_labeled.py` provides QA plots +- Convenience: created `run_all` alias to run from scratch (scrape → replies → fixtures → analyze) non-interactively. + +## Key files and outputs +- Code + - `src/analyze_csv.py` — analyzer with plots and matchday integration (now with module docstring) + - `src/gpt_sentiment.py`, `src/transformer_sentiment.py`, `src/auto_label_sentiment.py`, `src/apply_labels.py`, `src/plot_labeled.py` + - `scripts/aliases.zsh` — includes `run_all`, `apply_labels_and_analyze`, and more +- Outputs (examples) + - `data/daily_activity_stacked.png` + - `data/daily_volume_and_sentiment.png` + - `data/posts_heatmap_hour_dow.png` + - `data/sentiment_by_tag_posts.png` + - `data/matchday_sentiment_overall.csv/.png` + - `data/matchday_posts_volume_vs_sentiment.png` + +## Important flags (analyze) +- Sizing: `--plot-width-scale`, `--plot-max-width`, `--plot-height` +- Labels: `--activity-top-n`, `--labels-max-per-day`, `--labels-per-line`, `--labels-stagger-rows`, `--labels-band-y`, `--labels-annotate-mode` +- Sentiment backends: `--sentiment-backend vader|transformers|gpt`, plus `--transformers-model` or `--gpt-model`/`--gpt-base-url` +- Emoji: `--emoji-mode keep|demojize|strip` and `--emoji-boost` + +## Aliases summary +- `run_all [CH] [START] [END] [POSTS] [REPLIES] [FIXTURES] [TAGS] [SESS_SCRAPE] [SESS_REPLIES] [CONC] [BACKEND] [MODEL] [GPT_MODEL] [GPT_URL]` + - Full pipeline non-interactive, defaults set in `scripts/aliases.zsh` +- `apply_labels_and_analyze [LABELED_CSV] [POSTS_IN] [REPLIES_IN] [POSTS_OUT] [REPLIES_OUT]` +- `analyze_transformers`, 
`analyze_emoji`, `analyze_combined`, `fast_replies`, `chunked_forwards`, `plot_labeled` + +## Old vs New outputs +- We maintain side-by-side outputs under `data/old` and `data/new` when running legacy vs labeled pipelines. + +## Next ideas +- Per-club matchday sentiment breakdowns (fixture-level small multiples) +- Side-by-side montage generation for old vs new plots + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..598fcee --- /dev/null +++ b/requirements.txt @@ -0,0 +1,19 @@ +numpy +pandas +scikit-learn +matplotlib +seaborn +jupyter +telethon +python-dotenv +tabulate +requests +pyyaml +vaderSentiment +emoji>=2.8.0 +transformers>=4.44.0 +# Torch pinning: use 2.3.1 on Python <3.13 (known-good on macOS), and a compatible newer torch on Python >=3.13 +torch==2.3.1; python_version < "3.13" +torch>=2.7.1; python_version >= "3.13" +datasets>=2.20.0 +accelerate>=0.26.0 \ No newline at end of file diff --git a/run_scraper.sh b/run_scraper.sh new file mode 100755 index 0000000..9e08ab9 --- /dev/null +++ b/run_scraper.sh @@ -0,0 +1,278 @@ +#!/usr/bin/env zsh + + +# A convenience script to set up venv, install deps, create/load .env, and run tools: +# - Telegram scraper: scrape | replies | forwards +# - Analyzer: analyze (report + sentiment + tags) +# - Fixtures: fixtures (Premier League schedule) +set -euo pipefail + +# Change to script directory (handles spaces in path) +cd "${0:A:h}" + +PROJECT_ROOT=$(pwd) +PYTHON="${PROJECT_ROOT}/.venv/bin/python" +PIP="${PROJECT_ROOT}/.venv/bin/pip" +REQUIREMENTS_FILE="${PROJECT_ROOT}/requirements.txt" +SCRAPER_MODULE="src.telegram_scraper" +ANALYZE_MODULE="src.analyze_csv" +FIXTURES_MODULE="src.fetch_schedule" + +usage() { + cat <<'EOF' +Usage: + ./run_scraper.sh scrape -c -o [--limit N] [--start-date YYYY-MM-DD] [--end-date YYYY-MM-DD] [--phone ] [--append] + ./run_scraper.sh replies -c (--ids "1,2,3" | --from-csv ) -o [--append] [--min-replies N] [--concurrency K] [--resume] + ./run_scraper.sh 
forwards -c <channel> (--ids "1,2,3" | --from-csv <posts.csv>) -o <output.csv> [--start-date YYYY-MM-DD] [--end-date YYYY-MM-DD] [--scan-limit N] [--append] [--concurrency K] [--chunk-size M]
+  ./run_scraper.sh analyze -i <input.csv> [-o <report.md>] [--channel @handle] [--tags-config config/tags.yaml] [--replies-csv <replies.csv>] [--fixtures-csv <fixtures.csv>] [--write-augmented-csv] [--write-combined-csv] [--emoji-mode keep|demojize|strip] [--emoji-boost] [--save-plots] [--sentiment-backend vader|transformers] [--transformers-model <name-or-path>] [--export-transformers-details]
+    [--plot-width-scale <float>] [--plot-max-width <inches>] [--plot-height <inches>] [--activity-top-n <N>] \
+    [--labels-max-per-day <N>] [--labels-per-line <N>] [--labels-band-y <float>] [--labels-stagger-rows <N>] [--labels-annotate-mode ticks|all|ticks+top]
+    [--sentiment-backend gpt] [--gpt-model <name>] [--gpt-base-url <url>] [--gpt-batch-size <K>]
+  ./run_scraper.sh fixtures --start-date YYYY-MM-DD --end-date YYYY-MM-DD -o <output.csv|.json>
+
+Examples:
+  ./run_scraper.sh scrape -c @python -o data.jsonl --limit 200
+  ./run_scraper.sh scrape -c https://t.me/python -o data.csv --start-date 2025-01-01 --end-date 2025-03-31
+  ./run_scraper.sh replies -c @python --from-csv data/messages.csv -o data/replies.csv
+  ./run_scraper.sh forwards -c @python --from-csv data/messages.csv -o data/forwards.csv --start-date 2025-01-01 --end-date 2025-03-31 --scan-limit 20000
+  ./run_scraper.sh analyze -i data/messages.csv --channel @python --tags-config config/tags.yaml --replies-csv data/replies.csv --fixtures-csv data/fixtures.csv --write-augmented-csv
+  ./run_scraper.sh analyze -i data/messages.csv --sentiment-backend transformers --transformers-model distilbert-base-uncased --export-transformers-details --write-augmented-csv --write-combined-csv
+  ./run_scraper.sh fixtures --start-date 2025-08-15 --end-date 2025-10-15 -o data/pl_fixtures.csv
+
+Notes:
+- If .env is missing, you'll be prompted to create it when needed (Telegram or fixtures commands).
+- First Telegram login will prompt for phone, code, and optionally 2FA password. 
+EOF +} + +# Subcommand parsing +if [[ $# -lt 1 ]]; then + usage; exit 1 +fi +COMMAND="$1"; shift || true + +# Common and per-command args +CHANNEL=""; OUTPUT=""; LIMIT=""; OFFSET_DATE=""; PHONE=""; START_DATE=""; END_DATE=""; APPEND=false; SESSION_NAME="" +IDS=""; FROM_CSV=""; SCAN_LIMIT="" +INPUT_CSV=""; REPORT_OUT=""; CHANNEL_NAME=""; TAGS_CONFIG=""; REPLIES_CSV=""; FIXTURES_CSV=""; WRITE_AUG=false; WRITE_COMBINED=false; EMOJI_MODE=""; EMOJI_BOOST=false; SAVE_PLOTS=false; SENTIMENT_BACKEND=""; TRANSFORMERS_MODEL=""; EXPORT_TRANSFORMERS_DETAILS=false; PLOT_WIDTH_SCALE=""; PLOT_MAX_WIDTH=""; PLOT_HEIGHT=""; ACTIVITY_TOP_N=""; LABELS_MAX_PER_DAY=""; LABELS_PER_LINE=""; LABELS_BAND_Y=""; LABELS_STAGGER_ROWS=""; LABELS_ANNOTATE_MODE=""; GPT_MODEL=""; GPT_BASE_URL=""; GPT_BATCH_SIZE="" + +case "$COMMAND" in + scrape|replies|forwards) + while [[ $# -gt 0 ]]; do + case "$1" in + -c|--channel) CHANNEL="$2"; shift 2;; + -o|--output) OUTPUT="$2"; shift 2;; + --session-name) SESSION_NAME="$2"; shift 2;; + --limit) LIMIT="$2"; shift 2;; + --offset-date) OFFSET_DATE="$2"; shift 2;; + --start-date) START_DATE="$2"; shift 2;; + --end-date) END_DATE="$2"; shift 2;; + --scan-limit) SCAN_LIMIT="$2"; shift 2;; + --ids) IDS="$2"; shift 2;; + --from-csv) FROM_CSV="$2"; shift 2;; + --phone) PHONE="$2"; shift 2;; + --append) APPEND=true; shift;; + --min-replies) MIN_REPLIES="$2"; shift 2;; + --concurrency) CONCURRENCY="$2"; shift 2;; + --chunk-size) CHUNK_SIZE="$2"; shift 2;; + --resume) RESUME=true; shift;; + -h|--help) usage; exit 0;; + *) echo "Unknown arg: $1"; usage; exit 1;; + esac + done + ;; + analyze) + while [[ $# -gt 0 ]]; do + case "$1" in + -i|--input) INPUT_CSV="$2"; shift 2;; + -o|--output) REPORT_OUT="$2"; shift 2;; + --channel) CHANNEL_NAME="$2"; shift 2;; + --tags-config) TAGS_CONFIG="$2"; shift 2;; + --replies-csv) REPLIES_CSV="$2"; shift 2;; + --fixtures-csv) FIXTURES_CSV="$2"; shift 2;; + --write-augmented-csv) WRITE_AUG=true; shift;; + --write-combined-csv) 
WRITE_COMBINED=true; shift;; + --emoji-mode) EMOJI_MODE="$2"; shift 2;; + --emoji-boost) EMOJI_BOOST=true; shift;; + --save-plots) SAVE_PLOTS=true; shift;; + --sentiment-backend) SENTIMENT_BACKEND="$2"; shift 2;; + --transformers-model) TRANSFORMERS_MODEL="$2"; shift 2;; + --export-transformers-details) EXPORT_TRANSFORMERS_DETAILS=true; shift;; + --gpt-model) GPT_MODEL="$2"; shift 2;; + --gpt-base-url) GPT_BASE_URL="$2"; shift 2;; + --gpt-batch-size) GPT_BATCH_SIZE="$2"; shift 2;; + --plot-width-scale) PLOT_WIDTH_SCALE="$2"; shift 2;; + --plot-max-width) PLOT_MAX_WIDTH="$2"; shift 2;; + --plot-height) PLOT_HEIGHT="$2"; shift 2;; + --activity-top-n) ACTIVITY_TOP_N="$2"; shift 2;; + --labels-max-per-day) LABELS_MAX_PER_DAY="$2"; shift 2;; + --labels-per-line) LABELS_PER_LINE="$2"; shift 2;; + --labels-band-y) LABELS_BAND_Y="$2"; shift 2;; + --labels-stagger-rows) LABELS_STAGGER_ROWS="$2"; shift 2;; + --labels-annotate-mode) LABELS_ANNOTATE_MODE="$2"; shift 2;; + -h|--help) usage; exit 0;; + *) echo "Unknown arg: $1"; usage; exit 1;; + esac + done + # Defaults: always use local fine-tuned transformers model if not specified + if [[ -z "$SENTIMENT_BACKEND" ]]; then SENTIMENT_BACKEND="transformers"; fi + if [[ -z "$TRANSFORMERS_MODEL" ]]; then TRANSFORMERS_MODEL="models/sentiment-distilbert"; fi + ;; + fixtures) + while [[ $# -gt 0 ]]; do + case "$1" in + --start-date) START_DATE="$2"; shift 2;; + --end-date) END_DATE="$2"; shift 2;; + -o|--output) OUTPUT="$2"; shift 2;; + -h|--help) usage; exit 0;; + *) echo "Unknown arg: $1"; usage; exit 1;; + esac + done + ;; + -h|--help) + usage; exit 0;; + *) + echo "Unknown command: $COMMAND"; usage; exit 1;; +esac + +# Required args validation +if [[ "$COMMAND" == "scrape" ]]; then + if [[ -z "$CHANNEL" || -z "$OUTPUT" ]]; then echo "Error: scrape needs --channel and --output"; usage; exit 1; fi +elif [[ "$COMMAND" == "replies" || "$COMMAND" == "forwards" ]]; then + if [[ -z "$CHANNEL" || -z "$OUTPUT" ]]; then echo "Error: 
$COMMAND needs --channel and --output"; usage; exit 1; fi + if [[ -z "$IDS" && -z "$FROM_CSV" ]]; then echo "Error: $COMMAND needs --ids or --from-csv"; usage; exit 1; fi +elif [[ "$COMMAND" == "analyze" ]]; then + if [[ -z "$INPUT_CSV" ]]; then echo "Error: analyze needs --input"; usage; exit 1; fi +elif [[ "$COMMAND" == "fixtures" ]]; then + if [[ -z "$START_DATE" || -z "$END_DATE" || -z "$OUTPUT" ]]; then echo "Error: fixtures needs --start-date, --end-date, and --output"; usage; exit 1; fi +fi + +echo "[1/4] Ensuring virtual environment..." +if [[ ! -x "$PYTHON" ]]; then + echo "Creating virtual environment at .venv" + python3 -m venv .venv +fi + +echo "Activating virtual environment" +source .venv/bin/activate + +echo "[2/4] Installing dependencies" +"$PIP" install -q --upgrade pip +"$PIP" install -q -r "$REQUIREMENTS_FILE" + +echo "[3/4] Environment setup" +NEEDS_TELEGRAM=false +NEEDS_FIXTURES_TOKEN=false +if [[ "$COMMAND" == "scrape" || "$COMMAND" == "replies" || "$COMMAND" == "forwards" ]]; then NEEDS_TELEGRAM=true; fi +if [[ "$COMMAND" == "fixtures" ]]; then NEEDS_FIXTURES_TOKEN=true; fi + +if [[ "$NEEDS_TELEGRAM" == true || "$NEEDS_FIXTURES_TOKEN" == true ]]; then + if [[ ! -f .env ]]; then + echo ".env not found. Let's create one now." 
+ if [[ "$NEEDS_TELEGRAM" == true ]]; then + print -n "Enter TELEGRAM_API_ID (from my.telegram.org): " + read -r TELEGRAM_API_ID + print -n "Enter TELEGRAM_API_HASH (from my.telegram.org): " + read -r TELEGRAM_API_HASH + : ${TELEGRAM_SESSION_NAME:=telegram} + fi + cat > .env < replies -> fixtures -> analyze +# Requirements: +# - .env has TELEGRAM_API_ID and TELEGRAM_API_HASH (and TELEGRAM_2FA_PASSWORD if 2FA is enabled) +# - CH/POSTS_CSV/REPLIES_CSV/FIXTURES_CSV/TAGS_CFG are set (defaults are defined above) +# - Provide optional start/end dates; defaults use FIXTURES_START_DATE/FIXTURES_END_DATE +# - Choose sentiment backend via arg 11: vader | transformers | gpt (default: transformers) +run_all() { + local ch="${1:-$CH}" + local start="${2:-$FIXTURES_START_DATE}" + local end="${3:-$FIXTURES_END_DATE}" + local posts="${4:-$POSTS_CSV}" + local replies="${5:-$REPLIES_CSV}" + local fixtures="${6:-$FIXTURES_CSV}" + local tags="${7:-$TAGS_CFG}" + local sess_scrape="${8:-$SESSION_DIR/telegram_scrape}" + local sess_replies="${9:-$SESSION_DIR/telegram_replies}" + local rep_conc="${10:-15}" + local backend="${11:-transformers}" # vader | transformers | gpt + local model="${12:-models/sentiment-distilbert}" + local gpt_model="${13:-llama3}" + local gpt_url="${14:-http://localhost:11434}" + + # 1) Scrape posts (overwrite) + ./run_scraper.sh scrape \ + -c "$ch" \ + -o "$posts" \ + --start-date "$start" \ + --end-date "$end" \ + --session-name "$sess_scrape" + + # 2) Fetch replies (resume+append safe) + ./run_scraper.sh replies \ + -c "$ch" \ + --from-csv "$posts" \ + -o "$replies" \ + --min-replies 1 \ + --concurrency "$rep_conc" \ + --resume \ + --append \ + --session-name "$sess_replies" + + # 3) Fetch fixtures for the same period + ./run_scraper.sh fixtures \ + --start-date "$start" \ + --end-date "$end" \ + -o "$fixtures" + + # 4) Analyze with plots (non-interactive) + local args=( + -i "$posts" + --tags-config "$tags" + --replies-csv "$replies" + --fixtures-csv 
"$fixtures" + --write-augmented-csv + --write-combined-csv + --emoji-mode keep + --emoji-boost + --save-plots + --plot-width-scale 0.8 + --plot-max-width 120 + --plot-height 8 + --activity-top-n 8 + --labels-stagger-rows 3 + ) + if [[ "$backend" == "transformers" ]]; then + args+=( --sentiment-backend transformers --transformers-model "$model" --export-transformers-details ) + elif [[ "$backend" == "gpt" ]]; then + args+=( --sentiment-backend gpt --gpt-model "$gpt_model" --gpt-base-url "$gpt_url" ) + else + args+=( --sentiment-backend vader ) + fi + + ./run_scraper.sh analyze "${args[@]}" +} diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..82789f2 --- /dev/null +++ b/src/__init__.py @@ -0,0 +1 @@ +# This file is intentionally left blank. \ No newline at end of file diff --git a/src/analyze_csv.py b/src/analyze_csv.py new file mode 100644 index 0000000..ce58e72 --- /dev/null +++ b/src/analyze_csv.py @@ -0,0 +1,1313 @@ +""" +analyze_csv +============ + +Generates a Markdown report and optional plots from a Telegram posts CSV (and an optional replies CSV). 
+ +Features +-------- +- Tagging from YAML keywords (config/tags.yaml) +- Sentiment via VADER (default), a local transformers model, or a local GPT (Ollama) +- Emoji-aware preprocessing with optional positivity/negativity boost +- Optional fixtures join to mark matchdays; compact team abbreviation labels inside daily charts +- Combined posts+replies augmented outputs and a merged CSV + +Key CLI flags +------------- +- --sentiment-backend vader|transformers|gpt +- --transformers-model NAME_OR_PATH +- --gpt-model NAME --gpt-base-url URL --gpt-batch-size K +- --emoji-mode keep|demojize|strip [--emoji-boost] +- --plot-width-scale FLOAT --plot-max-width INCHES --plot-height INCHES +- --activity-top-n N +- --labels-max-per-day N --labels-per-line N --labels-stagger-rows N --labels-band-y FLOAT --labels-annotate-mode ticks|all|ticks+top + +Plots (when --save-plots) +------------------------- +- posts_heatmap_hour_dow.png +- sentiment_by_tag_posts.png +- daily_activity_stacked.png +- daily_volume_and_sentiment.png (bars: volume; lines: positive% and negative%) +- matchday_sentiment_overall.png +- matchday_posts_volume_vs_sentiment.png +""" + +import argparse +import os +import re +from datetime import datetime +from typing import List, Optional, Tuple + +import pandas as pd +import yaml +from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer +import emoji as _emoji + + +def load_csv(path: str) -> pd.DataFrame: + df = pd.read_csv(path) + # Normalize columns we expect from the scraper + # Columns: id,date,message,sender_id,views,forwards,replies,url + # Parse date to datetime (naive) + if 'date' in df.columns: + df['date'] = pd.to_datetime(df['date'], errors='coerce') + for col in ['views', 'forwards', 'replies', 'sender_id']: + if col in df.columns: + df[col] = pd.to_numeric(df[col], errors='coerce') + # Basic cleaning + if 'message' in df.columns: + df['message'] = df['message'].fillna('') + return df + + +def summarize(df: pd.DataFrame) -> dict: + total = 
len(df) + with_text = int((df.get('message', pd.Series(dtype=str)) != '').sum()) if 'message' in df else 0 + no_text = total - with_text + views_mean = float(df['views'].mean()) if 'views' in df and not df['views'].empty else 0.0 + views_median = float(df['views'].median()) if 'views' in df and not df['views'].empty else 0.0 + forwards_mean = float(df['forwards'].mean()) if 'forwards' in df and not df['forwards'].empty else 0.0 + replies_mean = float(df['replies'].mean()) if 'replies' in df and not df['replies'].empty else 0.0 + + first_date = df['date'].min() if 'date' in df else None + last_date = df['date'].max() if 'date' in df else None + + return { + 'total_messages': total, + 'with_text': with_text, + 'no_text': no_text, + 'views_mean': views_mean, + 'views_median': views_median, + 'forwards_mean': forwards_mean, + 'replies_mean': replies_mean, + 'first_date': first_date, + 'last_date': last_date, + } + + +def top_messages(df: pd.DataFrame, by: str, k: int = 10) -> pd.DataFrame: + if by not in df.columns: + return pd.DataFrame() + return df.sort_values(by=by, ascending=False).head(k)[['id', 'date', 'message', by, 'url']] + + +def temporal_distributions(df: pd.DataFrame) -> dict: + if 'date' not in df: + return {} + out = {} + d = df.dropna(subset=['date']).copy() + d['day'] = d['date'].dt.date + d['hour'] = d['date'].dt.hour + out['per_day'] = d.groupby('day').size().reset_index(name='count') + out['per_hour'] = d.groupby('hour').size().reset_index(name='count') + return out + + +def write_markdown_report( + df: pd.DataFrame, + out_path: str, + channel: Optional[str] = None, + replies_df: Optional[pd.DataFrame] = None, +): + summ = summarize(df) + tops_views = top_messages(df, 'views', 10) + tops_forwards = top_messages(df, 'forwards', 10) + tops_replies = top_messages(df, 'replies', 10) + temps = temporal_distributions(df) + + lines = [] + title = f"Telegram Channel Report{f' - {channel}' if channel else ''}" + lines.append(f"# {title}") + lines.append("") 
+ lines.append("## Summary") + lines.append("") + lines.append(f"- Total messages: {summ['total_messages']}") + lines.append(f"- With text: {summ['with_text']}") + lines.append(f"- Without text: {summ['no_text']}") + lines.append(f"- Views (mean/median): {summ['views_mean']:.1f} / {summ['views_median']:.1f}") + lines.append(f"- Forwards (mean): {summ['forwards_mean']:.2f}") + lines.append(f"- Replies (mean): {summ['replies_mean']:.2f}") + if summ['first_date'] is not None and summ['last_date'] is not None: + lines.append(f"- Date range: {summ['first_date']} — {summ['last_date']}") + + # Sentiment summary if available + if 'sentiment_compound' in df.columns: + lines.append("\n### Sentiment summary") + sent = df['sentiment_compound'].dropna() + if not sent.empty: + lines.append(f"- Mean compound: {sent.mean():.3f}") + lines.append(f"- Median compound: {sent.median():.3f}") + pos_share = (sent > 0.05).mean() + neg_share = (sent < -0.05).mean() + neu_share = max(0.0, 1.0 - pos_share - neg_share) + lines.append(f"- Share positive (compound > 0.05): {pos_share:.2%}") + lines.append(f"- Share neutral (|compound| ≤ 0.05): {neu_share:.2%}") + lines.append(f"- Share negative (compound < -0.05): {neg_share:.2%}") + + def table(df_small: pd.DataFrame, caption: str) -> None: + if df_small is None or df_small.empty: + lines.append(f"\n### {caption}\n\n_No data_\n") + return + lines.append(f"\n### {caption}\n") + # Limit message preview to first 120 chars + df_disp = df_small.copy() + if 'message' in df_disp.columns: + df_disp['message'] = df_disp['message'].astype(str).str.replace("\n", " ").str.slice(0, 120) + lines.append(df_disp.to_markdown(index=False)) + + table(tops_views, "Top 10 posts by views") + table(tops_forwards, "Top 10 posts by forwards") + table(tops_replies, "Top 10 posts by replies (channel field)") + + # If we computed scraped reply counts, include that ranking + if 'replies_count_scraped' in df.columns: + cols = ['id', 'date', 'message', 
'replies_count_scraped'] + if 'replies_top_tags' in df.columns: + cols.append('replies_top_tags') + if 'url' in df.columns: + cols.append('url') + top_scraped = df.sort_values('replies_count_scraped', ascending=False).head(10)[cols] + lines.append("\n### Top 10 posts by scraped reply count") + df_disp = top_scraped.copy() + if 'message' in df_disp.columns: + df_disp['message'] = df_disp['message'].astype(str).str.replace("\n", " ").str.slice(0, 120) + lines.append(df_disp.to_markdown(index=False)) + + # Temporal distributions + if temps: + lines.append("\n## Temporal distribution") + if 'per_day' in temps and not temps['per_day'].empty: + lines.append("\n### Messages per day") + lines.append(temps['per_day'].to_markdown(index=False)) + if 'per_hour' in temps and not temps['per_hour'].empty: + lines.append("\n### Messages per hour (0-23)") + lines.append(temps['per_hour'].to_markdown(index=False)) + + # Per-tag engagement (if tags exist) + if 'tags' in df.columns: + tagged = df.copy() + # Normalize tags column to list + tagged['tags'] = tagged['tags'].apply(lambda x: x if isinstance(x, list) else ([] if pd.isna(x) else [x])) + exploded = tagged.explode('tags') + exploded = exploded[exploded['tags'].notna() & (exploded['tags'] != '')] + if not exploded.empty: + grp = ( + exploded.groupby('tags') + .agg( + count=('id', 'count'), + views_mean=('views', 'mean'), + views_median=('views', 'median'), + replies_mean=('replies', 'mean'), + forwards_mean=('forwards', 'mean'), + sentiment_mean=('sentiment_compound', 'mean') if 'sentiment_compound' in exploded.columns else ('id','count') + ) + .reset_index() + .sort_values(['count', 'views_mean'], ascending=[False, False]) + ) + lines.append("\n## Per-tag engagement") + lines.append(grp.to_markdown(index=False)) + + # Per-tag sentiment breakdown for posts + if 'sentiment_compound' in exploded.columns: + s = exploded[['tags', 'sentiment_compound']].dropna() + if not s.empty: + s['is_pos'] = s['sentiment_compound'] > 0.05 + 
s['is_neg'] = s['sentiment_compound'] < -0.05 + sgrp = ( + s.groupby('tags') + .agg( + n=('sentiment_compound', 'count'), + mean=('sentiment_compound', 'mean'), + median=('sentiment_compound', 'median'), + pos_share=('is_pos', 'mean'), + neg_share=('is_neg', 'mean'), + ) + .reset_index() + .sort_values(['n', 'mean'], ascending=[False, False]) + ) + # Derive neutral share as residual + sgrp['neu_share'] = (1 - sgrp['pos_share'] - sgrp['neg_share']).clip(lower=0) + # Reorder columns for readability + cols = ['tags', 'n', 'mean', 'median', 'pos_share', 'neu_share', 'neg_share'] + sgrp = sgrp[[c for c in cols if c in sgrp.columns]] + lines.append("\n### Per-tag sentiment (posts)") + lines.append(sgrp.to_markdown(index=False)) + + # Replies per-tag summary (if provided and tagged) + if replies_df is not None and 'tags' in replies_df.columns: + rtagged = replies_df.copy() + rtagged['tags'] = rtagged['tags'].apply(lambda x: x if isinstance(x, list) else ([] if pd.isna(x) else [x])) + rexpl = rtagged.explode('tags') + rexpl = rexpl[rexpl['tags'].notna() & (rexpl['tags'] != '')] + if not rexpl.empty: + rgrp = ( + rexpl.groupby('tags') + .agg( + replies_count=('id', 'count'), + replies_sentiment_mean=('sentiment_compound', 'mean') if 'sentiment_compound' in rexpl.columns else ('id','count'), + ) + .reset_index() + .sort_values(['replies_count'], ascending=[False]) + ) + lines.append("\n## Replies per-tag summary") + lines.append(rgrp.to_markdown(index=False)) + + # Per-tag sentiment breakdown for replies + if 'sentiment_compound' in rexpl.columns: + rs = rexpl[['tags', 'sentiment_compound']].dropna() + if not rs.empty: + rs['is_pos'] = rs['sentiment_compound'] > 0.05 + rs['is_neg'] = rs['sentiment_compound'] < -0.05 + rsgrp = ( + rs.groupby('tags') + .agg( + n=('sentiment_compound', 'count'), + mean=('sentiment_compound', 'mean'), + median=('sentiment_compound', 'median'), + pos_share=('is_pos', 'mean'), + neg_share=('is_neg', 'mean'), + ) + .reset_index() + 
.sort_values(['n', 'mean'], ascending=[False, False]) + ) + rsgrp['neu_share'] = (1 - rsgrp['pos_share'] - rsgrp['neg_share']).clip(lower=0) + cols = ['tags', 'n', 'mean', 'median', 'pos_share', 'neu_share', 'neg_share'] + rsgrp = rsgrp[[c for c in cols if c in rsgrp.columns]] + lines.append("\n### Per-tag sentiment (replies)") + lines.append(rsgrp.to_markdown(index=False)) + + # Combined sentiment (posts + replies) if replies are provided + if 'sentiment_compound' in df.columns and replies_df is not None and 'sentiment_compound' in replies_df.columns: + combined_cols = ['sentiment_compound'] + if 'tags' in df.columns or ('tags' in replies_df.columns): + combined_cols.append('tags') + posts_part = df[['sentiment_compound'] + (['tags'] if 'tags' in df.columns else [])].copy() + posts_part['content_type'] = 'post' + reps_part = replies_df[['sentiment_compound'] + (['tags'] if 'tags' in replies_df.columns else [])].copy() + reps_part['content_type'] = 'reply' + combined = pd.concat([posts_part, reps_part], ignore_index=True) + + lines.append("\n## Combined sentiment (posts + replies)") + sent_all = combined['sentiment_compound'].dropna() + if not sent_all.empty: + lines.append(f"- Mean compound: {sent_all.mean():.3f}") + lines.append(f"- Median compound: {sent_all.median():.3f}") + pos_share = (sent_all > 0.05).mean() + neg_share = (sent_all < -0.05).mean() + neu_share = max(0.0, 1.0 - pos_share - neg_share) + lines.append(f"- Share positive (compound > 0.05): {pos_share:.2%}") + lines.append(f"- Share neutral (|compound| ≤ 0.05): {neu_share:.2%}") + lines.append(f"- Share negative (compound < -0.05): {neg_share:.2%}") + + # Per-tag combined sentiment if tags exist + if 'tags' in combined.columns: + ctag = combined.copy() + ctag['tags'] = ctag['tags'].apply(lambda x: x if isinstance(x, list) else ([] if pd.isna(x) else [x])) + cexpl = ctag.explode('tags') + cexpl = cexpl[cexpl['tags'].notna() & (cexpl['tags'] != '')] + if not cexpl.empty: + cexpl['is_pos'] = 
cexpl['sentiment_compound'] > 0.05 + cexpl['is_neg'] = cexpl['sentiment_compound'] < -0.05 + cgrp = ( + cexpl.groupby('tags') + .agg( + n=('sentiment_compound', 'count'), + mean=('sentiment_compound', 'mean'), + median=('sentiment_compound', 'median'), + pos_share=('is_pos', 'mean'), + neg_share=('is_neg', 'mean'), + ) + .reset_index() + .sort_values(['n', 'mean'], ascending=[False, False]) + ) + cgrp['neu_share'] = (1 - cgrp['pos_share'] - cgrp['neg_share']).clip(lower=0) + cols = ['tags', 'n', 'mean', 'median', 'pos_share', 'neu_share', 'neg_share'] + cgrp = cgrp[[c for c in cols if c in cgrp.columns]] + lines.append("\n### Per-tag sentiment (combined posts + replies)") + lines.append(cgrp.to_markdown(index=False)) + + # Matchday cross-analysis: on vs off matchdays for posts and replies + def _matchday_table(d: pd.DataFrame, col: str = 'is_matchday') -> Optional[pd.DataFrame]: + if d is None or d.empty or col not in d.columns: + return None + t = d.copy() + t = t.dropna(subset=[col]) + if t.empty: + return None + # Sentiment shares if sentiment available + has_sent = 'sentiment_compound' in t.columns and t['sentiment_compound'].notna().any() + if has_sent: + t['is_pos'] = t['sentiment_compound'] > 0.05 + t['is_neg'] = t['sentiment_compound'] < -0.05 + agg = { + 'id': 'count' + } + if has_sent: + agg.update({ + 'sentiment_compound': 'mean', + 'is_pos': 'mean', + 'is_neg': 'mean', + }) + g = t.groupby(col).agg(agg).rename(columns={'id': 'count'}) + if has_sent: + g = g.rename(columns={'sentiment_compound': 'sentiment_mean', 'is_pos': 'pos_share', 'is_neg': 'neg_share'}) + g['neu_share'] = (1 - g['pos_share'] - g['neg_share']).clip(lower=0) + # Reorder + g = g[['count', 'sentiment_mean', 'pos_share', 'neu_share', 'neg_share']] + return g.reset_index() + + posts_md_tbl = _matchday_table(df) + replies_md_tbl_parent = _matchday_table(replies_df, col='parent_is_matchday') if (replies_df is not None and 'parent_is_matchday' in replies_df.columns) else None + 
replies_md_tbl_reply = _matchday_table(replies_df, col='is_matchday') if (replies_df is not None and 'is_matchday' in replies_df.columns) else None + if posts_md_tbl is not None or replies_md_tbl_parent is not None or replies_md_tbl_reply is not None: + lines.append("\n## Matchday cross-analysis") + if posts_md_tbl is not None: + lines.append("\n### Posts: on vs off matchdays") + lines.append(posts_md_tbl.to_markdown(index=False)) + # If per-post replies are available, show engagement breakdown + if 'replies_count_scraped' in df.columns: + tmp = df.copy() + tmp['replies_count_scraped'] = pd.to_numeric(tmp['replies_count_scraped'], errors='coerce').fillna(0) + eng = ( + tmp.groupby('is_matchday') + .agg( + posts=('id','count'), + posts_with_replies=('replies_count_scraped', lambda s: (s>0).mean()), + replies_total=('replies_count_scraped','sum'), + replies_mean_per_post=('replies_count_scraped','mean'), + replies_median_per_post=('replies_count_scraped','median'), + ) + .reset_index() + ) + lines.append("\n### Posts engagement vs matchday (replies per post)") + lines.append(eng.to_markdown(index=False)) + if replies_md_tbl_parent is not None: + lines.append("\n### Replies (by parent matchday): on vs off matchdays") + lines.append(replies_md_tbl_parent.to_markdown(index=False)) + if replies_md_tbl_reply is not None: + lines.append("\n### Replies (by reply date): on vs off matchdays") + lines.append(replies_md_tbl_reply.to_markdown(index=False)) + + with open(out_path, 'w', encoding='utf-8') as f: + f.write("\n".join(lines)) + + +def main(): + parser = argparse.ArgumentParser(description="Analyze Telegram CSV and generate a Markdown report") + parser.add_argument('csv', help='Path to CSV file exported by the scraper') + parser.add_argument('-o', '--output', default=None, help='Output Markdown path (default: alongside CSV with .md)') + parser.add_argument('--channel', default=None, help='Optional channel name for the report title') + 
parser.add_argument('--tags-config', default=None, help='Path to YAML config for keyword tags (e.g., config/tags.yaml)') + parser.add_argument('--replies-csv', default=None, help='Optional CSV of replies with parent_id and sentiment_compound to aggregate per message') + parser.add_argument('--fixtures-csv', default=None, help='Optional fixtures CSV to derive a matchday flag (matches on date)') + parser.add_argument('--write-augmented-csv', action='store_true', help='Also write a CSV with computed fields (sentiment, tags) alongside the input') + parser.add_argument('--write-combined-csv', action='store_true', help='If replies are provided, also write a merged posts+replies CSV with a content_type column') + parser.add_argument('--save-plots', action='store_true', help='Also save common plots (daily sentiment, posts heatmap, sentiment-by-tag) next to the report') + parser.add_argument('--emoji-mode', choices=['keep', 'demojize', 'strip'], default='keep', help='How to treat emojis before sentiment: keep (default), demojize to :keywords:, or strip emojis') + parser.add_argument('--emoji-boost', action='store_true', help='If set with keep/demojize, gently boost VADER for clearly positive/negative emojis') + parser.add_argument('--sentiment-backend', choices=['vader', 'transformers', 'gpt'], default='vader', help='Choose sentiment engine: vader (default), transformers, or gpt (local via Ollama)') + parser.add_argument('--transformers-model', default='distilbert-base-uncased', help='HF model name or local path for transformers backend') + parser.add_argument('--export-transformers-details', action='store_true', help='When using transformers backend, also export predicted label and raw class probabilities') + # GPT local model knobs (Ollama) + parser.add_argument('--gpt-model', default='llama3', help='Local GPT model name (Ollama)') + parser.add_argument('--gpt-base-url', default='http://localhost:11434', help='Base URL for local GPT server (Ollama)') + 
parser.add_argument('--gpt-batch-size', type=int, default=8, help='Batch size for GPT requests') + # Plot sizing controls + parser.add_argument('--plot-width-scale', type=float, default=0.8, + help='Scale factor (inches per day) for dynamic plot width of daily activity chart. Default doubled from 0.4 to 0.8.') + parser.add_argument('--plot-max-width', type=float, default=104.0, + help='Maximum figure width (inches) clamp for daily activity chart. Default doubled from 52 to 104. Override to a larger value if needed.') + parser.add_argument('--plot-height', type=float, default=6.5, + help='Figure height (inches) for bar charts. Default 6.5 inches (taller than previous 5).') + parser.add_argument('--activity-top-n', type=int, default=5, + help='Number of top-activity days to highlight and annotate. Use 0 to disable highlighting.') + # Match label rendering controls + parser.add_argument('--labels-max-per-day', type=int, default=3, + help='Maximum number of match labels to show per day before collapsing into +N more.') + parser.add_argument('--labels-per-line', type=int, default=2, + help='Number of match labels per line when stacking within the label band.') + parser.add_argument('--labels-band-y', type=float, default=0.96, + help='Vertical position of the labels band in axes coordinates (inside the axes; 1.0 is top).') + parser.add_argument('--labels-stagger-rows', type=int, default=2, + help='Number of staggered rows in the label band to reduce neighbor collisions (1-3 recommended).') + parser.add_argument('--labels-annotate-mode', choices=['ticks','all','ticks+top'], default='ticks+top', + help='Which days to annotate with match labels: only ticked days, all days, or ticked days plus top-N highlighted days (default).') + args = parser.parse_args() + + df = load_csv(args.csv) + replies_df: Optional[pd.DataFrame] = None + + # Optional tagging step + if args.tags_config and os.path.exists(args.tags_config): + with open(args.tags_config, 'r', encoding='utf-8') as f: + 
cfg = yaml.safe_load(f) or {} + + # Compile patterns list: List[(tag, List[(pattern, is_regex)])] + patterns: List[Tuple[str, List[Tuple[str, bool]]]] = [] + for tag, arr in (cfg.items() if isinstance(cfg, dict) else []): + compiled: List[Tuple[str, bool]] = [] + for pat in (arr or []): + if isinstance(pat, str) and pat.startswith('re:'): + compiled.append((pat[3:], True)) + else: + compiled.append((str(pat), False)) + patterns.append((tag, compiled)) + + def tag_message(text: str) -> List[str]: + t = text or '' + tags: List[str] = [] + for tag, pats in patterns: + for pat, is_re in pats: + if is_re: + if re.search(pat, t, flags=re.IGNORECASE): + tags.append(tag) + break + else: + if pat.lower() in t.lower(): + tags.append(tag) + break + return tags + + if 'message' in df.columns: + df['tags'] = df['message'].apply(tag_message) + + # If replies CSV provided, apply tags to replies as well + if args.replies_csv and os.path.exists(args.replies_csv): + replies_df = pd.read_csv(args.replies_csv) + if 'message' in replies_df.columns: + replies_df['message'] = replies_df['message'].fillna('') + replies_df['tags'] = replies_df['message'].apply(tag_message) + + # Sentiment scoring + analyzer = SentimentIntensityAnalyzer() + tmodel = None + gpt = None + if args.sentiment_backend == 'transformers': + try: + from .transformer_sentiment import TransformerSentiment + tmodel = TransformerSentiment(args.transformers_model) + print(f"[transformers] Using model: {args.transformers_model} on {tmodel.device}") + except Exception as e: + print(f"[transformers] Falling back to VADER due to error: {e}") + args.sentiment_backend = 'vader' + elif args.sentiment_backend == 'gpt': + try: + from .gpt_sentiment import GPTSentiment + except Exception: + from gpt_sentiment import GPTSentiment + try: + gpt = GPTSentiment(base_url=args.gpt_base_url, model=args.gpt_model) + # Light connectivity probe: do a tiny call that should fail gracefully without raising here + print(f"[gpt] Using local GPT 
model: {args.gpt_model} at {args.gpt_base_url}") + except Exception as e: + print(f"[gpt] Falling back to VADER (init error): {e}") + args.sentiment_backend = 'vader' + + def _strip_emojis(text: str) -> str: + # Remove all emoji code points + return _emoji.replace_emoji(text or '', replace='') + + def _demojize(text: str) -> str: + return _emoji.demojize(text or '', delimiters=(":", ":")) + + # Simple emoji valence hints for boosting + POS_EMOJI_HINTS = {"😀", "😃", "😄", "😁", "😆", "😊", "🙂", "😍", "🥳", "👍", "🔥", "👏", "💯", "😺", "🤩", "🙌", "🫶", "⚽️", "🏆"} + NEG_EMOJI_HINTS = {"😞", "😟", "😠", "😡", "😢", "😭", "👎", "💔", "🤬", "🤢", "😫", "😩"} + + def _emoji_valence_boost(text: str, base: float) -> float: + if not args.emoji_boost: + return base + # Look at original text to preserve emoji presence regardless of preprocessing + pos_hits = any(ch in POS_EMOJI_HINTS for ch in text) + neg_hits = any(ch in NEG_EMOJI_HINTS for ch in text) + boost = 0.0 + if pos_hits and not neg_hits: + boost = 0.05 + elif neg_hits and not pos_hits: + boost = -0.05 + # Clamp to VADER range [-1, 1] + return max(-1.0, min(1.0, base + boost)) + + def _prep_for_sentiment(text: str) -> str: + if args.emoji_mode == 'strip': + return _strip_emojis(text or '') + if args.emoji_mode == 'demojize': + return _demojize(text or '') + return text or '' + + if 'message' in df.columns: + def _score_msg(t: str) -> float: + raw = t or '' + if args.sentiment_backend == 'transformers' and tmodel is not None: + # Use transformer model in batches later + return None # placeholder, fill after batch + if args.sentiment_backend == 'gpt' and gpt is not None: + return None + proc = _prep_for_sentiment(raw) + score = analyzer.polarity_scores(proc).get('compound') + return _emoji_valence_boost(raw, score) + df['sentiment_compound'] = df['message'].apply(_score_msg) + # Ensure replies have sentiment if present and missing + if replies_df is not None: + if 'message' in replies_df.columns and 'sentiment_compound' not in 
replies_df.columns: + def _score_rep(t: str) -> float: + raw = t or '' + if args.sentiment_backend == 'transformers' and tmodel is not None: + return None + if args.sentiment_backend == 'gpt' and gpt is not None: + return None + proc = _prep_for_sentiment(raw) + score = analyzer.polarity_scores(proc).get('compound') + return _emoji_valence_boost(raw, score) + replies_df['sentiment_compound'] = replies_df['message'].apply(_score_rep) + + # If transformers backend was selected, fill in sentiment_compound in batches + if args.sentiment_backend == 'transformers' and tmodel is not None: + if 'message' in df.columns: + mask = df['sentiment_compound'].isna() + texts = df.loc[mask, 'message'].astype(str).tolist() + if texts: + preds = tmodel.predict_compound_batch(texts, batch_size=32) + df.loc[mask, 'sentiment_compound'] = preds + if args.export_transformers_details: + # Re-run to get probabilities and labels + from .transformer_sentiment import TransformerSentiment + probs, labels = tmodel.predict_probs_and_labels(texts, batch_size=32) + df.loc[mask, 'sentiment_label'] = labels + df.loc[mask, 'sentiment_probs'] = [','.join(f"{p:.6f}" for p in row) for row in probs] + if replies_df is not None and 'message' in replies_df.columns and 'sentiment_compound' in replies_df.columns: + rmask = replies_df['sentiment_compound'].isna() + rtexts = replies_df.loc[rmask, 'message'].astype(str).tolist() + if rtexts: + rpreds = tmodel.predict_compound_batch(rtexts, batch_size=64) + replies_df.loc[rmask, 'sentiment_compound'] = rpreds + if args.export_transformers_details: + probs, labels = tmodel.predict_probs_and_labels(rtexts, batch_size=64) + replies_df.loc[rmask, 'sentiment_label'] = labels + replies_df.loc[rmask, 'sentiment_probs'] = [','.join(f"{p:.6f}" for p in row) for row in probs] + elif args.sentiment_backend == 'gpt' and gpt is not None: + def _vader_compounds_for(texts: List[str]) -> List[float]: + out_vals: List[float] = [] + for raw in texts: + proc = 
_prep_for_sentiment(raw) + sc = analyzer.polarity_scores(proc).get('compound') + out_vals.append(_emoji_valence_boost(raw, sc)) + return out_vals + # Fill posts sentiment via local GPT + if 'message' in df.columns: + mask = df['sentiment_compound'].isna() + texts = df.loc[mask, 'message'].astype(str).tolist() + if texts: + try: + preds = gpt.predict_compound_batch(texts, batch_size=int(getattr(args, 'gpt_batch_size', 8))) + df.loc[mask, 'sentiment_compound'] = preds + except Exception as e: + print(f"[gpt] Prediction error; falling back to VADER for remaining rows: {e}") + preds = _vader_compounds_for(texts) + df.loc[mask, 'sentiment_compound'] = preds + if replies_df is not None and 'message' in replies_df.columns and 'sentiment_compound' in replies_df.columns: + rmask = replies_df['sentiment_compound'].isna() + rtexts = replies_df.loc[rmask, 'message'].astype(str).tolist() + if rtexts: + try: + rpreds = gpt.predict_compound_batch(rtexts, batch_size=int(getattr(args, 'gpt_batch_size', 8))) + replies_df.loc[rmask, 'sentiment_compound'] = rpreds + except Exception as e: + print(f"[gpt] Replies prediction error; falling back to VADER for remaining rows: {e}") + rpreds = _vader_compounds_for(rtexts) + replies_df.loc[rmask, 'sentiment_compound'] = rpreds + + # Optional: aggregate replies sentiment per parent and join + if replies_df is not None and 'parent_id' in replies_df.columns and 'sentiment_compound' in replies_df.columns: + agg = replies_df.groupby('parent_id')['sentiment_compound'].mean().reset_index().rename(columns={'sentiment_compound':'replies_sentiment_mean'}) + if 'id' in df.columns: + df = df.merge(agg, how='left', left_on='id', right_on='parent_id').drop(columns=['parent_id']) + + # Optional: matchday flag by joining on date with fixtures (same day) for posts and replies + fixtures_present = bool(args.fixtures_csv and os.path.exists(args.fixtures_csv)) + matchdays = None + fixtures_by_day = None # map: date -> ["Home vs Away" or "Home X-Y Away"] + if 
fixtures_present: + fix = pd.read_csv(args.fixtures_csv) + if 'utcDate' in fix.columns: + fix['utcDate'] = pd.to_datetime(fix['utcDate'], errors='coerce') + fix['match_day'] = fix['utcDate'].dt.date + matchdays = fix[['match_day']].dropna().drop_duplicates() + # Build per-day match labels + try: + # Map full club names to standard PL 3-letter abbreviations + PL_ABBR = { + 'arsenal': 'ARS', + 'astonvilla': 'AVL', + 'bournemouth': 'BOU', + 'brentford': 'BRE', + 'brightonandhovealbion': 'BHA', + 'chelsea': 'CHE', + 'crystalpalace': 'CRY', + 'everton': 'EVE', + 'fulham': 'FUL', + 'ipswichtown': 'IPS', + 'leicestercity': 'LEI', + 'liverpool': 'LIV', + 'manchestercity': 'MCI', + 'manchesterunited': 'MUN', + 'newcastleunited': 'NEW', + 'nottinghamforest': 'NFO', + 'southampton': 'SOU', + 'tottenhamhotspur': 'TOT', + 'westhamunited': 'WHU', + 'wolverhamptonwanderers': 'WOL', + } + + def _canon_team_key(name: str) -> str: + s = str(name or '') + s = s.lower().replace('&', 'and') + # keep letters and spaces only + import re as _re + s = ''.join(ch if ch.isalpha() or ch.isspace() else ' ' for ch in s) + # collapse whitespace + s = ' '.join(s.split()) + # remove standalone fc/afc tokens + tokens = [t for t in s.split(' ') if t not in ('fc', 'afc')] + return ''.join(tokens) + + def _abbr_team(name: str) -> str: + key = _canon_team_key(name) + if key in PL_ABBR: + return PL_ABBR[key] + # Fallback: build a 3-letter code from initials or first letters + import re as _re + toks = _re.findall(r"[A-Za-z]+", str(name or '')) + toks = [t for t in toks if t.lower() not in ('fc', 'afc')] + if toks: + initials = ''.join(t[0] for t in toks).upper() + if len(initials) >= 3: + return initials[:3] + joined = ''.join(toks).upper() + return (joined + 'XXX')[:3] + return str(name or '')[:3].upper() + cols = [c for c in ['match_day','homeTeam','awayTeam','homeScore','awayScore'] if c in fix.columns] + lab_df = fix[cols].dropna(subset=['match_day']).copy() + def _mk_label(row): + # Only team 
abbreviations, no scores + ht = _abbr_team(row.get('homeTeam', '')) + at = _abbr_team(row.get('awayTeam', '')) + # Use a short separator to keep labels compact + return f"{ht}–{at}" + lab_df['label'] = lab_df.apply(_mk_label, axis=1) + fixtures_by_day = lab_df.groupby('match_day')['label'].apply(list).to_dict() + except Exception: + fixtures_by_day = None + if matchdays is not None: + if 'date' in df.columns: + df['post_day'] = pd.to_datetime(df['date'], errors='coerce').dt.date + df = df.merge(matchdays, how='left', left_on='post_day', right_on='match_day') + df['is_matchday'] = df['match_day'].notna() + df = df.drop(columns=['match_day', 'post_day']) + if replies_df is not None and 'date' in replies_df.columns: + replies_df['reply_day'] = pd.to_datetime(replies_df['date'], errors='coerce').dt.date + replies_df = replies_df.merge(matchdays, how='left', left_on='reply_day', right_on='match_day') + replies_df['is_matchday'] = replies_df['match_day'].notna() + replies_df = replies_df.drop(columns=['match_day', 'reply_day']) + # Also derive parent-based matchday classification for replies if possible + if 'parent_id' in replies_df.columns and 'id' in df.columns and 'is_matchday' in df.columns: + parent_map = df[['id', 'is_matchday']].rename(columns={'id': 'parent_id', 'is_matchday': 'parent_is_matchday'}) + replies_df = replies_df.merge(parent_map, how='left', on='parent_id') + # Diagnostics + try: + posts_md = int(df['is_matchday'].sum()) if 'is_matchday' in df.columns else 0 + replies_md = int(replies_df['is_matchday'].sum()) if (replies_df is not None and 'is_matchday' in replies_df.columns) else 0 + parent_md = int(replies_df['parent_is_matchday'].sum()) if (replies_df is not None and 'parent_is_matchday' in replies_df.columns) else 0 + print(f"[fixtures] Matchday join: posts matchday rows={posts_md}; replies by reply-date matchday rows={replies_md}; replies by parent matchday rows={parent_md}") + except Exception: + pass + + # Per-parent reply tag rollup: 
replies_count_scraped and replies_top_tags + if replies_df is not None and 'parent_id' in replies_df.columns: + # Replies count per parent + rcount = replies_df.groupby('parent_id').agg(replies_count_scraped=('id', 'count')).reset_index() + if 'id' in df.columns: + df = df.merge(rcount, how='left', left_on='id', right_on='parent_id').drop(columns=['parent_id']) + # Top tags per parent (if tagged) + if 'tags' in replies_df.columns: + rtagged = replies_df.copy() + rtagged['tags'] = rtagged['tags'].apply(lambda x: x if isinstance(x, list) else ([] if pd.isna(x) else [x])) + rexpl = rtagged.explode('tags') + rexpl = rexpl[rexpl['tags'].notna() & (rexpl['tags'] != '')] + if not rexpl.empty: + tag_counts = rexpl.groupby(['parent_id', 'tags']).size().reset_index(name='count') + # Build top-3 tag string per parent + def top3(group: pd.DataFrame) -> str: + g = group.sort_values('count', ascending=False).head(3) + return '|'.join(f"{row['tags']}({int(row['count'])})" for _, row in g.iterrows()) + top_tags = tag_counts.groupby('parent_id').apply(top3).reset_index(name='replies_top_tags') + if 'id' in df.columns: + df = df.merge(top_tags, how='left', left_on='id', right_on='parent_id').drop(columns=['parent_id']) + + out = args.output + if out is None: + base, _ = os.path.splitext(args.csv) + out = base + '_report.md' + write_markdown_report(df, out_path=out, channel=args.channel, replies_df=replies_df) + print(f"Report written to {out}") + + if args.write_augmented_csv: + base, ext = os.path.splitext(args.csv) + aug = base + '_tagged.csv' + # Serialize tags list to a semicolon-separated string for CSV + if 'tags' in df.columns: + df_out = df.copy() + df_out['tags'] = df_out['tags'].apply(lambda xs: ';'.join(xs) if isinstance(xs, list) else '') + else: + df_out = df + df_out.to_csv(aug, index=False) + print(f"Augmented CSV written to {aug}") + + # Also write a tagged replies CSV if provided + if replies_df is not None: + rbase, rext = os.path.splitext(args.replies_csv) + raug 
= rbase + '_tagged.csv' + r_out = replies_df.copy() + if 'tags' in r_out.columns: + r_out['tags'] = r_out['tags'].apply(lambda xs: ';'.join(xs) if isinstance(xs, list) else '') + r_out.to_csv(raug, index=False) + print(f"Replies augmented CSV written to {raug}") + + # Optional: write combined posts+replies CSV + if args.write_combined_csv and replies_df is not None: + # Normalize posts columns + p = df.copy() + p['content_type'] = 'post' + # Ensure shared sentiment/tags columns exist + if 'sentiment_compound' not in p.columns and 'message' in p.columns: + analyzer = SentimentIntensityAnalyzer() + p['sentiment_compound'] = p['message'].apply(lambda t: analyzer.polarity_scores(t or '').get('compound')) + # Harmonize tag serialization to list before final serialization + if 'tags' in p.columns: + p_tags = p['tags'] + else: + p['tags'] = [[] for _ in range(len(p))] + + # Normalize replies columns + r = replies_df.copy() + r['content_type'] = 'reply' + # For replies, the post id is parent_id; ensure a common column 'parent_id' exists + if 'parent_id' not in r.columns and 'id' in r.columns: + r['parent_id'] = None + + # Select a union of reasonable columns + sel_cols = [] + for c in ['content_type', 'id', 'parent_id', 'date', 'message', 'sender_id', 'views', 'forwards', 'replies', 'sentiment_compound', 'sentiment_label', 'sentiment_probs', 'url', 'tags', 'is_matchday', 'parent_is_matchday']: + if c in p.columns or c in r.columns: + sel_cols.append(c) + p_sel = p.reindex(columns=sel_cols) + r_sel = r.reindex(columns=sel_cols) + + combined_df = pd.concat([p_sel, r_sel], ignore_index=True) + # Serialize tags for CSV + if 'tags' in combined_df.columns: + combined_df['tags'] = combined_df['tags'].apply(lambda xs: ';'.join(xs) if isinstance(xs, list) else ('' if pd.isna(xs) else str(xs))) + + base, _ = os.path.splitext(args.csv) + comb_path = base + '_combined.csv' + combined_df.to_csv(comb_path, index=False) + print(f"Combined posts+replies CSV written to {comb_path}") + + # 
Optional: save plots + if args.save_plots: + try: + import matplotlib.pyplot as plt + import seaborn as sns + except Exception as e: + print(f"[plots] Skipping plots; matplotlib/seaborn not available: {e}") + else: + out_dir = os.path.dirname(out) or "." + + # Removed: Daily average sentiment (combined posts + replies) + + # 2) Posts heatmap by day-of-week and hour + try: + if 'date' in df.columns and not df.empty: + t = df.dropna(subset=['date']).copy() + if not t.empty: + t['date'] = pd.to_datetime(t['date'], errors='coerce') + t = t.dropna(subset=['date']) + if not t.empty: + t['dow'] = t['date'].dt.day_name() + t['hour'] = t['date'].dt.hour + pivot = t.pivot_table(index='dow', columns='hour', values='id', aggfunc='count').fillna(0) + order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"] + pivot = pivot.reindex(order) + plt.figure(figsize=(10,5)) + sns.heatmap(pivot, cmap='Blues') + plt.title('Posts heatmap by day-of-week and hour') + plt.xlabel('Hour'); plt.ylabel('Day of week') + plt.tight_layout() + plt.savefig(os.path.join(out_dir, 'posts_heatmap_hour_dow.png'), dpi=150) + plt.close() + print(f"[plots] Saved {os.path.join(out_dir, 'posts_heatmap_hour_dow.png')}") + except Exception as e: + print(f"[plots] Failed posts heatmap: {e}") + + # 3) Sentiment shares by tag (posts) stacked bars + try: + if 'tags' in df.columns and ('sentiment_compound' in df.columns or 'sentiment_label' in df.columns): + p = df.copy() + p['tags'] = p['tags'].apply(lambda s: s if isinstance(s, list) else ([] if pd.isna(s) else [s])) + e = p.explode('tags') + # Keep rows with a tag and either a sentiment label or compound value + e = e[(e['tags'].notna()) & (e['tags']!='')] + if 'sentiment_label' in e.columns: + e = e[e['sentiment_label'].notna()] + else: + e = e[e['sentiment_compound'].notna()] + # Filter to team tags only (those starting with 'club_') + e = e[e['tags'].astype(str).str.startswith('club_')] + if not e.empty: + if 'sentiment_label' in 
e.columns: + # Use model-predicted labels when available + lab = e['sentiment_label'].astype(str).str.lower() + e['pos'] = lab.str.contains('pos|positive').astype(int) + e['neg'] = lab.str.contains('neg|negative').astype(int) + e['neu'] = (~(e['pos'].astype(bool) | e['neg'].astype(bool))).astype(int) + else: + # Fallback to compound thresholds + e['pos'] = (e['sentiment_compound'] > 0.05).astype(int) + e['neg'] = (e['sentiment_compound'] < -0.05).astype(int) + e['neu'] = 1 - e['pos'] - e['neg'] + # Group by team tag and compute average shares, include all teams (no top-N cap) + g = e.groupby('tags')[['pos','neu','neg']].mean().sort_values('pos', ascending=False) + # Dynamic width based on number of teams; reuse plot flags + n_teams = len(g.index) + fig_w = max(16, min(float(args.plot_max_width), float(args.plot_width_scale) * n_teams)) + try: + print(f"[plots] sentiment_by_tag_posts: teams={n_teams}, width_in={fig_w:.2f}, scale={float(args.plot_width_scale):.2f}, max={float(args.plot_max_width):.2f}") + except Exception: + pass + fig, ax = plt.subplots(figsize=(fig_w, float(args.plot_height))) + g[['pos','neu','neg']].plot(kind='bar', stacked=True, color=['#2ca02c','#aaaaaa','#d62728'], ax=ax) + ax.set_title('Sentiment shares by team (posts)') + ax.set_ylabel('Share') + # Improve label readability for many teams + for label in ax.get_xticklabels(): + label.set_rotation(45) + label.set_ha('right') + plt.tight_layout() + plt.savefig(os.path.join(out_dir, 'sentiment_by_tag_posts.png'), dpi=150) + plt.close() + print(f"[plots] Saved {os.path.join(out_dir, 'sentiment_by_tag_posts.png')}") + except Exception as e: + print(f"[plots] Failed sentiment-by-tag plot: {e}") + + # Removed: Replies daily average sentiment plot + + # 5) Combined activity: stacked counts by content_type per day + try: + if 'date' in df.columns: + posts_activity = df[['id','date']].dropna().copy() + posts_activity['date'] = pd.to_datetime(posts_activity['date'], errors='coerce') + posts_activity = 
posts_activity.dropna(subset=['date']) + posts_activity['day'] = posts_activity['date'].dt.date + posts_activity['content_type'] = 'post' + combined_act = posts_activity + if replies_df is not None and 'date' in replies_df.columns: + replies_activity = replies_df[['id','date']].dropna().copy() + replies_activity['date'] = pd.to_datetime(replies_activity['date'], errors='coerce') + replies_activity = replies_activity.dropna(subset=['date']) + replies_activity['day'] = replies_activity['date'].dt.date + replies_activity['content_type'] = 'reply' + combined_act = pd.concat([posts_activity, replies_activity], ignore_index=True) + if not combined_act.empty: + pv = combined_act.pivot_table(index='day', columns='content_type', values='id', aggfunc='count').fillna(0) + totals = pv.sum(axis=1) + num_days = len(pv.index) + # Determine top-N days to highlight (0 disables) + req_top_n = int(args.activity_top_n) if hasattr(args, 'activity_top_n') else 5 + top_n = max(0, min(num_days, req_top_n)) + top_days = list(totals.nlargest(top_n).index) if top_n > 0 else [] + # Improve readability for long ranges: scale width and thin x-ticks + # Reuse num_days defined above + # Make the figure wider for better x-axis readability using CLI-tunable params. + # Dynamic width scaled by the number of days, clamped to [16, plot_max_width]. 
+ fig_w = max(16, min(float(args.plot_max_width), float(args.plot_width_scale) * num_days)) + # Debug print to help users verify width computation + try: + print(f"[plots] daily_activity_stacked: days={num_days}, width_in={fig_w:.2f}, scale={float(args.plot_width_scale):.2f}, max={float(args.plot_max_width):.2f}") + except Exception: + pass + fig, ax = plt.subplots(figsize=(fig_w, float(args.plot_height))) + pv.plot(kind='bar', stacked=True, color={'post':'#9467bd','reply':'#8c564b'}, ax=ax) + ax.set_title('Daily activity (posts vs replies)') + ax.set_xlabel('Day'); ax.set_ylabel('Count') + labels_in_band = False + show_pos = None + show_pos_set = set() + # Thin tick labels to ~12 evenly spaced labels for large ranges + try: + import numpy as _np + # Base tick positions (0..num_days-1) and labels + base_idx = list(range(num_days)) + # Positions of top days + highlight_pos = [pv.index.get_loc(d) for d in top_days] + if num_days > 20: + desired = 12 + step = max(1, int(_np.ceil(num_days / desired))) + show_pos = list(range(0, num_days, step)) + # Ensure highlight positions are included + show_pos = sorted(set(show_pos + highlight_pos)) + ax.set_xticks(show_pos) + labels_all = [f"{d} ({d.strftime('%a')})" if hasattr(d, 'strftime') else str(d) for d in pv.index] + show_labels = [labels_all[i] for i in show_pos] + ax.set_xticklabels(show_labels, rotation=45, ha='right') + show_pos_set = set(show_pos) + else: + # Set all labels with day names + labels = [f"{d} ({d.strftime('%a')})" if hasattr(d, 'strftime') else str(d) for d in pv.index] + ax.set_xticks(base_idx) + ax.set_xticklabels(labels) + for label in ax.get_xticklabels(): + label.set_rotation(45) + label.set_ha('right') + show_pos = base_idx + show_pos_set = set(base_idx) + # Color highlighted tick labels and annotate totals + # After setting ticks/labels, get back the positions we set + current_ticks = ax.get_xticks() + tick_to_pos = {i: i for i in current_ticks} + # Map current tick order to positions for styling 
+ for tick_label, xpos in zip(ax.get_xticklabels(), current_ticks): + pos_int = int(round(xpos)) + if pos_int in highlight_pos: + tick_label.set_color('crimson') + tick_label.set_fontweight('bold') + # Annotate total above the stacked bar + y = float(totals.iloc[pos_int]) + # Compute breakdown for this day + try: + p_val = float(pv.iloc[pos_int]['post']) if 'post' in pv.columns else 0.0 + except Exception: + p_val = 0.0 + try: + r_val = float(pv.iloc[pos_int]['reply']) if 'reply' in pv.columns else 0.0 + except Exception: + r_val = 0.0 + lbl = f"{int(y)} ({int(p_val)}+{int(r_val)})" + ax.text(pos_int, y, lbl, color='crimson', fontsize=8, fontweight='bold', ha='center', va='bottom') + except Exception: + pass + # If fixtures are available, annotate games per day above bars + try: + if fixtures_by_day is not None and len(fixtures_by_day) > 0: + # Reserve a fixed band above the bars for match labels (axes coordinates) + from matplotlib import transforms as _mtrans + # Diagnostics: see how many pivot days have fixtures + try: + keys = list(fixtures_by_day.keys()) + matched_days = sum(1 for d in pv.index if d in fixtures_by_day) + print(f"[plots] fixtures days={len(keys)}; pivot days={len(pv.index)}; matched days={matched_days}") + except Exception: + pass + annotated_days = 0 + # Fixed band just above the axes (y in axes coords) + trans_xdata_yaxes = ax.get_xaxis_transform() + y_band = float(getattr(args, 'labels_band_y', 0.96)) + rows = max(1, int(getattr(args, 'labels_stagger_rows', 2))) + rows = min(rows, 4) + offset_step = 0.055 # vertical offset between stagger rows (in axes coords) + # Write a small debug CSV of expected labels + try: + dbg_path = os.path.join(out_dir, 'match_labels_debug.csv') + _rows = [] + for d, labs in fixtures_by_day.items(): + _rows.append({'day': str(d), 'labels': ' | '.join(str(x) for x in labs)}) + pd.DataFrame(_rows).to_csv(dbg_path, index=False) + print(f"[plots] wrote {dbg_path} with {len(_rows)} days") + except Exception: + pass + # 
Determine which positions to annotate based on mode + mode = getattr(args, 'labels_annotate_mode', 'ticks+top') + pos_all = set(range(num_days)) + pos_ticks = set(show_pos or []) + pos_top = set(highlight_pos) + if mode == 'all': + annotate_positions = pos_all + elif mode == 'ticks': + annotate_positions = pos_ticks + else: # ticks+top (default) + annotate_positions = pos_ticks | pos_top + + max_per_day = max(1, int(getattr(args, 'labels_max_per_day', 3))) + per_line = max(1, int(getattr(args, 'labels_per_line', 2))) + + def _chunk(xs, n): + return [xs[i:i+n] for i in range(0, len(xs), n)] + + for i, day in enumerate(pv.index): + if i not in annotate_positions: + continue + labels = fixtures_by_day.get(day) + if not labels: + continue + labs_all = [str(x) for x in labels] + if len(labs_all) > max_per_day: + extra = len(labs_all) - max_per_day + labs = labs_all[:max_per_day] + [f"+{extra} more"] + else: + labs = labs_all + # Build multi-line text: per_line entries per row + lines = [' • '.join(chunk) for chunk in _chunk(labs, per_line)] + text = '\n'.join(lines) + # Stagger vertically by index to reduce neighbor collisions + row_id = i % rows + # Stagger downward inside the axes, away from the title + y = y_band - (row_id * offset_step) + # Keep within the axes area + y = max(0.02, min(0.98, y)) + # Center above the bar; small bbox for readability + ax.text(i, y, text, + fontsize=7, ha='center', va='bottom', rotation=0, + clip_on=False, zorder=5, color='forestgreen', transform=trans_xdata_yaxes, + bbox=dict(facecolor='white', edgecolor='none', alpha=0.6, pad=1.5)) + annotated_days += 1 + if annotated_days > 0: + # Leave extra headroom above axes for the label band + try: + labels_in_band = True + # Add y-margin so tallest bars don't collide with labels + base_margin = 0.10 + extra = (rows - 1) * 0.03 + ax.margins(y=min(0.30, base_margin + extra)) + print(f"[plots] match labels annotated (inside band): days={annotated_days}; mode={mode}; max/day={max_per_day}; 
per_line={per_line}; rows={rows}; y_band={y_band:.2f}") + except Exception: + pass + except Exception as e: + print(f"[plots] match labels annotation skipped: {e}") + # First tighten layout, then reserve top margin if label band is used + plt.tight_layout() + try: + # If labels are placed inside (y_band < 1), no need to push the title + pass + except Exception: + pass + plt.savefig(os.path.join(out_dir, 'daily_activity_stacked.png'), dpi=150) + plt.close() + print(f"[plots] Saved {os.path.join(out_dir, 'daily_activity_stacked.png')}") + except Exception as e: + print(f"[plots] Failed daily activity stacked: {e}") + + # 5b) Daily volume (posts+replies) with positive/negative sentiment shares (twin y-axes) + try: + if 'date' in df.columns: + # Build per-day combined data with sentiment flags + parts = [] + # Posts + p = df[['id','date']].copy() + p['date'] = pd.to_datetime(p['date'], errors='coerce') + p = p.dropna(subset=['date']) + if not p.empty: + if 'sentiment_label' in df.columns and df['sentiment_label'].notna().any(): + lab = df.loc[p.index, 'sentiment_label'].astype(str).str.lower() + p['is_pos'] = lab.str.contains('pos|positive', regex=True, na=False) + p['is_neg'] = lab.str.contains('neg|negative', regex=True, na=False) + else: + # Fallback to compound thresholds + if 'sentiment_compound' in df.columns: + sc = pd.to_numeric(df.loc[p.index, 'sentiment_compound'], errors='coerce') + p['is_pos'] = sc > 0.05 + p['is_neg'] = sc < -0.05 + else: + p['is_pos'] = False + p['is_neg'] = False + p['day'] = p['date'].dt.date + parts.append(p[['day','is_pos','is_neg']]) + # Replies + if replies_df is not None and 'date' in replies_df.columns: + r = replies_df[['id','date']].copy() + r['date'] = pd.to_datetime(r['date'], errors='coerce') + r = r.dropna(subset=['date']) + if not r.empty: + if 'sentiment_label' in replies_df.columns and replies_df['sentiment_label'].notna().any(): + lab = replies_df.loc[r.index, 'sentiment_label'].astype(str).str.lower() + r['is_pos'] = 
lab.str.contains('pos|positive', regex=True, na=False) + r['is_neg'] = lab.str.contains('neg|negative', regex=True, na=False) + else: + if 'sentiment_compound' in replies_df.columns: + sc = pd.to_numeric(replies_df.loc[r.index, 'sentiment_compound'], errors='coerce') + r['is_pos'] = sc > 0.05 + r['is_neg'] = sc < -0.05 + else: + r['is_pos'] = False + r['is_neg'] = False + r['day'] = r['date'].dt.date + parts.append(r[['day','is_pos','is_neg']]) + if parts: + all_rows = pd.concat(parts, ignore_index=True) + grp = ( + all_rows.groupby('day') + .agg( + volume_total=('is_pos','count'), + pos_share=('is_pos','mean'), + neg_share=('is_neg','mean') + ) + .sort_index() + ) + if not grp.empty: + num_days = len(grp.index) + fig_w = max(16, min(float(args.plot_max_width), float(args.plot_width_scale) * num_days)) + import matplotlib.pyplot as _plt + from matplotlib.ticker import PercentFormatter as _PercentFormatter + try: + print(f"[plots] daily_volume_and_sentiment: days={num_days}, width_in={fig_w:.2f}") + except Exception: + pass + fig, ax1 = _plt.subplots(figsize=(fig_w, float(args.plot_height))) + x = range(num_days) + # Bars: total volume (posts+replies) + ax1.bar(x, grp['volume_total'], color='#6baed6', alpha=0.8, label='Volume (posts+replies)') + ax1.set_xlabel('Day') + ax1.set_ylabel('Volume', color='#335') + ax1.tick_params(axis='y', labelcolor='#335') + # Format x-ticks with dates + xticklabels = [f"{d} ({d.strftime('%a')})" if hasattr(d, 'strftime') else str(d) for d in grp.index] + ax1.set_xticks(list(x)) + ax1.set_xticklabels(xticklabels, rotation=45, ha='right') + # Lines: positive and negative sentiment shares + ax2 = ax1.twinx() + ax2.plot(x, grp['pos_share'].fillna(0), color='#2ca02c', marker='o', linewidth=1.5, label='Positive %') + ax2.plot(x, grp['neg_share'].fillna(0), color='#d62728', marker='o', linewidth=1.5, label='Negative %') + ax2.set_ylim(0, 1) + ax2.yaxis.set_major_formatter(_PercentFormatter(xmax=1.0)) + ax2.set_ylabel('Sentiment share', 
color='#333') + ax2.tick_params(axis='y', labelcolor='#333') + # Build a combined legend + lines1, labels1 = ax1.get_legend_handles_labels() + lines2, labels2 = ax2.get_legend_handles_labels() + ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper left') + _plt.title('Daily volume vs positive/negative sentiment') + _plt.tight_layout() + outp = os.path.join(out_dir, 'daily_volume_and_sentiment.png') + _plt.savefig(outp, dpi=150) + _plt.close() + print(f"[plots] Saved {outp}") + except Exception as e: + print(f"[plots] Failed daily volume and sentiment plot: {e}") + + # 6) Tag co-occurrence heatmap (posts, top 15 tags) + try: + if 'tags' in df.columns: + # Prepare list of tag lists per message + tags_series = df['tags'].apply(lambda s: s if isinstance(s, list) else ([] if pd.isna(s) else [s])) + # Frequency of tags + from collections import Counter + freq = Counter() + for ts in tags_series: + freq.update(set([t for t in ts if t])) + # Removed: tag co-occurrence heatmap + except Exception as e: + print(f"[plots] Failed tag co-occurrence heatmap: {e}") + + # Removed: matchday boxplots (posts and replies) + + # 7) Overall matchday sentiment (posts and replies) + try: + if fixtures_present and 'date' in df.columns: + # Prepare posts per-day sentiment + pd_posts = df.copy() + pd_posts['date'] = pd.to_datetime(pd_posts['date'], errors='coerce') + pd_posts = pd_posts.dropna(subset=['date']) + if not pd_posts.empty and 'sentiment_compound' in pd_posts.columns: + pd_posts['day'] = pd_posts['date'].dt.date + g_posts = pd_posts.groupby('day').agg( + posts_n=('id','count'), + posts_mean=('sentiment_compound','mean') + ) + # Optional label-based shares + if 'sentiment_label' in pd_posts.columns: + lab = pd_posts[['day','sentiment_label']].dropna() + lab_s = lab['sentiment_label'].astype(str).str.lower() + lab['pos'] = lab_s.str.contains('pos|positive') + lab['neg'] = lab_s.str.contains('neg|negative') + s_posts = lab.groupby('day').agg(posts_pos_share=('pos','mean'), 
posts_neg_share=('neg','mean')) + g_posts = g_posts.join(s_posts, how='left') + else: + g_posts = None + + # Prepare replies per-day sentiment if available + g_replies = None + if replies_df is not None and 'date' in replies_df.columns and 'sentiment_compound' in replies_df.columns: + pd_rep = replies_df.copy() + pd_rep['date'] = pd.to_datetime(pd_rep['date'], errors='coerce') + pd_rep = pd_rep.dropna(subset=['date']) + if not pd_rep.empty: + pd_rep['day'] = pd_rep['date'].dt.date + g_replies = pd_rep.groupby('day').agg( + replies_n=('id','count'), + replies_mean=('sentiment_compound','mean') + ) + if 'sentiment_label' in pd_rep.columns: + lab = pd_rep[['day','sentiment_label']].dropna() + lab_s = lab['sentiment_label'].astype(str).str.lower() + lab['pos'] = lab_s.str.contains('pos|positive') + lab['neg'] = lab_s.str.contains('neg|negative') + s_rep = lab.groupby('day').agg(replies_pos_share=('pos','mean'), replies_neg_share=('neg','mean')) + g_replies = g_replies.join(s_rep, how='left') + + # Build fixtures day index + fix_days = None + try: + # re-use 'fix' if available; else build from fixtures_by_day keys + if 'fix' in locals() and isinstance(fix, pd.DataFrame) and 'utcDate' in fix.columns: + ftmp = fix.copy() + ftmp['utcDate'] = pd.to_datetime(ftmp['utcDate'], errors='coerce') + fix_days = ftmp.dropna(subset=['utcDate'])['utcDate'].dt.date.drop_duplicates().sort_values() + elif fixtures_by_day is not None: + fix_days = pd.Series(sorted(list(fixtures_by_day.keys()))) + except Exception: + pass + + if fix_days is not None: + # Join per-day aggregates on fixture days only + idx = pd.Index(fix_days, name='day') + agg = pd.DataFrame(index=idx) + if g_posts is not None: + agg = agg.join(g_posts, how='left') + if g_replies is not None: + agg = agg.join(g_replies, how='left') + out_csv = os.path.join(out_dir, 'matchday_sentiment_overall.csv') + agg.reset_index().to_csv(out_csv, index=False) + print(f"[plots] Wrote {out_csv}") + + # Plot time series of mean compound 
import argparse
import os

import pandas as pd


def read_csv(path: str) -> pd.DataFrame:
    """Read *path* into a DataFrame, exiting with a clear message if absent."""
    if not os.path.exists(path):
        raise SystemExit(f"CSV not found: {path}")
    return pd.read_csv(path)


def main():
    """Merge a labeled-sentiment CSV onto posts and replies CSVs by message id.

    Writes ``<input>_with_labels.csv`` files (or the explicit --posts-out /
    --replies-out paths) carrying a ``sentiment_label`` column, plus
    ``confidence`` when present in the labeled CSV.
    """
    p = argparse.ArgumentParser(description='Apply labeled sentiments to posts/replies CSVs for analysis plots.')
    p.add_argument('--labeled-csv', required=True, help='Path to labeled_sentiment.csv (must include id and label columns)')
    p.add_argument('--posts-csv', required=True, help='Original posts CSV')
    p.add_argument('--replies-csv', required=True, help='Original replies CSV')
    p.add_argument('--posts-out', default=None, help='Output posts CSV path (default: with _with_labels suffix)')
    p.add_argument('--replies-out', default=None, help='Output replies CSV path (default: with _with_labels suffix)')
    args = p.parse_args()

    labeled = read_csv(args.labeled_csv)
    if 'id' not in labeled.columns:
        raise SystemExit('labeled CSV must include an id column to merge on')
    # Normalize the label column name to sentiment_label.
    lab_col = 'label' if 'label' in labeled.columns else ('sentiment_label' if 'sentiment_label' in labeled.columns else None)
    if lab_col is None:
        raise SystemExit("labeled CSV must include a 'label' or 'sentiment_label' column")
    labeled = labeled[['id', lab_col] + (['confidence'] if 'confidence' in labeled.columns else [])].copy()
    labeled = labeled.rename(columns={lab_col: 'sentiment_label'})
    # A labeled CSV can accidentally contain the same id twice (e.g. several
    # labeling runs appended to one file); keep the first occurrence so the
    # validate='m:1' merges below cannot raise pandas.errors.MergeError.
    labeled = labeled.drop_duplicates(subset=['id'], keep='first')

    posts = read_csv(args.posts_csv)
    replies = read_csv(args.replies_csv)

    if 'id' not in posts.columns or 'id' not in replies.columns:
        raise SystemExit('posts/replies CSVs must include id columns')

    posts_out = args.posts_out or os.path.splitext(args.posts_csv)[0] + '_with_labels.csv'
    replies_out = args.replies_out or os.path.splitext(args.replies_csv)[0] + '_with_labels.csv'

    # validate='m:1' asserts ids are unique on the labeled side (true after dedup).
    posts_merged = posts.merge(labeled, how='left', on='id', validate='m:1')
    replies_merged = replies.merge(labeled, how='left', on='id', validate='m:1')

    posts_merged.to_csv(posts_out, index=False)
    replies_merged.to_csv(replies_out, index=False)
    print(f"Wrote posts with labels -> {posts_out} (rows={len(posts_merged)})")
    print(f"Wrote replies with labels -> {replies_out} (rows={len(replies_merged)})")


if __name__ == '__main__':
    main()
import argparse
import os

import pandas as pd


def parse_tags_column(series: pd.Series) -> pd.Series:
    """Parse the augmented-CSV ``tags`` column into Python lists.

    Accepts values that are already lists, NaN/None (-> []), and delimited
    strings — semicolon-delimited as written by analyze, comma tolerated.
    """
    def _to_list(x):
        if isinstance(x, list):
            return x
        if pd.isna(x):
            return []
        s = str(x)
        # Expect semicolon-delimited from augmented CSV, but also accept comma
        if ';' in s:
            return [t.strip() for t in s.split(';') if t.strip()]
        if ',' in s:
            return [t.strip() for t in s.split(',') if t.strip()]
        return [s] if s else []
    return series.apply(_to_list)


def main():
    """Audit per-team (club_*) sentiment shares and export sample rows for inspection."""
    parser = argparse.ArgumentParser(description='Audit sentiment per team tag and export samples for inspection.')
    parser.add_argument('--csv', default='data/premier_league_update_tagged.csv', help='Tagged posts CSV (augmented by analyze)')
    parser.add_argument('--team', default='club_manchester_united', help='Team tag to export samples for (e.g., club_manchester_united)')
    parser.add_argument('--out-dir', default='data', help='Directory to write audit outputs')
    parser.add_argument('--samples', type=int, default=25, help='Number of samples to export for the specified team')
    parser.add_argument('--with-vader', action='store_true', help='Also compute VADER-based sentiment shares as a sanity check')
    args = parser.parse_args()

    if not os.path.exists(args.csv):
        raise SystemExit(f"CSV not found: {args.csv}. Run analyze with --write-augmented-csv first.")

    df = pd.read_csv(args.csv)
    if 'message' not in df.columns:
        raise SystemExit('CSV missing message column')
    if 'sentiment_compound' not in df.columns:
        raise SystemExit('CSV missing sentiment_compound column')
    if 'tags' not in df.columns:
        raise SystemExit('CSV missing tags column')

    df = df.copy()
    df['tags'] = parse_tags_column(df['tags'])
    # Filter to team tags (prefix club_)
    e = df.explode('tags')
    e = e[e['tags'].notna() & (e['tags'] != '')]
    e = e[e['tags'].astype(str).str.startswith('club_')]
    # A re-exported CSV can carry sentiment_compound as strings, which would
    # make the >/< threshold comparisons below raise; coerce to numeric first
    # (non-parsable values become NaN and are dropped).
    e = e.assign(sentiment_compound=pd.to_numeric(e['sentiment_compound'], errors='coerce'))
    e = e.dropna(subset=['sentiment_compound'])
    if e.empty:
        print('No team-tagged rows found.')
        return

    # Shares
    e = e.copy()
    e['is_pos'] = e['sentiment_compound'] > 0.05
    e['is_neg'] = e['sentiment_compound'] < -0.05
    grp = (
        e.groupby('tags')
        .agg(
            n=('sentiment_compound', 'count'),
            mean=('sentiment_compound', 'mean'),
            median=('sentiment_compound', 'median'),
            pos_share=('is_pos', 'mean'),
            neg_share=('is_neg', 'mean'),
        )
        .reset_index()
    )
    grp['neu_share'] = (1 - grp['pos_share'] - grp['neg_share']).clip(lower=0)
    grp = grp.sort_values(['n', 'mean'], ascending=[False, False])

    if args.with_vader:
        # Lazy third-party import: keeps this module importable (e.g. for
        # parse_tags_column) without vaderSentiment installed.
        from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
        # Compute VADER shares on the underlying messages per team
        analyzer = SentimentIntensityAnalyzer()

        def _vader_sentiment_share(sub: pd.DataFrame):
            # Per-team pos/neg/neu shares recomputed directly with VADER.
            if sub.empty:
                return pd.Series({'pos_share_vader': 0.0, 'neg_share_vader': 0.0, 'neu_share_vader': 0.0})
            scores = sub['message'].astype(str).apply(lambda t: analyzer.polarity_scores(t or '')['compound'])
            pos = (scores > 0.05).mean()
            neg = (scores < -0.05).mean()
            neu = max(0.0, 1.0 - pos - neg)
            return pd.Series({'pos_share_vader': pos, 'neg_share_vader': neg, 'neu_share_vader': neu})

        vader_grp = e.groupby('tags').apply(_vader_sentiment_share).reset_index()
        grp = grp.merge(vader_grp, on='tags', how='left')

    os.makedirs(args.out_dir, exist_ok=True)
    out_summary = os.path.join(args.out_dir, 'team_sentiment_audit.csv')
    grp.to_csv(out_summary, index=False)
    print(f"Wrote summary: {out_summary}")

    # Export samples for selected team
    te = e[e['tags'] == args.team].copy()
    if te.empty:
        print(f"No rows for team tag: {args.team}")
        return
    # Sort by sentiment descending to inspect highly positive claims
    te = te.sort_values('sentiment_compound', ascending=False)
    cols = [c for c in ['id', 'date', 'message', 'sentiment_compound', 'url'] if c in te.columns]
    samples_path = os.path.join(args.out_dir, f"{args.team}_samples.csv")
    te[cols].head(args.samples).to_csv(samples_path, index=False)
    print(f"Wrote samples: {samples_path} ({min(args.samples, len(te))} rows)")


if __name__ == '__main__':
    main()
'team_sentiment_audit.csv') + grp.to_csv(out_summary, index=False) + print(f"Wrote summary: {out_summary}") + + # Export samples for selected team + te = e[e['tags'] == args.team].copy() + if te.empty: + print(f"No rows for team tag: {args.team}") + return + # Sort by sentiment descending to inspect highly positive claims + te = te.sort_values('sentiment_compound', ascending=False) + cols = [c for c in ['id', 'date', 'message', 'sentiment_compound', 'url'] if c in te.columns] + samples_path = os.path.join(args.out_dir, f"{args.team}_samples.csv") + te[cols].head(args.samples).to_csv(samples_path, index=False) + print(f"Wrote samples: {samples_path} ({min(args.samples, len(te))} rows)") + + +if __name__ == '__main__': + main() diff --git a/src/auto_label_sentiment.py b/src/auto_label_sentiment.py new file mode 100644 index 0000000..a6fa5ef --- /dev/null +++ b/src/auto_label_sentiment.py @@ -0,0 +1,218 @@ +import argparse +import os +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer + +try: + # Allow both package and direct script execution + from .make_labeling_set import load_messages as _load_messages +except Exception: + from make_labeling_set import load_messages as _load_messages + + +def _combine_inputs(posts_csv: Optional[str], replies_csv: Optional[str], text_col: str = 'message', min_length: int = 3) -> pd.DataFrame: + frames: List[pd.DataFrame] = [] + if posts_csv: + frames.append(_load_messages(posts_csv, text_col=text_col)) + if replies_csv: + # include parent_id if present for replies + frames.append(_load_messages(replies_csv, text_col=text_col, extra_cols=['parent_id'])) + if not frames: + raise SystemExit('No input provided. 
Use --input-csv or --posts-csv/--replies-csv') + df = pd.concat(frames, ignore_index=True) + df['message'] = df['message'].fillna('').astype(str) + df = df[df['message'].str.len() >= min_length] + df = df.drop_duplicates(subset=['message']).reset_index(drop=True) + return df + + +def _map_label_str_to_int(labels: List[str]) -> List[int]: + mapping = {'neg': 0, 'negative': 0, 'neu': 1, 'neutral': 1, 'pos': 2, 'positive': 2} + out: List[int] = [] + for lab in labels: + lab_l = (lab or '').lower() + if lab_l in mapping: + out.append(mapping[lab_l]) + else: + # fallback: try to parse integer + try: + out.append(int(lab)) + except Exception: + out.append(1) # default to neutral + return out + + +def _vader_label(compound: float, pos_th: float, neg_th: float) -> str: + if compound >= pos_th: + return 'pos' + if compound <= neg_th: + return 'neg' + return 'neu' + + +def _auto_label_vader(texts: List[str], pos_th: float, neg_th: float, min_margin: float) -> Tuple[List[str], List[float]]: + analyzer = SentimentIntensityAnalyzer() + labels: List[str] = [] + confs: List[float] = [] + for t in texts: + s = analyzer.polarity_scores(t or '') + comp = float(s.get('compound', 0.0)) + lab = _vader_label(comp, pos_th, neg_th) + # Confidence heuristic: distance from neutral band edges + if lab == 'pos': + conf = max(0.0, comp - pos_th) + elif lab == 'neg': + conf = max(0.0, abs(comp - neg_th)) + else: + # closer to 0 is more neutral; confidence inversely related to |compound| + conf = max(0.0, (pos_th - abs(comp))) + labels.append(lab) + confs.append(conf) + # Normalize confidence roughly to [0,1] by clipping with a reasonable scale + confs = [min(1.0, c / max(1e-6, min_margin)) for c in confs] + return labels, confs + + +def _auto_label_transformers(texts: List[str], model_name_or_path: str, batch_size: int, min_prob: float, min_margin: float) -> Tuple[List[str], List[float]]: + try: + from .transformer_sentiment import TransformerSentiment + except Exception: + from 
transformer_sentiment import TransformerSentiment + + clf = TransformerSentiment(model_name_or_path) + probs_all, labels_all = clf.predict_probs_and_labels(texts, batch_size=batch_size) + confs: List[float] = [] + for row in probs_all: + row = np.array(row, dtype=float) + if row.size == 0: + confs.append(0.0) + continue + top2 = np.sort(row)[-2:] if row.size >= 2 else np.array([0.0, row.max()]) + max_p = float(row.max()) + margin = float(top2[-1] - top2[-2]) if row.size >= 2 else max_p + # Confidence must satisfy both conditions + conf = min(max(0.0, (max_p - min_prob) / max(1e-6, 1 - min_prob)), max(0.0, margin / max(1e-6, min_margin))) + confs.append(conf) + # Map arbitrary id2label names to canonical 'neg/neu/pos' when obvious; else keep as-is + canonical = [] + for lab in labels_all: + ll = (lab or '').lower() + if 'neg' in ll: + canonical.append('neg') + elif 'neu' in ll or 'neutral' in ll: + canonical.append('neu') + elif 'pos' in ll or 'positive' in ll: + canonical.append('pos') + else: + canonical.append(lab) + return canonical, confs + + +def main(): + parser = argparse.ArgumentParser(description='Automatically label sentiment without manual annotation.') + src = parser.add_mutually_exclusive_group(required=True) + src.add_argument('--input-csv', help='Single CSV containing a text column (default: message)') + src.add_argument('--posts-csv', help='Posts CSV to include') + parser.add_argument('--replies-csv', help='Replies CSV to include (combined with posts if provided)') + parser.add_argument('--text-col', default='message', help='Text column name in input CSV(s)') + parser.add_argument('-o', '--output', default='data/labeled_sentiment.csv', help='Output labeled CSV path') + parser.add_argument('--limit', type=int, default=None, help='Optional cap on number of rows') + parser.add_argument('--min-length', type=int, default=3, help='Minimum text length to consider') + + parser.add_argument('--backend', choices=['vader', 'transformers', 'gpt'], 
default='vader', help='Labeling backend: vader, transformers, or gpt (local via Ollama)') + # VADER knobs + parser.add_argument('--vader-pos', type=float, default=0.05, help='VADER positive threshold (compound >=)') + parser.add_argument('--vader-neg', type=float, default=-0.05, help='VADER negative threshold (compound <=)') + parser.add_argument('--vader-margin', type=float, default=0.2, help='Confidence scaling for VADER distance') + # Transformers knobs + parser.add_argument('--transformers-model', default='cardiffnlp/twitter-roberta-base-sentiment-latest', help='HF model for 3-class sentiment') + parser.add_argument('--batch-size', type=int, default=64) + parser.add_argument('--min-prob', type=float, default=0.6, help='Min top class probability to accept') + parser.add_argument('--min-margin', type=float, default=0.2, help='Min prob gap between top-1 and top-2 to accept') + + # GPT knobs + parser.add_argument('--gpt-model', default='llama3', help='Local GPT model name (Ollama)') + parser.add_argument('--gpt-base-url', default='http://localhost:11434', help='Base URL for local GPT server (Ollama)') + parser.add_argument('--gpt-batch-size', type=int, default=8) + + parser.add_argument('--label-format', choices=['str', 'int'], default='str', help="Output labels as strings ('neg/neu/pos') or integers (0/1/2)") + parser.add_argument('--only-confident', action='store_true', help='Drop rows that do not meet confidence thresholds') + + args = parser.parse_args() + + # Load inputs + if args.input_csv: + if not os.path.exists(args.input_csv): + raise SystemExit(f"Input CSV not found: {args.input_csv}") + df = pd.read_csv(args.input_csv) + if args.text_col not in df.columns: + raise SystemExit(f"Text column '{args.text_col}' not in {args.input_csv}") + df = df.copy() + df['message'] = df[args.text_col].astype(str) + base_cols = [c for c in ['id', 'date', 'message', 'url'] if c in df.columns] + df = df[base_cols if base_cols else ['message']] + df = 
df[df['message'].str.len() >= args.min_length] + df = df.drop_duplicates(subset=['message']).reset_index(drop=True) + else: + df = _combine_inputs(args.posts_csv, args.replies_csv, text_col=args.text_col, min_length=args.min_length) + + if args.limit and len(df) > args.limit: + df = df.head(args.limit) + + texts = df['message'].astype(str).tolist() + + # Predict labels + confidence + if args.backend == 'vader': + labels, conf = _auto_label_vader(texts, pos_th=args.vader_pos, neg_th=args.vader_neg, min_margin=args.vader_margin) + # For VADER, define acceptance: confident if outside neutral band by at least margin, or inside band with closeness to 0 below threshold + accept = [] + analyzer = SentimentIntensityAnalyzer() + for t in texts: + comp = analyzer.polarity_scores(t or '').get('compound') + if comp is None: + accept.append(False) + continue + comp = float(comp) + if comp >= args.vader_pos + args.vader_margin or comp <= args.vader_neg - args.vader_margin: + accept.append(True) + else: + # inside or near band -> consider less confident + accept.append(False) + elif args.backend == 'transformers': + labels, conf = _auto_label_transformers(texts, args.transformers_model, args.batch_size, args.min_prob, args.min_margin) + accept = [((c >= 1.0)) or ((c >= 0.5)) for c in conf] # normalize conf ~[0,1]; accept medium-high confidence + else: + # GPT backend via Ollama: expect label+confidence + try: + from .gpt_sentiment import GPTSentiment + except Exception: + from gpt_sentiment import GPTSentiment + clf = GPTSentiment(base_url=args.gpt_base_url, model=args.gpt_model) + labels, conf = clf.predict_label_conf_batch(texts, batch_size=args.gpt_batch_size) + # Accept medium-high confidence; simple threshold like transformers path + accept = [c >= 0.5 for c in conf] + + out = df.copy() + out.insert(1, 'label', labels) + out['confidence'] = conf + + if args.only_confident: + out = out[np.array(accept, dtype=bool)] + out = out.reset_index(drop=True) + + if args.label_format 
== 'int': + out['label'] = _map_label_str_to_int(out['label'].astype(str).tolist()) + + os.makedirs(os.path.dirname(args.output) or '.', exist_ok=True) + out.to_csv(args.output, index=False) + kept = len(out) + print(f"Wrote {kept} labeled rows to {args.output} using backend={args.backend}") + if args.only_confident: + print("Note: only confident predictions were kept. You can remove --only-confident to include all rows.") + + +if __name__ == '__main__': + main() diff --git a/src/eval_sentiment.py b/src/eval_sentiment.py new file mode 100644 index 0000000..d2589e1 --- /dev/null +++ b/src/eval_sentiment.py @@ -0,0 +1,48 @@ +import argparse +import numpy as np +import pandas as pd +from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report + +try: + from .transformer_sentiment import TransformerSentiment +except ImportError: + # Allow running as a script via -m src.eval_sentiment + from transformer_sentiment import TransformerSentiment + + +def main(): + parser = argparse.ArgumentParser(description='Evaluate a fine-tuned transformers sentiment model on a labeled CSV') + parser.add_argument('--csv', required=True, help='Labeled CSV path with message and label columns') + parser.add_argument('--text-col', default='message') + parser.add_argument('--label-col', default='label') + parser.add_argument('--model', required=True, help='Model name or local path') + parser.add_argument('--batch-size', type=int, default=64) + args = parser.parse_args() + + df = pd.read_csv(args.csv) + df = df[[args.text_col, args.label_col]].dropna().copy() + texts = df[args.text_col].astype(str).tolist() + true_labels = df[args.label_col].astype(str).tolist() + + clf = TransformerSentiment(args.model) + _, pred_labels = clf.predict_probs_and_labels(texts, batch_size=args.batch_size) + + y_true = np.array(true_labels) + y_pred = np.array(pred_labels) + + # If labels differ from model id2label names, normalize to strings for comparison + acc = 
accuracy_score(y_true, y_pred) + f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0) + prec_macro = precision_score(y_true, y_pred, average='macro', zero_division=0) + rec_macro = recall_score(y_true, y_pred, average='macro', zero_division=0) + + print('Accuracy:', f"{acc:.4f}") + print('F1 (macro):', f"{f1_macro:.4f}") + print('Precision (macro):', f"{prec_macro:.4f}") + print('Recall (macro):', f"{rec_macro:.4f}") + print('\nClassification report:') + print(classification_report(y_true, y_pred, zero_division=0)) + + +if __name__ == '__main__': + main() diff --git a/src/fetch_schedule.py b/src/fetch_schedule.py new file mode 100644 index 0000000..1ebe651 --- /dev/null +++ b/src/fetch_schedule.py @@ -0,0 +1,131 @@ +import argparse +import csv +import os +from datetime import datetime +from typing import Any, Dict, List, Optional + +import requests +from dotenv import load_dotenv + +API_BASE = "https://api.football-data.org/v4" +COMPETITION_CODE = "PL" # Premier League + + +def iso_date(d: str) -> str: + # Accept YYYY-MM-DD and return ISO date + try: + return datetime.fromisoformat(d).date().isoformat() + except Exception as e: + raise argparse.ArgumentTypeError(f"Invalid date: {d}. 
Use YYYY-MM-DD") from e + + +def fetch_matches(start_date: str, end_date: str, token: str) -> Dict[str, Any]: + url = f"{API_BASE}/competitions/{COMPETITION_CODE}/matches" + headers = {"X-Auth-Token": token} + params = { + "dateFrom": start_date, + "dateTo": end_date, + } + r = requests.get(url, headers=headers, params=params, timeout=30) + r.raise_for_status() + return r.json() + + +def normalize_match(m: Dict[str, Any]) -> Dict[str, Any]: + utc_date = m.get("utcDate") + # Convert to date/time strings + kick_iso = None + if utc_date: + try: + kick_iso = datetime.fromisoformat(utc_date.replace("Z", "+00:00")).isoformat() + except Exception: + kick_iso = utc_date + score = m.get("score", {}) + full_time = score.get("fullTime", {}) + + return { + "id": m.get("id"), + "status": m.get("status"), + "matchday": m.get("matchday"), + "utcDate": kick_iso, + "homeTeam": (m.get("homeTeam") or {}).get("name"), + "awayTeam": (m.get("awayTeam") or {}).get("name"), + "homeScore": full_time.get("home"), + "awayScore": full_time.get("away"), + "referees": ", ".join([r.get("name", "") for r in m.get("referees", []) if r.get("name")]), + "venue": m.get("area", {}).get("name"), + "competition": (m.get("competition") or {}).get("name"), + "stage": m.get("stage"), + "group": m.get("group"), + "link": m.get("id") and f"https://www.football-data.org/match/{m['id']}" or None, + } + + +def save_csv(matches: List[Dict[str, Any]], out_path: str) -> None: + if not matches: + # Write header only + fields = [ + "id", + "status", + "matchday", + "utcDate", + "homeTeam", + "awayTeam", + "homeScore", + "awayScore", + "referees", + "venue", + "competition", + "stage", + "group", + "link", + ] + with open(out_path, "w", newline="", encoding="utf-8") as f: + writer = csv.DictWriter(f, fieldnames=fields) + writer.writeheader() + return + fields = list(matches[0].keys()) + with open(out_path, "w", newline="", encoding="utf-8") as f: + writer = csv.DictWriter(f, fieldnames=fields) + writer.writeheader() 
def save_json(matches: List[Dict[str, Any]], out_path: str) -> None:
    """Serialize matches to pretty-printed UTF-8 JSON at *out_path*."""
    import json

    payload = json.dumps(matches, ensure_ascii=False, indent=2)
    with open(out_path, "w", encoding="utf-8") as fh:
        fh.write(payload)


def main():
    """CLI entry point: fetch fixtures for a date range and write CSV/JSON."""
    parser = argparse.ArgumentParser(
        description="Fetch Premier League fixtures in a date range and save to CSV/JSON"
    )
    parser.add_argument("--start-date", required=True, type=iso_date, help="YYYY-MM-DD (inclusive)")
    parser.add_argument("--end-date", required=True, type=iso_date, help="YYYY-MM-DD (inclusive)")
    parser.add_argument("-o", "--output", required=True, help="Output file path (.csv or .json)")
    args = parser.parse_args()

    load_dotenv()
    token = os.getenv("FOOTBALL_DATA_API_TOKEN")
    if not token:
        raise SystemExit("Missing FOOTBALL_DATA_API_TOKEN in environment (.env)")

    payload = fetch_matches(args.start_date, args.end_date, token)
    matches = [normalize_match(raw) for raw in payload.get("matches", [])]

    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)

    # Dispatch on the output extension; anything else is rejected.
    savers = {".csv": save_csv, ".json": save_json}
    ext = os.path.splitext(args.output)[1].lower()
    try:
        saver = savers[ext]
    except KeyError:
        raise SystemExit("Output must end with .csv or .json") from None
    saver(matches, args.output)

    print(f"Saved {len(matches)} matches to {args.output}")


if __name__ == "__main__":
    main()
class GPTSentiment:
    """
    Minimal client for a local GPT model served by Ollama.

    Expects the model to respond with a strict JSON object like:
    {"label": "neg|neu|pos", "confidence": 0.0..1.0}

    Endpoint used: POST {base_url}/api/generate with payload:
    {"model": , "prompt": , "stream": false, "format": "json"}
    """

    def __init__(self, base_url: str = "http://localhost:11434", model: str = "llama3", timeout: int = 30):
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.timeout = timeout

    def _build_prompt(self, text: str) -> str:
        # Keep the instruction terse and deterministic; request strict JSON.
        return (
            "You are a strict JSON generator for sentiment analysis. "
            "Classify the INPUT text as one of: neg, neu, pos. "
            "Return ONLY a JSON object with keys 'label' and 'confidence' (0..1). "
            "No markdown, no prose.\n\n"
            f"INPUT: {text}"
        )

    def _call(self, prompt: str) -> dict:
        """POST the prompt to Ollama and parse the model's JSON response.

        Raises:
            requests.HTTPError: On a non-2xx response.
            json.JSONDecodeError: If the response is not recoverable JSON.
        """
        url = f"{self.base_url}/api/generate"
        payload = {
            "model": self.model,
            "prompt": prompt,
            "stream": False,
            "format": "json",
        }
        r = requests.post(url, json=payload, timeout=self.timeout)
        r.raise_for_status()
        data = r.json()
        # Ollama returns the model's response under 'response'
        raw = data.get("response", "").strip()
        try:
            return json.loads(raw)
        except Exception:
            # BUGFIX: also strip a language tag on the opening fence
            # (e.g. ```json), which the plain removeprefix("```") missed.
            cleaned = raw.strip().removeprefix("```").removesuffix("```").strip()
            if cleaned.lower().startswith("json"):
                cleaned = cleaned[4:].lstrip()
            return json.loads(cleaned)

    @staticmethod
    def _canonical_label(s: str) -> str:
        """Normalize a free-form model label to 'neg' / 'neu' / 'pos'."""
        s = (s or "").strip().lower()
        if "neg" in s:
            return "neg"
        if "neu" in s or "neutral" in s:
            return "neu"
        if "pos" in s or "positive" in s:
            return "pos"
        return s or "neu"

    @staticmethod
    def _compound_from_label_conf(label: str, confidence: float) -> float:
        """Map (label, confidence) to a signed VADER-style compound in [-1, 1]."""
        label = GPTSentiment._canonical_label(label)
        c = max(0.0, min(1.0, float(confidence or 0.0)))
        if label == "pos":
            return c
        if label == "neg":
            return -c
        return 0.0

    def predict_label_conf_batch(self, texts: List[str], batch_size: int = 8) -> Tuple[List[str], List[float]]:
        """Classify each text; returns parallel lists of labels and confidences.

        Requests are issued one text at a time (the local endpoint takes a
        single prompt); `batch_size` only controls chunked traversal and is
        kept for interface compatibility. Any failure degrades to ('neu', 0.0).
        """
        labels: List[str] = []
        confs: List[float] = []
        for start in range(0, len(texts), batch_size):
            for text in texts[start:start + batch_size]:
                try:
                    obj = self._call(self._build_prompt(text))
                    lab = self._canonical_label(obj.get("label", ""))
                    conf = float(obj.get("confidence", 0.0))
                except Exception:
                    lab, conf = "neu", 0.0
                labels.append(lab)
                confs.append(conf)
        return labels, confs

    def predict_compound_batch(self, texts: List[str], batch_size: int = 8) -> List[float]:
        """Map each text to a compound sentiment score in [-1, 1]."""
        labels, confs = self.predict_label_conf_batch(texts, batch_size=batch_size)
        return [self._compound_from_label_conf(lab, conf) for lab, conf in zip(labels, confs)]
def load_messages(csv_path: str, text_col: str = 'message', extra_cols=None) -> pd.DataFrame:
    """Load id/text/date (plus optional extra columns) from a CSV.

    The text column is renamed to 'message'. Returns an empty frame when the
    file is missing or lacks the text column, so callers can concat blindly.
    """
    if not os.path.exists(csv_path):
        return pd.DataFrame()
    df = pd.read_csv(csv_path)
    if text_col not in df.columns:
        return pd.DataFrame()
    wanted = ['id', text_col, 'date'] + [c for c in (extra_cols or []) if c in df.columns]
    present = [c for c in wanted if c in df.columns]
    out = df[present].copy()
    return out.rename(columns={text_col: 'message'})


def main():
    """Build a human-labeling CSV by combining, cleaning, and sampling messages."""
    parser = argparse.ArgumentParser(description='Create a labeling CSV from posts and/or replies.')
    parser.add_argument('--posts-csv', required=False, help='Posts CSV path (e.g., data/..._update.csv)')
    parser.add_argument('--replies-csv', required=False, help='Replies CSV path')
    parser.add_argument('-o', '--output', default='data/labeled_sentiment.csv', help='Output CSV for labeling')
    parser.add_argument('--sample-size', type=int, default=1000, help='Total rows to include (after combining)')
    parser.add_argument('--min-length', type=int, default=3, help='Minimum message length to include')
    parser.add_argument('--shuffle', action='store_true', help='Shuffle before sampling (default true)')
    parser.add_argument('--no-shuffle', dest='shuffle', action='store_false')
    parser.set_defaults(shuffle=True)
    args = parser.parse_args()

    sources = []
    if args.posts_csv:
        sources.append(load_messages(args.posts_csv))
    if args.replies_csv:
        # Replies keep their parent_id when the column exists.
        sources.append(load_messages(args.replies_csv, extra_cols=['parent_id']))
    if not sources:
        raise SystemExit('No input CSVs provided. Use --posts-csv and/or --replies-csv.')

    df = pd.concat(sources, ignore_index=True)
    # Clean: coerce text, enforce a minimum length, drop duplicate messages.
    df['message'] = df['message'].fillna('').astype(str)
    df = df[df['message'].str.len() >= args.min_length]
    df = df.drop_duplicates(subset=['message']).reset_index(drop=True)

    if args.shuffle:
        # Fixed seed so repeated runs produce the same labeling set.
        df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)
    if args.sample_size and len(df) > args.sample_size:
        df = df.head(args.sample_size)

    # Blank column for the human annotator to fill in.
    df.insert(1, 'label', '')

    os.makedirs(os.path.dirname(args.output) or '.', exist_ok=True)
    df.to_csv(args.output, index=False)
    print(f"Wrote labeling CSV with {len(df)} rows to {args.output}")


if __name__ == '__main__':
    main()
def safe_read(path: str) -> pd.DataFrame:
    """Load the labeled CSV, validating presence and coercing column types.

    Raises SystemExit when the file is missing or has no 'label' column;
    normalizes 'message' to str, 'confidence' to numeric, 'date' to datetime.
    """
    if not os.path.exists(path):
        raise SystemExit(f"Input labeled CSV not found: {path}")
    df = pd.read_csv(path)
    if 'label' not in df.columns:
        raise SystemExit("Expected a 'label' column in the labeled CSV")

    # Per-column coercions, applied only when the column exists.
    coercions = {
        'message': lambda s: s.fillna('').astype(str),
        'confidence': lambda s: pd.to_numeric(s, errors='coerce'),
        'date': lambda s: pd.to_datetime(s, errors='coerce'),
    }
    for col, coerce in coercions.items():
        if col in df.columns:
            df[col] = coerce(df[col])
    return df


def ensure_out_dir(out_dir: str) -> str:
    """Create *out_dir* if needed and hand it back for chaining."""
    os.makedirs(out_dir, exist_ok=True)
    return out_dir
def plot_all(df: pd.DataFrame, out_dir: str) -> None:
    """Render the full set of labeled-data plots into *out_dir*.

    Each figure is saved as a PNG; a failing plot is reported and skipped so
    one bad column never aborts the rest.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set_style('whitegrid')

    out_dir = ensure_out_dir(out_dir)
    label_order = ['neg', 'neu', 'pos']
    label_colors = ['#d62728', '#aaaaaa', '#2ca02c']

    def _save(filename: str) -> None:
        # Shared tail for every figure: layout, save, close, report.
        plt.tight_layout()
        path = os.path.join(out_dir, filename)
        plt.savefig(path, dpi=150)
        plt.close()
        print(f"[plots] Saved {path}")

    # 1) Class distribution
    try:
        plt.figure(figsize=(6, 4))
        counts = (df['label'].astype(str).str.lower().value_counts()
                  .reindex(label_order)
                  .fillna(0)
                  .rename_axis('label').reset_index(name='count')
                  .set_index('label'))
        counts.plot(kind='bar', legend=False, color=label_colors)
        plt.title('Labeled class distribution')
        plt.ylabel('Count')
        _save('labeled_class_distribution.png')
    except Exception as e:
        print(f"[plots] Skipped class distribution: {e}")

    # 2) Confidence histogram (overall)
    if 'confidence' in df.columns and df['confidence'].notna().any():
        try:
            plt.figure(figsize=(6, 4))
            sns.histplot(df['confidence'].dropna(), bins=30, color='#1f77b4')
            plt.title('Confidence distribution (overall)')
            plt.xlabel('Confidence')
            plt.ylabel('Frequency')
            _save('labeled_confidence_hist.png')
        except Exception as e:
            print(f"[plots] Skipped confidence histogram: {e}")

    # 3) Confidence by label (boxplot)
    try:
        plt.figure(figsize=(6, 4))
        conf = df[['label', 'confidence']].dropna()
        conf['label'] = conf['label'].astype(str).str.lower()
        sns.boxplot(data=conf, x='label', y='confidence', order=label_order, palette=label_colors)
        plt.title('Confidence by label')
        plt.xlabel('Label')
        plt.ylabel('Confidence')
        _save('labeled_confidence_by_label.png')
    except Exception as e:
        print(f"[plots] Skipped confidence by label: {e}")

    # 4) Message length by label
    if 'message' in df.columns:
        try:
            lengths = df[['label', 'message']].copy()
            lengths['label'] = lengths['label'].astype(str).str.lower()
            lengths['len'] = lengths['message'].astype(str).str.len()
            plt.figure(figsize=(6, 4))
            sns.boxplot(data=lengths, x='label', y='len', order=label_order, palette=label_colors)
            plt.title('Message length by label')
            plt.xlabel('Label')
            plt.ylabel('Length (chars)')
            _save('labeled_length_by_label.png')
        except Exception as e:
            print(f"[plots] Skipped length by label: {e}")

    # 5) Daily counts per label (if date present)
    if 'date' in df.columns and df['date'].notna().any():
        try:
            daily = df[['date', 'label']].dropna().copy()
            daily['day'] = pd.to_datetime(daily['date'], errors='coerce').dt.date
            daily['label'] = daily['label'].astype(str).str.lower()
            pv = daily.pivot_table(index='day', columns='label', values='date', aggfunc='count').fillna(0)
            # Ensure a consistent column order even when a class is absent.
            for col in label_order:
                if col not in pv.columns:
                    pv[col] = 0
            pv = pv[label_order]
            plt.figure(figsize=(10, 4))
            pv.plot(kind='bar', stacked=True, color=label_colors)
            plt.title('Daily labeled counts (stacked)')
            plt.xlabel('Day')
            plt.ylabel('Count')
            _save('labeled_daily_counts.png')
        except Exception as e:
            print(f"[plots] Skipped daily counts: {e}")
def main():
    """CLI entry point: read the labeled CSV and emit every plot."""
    parser = argparse.ArgumentParser(description='Plot graphs from labeled sentiment data.')
    parser.add_argument('-i', '--input', default='data/labeled_sentiment.csv', help='Path to labeled CSV')
    parser.add_argument('-o', '--out-dir', default='data', help='Output directory for plots')
    args = parser.parse_args()

    plot_all(safe_read(args.input), args.out_dir)


if __name__ == '__main__':
    main()
@dataclass
class ScrapedMessage:
    """Flat record for one channel post, ready for CSV/JSONL export."""
    id: int
    date: Optional[str]  # ISO format
    message: Optional[str]
    sender_id: Optional[int]
    views: Optional[int]
    forwards: Optional[int]
    replies: Optional[int]
    url: Optional[str]


def to_iso(dt: datetime) -> str:
    """Serialize a datetime as naive ISO-8601 (tzinfo stripped)."""
    naive = dt.replace(tzinfo=None)
    return naive.isoformat()


async def iter_messages(
    client: TelegramClient,
    entity: str,
    limit: Optional[int] = None,
    offset_date: Optional[datetime] = None,
) -> AsyncIterator[Message]:
    """Thin async passthrough over Telethon's message iterator."""
    async for msg in client.iter_messages(entity, limit=limit, offset_date=offset_date):
        yield msg


def message_to_record(msg: Message, channel_username: str) -> ScrapedMessage:
    """Project a Telethon message onto the flat ScrapedMessage record."""
    if hasattr(msg, 'sender_id'):
        # Peer ids may be wrapped objects exposing `.value`.
        sender = getattr(msg.sender_id, 'value', msg.sender_id)
    else:
        sender = None
    reply_info = getattr(msg, 'replies', None)
    return ScrapedMessage(
        id=msg.id,
        date=to_iso(msg.date) if msg.date else None,
        message=msg.message,
        sender_id=sender,
        views=getattr(msg, 'views', None),
        forwards=getattr(msg, 'forwards', None),
        replies=reply_info.replies if reply_info else None,
        url=f"https://t.me/{channel_username}/{msg.id}" if channel_username else None,
    )


async def ensure_login(client: TelegramClient, phone: Optional[str] = None, twofa_password: Optional[str] = None):
    """Connect and complete interactive login (code + optional 2FA) if needed."""
    await client.connect()
    if await client.is_user_authorized():
        return
    if not phone:
        phone = input("Enter your phone number (with country code): ")
    await client.send_code_request(phone)
    code = input("Enter the login code you received: ")
    try:
        await client.sign_in(phone=phone, code=code)
    except SessionPasswordNeededError:
        pwd = twofa_password if twofa_password is not None else input("Two-step verification enabled. Enter your password: ")
        await client.sign_in(password=pwd)


async def scrape_channel(
    channel: str,
    output: str,
    limit: Optional[int] = None,
    offset_date: Optional[str] = None,  # deprecated in favor of start_date
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    append: bool = False,
    session_name: str = "telegram",
    phone: Optional[str] = None,
    twofa_password: Optional[str] = None,
):
    """Download a channel's history into a CSV or JSONL file.

    Messages are iterated newest-first and filtered to the inclusive
    [start_date, end_date] window; iteration stops early once a message
    older than start_date is seen. Returns the number of records written.
    """
    load_dotenv()
    api_id = os.getenv("TELEGRAM_API_ID")
    api_hash = os.getenv("TELEGRAM_API_HASH")
    session_name = os.getenv("TELEGRAM_SESSION_NAME", session_name)

    if not api_id or not api_hash:
        raise RuntimeError("Missing TELEGRAM_API_ID/TELEGRAM_API_HASH in environment. See .env.example")

    # Some providers store api_id as string; Telethon expects int
    try:
        api_id_int = int(api_id)
    except Exception as e:
        raise RuntimeError("TELEGRAM_API_ID must be an integer") from e

    client = TelegramClient(session_name, api_id_int, api_hash)

    # Resolve the date window; --offset-date is honored as a legacy alias.
    window_start = None
    window_end = None
    if start_date:
        window_start = datetime.fromisoformat(start_date)
    elif offset_date:
        window_start = datetime.fromisoformat(offset_date)
    if end_date:
        window_end = datetime.fromisoformat(end_date)

    await ensure_login(client, phone=phone, twofa_password=twofa_password)

    ext = os.path.splitext(output)[1].lower()
    is_jsonl = ext in (".jsonl", ".ndjson")
    is_csv = ext == ".csv"
    if not (is_jsonl or is_csv):
        raise ValueError("Output file must end with .jsonl or .csv")

    def _header_needed(path: str) -> bool:
        # Skip the header only when appending to a non-empty existing file.
        if not append:
            return True
        try:
            return not (os.path.exists(path) and os.path.getsize(path) > 0)
        except Exception:
            return True

    csv_file = None
    csv_writer = None
    jsonl_file = None
    mode = "a" if append else "w"
    if is_csv:
        import csv

        csv_file = open(output, mode, newline="", encoding="utf-8")
        csv_writer = csv.DictWriter(
            csv_file,
            fieldnames=["id", "date", "message", "sender_id", "views", "forwards", "replies", "url"],
        )
        if _header_needed(output):
            csv_writer.writeheader()
    else:
        jsonl_file = open(output, mode, encoding="utf-8")

    written = 0
    try:
        async for msg in iter_messages(client, channel, limit=None, offset_date=None):
            # Telethon returns tz-aware datetimes; normalize for comparison.
            msg_dt = msg.date.replace(tzinfo=None) if msg.date is not None else None

            if window_start and msg_dt and msg_dt < window_start:
                # Newest-first iteration: everything from here on is older.
                break
            if window_end and msg_dt and msg_dt > window_end:
                continue

            rec = asdict(message_to_record(msg, channel_username=channel.lstrip("@")))
            if jsonl_file is not None:
                jsonl_file.write(json.dumps(rec, ensure_ascii=False) + "\n")
            else:
                csv_writer.writerow(rec)  # type: ignore
            written += 1
            if limit is not None and written >= limit:
                break
    finally:
        for fh in (csv_file, jsonl_file):
            if fh:
                fh.close()
        await client.disconnect()

    return written
async def fetch_replies(
    channel: str,
    parent_ids: Sequence[int],
    output_csv: str,
    append: bool = False,
    session_name: str = "telegram",
    phone: Optional[str] = None,
    twofa_password: Optional[str] = None,
    concurrency: int = 5,
    existing_pairs: Optional[Set[Tuple[int, int]]] = None,
):
    """Fetch replies for the given parent message ids and stream them to CSV.

    In-channel replies are tried first; for broadcast channels with a linked
    discussion group the code falls back to GetDiscussionMessageRequest. Each
    row carries a VADER compound sentiment score. FloodWaits are retried (up
    to 3 attempts per scan) and summarized at the end. With `existing_pairs`
    provided (resume mode), already-seen (parent_id, id) pairs are skipped.
    """
    load_dotenv()
    api_id = os.getenv("TELEGRAM_API_ID")
    api_hash = os.getenv("TELEGRAM_API_HASH")
    session_name = os.getenv("TELEGRAM_SESSION_NAME", session_name)

    if not api_id or not api_hash:
        raise RuntimeError("Missing TELEGRAM_API_ID/TELEGRAM_API_HASH in environment. See .env.example")
    client = TelegramClient(session_name, int(api_id), api_hash)
    await ensure_login(client, phone=phone, twofa_password=twofa_password)

    import csv

    # Rate limiting counters (mutated from nested coroutines via `nonlocal`).
    flood_hits = 0
    flood_wait_seconds = 0

    analyzer = SentimentIntensityAnalyzer()
    os.makedirs(os.path.dirname(output_csv) or ".", exist_ok=True)
    mode = "a" if append else "w"
    with open(output_csv, mode, newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=["parent_id", "id", "date", "message", "sender_id", "sentiment_compound", "url"],
        )
        # Write header only if not appending or file empty
        need_header = True
        try:
            if append and os.path.exists(output_csv) and os.path.getsize(output_csv) > 0:
                need_header = False
        except Exception:
            pass
        if need_header:
            writer.writeheader()

        write_lock = asyncio.Lock()
        sem = asyncio.Semaphore(max(1, int(concurrency)))

        async def handle_parent(pid: int) -> List[dict]:
            # BUGFIX: without `nonlocal`, the `flood_hits += 1` below made
            # these names function-local and raised UnboundLocalError the
            # first time a FloodWaitError was handled.
            nonlocal flood_hits, flood_wait_seconds
            rows: List[dict] = []

            # First try replies within the same channel (works for groups/supergroups)
            attempts = 0
            while attempts < 3:
                try:
                    async for reply in client.iter_messages(channel, reply_to=pid):
                        dt = reply.date.replace(tzinfo=None) if reply.date else None
                        url = f"https://t.me/{channel.lstrip('@')}/{reply.id}" if reply.id else None
                        text = reply.message or ""
                        sent = analyzer.polarity_scores(text).get("compound")
                        rows.append(
                            {
                                "parent_id": pid,
                                "id": reply.id,
                                "date": to_iso(dt) if dt else None,
                                "message": text,
                                "sender_id": getattr(reply, "sender_id", None),
                                "sentiment_compound": sent,
                                "url": url,
                            }
                        )
                    break
                except FloodWaitError as e:
                    secs = int(getattr(e, 'seconds', 5))
                    flood_hits += 1
                    flood_wait_seconds += secs
                    print(f"[rate-limit] FloodWait while scanning replies in-channel for parent {pid}; waiting {secs}s", flush=True)
                    await asyncio.sleep(secs + 1)
                    attempts += 1
                    continue
                except MsgIdInvalidError:
                    # Likely a channel with a linked discussion group; fall back below
                    rows.clear()
                    break
                except Exception:
                    break

            if rows:
                return rows

            # Fallback: for channels with comments in a linked discussion group
            try:
                res = await client(GetDiscussionMessageRequest(peer=channel, msg_id=pid))
            except Exception:
                # No discussion thread found or not accessible
                return rows

            # Identify the discussion chat and the root message id in that chat
            disc_chat = res.chats[0] if getattr(res, "chats", None) else None

            disc_root_id = None
            for m in getattr(res, "messages", []) or []:
                try:
                    peer_id = getattr(m, "peer_id", None)
                    if not peer_id or not disc_chat:
                        continue
                    ch_id = getattr(peer_id, "channel_id", None) or getattr(peer_id, "chat_id", None)
                    if ch_id == getattr(disc_chat, "id", None):
                        disc_root_id = m.id
                        break
                except Exception:
                    continue

            if not disc_chat or not disc_root_id:
                return rows

            group_username = getattr(disc_chat, "username", None)
            attempts = 0
            while attempts < 3:
                try:
                    async for reply in client.iter_messages(disc_chat, reply_to=disc_root_id):
                        dt = reply.date.replace(tzinfo=None) if reply.date else None
                        text = reply.message or ""
                        sent = analyzer.polarity_scores(text).get("compound")
                        # Construct URL only if the discussion group has a public username
                        url = None
                        if group_username and reply.id:
                            url = f"https://t.me/{group_username}/{reply.id}"
                        rows.append(
                            {
                                "parent_id": pid,
                                "id": reply.id,
                                "date": to_iso(dt) if dt else None,
                                "message": text,
                                "sender_id": getattr(reply, "sender_id", None),
                                "sentiment_compound": sent,
                                "url": url,
                            }
                        )
                    break
                except FloodWaitError as e:
                    secs = int(getattr(e, 'seconds', 5))
                    flood_hits += 1
                    flood_wait_seconds += secs
                    print(f"[rate-limit] FloodWait while scanning discussion group for parent {pid}; waiting {secs}s", flush=True)
                    await asyncio.sleep(secs + 1)
                    attempts += 1
                    continue
                except Exception:
                    break
            return rows

        total_written = 0
        processed = 0
        total = len(parent_ids) if hasattr(parent_ids, '__len__') else None

        async def worker(pid: int):
            nonlocal total_written, processed
            async with sem:
                rows = await handle_parent(int(pid))
                async with write_lock:
                    if rows:
                        # Dedupe against existing pairs if provided (resume mode)
                        if existing_pairs is not None:
                            filtered: List[dict] = []
                            for r in rows:
                                try:
                                    key = (int(r.get("parent_id")), int(r.get("id")))
                                except Exception:
                                    continue
                                if key in existing_pairs:
                                    continue
                                existing_pairs.add(key)
                                filtered.append(r)
                            rows = filtered
                        if rows:
                            writer.writerows(rows)
                            total_written += len(rows)
                    processed += 1
                    if processed % 10 == 0 or (rows and len(rows) > 0):
                        if total is not None:
                            print(f"[replies] processed {processed}/{total} parents; last parent {pid} wrote {len(rows)} replies; total replies {total_written}", flush=True)
                        else:
                            print(f"[replies] processed {processed} parents; last parent {pid} wrote {len(rows)} replies; total replies {total_written}", flush=True)

        tasks = [asyncio.create_task(worker(pid)) for pid in parent_ids]
        await asyncio.gather(*tasks)

    await client.disconnect()
    if flood_hits:
        print(f"[rate-limit] Summary: {flood_hits} FloodWait events; total waited ~{flood_wait_seconds}s", flush=True)
async def fetch_forwards(
    channel: str,
    parent_ids: Set[int],
    output_csv: str,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    scan_limit: Optional[int] = None,
    append: bool = False,
    session_name: str = "telegram",
    phone: Optional[str] = None,
    twofa_password: Optional[str] = None,
    concurrency: int = 5,
    chunk_size: int = 1000,
):
    """Best-effort: find forwarded messages within the SAME channel that reference the given parent_ids.

    Telegram API does not provide a global reverse-lookup of forwards across all
    channels; we therefore scan this channel's history and collect messages with
    fwd_from.channel_post matching a parent id.

    Without `scan_limit` the history is scanned sequentially (unbounded); with
    it, the newest `scan_limit` ids are split into `chunk_size` ranges scanned
    concurrently, bounded by `concurrency`.
    """
    load_dotenv()
    api_id = os.getenv("TELEGRAM_API_ID")
    api_hash = os.getenv("TELEGRAM_API_HASH")
    session_name = os.getenv("TELEGRAM_SESSION_NAME", session_name)
    if not api_id or not api_hash:
        raise RuntimeError("Missing TELEGRAM_API_ID/TELEGRAM_API_HASH in environment. See .env.example")
    client = TelegramClient(session_name, int(api_id), api_hash)
    await ensure_login(client, phone=phone, twofa_password=twofa_password)

    import csv  # cleanup: previously imported twice

    # Rate limiting counters, defined once at function scope for both
    # scan strategies (cleanup: were initialized twice and summarized via a
    # fragile `'flood_hits' in locals()` check).
    flood_hits = 0
    flood_wait_seconds = 0

    analyzer = SentimentIntensityAnalyzer()
    os.makedirs(os.path.dirname(output_csv) or ".", exist_ok=True)
    mode = "a" if append else "w"
    write_lock = asyncio.Lock()
    with open(output_csv, mode, newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=["parent_id", "id", "date", "message", "sender_id", "sentiment_compound", "url"],
        )
        need_header = True
        try:
            if append and os.path.exists(output_csv) and os.path.getsize(output_csv) > 0:
                need_header = False
        except Exception:
            pass
        if need_header:
            writer.writeheader()

        parsed_start = datetime.fromisoformat(start_date) if start_date else None
        parsed_end = datetime.fromisoformat(end_date) if end_date else None

        def match_row(msg, ch_post, dt) -> dict:
            # Build one output row for a matched forward (shared by both paths).
            text = msg.message or ""
            return {
                "parent_id": int(ch_post),
                "id": msg.id,
                "date": to_iso(dt) if dt else None,
                "message": text,
                "sender_id": getattr(msg, "sender_id", None),
                "sentiment_compound": analyzer.polarity_scores(text).get("compound"),
                "url": f"https://t.me/{channel.lstrip('@')}/{msg.id}" if msg.id else None,
            }

        # If no scan_limit provided, fall back to sequential scan to avoid unbounded concurrency
        if scan_limit is None:
            scanned = 0
            matched = 0
            async for msg in client.iter_messages(channel, limit=None):
                dt = msg.date.replace(tzinfo=None) if msg.date else None
                if parsed_start and dt and dt < parsed_start:
                    break
                if parsed_end and dt and dt > parsed_end:
                    continue
                fwd = getattr(msg, "fwd_from", None)
                if not fwd:
                    continue
                ch_post = getattr(fwd, "channel_post", None)
                if ch_post and int(ch_post) in parent_ids:
                    writer.writerow(match_row(msg, ch_post, dt))
                    matched += 1
                scanned += 1
                if scanned % 1000 == 0:
                    print(f"[forwards] scanned ~{scanned} messages; total forwards {matched}", flush=True)
        else:
            # Concurrent chunked scanning by id ranges
            sem = asyncio.Semaphore(max(1, int(concurrency)))
            progress_lock = asyncio.Lock()
            matched_total = 0
            completed_chunks = 0

            # Determine latest message id
            latest_msg = await client.get_messages(channel, limit=1)
            try:
                latest_id = getattr(latest_msg, 'id', None) or (latest_msg[0].id if latest_msg else None)
            except Exception:
                latest_id = None
            if not latest_id:
                await client.disconnect()
                return

            total_chunks = max(1, (int(scan_limit) + int(chunk_size) - 1) // int(chunk_size))

            async def process_chunk(idx: int):
                nonlocal flood_hits, flood_wait_seconds, matched_total, completed_chunks
                max_id = latest_id - idx * int(chunk_size)
                min_id = max(0, max_id - int(chunk_size))
                attempts = 0
                local_matches = 0
                while attempts < 3:
                    try:
                        async with sem:
                            async for msg in client.iter_messages(channel, min_id=min_id, max_id=max_id):
                                dt = msg.date.replace(tzinfo=None) if msg.date else None
                                if parsed_start and dt and dt < parsed_start:
                                    # This range reached before start; skip remaining in this chunk
                                    break
                                if parsed_end and dt and dt > parsed_end:
                                    continue
                                fwd = getattr(msg, "fwd_from", None)
                                if not fwd:
                                    continue
                                ch_post = getattr(fwd, "channel_post", None)
                                if ch_post and int(ch_post) in parent_ids:
                                    row = match_row(msg, ch_post, dt)
                                    async with write_lock:
                                        writer.writerow(row)
                                    local_matches += 1
                        break
                    except FloodWaitError as e:
                        secs = int(getattr(e, 'seconds', 5))
                        flood_hits += 1
                        flood_wait_seconds += secs
                        print(f"[rate-limit] FloodWait while scanning ids {min_id}-{max_id}; waiting {secs}s", flush=True)
                        await asyncio.sleep(secs + 1)
                        attempts += 1
                        continue
                    except Exception:
                        # best-effort; skip this chunk
                        break
                async with progress_lock:
                    matched_total += local_matches
                    completed_chunks += 1
                    print(
                        f"[forwards] chunks {completed_chunks}/{total_chunks}; last {min_id}-{max_id} wrote {local_matches} forwards; total forwards {matched_total}",
                        flush=True,
                    )

            tasks = [asyncio.create_task(process_chunk(i)) for i in range(total_chunks)]
            await asyncio.gather(*tasks)

    await client.disconnect()
    # The sequential path never increments the counters, so this summary
    # fires only after concurrent scanning hit rate limits.
    if flood_hits:
        print(f"[rate-limit] Summary: {flood_hits} FloodWait events; total waited ~{flood_wait_seconds}s", flush=True)
@python, https://t.me/python") + p_scrape.add_argument("--output", "-o", required=True, help="Output file (.jsonl or .csv)") + p_scrape.add_argument("--limit", type=int, default=None, help="Max number of messages to save after filtering") + p_scrape.add_argument("--offset-date", dest="offset_date", default=None, help="Deprecated: use --start-date instead. ISO date (inclusive)") + p_scrape.add_argument("--start-date", dest="start_date", default=None, help="ISO start date (inclusive)") + p_scrape.add_argument("--end-date", dest="end_date", default=None, help="ISO end date (inclusive)") + p_scrape.add_argument("--append", action="store_true", help="Append to the output file instead of overwriting") + p_scrape.add_argument("--session-name", default=os.getenv("TELEGRAM_SESSION_NAME", "telegram")) + p_scrape.add_argument("--phone", default=None) + p_scrape.add_argument("--twofa-password", default=os.getenv("TELEGRAM_2FA_PASSWORD")) + + # Subcommand: fetch replies for specific message ids + p_rep = sub.add_parser("replies", help="Fetch replies for given message IDs and save to CSV") + p_rep.add_argument("channel", help="Channel username or t.me link") + src = p_rep.add_mutually_exclusive_group(required=True) + src.add_argument("--ids", help="Comma-separated parent message IDs") + src.add_argument("--from-csv", dest="from_csv", help="Path to CSV with an 'id' column to use as parent IDs") + p_rep.add_argument("--output", "-o", required=True, help="Output CSV path (e.g., data/replies_channel.csv)") + p_rep.add_argument("--append", action="store_true", help="Append to the output file instead of overwriting") + p_rep.add_argument("--session-name", default=os.getenv("TELEGRAM_SESSION_NAME", "telegram")) + p_rep.add_argument("--phone", default=None) + p_rep.add_argument("--twofa-password", default=os.getenv("TELEGRAM_2FA_PASSWORD")) + p_rep.add_argument("--concurrency", type=int, default=5, help="Number of parent IDs to process in parallel (default 5)") + 
p_rep.add_argument("--min-replies", type=int, default=None, help="When using --from-csv, only process parents with replies >= this value") + p_rep.add_argument("--resume", action="store_true", help="Resume mode: skip parent_id,id pairs already present in the output CSV") + + # Subcommand: fetch forwards (same-channel forwards referencing parent ids) + p_fwd = sub.add_parser("forwards", help="Best-effort: find forwards within the same channel for given parent IDs") + p_fwd.add_argument("channel", help="Channel username or t.me link") + src2 = p_fwd.add_mutually_exclusive_group(required=True) + src2.add_argument("--ids", help="Comma-separated parent message IDs") + src2.add_argument("--from-csv", dest="from_csv", help="Path to CSV with an 'id' column to use as parent IDs") + p_fwd.add_argument("--output", "-o", required=True, help="Output CSV path (e.g., data/forwards_channel.csv)") + p_fwd.add_argument("--start-date", dest="start_date", default=None) + p_fwd.add_argument("--end-date", dest="end_date", default=None) + p_fwd.add_argument("--scan-limit", dest="scan_limit", type=int, default=None, help="Max messages to scan in channel history") + p_fwd.add_argument("--concurrency", type=int, default=5, help="Number of id-chunks to scan in parallel (requires --scan-limit)") + p_fwd.add_argument("--chunk-size", dest="chunk_size", type=int, default=1000, help="Approx. 
messages per chunk (ids)") + p_fwd.add_argument("--append", action="store_true", help="Append to the output file instead of overwriting") + p_fwd.add_argument("--session-name", default=os.getenv("TELEGRAM_SESSION_NAME", "telegram")) + p_fwd.add_argument("--phone", default=None) + p_fwd.add_argument("--twofa-password", default=os.getenv("TELEGRAM_2FA_PASSWORD")) + + args = parser.parse_args() + + # Normalize channel + channel = getattr(args, "channel", None) + if channel and channel.startswith("https://t.me/"): + channel = channel.replace("https://t.me/", "@") + + def _normalize_handle(ch: Optional[str]) -> Optional[str]: + if not ch: + return ch + # Expect inputs like '@name' or 'name'; return lowercase without leading '@' + return ch.lstrip('@').lower() + + def _extract_handle_from_url(url: str) -> Optional[str]: + try: + if not url: + return None + # Accept forms like https://t.me/Name/123 or http(s)://t.me/c// + # Only public usernames (not /c/ links) can be compared reliably + if "/t.me/" in url: + # crude parse without urlparse to avoid dependency + after = url.split("t.me/")[-1] + parts = after.split('/') + if parts and parts[0] and parts[0] != 'c': + return parts[0] + except Exception: + return None + return None + + if args.command == "scrape": + written = asyncio.run( + scrape_channel( + channel=channel, + output=args.output, + limit=args.limit, + offset_date=args.offset_date, + start_date=args.start_date, + end_date=args.end_date, + append=getattr(args, "append", False), + session_name=args.session_name, + phone=args.phone, + twofa_password=args.twofa_password, + ) + ) + print(f"Wrote {written} messages to {args.output}") + elif args.command == "replies": + # If using --from-csv, try to infer channel from URLs and warn on mismatch + try: + if getattr(args, 'from_csv', None): + import pandas as _pd # local import to keep startup light + # Read a small sample of URL column to detect handle + sample = _pd.read_csv(args.from_csv, usecols=['url'], nrows=20) + 
url_handles = [ + _extract_handle_from_url(str(u)) for u in sample['url'].dropna().tolist() if isinstance(u, (str,)) + ] + inferred = next((h for h in url_handles if h), None) + provided = _normalize_handle(channel) + if inferred and provided and _normalize_handle(inferred) != provided: + print( + f"[warning] CSV appears to be from @{_normalize_handle(inferred)} but you passed -c @{provided}. " + f"Replies may be empty. Consider using -c https://t.me/{inferred}", + flush=True, + ) + except Exception: + # Best-effort only; ignore any issues reading/inspecting CSV + pass + parent_ids: Set[int] + if getattr(args, "ids", None): + parent_ids = {int(x.strip()) for x in args.ids.split(",") if x.strip()} + else: + import pandas as pd # local import + usecols = ['id'] + if args.min_replies is not None: + usecols.append('replies') + df = pd.read_csv(args.from_csv, usecols=usecols) + if args.min_replies is not None and 'replies' in df.columns: + df = df[df['replies'].fillna(0).astype(int) >= int(args.min_replies)] + parent_ids = set(int(x) for x in df['id'].dropna().astype(int).tolist()) + existing_pairs = None + if args.resume and os.path.exists(args.output): + try: + import csv as _csv + existing_pairs = set() + with open(args.output, "r", encoding="utf-8") as _f: + reader = _csv.DictReader(_f) + for row in reader: + try: + existing_pairs.add((int(row.get("parent_id")), int(row.get("id")))) + except Exception: + continue + except Exception: + existing_pairs = None + + asyncio.run( + fetch_replies( + channel=channel, + parent_ids=sorted(parent_ids), + output_csv=args.output, + append=getattr(args, "append", False), + session_name=args.session_name, + phone=args.phone, + twofa_password=args.twofa_password, + concurrency=max(1, int(getattr(args, 'concurrency', 5))), + existing_pairs=existing_pairs, + ) + ) + print(f"Saved replies to {args.output}") + elif args.command == "forwards": + parent_ids: Set[int] + if getattr(args, "ids", None): + parent_ids = {int(x.strip()) for x in 
args.ids.split(",") if x.strip()} + else: + import pandas as pd + df = pd.read_csv(args.from_csv) + parent_ids = set(int(x) for x in df['id'].dropna().astype(int).tolist()) + asyncio.run( + fetch_forwards( + channel=channel, + parent_ids=parent_ids, + output_csv=args.output, + start_date=args.start_date, + end_date=args.end_date, + scan_limit=args.scan_limit, + concurrency=max(1, int(getattr(args, 'concurrency', 5))), + chunk_size=max(1, int(getattr(args, 'chunk_size', 1000))), + append=getattr(args, "append", False), + session_name=args.session_name, + phone=args.phone, + twofa_password=args.twofa_password, + ) + ) + print(f"Saved forwards to {args.output}") + + +if __name__ == "__main__": + main() diff --git a/src/train_sentiment.py b/src/train_sentiment.py new file mode 100644 index 0000000..950f159 --- /dev/null +++ b/src/train_sentiment.py @@ -0,0 +1,135 @@ +import argparse +import os +from typing import Optional + +import pandas as pd +from datasets import Dataset, ClassLabel +from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer +import inspect +import numpy as np +from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score + + +def build_dataset(df: pd.DataFrame, text_col: str, label_col: str, label_mapping: Optional[dict] = None) -> Dataset: + d = df[[text_col, label_col]].dropna().copy() + # Normalize and drop empty labels + d[label_col] = d[label_col].astype(str).str.strip() + d = d[d[label_col] != ''] + if d.empty: + raise SystemExit("No labeled rows found. 
def build_dataset(df: pd.DataFrame, text_col: str, label_col: str, label_mapping: Optional[dict] = None) -> "tuple[Dataset, dict, dict]":
    """Turn a labeled dataframe into a HF ``Dataset`` with a ClassLabel target.

    Labels are normalized to stripped strings; empty labels are dropped.
    String labels are factorized via pandas categories; numeric labels are
    assumed to already be encoded as 0..N-1.

    Returns:
        (dataset, label2id, id2label) — the dataset's label column is a
        ClassLabel feature, which `train_test_split(stratify_by_column=...)`
        requires.

    Raises:
        SystemExit: when no usable labeled rows remain.
    """
    d = df[[text_col, label_col]].dropna().copy()
    # Normalize and drop empty labels
    d[label_col] = d[label_col].astype(str).str.strip()
    d = d[d[label_col] != '']
    if d.empty:
        raise SystemExit("No labeled rows found. Please fill the 'label' column in your CSV (e.g., neg/neu/pos or 0/1/2).")
    if label_mapping:
        d[label_col] = d[label_col].map(label_mapping)
    # If labels are strings, factorize them
    if d[label_col].dtype == object:
        d[label_col] = d[label_col].astype('category')
        label2id = {k: int(v) for v, k in enumerate(d[label_col].cat.categories)}
        id2label = {v: k for k, v in label2id.items()}
        d[label_col] = d[label_col].cat.codes
    else:
        # Assume numeric 0..N-1
        classes = sorted(d[label_col].unique().tolist())
        label2id = {str(c): int(c) for c in classes}
        id2label = {int(c): str(c) for c in classes}
    hf = Dataset.from_pandas(d.reset_index(drop=True))
    # Cast the integer codes directly to ClassLabel. The previous
    # class_encode_column() pass re-encoded by sorting the *stringified*
    # values ('10' < '2'), which could scramble the code<->name alignment
    # for >= 10 classes before the features dict was overwritten;
    # cast_column keeps our codes as-is and attaches the correct names.
    names = [id2label[i] for i in range(len(id2label))]
    hf = hf.cast_column(label_col, ClassLabel(num_classes=len(names), names=names))
    return hf, label2id, id2label


def tokenize_fn(examples, tokenizer, text_col):
    """Tokenize one batch of examples; padding is deferred to the collator."""
    return tokenizer(examples[text_col], truncation=True, padding=False)


def compute_metrics(eval_pred):
    """Accuracy plus macro-averaged precision/recall/F1 for Trainer evaluation."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        'accuracy': accuracy_score(labels, preds),
        'precision_macro': precision_score(labels, preds, average='macro', zero_division=0),
        'recall_macro': recall_score(labels, preds, average='macro', zero_division=0),
        'f1_macro': f1_score(labels, preds, average='macro', zero_division=0),
    }
def main():
    """Fine-tune a sequence-classification model on a labeled CSV.

    Reads --train-csv, builds a ClassLabel dataset via build_dataset(),
    optionally carves off a stratified eval split, tokenizes with the base
    model's tokenizer, trains with transformers' Trainer, and saves both the
    model and the tokenizer to --output-dir.
    """
    parser = argparse.ArgumentParser(description='Fine-tune a transformers model for sentiment.')
    parser.add_argument('--train-csv', required=True, help='Path to labeled CSV')
    parser.add_argument('--text-col', default='message', help='Text column name')
    parser.add_argument('--label-col', default='label', help='Label column name (e.g., pos/neu/neg or 2/1/0)')
    parser.add_argument('--model-name', default='distilbert-base-uncased', help='Base model name or path')
    parser.add_argument('--output-dir', default='models/sentiment-distilbert', help='Where to save the fine-tuned model')
    parser.add_argument('--epochs', type=int, default=3)
    parser.add_argument('--batch-size', type=int, default=16)
    parser.add_argument('--lr', type=float, default=5e-5)
    parser.add_argument('--eval-split', type=float, default=0.1, help='Fraction of data for eval')
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    df = pd.read_csv(args.train_csv)
    ds, label2id, id2label = build_dataset(df, args.text_col, args.label_col)
    if args.eval_split > 0:
        # Stratified split requires the label column to be a ClassLabel feature,
        # which build_dataset() guarantees.
        ds = ds.train_test_split(test_size=args.eval_split, seed=42, stratify_by_column=args.label_col)
        train_ds, eval_ds = ds['train'], ds['test']
    else:
        # No eval set: train on everything, skip evaluation entirely below.
        train_ds, eval_ds = ds, None

    num_labels = len(id2label)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name,
        num_labels=num_labels,
        id2label=id2label,
        label2id={k: int(v) for k, v in label2id.items()},
    )

    tokenized_train = train_ds.map(lambda x: tokenize_fn(x, tokenizer, args.text_col), batched=True)
    tokenized_eval = eval_ds.map(lambda x: tokenize_fn(x, tokenizer, args.text_col), batched=True) if (eval_ds is not None) else None

    # Build TrainingArguments with compatibility across transformers versions
    base_kwargs = {
        'output_dir': args.output_dir,
        'per_device_train_batch_size': args.batch_size,
        'per_device_eval_batch_size': args.batch_size,
        'num_train_epochs': args.epochs,
        'learning_rate': args.lr,
        'fp16': False,
        'logging_steps': 50,
    }
    eval_kwargs = {}
    if tokenized_eval is not None:
        # Set both evaluation_strategy and eval_strategy for compatibility across
        # transformers versions (the argument was renamed); the signature filter
        # below keeps only whichever one this installed version supports.
        eval_kwargs.update({
            'evaluation_strategy': 'epoch',
            'eval_strategy': 'epoch',
            'save_strategy': 'epoch',
            'load_best_model_at_end': True,
            'metric_for_best_model': 'f1_macro',
            'greater_is_better': True,
        })

    # Filter kwargs to only include parameters supported by this transformers version
    sig = inspect.signature(TrainingArguments.__init__)
    allowed = set(sig.parameters.keys())
    def _filter(d: dict) -> dict:
        # Drop any kwarg this TrainingArguments version does not accept.
        return {k: v for k, v in d.items() if k in allowed}

    training_args = TrainingArguments(**_filter(base_kwargs), **_filter(eval_kwargs))

    # NOTE(review): newer transformers deprecates Trainer(tokenizer=...) in
    # favor of processing_class= — confirm against the pinned version.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics if tokenized_eval else None,
    )

    trainer.train()
    trainer.save_model(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)
    print(f"Model saved to {args.output_dir}")
class TransformerSentiment:
    """Sentiment scorer backed by a local transformers classification model.

    Maps class probabilities to a VADER-style "compound" score in [-1, 1]
    (p_pos - p_neg) so downstream analytics can treat either backend uniformly.
    """

    def __init__(self, model_name_or_path: str, device: Optional[str] = None, max_length: int = 256):
        """Load tokenizer + model, pick a device (cuda > mps > cpu), set eval mode.

        Args:
            model_name_or_path: HF hub name or local path of a fine-tuned head.
            device: explicit 'cuda'/'mps'/'cpu'; auto-detected when None.
            max_length: truncation length used for every batch.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)
        self.max_length = max_length
        if device is None:
            if torch.cuda.is_available():
                device = 'cuda'
            elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
                device = 'mps'
            else:
                device = 'cpu'
        self.device = device
        self.model.to(self.device)
        self.model.eval()

        # Expect labels roughly like {0:'neg',1:'neu',2:'pos'} or similar
        self.id2label = self.model.config.id2label if hasattr(self.model.config, 'id2label') else {0: '0', 1: '1', 2: '2'}

    def _label_name(self, idx: int) -> str:
        """Resolve a class index to its label name, tolerating str-keyed maps.

        Configs deserialized from JSON can carry id2label keyed by '0','1',...
        instead of ints; the previous int-only lookup then fell back to '0',
        '1', ... and defeated the neg/pos detection below.
        """
        return self.id2label.get(idx, self.id2label.get(str(idx), str(idx)))

    def _compound_from_probs(self, probs: np.ndarray) -> float:
        """Map class probabilities to a [-1, 1] compound-like score (pos - neg)."""
        labels = [self._label_name(i).lower() for i in range(len(probs))]
        try:
            neg_idx = labels.index('neg') if 'neg' in labels else labels.index('negative')
        except ValueError:
            neg_idx = 0  # conventional fallback: first class is negative
        try:
            pos_idx = labels.index('pos') if 'pos' in labels else labels.index('positive')
        except ValueError:
            pos_idx = len(probs) - 1  # conventional fallback: last class is positive
        p_neg = float(probs[neg_idx])
        p_pos = float(probs[pos_idx])
        # A simple skew: pos - neg; clamp within [-1, 1]
        return max(-1.0, min(1.0, p_pos - p_neg))

    def _encode(self, batch: List[str]):
        """Tokenize a batch and move all tensors to the model's device."""
        enc = self.tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {k: v.to(self.device) for k, v in enc.items()}

    @torch.no_grad()
    def predict_compound_batch(self, texts: List[str], batch_size: int = 32) -> List[float]:
        """Score texts in mini-batches; returns one compound score per input."""
        out: List[float] = []
        for i in range(0, len(texts), batch_size):
            logits = self.model(**self._encode(texts[i:i + batch_size])).logits
            probs = torch.softmax(logits, dim=-1).cpu().numpy()
            for row in probs:
                out.append(self._compound_from_probs(row))
        return out

    @torch.no_grad()
    def predict_probs_and_labels(self, texts: List[str], batch_size: int = 32):
        """Return (per-text probability rows, per-text predicted label names)."""
        probs_all = []
        labels_all: List[str] = []
        for i in range(0, len(texts), batch_size):
            logits = self.model(**self._encode(texts[i:i + batch_size])).logits
            probs = torch.softmax(logits, dim=-1).cpu().numpy()
            preds = probs.argmax(axis=-1)
            for j, row in enumerate(probs):
                probs_all.append(row)
                labels_all.append(self._label_name(int(preds[j])))
        return probs_all, labels_all