339 lines
10 KiB
Bash
339 lines
10 KiB
Bash
|
|
# Convenience aliases for daily runs (zsh)
# Source this file in your shell: source scripts/aliases.zsh
#
# The data/config paths below are relative, and the functions in this file
# invoke ./run_scraper.sh and ./.venv/bin/python, so everything resolves
# against the directory you are in when a function is called.

# --- Project defaults (edit as needed) ---
# Channel and files
export CH="https://t.me/Premier_League_Update"
export POSTS_CSV="data/premier_league_update.csv"
export REPLIES_CSV="data/premier_league_replies.csv"
export FORWARDS_CSV="data/premier_league_forwards.csv"
export TAGS_CFG="config/tags.yaml"
export FIXTURES_CSV="data/premier_league_schedule_2025-08-15_to_2025-10-15.csv"

# Default fixtures date range (used by run_all)
export FIXTURES_START_DATE="2025-08-15"
export FIXTURES_END_DATE="2025-10-15"

# Sessions directory outside iCloud (avoid sqlite locks)
export SESSION_DIR="$HOME/.local/share/telethon_sessions"
# Only warn on failure: this file is sourced, so exiting here would close the
# user's interactive shell.
mkdir -p -- "$SESSION_DIR" ||
  printf 'aliases: could not create session dir: %s\n' "$SESSION_DIR" >&2
|
||
|
|
|
||
|
|
# --- Aliases (zsh functions) ---
|
||
|
|
|
||
|
|
# Fast replies: resume+append, prioritizes parents with replies, tuned concurrency
#
# Runs `./run_scraper.sh replies` from the current directory.
# Arguments (positional, all optional; empty or unset falls back to default):
#   $1  channel, passed as -c                 (default: $CH)
#   $2  posts CSV, passed as --from-csv       (default: $POSTS_CSV)
#   $3  replies output CSV, passed as -o      (default: $REPLIES_CSV)
#   $4  value for --concurrency               (default: 15)
#   $5  value for --session-name              (default: $SESSION_DIR/telegram_replies)
# Returns: exit status of run_scraper.sh.
fast_replies() {
  local ch="${1:-$CH}"
  local posts="${2:-$POSTS_CSV}"
  local out="${3:-$REPLIES_CSV}"
  local conc="${4:-15}"
  local sess="${5:-$SESSION_DIR/telegram_replies}"

  ./run_scraper.sh replies \
    -c "$ch" \
    --from-csv "$posts" \
    -o "$out" \
    --min-replies 1 \
    --concurrency "$conc" \
    --resume \
    --append \
    --session-name "$sess"
}
|
||
|
|
|
||
|
|
# Chunked forwards: concurrent chunk scan with progress logs
#
# Runs `./run_scraper.sh forwards` from the current directory.
# Arguments (positional, all optional; empty or unset falls back to default):
#   $1  channel, passed as -c                 (default: $CH)
#   $2  posts CSV, passed as --from-csv       (default: $POSTS_CSV)
#   $3  forwards output CSV, passed as -o     (default: $FORWARDS_CSV)
#   $4  value for --scan-limit                (default: 20000)
#   $5  value for --concurrency               (default: 10)
#   $6  value for --chunk-size                (default: 1500)
#   $7  value for --session-name              (default: $SESSION_DIR/telegram_forwards)
# Returns: exit status of run_scraper.sh.
chunked_forwards() {
  local ch="${1:-$CH}"
  local posts="${2:-$POSTS_CSV}"
  local out="${3:-$FORWARDS_CSV}"
  local scan="${4:-20000}"
  local conc="${5:-10}"
  local chunk="${6:-1500}"
  local sess="${7:-$SESSION_DIR/telegram_forwards}"

  ./run_scraper.sh forwards \
    -c "$ch" \
    --from-csv "$posts" \
    -o "$out" \
    --scan-limit "$scan" \
    --concurrency "$conc" \
    --chunk-size "$chunk" \
    --append \
    --session-name "$sess"
}
|
||
|
|
|
||
|
|
# Combined analyze: posts + replies + fixtures with tags; writes augmented CSVs
#
# Runs `./run_scraper.sh analyze` from the current directory.
# Arguments (positional, all optional; empty or unset falls back to default):
#   $1  posts CSV, passed as -i               (default: $POSTS_CSV)
#   $2  replies CSV, passed as --replies-csv  (default: $REPLIES_CSV)
#   $3  tags config, passed as --tags-config  (default: $TAGS_CFG)
#   $4  fixtures CSV, passed as --fixtures-csv (default: $FIXTURES_CSV)
#   $5  channel, passed as --channel          (default: $CH)
# Returns: exit status of run_scraper.sh.
analyze_combined() {
  local posts="${1:-$POSTS_CSV}"
  local replies="${2:-$REPLIES_CSV}"
  local tags="${3:-$TAGS_CFG}"
  local fixtures="${4:-$FIXTURES_CSV}"
  local ch="${5:-$CH}"

  ./run_scraper.sh analyze \
    -i "$posts" \
    --channel "$ch" \
    --tags-config "$tags" \
    --replies-csv "$replies" \
    --fixtures-csv "$fixtures" \
    --write-augmented-csv \
    --write-combined-csv \
    --save-plots
  # Tip: add plot sizing/labels, e.g.: --plot-width-scale 0.8 --plot-max-width 120 --plot-height 8 --activity-top-n 8 --labels-stagger-rows 3
}
|
||
|
|
|
||
|
|
# Emoji-aware analyze with sensible defaults (keep + boost)
#
# Same invocation as the combined analyze, plus --emoji-mode and --emoji-boost.
# Arguments (positional, all optional; empty or unset falls back to default):
#   $1  posts CSV, passed as -i               (default: $POSTS_CSV)
#   $2  replies CSV, passed as --replies-csv  (default: $REPLIES_CSV)
#   $3  tags config, passed as --tags-config  (default: $TAGS_CFG)
#   $4  fixtures CSV, passed as --fixtures-csv (default: $FIXTURES_CSV)
#   $5  channel, passed as --channel          (default: $CH)
#   $6  value for --emoji-mode: keep | demojize | strip (default: keep)
# Returns: exit status of run_scraper.sh.
analyze_emoji() {
  local posts="${1:-$POSTS_CSV}"
  local replies="${2:-$REPLIES_CSV}"
  local tags="${3:-$TAGS_CFG}"
  local fixtures="${4:-$FIXTURES_CSV}"
  local ch="${5:-$CH}"
  local mode="${6:-keep}" # keep | demojize | strip

  ./run_scraper.sh analyze \
    -i "$posts" \
    --channel "$ch" \
    --tags-config "$tags" \
    --replies-csv "$replies" \
    --fixtures-csv "$fixtures" \
    --write-augmented-csv \
    --write-combined-csv \
    --save-plots \
    --emoji-mode "$mode" \
    --emoji-boost
}
|
||
|
|
|
||
|
|
# Analyze with transformers (and export labels/probs)
#
# Runs `./run_scraper.sh analyze` with --sentiment-backend transformers.
# Arguments (positional, all optional; empty or unset falls back to default):
#   $1  posts CSV, passed as -i               (default: $POSTS_CSV)
#   $2  replies CSV, passed as --replies-csv  (default: $REPLIES_CSV)
#   $3  tags config, passed as --tags-config  (default: $TAGS_CFG)
#   $4  fixtures CSV, passed as --fixtures-csv (default: $FIXTURES_CSV)
#   $5  channel, passed as --channel          (default: $CH)
#   $6  value for --transformers-model        (default: distilbert-base-uncased)
# Returns: exit status of run_scraper.sh.
# NOTE(review): the default model here is distilbert-base-uncased, whereas
# run_all defaults to models/sentiment-distilbert -- confirm which is intended.
analyze_transformers() {
  local posts="${1:-$POSTS_CSV}"
  local replies="${2:-$REPLIES_CSV}"
  local tags="${3:-$TAGS_CFG}"
  local fixtures="${4:-$FIXTURES_CSV}"
  local ch="${5:-$CH}"
  local model="${6:-distilbert-base-uncased}"

  ./run_scraper.sh analyze \
    -i "$posts" \
    --channel "$ch" \
    --tags-config "$tags" \
    --replies-csv "$replies" \
    --fixtures-csv "$fixtures" \
    --sentiment-backend transformers \
    --transformers-model "$model" \
    --export-transformers-details \
    --write-augmented-csv \
    --write-combined-csv \
    --save-plots
}
|
||
|
|
|
||
|
|
# Plot graphs from labeled sentiment CSV
#
# Runs `./.venv/bin/python -m src.plot_labeled` from the current directory.
# Arguments (positional, all optional; empty or unset falls back to default):
#   $1  labeled CSV, passed as --input    (default: data/labeled_sentiment.csv)
#   $2  output dir, passed as --out-dir   (default: data)
# Returns: exit status of the python process.
plot_labeled() {
  local labeled_csv="${1:-data/labeled_sentiment.csv}"
  local out_dir="${2:-data}"

  ./.venv/bin/python -m src.plot_labeled \
    --input "$labeled_csv" \
    --out-dir "$out_dir"
}
|
||
|
|
|
||
|
|
# Merge labeled CSV back into posts/replies to reuse analyzer plots
#
# Step 1 runs `./.venv/bin/python -m src.apply_labels`; step 2 runs
# `./run_scraper.sh analyze` on the merged outputs, using $FIXTURES_CSV and
# $TAGS_CFG from the environment.
# Arguments (positional, all optional; empty or unset falls back to default):
#   $1  labeled CSV        (default: data/labeled_sentiment.csv)
#   $2  posts input CSV    (default: $POSTS_CSV)
#   $3  replies input CSV  (default: $REPLIES_CSV)
#   $4  merged posts output   (default: data/premier_league_update_with_labels.csv)
#   $5  merged replies output (default: data/premier_league_replies_with_labels.csv)
# Returns: the merge step's exit status if it fails (analyze is then skipped),
#          otherwise the exit status of the analyze step.
# NOTE(review): unlike the other analyze wrappers, no --channel is passed
# here -- confirm that is intentional.
apply_labels_and_analyze() {
  local labeled_csv="${1:-data/labeled_sentiment.csv}"
  local posts_in="${2:-$POSTS_CSV}"
  local replies_in="${3:-$REPLIES_CSV}"
  local posts_out="${4:-data/premier_league_update_with_labels.csv}"
  local replies_out="${5:-data/premier_league_replies_with_labels.csv}"

  # Stop if the merge fails: analyzing would otherwise read stale or missing
  # *_with_labels.csv files and silently report on the wrong data.
  ./.venv/bin/python -m src.apply_labels \
    --labeled-csv "$labeled_csv" \
    --posts-csv "$posts_in" \
    --replies-csv "$replies_in" \
    --posts-out "$posts_out" \
    --replies-out "$replies_out" || {
    local rc=$?
    printf 'apply_labels_and_analyze: label merge failed (exit %d); skipping analyze\n' "$rc" >&2
    return "$rc"
  }

  # Reuse analyzer with the merged CSVs; it will pick up sentiment_label if present
  ./run_scraper.sh analyze \
    -i "$posts_out" \
    --replies-csv "$replies_out" \
    --fixtures-csv "$FIXTURES_CSV" \
    --tags-config "$TAGS_CFG" \
    --write-augmented-csv \
    --write-combined-csv \
    --save-plots
}
|
||
|
|
|
||
|
|
# Auto-label sentiment without manual annotation (VADER backend)
#
# Runs `./.venv/bin/python -m src.auto_label_sentiment --backend vader` with
# fixed thresholds (--vader-pos 0.05, --vader-neg -0.05, --vader-margin 0.20)
# and --only-confident.
# Arguments (positional, all optional; empty or unset falls back to default):
#   $1  posts CSV    (default: $POSTS_CSV)
#   $2  replies CSV  (default: $REPLIES_CSV)
#   $3  output CSV, passed as -o (default: data/labeled_sentiment.csv)
# Returns: exit status of the python process.
auto_label_vader() {
  local posts="${1:-$POSTS_CSV}"
  local replies="${2:-$REPLIES_CSV}"
  local out="${3:-data/labeled_sentiment.csv}"

  ./.venv/bin/python -m src.auto_label_sentiment \
    --posts-csv "$posts" \
    --replies-csv "$replies" \
    --backend vader \
    --vader-pos 0.05 \
    --vader-neg -0.05 \
    --vader-margin 0.20 \
    --only-confident \
    -o "$out"
}
|
||
|
|
|
||
|
|
# Auto-label sentiment using a pretrained transformers model
#
# Runs `./.venv/bin/python -m src.auto_label_sentiment --backend transformers`
# with fixed thresholds (--min-prob 0.6, --min-margin 0.2) and --only-confident.
# Arguments (positional, all optional; empty or unset falls back to default):
#   $1  posts CSV    (default: $POSTS_CSV)
#   $2  replies CSV  (default: $REPLIES_CSV)
#   $3  value for --transformers-model
#       (default: cardiffnlp/twitter-roberta-base-sentiment-latest)
#   $4  output CSV, passed as -o (default: data/labeled_sentiment.csv)
# Returns: exit status of the python process.
auto_label_transformers() {
  local posts="${1:-$POSTS_CSV}"
  local replies="${2:-$REPLIES_CSV}"
  local model="${3:-cardiffnlp/twitter-roberta-base-sentiment-latest}"
  local out="${4:-data/labeled_sentiment.csv}"

  ./.venv/bin/python -m src.auto_label_sentiment \
    --posts-csv "$posts" \
    --replies-csv "$replies" \
    --backend transformers \
    --transformers-model "$model" \
    --min-prob 0.6 \
    --min-margin 0.2 \
    --only-confident \
    -o "$out"
}
|
||
|
|
|
||
|
|
# Train a transformers model with the project venv
#
# Runs `./.venv/bin/python -m src.train_sentiment` with fixed --epochs 3 and
# --batch-size 16.
# Arguments (positional, all optional; empty or unset falls back to default):
#   $1  training CSV, passed as --train-csv (default: data/labeled_sentiment.csv)
#   $2  value for --text-col                (default: message)
#   $3  value for --label-col               (default: label)
#   $4  value for --model-name              (default: distilbert-base-uncased)
#   $5  value for --output-dir              (default: models/sentiment-distilbert)
# Returns: exit status of the python process.
train_transformers() {
  local train_csv="${1:-data/labeled_sentiment.csv}"
  local text_col="${2:-message}"
  local label_col="${3:-label}"
  local base_model="${4:-distilbert-base-uncased}"
  local out_dir="${5:-models/sentiment-distilbert}"

  ./.venv/bin/python -m src.train_sentiment \
    --train-csv "$train_csv" \
    --text-col "$text_col" \
    --label-col "$label_col" \
    --model-name "$base_model" \
    --output-dir "$out_dir" \
    --epochs 3 \
    --batch-size 16
}
|
||
|
|
|
||
|
|
# Evaluate a fine-tuned transformers model
#
# Runs `./.venv/bin/python -m src.eval_sentiment` from the current directory.
# Arguments (positional, all optional; empty or unset falls back to default):
#   $1  evaluation CSV, passed as --csv  (default: data/labeled_holdout.csv)
#   $2  value for --text-col             (default: message)
#   $3  value for --label-col            (default: label)
#   $4  model dir, passed as --model     (default: models/sentiment-distilbert)
# Returns: exit status of the python process.
eval_transformers() {
  local csv="${1:-data/labeled_holdout.csv}"
  local text_col="${2:-message}"
  local label_col="${3:-label}"
  local model_dir="${4:-models/sentiment-distilbert}"

  ./.venv/bin/python -m src.eval_sentiment \
    --csv "$csv" \
    --text-col "$text_col" \
    --label-col "$label_col" \
    --model "$model_dir"
}
|
||
|
|
|
||
|
|
# Build a labeling CSV from existing posts+replies
#
# Runs `./.venv/bin/python -m src.make_labeling_set` from the current directory.
# Arguments (positional, all optional; empty or unset falls back to default):
#   $1  posts CSV    (default: $POSTS_CSV)
#   $2  replies CSV  (default: $REPLIES_CSV)
#   $3  output CSV, passed as -o   (default: data/labeled_sentiment.csv)
#   $4  value for --sample-size    (default: 1000)
# Returns: exit status of the python process.
# NOTE(review): the default output path is the same file the auto_label_*
# helpers write to -- confirm whether a run with defaults can overwrite
# existing labels.
make_label_set() {
  local posts="${1:-$POSTS_CSV}"
  local replies="${2:-$REPLIES_CSV}"
  local out="${3:-data/labeled_sentiment.csv}"
  local n="${4:-1000}"

  ./.venv/bin/python -m src.make_labeling_set \
    --posts-csv "$posts" \
    --replies-csv "$replies" \
    --sample-size "$n" \
    -o "$out"
}
|
||
|
|
|
||
|
|
# One-shot daily pipeline: fast replies then combined analyze
#
# Calls fast_replies, then analyze_emoji (mode "keep") with $TAGS_CFG and
# $FIXTURES_CSV from the environment.
# Arguments (positional, all optional; empty or unset falls back to default):
#   $1  channel             (default: $CH)
#   $2  posts CSV           (default: $POSTS_CSV)
#   $3  replies CSV         (default: $REPLIES_CSV)
#   $4  replies concurrency (default: 15)
# Returns: the fast_replies status if it fails (analysis is then skipped),
#          otherwise the status of analyze_emoji.
run_daily() {
  local ch="${1:-$CH}"
  local posts="${2:-$POSTS_CSV}"
  local replies="${3:-$REPLIES_CSV}"
  local conc="${4:-15}"

  # Do not analyze after a failed fetch: the report would be built from an
  # incomplete replies CSV and the failure would be hidden by analyze's status.
  fast_replies "$ch" "$posts" "$replies" "$conc" "$SESSION_DIR/telegram_replies" || {
    local rc=$?
    printf 'run_daily: fast_replies failed (exit %d); skipping analysis\n' "$rc" >&2
    return "$rc"
  }

  analyze_emoji "$posts" "$replies" "$TAGS_CFG" "$FIXTURES_CSV" "$ch" keep
}
|
||
|
|
|
||
|
|
# One-shot daily pipeline with forwards in parallel
#
# Starts fast_replies and chunked_forwards as background jobs, waits for both,
# then calls analyze_emoji (mode "keep") with $TAGS_CFG and $FIXTURES_CSV.
# Arguments (positional, all optional; empty or unset falls back to default):
#   $1   channel               (default: $CH)
#   $2   posts CSV             (default: $POSTS_CSV)
#   $3   replies CSV           (default: $REPLIES_CSV)
#   $4   forwards CSV          (default: $FORWARDS_CSV)
#   $5   replies concurrency   (default: 15)
#   $6   forwards scan limit   (default: 20000)
#   $7   forwards concurrency  (default: 10)
#   $8   forwards chunk size   (default: 1500)
#   $9   replies session name  (default: $SESSION_DIR/telegram_replies)
#   $10  forwards session name (default: $SESSION_DIR/telegram_forwards)
# Returns:
#   - the replies status if the replies job fails (analysis is skipped);
#   - otherwise the analysis status if analysis fails;
#   - otherwise the forwards status (0 when everything succeeded).
run_daily_with_forwards() {
  local ch="${1:-$CH}"
  local posts="${2:-$POSTS_CSV}"
  local replies="${3:-$REPLIES_CSV}"
  local forwards="${4:-$FORWARDS_CSV}"
  local rep_conc="${5:-15}"
  local f_scan="${6:-20000}"
  local f_conc="${7:-10}"
  local f_chunk="${8:-1500}"
  local sess_r="${9:-$SESSION_DIR/telegram_replies}"
  local sess_f="${10:-$SESSION_DIR/telegram_forwards}"

  # Launch replies and forwards in parallel with separate sessions
  local pid_r pid_f
  fast_replies "$ch" "$posts" "$replies" "$rep_conc" "$sess_r" & pid_r=$!
  chunked_forwards "$ch" "$posts" "$forwards" "$f_scan" "$f_conc" "$f_chunk" "$sess_f" & pid_f=$!

  # Always reap both jobs, recording each status instead of discarding it.
  local rc_r=0 rc_f=0 rc_a=0
  wait "$pid_r" || rc_r=$?
  wait "$pid_f" || rc_f=$?

  if (( rc_f != 0 )); then
    printf 'run_daily_with_forwards: chunked_forwards failed (exit %d)\n' "$rc_f" >&2
  fi
  if (( rc_r != 0 )); then
    # The analysis reads the replies CSV, so a failed fetch makes it unreliable.
    printf 'run_daily_with_forwards: fast_replies failed (exit %d); skipping analysis\n' "$rc_r" >&2
    return "$rc_r"
  fi

  # The analysis below is not given the forwards CSV, so it still runs after a
  # forwards failure; that failure is reported through the return status.
  analyze_emoji "$posts" "$replies" "$TAGS_CFG" "$FIXTURES_CSV" "$ch" keep || rc_a=$?
  if (( rc_a != 0 )); then
    return "$rc_a"
  fi
  return "$rc_f"
}
|
||
|
|
|
||
|
|
# End-to-end, non-interactive pipeline (from scratch): scrape -> replies -> fixtures -> analyze
# Requirements:
# - .env has TELEGRAM_API_ID and TELEGRAM_API_HASH (and TELEGRAM_2FA_PASSWORD if 2FA is enabled)
# - CH/POSTS_CSV/REPLIES_CSV/FIXTURES_CSV/TAGS_CFG are set (defaults are defined above)
# - Provide optional start/end dates; defaults use FIXTURES_START_DATE/FIXTURES_END_DATE
# - Choose sentiment backend via arg 11: vader | transformers | gpt (default: transformers)
#
# Arguments (positional, all optional; empty or unset falls back to default):
#   $1   channel                        (default: $CH)
#   $2   start date                     (default: $FIXTURES_START_DATE)
#   $3   end date                       (default: $FIXTURES_END_DATE)
#   $4   posts CSV                      (default: $POSTS_CSV)
#   $5   replies CSV                    (default: $REPLIES_CSV)
#   $6   fixtures CSV                   (default: $FIXTURES_CSV)
#   $7   tags config                    (default: $TAGS_CFG)
#   $8   scrape session name            (default: $SESSION_DIR/telegram_scrape)
#   $9   replies session name           (default: $SESSION_DIR/telegram_replies)
#   $10  replies concurrency            (default: 15)
#   $11  sentiment backend              (default: transformers)
#   $12  value for --transformers-model (default: models/sentiment-distilbert)
#   $13  value for --gpt-model          (default: llama3)
#   $14  value for --gpt-base-url       (default: http://localhost:11434)
# Returns: the exit status of the first failing step (later steps are
#          skipped), otherwise the status of the analyze step.
run_all() {
  local ch="${1:-$CH}"
  local start="${2:-$FIXTURES_START_DATE}"
  local end="${3:-$FIXTURES_END_DATE}"
  local posts="${4:-$POSTS_CSV}"
  local replies="${5:-$REPLIES_CSV}"
  local fixtures="${6:-$FIXTURES_CSV}"
  local tags="${7:-$TAGS_CFG}"
  local sess_scrape="${8:-$SESSION_DIR/telegram_scrape}"
  local sess_replies="${9:-$SESSION_DIR/telegram_replies}"
  local rep_conc="${10:-15}"
  local backend="${11:-transformers}" # vader | transformers | gpt
  local model="${12:-models/sentiment-distilbert}"
  local gpt_model="${13:-llama3}"
  local gpt_url="${14:-http://localhost:11434}"

  # Resolve backend flags first so a mistyped backend is reported before the
  # scraping steps start. Unknown values still fall back to vader, as before.
  local -a backend_args
  case "$backend" in
    transformers)
      backend_args=( --sentiment-backend transformers --transformers-model "$model" --export-transformers-details )
      ;;
    gpt)
      backend_args=( --sentiment-backend gpt --gpt-model "$gpt_model" --gpt-base-url "$gpt_url" )
      ;;
    vader)
      backend_args=( --sentiment-backend vader )
      ;;
    *)
      printf 'run_all: unknown sentiment backend "%s"; falling back to vader\n' "$backend" >&2
      backend_args=( --sentiment-backend vader )
      ;;
  esac

  # Each step feeds the next, so stop at the first failure (`|| return`
  # propagates the failing command's exit status).

  # 1) Scrape posts (overwrite)
  ./run_scraper.sh scrape \
    -c "$ch" \
    -o "$posts" \
    --start-date "$start" \
    --end-date "$end" \
    --session-name "$sess_scrape" || return

  # 2) Fetch replies (resume+append safe)
  ./run_scraper.sh replies \
    -c "$ch" \
    --from-csv "$posts" \
    -o "$replies" \
    --min-replies 1 \
    --concurrency "$rep_conc" \
    --resume \
    --append \
    --session-name "$sess_replies" || return

  # 3) Fetch fixtures for the same period
  ./run_scraper.sh fixtures \
    --start-date "$start" \
    --end-date "$end" \
    -o "$fixtures" || return

  # 4) Analyze with plots (non-interactive)
  local -a args
  args=(
    -i "$posts"
    --tags-config "$tags"
    --replies-csv "$replies"
    --fixtures-csv "$fixtures"
    --write-augmented-csv
    --write-combined-csv
    --emoji-mode keep
    --emoji-boost
    --save-plots
    --plot-width-scale 0.8
    --plot-max-width 120
    --plot-height 8
    --activity-top-n 8
    --labels-stagger-rows 3
  )
  args+=( "${backend_args[@]}" )

  ./run_scraper.sh analyze "${args[@]}"
}
|