Files
epl_python/scripts/aliases.zsh

339 lines
10 KiB
Bash
Raw Normal View History

# Convenience aliases for daily runs (zsh)
# Source this file in your shell: source scripts/aliases.zsh
# --- Project defaults (edit as needed) ---
# Telegram channel used as the default channel argument by the functions below
export CH="https://t.me/Premier_League_Update"
# Default CSV/config locations (relative paths)
export POSTS_CSV="data/premier_league_update.csv"
export REPLIES_CSV="data/premier_league_replies.csv"
export FORWARDS_CSV="data/premier_league_forwards.csv"
export TAGS_CFG="config/tags.yaml"
# Default fixtures date range (used by run_all)
export FIXTURES_START_DATE="2025-08-15"
export FIXTURES_END_DATE="2025-10-15"
# The fixtures CSV name is derived from the date range above, so editing the
# range cannot leave the file name pointing at a different period.
export FIXTURES_CSV="data/premier_league_schedule_${FIXTURES_START_DATE}_to_${FIXTURES_END_DATE}.csv"
# Sessions directory outside iCloud (avoid sqlite locks)
export SESSION_DIR="$HOME/.local/share/telethon_sessions"
# Created at source time so the session-name paths used below have a parent directory
mkdir -p "$SESSION_DIR"
# --- Aliases (zsh functions) ---
# fast_replies [channel] [posts_csv] [out_csv] [concurrency] [session]
# Runs `./run_scraper.sh replies` for the posts listed in posts_csv, always
# passing --min-replies 1, --resume and --append.
# Omitted or empty arguments fall back to $CH, $POSTS_CSV, $REPLIES_CSV, 15 and
# $SESSION_DIR/telegram_replies respectively.
fast_replies() {
  local channel="${1:-$CH}"
  local posts_csv="${2:-$POSTS_CSV}"
  local replies_csv="${3:-$REPLIES_CSV}"
  local workers="${4:-15}"
  local session="${5:-$SESSION_DIR/telegram_replies}"

  # Assemble the full argument vector, then run it as a single command.
  local cmd=(
    ./run_scraper.sh replies
    -c "$channel"
    --from-csv "$posts_csv"
    -o "$replies_csv"
    --min-replies 1
    --concurrency "$workers"
    --resume
    --append
    --session-name "$session"
  )
  "${cmd[@]}"
}
# chunked_forwards [channel] [posts_csv] [out_csv] [scan_limit] [concurrency] [chunk_size] [session]
# Runs `./run_scraper.sh forwards` with --scan-limit, --concurrency and
# --chunk-size taken from the arguments, always passing --append.
# Defaults: $CH, $POSTS_CSV, $FORWARDS_CSV, 20000, 10, 1500 and
# $SESSION_DIR/telegram_forwards.
chunked_forwards() {
  local channel="${1:-$CH}"
  local posts_csv="${2:-$POSTS_CSV}"
  local forwards_csv="${3:-$FORWARDS_CSV}"
  local scan_limit="${4:-20000}"
  local workers="${5:-10}"
  local chunk_size="${6:-1500}"
  local session="${7:-$SESSION_DIR/telegram_forwards}"

  local cmd=(
    ./run_scraper.sh forwards
    -c "$channel"
    --from-csv "$posts_csv"
    -o "$forwards_csv"
    --scan-limit "$scan_limit"
    --concurrency "$workers"
    --chunk-size "$chunk_size"
    --append
    --session-name "$session"
  )
  "${cmd[@]}"
}
# analyze_combined [posts_csv] [replies_csv] [tags_cfg] [fixtures_csv] [channel]
# Runs `./run_scraper.sh analyze` on posts + replies + fixtures with the tags
# config, requesting augmented CSVs, a combined CSV and saved plots.
# Defaults: $POSTS_CSV, $REPLIES_CSV, $TAGS_CFG, $FIXTURES_CSV, $CH.
analyze_combined() {
  local posts_csv="${1:-$POSTS_CSV}"
  local replies_csv="${2:-$REPLIES_CSV}"
  local tags_cfg="${3:-$TAGS_CFG}"
  local fixtures_csv="${4:-$FIXTURES_CSV}"
  local channel="${5:-$CH}"

  # Tip: add plot sizing/labels, e.g.: --plot-width-scale 0.8 --plot-max-width 120 --plot-height 8 --activity-top-n 8 --labels-stagger-rows 3
  local cmd=(
    ./run_scraper.sh analyze
    -i "$posts_csv"
    --channel "$channel"
    --tags-config "$tags_cfg"
    --replies-csv "$replies_csv"
    --fixtures-csv "$fixtures_csv"
    --write-augmented-csv
    --write-combined-csv
    --save-plots
  )
  "${cmd[@]}"
}
# analyze_emoji [posts_csv] [replies_csv] [tags_cfg] [fixtures_csv] [channel] [emoji_mode]
# Same analyze invocation as the combined run, plus --emoji-mode <emoji_mode>
# and --emoji-boost. emoji_mode defaults to "keep" (keep | demojize | strip).
# Other defaults: $POSTS_CSV, $REPLIES_CSV, $TAGS_CFG, $FIXTURES_CSV, $CH.
analyze_emoji() {
  local posts_csv="${1:-$POSTS_CSV}"
  local replies_csv="${2:-$REPLIES_CSV}"
  local tags_cfg="${3:-$TAGS_CFG}"
  local fixtures_csv="${4:-$FIXTURES_CSV}"
  local channel="${5:-$CH}"
  local emoji_mode="${6:-keep}"

  # Common analyze arguments first ...
  local cmd=(
    ./run_scraper.sh analyze
    -i "$posts_csv"
    --channel "$channel"
    --tags-config "$tags_cfg"
    --replies-csv "$replies_csv"
    --fixtures-csv "$fixtures_csv"
    --write-augmented-csv
    --write-combined-csv
    --save-plots
  )
  # ... then the emoji-specific switches.
  cmd+=( --emoji-mode "$emoji_mode" --emoji-boost )
  "${cmd[@]}"
}
# analyze_transformers [posts_csv] [replies_csv] [tags_cfg] [fixtures_csv] [channel] [model]
# Runs `./run_scraper.sh analyze` with --sentiment-backend transformers, the
# given --transformers-model (default: distilbert-base-uncased) and
# --export-transformers-details, plus the augmented/combined CSV and plot flags.
# Other defaults: $POSTS_CSV, $REPLIES_CSV, $TAGS_CFG, $FIXTURES_CSV, $CH.
analyze_transformers() {
  local posts_csv="${1:-$POSTS_CSV}"
  local replies_csv="${2:-$REPLIES_CSV}"
  local tags_cfg="${3:-$TAGS_CFG}"
  local fixtures_csv="${4:-$FIXTURES_CSV}"
  local channel="${5:-$CH}"
  local hf_model="${6:-distilbert-base-uncased}"

  local cmd=(
    ./run_scraper.sh analyze
    -i "$posts_csv"
    --channel "$channel"
    --tags-config "$tags_cfg"
    --replies-csv "$replies_csv"
    --fixtures-csv "$fixtures_csv"
    --sentiment-backend transformers
    --transformers-model "$hf_model"
    --export-transformers-details
    --write-augmented-csv
    --write-combined-csv
    --save-plots
  )
  "${cmd[@]}"
}
# plot_labeled [labeled_csv] [out_dir]
# Runs the project venv's `python -m src.plot_labeled` on a labeled sentiment CSV.
# Defaults: data/labeled_sentiment.csv and data.
plot_labeled() {
  local input_csv="${1:-data/labeled_sentiment.csv}"
  local plots_dir="${2:-data}"
  local python=./.venv/bin/python

  "$python" -m src.plot_labeled --input "$input_csv" --out-dir "$plots_dir"
}
# apply_labels_and_analyze [labeled_csv] [posts_in] [replies_in] [posts_out] [replies_out]
# Step 1: merge the labeled CSV back into the posts/replies CSVs via
#         `python -m src.apply_labels`, writing posts_out / replies_out.
# Step 2: run `./run_scraper.sh analyze` on the merged CSVs, using the global
#         $FIXTURES_CSV and $TAGS_CFG.
# Defaults: data/labeled_sentiment.csv, $POSTS_CSV, $REPLIES_CSV,
#           data/premier_league_update_with_labels.csv,
#           data/premier_league_replies_with_labels.csv.
# Returns: the merge step's status if it fails (analysis is skipped),
#          otherwise the analyzer's status.
apply_labels_and_analyze() {
  local labeled_csv="${1:-data/labeled_sentiment.csv}"
  local posts_in="${2:-$POSTS_CSV}"
  local replies_in="${3:-$REPLIES_CSV}"
  local posts_out="${4:-data/premier_league_update_with_labels.csv}"
  local replies_out="${5:-data/premier_league_replies_with_labels.csv}"
  local rc=0

  ./.venv/bin/python -m src.apply_labels \
    --labeled-csv "$labeled_csv" \
    --posts-csv "$posts_in" \
    --replies-csv "$replies_in" \
    --posts-out "$posts_out" \
    --replies-out "$replies_out" || rc=$?

  # Without a successful merge the *_out files are missing or stale, so
  # analyzing them would produce misleading results; stop here instead.
  if (( rc != 0 )); then
    printf 'apply_labels_and_analyze: label merge failed (status %d); skipping analysis\n' "$rc" >&2
    return "$rc"
  fi

  # Reuse analyzer with the merged CSVs; it will pick up sentiment_label if present
  ./run_scraper.sh analyze \
    -i "$posts_out" \
    --replies-csv "$replies_out" \
    --fixtures-csv "$FIXTURES_CSV" \
    --tags-config "$TAGS_CFG" \
    --write-augmented-csv \
    --write-combined-csv \
    --save-plots
}
# auto_label_vader [posts_csv] [replies_csv] [out_csv]
# Runs `python -m src.auto_label_sentiment` with --backend vader and fixed
# thresholds (--vader-pos 0.05, --vader-neg -0.05, --vader-margin 0.20),
# passing --only-confident.
# Defaults: $POSTS_CSV, $REPLIES_CSV, data/labeled_sentiment.csv.
auto_label_vader() {
  local posts_csv="${1:-$POSTS_CSV}"
  local replies_csv="${2:-$REPLIES_CSV}"
  local labels_csv="${3:-data/labeled_sentiment.csv}"

  local cmd=(
    ./.venv/bin/python -m src.auto_label_sentiment
    --posts-csv "$posts_csv"
    --replies-csv "$replies_csv"
    --backend vader
    --vader-pos 0.05
    --vader-neg -0.05
    --vader-margin 0.20
    --only-confident
    -o "$labels_csv"
  )
  "${cmd[@]}"
}
# auto_label_transformers [posts_csv] [replies_csv] [model] [out_csv]
# Runs `python -m src.auto_label_sentiment` with --backend transformers,
# --min-prob 0.6, --min-margin 0.2 and --only-confident.
# Defaults: $POSTS_CSV, $REPLIES_CSV,
#           cardiffnlp/twitter-roberta-base-sentiment-latest,
#           data/labeled_sentiment.csv.
auto_label_transformers() {
  local posts_csv="${1:-$POSTS_CSV}"
  local replies_csv="${2:-$REPLIES_CSV}"
  local hf_model="${3:-cardiffnlp/twitter-roberta-base-sentiment-latest}"
  local labels_csv="${4:-data/labeled_sentiment.csv}"

  local cmd=(
    ./.venv/bin/python -m src.auto_label_sentiment
    --posts-csv "$posts_csv"
    --replies-csv "$replies_csv"
    --backend transformers
    --transformers-model "$hf_model"
    --min-prob 0.6
    --min-margin 0.2
    --only-confident
    -o "$labels_csv"
  )
  "${cmd[@]}"
}
# train_transformers [train_csv] [text_col] [label_col] [base_model] [out_dir]
# Runs `python -m src.train_sentiment` from the project venv with
# --epochs 3 and --batch-size 16.
# Defaults: data/labeled_sentiment.csv, message, label,
#           distilbert-base-uncased, models/sentiment-distilbert.
train_transformers() {
  local dataset="${1:-data/labeled_sentiment.csv}"
  local text_field="${2:-message}"
  local label_field="${3:-label}"
  local hf_base="${4:-distilbert-base-uncased}"
  local model_out="${5:-models/sentiment-distilbert}"

  local cmd=(
    ./.venv/bin/python -m src.train_sentiment
    --train-csv "$dataset"
    --text-col "$text_field"
    --label-col "$label_field"
    --model-name "$hf_base"
    --output-dir "$model_out"
    --epochs 3
    --batch-size 16
  )
  "${cmd[@]}"
}
# eval_transformers [csv] [text_col] [label_col] [model_dir]
# Runs `python -m src.eval_sentiment` from the project venv against a labeled CSV.
# Defaults: data/labeled_holdout.csv, message, label, models/sentiment-distilbert.
eval_transformers() {
  local holdout_csv="${1:-data/labeled_holdout.csv}"
  local text_field="${2:-message}"
  local label_field="${3:-label}"
  local model_path="${4:-models/sentiment-distilbert}"

  local cmd=(
    ./.venv/bin/python -m src.eval_sentiment
    --csv "$holdout_csv"
    --text-col "$text_field"
    --label-col "$label_field"
    --model "$model_path"
  )
  "${cmd[@]}"
}
# make_label_set [posts_csv] [replies_csv] [out_csv] [sample_size]
# Runs `python -m src.make_labeling_set` on the existing posts + replies CSVs.
# Defaults: $POSTS_CSV, $REPLIES_CSV, data/labeled_sentiment.csv, 1000.
make_label_set() {
  local posts_csv="${1:-$POSTS_CSV}"
  local replies_csv="${2:-$REPLIES_CSV}"
  local labels_csv="${3:-data/labeled_sentiment.csv}"
  local sample_size="${4:-1000}"

  local cmd=(
    ./.venv/bin/python -m src.make_labeling_set
    --posts-csv "$posts_csv"
    --replies-csv "$replies_csv"
    --sample-size "$sample_size"
    -o "$labels_csv"
  )
  "${cmd[@]}"
}
# run_daily [channel] [posts_csv] [replies_csv] [concurrency]
# One-shot daily pipeline: fast_replies, then analyze_emoji in "keep" mode.
# The replies step always uses the $SESSION_DIR/telegram_replies session;
# the analysis uses the global $TAGS_CFG and $FIXTURES_CSV.
# Defaults: $CH, $POSTS_CSV, $REPLIES_CSV, 15.
# Returns: the status of the first step that failed, or 0. A failed replies
#          step is reported on stderr but the analysis still runs on whatever
#          data is on disk.
run_daily() {
  local ch="${1:-$CH}"
  local posts="${2:-$POSTS_CSV}"
  local replies="${3:-$REPLIES_CSV}"
  local conc="${4:-15}"
  local rc_replies=0 rc_analyze=0

  fast_replies "$ch" "$posts" "$replies" "$conc" "$SESSION_DIR/telegram_replies" || rc_replies=$?
  if (( rc_replies != 0 )); then
    printf 'run_daily: fast_replies failed (status %d); analyzing existing data\n' "$rc_replies" >&2
  fi

  analyze_emoji "$posts" "$replies" "$TAGS_CFG" "$FIXTURES_CSV" "$ch" keep || rc_analyze=$?

  # Surface the scrape failure to callers instead of masking it with the
  # analyzer's status.
  if (( rc_replies != 0 )); then
    return "$rc_replies"
  fi
  return "$rc_analyze"
}
# run_daily_with_forwards [channel] [posts_csv] [replies_csv] [forwards_csv]
#                         [replies_conc] [fwd_scan] [fwd_conc] [fwd_chunk]
#                         [replies_session] [forwards_session]
# Runs fast_replies and chunked_forwards as two background jobs (each with its
# own session name), waits for both, then runs analyze_emoji in "keep" mode
# using the global $TAGS_CFG and $FIXTURES_CSV.
# Defaults: $CH, $POSTS_CSV, $REPLIES_CSV, $FORWARDS_CSV, 15, 20000, 10, 1500,
#           $SESSION_DIR/telegram_replies, $SESSION_DIR/telegram_forwards.
# Returns: the status of the first failing stage (replies, forwards, analyze),
#          or 0. Scrape failures are reported on stderr; the analysis still
#          runs on whatever data is on disk.
run_daily_with_forwards() {
  local ch="${1:-$CH}"
  local posts="${2:-$POSTS_CSV}"
  local replies="${3:-$REPLIES_CSV}"
  local forwards="${4:-$FORWARDS_CSV}"
  local rep_conc="${5:-15}"
  local f_scan="${6:-20000}"
  local f_conc="${7:-10}"
  local f_chunk="${8:-1500}"
  local sess_r="${9:-$SESSION_DIR/telegram_replies}"
  local sess_f="${10:-$SESSION_DIR/telegram_forwards}"
  local pid_r pid_f
  local rc_r=0 rc_f=0 rc_a=0

  # Launch replies and forwards in parallel with separate sessions
  fast_replies "$ch" "$posts" "$replies" "$rep_conc" "$sess_r" &
  pid_r=$!
  chunked_forwards "$ch" "$posts" "$forwards" "$f_scan" "$f_conc" "$f_chunk" "$sess_f" &
  pid_f=$!

  # `wait <pid>` returns that job's exit status; keep both so a failure in
  # either scrape is not silently lost.
  wait "$pid_r" || rc_r=$?
  wait "$pid_f" || rc_f=$?
  if (( rc_r != 0 )); then
    printf 'run_daily_with_forwards: fast_replies failed (status %d)\n' "$rc_r" >&2
  fi
  if (( rc_f != 0 )); then
    printf 'run_daily_with_forwards: chunked_forwards failed (status %d)\n' "$rc_f" >&2
  fi

  # Analyze with emoji handling once both jobs have finished
  analyze_emoji "$posts" "$replies" "$TAGS_CFG" "$FIXTURES_CSV" "$ch" keep || rc_a=$?

  if (( rc_r != 0 )); then
    return "$rc_r"
  fi
  if (( rc_f != 0 )); then
    return "$rc_f"
  fi
  return "$rc_a"
}
# End-to-end, non-interactive pipeline (from scratch): scrape -> replies -> fixtures -> analyze
# Requirements:
# - .env has TELEGRAM_API_ID and TELEGRAM_API_HASH (and TELEGRAM_2FA_PASSWORD if 2FA is enabled)
# - CH/POSTS_CSV/REPLIES_CSV/FIXTURES_CSV/TAGS_CFG are set (defaults are defined above)
# - Provide optional start/end dates; defaults use FIXTURES_START_DATE/FIXTURES_END_DATE
# - Choose sentiment backend via arg 11: vader | transformers | gpt (default: transformers)
# Positional arguments (all optional):
#   1 channel            2 start date        3 end date         4 posts CSV
#   5 replies CSV        6 fixtures CSV      7 tags config      8 scrape session
#   9 replies session   10 replies concurrency (15)            11 backend
#  12 transformers model (models/sentiment-distilbert)
#  13 gpt model (llama3)                     14 gpt base URL (http://localhost:11434)
# Returns: the status of the first stage that fails (later stages are skipped,
#          since each one consumes the previous stage's output), otherwise the
#          analyzer's status.
run_all() {
  local ch="${1:-$CH}"
  local start="${2:-$FIXTURES_START_DATE}"
  local end="${3:-$FIXTURES_END_DATE}"
  local posts="${4:-$POSTS_CSV}"
  local replies="${5:-$REPLIES_CSV}"
  local fixtures="${6:-$FIXTURES_CSV}"
  local tags="${7:-$TAGS_CFG}"
  local sess_scrape="${8:-$SESSION_DIR/telegram_scrape}"
  local sess_replies="${9:-$SESSION_DIR/telegram_replies}"
  local rep_conc="${10:-15}"
  local backend="${11:-transformers}" # vader | transformers | gpt
  local model="${12:-models/sentiment-distilbert}"
  local gpt_model="${13:-llama3}"
  local gpt_url="${14:-http://localhost:11434}"

  # 1) Scrape posts (overwrite)
  ./run_scraper.sh scrape \
    -c "$ch" \
    -o "$posts" \
    --start-date "$start" \
    --end-date "$end" \
    --session-name "$sess_scrape" || return "$?"

  # 2) Fetch replies (resume+append safe)
  ./run_scraper.sh replies \
    -c "$ch" \
    --from-csv "$posts" \
    -o "$replies" \
    --min-replies 1 \
    --concurrency "$rep_conc" \
    --resume \
    --append \
    --session-name "$sess_replies" || return "$?"

  # 3) Fetch fixtures for the same period
  ./run_scraper.sh fixtures \
    --start-date "$start" \
    --end-date "$end" \
    -o "$fixtures" || return "$?"

  # 4) Analyze with plots (non-interactive)
  # --channel is passed for parity with analyze_combined / analyze_emoji /
  # analyze_transformers, which all forward the channel to `analyze`.
  local args=(
    -i "$posts"
    --channel "$ch"
    --tags-config "$tags"
    --replies-csv "$replies"
    --fixtures-csv "$fixtures"
    --write-augmented-csv
    --write-combined-csv
    --emoji-mode keep
    --emoji-boost
    --save-plots
    --plot-width-scale 0.8
    --plot-max-width 120
    --plot-height 8
    --activity-top-n 8
    --labels-stagger-rows 3
  )
  case "$backend" in
    transformers)
      args+=( --sentiment-backend transformers --transformers-model "$model" --export-transformers-details )
      ;;
    gpt)
      args+=( --sentiment-backend gpt --gpt-model "$gpt_model" --gpt-base-url "$gpt_url" )
      ;;
    vader)
      args+=( --sentiment-backend vader )
      ;;
    *)
      # Any other value still falls back to vader, but say so: a typo such as
      # "transformer" would otherwise switch backends without notice.
      printf 'run_all: unknown backend "%s"; falling back to vader\n' "$backend" >&2
      args+=( --sentiment-backend vader )
      ;;
  esac
  ./run_scraper.sh analyze "${args[@]}"
}