personal-portfolio/scripts/etl/toronto.sh

#!/usr/bin/env bash
# scripts/etl/toronto.sh - Run Toronto data pipeline
#
# Usage:
#   ./scripts/etl/toronto.sh --full        # Complete reload of all data
#   ./scripts/etl/toronto.sh --incremental # Only new data since last run
#   ./scripts/etl/toronto.sh               # Default: incremental
#
# Logs are written to .dev/logs/etl/

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
LOG_DIR="$PROJECT_ROOT/.dev/logs/etl"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
LOG_FILE="$LOG_DIR/toronto_${TIMESTAMP}.log"

MODE="${1:---incremental}"

mkdir -p "$LOG_DIR"

log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

log "Starting Toronto ETL pipeline (mode: $MODE)"
log "Log file: $LOG_FILE"

cd "$PROJECT_ROOT"

# Activate virtual environment if it exists
if [ -d ".venv" ]; then
    source .venv/bin/activate
    log "Activated virtual environment"
fi

case "$MODE" in
    --full)
        log "Running FULL data reload..."

        log "Step 1/4: Parsing neighbourhood data..."
        python -m portfolio_app.toronto.parsers.neighbourhoods 2>&1 | tee -a "$LOG_FILE"

        log "Step 2/4: Parsing census data..."
        python -m portfolio_app.toronto.parsers.census 2>&1 | tee -a "$LOG_FILE"

        log "Step 3/4: Parsing crime data..."
        python -m portfolio_app.toronto.parsers.crime 2>&1 | tee -a "$LOG_FILE"

        log "Step 4/4: Running dbt transformations..."
        cd dbt && dbt run --full-refresh --profiles-dir . 2>&1 | tee -a "$LOG_FILE" && cd ..
        ;;

    --incremental)
        log "Running INCREMENTAL update..."

        log "Step 1/2: Checking for new data..."
        # Add incremental logic here when implemented

        log "Step 2/2: Running dbt transformations..."
        cd dbt && dbt run --profiles-dir . 2>&1 | tee -a "$LOG_FILE" && cd ..
        ;;

    *)
        log "ERROR: Unknown mode '$MODE'. Use --full or --incremental"
        exit 1
        ;;
esac

log "Toronto ETL pipeline completed successfully"
log "Full log available at: $LOG_FILE"