feat: Sprint 10 - Architecture docs, CI/CD, operational scripts

Phase 1 - Architecture Documentation:
- Add Architecture section with Mermaid flowchart to README
- Create docs/DATABASE_SCHEMA.md with full ERD

Phase 2 - CI/CD:
- Add CI badge to README
- Create .gitea/workflows/ci.yml for linting and tests
- Create .gitea/workflows/deploy-staging.yml
- Create .gitea/workflows/deploy-production.yml

Phase 3 - Operational Scripts:
- Create scripts/logs.sh for docker compose log following
- Create scripts/run-detached.sh with health check loop
- Create scripts/etl/toronto.sh for Toronto data pipeline
- Add Makefile targets: logs, run-detached, etl-toronto

Phase 4 - Runbooks:
- Create docs/runbooks/adding-dashboard.md
- Create docs/runbooks/deployment.md

Phase 5 - Hygiene:
- Create MIT LICENSE file

Phase 6 - Production:
- Add live demo link to README (leodata.science)

Closes #78, #79, #80, #81, #82, #83, #84, #85, #86, #87, #88, #89, #91

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-17 17:10:30 -05:00
parent d0f32edba7
commit bf6e392002
12 changed files with 1067 additions and 1 deletions
--- a/scripts/etl/toronto.sh
+++ b/scripts/etl/toronto.sh
@@ -0,0 +1,72 @@
+#!/usr/bin/env bash
+# scripts/etl/toronto.sh - Run Toronto data pipeline
+#
+# Usage:
+#   ./scripts/etl/toronto.sh --full        # Complete reload of all data
+#   ./scripts/etl/toronto.sh --incremental # Re-run dbt only (new-data loading is not implemented yet)
+#   ./scripts/etl/toronto.sh               # Default: incremental
+#
+# Logs are written to .dev/logs/etl/
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+LOG_DIR="$PROJECT_ROOT/.dev/logs/etl"
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+LOG_FILE="$LOG_DIR/toronto_${TIMESTAMP}.log"
+
+MODE="${1:---incremental}"
+
+mkdir -p "$LOG_DIR"
+
+log() {
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
+}
+
+log "Starting Toronto ETL pipeline (mode: $MODE)"
+log "Log file: $LOG_FILE"
+
+cd "$PROJECT_ROOT"
+
+# Activate virtual environment if it exists
+if [ -d ".venv" ]; then
+    source .venv/bin/activate
+    log "Activated virtual environment"
+fi
+
+case "$MODE" in
+    --full)
+        log "Running FULL data reload..."
+
+        log "Step 1/4: Parsing neighbourhood data..."
+        python -m portfolio_app.toronto.parsers.neighbourhoods 2>&1 | tee -a "$LOG_FILE"
+
+        log "Step 2/4: Parsing census data..."
+        python -m portfolio_app.toronto.parsers.census 2>&1 | tee -a "$LOG_FILE"
+
+        log "Step 3/4: Parsing crime data..."
+        python -m portfolio_app.toronto.parsers.crime 2>&1 | tee -a "$LOG_FILE"
+
+        log "Step 4/4: Running dbt transformations..."
+        cd dbt && dbt run --full-refresh --profiles-dir . 2>&1 | tee -a "$LOG_FILE" && cd ..
+        ;;
+
+    --incremental)
+        log "Running INCREMENTAL update..."
+
+        log "Step 1/2: Checking for new data..."
+        # Add incremental logic here when implemented
+
+        log "Step 2/2: Running dbt transformations..."
+        cd dbt && dbt run --profiles-dir . 2>&1 | tee -a "$LOG_FILE" && cd ..
+        ;;
+
+    *)
+        log "ERROR: Unknown mode '$MODE'. Use --full or --incremental"
+        exit 1
+        ;;
+esac
+
+log "Toronto ETL pipeline completed successfully"
+log "Full log available at: $LOG_FILE"