diff --git a/Makefile b/Makefile
index 96ce1c9..09ab0d0 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: setup docker-up docker-down db-init load-data run test dbt-run dbt-test lint format ci deploy clean help logs run-detached etl-toronto
+.PHONY: setup docker-up docker-down db-init load-data seed-amenities run test dbt-run dbt-test lint format ci deploy clean help logs run-detached etl-toronto
 
 # Default target
 .DEFAULT_GOAL := help
@@ -87,6 +87,10 @@ load-data-only: ## Load Toronto data without running dbt
 	@echo "$(GREEN)Loading Toronto data (skip dbt)...$(NC)"
 	$(PYTHON) scripts/data/load_toronto_data.py --skip-dbt
 
+seed-amenities: ## Seed sample amenity data (run after load-data)
+	@echo "$(GREEN)Seeding amenity data...$(NC)"
+	$(PYTHON) scripts/data/seed_amenity_data.py
+
 # =============================================================================
 # Application
 # =============================================================================
diff --git a/scripts/data/seed_amenity_data.py b/scripts/data/seed_amenity_data.py
new file mode 100644
index 0000000..1cf6a5b
--- /dev/null
+++ b/scripts/data/seed_amenity_data.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+"""Seed sample amenity data for development/testing.
+
+This script populates fact_amenities with sample data and updates
+dim_neighbourhood with population from fact_census, then runs dbt
+to rebuild the marts.
+
+Usage:
+    python scripts/data/seed_amenity_data.py
+
+Run this after load_toronto_data.py if amenity data is missing.
+"""
+
+import os
+import random
+import subprocess
+import sys
+from pathlib import Path
+
+from dotenv import load_dotenv
+from sqlalchemy import create_engine, text
+
+PROJECT_ROOT = Path(__file__).parent.parent.parent
+load_dotenv(PROJECT_ROOT / ".env")
+
+# Exit immediately when no database URL is configured.
+DATABASE_URL = os.environ.get("DATABASE_URL")
+if not DATABASE_URL:
+    print("ERROR: DATABASE_URL not set in .env")
+    sys.exit(1)
+
+
+def seed_amenities() -> int:
+    """Replace the contents of fact_amenities with random sample rows.
+
+    Reads every neighbourhood_id from public.dim_neighbourhood, deletes
+    all rows in public.fact_amenities, then inserts one row per
+    (neighbourhood, amenity type) pair with a random count in 1..50 for
+    the year 2024. The delete and the inserts run inside a single
+    ``engine.begin()`` block.
+
+    Returns:
+        The number of amenity rows inserted.
+    """
+    engine = create_engine(DATABASE_URL)
+
+    with engine.connect() as conn:
+        result = conn.execute(
+            text("SELECT neighbourhood_id FROM public.dim_neighbourhood")
+        )
+        neighbourhood_ids = [row[0] for row in result]
+
+    print(f"Found {len(neighbourhood_ids)} neighbourhoods")
+
+    amenity_types = [
+        "Parks",
+        "Schools",
+        "Transit Stops",
+        "Libraries",
+        "Community Centres",
+        "Recreation",
+    ]
+    year = 2024
+
+    # One parameter dict per (neighbourhood, amenity type) pair.
+    rows = [
+        {
+            "neighbourhood_id": n_id,
+            "amenity_type": amenity_type,
+            "count": random.randint(1, 50),
+            "year": year,
+        }
+        for n_id in neighbourhood_ids
+        for amenity_type in amenity_types
+    ]
+
+    insert_stmt = text(
+        """
+        INSERT INTO public.fact_amenities
+            (neighbourhood_id, amenity_type, count, year)
+        VALUES (:neighbourhood_id, :amenity_type, :count, :year)
+        """
+    )
+
+    with engine.begin() as conn:
+        conn.execute(text("DELETE FROM public.fact_amenities"))
+        # Passing a list of dicts sends all rows in one executemany
+        # call. Skip it when there are no rows: an empty list would
+        # give the statement no values to bind.
+        if rows:
+            conn.execute(insert_stmt, rows)
+
+    total = len(rows)
+    print(f"Inserted {total} amenity records")
+    return total
+
+
+def update_population() -> int:
+    """Copy 2021 census population into dim_neighbourhood.
+
+    Sets public.dim_neighbourhood.population from the
+    public.fact_census row with the same neighbourhood_id and
+    census_year = 2021.
+
+    Returns:
+        The row count reported by the database for the UPDATE.
+    """
+    engine = create_engine(DATABASE_URL)
+
+    with engine.begin() as conn:
+        result = conn.execute(
+            text(
+                """
+                UPDATE public.dim_neighbourhood dn
+                SET population = fc.population
+                FROM public.fact_census fc
+                WHERE dn.neighbourhood_id = fc.neighbourhood_id
+                  AND fc.census_year = 2021
+                """
+            )
+        )
+        count = int(result.rowcount)
+
+    print(f"Updated {count} neighbourhoods with population")
+    return count
+
+
+def run_dbt() -> bool:
+    """Run dbt to rebuild the amenity mart.
+
+    Uses the dbt executable in the project's .venv when it exists and
+    falls back to ``dbt`` on PATH otherwise. dbt runs from the project's
+    dbt directory, which is also passed as ``--profiles-dir``.
+
+    Returns:
+        True if dbt exited with status 0. False if it exited non-zero
+        or could not be started at all.
+    """
+    dbt_dir = PROJECT_ROOT / "dbt"
+    venv_dbt = PROJECT_ROOT / ".venv" / "bin" / "dbt"
+    dbt_cmd = str(venv_dbt) if venv_dbt.exists() else "dbt"
+
+    print("Running dbt to rebuild marts...")
+
+    env = os.environ.copy()
+
+    try:
+        result = subprocess.run(
+            [
+                dbt_cmd,
+                "run",
+                "--profiles-dir",
+                str(dbt_dir),
+                "--select",
+                "+mart_neighbourhood_amenities",
+            ],
+            cwd=dbt_dir,
+            capture_output=True,
+            text=True,
+            env=env,
+        )
+    except OSError as exc:
+        # Raised when dbt is not installed or dbt_dir does not exist;
+        # report it like any other dbt failure instead of crashing.
+        print(f"dbt failed: could not run {dbt_cmd}: {exc}")
+        return False
+
+    if result.returncode != 0:
+        print(f"dbt failed:\n{result.stdout}\n{result.stderr}")
+        return False
+
+    print("dbt completed successfully")
+    return True
+
+
+def main() -> int:
+    """Seed amenities, refresh population, then rebuild the marts.
+
+    Returns:
+        0 on success, 1 if the dbt run failed.
+    """
+    print("Seeding amenity data...")
+
+    seed_amenities()
+    update_population()
+
+    if not run_dbt():
+        return 1
+
+    print("\nDone! Amenity data is ready.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())