staging #96

Merged
lmiranda merged 90 commits from staging into main 2026-02-01 21:33:13 +00:00
2 changed files with 158 additions and 1 deletions
Showing only changes of commit c3de98c4a5 - Show all commits

View File

@@ -1,4 +1,4 @@
.PHONY: setup docker-up docker-down db-init load-data run test dbt-run dbt-test lint format ci deploy clean help logs run-detached etl-toronto
.PHONY: setup docker-up docker-down db-init load-data seed-amenities run test dbt-run dbt-test lint format ci deploy clean help logs run-detached etl-toronto
# Default target
.DEFAULT_GOAL := help
@@ -87,6 +87,10 @@ load-data-only: ## Load Toronto data without running dbt
@echo "$(GREEN)Loading Toronto data (skip dbt)...$(NC)"
$(PYTHON) scripts/data/load_toronto_data.py --skip-dbt
# Runs scripts/data/seed_amenity_data.py with $(PYTHON) after printing a status line.
# Per the help text below, this is meant to be run after the load-data target.
seed-amenities: ## Seed sample amenity data (run after load-data)
@echo "$(GREEN)Seeding amenity data...$(NC)"
$(PYTHON) scripts/data/seed_amenity_data.py
# =============================================================================
# Application
# =============================================================================

View File

@@ -0,0 +1,153 @@
#!/usr/bin/env python3
"""Seed sample amenity data for development/testing.
This script populates fact_amenities with sample data and updates
dim_neighbourhood with population from fact_census, then runs dbt
to rebuild the marts.
Usage:
python scripts/data/seed_amenity_data.py
Run this after load_toronto_data.py if amenity data is missing.
"""
import os
import random
import subprocess
import sys
from pathlib import Path
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
# Walk three parents up from this file to reach the directory that holds
# the .env file and the dbt/ project used further down.
PROJECT_ROOT = Path(__file__).parent.parent.parent

# Pull variables from the project's .env file into the process environment
# before reading the connection string.
load_dotenv(PROJECT_ROOT / ".env")

# Connection string consumed by create_engine(); the script cannot do
# anything useful without it, so abort immediately when it is absent or empty.
DATABASE_URL = os.getenv("DATABASE_URL")
if DATABASE_URL is None or DATABASE_URL == "":
    print("ERROR: DATABASE_URL not set in .env")
    raise SystemExit(1)
def seed_amenities(year: int = 2024) -> int:
    """Replace the contents of fact_amenities with random sample rows.

    Reads every ``neighbourhood_id`` from ``public.dim_neighbourhood``, then,
    in one transaction, deletes all rows of ``public.fact_amenities`` and
    inserts one row per (neighbourhood, amenity type) pair. Each row gets a
    random ``count`` between 1 and 50 inclusive, so results differ between
    runs.

    Args:
        year: Value written to the ``year`` column of every inserted row.

    Returns:
        The number of rows inserted.
    """
    amenity_types = [
        "Parks",
        "Schools",
        "Transit Stops",
        "Libraries",
        "Community Centres",
        "Recreation",
    ]

    # Built once; the same statement is reused for every row.
    insert_stmt = text(
        """
        INSERT INTO public.fact_amenities
        (neighbourhood_id, amenity_type, count, year)
        VALUES (:neighbourhood_id, :amenity_type, :count, :year)
        """
    )

    engine = create_engine(DATABASE_URL)
    try:
        with engine.connect() as conn:
            result = conn.execute(
                text("SELECT neighbourhood_id FROM public.dim_neighbourhood")
            )
            neighbourhood_ids = [row[0] for row in result]

        print(f"Found {len(neighbourhood_ids)} neighbourhoods")

        rows = [
            {
                "neighbourhood_id": n_id,
                "amenity_type": amenity_type,
                "count": random.randint(1, 50),
                "year": year,
            }
            for n_id in neighbourhood_ids
            for amenity_type in amenity_types
        ]

        # engine.begin() commits on success and rolls back on error, so the
        # DELETE and the INSERTs succeed or fail together.
        with engine.begin() as conn:
            conn.execute(text("DELETE FROM public.fact_amenities"))
            # Passing a list of dicts sends all rows in a single execute call.
            # Skip it when there is nothing to insert, because the statement
            # requires bound values.
            if rows:
                conn.execute(insert_stmt, rows)
    finally:
        # Release pooled connections held by this short-lived engine.
        engine.dispose()

    total = len(rows)
    print(f"Inserted {total} amenity records")
    return total
def update_population(census_year: int = 2021) -> int:
    """Copy population figures from fact_census into dim_neighbourhood.

    Runs a single ``UPDATE ... FROM`` that sets
    ``public.dim_neighbourhood.population`` from the ``public.fact_census``
    row with the same ``neighbourhood_id`` and the requested census year.

    Args:
        census_year: Census year whose population values are copied.
            Defaults to 2021.

    Returns:
        The row count reported by the database for the UPDATE.
    """
    update_stmt = text(
        """
        UPDATE public.dim_neighbourhood dn
        SET population = fc.population
        FROM public.fact_census fc
        WHERE dn.neighbourhood_id = fc.neighbourhood_id
        AND fc.census_year = :census_year
        """
    )

    engine = create_engine(DATABASE_URL)
    try:
        # engine.begin() commits the UPDATE when the block exits cleanly.
        with engine.begin() as conn:
            result = conn.execute(update_stmt, {"census_year": census_year})
            count = int(result.rowcount)
    finally:
        # Release pooled connections held by this short-lived engine.
        engine.dispose()

    print(f"Updated {count} neighbourhoods with population")
    return count
def run_dbt() -> bool:
    """Run dbt to rebuild the amenity mart and the models it depends on.

    Invokes ``dbt run --select +mart_neighbourhood_amenities`` from the
    project's ``dbt`` directory, which is also used as the profiles
    directory. The ``dbt`` executable inside ``.venv/bin`` is used when it
    exists; otherwise ``dbt`` is resolved through ``PATH``.

    Returns:
        True when dbt exits with status 0. False when dbt exits non-zero or
        cannot be launched at all; in both cases a message is printed.
    """
    dbt_dir = PROJECT_ROOT / "dbt"
    venv_dbt = PROJECT_ROOT / ".venv" / "bin" / "dbt"
    dbt_cmd = str(venv_dbt) if venv_dbt.exists() else "dbt"

    print("Running dbt to rebuild marts...")

    try:
        result = subprocess.run(
            [
                dbt_cmd,
                "run",
                "--profiles-dir",
                str(dbt_dir),
                "--select",
                "+mart_neighbourhood_amenities",
            ],
            cwd=dbt_dir,
            capture_output=True,
            text=True,
            env=os.environ.copy(),
        )
    except OSError as exc:
        # Raised when the executable or the working directory is missing.
        # Report it through the same False return used for a failed run
        # instead of letting a traceback escape.
        print(f"dbt failed:\ncould not launch {dbt_cmd!r}: {exc}")
        return False

    if result.returncode != 0:
        # Output was captured, so surface both streams for diagnosis.
        print(f"dbt failed:\n{result.stdout}\n{result.stderr}")
        return False

    print("dbt completed successfully")
    return True
def main() -> int:
    """Run the seeding steps in order and report an exit status.

    Seeds the amenity rows, refreshes neighbourhood populations, then
    rebuilds the marts with dbt.

    Returns:
        0 when the dbt step succeeds, 1 when it fails.
    """
    print("Seeding amenity data...")

    seed_amenities()
    update_population()

    dbt_ok = run_dbt()
    if dbt_ok:
        print("\nDone! Amenity data is ready.")
        return 0
    return 1
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())