diff --git a/scripts/data/seed_amenity_data.py b/scripts/data/seed_amenity_data.py
index 1cf6a5b..19ba360 100644
--- a/scripts/data/seed_amenity_data.py
+++ b/scripts/data/seed_amenity_data.py
@@ -1,14 +1,16 @@
 #!/usr/bin/env python3
-"""Seed sample amenity data for development/testing.
+"""Seed sample data for development/testing.
 
-This script populates fact_amenities with sample data and updates
-dim_neighbourhood with population from fact_census, then runs dbt
-to rebuild the marts.
+This script:
+- Populates fact_amenities with sample data
+- Updates dim_neighbourhood with population from fact_census
+- Seeds median_age in fact_census where missing
+- Runs dbt to rebuild the marts
 
 Usage:
     python scripts/data/seed_amenity_data.py
 
-Run this after load_toronto_data.py if amenity data is missing.
+Run this after load_toronto_data.py to ensure notebooks have data.
 """
 
 import os
@@ -101,8 +103,45 @@ def update_population() -> int:
     return count
 
 
+def seed_median_age() -> int:
+    """Seed median_age in fact_census where missing.
+
+    Dev/test helper: every row whose median_age is NULL gets a random
+    placeholder age between 30 and 50 (inclusive); rows that already
+    have a value are left untouched.
+
+    Returns:
+        Number of census records that were updated.
+    """
+    engine = create_engine(DATABASE_URL)
+
+    # engine.begin() commits on success and rolls back on error, so the
+    # SELECT and the UPDATE below run in a single transaction.
+    with engine.begin() as conn:
+        result = conn.execute(
+            text("SELECT id FROM public.fact_census WHERE median_age IS NULL")
+        )
+        null_ids = [row[0] for row in result]
+
+        if not null_ids:
+            print("No NULL median_age values found")
+            return 0
+
+        # List of parameter sets -> one executemany, not one query per row.
+        conn.execute(
+            text("UPDATE public.fact_census SET median_age = :age WHERE id = :id"),
+            [
+                {"age": random.randint(30, 50), "id": census_id}
+                for census_id in null_ids
+            ],
+        )
+
+    print(f"Seeded median_age for {len(null_ids)} census records")
+    return len(null_ids)
+
+
 def run_dbt() -> bool:
-    """Run dbt to rebuild amenity marts."""
+    """Run dbt to rebuild marts."""
     dbt_dir = PROJECT_ROOT / "dbt"
     venv_dbt = PROJECT_ROOT / ".venv" / "bin" / "dbt"
     dbt_cmd = str(venv_dbt) if venv_dbt.exists() else "dbt"
@@ -118,7 +157,7 @@ def run_dbt() -> bool:
             "--profiles-dir",
             str(dbt_dir),
             "--select",
-            "+mart_neighbourhood_amenities",
+            "+mart_neighbourhood_amenities +mart_neighbourhood_demographics",
         ],
         cwd=dbt_dir,
         capture_output=True,
@@ -136,15 +175,16 @@ def run_dbt() -> bool:
 
 def main() -> int:
     """Main entry point."""
-    print("Seeding amenity data...")
+    print("Seeding development data...")
 
     seed_amenities()
     update_population()
+    seed_median_age()
 
     if not run_dbt():
         return 1
 
-    print("\nDone! Amenity data is ready.")
+    print("\nDone! Development data is ready.")
     return 0
 
 