fix: Add median_age seeding to development data script
Some checks failed
CI / lint-and-test (push) Has been cancelled
Some checks failed
CI / lint-and-test (push) Has been cancelled
Updates seed_amenity_data.py to also seed median_age values in fact_census where missing, ensuring demographics notebooks work.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,14 +1,16 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""Seed sample amenity data for development/testing.
|
"""Seed sample data for development/testing.
|
||||||
|
|
||||||
This script populates fact_amenities with sample data and updates
|
This script:
|
||||||
dim_neighbourhood with population from fact_census, then runs dbt
|
- Populates fact_amenities with sample data
|
||||||
to rebuild the marts.
|
- Updates dim_neighbourhood with population from fact_census
|
||||||
|
- Seeds median_age in fact_census where missing
|
||||||
|
- Runs dbt to rebuild the marts
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
python scripts/data/seed_amenity_data.py
|
python scripts/data/seed_amenity_data.py
|
||||||
|
|
||||||
Run this after load_toronto_data.py if amenity data is missing.
|
Run this after load_toronto_data.py to ensure notebooks have data.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
@@ -101,8 +103,33 @@ def update_population() -> int:
|
|||||||
return count
|
return count
|
||||||
|
|
||||||
|
|
||||||
|
def seed_median_age() -> int:
|
||||||
|
"""Seed median_age in fact_census where missing."""
|
||||||
|
engine = create_engine(DATABASE_URL)
|
||||||
|
|
||||||
|
with engine.begin() as conn:
|
||||||
|
result = conn.execute(
|
||||||
|
text("SELECT id FROM public.fact_census WHERE median_age IS NULL")
|
||||||
|
)
|
||||||
|
null_ids = [row[0] for row in result]
|
||||||
|
|
||||||
|
if not null_ids:
|
||||||
|
print("No NULL median_age values found")
|
||||||
|
return 0
|
||||||
|
|
||||||
|
for census_id in null_ids:
|
||||||
|
age = random.randint(30, 50)
|
||||||
|
conn.execute(
|
||||||
|
text("UPDATE public.fact_census SET median_age = :age WHERE id = :id"),
|
||||||
|
{"age": age, "id": census_id},
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"Seeded median_age for {len(null_ids)} census records")
|
||||||
|
return len(null_ids)
|
||||||
|
|
||||||
|
|
||||||
def run_dbt() -> bool:
|
def run_dbt() -> bool:
|
||||||
"""Run dbt to rebuild amenity marts."""
|
"""Run dbt to rebuild marts."""
|
||||||
dbt_dir = PROJECT_ROOT / "dbt"
|
dbt_dir = PROJECT_ROOT / "dbt"
|
||||||
venv_dbt = PROJECT_ROOT / ".venv" / "bin" / "dbt"
|
venv_dbt = PROJECT_ROOT / ".venv" / "bin" / "dbt"
|
||||||
dbt_cmd = str(venv_dbt) if venv_dbt.exists() else "dbt"
|
dbt_cmd = str(venv_dbt) if venv_dbt.exists() else "dbt"
|
||||||
@@ -118,7 +145,7 @@ def run_dbt() -> bool:
|
|||||||
"--profiles-dir",
|
"--profiles-dir",
|
||||||
str(dbt_dir),
|
str(dbt_dir),
|
||||||
"--select",
|
"--select",
|
||||||
"+mart_neighbourhood_amenities",
|
"+mart_neighbourhood_amenities +mart_neighbourhood_demographics",
|
||||||
],
|
],
|
||||||
cwd=dbt_dir,
|
cwd=dbt_dir,
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
@@ -136,15 +163,16 @@ def run_dbt() -> bool:
|
|||||||
|
|
||||||
def main() -> int:
|
def main() -> int:
|
||||||
"""Main entry point."""
|
"""Main entry point."""
|
||||||
print("Seeding amenity data...")
|
print("Seeding development data...")
|
||||||
|
|
||||||
seed_amenities()
|
seed_amenities()
|
||||||
update_population()
|
update_population()
|
||||||
|
seed_median_age()
|
||||||
|
|
||||||
if not run_dbt():
|
if not run_dbt():
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
print("\nDone! Amenity data is ready.")
|
print("\nDone! Development data is ready.")
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user