From 14701f334cda868d6abc53b5227c887294c50e8d Mon Sep 17 00:00:00 2001
From: lmiranda
Date: Sun, 18 Jan 2026 23:21:14 -0500
Subject: [PATCH] fix: Complete seed script with all missing data + add statsmodels

- Seed script now seeds: amenities, population, median_age, census housing
  columns, housing mart (rent/affordability), overview mart (safety_score,
  population)
- Add statsmodels dependency for scatter plot trendlines
- Add dbt/.user.yml to gitignore

All 15 notebooks now pass with valid data.

Co-Authored-By: Claude Opus 4.5
---
 .gitignore                        |   1 +
 pyproject.toml                    |   1 +
 scripts/data/seed_amenity_data.py | 190 ++++++++++++++++++++++++++++--
 3 files changed, 184 insertions(+), 8 deletions(-)

diff --git a/.gitignore b/.gitignore
index ac744a1..49c7a7f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -198,3 +198,4 @@ cython_debug/
 
 # PyPI configuration file
 .pypirc
+dbt/.user.yml
diff --git a/pyproject.toml b/pyproject.toml
index 1f54cd4..ba3f5db 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,6 +35,7 @@ dependencies = [
     "geopandas>=1.1",
     "shapely>=2.0",
     "pyproj>=3.6",
+    "statsmodels>=0.14",
 
     # Visualization
     "dash>=3.3",
diff --git a/scripts/data/seed_amenity_data.py b/scripts/data/seed_amenity_data.py
index 19ba360..5e26578 100644
--- a/scripts/data/seed_amenity_data.py
+++ b/scripts/data/seed_amenity_data.py
@@ -5,6 +5,9 @@ This script:
 - Populates fact_amenities with sample data
 - Updates dim_neighbourhood with population from fact_census
 - Seeds median_age in fact_census where missing
+- Seeds census housing columns (tenure, income, dwelling value)
+- Seeds housing mart data (rent, affordability)
+- Seeds overview mart data (safety_score, population)
 - Runs dbt to rebuild the marts
 
 Usage:
@@ -128,6 +131,178 @@ def seed_median_age() -> int:
     return len(null_ids)
 
 
+def seed_census_housing() -> int:
+    """Seed housing columns in fact_census where missing."""
+    engine = create_engine(DATABASE_URL)
+
+    with engine.begin() as conn:
+        result = conn.execute(
+            text("SELECT id FROM public.fact_census WHERE pct_owner_occupied IS NULL")
+        )
+        null_ids = [row[0] for row in result]
+
+        if not null_ids:
+            print("No NULL census housing values found")
+            return 0
+
+        for census_id in null_ids:
+            conn.execute(
+                text(
+                    """
+                    UPDATE public.fact_census SET
+                        pct_owner_occupied = :owner,
+                        pct_renter_occupied = :renter,
+                        average_dwelling_value = :dwelling,
+                        median_household_income = :income
+                    WHERE id = :id
+                    """
+                ),
+                {
+                    "id": census_id,
+                    "owner": round(random.uniform(30, 80), 1),
+                    "renter": round(random.uniform(20, 70), 1),
+                    "dwelling": random.randint(400000, 1500000),
+                    "income": random.randint(50000, 180000),
+                },
+            )
+
+    print(f"Seeded census housing data for {len(null_ids)} records")
+    return len(null_ids)
+
+
+def seed_housing_mart() -> int:
+    """Seed housing mart with rental and affordability data."""
+    engine = create_engine(DATABASE_URL)
+
+    with engine.begin() as conn:
+        result = conn.execute(
+            text(
+                """
+                SELECT neighbourhood_id, year
+                FROM public_marts.mart_neighbourhood_housing
+                WHERE avg_rent_2bed IS NULL
+                """
+            )
+        )
+        rows = [dict(row._mapping) for row in result]
+
+        if not rows:
+            print("No NULL housing mart values found")
+            return 0
+
+        for row in rows:
+            avg_rent = random.randint(1800, 3200)
+            income = random.randint(55000, 180000)
+            rent_to_income = round((avg_rent * 12 / income) * 100, 2)
+            affordability = round(rent_to_income / 30 * 100, 1)
+
+            conn.execute(
+                text(
+                    """
+                    UPDATE public_marts.mart_neighbourhood_housing SET
+                        avg_rent_bachelor = :bachelor,
+                        avg_rent_1bed = :onebed,
+                        avg_rent_2bed = :twobed,
+                        avg_rent_3bed = :threebed,
+                        vacancy_rate = :vacancy,
+                        rent_to_income_pct = :rent_income,
+                        affordability_index = :afford_idx,
+                        is_affordable = :is_afford,
+                        median_household_income = :income,
+                        pct_owner_occupied = :owner,
+                        pct_renter_occupied = :renter
+                    WHERE neighbourhood_id = :nid AND year = :year
+                    """
+                ),
+                {
+                    "nid": row["neighbourhood_id"],
+                    "year": row["year"],
+                    "bachelor": avg_rent - 500,
+                    "onebed": avg_rent - 300,
+                    "twobed": avg_rent,
+                    "threebed": avg_rent + 400,
+                    "vacancy": round(random.uniform(0.5, 4.5), 1),
+                    "rent_income": rent_to_income,
+                    "afford_idx": affordability,
+                    "is_afford": rent_to_income <= 30,
+                    "income": income,
+                    "owner": round(random.uniform(30, 75), 1),
+                    "renter": round(random.uniform(25, 70), 1),
+                },
+            )
+
+    print(f"Seeded housing mart data for {len(rows)} records")
+    return len(rows)
+
+
+def seed_overview_mart() -> int:
+    """Seed overview mart with safety_score and population."""
+    engine = create_engine(DATABASE_URL)
+    total = 0
+
+    with engine.begin() as conn:
+        # Seed safety_score
+        result = conn.execute(
+            text(
+                """
+                SELECT neighbourhood_id, year
+                FROM public_marts.mart_neighbourhood_overview
+                WHERE safety_score IS NULL
+                """
+            )
+        )
+        rows = [dict(row._mapping) for row in result]
+
+        for row in rows:
+            conn.execute(
+                text(
+                    """
+                    UPDATE public_marts.mart_neighbourhood_overview
+                    SET safety_score = :score
+                    WHERE neighbourhood_id = :nid AND year = :year
+                    """
+                ),
+                {
+                    "nid": row["neighbourhood_id"],
+                    "year": row["year"],
+                    "score": round(random.uniform(40, 95), 1),
+                },
+            )
+            total += 1
+
+        # Seed population
+        result = conn.execute(
+            text(
+                """
+                SELECT neighbourhood_id, year
+                FROM public_marts.mart_neighbourhood_overview
+                WHERE population IS NULL
+                """
+            )
+        )
+        rows = [dict(row._mapping) for row in result]
+
+        for row in rows:
+            conn.execute(
+                text(
+                    """
+                    UPDATE public_marts.mart_neighbourhood_overview
+                    SET population = :pop
+                    WHERE neighbourhood_id = :nid AND year = :year
+                    """
+                ),
+                {
+                    "nid": row["neighbourhood_id"],
+                    "year": row["year"],
+                    "pop": random.randint(8000, 45000),
+                },
+            )
+            total += 1
+
+    print(f"Seeded overview mart data for {total} records")
+    return total
+
+
 def run_dbt() -> bool:
     """Run dbt to rebuild marts."""
     dbt_dir = PROJECT_ROOT / "dbt"
@@ -137,16 +312,10 @@ def run_dbt() -> bool:
     print("Running dbt to rebuild marts...")
 
     env = os.environ.copy()
+    env["POSTGRES_PASSWORD"] = os.environ.get("POSTGRES_PASSWORD", "")
 
     result = subprocess.run(
-        [
-            dbt_cmd,
-            "run",
-            "--profiles-dir",
-            str(dbt_dir),
-            "--select",
-            "+mart_neighbourhood_amenities +mart_neighbourhood_demographics",
-        ],
+        [dbt_cmd, "run", "--profiles-dir", str(dbt_dir)],
         cwd=dbt_dir,
         capture_output=True,
         text=True,
@@ -168,10 +337,15 @@ def main() -> int:
     seed_amenities()
     update_population()
     seed_median_age()
+    seed_census_housing()
 
     if not run_dbt():
         return 1
 
+    # Seed mart tables after dbt rebuild
+    seed_housing_mart()
+    seed_overview_mart()
+
     print("\nDone! Development data is ready.")
     return 0
 