fix: Repair data pipeline with StatCan CMHC rental data

- Add StatCan CMHC parser to fetch rental data from Statistics Canada API
- Create year spine (2014-2025) to drive the time dimension instead of census years
- Add CMA-level rental and income intermediate models
- Update mart_neighbourhood_overview to use rental years as base
- Fix neighbourhood_service queries to match dbt schema
- Add CMHC data loading to pipeline script

Data now flows correctly: 158 neighbourhoods × 12 years = 1,896 records
Rent data available 2019-2025, crime data 2014-2024

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-17 15:38:31 -05:00
parent 4818c53fd2
commit d0f32edba7
21 changed files with 955 additions and 156 deletions

View File

@@ -37,7 +37,7 @@ def get_neighbourhoods_geojson(year: int = 2021) -> dict[str, Any]:
ST_AsGeoJSON(geometry)::json as geom,
population,
livability_score
FROM mart_neighbourhood_overview
FROM public_marts.mart_neighbourhood_overview
WHERE year = :year
AND geometry IS NOT NULL
"""

View File

@@ -1,5 +1,6 @@
"""Service layer for querying neighbourhood data from dbt marts."""
import logging
from functools import lru_cache
from typing import Any
@@ -8,6 +9,8 @@ from sqlalchemy import text
from portfolio_app.toronto.models import get_engine
logger = logging.getLogger(__name__)
def _execute_query(sql: str, params: dict[str, Any] | None = None) -> pd.DataFrame:
"""Execute SQL query and return DataFrame.
@@ -23,8 +26,10 @@ def _execute_query(sql: str, params: dict[str, Any] | None = None) -> pd.DataFra
engine = get_engine()
with engine.connect() as conn:
return pd.read_sql(text(sql), conn, params=params)
except Exception:
# Return empty DataFrame on connection or query error
except Exception as e:
logger.error(f"Query failed: {e}")
logger.debug(f"Failed SQL: {sql}")
logger.debug(f"Params: {params}")
return pd.DataFrame()
@@ -56,7 +61,7 @@ def get_overview_data(year: int = 2021) -> pd.DataFrame:
rent_to_income_pct,
avg_rent_2bed,
total_amenities_per_1000
FROM mart_neighbourhood_overview
FROM public_marts.mart_neighbourhood_overview
WHERE year = :year
ORDER BY livability_score DESC NULLS LAST
"""
@@ -95,7 +100,7 @@ def get_housing_data(year: int = 2021) -> pd.DataFrame:
affordability_index,
rent_yoy_change_pct,
income_quintile
FROM mart_neighbourhood_housing
FROM public_marts.mart_neighbourhood_housing
WHERE year = :year
ORDER BY affordability_index ASC NULLS LAST
"""
@@ -112,26 +117,22 @@ def get_safety_data(year: int = 2021) -> pd.DataFrame:
Returns:
DataFrame with columns: neighbourhood_id, neighbourhood_name,
total_crime_rate, violent_crime_rate, property_crime_rate, etc.
total_crime_rate, violent_crimes, property_crimes, etc.
"""
sql = """
SELECT
neighbourhood_id,
neighbourhood_name,
year,
total_crimes,
total_incidents as total_crimes,
crime_rate_per_100k as total_crime_rate,
violent_crimes,
violent_crime_rate,
property_crimes,
property_crime_rate,
theft_crimes,
theft_rate,
crime_yoy_change_pct,
crime_trend
FROM mart_neighbourhood_safety
assault_count + robbery_count + homicide_count as violent_crimes,
break_enter_count + auto_theft_count as property_crimes,
theft_over_count as theft_crimes,
crime_yoy_change_pct
FROM public_marts.mart_neighbourhood_safety
WHERE year = :year
ORDER BY total_crime_rate ASC NULLS LAST
ORDER BY crime_rate_per_100k ASC NULLS LAST
"""
return _execute_query(sql, {"year": year})
@@ -152,22 +153,22 @@ def get_demographics_data(year: int = 2021) -> pd.DataFrame:
SELECT
neighbourhood_id,
neighbourhood_name,
census_year as year,
year,
population,
population_density,
population_change_pct,
median_household_income,
average_household_income,
income_quintile,
income_index,
median_age,
pct_under_18,
pct_18_to_64,
pct_65_plus,
pct_bachelors_or_higher,
age_index,
pct_owner_occupied,
pct_renter_occupied,
education_bachelors_pct as pct_bachelors_or_higher,
unemployment_rate,
diversity_index
FROM mart_neighbourhood_demographics
WHERE census_year = :year
tenure_diversity_index as diversity_index
FROM public_marts.mart_neighbourhood_demographics
WHERE year = :year
ORDER BY population DESC NULLS LAST
"""
return _execute_query(sql, {"year": year})
@@ -183,26 +184,26 @@ def get_amenities_data(year: int = 2021) -> pd.DataFrame:
Returns:
DataFrame with columns: neighbourhood_id, neighbourhood_name,
amenity_score, parks_per_capita, schools_per_capita, transit_score, etc.
amenity_score, parks_per_1000, schools_per_1000, etc.
"""
sql = """
SELECT
neighbourhood_id,
neighbourhood_name,
year,
park_count,
parks_count as park_count,
parks_per_1000,
school_count,
schools_count as school_count,
schools_per_1000,
childcare_count,
childcare_per_1000,
transit_count as childcare_count,
transit_per_1000 as childcare_per_1000,
total_amenities,
total_amenities_per_1000,
amenity_score,
amenity_rank
FROM mart_neighbourhood_amenities
amenity_index as amenity_score,
amenity_tier as amenity_rank
FROM public_marts.mart_neighbourhood_amenities
WHERE year = :year
ORDER BY amenity_score DESC NULLS LAST
ORDER BY amenity_index DESC NULLS LAST
"""
return _execute_query(sql, {"year": year})
@@ -249,17 +250,17 @@ def get_neighbourhood_details(
a.park_count,
a.school_count,
a.total_amenities
FROM mart_neighbourhood_overview o
LEFT JOIN mart_neighbourhood_safety s
FROM public_marts.mart_neighbourhood_overview o
LEFT JOIN public_marts.mart_neighbourhood_safety s
ON o.neighbourhood_id = s.neighbourhood_id
AND o.year = s.year
LEFT JOIN mart_neighbourhood_housing h
LEFT JOIN public_marts.mart_neighbourhood_housing h
ON o.neighbourhood_id = h.neighbourhood_id
AND o.year = h.year
LEFT JOIN mart_neighbourhood_demographics d
LEFT JOIN public_marts.mart_neighbourhood_demographics d
ON o.neighbourhood_id = d.neighbourhood_id
AND o.year = d.census_year
LEFT JOIN mart_neighbourhood_amenities a
LEFT JOIN public_marts.mart_neighbourhood_amenities a
ON o.neighbourhood_id = a.neighbourhood_id
AND o.year = a.year
WHERE o.neighbourhood_id = :neighbourhood_id
@@ -288,7 +289,7 @@ def get_neighbourhood_list(year: int = 2021) -> list[dict[str, Any]]:
neighbourhood_id,
neighbourhood_name,
population
FROM mart_neighbourhood_overview
FROM public_marts.mart_neighbourhood_overview
WHERE year = :year
ORDER BY neighbourhood_name
"""
@@ -317,19 +318,19 @@ def get_rankings(
"""
# Map metrics to their source tables
table_map = {
"livability_score": "mart_neighbourhood_overview",
"safety_score": "mart_neighbourhood_overview",
"affordability_score": "mart_neighbourhood_overview",
"amenity_score": "mart_neighbourhood_overview",
"crime_rate_per_100k": "mart_neighbourhood_safety",
"total_crime_rate": "mart_neighbourhood_safety",
"avg_rent_2bed": "mart_neighbourhood_housing",
"affordability_index": "mart_neighbourhood_housing",
"population": "mart_neighbourhood_demographics",
"median_household_income": "mart_neighbourhood_demographics",
"livability_score": "public_marts.mart_neighbourhood_overview",
"safety_score": "public_marts.mart_neighbourhood_overview",
"affordability_score": "public_marts.mart_neighbourhood_overview",
"amenity_score": "public_marts.mart_neighbourhood_overview",
"crime_rate_per_100k": "public_marts.mart_neighbourhood_safety",
"total_crime_rate": "public_marts.mart_neighbourhood_safety",
"avg_rent_2bed": "public_marts.mart_neighbourhood_housing",
"affordability_index": "public_marts.mart_neighbourhood_housing",
"population": "public_marts.mart_neighbourhood_demographics",
"median_household_income": "public_marts.mart_neighbourhood_demographics",
}
table = table_map.get(metric, "mart_neighbourhood_overview")
table = table_map.get(metric, "public_marts.mart_neighbourhood_overview")
year_col = "census_year" if "demographics" in table else "year"
order = "ASC" if ascending else "DESC"
@@ -375,7 +376,7 @@ def get_city_averages(year: int = 2021) -> dict[str, Any]:
AVG(crime_rate_per_100k) as avg_crime_rate,
AVG(avg_rent_2bed) as avg_rent_2bed,
AVG(rent_to_income_pct) as avg_rent_to_income
FROM mart_neighbourhood_overview
FROM public_marts.mart_neighbourhood_overview
WHERE year = :year
"""
df = _execute_query(sql, {"year": year})