fix: Repair data pipeline with StatCan CMHC rental data
- Add StatCan CMHC parser to fetch rental data from Statistics Canada API - Create year spine (2014-2025) as time dimension driver instead of census - Add CMA-level rental and income intermediate models - Update mart_neighbourhood_overview to use rental years as base - Fix neighbourhood_service queries to match dbt schema - Add CMHC data loading to pipeline script Data now flows correctly: 158 neighbourhoods × 12 years = 1,896 records Rent data available 2019-2025, crime data 2014-2024 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
"""Service layer for querying neighbourhood data from dbt marts."""
|
||||
|
||||
import logging
|
||||
from functools import lru_cache
|
||||
from typing import Any
|
||||
|
||||
@@ -8,6 +9,8 @@ from sqlalchemy import text
|
||||
|
||||
from portfolio_app.toronto.models import get_engine
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _execute_query(sql: str, params: dict[str, Any] | None = None) -> pd.DataFrame:
|
||||
"""Execute SQL query and return DataFrame.
|
||||
@@ -23,8 +26,10 @@ def _execute_query(sql: str, params: dict[str, Any] | None = None) -> pd.DataFra
|
||||
engine = get_engine()
|
||||
with engine.connect() as conn:
|
||||
return pd.read_sql(text(sql), conn, params=params)
|
||||
except Exception:
|
||||
# Return empty DataFrame on connection or query error
|
||||
except Exception as e:
|
||||
logger.error(f"Query failed: {e}")
|
||||
logger.debug(f"Failed SQL: {sql}")
|
||||
logger.debug(f"Params: {params}")
|
||||
return pd.DataFrame()
|
||||
|
||||
|
||||
@@ -56,7 +61,7 @@ def get_overview_data(year: int = 2021) -> pd.DataFrame:
|
||||
rent_to_income_pct,
|
||||
avg_rent_2bed,
|
||||
total_amenities_per_1000
|
||||
FROM mart_neighbourhood_overview
|
||||
FROM public_marts.mart_neighbourhood_overview
|
||||
WHERE year = :year
|
||||
ORDER BY livability_score DESC NULLS LAST
|
||||
"""
|
||||
@@ -95,7 +100,7 @@ def get_housing_data(year: int = 2021) -> pd.DataFrame:
|
||||
affordability_index,
|
||||
rent_yoy_change_pct,
|
||||
income_quintile
|
||||
FROM mart_neighbourhood_housing
|
||||
FROM public_marts.mart_neighbourhood_housing
|
||||
WHERE year = :year
|
||||
ORDER BY affordability_index ASC NULLS LAST
|
||||
"""
|
||||
@@ -112,26 +117,22 @@ def get_safety_data(year: int = 2021) -> pd.DataFrame:
|
||||
|
||||
Returns:
|
||||
DataFrame with columns: neighbourhood_id, neighbourhood_name,
|
||||
total_crime_rate, violent_crime_rate, property_crime_rate, etc.
|
||||
total_crime_rate, violent_crimes, property_crimes, etc.
|
||||
"""
|
||||
sql = """
|
||||
SELECT
|
||||
neighbourhood_id,
|
||||
neighbourhood_name,
|
||||
year,
|
||||
total_crimes,
|
||||
total_incidents as total_crimes,
|
||||
crime_rate_per_100k as total_crime_rate,
|
||||
violent_crimes,
|
||||
violent_crime_rate,
|
||||
property_crimes,
|
||||
property_crime_rate,
|
||||
theft_crimes,
|
||||
theft_rate,
|
||||
crime_yoy_change_pct,
|
||||
crime_trend
|
||||
FROM mart_neighbourhood_safety
|
||||
assault_count + robbery_count + homicide_count as violent_crimes,
|
||||
break_enter_count + auto_theft_count as property_crimes,
|
||||
theft_over_count as theft_crimes,
|
||||
crime_yoy_change_pct
|
||||
FROM public_marts.mart_neighbourhood_safety
|
||||
WHERE year = :year
|
||||
ORDER BY total_crime_rate ASC NULLS LAST
|
||||
ORDER BY crime_rate_per_100k ASC NULLS LAST
|
||||
"""
|
||||
return _execute_query(sql, {"year": year})
|
||||
|
||||
@@ -152,22 +153,22 @@ def get_demographics_data(year: int = 2021) -> pd.DataFrame:
|
||||
SELECT
|
||||
neighbourhood_id,
|
||||
neighbourhood_name,
|
||||
census_year as year,
|
||||
year,
|
||||
population,
|
||||
population_density,
|
||||
population_change_pct,
|
||||
median_household_income,
|
||||
average_household_income,
|
||||
income_quintile,
|
||||
income_index,
|
||||
median_age,
|
||||
pct_under_18,
|
||||
pct_18_to_64,
|
||||
pct_65_plus,
|
||||
pct_bachelors_or_higher,
|
||||
age_index,
|
||||
pct_owner_occupied,
|
||||
pct_renter_occupied,
|
||||
education_bachelors_pct as pct_bachelors_or_higher,
|
||||
unemployment_rate,
|
||||
diversity_index
|
||||
FROM mart_neighbourhood_demographics
|
||||
WHERE census_year = :year
|
||||
tenure_diversity_index as diversity_index
|
||||
FROM public_marts.mart_neighbourhood_demographics
|
||||
WHERE year = :year
|
||||
ORDER BY population DESC NULLS LAST
|
||||
"""
|
||||
return _execute_query(sql, {"year": year})
|
||||
@@ -183,26 +184,26 @@ def get_amenities_data(year: int = 2021) -> pd.DataFrame:
|
||||
|
||||
Returns:
|
||||
DataFrame with columns: neighbourhood_id, neighbourhood_name,
|
||||
amenity_score, parks_per_capita, schools_per_capita, transit_score, etc.
|
||||
amenity_score, parks_per_1000, schools_per_1000, etc.
|
||||
"""
|
||||
sql = """
|
||||
SELECT
|
||||
neighbourhood_id,
|
||||
neighbourhood_name,
|
||||
year,
|
||||
park_count,
|
||||
parks_count as park_count,
|
||||
parks_per_1000,
|
||||
school_count,
|
||||
schools_count as school_count,
|
||||
schools_per_1000,
|
||||
childcare_count,
|
||||
childcare_per_1000,
|
||||
transit_count as childcare_count,
|
||||
transit_per_1000 as childcare_per_1000,
|
||||
total_amenities,
|
||||
total_amenities_per_1000,
|
||||
amenity_score,
|
||||
amenity_rank
|
||||
FROM mart_neighbourhood_amenities
|
||||
amenity_index as amenity_score,
|
||||
amenity_tier as amenity_rank
|
||||
FROM public_marts.mart_neighbourhood_amenities
|
||||
WHERE year = :year
|
||||
ORDER BY amenity_score DESC NULLS LAST
|
||||
ORDER BY amenity_index DESC NULLS LAST
|
||||
"""
|
||||
return _execute_query(sql, {"year": year})
|
||||
|
||||
@@ -249,17 +250,17 @@ def get_neighbourhood_details(
|
||||
a.park_count,
|
||||
a.school_count,
|
||||
a.total_amenities
|
||||
FROM mart_neighbourhood_overview o
|
||||
LEFT JOIN mart_neighbourhood_safety s
|
||||
FROM public_marts.mart_neighbourhood_overview o
|
||||
LEFT JOIN public_marts.mart_neighbourhood_safety s
|
||||
ON o.neighbourhood_id = s.neighbourhood_id
|
||||
AND o.year = s.year
|
||||
LEFT JOIN mart_neighbourhood_housing h
|
||||
LEFT JOIN public_marts.mart_neighbourhood_housing h
|
||||
ON o.neighbourhood_id = h.neighbourhood_id
|
||||
AND o.year = h.year
|
||||
LEFT JOIN mart_neighbourhood_demographics d
|
||||
LEFT JOIN public_marts.mart_neighbourhood_demographics d
|
||||
ON o.neighbourhood_id = d.neighbourhood_id
|
||||
AND o.year = d.census_year
|
||||
LEFT JOIN mart_neighbourhood_amenities a
|
||||
LEFT JOIN public_marts.mart_neighbourhood_amenities a
|
||||
ON o.neighbourhood_id = a.neighbourhood_id
|
||||
AND o.year = a.year
|
||||
WHERE o.neighbourhood_id = :neighbourhood_id
|
||||
@@ -288,7 +289,7 @@ def get_neighbourhood_list(year: int = 2021) -> list[dict[str, Any]]:
|
||||
neighbourhood_id,
|
||||
neighbourhood_name,
|
||||
population
|
||||
FROM mart_neighbourhood_overview
|
||||
FROM public_marts.mart_neighbourhood_overview
|
||||
WHERE year = :year
|
||||
ORDER BY neighbourhood_name
|
||||
"""
|
||||
@@ -317,19 +318,19 @@ def get_rankings(
|
||||
"""
|
||||
# Map metrics to their source tables
|
||||
table_map = {
|
||||
"livability_score": "mart_neighbourhood_overview",
|
||||
"safety_score": "mart_neighbourhood_overview",
|
||||
"affordability_score": "mart_neighbourhood_overview",
|
||||
"amenity_score": "mart_neighbourhood_overview",
|
||||
"crime_rate_per_100k": "mart_neighbourhood_safety",
|
||||
"total_crime_rate": "mart_neighbourhood_safety",
|
||||
"avg_rent_2bed": "mart_neighbourhood_housing",
|
||||
"affordability_index": "mart_neighbourhood_housing",
|
||||
"population": "mart_neighbourhood_demographics",
|
||||
"median_household_income": "mart_neighbourhood_demographics",
|
||||
"livability_score": "public_marts.mart_neighbourhood_overview",
|
||||
"safety_score": "public_marts.mart_neighbourhood_overview",
|
||||
"affordability_score": "public_marts.mart_neighbourhood_overview",
|
||||
"amenity_score": "public_marts.mart_neighbourhood_overview",
|
||||
"crime_rate_per_100k": "public_marts.mart_neighbourhood_safety",
|
||||
"total_crime_rate": "public_marts.mart_neighbourhood_safety",
|
||||
"avg_rent_2bed": "public_marts.mart_neighbourhood_housing",
|
||||
"affordability_index": "public_marts.mart_neighbourhood_housing",
|
||||
"population": "public_marts.mart_neighbourhood_demographics",
|
||||
"median_household_income": "public_marts.mart_neighbourhood_demographics",
|
||||
}
|
||||
|
||||
table = table_map.get(metric, "mart_neighbourhood_overview")
|
||||
table = table_map.get(metric, "public_marts.mart_neighbourhood_overview")
|
||||
year_col = "census_year" if "demographics" in table else "year"
|
||||
|
||||
order = "ASC" if ascending else "DESC"
|
||||
@@ -375,7 +376,7 @@ def get_city_averages(year: int = 2021) -> dict[str, Any]:
|
||||
AVG(crime_rate_per_100k) as avg_crime_rate,
|
||||
AVG(avg_rent_2bed) as avg_rent_2bed,
|
||||
AVG(rent_to_income_pct) as avg_rent_to_income
|
||||
FROM mart_neighbourhood_overview
|
||||
FROM public_marts.mart_neighbourhood_overview
|
||||
WHERE year = :year
|
||||
"""
|
||||
df = _execute_query(sql, {"year": year})
|
||||
|
||||
Reference in New Issue
Block a user