fix: Repair data pipeline with StatCan CMHC rental data

- Add StatCan CMHC parser to fetch rental data from Statistics Canada API
- Create year spine (2014-2025) as time dimension driver instead of census
- Add CMA-level rental and income intermediate models
- Update mart_neighbourhood_overview to use rental years as base
- Fix neighbourhood_service queries to match dbt schema
- Add CMHC data loading to pipeline script

Data now flows correctly: 158 neighbourhoods × 12 years = 1,896 records
Rent data available 2019-2025, crime data 2014-2024

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-17 15:38:31 -05:00
parent 4818c53fd2
commit d0f32edba7
21 changed files with 955 additions and 156 deletions

View File

@@ -38,12 +38,16 @@ from portfolio_app.toronto.loaders import ( # noqa: E402
load_census_data,
load_crime_data,
load_neighbourhoods,
load_statcan_cmhc_data,
load_time_dimension,
)
from portfolio_app.toronto.parsers import ( # noqa: E402
TorontoOpenDataParser,
TorontoPoliceParser,
)
from portfolio_app.toronto.parsers.statcan_cmhc import ( # noqa: E402
fetch_toronto_rental_data,
)
from portfolio_app.toronto.schemas import Neighbourhood # noqa: E402
# Configure logging
@@ -91,6 +95,9 @@ class DataPipeline:
# 5. Load amenities
self._load_amenities(session)
# 6. Load CMHC rental data from StatCan
self._load_rentals(session)
session.commit()
logger.info("All data committed to database")
@@ -241,6 +248,32 @@ class DataPipeline:
self.stats["amenities"] = total_count
def _load_rentals(self, session: Any) -> None:
    """Pull CMHC rental records from StatCan and persist them.

    In dry-run mode nothing is fetched or written; the method only logs
    what it would have done. Otherwise records are requested with
    ``start_year=2014`` and handed to ``load_statcan_cmhc_data`` together
    with ``session``; the value that loader returns is stored under
    ``self.stats["rentals"]``.

    This step is best-effort: any exception raised while fetching or
    loading is logged as a warning and not re-raised. When
    ``self.verbose`` is set, the traceback is also printed.

    Args:
        session: Database session passed through to the loader.
    """
    logger.info("Fetching CMHC rental data from Statistics Canada...")

    if self.dry_run:
        logger.info(" [DRY RUN] Would fetch and load CMHC rental data")
        return

    try:
        # Request everything from 2014 onward.
        records = fetch_toronto_rental_data(start_year=2014)

        if records:
            count = load_statcan_cmhc_data(records, session)
            self.stats["rentals"] = count
            logger.info(f" Loaded {count} CMHC rental records")
        else:
            # An empty result is reported and leaves the stats untouched.
            logger.warning(" No rental records fetched")
    except Exception as e:
        # Swallow the failure here; the error is only reported.
        logger.warning(f" Failed to load CMHC rental data: {e}")
        if self.verbose:
            import traceback

            traceback.print_exc()
def run_dbt(self) -> bool:
"""Run dbt to transform data.