feat: add loaders and dbt models for Toronto housing data

Sprint 4 implementation:

Loaders:
- base.py: Session management, bulk insert, upsert utilities
- dimensions.py: Load time, district, zone, neighbourhood, policy dimensions
- trreb.py: Load TRREB purchase data to fact_purchases
- cmhc.py: Load CMHC rental data to fact_rentals

dbt Project:
- Project configuration (dbt_project.yml, packages.yml)
- Staging models for all fact and dimension tables
- Intermediate models with dimension enrichment
- Marts: purchase analysis, rental analysis, market summary

Closes #16

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-11 16:07:30 -05:00
parent 88e23674a8
commit 457bb49395
22 changed files with 1320 additions and 0 deletions

View File

@@ -0,0 +1,24 @@
version: 2
models:
- name: int_purchases__monthly
description: "Purchase data enriched with time and district dimensions"
columns:
- name: purchase_id
tests:
- unique
- not_null
- name: district_code
tests:
- not_null
- name: int_rentals__annual
description: "Rental data enriched with time and zone dimensions"
columns:
- name: rental_id
tests:
- unique
- not_null
- name: zone_code
tests:
- not_null

View File

@@ -0,0 +1,62 @@
-- Intermediate: Monthly purchase data enriched with dimensions
-- Joins purchases with time and district dimensions for analysis
with purchases as (
select * from {{ ref('stg_trreb__purchases') }}
),
time_dim as (
select * from {{ ref('stg_dimensions__time') }}
),
district_dim as (
select * from {{ ref('stg_dimensions__trreb_districts') }}
),
enriched as (
select
p.purchase_id,
-- Time attributes
t.date_key,
t.full_date,
t.year,
t.month,
t.quarter,
t.month_name,
-- District attributes
d.district_key,
d.district_code,
d.district_name,
d.area_type,
-- Metrics
p.sales_count,
p.dollar_volume,
p.avg_price,
p.median_price,
p.new_listings,
p.active_listings,
p.days_on_market,
p.sale_to_list_ratio,
-- Calculated metrics
case
when p.active_listings > 0
then round(p.sales_count::numeric / p.active_listings, 3)
else null
end as absorption_rate,
case
when p.sales_count > 0
then round(p.active_listings::numeric / p.sales_count, 1)
else null
end as months_of_inventory
from purchases p
inner join time_dim t on p.date_key = t.date_key
inner join district_dim d on p.district_key = d.district_key
)
select * from enriched

View File

@@ -0,0 +1,57 @@
-- Intermediate: Annual rental data enriched with dimensions
-- Joins rentals with time and zone dimensions for analysis
with rentals as (
select * from {{ ref('stg_cmhc__rentals') }}
),
time_dim as (
select * from {{ ref('stg_dimensions__time') }}
),
zone_dim as (
select * from {{ ref('stg_dimensions__cmhc_zones') }}
),
enriched as (
select
r.rental_id,
-- Time attributes
t.date_key,
t.full_date,
t.year,
t.month,
t.quarter,
-- Zone attributes
z.zone_key,
z.zone_code,
z.zone_name,
-- Bedroom type
r.bedroom_type,
-- Metrics
r.rental_universe,
r.avg_rent,
r.median_rent,
r.vacancy_rate,
r.availability_rate,
r.turnover_rate,
r.year_over_year_rent_change,
r.reliability_code,
-- Calculated metrics
case
when r.rental_universe > 0 and r.vacancy_rate is not null
then round(r.rental_universe * (r.vacancy_rate / 100), 0)
else null
end as vacant_units_estimate
from rentals r
inner join time_dim t on r.date_key = t.date_key
inner join zone_dim z on r.zone_key = z.zone_key
)
select * from enriched

View File

@@ -0,0 +1,23 @@
version: 2
models:
- name: mart_toronto_purchases
description: "Final mart for Toronto purchase/sales analysis by district and time"
columns:
- name: purchase_id
description: "Unique purchase record identifier"
tests:
- unique
- not_null
- name: mart_toronto_rentals
description: "Final mart for Toronto rental market analysis by zone and time"
columns:
- name: rental_id
description: "Unique rental record identifier"
tests:
- unique
- not_null
- name: mart_toronto_market_summary
description: "Combined market summary aggregating purchases and rentals at Toronto level"

View File

@@ -0,0 +1,81 @@
-- Mart: Toronto Market Summary
-- Aggregated view combining purchase and rental market indicators
-- Grain: One row per year-month
with purchases_agg as (
select
year,
month,
month_name,
quarter,
-- Aggregate purchase metrics across all districts
sum(sales_count) as total_sales,
sum(dollar_volume) as total_dollar_volume,
round(avg(avg_price), 0) as avg_price_all_districts,
round(avg(median_price), 0) as median_price_all_districts,
sum(new_listings) as total_new_listings,
sum(active_listings) as total_active_listings,
round(avg(days_on_market), 0) as avg_days_on_market,
round(avg(sale_to_list_ratio), 2) as avg_sale_to_list_ratio,
round(avg(absorption_rate), 3) as avg_absorption_rate,
round(avg(months_of_inventory), 1) as avg_months_of_inventory,
round(avg(avg_price_yoy_pct), 2) as avg_price_yoy_pct
from {{ ref('mart_toronto_purchases') }}
group by year, month, month_name, quarter
),
rentals_agg as (
select
year,
-- Aggregate rental metrics across all zones (all bedroom types)
round(avg(avg_rent), 0) as avg_rent_all_zones,
round(avg(vacancy_rate), 2) as avg_vacancy_rate,
round(avg(rent_change_pct), 2) as avg_rent_change_pct,
sum(rental_universe) as total_rental_universe
from {{ ref('mart_toronto_rentals') }}
group by year
),
final as (
select
p.year,
p.month,
p.month_name,
p.quarter,
-- Purchase market indicators
p.total_sales,
p.total_dollar_volume,
p.avg_price_all_districts,
p.median_price_all_districts,
p.total_new_listings,
p.total_active_listings,
p.avg_days_on_market,
p.avg_sale_to_list_ratio,
p.avg_absorption_rate,
p.avg_months_of_inventory,
p.avg_price_yoy_pct,
-- Rental market indicators (annual, so join on year)
r.avg_rent_all_zones,
r.avg_vacancy_rate,
r.avg_rent_change_pct,
r.total_rental_universe,
-- Affordability indicator (price to rent ratio)
case
when r.avg_rent_all_zones > 0
then round(p.avg_price_all_districts / (r.avg_rent_all_zones * 12), 1)
else null
end as price_to_annual_rent_ratio
from purchases_agg p
left join rentals_agg r on p.year = r.year
)
select * from final
order by year desc, month desc

View File

@@ -0,0 +1,79 @@
-- Mart: Toronto Purchase Market Analysis
-- Final analytical table for purchase/sales data visualization
-- Grain: One row per district per month
with purchases as (
select * from {{ ref('int_purchases__monthly') }}
),
-- Add year-over-year calculations
with_yoy as (
select
p.*,
-- Previous year same month values
lag(p.avg_price, 12) over (
partition by p.district_code
order by p.date_key
) as avg_price_prev_year,
lag(p.sales_count, 12) over (
partition by p.district_code
order by p.date_key
) as sales_count_prev_year,
lag(p.median_price, 12) over (
partition by p.district_code
order by p.date_key
) as median_price_prev_year
from purchases p
),
final as (
select
purchase_id,
date_key,
full_date,
year,
month,
quarter,
month_name,
district_key,
district_code,
district_name,
area_type,
sales_count,
dollar_volume,
avg_price,
median_price,
new_listings,
active_listings,
days_on_market,
sale_to_list_ratio,
absorption_rate,
months_of_inventory,
-- Year-over-year changes
case
when avg_price_prev_year > 0
then round(((avg_price - avg_price_prev_year) / avg_price_prev_year) * 100, 2)
else null
end as avg_price_yoy_pct,
case
when sales_count_prev_year > 0
then round(((sales_count - sales_count_prev_year)::numeric / sales_count_prev_year) * 100, 2)
else null
end as sales_count_yoy_pct,
case
when median_price_prev_year > 0
then round(((median_price - median_price_prev_year) / median_price_prev_year) * 100, 2)
else null
end as median_price_yoy_pct
from with_yoy
)
select * from final

View File

@@ -0,0 +1,64 @@
-- Mart: Toronto Rental Market Analysis
-- Final analytical table for rental market visualization
-- Grain: One row per zone per bedroom type per survey year
with rentals as (
select * from {{ ref('int_rentals__annual') }}
),
-- Add year-over-year calculations
with_yoy as (
select
r.*,
-- Previous year values
lag(r.avg_rent, 1) over (
partition by r.zone_code, r.bedroom_type
order by r.year
) as avg_rent_prev_year,
lag(r.vacancy_rate, 1) over (
partition by r.zone_code, r.bedroom_type
order by r.year
) as vacancy_rate_prev_year
from rentals r
),
final as (
select
rental_id,
date_key,
full_date,
year,
quarter,
zone_key,
zone_code,
zone_name,
bedroom_type,
rental_universe,
avg_rent,
median_rent,
vacancy_rate,
availability_rate,
turnover_rate,
year_over_year_rent_change,
reliability_code,
vacant_units_estimate,
-- Calculated year-over-year (if not provided)
coalesce(
year_over_year_rent_change,
case
when avg_rent_prev_year > 0
then round(((avg_rent - avg_rent_prev_year) / avg_rent_prev_year) * 100, 2)
else null
end
) as rent_change_pct,
vacancy_rate - vacancy_rate_prev_year as vacancy_rate_change
from with_yoy
)
select * from final

View File

@@ -0,0 +1,61 @@
version: 2
sources:
- name: toronto_housing
description: "Toronto housing data loaded from TRREB and CMHC sources"
database: portfolio
schema: public
tables:
- name: fact_purchases
description: "TRREB monthly purchase/sales statistics by district"
columns:
- name: id
description: "Primary key"
- name: date_key
description: "Foreign key to dim_time"
- name: district_key
description: "Foreign key to dim_trreb_district"
- name: fact_rentals
description: "CMHC annual rental survey data by zone and bedroom type"
columns:
- name: id
description: "Primary key"
- name: date_key
description: "Foreign key to dim_time"
- name: zone_key
description: "Foreign key to dim_cmhc_zone"
- name: dim_time
description: "Time dimension (monthly grain)"
columns:
- name: date_key
description: "Primary key (YYYYMMDD format)"
- name: dim_trreb_district
description: "TRREB district dimension with geometry"
columns:
- name: district_key
description: "Primary key"
- name: district_code
description: "TRREB district code"
- name: dim_cmhc_zone
description: "CMHC zone dimension with geometry"
columns:
- name: zone_key
description: "Primary key"
- name: zone_code
description: "CMHC zone code"
- name: dim_neighbourhood
description: "City of Toronto neighbourhoods (reference only)"
columns:
- name: neighbourhood_id
description: "Primary key"
- name: dim_policy_event
description: "Housing policy events for annotation"
columns:
- name: event_id
description: "Primary key"

View File

@@ -0,0 +1,73 @@
version: 2
models:
- name: stg_trreb__purchases
description: "Staged TRREB purchase/sales data from fact_purchases"
columns:
- name: purchase_id
description: "Unique identifier for purchase record"
tests:
- unique
- not_null
- name: date_key
description: "Date dimension key (YYYYMMDD)"
tests:
- not_null
- name: district_key
description: "TRREB district dimension key"
tests:
- not_null
- name: stg_cmhc__rentals
description: "Staged CMHC rental market data from fact_rentals"
columns:
- name: rental_id
description: "Unique identifier for rental record"
tests:
- unique
- not_null
- name: date_key
description: "Date dimension key (YYYYMMDD)"
tests:
- not_null
- name: zone_key
description: "CMHC zone dimension key"
tests:
- not_null
- name: stg_dimensions__time
description: "Staged time dimension"
columns:
- name: date_key
description: "Date dimension key (YYYYMMDD)"
tests:
- unique
- not_null
- name: stg_dimensions__trreb_districts
description: "Staged TRREB district dimension"
columns:
- name: district_key
description: "District dimension key"
tests:
- unique
- not_null
- name: district_code
description: "TRREB district code (e.g., W01, C01)"
tests:
- unique
- not_null
- name: stg_dimensions__cmhc_zones
description: "Staged CMHC zone dimension"
columns:
- name: zone_key
description: "Zone dimension key"
tests:
- unique
- not_null
- name: zone_code
description: "CMHC zone code"
tests:
- unique
- not_null

View File

@@ -0,0 +1,26 @@
-- Staged CMHC rental market survey data
-- Source: fact_rentals table loaded from CMHC CSV exports
-- Grain: One row per zone per bedroom type per survey year
with source as (
select * from {{ source('toronto_housing', 'fact_rentals') }}
),
staged as (
select
id as rental_id,
date_key,
zone_key,
bedroom_type,
universe as rental_universe,
avg_rent,
median_rent,
vacancy_rate,
availability_rate,
turnover_rate,
rent_change_pct as year_over_year_rent_change,
reliability_code
from source
)
select * from staged

View File

@@ -0,0 +1,18 @@
-- Staged CMHC zone dimension
-- Source: dim_cmhc_zone table
-- Grain: One row per zone
with source as (
select * from {{ source('toronto_housing', 'dim_cmhc_zone') }}
),
staged as (
select
zone_key,
zone_code,
zone_name,
geometry
from source
)
select * from staged

View File

@@ -0,0 +1,21 @@
-- Staged time dimension
-- Source: dim_time table
-- Grain: One row per month
with source as (
select * from {{ source('toronto_housing', 'dim_time') }}
),
staged as (
select
date_key,
full_date,
year,
month,
quarter,
month_name,
is_month_start
from source
)
select * from staged

View File

@@ -0,0 +1,19 @@
-- Staged TRREB district dimension
-- Source: dim_trreb_district table
-- Grain: One row per district
with source as (
select * from {{ source('toronto_housing', 'dim_trreb_district') }}
),
staged as (
select
district_key,
district_code,
district_name,
area_type,
geometry
from source
)
select * from staged

View File

@@ -0,0 +1,25 @@
-- Staged TRREB purchase/sales data
-- Source: fact_purchases table loaded from TRREB Market Watch PDFs
-- Grain: One row per district per month
with source as (
select * from {{ source('toronto_housing', 'fact_purchases') }}
),
staged as (
select
id as purchase_id,
date_key,
district_key,
sales_count,
dollar_volume,
avg_price,
median_price,
new_listings,
active_listings,
avg_dom as days_on_market,
avg_sp_lp as sale_to_list_ratio
from source
)
select * from staged