diff --git a/CLAUDE.md b/CLAUDE.md index 49be4ae..1a8f644 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,9 +6,8 @@ Working context for Claude Code on the Analytics Portfolio project. ## Project Status -**Current Sprint**: 8 (Portfolio Website Expansion - Complete) -**Next Sprint**: 9 (Neighbourhood Dashboard Transition) -**Phase**: Transitioning to Toronto Neighbourhood Dashboard +**Current Sprint**: 9 (Neighbourhood Dashboard Transition) +**Phase**: Toronto Neighbourhood Dashboard **Branch**: `development` (feature branches merge here) --- @@ -189,27 +188,20 @@ portfolio_app/ ### Geographic Reality (Toronto Housing) ``` -TRREB Districts (~35) - Purchase data (W01, C01, E01...) +City Neighbourhoods (158) - Primary geographic unit for analysis CMHC Zones (~20) - Rental data (Census Tract aligned) -City Neighbourhoods (158) - Enrichment/overlay only ``` -**Critical**: These geographies do NOT align. Display as separate layers—do not force crosswalks. - ### Star Schema | Table | Type | Keys | |-------|------|------| -| `fact_purchases` | Fact | -> dim_time, dim_trreb_district | | `fact_rentals` | Fact | -> dim_time, dim_cmhc_zone | | `dim_time` | Dimension | date_key (PK) | -| `dim_trreb_district` | Dimension | district_key (PK), geometry | | `dim_cmhc_zone` | Dimension | zone_key (PK), geometry | | `dim_neighbourhood` | Dimension | neighbourhood_id (PK), geometry | | `dim_policy_event` | Dimension | event_id (PK) | -**V1 Rule**: `dim_neighbourhood` has NO FK to fact tables—reference overlay only. 
- ### dbt Layers | Layer | Naming | Purpose | @@ -220,37 +212,15 @@ City Neighbourhoods (158) - Enrichment/overlay only --- -## DO NOT BUILD (Phase 1) +## Deferred Features **Stop and flag if a task seems to require these**: | Feature | Reason | |---------|--------| -| `bridge_district_neighbourhood` table | Area-weighted aggregation is Phase 4 | -| Crime data integration | Deferred to Phase 4 | | Historical boundary reconciliation (140->158) | 2021+ data only for V1 | -| ML prediction models | Energy project scope (Phase 3) | -| Multi-project shared infrastructure | Build first, abstract second (Phase 2) | - ---- - -## Sprint 1 Deliverables - -| Category | Tasks | -|----------|-------| -| **Bootstrap** | Git init, pyproject.toml, .env.example, Makefile, CLAUDE.md | -| **Infrastructure** | Docker Compose (PostgreSQL + PostGIS), scripts/ directory | -| **App Foundation** | portfolio_app/ structure, config.py, error handling | -| **Tests** | tests/ directory, conftest.py, pytest config | -| **Data Acquisition** | Download TRREB PDFs, START boundary digitization (HUMAN task) | - -### Human Tasks (Cannot Automate) - -| Task | Tool | Effort | -|------|------|--------| -| Digitize TRREB district boundaries | QGIS | 3-4 hours | -| Research policy events (10-20) | Manual | 2-3 hours | -| Replace social link placeholders | Manual | 5 minutes | +| ML prediction models | Energy project scope (future phase) | +| Multi-project shared infrastructure | Build first, abstract second | --- @@ -285,21 +255,10 @@ All scripts in `scripts/`: | Document | Location | Use When | |----------|----------|----------| -| Full specification | `docs/PROJECT_REFERENCE.md` | Architecture decisions | -| Data schemas (legacy) | `docs/toronto_housing_dashboard_spec_v5.md` | Reference only - being replaced | -| WBS details (legacy) | `docs/wbs_sprint_plan_v4.md` | Reference only - being replaced | -| **Neighbourhood Dashboard Vision** | `docs/changes/Change-Toronto-Analysis.md` | New dashboard 
specification | -| **Implementation Plan** | `docs/changes/Change-Toronto-Analysis-Reviewed.md` | Sprint planning, cleanup tasks | +| Project reference | `docs/PROJECT_REFERENCE.md` | Architecture decisions | +| Dashboard vision | `docs/changes/Change-Toronto-Analysis.md` | Dashboard specification | +| Implementation plan | `docs/changes/Change-Toronto-Analysis-Reviewed.md` | Sprint planning | --- -## Pending Transition - -**Note**: This project is transitioning from a TRREB district-based housing dashboard to a comprehensive Toronto Neighbourhood Dashboard (158 neighbourhoods). See the Implementation Plan for details on: -- Files being deprecated (TRREB parsers, schemas, loaders) -- New data sources (Toronto Open Data, Toronto Police, CMHC APIs) -- New dashboard tabs (Overview, Housing, Safety, Demographics, Amenities) - ---- - -*Last Updated: Sprint 8* +*Last Updated: Sprint 9* diff --git a/dbt/models/intermediate/_intermediate.yml b/dbt/models/intermediate/_intermediate.yml index ab62db8..9fd7fd9 100644 --- a/dbt/models/intermediate/_intermediate.yml +++ b/dbt/models/intermediate/_intermediate.yml @@ -1,17 +1,6 @@ version: 2 models: - - name: int_purchases__monthly - description: "Purchase data enriched with time and district dimensions" - columns: - - name: purchase_id - tests: - - unique - - not_null - - name: district_code - tests: - - not_null - - name: int_rentals__annual description: "Rental data enriched with time and zone dimensions" columns: diff --git a/dbt/models/intermediate/int_purchases__monthly.sql b/dbt/models/intermediate/int_purchases__monthly.sql deleted file mode 100644 index b03f5d8..0000000 --- a/dbt/models/intermediate/int_purchases__monthly.sql +++ /dev/null @@ -1,62 +0,0 @@ --- Intermediate: Monthly purchase data enriched with dimensions --- Joins purchases with time and district dimensions for analysis - -with purchases as ( - select * from {{ ref('stg_trreb__purchases') }} -), - -time_dim as ( - select * from {{ 
ref('stg_dimensions__time') }} -), - -district_dim as ( - select * from {{ ref('stg_dimensions__trreb_districts') }} -), - -enriched as ( - select - p.purchase_id, - - -- Time attributes - t.date_key, - t.full_date, - t.year, - t.month, - t.quarter, - t.month_name, - - -- District attributes - d.district_key, - d.district_code, - d.district_name, - d.area_type, - - -- Metrics - p.sales_count, - p.dollar_volume, - p.avg_price, - p.median_price, - p.new_listings, - p.active_listings, - p.days_on_market, - p.sale_to_list_ratio, - - -- Calculated metrics - case - when p.active_listings > 0 - then round(p.sales_count::numeric / p.active_listings, 3) - else null - end as absorption_rate, - - case - when p.sales_count > 0 - then round(p.active_listings::numeric / p.sales_count, 1) - else null - end as months_of_inventory - - from purchases p - inner join time_dim t on p.date_key = t.date_key - inner join district_dim d on p.district_key = d.district_key -) - -select * from enriched diff --git a/dbt/models/marts/_marts.yml b/dbt/models/marts/_marts.yml index b6419a2..1f89efe 100644 --- a/dbt/models/marts/_marts.yml +++ b/dbt/models/marts/_marts.yml @@ -1,15 +1,6 @@ version: 2 models: - - name: mart_toronto_purchases - description: "Final mart for Toronto purchase/sales analysis by district and time" - columns: - - name: purchase_id - description: "Unique purchase record identifier" - tests: - - unique - - not_null - - name: mart_toronto_rentals description: "Final mart for Toronto rental market analysis by zone and time" columns: @@ -18,6 +9,3 @@ models: tests: - unique - not_null - - - name: mart_toronto_market_summary - description: "Combined market summary aggregating purchases and rentals at Toronto level" diff --git a/dbt/models/marts/mart_toronto_market_summary.sql b/dbt/models/marts/mart_toronto_market_summary.sql deleted file mode 100644 index cec3c77..0000000 --- a/dbt/models/marts/mart_toronto_market_summary.sql +++ /dev/null @@ -1,81 +0,0 @@ --- Mart: Toronto 
Market Summary --- Aggregated view combining purchase and rental market indicators --- Grain: One row per year-month - -with purchases_agg as ( - select - year, - month, - month_name, - quarter, - - -- Aggregate purchase metrics across all districts - sum(sales_count) as total_sales, - sum(dollar_volume) as total_dollar_volume, - round(avg(avg_price), 0) as avg_price_all_districts, - round(avg(median_price), 0) as median_price_all_districts, - sum(new_listings) as total_new_listings, - sum(active_listings) as total_active_listings, - round(avg(days_on_market), 0) as avg_days_on_market, - round(avg(sale_to_list_ratio), 2) as avg_sale_to_list_ratio, - round(avg(absorption_rate), 3) as avg_absorption_rate, - round(avg(months_of_inventory), 1) as avg_months_of_inventory, - round(avg(avg_price_yoy_pct), 2) as avg_price_yoy_pct - - from {{ ref('mart_toronto_purchases') }} - group by year, month, month_name, quarter -), - -rentals_agg as ( - select - year, - - -- Aggregate rental metrics across all zones (all bedroom types) - round(avg(avg_rent), 0) as avg_rent_all_zones, - round(avg(vacancy_rate), 2) as avg_vacancy_rate, - round(avg(rent_change_pct), 2) as avg_rent_change_pct, - sum(rental_universe) as total_rental_universe - - from {{ ref('mart_toronto_rentals') }} - group by year -), - -final as ( - select - p.year, - p.month, - p.month_name, - p.quarter, - - -- Purchase market indicators - p.total_sales, - p.total_dollar_volume, - p.avg_price_all_districts, - p.median_price_all_districts, - p.total_new_listings, - p.total_active_listings, - p.avg_days_on_market, - p.avg_sale_to_list_ratio, - p.avg_absorption_rate, - p.avg_months_of_inventory, - p.avg_price_yoy_pct, - - -- Rental market indicators (annual, so join on year) - r.avg_rent_all_zones, - r.avg_vacancy_rate, - r.avg_rent_change_pct, - r.total_rental_universe, - - -- Affordability indicator (price to rent ratio) - case - when r.avg_rent_all_zones > 0 - then round(p.avg_price_all_districts / 
(r.avg_rent_all_zones * 12), 1) - else null - end as price_to_annual_rent_ratio - - from purchases_agg p - left join rentals_agg r on p.year = r.year -) - -select * from final -order by year desc, month desc diff --git a/dbt/models/marts/mart_toronto_purchases.sql b/dbt/models/marts/mart_toronto_purchases.sql deleted file mode 100644 index 80c5766..0000000 --- a/dbt/models/marts/mart_toronto_purchases.sql +++ /dev/null @@ -1,79 +0,0 @@ --- Mart: Toronto Purchase Market Analysis --- Final analytical table for purchase/sales data visualization --- Grain: One row per district per month - -with purchases as ( - select * from {{ ref('int_purchases__monthly') }} -), - --- Add year-over-year calculations -with_yoy as ( - select - p.*, - - -- Previous year same month values - lag(p.avg_price, 12) over ( - partition by p.district_code - order by p.date_key - ) as avg_price_prev_year, - - lag(p.sales_count, 12) over ( - partition by p.district_code - order by p.date_key - ) as sales_count_prev_year, - - lag(p.median_price, 12) over ( - partition by p.district_code - order by p.date_key - ) as median_price_prev_year - - from purchases p -), - -final as ( - select - purchase_id, - date_key, - full_date, - year, - month, - quarter, - month_name, - district_key, - district_code, - district_name, - area_type, - sales_count, - dollar_volume, - avg_price, - median_price, - new_listings, - active_listings, - days_on_market, - sale_to_list_ratio, - absorption_rate, - months_of_inventory, - - -- Year-over-year changes - case - when avg_price_prev_year > 0 - then round(((avg_price - avg_price_prev_year) / avg_price_prev_year) * 100, 2) - else null - end as avg_price_yoy_pct, - - case - when sales_count_prev_year > 0 - then round(((sales_count - sales_count_prev_year)::numeric / sales_count_prev_year) * 100, 2) - else null - end as sales_count_yoy_pct, - - case - when median_price_prev_year > 0 - then round(((median_price - median_price_prev_year) / median_price_prev_year) * 100, 2) - 
else null - end as median_price_yoy_pct - - from with_yoy -) - -select * from final diff --git a/dbt/models/staging/_sources.yml b/dbt/models/staging/_sources.yml index ff92376..1fffd9a 100644 --- a/dbt/models/staging/_sources.yml +++ b/dbt/models/staging/_sources.yml @@ -2,20 +2,10 @@ version: 2 sources: - name: toronto_housing - description: "Toronto housing data loaded from TRREB and CMHC sources" + description: "Toronto housing data loaded from CMHC and City of Toronto sources" database: portfolio schema: public tables: - - name: fact_purchases - description: "TRREB monthly purchase/sales statistics by district" - columns: - - name: id - description: "Primary key" - - name: date_key - description: "Foreign key to dim_time" - - name: district_key - description: "Foreign key to dim_trreb_district" - - name: fact_rentals description: "CMHC annual rental survey data by zone and bedroom type" columns: @@ -32,14 +22,6 @@ sources: - name: date_key description: "Primary key (YYYYMMDD format)" - - name: dim_trreb_district - description: "TRREB district dimension with geometry" - columns: - - name: district_key - description: "Primary key" - - name: district_code - description: "TRREB district code" - - name: dim_cmhc_zone description: "CMHC zone dimension with geometry" columns: @@ -49,7 +31,7 @@ sources: description: "CMHC zone code" - name: dim_neighbourhood - description: "City of Toronto neighbourhoods (reference only)" + description: "City of Toronto neighbourhoods (158 official boundaries)" columns: - name: neighbourhood_id description: "Primary key" diff --git a/dbt/models/staging/_staging.yml b/dbt/models/staging/_staging.yml index a3458a6..1fb83f4 100644 --- a/dbt/models/staging/_staging.yml +++ b/dbt/models/staging/_staging.yml @@ -1,23 +1,6 @@ version: 2 models: - - name: stg_trreb__purchases - description: "Staged TRREB purchase/sales data from fact_purchases" - columns: - - name: purchase_id - description: "Unique identifier for purchase record" - tests: - 
- unique - - not_null - - name: date_key - description: "Date dimension key (YYYYMMDD)" - tests: - - not_null - - name: district_key - description: "TRREB district dimension key" - tests: - - not_null - - name: stg_cmhc__rentals description: "Staged CMHC rental market data from fact_rentals" columns: @@ -44,20 +27,6 @@ models: - unique - not_null - - name: stg_dimensions__trreb_districts - description: "Staged TRREB district dimension" - columns: - - name: district_key - description: "District dimension key" - tests: - - unique - - not_null - - name: district_code - description: "TRREB district code (e.g., W01, C01)" - tests: - - unique - - not_null - - name: stg_dimensions__cmhc_zones description: "Staged CMHC zone dimension" columns: diff --git a/dbt/models/staging/stg_dimensions__trreb_districts.sql b/dbt/models/staging/stg_dimensions__trreb_districts.sql deleted file mode 100644 index c0e5dc6..0000000 --- a/dbt/models/staging/stg_dimensions__trreb_districts.sql +++ /dev/null @@ -1,19 +0,0 @@ --- Staged TRREB district dimension --- Source: dim_trreb_district table --- Grain: One row per district - -with source as ( - select * from {{ source('toronto_housing', 'dim_trreb_district') }} -), - -staged as ( - select - district_key, - district_code, - district_name, - area_type, - geometry - from source -) - -select * from staged diff --git a/dbt/models/staging/stg_trreb__purchases.sql b/dbt/models/staging/stg_trreb__purchases.sql deleted file mode 100644 index 3694d71..0000000 --- a/dbt/models/staging/stg_trreb__purchases.sql +++ /dev/null @@ -1,25 +0,0 @@ --- Staged TRREB purchase/sales data --- Source: fact_purchases table loaded from TRREB Market Watch PDFs --- Grain: One row per district per month - -with source as ( - select * from {{ source('toronto_housing', 'fact_purchases') }} -), - -staged as ( - select - id as purchase_id, - date_key, - district_key, - sales_count, - dollar_volume, - avg_price, - median_price, - new_listings, - active_listings, - avg_dom 
as days_on_market, - avg_sp_lp as sale_to_list_ratio - from source -) - -select * from staged diff --git a/docs/PROJECT_REFERENCE.md b/docs/PROJECT_REFERENCE.md index 50b29f5..43d2735 100644 --- a/docs/PROJECT_REFERENCE.md +++ b/docs/PROJECT_REFERENCE.md @@ -65,8 +65,8 @@ Two-project analytics portfolio demonstrating end-to-end data engineering, visua | Context | Style | Example | |---------|-------|---------| -| Same directory | Single dot | `from .trreb import TRREBParser` | -| Sibling directory | Double dot | `from ..schemas.trreb import TRREBRecord` | +| Same directory | Single dot | `from .neighbourhood import NeighbourhoodParser` | +| Sibling directory | Double dot | `from ..schemas.neighbourhood import CensusRecord` | | External packages | Absolute | `import pandas as pd` | ### Module Separation @@ -75,7 +75,7 @@ Two-project analytics portfolio demonstrating end-to-end data engineering, visua |-----------|----------|---------| | `schemas/` | Pydantic models | Data validation | | `models/` | SQLAlchemy ORM | Database persistence | -| `parsers/` | PDF/CSV extraction | Raw data ingestion | +| `parsers/` | API/CSV extraction | Raw data ingestion | | `loaders/` | Database operations | Data loading | | `figures/` | Chart factories | Plotly figure generation | | `callbacks/` | Dash callbacks | Per-dashboard, in `pages/{dashboard}/callbacks/` | @@ -145,45 +145,36 @@ portfolio_app/ --- -## Phase 1: Toronto Housing Dashboard +## Phase 1: Toronto Neighbourhood Dashboard ### Data Sources | Track | Source | Format | Geography | Frequency | |-------|--------|--------|-----------|-----------| -| Purchases | TRREB Monthly Reports | PDF | ~35 Districts | Monthly | -| Rentals | CMHC Rental Market Survey | CSV | ~20 Zones | Annual | -| Enrichment | City of Toronto Open Data | GeoJSON/CSV | 158 Neighbourhoods | Census | +| Rentals | CMHC Rental Market Survey | API/CSV | ~20 Zones | Annual | +| Neighbourhoods | City of Toronto Open Data | GeoJSON/CSV | 158 Neighbourhoods | 
Census | | Policy Events | Curated list | CSV | N/A | Event-based | ### Geographic Reality ``` ┌─────────────────────────────────────────────────────────────────┐ -│ City of Toronto Neighbourhoods (158) │ ← Enrichment only -├─────────────────────────────────────────────────────────────────┤ -│ TRREB Districts (~35) — W01, C01, E01, etc. │ ← Purchase data +│ City of Toronto Neighbourhoods (158) │ ← Primary analysis unit ├─────────────────────────────────────────────────────────────────┤ │ CMHC Zones (~20) — Census Tract aligned │ ← Rental data └─────────────────────────────────────────────────────────────────┘ ``` -**Critical**: These geographies do NOT align. Display as separate layers with toggle—do not force crosswalks. - ### Data Model (Star Schema) | Table | Type | Keys | |-------|------|------| -| `fact_purchases` | Fact | → dim_time, dim_trreb_district | | `fact_rentals` | Fact | → dim_time, dim_cmhc_zone | | `dim_time` | Dimension | date_key (PK) | -| `dim_trreb_district` | Dimension | district_key (PK), geometry | | `dim_cmhc_zone` | Dimension | zone_key (PK), geometry | | `dim_neighbourhood` | Dimension | neighbourhood_id (PK), geometry | | `dim_policy_event` | Dimension | event_id (PK) | -**V1 Rule**: `dim_neighbourhood` has NO FK to fact tables—reference overlay only. 
- ### dbt Layer Structure | Layer | Naming | Purpose | @@ -198,31 +189,11 @@ portfolio_app/ | Sprint | Focus | Milestone | |--------|-------|-----------| -| 1 | Project bootstrap, start TRREB digitization | — | -| 2 | Bio page, data acquisition | **Launch 1: Bio Live** | -| 3 | Parsers, schemas, models | — | -| 4 | Loaders, dbt | — | -| 5 | Visualization | — | -| 6 | Polish, deploy dashboard | **Launch 2: Dashboard Live** | -| 7 | Buffer | — | - -### Sprint 1 Deliverables - -| Category | Tasks | -|----------|-------| -| **Bootstrap** | Git init, pyproject.toml, .env.example, Makefile, CLAUDE.md | -| **Infrastructure** | Docker Compose (PostgreSQL + PostGIS), scripts/ directory | -| **App Foundation** | portfolio_app/ structure, config.py, error handling | -| **Tests** | tests/ directory, conftest.py, pytest config | -| **Data Acquisition** | Download TRREB PDFs, START boundary digitization (HUMAN task) | - -### Human Tasks (Cannot Automate) - -| Task | Tool | Effort | -|------|------|--------| -| Digitize TRREB district boundaries | QGIS | 3-4 hours | -| Research policy events (10-20) | Manual research | 2-3 hours | -| Replace social link placeholders | Manual | 5 minutes | +| 1-6 | Foundation and initial dashboard | **Launch 1: Bio Live** | +| 7 | Navigation & theme modernization | — | +| 8 | Portfolio website expansion | **Launch 2: Website Live** | +| 9 | Neighbourhood dashboard transition | Cleanup complete | +| 10+ | Dashboard implementation | **Launch 3: Dashboard Live** | --- @@ -230,27 +201,24 @@ portfolio_app/ ### Phase 1 — Build These -- Bio landing page with content from bio_content_v2.md -- TRREB PDF parser -- CMHC CSV processor +- Bio landing page and portfolio website +- CMHC rental data processor +- Toronto neighbourhood data integration - PostgreSQL + PostGIS database layer - Star schema (facts + dimensions) - dbt models with tests - Choropleth visualization (Dash) - Policy event annotation layer -- Neighbourhood overlay (toggle-able) -### Phase 1 — 
Do NOT Build +### Deferred Features | Feature | Reason | When | |---------|--------|------| -| `bridge_district_neighbourhood` table | Area-weighted aggregation is Phase 4 | After Energy project | -| Crime data integration | Deferred scope | Phase 4 | -| Historical boundary reconciliation (140→158) | 2021+ data only for V1 | Phase 4 | +| Historical boundary reconciliation (140→158) | 2021+ data only for V1 | Future phase | | ML prediction models | Energy project scope | Phase 3 | -| Multi-project shared infrastructure | Build first, abstract second | Phase 2 | +| Multi-project shared infrastructure | Build first, abstract second | Future | -If a task seems to require Phase 3/4 features, **stop and flag it**. +If a task seems to require deferred features, **stop and flag it**. --- @@ -362,19 +330,24 @@ LOG_LEVEL=INFO ## Success Criteria -### Launch 1 (Sprint 2) -- [ ] Bio page accessible via HTTPS -- [ ] All bio content rendered (from bio_content_v2.md) -- [ ] No placeholder text visible -- [ ] Mobile responsive -- [ ] Social links functional +### Launch 1 (Bio Live) +- [x] Bio page accessible via HTTPS +- [x] All bio content rendered +- [x] No placeholder text visible +- [x] Mobile responsive +- [x] Social links functional -### Launch 2 (Sprint 6) -- [ ] Choropleth renders TRREB districts and CMHC zones -- [ ] Purchase/rental mode toggle works +### Launch 2 (Website Live) +- [x] Full portfolio website with navigation +- [x] About, Contact, Projects, Resume, Blog pages +- [x] Dark mode theme support +- [x] Sidebar navigation + +### Launch 3 (Dashboard Live) +- [ ] Choropleth renders neighbourhoods and CMHC zones +- [ ] Rental data visualization works - [ ] Time navigation works - [ ] Policy event markers visible -- [ ] Neighbourhood overlay toggleable - [ ] Methodology documentation published - [ ] Data sources cited @@ -386,11 +359,10 @@ For detailed specifications, see: | Document | Location | Use When | |----------|----------|----------| -| Data schemas | 
`docs/toronto_housing_spec.md` | Parser/model tasks | -| WBS details | `docs/wbs.md` | Sprint planning | -| Bio content | `docs/bio_content.md` | Building home.py | +| Dashboard vision | `docs/changes/Change-Toronto-Analysis.md` | Dashboard specification | +| Implementation plan | `docs/changes/Change-Toronto-Analysis-Reviewed.md` | Sprint planning | --- -*Reference Version: 1.0* -*Created: January 2026* +*Reference Version: 2.0* +*Updated: Sprint 9* diff --git a/docs/toronto_housing_dashboard_spec_v5.md b/docs/toronto_housing_dashboard_spec_v5.md deleted file mode 100644 index 56fc90c..0000000 --- a/docs/toronto_housing_dashboard_spec_v5.md +++ /dev/null @@ -1,809 +0,0 @@ -# Toronto Housing Price Dashboard -## Portfolio Project — Data Specification & Architecture - -**Version**: 5.1 -**Last Updated**: January 2026 -**Status**: Specification Complete - ---- - -## Document Context - -| Attribute | Value | -|-----------|-------| -| **Parent Document** | `portfolio_project_plan_v5.md` | -| **Role** | Detailed specification for Toronto Housing Dashboard | -| **Scope** | Data schemas, source URLs, geographic boundaries, V1/V2 decisions | - -**Rule**: For overall project scope, phasing, tech stack, and deployment architecture, see `portfolio_project_plan_v5.md`. This document provides implementation-level detail for the Toronto Housing project specifically. - -**Terminology Note**: This document uses **Stages 1–4** to describe Toronto Housing implementation steps. These are distinct from the **Phases 1–5** in `portfolio_project_plan_v5.md`, which describe the overall portfolio project lifecycle. 
- ---- - -## Project Overview - -A dashboard analyzing housing price variations across Toronto neighbourhoods over time, with dual analysis tracks: - -| Track | Data Domain | Primary Source | Geographic Unit | -|-------|-------------|----------------|-----------------| -| **Purchases** | Sales transactions | TRREB Monthly Reports | ~35 Districts | -| **Rentals** | Rental market stats | CMHC Rental Market Survey | ~20 Zones | - -**Core Visualization**: Interactive choropleth map of Toronto with toggle between rental/purchase analysis, time-series exploration by month/year. - -**Enrichment Layer** (V1: overlay only): Neighbourhood-level demographic and socioeconomic context including population density, education attainment, and income. Crime data deferred to Phase 4 of the portfolio project (post-Energy project). - -**Tech Stack & Deployment**: See `portfolio_project_plan_v5.md` → Tech Stack, Deployment Architecture - ---- - -## Geographic Layers - -### Layer Architecture - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ City of Toronto Official Neighbourhoods (158) │ ← Reference overlay + Enrichment data -├─────────────────────────────────────────────────────────────────┤ -│ TRREB Districts (~35) — W01, C01, E01, etc. 
│ ← Purchase data -├─────────────────────────────────────────────────────────────────┤ -│ CMHC Survey Zones (~20) — Census Tract aligned │ ← Rental data -└─────────────────────────────────────────────────────────────────┘ -``` - -### Boundary Files - -| Layer | Zones | Format | Source | Status | -|-------|-------|--------|--------|--------| -| **City Neighbourhoods** | 158 | GeoJSON, Shapefile | [GitHub - jasonicarter/toronto-geojson](https://github.com/jasonicarter/toronto-geojson) | ✅ Ready to use | -| **TRREB Districts** | ~35 | PDF only | [TRREB Toronto Map PDF](https://webapp.proptx.ca/trrebdata/common/maps/Toronto.pdf) | ⚠ Requires manual digitization | -| **CMHC Zones** | ~20 | R package | R `cmhc` package via `get_cmhc_geography()` | ✅ Available (see note) | - -### Digitization Task: TRREB Districts - -**Input**: TRREB Toronto PDF map -**Output**: GeoJSON with district codes (W01-W10, C01-C15, E01-E11) -**Tool**: QGIS - -**Process**: -1. Import PDF as raster layer in QGIS -2. Create vector layer with polygon features -3. Trace district boundaries -4. Add attributes: `district_code`, `district_name`, `area_type` (West/Central/East) -5. Export as GeoJSON (WGS84 / EPSG:4326) - -### CMHC Zone Boundaries - -**Source**: The R `cmhc` package provides CMHC survey geography via the `get_cmhc_geography()` function. - -**Extraction Process**: -```r -# In R -library(cmhc) -library(sf) - -# Get Toronto CMA zones -toronto_zones <- get_cmhc_geography( -geography_type = "ZONE", -cma = "Toronto" -) - -# Export to GeoJSON for Python/PostGIS -st_write(toronto_zones, "cmhc_zones.geojson", driver = "GeoJSON") -``` - -**Output**: `data/toronto/raw/geo/cmhc_zones.geojson` - -**Why R?**: CMHC zone boundaries are not published as standalone files. The `cmhc` R package is the only reliable programmatic source. One-time extraction, then use GeoJSON in Python stack. 
- -### ⚠ Neighbourhood Boundary Change (140 → 158) - -The City of Toronto updated from 140 to 158 social planning neighbourhoods in **April 2021**. This affects data alignment: - -| Data Source | Pre-2021 | Post-2021 | Handling | -|-------------|----------|-----------|----------| -| Census (2016 and earlier) | 140 neighbourhoods | N/A | Use 140-model files | -| Census (2021+) | N/A | 158 neighbourhoods | Use 158-model files | - -**V1 Strategy**: Use 2021 Census on 158 boundaries only. Defer historical trend analysis to portfolio Phase 4. - ---- - -## Data Source #1: TRREB Monthly Market Reports - -### Source Details - -| Attribute | Value | -|-----------|-------| -| **Provider** | Toronto Regional Real Estate Board | -| **URL** | [TRREB Market Watch](https://trreb.ca/index.php/market-news/market-watch) | -| **Format** | PDF (monthly reports) | -| **Update Frequency** | Monthly | -| **Historical Availability** | 2007–Present | -| **Access** | Public (aggregate data in PDFs) | -| **Extraction Method** | PDF parsing (`pdfplumber` or `camelot-py`) | - -### Available Tables - -#### Table: `trreb_monthly_summary` -**Location in PDF**: Pages 3-4 (Summary by Area) - -| Column | Data Type | Description | -|--------|-----------|-------------| -| `report_date` | DATE | First of month (YYYY-MM-01) | -| `area_code` | VARCHAR(3) | District code (W01, C01, E01, etc.) 
| -| `area_name` | VARCHAR(100) | District name | -| `area_type` | VARCHAR(10) | West / Central / East / North | -| `sales` | INTEGER | Number of transactions | -| `dollar_volume` | DECIMAL | Total sales volume ($) | -| `avg_price` | DECIMAL | Average sale price ($) | -| `median_price` | DECIMAL | Median sale price ($) | -| `new_listings` | INTEGER | New listings count | -| `active_listings` | INTEGER | Active listings at month end | -| `avg_sp_lp` | DECIMAL | Avg sale price / list price ratio (%) | -| `avg_dom` | INTEGER | Average days on market | - -### Dimensions - -| Dimension | Granularity | Values | -|-----------|-------------|--------| -| **Time** | Monthly | 2007-01 to present | -| **Geography** | District | ~35 TRREB districts | -| **Property Type** | Aggregate | All residential (no breakdown in summary) | - -### Metrics Available - -| Metric | Aggregation | Use Case | -|--------|-------------|----------| -| `avg_price` | Pre-calculated monthly avg | Primary price indicator | -| `median_price` | Pre-calculated monthly median | Robust price indicator | -| `sales` | Count | Market activity volume | -| `avg_dom` | Average | Market velocity | -| `avg_sp_lp` | Ratio | Buyer/seller market indicator | -| `new_listings` | Count | Supply indicator | -| `active_listings` | Snapshot | Inventory level | - -### ⚠ Limitations - -- No transaction-level data (aggregates only) -- Property type breakdown requires parsing additional tables -- PDF structure may vary slightly across years -- District boundaries haven't changed since 2011 - ---- - -## Data Source #2: CMHC Rental Market Survey - -### Source Details - -| Attribute | Value | -|-----------|-------| -| **Provider** | Canada Mortgage and Housing Corporation | -| **URL** | [CMHC Housing Market Information Portal](https://www03.cmhc-schl.gc.ca/hmip-pimh/) | -| **Format** | CSV export, API | -| **Update Frequency** | Annual (October survey) | -| **Historical Availability** | 1990–Present | -| **Access** | Public, free 
registration for bulk downloads | -| **Geographic Levels** | CMA → Zone → Neighbourhood → Census Tract | - -### Available Tables - -#### Table: `cmhc_rental_summary` -**Portal Path**: Toronto → Primary Rental Market → Summary Statistics - -| Column | Data Type | Description | -|--------|-----------|-------------| -| `survey_year` | INTEGER | Survey year (October) | -| `zone_code` | VARCHAR(10) | CMHC zone identifier | -| `zone_name` | VARCHAR(100) | Zone name | -| `bedroom_type` | VARCHAR(20) | Bachelor / 1-Bed / 2-Bed / 3-Bed+ / Total | -| `universe` | INTEGER | Total rental units in zone | -| `vacancy_rate` | DECIMAL | Vacancy rate (%) | -| `vacancy_rate_reliability` | VARCHAR(1) | Reliability code (a/b/c/d) | -| `availability_rate` | DECIMAL | Availability rate (%) | -| `average_rent` | DECIMAL | Average monthly rent ($) | -| `average_rent_reliability` | VARCHAR(1) | Reliability code | -| `median_rent` | DECIMAL | Median monthly rent ($) | -| `rent_change_pct` | DECIMAL | YoY rent change (%) | -| `turnover_rate` | DECIMAL | Unit turnover rate (%) | - -### Dimensions - -| Dimension | Granularity | Values | -|-----------|-------------|--------| -| **Time** | Annual | 1990 to present (October snapshot) | -| **Geography** | Zone | ~20 CMHC zones in Toronto CMA | -| **Bedroom Type** | Category | Bachelor, 1-Bed, 2-Bed, 3-Bed+, Total | -| **Structure Type** | Category | Row, Apartment (available in detailed tables) | - -### Metrics Available - -| Metric | Aggregation | Use Case | -|--------|-------------|----------| -| `average_rent` | Pre-calculated avg | Primary rent indicator | -| `median_rent` | Pre-calculated median | Robust rent indicator | -| `vacancy_rate` | Percentage | Market tightness | -| `availability_rate` | Percentage | Supply accessibility | -| `turnover_rate` | Percentage | Tenant mobility | -| `rent_change_pct` | YoY % | Rent growth tracking | -| `universe` | Count | Market size | - -### Reliability Codes - -| Code | Meaning | Coefficient of 
Variation | -|------|---------|-------------------------| -| `a` | Excellent | CV ≤ 2.5% | -| `b` | Good | 2.5% < CV ≤ 5% | -| `c` | Fair | 5% < CV ≤ 10% | -| `d` | Poor (use with caution) | CV > 10% | -| `**` | Data suppressed | Sample too small | - -### ⚠ Limitations - -- Annual only (no monthly granularity) -- October snapshot (point-in-time) -- Zones are larger than TRREB districts -- Purpose-built rental only (excludes condo rentals in base survey) - ---- - -## Data Source #3: City of Toronto Open Data - -### Source Details - -| Attribute | Value | -|-----------|-------| -| **Provider** | City of Toronto | -| **URL** | [Toronto Open Data Portal](https://open.toronto.ca/) | -| **Format** | GeoJSON, Shapefile, CSV | -| **Use Case** | Reference layer, demographic enrichment | - -### Relevant Datasets - -#### Dataset: `neighbourhoods` - -| Column | Data Type | Description | -|--------|-----------|-------------| -| `area_id` | INTEGER | Neighbourhood ID (1-158) | -| `area_name` | VARCHAR(100) | Official neighbourhood name | -| `geometry` | POLYGON | Boundary geometry | - -#### Dataset: `neighbourhood_profiles` (Census-linked) - -| Column | Data Type | Description | -|--------|-----------|-------------| -| `neighbourhood_id` | INTEGER | Links to neighbourhoods | -| `population` | INTEGER | Total population | -| `avg_household_income` | DECIMAL | Average household income | -| `dwelling_count` | INTEGER | Total dwellings | -| `owner_pct` | DECIMAL | % owner-occupied | -| `renter_pct` | DECIMAL | % renter-occupied | - -### Enrichment Potential - -Can overlay demographic context on housing data: -- Income brackets by neighbourhood -- Ownership vs rental ratios -- Population density -- Dwelling type distribution - ---- - -## Data Source #4: Enrichment Data (Density, Education) - -### Purpose - -Provide socioeconomic context to housing price analysis. Enables questions like: -- Do neighbourhoods with higher education attainment have higher prices? 
-- How does population density correlate with price per square foot? - -### Geographic Alignment Reality - -**Critical constraint**: Enrichment data is available at the **158-neighbourhood** level, while core housing data sits at **TRREB districts (~35)** and **CMHC zones (~20)**. These do not align cleanly. - -``` -158 Neighbourhoods (fine) → Enrichment data lives here -(no clean crosswalk) -~35 TRREB Districts (coarse) → Purchase data lives here -~20 CMHC Zones (coarse) → Rental data lives here -``` - -### Available Enrichment Datasets - -#### Dataset: Neighbourhood Profiles (Census) - -| Attribute | Value | -|-----------|-------| -| **Provider** | City of Toronto (via Statistics Canada Census) | -| **URL** | [Toronto Open Data - Neighbourhood Profiles](https://open.toronto.ca/dataset/neighbourhood-profiles/) | -| **Format** | CSV, JSON, XML, XLSX | -| **Update Frequency** | Every 5 years (Census cycle) | -| **Available Years** | 2001, 2006, 2011, 2016, 2021 | -| **Geographic Unit** | 158 neighbourhoods (140 pre-2021) | - -**Key Variables**: - -| Variable | Description | Use Case | -|----------|-------------|----------| -| `population` | Total population | Density calculation | -| `land_area_sqkm` | Area in square kilometers | Density calculation | -| `pop_density_per_sqkm` | Population per km² | Density metric | -| `pct_bachelors_or_higher` | % age 25-64 with bachelor's+ | Education proxy | -| `median_household_income` | Median total household income | Income metric | -| `avg_household_income` | Average total household income | Income metric | -| `pct_owner_occupied` | % owner-occupied dwellings | Tenure split | -| `pct_renter_occupied` | % renter-occupied dwellings | Tenure split | - -**Download URL (2021, 158 neighbourhoods)**: -``` -https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/6e19a90f-971c-46b3-852c-0c48c436d1fc/resource/19d4a806-7385-4889-acf2-256f1e079060/download/nbhd_2021_census_profile_full_158model.xlsx -``` - -### Crime Data — Deferred to 
Portfolio Phase 4 - -Crime data (TPS Neighbourhood Crime Rates) is **not included in V1 scope**. It will be added in portfolio Phase 4 after the Energy Pricing project is complete. - -**Rationale**: -- Crime data is socially/politically sensitive and requires careful methodology documentation -- V1 focuses on core housing metrics and policy events -- Deferral reduces scope creep risk - -**Future Reference** (Portfolio Phase 4): -- Source: [TPS Public Safety Data Portal](https://data.torontopolice.on.ca/) -- Dataset: Neighbourhood Crime Rates (Major Crime Indicators) -- Geographic Unit: 158 neighbourhoods - -### V1 Enrichment Data Summary - -| Measure | Source | Geography | Frequency | Format | Status | -|---------|--------|-----------|-----------|--------|--------| -| **Population Density** | Neighbourhood Profiles | 158 neighbourhoods | Census (5-year) | CSV/JSON | ✅ Ready | -| **Education Attainment** | Neighbourhood Profiles | 158 neighbourhoods | Census (5-year) | CSV/JSON | ✅ Ready | -| **Median Income** | Neighbourhood Profiles | 158 neighbourhoods | Census (5-year) | CSV/JSON | ✅ Ready | -| **Crime Rates (MCI)** | TPS Data Portal | 158 neighbourhoods | Annual | GeoJSON/CSV | Deferred to Portfolio Phase 4 | - ---- - -## Data Source #5: Policy Events - -### Purpose - -Provide temporal context for housing price movements. Display as annotation markers on time series charts. **No causation claims** — correlation/context only. 
- -### Event Schema - -#### Table: `dim_policy_event` - -| Column | Data Type | Description | -|--------|-----------|-------------| -| `event_id` | INTEGER (PK) | Auto-increment primary key | -| `event_date` | DATE | Date event was announced/occurred | -| `effective_date` | DATE | Date policy took effect (if different) | -| `level` | VARCHAR(20) | `federal` / `provincial` / `municipal` | -| `category` | VARCHAR(20) | `monetary` / `tax` / `regulatory` / `supply` / `economic` | -| `title` | VARCHAR(200) | Short event title for display | -| `description` | TEXT | Longer description for tooltip | -| `expected_direction` | VARCHAR(10) | `bearish` / `bullish` / `neutral` | -| `source_url` | VARCHAR(500) | Link to official announcement/documentation | -| `confidence` | VARCHAR(10) | `high` / `medium` / `low` | -| `created_at` | TIMESTAMP | Record creation timestamp | - -### Event Tiers - -| Tier | Level | Category Examples | Inclusion Criteria | -|------|-------|-------------------|-------------------| -| **1** | Federal | BoC rate decisions, OSFI stress tests | Always include; objective, documented | -| **1** | Provincial | Fair Housing Plan, foreign buyer tax, rent control | Always include; legislative record | -| **2** | Municipal | Zoning reforms, development charges | Include if material impact expected | -| **2** | Economic | COVID measures, major employer closures | Include if Toronto-specific impact | -| **3** | Market | Major project announcements | Strict criteria; must be verifiable | - -### Expected Direction Values - -| Value | Meaning | Example | -|-------|---------|---------| -| `bullish` | Expected to increase prices | Rate cut, supply restriction | -| `bearish` | Expected to decrease prices | Rate hike, foreign buyer tax | -| `neutral` | Uncertain or mixed impact | Regulatory clarification | - -### ⚠ Caveats - -- **No causation claims**: Events are context, not explanation -- **Lag effects**: Policy impact may not be immediate -- **Confounding factors**: 
Multiple simultaneous influences -- **Display only**: No statistical analysis in V1 - -### Sample Events (Tier 1) - -| Date | Level | Category | Title | Direction | -|------|-------|----------|-------|-----------| -| 2017-04-20 | provincial | tax | Ontario Fair Housing Plan | bearish | -| 2018-01-01 | federal | regulatory | OSFI B-20 Stress Test | bearish | -| 2020-03-27 | federal | monetary | BoC Emergency Rate Cut (0.25%) | bullish | -| 2022-03-02 | federal | monetary | BoC Rate Hike Cycle Begins | bearish | -| 2023-06-01 | federal | tax | Federal 2-Year Foreign Buyer Ban | bearish | - ---- - -## Data Integration Strategy - -### Temporal Alignment - -| Source | Native Frequency | Alignment Strategy | -|--------|------------------|---------------------| -| TRREB | Monthly | Use as-is | -| CMHC | Annual (October) | Spread to monthly OR display annual overlay | -| Census/Enrichment | 5-year | Static snapshot; display as reference | -| Policy Events | Event-based | Display as vertical markers on time axis | - -**Recommendation**: Keep separate time axes. TRREB monthly for purchases, CMHC annual for rentals. Don't force artificial monthly rental data. 
- -### Geographic Alignment - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ VISUALIZATION APPROACH │ -├─────────────────────────────────────────────────────────────────┤ -│ │ -│ Purchase Mode Rental Mode │ -│ ───────────────── ────────────── │ -│ Map: TRREB Districts Map: CMHC Zones │ -│ Time: Monthly slider Time: Annual selector │ -│ Metrics: Price, Sales Metrics: Rent, Vacancy │ -│ │ -│ ┌───────────────────────────────────────────────────────┐ │ -│ │ City Neighbourhoods Overlay │ │ -│ │ (158 boundaries as reference layer) │ │ -│ │ + Enrichment data (density, education, income) │ │ -│ └───────────────────────────────────────────────────────┘ │ -│ │ -└─────────────────────────────────────────────────────────────────┘ -``` - -### Enrichment Integration Strategy (Phased) - -#### V1: Reference Overlay (Current Scope) - -**Approach**: Display neighbourhood enrichment as a separate toggle-able layer. No joins to housing data. - -**UX**: -- User hovers over TRREB district → tooltip shows "This district contains neighbourhoods: Annex, Casa Loma, Yorkville..." -- User toggles "Show Enrichment" → choropleth switches to neighbourhood-level density/education/income -- Enrichment and housing metrics displayed side-by-side, not merged - -**Pros**: -- No imputation or dodgy aggregations -- Honest about geographic mismatch -- Ships faster - -**Cons**: -- Can't do correlation analysis (price vs. enrichment) directly in dashboard - -**Implementation**: -- `dim_neighbourhood` as standalone dimension (no FK to fact tables) -- Spatial lookup on hover (point-in-polygon) - -#### V2/Portfolio Phase 4: Area-Weighted Aggregation (Future Scope) - -**Approach**: Pre-compute area-weighted averages of neighbourhood metrics for each TRREB district and CMHC zone. - -**Process**: -1. Spatial join: intersect neighbourhood polygons with TRREB/CMHC polygons -2. Compute overlap area for each neighbourhood-district pair -3. 
Weight neighbourhood metrics by overlap area proportion -4. User selects aggregation method in UI - -**Aggregation Methods to Expose**: - -| Method | Description | Best For | -|--------|-------------|----------| -| **Area-weighted mean** | Weight by % overlap area | Continuous metrics (density) | -| **Population-weighted mean** | Weight by population in overlap | Per-capita metrics (education) | -| **Majority assignment** | Assign neighbourhood to district with >50% overlap | Categorical data | -| **Max overlap** | Assign to single district with largest overlap | 1:1 mapping needs | - -**Default**: Population-weighted (more defensible for per-capita metrics). Hide selector behind "Advanced" toggle. - -### V1 Future-Proofing (Do Now) - -| Action | Why | -|--------|-----| -| Store neighbourhood boundaries in same CRS as TRREB/CMHC (WGS84) | Avoids reprojection headaches | -| Keep `dim_neighbourhood` normalized (not denormalized into district tables) | Clean separation for V2 join | -| Document Census year for each metric | Ready for 2026 Census | -| Include `census_year` column in dim_neighbourhood | Enables SCD tracking | - -### V1 Defer (Don't Do Yet) - -| Action | Why Not | -|--------|---------| -| Pre-compute area-weighted crosswalk | Don't need for V1 | -| Build aggregation method selector UI | No backend to support it | -| Crime data integration | Deferred to Portfolio Phase 4 | -| Historical neighbourhood boundary reconciliation (140→158) | Use 2021+ data only for V1 | - ---- - -## Proposed Data Model - -### Star Schema - -``` -┌──────────────────┐ -│ dim_time │ -├──────────────────┤ -│ date_key (PK) │ -│ year │ -│ month │ -│ quarter │ -│ month_name │ -───────────────────────┘ -│ -┌─────────────────────────────────────────────┐ -│ │ │ -│ -┌──────────────────┐ │ ┌──────────────────┐ -│ dim_trreb_district│ │ │ dim_cmhc_zone │ -├──────────────────┤ │ ├──────────────────┤ -│ district_key (PK)│ │ │ zone_key (PK) │ -│ district_code │ │ │ zone_code │ -│ district_name 
│ │ │ zone_name │ -│ area_type │ │ │ geometry │ -│ geometry │ -───────────────────────┘ │ │ -│ │ │ -│ -┌──────────────────┐ │ ┌──────────────────┐ -│ fact_purchases │ │ │ fact_rentals │ -├──────────────────┤ │ ├──────────────────┤ -│ date_key (FK) │ │ │ date_key (FK) │ -│ district_key (FK)│ │ │ zone_key (FK) │ -│ sales_count │ │ │ bedroom_type │ -│ avg_price │ │ │ avg_rent │ -│ median_price │ │ │ median_rent │ -│ new_listings │ │ │ vacancy_rate │ -│ active_listings │ │ │ universe │ -│ avg_dom │ │ │ turnover_rate │ -│ avg_sp_lp │ │ │ reliability_code │ -─────────────────────┘ │ ─────────────────────┘ -│ - -┌───────────────────────────┐ -│ dim_neighbourhood │ -├───────────────────────────┤ -│ neighbourhood_id (PK) │ -│ name │ -│ geometry │ -│ population │ -│ land_area_sqkm │ -│ pop_density_per_sqkm │ -│ pct_bachelors_or_higher │ -│ median_household_income │ -│ pct_owner_occupied │ -│ pct_renter_occupied │ -│ census_year │ ← For SCD tracking -──────────────────────────────┘ - -┌───────────────────────────┐ -│ dim_policy_event │ -├───────────────────────────┤ -│ event_id (PK) │ -│ event_date │ -│ effective_date │ -│ level │ ← federal/provincial/municipal -│ category │ ← monetary/tax/regulatory/supply/economic -│ title │ -│ description │ -│ expected_direction │ ← bearish/bullish/neutral -│ source_url │ -│ confidence │ ← high/medium/low -│ created_at │ -──────────────────────────────┘ - -┌───────────────────────────┐ -│ bridge_district_neighbourhood │ ← Portfolio Phase 4 ONLY -├───────────────────────────┤ -│ district_key (FK) │ -│ neighbourhood_id (FK) │ -│ area_overlap_pct │ -│ population_overlap │ ← For pop-weighted agg -──────────────────────────────┘ -``` - -**Notes**: -- `dim_neighbourhood` has no FK relationship to fact tables in V1 -- `dim_policy_event` is standalone (no FK to facts); used for time-series annotation -- `bridge_district_neighbourhood` is Portfolio Phase 4 scope only -- Similar bridge table needed for CMHC zones in Portfolio Phase 4 - ---- - -## 
File Structure - -> **Note**: Toronto Housing data logic lives in `portfolio_app/toronto/`. See `portfolio_project_plan_v5.md` for full project structure. - -### Data Directory Structure - -``` -data/ -└── toronto/ - ├── raw/ - │ ├── trreb/ - │ │ └── market_watch_YYYY_MM.pdf - │ ├── cmhc/ - │ │ └── rental_survey_YYYY.csv - │ ├── enrichment/ - │ │ └── neighbourhood_profiles_2021.xlsx - │ └── geo/ - │ ├── toronto_neighbourhoods.geojson - │ ├── trreb_districts.geojson ← (to be created via QGIS) - │ └── cmhc_zones.geojson ← (from R cmhc package) - │ - ├── processed/ ← gitignored - │ ├── fact_purchases.parquet - │ ├── fact_rentals.parquet - │ ├── dim_time.parquet - │ ├── dim_trreb_district.parquet - │ ├── dim_cmhc_zone.parquet - │ ├── dim_neighbourhood.parquet - │ └── dim_policy_event.parquet - │ - └── reference/ - ├── policy_events.csv ← Curated event list - └── neighbourhood_boundary_changelog.md ← 140→158 notes -``` - -### Code Module Structure - -``` -portfolio_app/toronto/ -├── __init__.py -├── parsers/ -│ ├── __init__.py -│ ├── trreb.py # PDF extraction -│ └── cmhc.py # CSV processing -├── loaders/ -│ ├── __init__.py -│ └── database.py # DB operations -├── schemas/ # Pydantic models -│ ├── __init__.py -│ ├── trreb.py -│ ├── cmhc.py -│ ├── enrichment.py -│ └── policy_event.py -├── models/ # SQLAlchemy ORM -│ ├── __init__.py -│ ├── base.py # DeclarativeBase, engine -│ ├── dimensions.py # dim_time, dim_trreb_district, dim_policy_event, etc. -│ └── facts.py # fact_purchases, fact_rentals -└── transforms/ - └── __init__.py -``` - -### Notebooks - -``` -notebooks/ -├── 01_trreb_pdf_extraction.ipynb -├── 02_cmhc_data_prep.ipynb -├── 03_geo_layer_prep.ipynb -├── 04_enrichment_data_prep.ipynb -├── 05_policy_events_curation.ipynb -└── 06_spatial_crosswalk.ipynb ← Portfolio Phase 4 only -``` - ---- - -## ✅ Implementation Checklist - -> **Note**: These are **Stages** within the Toronto Housing project (Portfolio Phase 1). 
They are distinct from the overall portfolio **Phases** defined in `portfolio_project_plan_v5.md`. - -### Stage 1: Data Acquisition -- [ ] Download TRREB monthly PDFs (2020-present as MVP) -- [ ] Register for CMHC portal and export Toronto rental data -- [ ] Extract CMHC zone boundaries via R `cmhc` package -- [ ] Download City of Toronto neighbourhood GeoJSON (158 boundaries) -- [ ] Digitize TRREB district boundaries in QGIS -- [ ] Download Neighbourhood Profiles (2021 Census, 158-model) - -### Stage 2: Data Processing -- [ ] Build TRREB PDF parser (`portfolio_app/toronto/parsers/trreb.py`) -- [ ] Build Pydantic schemas (`portfolio_app/toronto/schemas/`) -- [ ] Build SQLAlchemy models (`portfolio_app/toronto/models/`) -- [ ] Extract and validate TRREB monthly summaries -- [ ] Clean and structure CMHC rental data -- [ ] Process Neighbourhood Profiles into `dim_neighbourhood` -- [ ] Curate and load policy events into `dim_policy_event` -- [ ] Create dimension tables -- [ ] Build fact tables -- [ ] Validate all geospatial layers use same CRS (WGS84/EPSG:4326) - -### Stage 3: Visualization (V1) -- [ ] Create dashboard page (`portfolio_app/pages/toronto/dashboard.py`) -- [ ] Build choropleth figures (`portfolio_app/figures/choropleth.py`) -- [ ] Build time series figures (`portfolio_app/figures/time_series.py`) -- [ ] Design dashboard layout (purchase/rental toggle) -- [ ] Implement choropleth map with layer switching -- [ ] Add time slider/selector -- [ ] Build neighbourhood overlay (toggle-able) -- [ ] Add enrichment layer toggle (density/education/income choropleth) -- [ ] Add policy event markers on time series -- [ ] Add tooltips with cross-reference info ("This district contains...") -- [ ] Add tooltips showing enrichment metrics on hover - -### Stage 4: Polish (V1) -- [ ] Add data source citations -- [ ] Document methodology (especially geographic limitations) -- [ ] Write docs (`docs/methodology.md`, `docs/data_sources.md`) -- [ ] Deploy to portfolio - -### 
Future Enhancements (Portfolio Phase 4 — Post-Energy Project) -- [ ] Add crime data to dim_neighbourhood -- [ ] Build spatial crosswalk (neighbourhood ↔ district/zone intersections) -- [ ] Compute area-weighted and population-weighted aggregations -- [ ] Add aggregation method selector to UI -- [ ] Enable correlation analysis (price vs. enrichment metrics) -- [ ] Add historical neighbourhood boundary support (140→158) - -**Deployment & dbt Architecture**: See `portfolio_project_plan_v5.md` for: -- dbt layer structure and testing strategy -- Deployment architecture -- Data quality framework - ---- - -## References & Links - -### Core Housing Data - -| Resource | URL | -|----------|-----| -| TRREB Market Watch | https://trreb.ca/index.php/market-news/market-watch | -| CMHC Housing Portal | https://www03.cmhc-schl.gc.ca/hmip-pimh/ | - -### Geographic Boundaries - -| Resource | URL | -|----------|-----| -| Toronto Neighbourhoods GeoJSON | https://github.com/jasonicarter/toronto-geojson | -| TRREB District Map (PDF) | https://webapp.proptx.ca/trrebdata/common/maps/Toronto.pdf | -| Statistics Canada Census Tracts | https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/index-eng.cfm | -| R `cmhc` package (CRAN) | https://cran.r-project.org/package=cmhc | - -### Enrichment Data - -| Resource | URL | -|----------|-----| -| Toronto Open Data Portal | https://open.toronto.ca/ | -| Neighbourhood Profiles (CKAN) | https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/neighbourhood-profiles | -| Neighbourhood Profiles 2021 (Direct Download) | https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/6e19a90f-971c-46b3-852c-0c48c436d1fc/resource/19d4a806-7385-4889-acf2-256f1e079060/download/nbhd_2021_census_profile_full_158model.xlsx | - -### Policy Events Research - -| Resource | URL | -|----------|-----| -| Bank of Canada Interest Rates | https://www.bankofcanada.ca/rates/interest-rates/ | -| OSFI (Stress Test Rules) | https://www.osfi-bsif.gc.ca/ | 
-| Ontario Legislature (Bills) | https://www.ola.org/ | - -### Reference Documentation - -| Resource | URL | -|----------|-----| -| Statistics Canada 2021 Census Reference | https://www12.statcan.gc.ca/census-recensement/2021/ref/index-eng.cfm | -| City of Toronto Neighbourhood Profiles Overview | https://www.toronto.ca/city-government/data-research-maps/neighbourhoods-communities/neighbourhood-profiles/ | - ---- - -## Related Documents - -| Document | Relationship | Use For | -|----------|--------------|---------| -| `portfolio_project_plan_v5.md` | Parent document | Overall scope, phasing, tech stack, deployment, dbt architecture, data quality framework | - ---- - -*Document Version: 5.1* -*Updated: January 2026* -*Project: Toronto Housing Price Dashboard — Portfolio Piece* diff --git a/docs/wbs_sprint_plan_v4.md b/docs/wbs_sprint_plan_v4.md deleted file mode 100644 index b261ec8..0000000 --- a/docs/wbs_sprint_plan_v4.md +++ /dev/null @@ -1,794 +0,0 @@ -# Work Breakdown Structure & Sprint Plan - -**Project**: Toronto Housing Dashboard (Portfolio Phase 1) -**Version**: 4.1 -**Date**: January 2026 - ---- - -## Document Context - -| Attribute | Value | -|-----------|-------| -| **Parent Documents** | `portfolio_project_plan_v5.md`, `toronto_housing_dashboard_spec_v5.md` | -| **Content Source** | `bio_content_v2.md` | -| **Role** | Executable sprint plan for Phase 1 delivery | - ---- - -## Milestones - -| Milestone | Deliverable | Target Sprint | -|-----------|-------------|---------------| -| **Launch 1** | Bio Landing Page | Sprint 2 | -| **Launch 2** | Toronto Housing Dashboard | Sprint 6 | - ---- - -## WBS Structure - -``` -1.0 Launch 1: Bio Landing Page -├── 1.1 Project Bootstrap -├── 1.2 Infrastructure -├── 1.3 Application Foundation -├── 1.4 Bio Page -└── 1.5 Deployment - -2.0 Launch 2: Toronto Housing Dashboard -├── 2.1 Data Acquisition -├── 2.2 Data Processing -├── 2.3 Database Layer -├── 2.4 dbt Transformation -├── 2.5 Visualization -├── 2.6 Documentation 
-└── 2.7 Operations -``` - ---- - -## Launch 1: Bio Landing Page - -### 1.1 Project Bootstrap - -| ID | Task | Depends On | Effort | Complexity | -|----|------|------------|--------|------------| -| 1.1.1 | Git repository initialization | — | Low | Low | -| 1.1.2 | Create `.gitignore` | 1.1.1 | Low | Low | -| 1.1.3 | Create `pyproject.toml` | 1.1.1 | Low | Low | -| 1.1.4 | Create `.python-version` (3.11+) | 1.1.1 | Low | Low | -| 1.1.5 | Create `.env.example` | 1.1.1 | Low | Low | -| 1.1.6 | Create `README.md` (initial) | 1.1.1 | Low | Low | -| 1.1.7 | Create `CLAUDE.md` | 1.1.1 | Low | Low | -| 1.1.8 | Create `Makefile` with all targets | 1.1.3 | Low | Medium | - -### 1.2 Infrastructure - -| ID | Task | Depends On | Effort | Complexity | -|----|------|------------|--------|------------| -| 1.2.1 | Python env setup (pyenv, venv, deps) | 1.1.3, 1.1.4 | Low | Low | -| 1.2.2 | Create `.pre-commit-config.yaml` | 1.2.1 | Low | Low | -| 1.2.3 | Install pre-commit hooks | 1.2.2 | Low | Low | -| 1.2.4 | Create `docker-compose.yml` (PostgreSQL + PostGIS) | 1.1.5 | Low | Low | -| 1.2.5 | Create `scripts/` directory structure | 1.1.1 | Low | Low | -| 1.2.6 | Create `scripts/docker/up.sh` | 1.2.5 | Low | Low | -| 1.2.7 | Create `scripts/docker/down.sh` | 1.2.5 | Low | Low | -| 1.2.8 | Create `scripts/docker/logs.sh` | 1.2.5 | Low | Low | -| 1.2.9 | Create `scripts/docker/rebuild.sh` | 1.2.5 | Low | Low | -| 1.2.10 | Create `scripts/db/init.sh` (PostGIS extension) | 1.2.5 | Low | Low | -| 1.2.11 | Create `scripts/dev/setup.sh` | 1.2.5 | Low | Low | -| 1.2.12 | Verify Docker + PostGIS working | 1.2.4, 1.2.10 | Low | Low | - -### 1.3 Application Foundation - -| ID | Task | Depends On | Effort | Complexity | -|----|------|------------|--------|------------| -| 1.3.1 | Create `portfolio_app/` directory structure (full tree) | 1.2.1 | Low | Low | -| 1.3.2 | Create `portfolio_app/__init__.py` | 1.3.1 | Low | Low | -| 1.3.3 | Create `portfolio_app/config.py` (Pydantic BaseSettings) | 
1.3.1 | Low | Medium | -| 1.3.4 | Create `portfolio_app/errors/__init__.py` | 1.3.1 | Low | Low | -| 1.3.5 | Create `portfolio_app/errors/exceptions.py` | 1.3.4 | Low | Low | -| 1.3.6 | Create `portfolio_app/errors/handlers.py` | 1.3.5 | Low | Medium | -| 1.3.7 | Create `portfolio_app/app.py` (Dash + Pages routing) | 1.3.3 | Low | Medium | -| 1.3.8 | Configure dash-mantine-components theme | 1.3.7 | Low | Low | -| 1.3.9 | Create `portfolio_app/assets/` directory | 1.3.1 | Low | Low | -| 1.3.10 | Create `portfolio_app/assets/styles.css` | 1.3.9 | Low | Medium | -| 1.3.11 | Create `portfolio_app/assets/variables.css` | 1.3.9 | Low | Low | -| 1.3.12 | Add `portfolio_app/assets/favicon.ico` | 1.3.9 | Low | Low | -| 1.3.13 | Create `portfolio_app/assets/images/` directory | 1.3.9 | Low | Low | -| 1.3.14 | Create `tests/` directory structure | 1.2.1 | Low | Low | -| 1.3.15 | Create `tests/__init__.py` | 1.3.14 | Low | Low | -| 1.3.16 | Create `tests/conftest.py` | 1.3.14 | Low | Medium | -| 1.3.17 | Configure pytest in `pyproject.toml` | 1.1.3, 1.3.14 | Low | Low | - -### 1.4 Bio Page - -| ID | Task | Depends On | Effort | Complexity | -|----|------|------------|--------|------------| -| 1.4.1 | Create `portfolio_app/components/__init__.py` | 1.3.1 | Low | Low | -| 1.4.2 | Create `portfolio_app/components/navbar.py` | 1.4.1, 1.3.8 | Low | Low | -| 1.4.3 | Create `portfolio_app/components/footer.py` | 1.4.1, 1.3.8 | Low | Low | -| 1.4.4 | Create `portfolio_app/components/cards.py` | 1.4.1, 1.3.8 | Low | Low | -| 1.4.5 | Create `portfolio_app/pages/__init__.py` | 1.3.1 | Low | Low | -| 1.4.6 | Create `portfolio_app/pages/home.py` (layout) | 1.4.5, 1.4.2, 1.4.3 | Low | Low | -| 1.4.7 | Integrate bio content from `bio_content_v2.md` | 1.4.6 | Low | Low | -| 1.4.8 | Replace social link placeholders with real URLs | 1.4.7 | Low | Low | -| 1.4.9 | Implement project cards (deployed/in-dev logic) | 1.4.4, 1.4.6 | Low | Low | -| 1.4.10 | Test bio page renders locally | 1.4.9 | Low 
| Low | - -### 1.5 Deployment - -| ID | Task | Depends On | Effort | Complexity | -|----|------|------------|--------|------------| -| 1.5.1 | Install PostgreSQL + PostGIS on VPS | — | Low | Low | -| 1.5.2 | Configure firewall (ufw: SSH, HTTP, HTTPS) | 1.5.1 | Low | Low | -| 1.5.3 | Create application database user | 1.5.1 | Low | Low | -| 1.5.4 | Create Gunicorn systemd service file | 1.4.10 | Low | Low | -| 1.5.5 | Configure Nginx reverse proxy | 1.5.4 | Low | Low | -| 1.5.6 | Configure SSL (certbot) | 1.5.5 | Low | Low | -| 1.5.7 | Create `scripts/deploy/deploy.sh` | 1.2.5 | Low | Low | -| 1.5.8 | Create `scripts/deploy/health-check.sh` | 1.2.5 | Low | Low | -| 1.5.9 | Deploy bio page | 1.5.6, 1.5.7 | Low | Low | -| 1.5.10 | Verify HTTPS access | 1.5.9 | Low | Low | - ---- - -## Launch 2: Toronto Housing Dashboard - -### 2.1 Data Acquisition - -| ID | Task | Depends On | Effort | Complexity | -|----|------|------------|--------|------------| -| 2.1.1 | Define TRREB year scope + download PDFs | — | Low | Low | -| 2.1.2 | **HUMAN**: Digitize TRREB district boundaries (QGIS) | 2.1.1 | High | High | -| 2.1.3 | Register for CMHC portal | — | Low | Low | -| 2.1.4 | Export CMHC Toronto rental CSVs | 2.1.3 | Low | Low | -| 2.1.5 | Extract CMHC zone boundaries (R cmhc package) | 2.1.3 | Low | Medium | -| 2.1.6 | Download neighbourhoods GeoJSON (158 boundaries) | — | Low | Low | -| 2.1.7 | Download Neighbourhood Profiles 2021 (xlsx) | — | Low | Low | -| 2.1.8 | Validate CRS alignment (all geo files WGS84) | 2.1.2, 2.1.5, 2.1.6 | Low | Medium | -| 2.1.9 | Research Tier 1 policy events (10–20 events) | — | Mid | Medium | -| 2.1.10 | Create `data/toronto/reference/policy_events.csv` | 2.1.9 | Low | Low | -| 2.1.11 | Create `data/` directory structure | 1.3.1 | Low | Low | -| 2.1.12 | Organize raw files into `data/toronto/raw/` | 2.1.11 | Low | Low | -| 2.1.13 | Test TRREB parser across year boundaries | 2.2.3 | Low | Medium | - -### 2.2 Data Processing - -| ID | Task | 
Depends On | Effort | Complexity | -|----|------|------------|--------|------------| -| 2.2.1 | Create `portfolio_app/toronto/__init__.py` | 1.3.1 | Low | Low | -| 2.2.2 | Create `portfolio_app/toronto/parsers/__init__.py` | 2.2.1 | Low | Low | -| 2.2.3 | Build TRREB PDF parser (`parsers/trreb.py`) | 2.2.2, 2.1.1 | Mid | High | -| 2.2.4 | TRREB data cleaning/normalization | 2.2.3 | Low | Medium | -| 2.2.5 | TRREB parser unit tests | 2.2.4 | Low | Low | -| 2.2.6 | Build CMHC CSV processor (`parsers/cmhc.py`) | 2.2.2, 2.1.4 | Low | Low | -| 2.2.7 | CMHC reliability code handling | 2.2.6 | Low | Low | -| 2.2.8 | CMHC processor unit tests | 2.2.7 | Low | Low | -| 2.2.9 | Build Neighbourhood Profiles parser | 2.2.1, 2.1.7 | Low | Low | -| 2.2.10 | Policy events CSV loader | 2.2.1, 2.1.10 | Low | Low | - -### 2.3 Database Layer - -| ID | Task | Depends On | Effort | Complexity | -|----|------|------------|--------|------------| -| 2.3.1 | Create `portfolio_app/toronto/schemas/__init__.py` | 2.2.1 | Low | Low | -| 2.3.2 | Create TRREB Pydantic schemas (`schemas/trreb.py`) | 2.3.1 | Low | Medium | -| 2.3.3 | Create CMHC Pydantic schemas (`schemas/cmhc.py`) | 2.3.1 | Low | Medium | -| 2.3.4 | Create enrichment Pydantic schemas (`schemas/enrichment.py`) | 2.3.1 | Low | Low | -| 2.3.5 | Create policy event Pydantic schema (`schemas/policy_event.py`) | 2.3.1 | Low | Low | -| 2.3.6 | Create `portfolio_app/toronto/models/__init__.py` | 2.2.1 | Low | Low | -| 2.3.7 | Create SQLAlchemy base (`models/base.py`) | 2.3.6, 1.3.3 | Low | Medium | -| 2.3.8 | Create dimension models (`models/dimensions.py`) | 2.3.7 | Low | Medium | -| 2.3.9 | Create fact models (`models/facts.py`) | 2.3.8 | Low | Medium | -| 2.3.10 | Create `portfolio_app/toronto/loaders/__init__.py` | 2.2.1 | Low | Low | -| 2.3.11 | Create dimension loaders (`loaders/database.py`) | 2.3.10, 2.3.8 | Low | Medium | -| 2.3.12 | Create fact loaders | 2.3.11, 2.3.9, 2.2.4, 2.2.7 | Mid | Medium | -| 2.3.13 | Loader integration 
tests | 2.3.12 | Low | Medium | -| 2.3.14 | Create SQL views for dashboard queries | 2.3.12 | Low | Medium | - -### 2.4 dbt Transformation - -| ID | Task | Depends On | Effort | Complexity | -|----|------|------------|--------|------------| -| 2.4.1 | Create `dbt/` directory structure | 1.3.1 | Low | Low | -| 2.4.2 | Create `dbt/dbt_project.yml` | 2.4.1 | Low | Low | -| 2.4.3 | Create `dbt/profiles.yml` | 2.4.1, 1.3.3 | Low | Low | -| 2.4.4 | Create `scripts/dbt/run.sh` | 1.2.5 | Low | Low | -| 2.4.5 | Create `scripts/dbt/test.sh` | 1.2.5 | Low | Low | -| 2.4.6 | Create `scripts/dbt/docs.sh` | 1.2.5 | Low | Low | -| 2.4.7 | Create `scripts/dbt/fresh.sh` | 1.2.5 | Low | Low | -| 2.4.8 | Create staging models (`stg_trreb__monthly`, `stg_cmhc__rental`) | 2.4.3, 2.3.12 | Low | Medium | -| 2.4.9 | Create intermediate models | 2.4.8 | Low | Medium | -| 2.4.10 | Create mart models | 2.4.9 | Low | Medium | -| 2.4.11 | Create dbt schema tests (unique, not_null, relationships) | 2.4.10 | Low | Medium | -| 2.4.12 | Create custom dbt tests (anomaly detection) | 2.4.11 | Low | Medium | -| 2.4.13 | Create dbt documentation (schema.yml) | 2.4.10 | Low | Low | - -### 2.5 Visualization - -| ID | Task | Depends On | Effort | Complexity | -|----|------|------------|--------|------------| -| 2.5.1 | Create `portfolio_app/figures/__init__.py` | 1.3.1 | Low | Low | -| 2.5.2 | Build choropleth factory (`figures/choropleth.py`) | 2.5.1, 2.1.8 | Mid | Medium | -| 2.5.3 | Build time series factory (`figures/time_series.py`) | 2.5.1 | Low | Medium | -| 2.5.4 | Build YoY change chart factory (`figures/statistical.py`) | 2.5.1 | Low | Medium | -| 2.5.5 | Build seasonality decomposition chart | 2.5.4 | Low | Medium | -| 2.5.6 | Build district correlation matrix chart | 2.5.4 | Low | Medium | -| 2.5.7 | Create `portfolio_app/pages/toronto/__init__.py` | 1.4.5 | Low | Low | -| 2.5.8 | Create `portfolio_app/pages/toronto/dashboard.py` (layout only) | 2.5.7, 1.4.2, 1.4.3 | Mid | High | -| 2.5.9 | 
Implement purchase/rental mode toggle | 2.5.8 | Low | Low | -| 2.5.10 | Implement monthly time slider | 2.5.8 | Low | Medium | -| 2.5.11 | Implement annual time selector (CMHC) | 2.5.8 | Low | Low | -| 2.5.12 | Implement layer toggles (districts/zones/neighbourhoods) | 2.5.8 | Low | Medium | -| 2.5.13 | Create `portfolio_app/pages/toronto/callbacks/__init__.py` | 2.5.7 | Low | Low | -| 2.5.14 | Create `callbacks/map_callbacks.py` | 2.5.13, 2.5.2 | Mid | Medium | -| 2.5.15 | Create `callbacks/filter_callbacks.py` | 2.5.13 | Low | Medium | -| 2.5.16 | Create `callbacks/timeseries_callbacks.py` | 2.5.13, 2.5.3 | Low | Medium | -| 2.5.17 | Implement district/zone tooltips | 2.5.14 | Low | Low | -| 2.5.18 | Implement neighbourhood overlay | 2.5.14, 2.1.6 | Low | Medium | -| 2.5.19 | Implement enrichment layer toggle | 2.5.18 | Low | Medium | -| 2.5.20 | Implement policy event markers on time series | 2.5.16, 2.2.10 | Low | Medium | -| 2.5.21 | Implement "district contains neighbourhoods" tooltip | 2.5.17 | Low | Low | -| 2.5.22 | Test dashboard renders with sample data | 2.5.20 | Low | Medium | - -### 2.6 Documentation - -| ID | Task | Depends On | Effort | Complexity | -|----|------|------------|--------|------------| -| 2.6.1 | Create `docs/` directory | 1.3.1 | Low | Low | -| 2.6.2 | Write `docs/methodology.md` (geographic limitations) | 2.5.22 | Low | Medium | -| 2.6.3 | Write `docs/data_sources.md` (citations) | 2.5.22 | Low | Low | -| 2.6.4 | Write `docs/user_guide.md` | 2.5.22 | Low | Low | -| 2.6.5 | Update `README.md` (final) | 2.6.2, 2.6.3 | Low | Low | -| 2.6.6 | Update `CLAUDE.md` (final) | 2.6.5 | Low | Low | - -### 2.7 Operations - -| ID | Task | Depends On | Effort | Complexity | -|----|------|------------|--------|------------| -| 2.7.1 | Create `scripts/db/backup.sh` | 1.2.5 | Low | Low | -| 2.7.2 | Create `scripts/db/restore.sh` | 1.2.5 | Low | Low | -| 2.7.3 | Create `scripts/db/reset.sh` (dev only) | 1.2.5 | Low | Low | -| 2.7.4 | Create 
`scripts/deploy/rollback.sh` | 1.2.5 | Low | Medium | -| 2.7.5 | Implement backup retention policy | 2.7.1 | Low | Low | -| 2.7.6 | Add `/health` endpoint | 2.5.8 | Low | Low | -| 2.7.7 | Configure uptime monitoring (external) | 2.7.6 | Low | Low | -| 2.7.8 | Deploy Toronto dashboard | 1.5.9, 2.5.22 | Low | Low | -| 2.7.9 | Verify production deployment | 2.7.8 | Low | Low | - ---- - -## L3 Task Details - -### 1.1 Project Bootstrap - -#### 1.1.1 Git repository initialization -| Attribute | Value | -|-----------|-------| -| **What** | Initialize git repo with main branch | -| **How** | `git init`, initial commit | -| **Inputs** | — | -| **Outputs** | `.git/` directory | -| **Why** | Version control foundation | - -#### 1.1.2 Create `.gitignore` -| Attribute | Value | -|-----------|-------| -| **What** | Git ignore rules per project plan | -| **How** | Create file with patterns for: `.env`, `data/*/processed/`, `reports/`, `backups/`, `notebooks/*.html`, `__pycache__/`, `.venv/` | -| **Inputs** | Project plan → Directory Rules | -| **Outputs** | `.gitignore` | - -#### 1.1.3 Create `pyproject.toml` -| Attribute | Value | -|-----------|-------| -| **What** | Python packaging config | -| **How** | Define project metadata, dependencies, tool configs (ruff, mypy, pytest) | -| **Inputs** | Tech stack versions from project plan | -| **Outputs** | `pyproject.toml` | -| **Dependencies** | PostgreSQL 16.x, Pydantic ≥2.0, SQLAlchemy ≥2.0, dbt-postgres ≥1.7, Pandas ≥2.1, GeoPandas ≥0.14, Dash ≥2.14, dash-mantine-components (latest), pytest ≥7.0 | - -#### 1.1.4 Create `.python-version` -| Attribute | Value | -|-----------|-------| -| **What** | pyenv version file | -| **How** | Single line: `3.11` or specific patch version | -| **Outputs** | `.python-version` | - -#### 1.1.5 Create `.env.example` -| Attribute | Value | -|-----------|-------| -| **What** | Environment variable template | -| **How** | Template with: DATABASE_URL, POSTGRES_USER, POSTGRES_PASSWORD, POSTGRES_DB, 
DASH_DEBUG, SECRET_KEY, LOG_LEVEL | -| **Inputs** | Project plan → Environment Setup | -| **Outputs** | `.env.example` | - -#### 1.1.6 Create `README.md` (initial) -| Attribute | Value | -|-----------|-------| -| **What** | Project overview stub | -| **How** | Title, brief description, "Setup coming soon" | -| **Outputs** | `README.md` | - -#### 1.1.7 Create `CLAUDE.md` -| Attribute | Value | -|-----------|-------| -| **What** | AI assistant context file | -| **How** | Project context, architecture decisions, patterns, conventions | -| **Inputs** | Project plan → Code Architecture | -| **Outputs** | `CLAUDE.md` | -| **Why** | Claude Code effectiveness from day 1 | - -#### 1.1.8 Create `Makefile` -| Attribute | Value | -|-----------|-------| -| **What** | All make targets from project plan | -| **How** | Implement targets: setup, venv, clean, docker-up/down/logs/rebuild, db-init/backup/restore/reset, run, run-prod, dbt-run/test/docs/fresh, test, test-cov, lint, format, typecheck, ci, deploy, rollback | -| **Inputs** | Project plan → Makefile Targets | -| **Outputs** | `Makefile` | - -### 1.2 Infrastructure - -#### 1.2.4 Create `docker-compose.yml` -| Attribute | Value | -|-----------|-------| -| **What** | Docker Compose V2 for PostgreSQL 16 + PostGIS | -| **How** | Service definition, volume mounts, port 5432, env vars from `.env` | -| **Inputs** | `.env.example` | -| **Outputs** | `docker-compose.yml` | -| **Note** | No `version` field (Docker Compose V2) | - -#### 1.2.5 Create `scripts/` directory structure -| Attribute | Value | -|-----------|-------| -| **What** | Full scripts tree per project plan | -| **How** | `mkdir -p scripts/{db,docker,deploy,dbt,dev}` | -| **Outputs** | `scripts/db/`, `scripts/docker/`, `scripts/deploy/`, `scripts/dbt/`, `scripts/dev/` | - -#### 1.2.10 Create `scripts/db/init.sh` -| Attribute | Value | -|-----------|-------| -| **What** | Database initialization with PostGIS | -| **How** | `CREATE DATABASE`, `CREATE EXTENSION postgis`, 
schema creation | -| **Standard** | `set -euo pipefail`, usage comment, idempotent | -| **Outputs** | `scripts/db/init.sh` | - -### 1.3 Application Foundation - -#### 1.3.1 Create `portfolio_app/` directory structure -| Attribute | Value | -|-----------|-------| -| **What** | Full application tree per project plan | -| **Directories** | `portfolio_app/`, `portfolio_app/assets/`, `portfolio_app/assets/images/`, `portfolio_app/pages/`, `portfolio_app/pages/toronto/`, `portfolio_app/pages/toronto/callbacks/`, `portfolio_app/components/`, `portfolio_app/figures/`, `portfolio_app/toronto/`, `portfolio_app/toronto/parsers/`, `portfolio_app/toronto/loaders/`, `portfolio_app/toronto/schemas/`, `portfolio_app/toronto/models/`, `portfolio_app/toronto/transforms/`, `portfolio_app/errors/` | -| **Pattern** | Callbacks in `pages/{dashboard}/callbacks/` per project plan | - -#### 1.3.3 Create `config.py` -| Attribute | Value | -|-----------|-------| -| **What** | Pydantic BaseSettings for config | -| **How** | Settings class loading from `.env` | -| **Fields** | DATABASE_URL, POSTGRES_USER, POSTGRES_PASSWORD, POSTGRES_DB, DASH_DEBUG, SECRET_KEY, LOG_LEVEL | - -#### 1.3.5 Create `exceptions.py` -| Attribute | Value | -|-----------|-------| -| **What** | Exception hierarchy per project plan | -| **Classes** | `PortfolioError` (base), `ParseError`, `ValidationError`, `LoadError` | - -#### 1.3.6 Create `handlers.py` -| Attribute | Value | -|-----------|-------| -| **What** | Error handling decorators | -| **How** | Decorators for: logging/re-raise, retry logic, transaction boundaries, timing | -| **Pattern** | Infrastructure concerns only; domain logic uses explicit handling | - -#### 1.3.7 Create `app.py` -| Attribute | Value | -|-----------|-------| -| **What** | Dash app factory with Pages routing | -| **How** | `Dash(__name__, use_pages=True)`, MantineProvider wrapper | -| **Imports** | External: absolute; Internal: relative (dot notation) | - -#### 1.3.16 Create `conftest.py` 
-| Attribute | Value | -|-----------|-------| -| **What** | pytest fixtures | -| **How** | Test database fixture, sample data fixtures, app client fixture | - -### 1.4 Bio Page - -#### 1.4.7 Integrate bio content -| Attribute | Value | -|-----------|-------| -| **What** | Content from `bio_content_v2.md` | -| **Sections** | Headline, Professional Summary, Tech Stack, Side Project, Availability | -| **Layout** | Hero → Summary → Tech Stack → Project Cards → Social Links → Availability | - -#### 1.4.8 Replace social link placeholders -| Attribute | Value | -|-----------|-------| -| **What** | Replace `[USERNAME]` in LinkedIn/GitHub URLs | -| **Source** | `bio_content_v2.md` → Social Links | -| **Acceptance** | No placeholder text in production | - -#### 1.4.9 Implement project cards -| Attribute | Value | -|-----------|-------| -| **What** | Dynamic project card display | -| **Logic** | Show deployed projects with links; show "In Development" for in-progress; hide or grey out planned | -| **Source** | `bio_content_v2.md` → Portfolio Projects Section | - -### 2.1 Data Acquisition - -#### 2.1.1 Define TRREB year scope + download PDFs -| Attribute | Value | -|-----------|-------| -| **What** | Decide which years to parse for V1, download PDFs | -| **Decision** | 2020—present for V1 (manageable scope, consistent PDF format). Expand to 2007+ in future if needed. 
| -| **Output** | `data/toronto/raw/trreb/market_watch_YYYY_MM.pdf` | -| **Note** | PDF format may vary pre-2018; test before committing to older years | - -#### 2.1.2 Digitize TRREB district boundaries -| Attribute | Value | -|-----------|-------| -| **What** | GeoJSON with ~35 district polygons | -| **Tool** | QGIS | -| **Process** | Import PDF as raster → create vector layer → trace polygons → add attributes (district_code, district_name, area_type) → export GeoJSON (WGS84/EPSG:4326) | -| **Input** | TRREB Toronto.pdf map | -| **Output** | `data/toronto/raw/geo/trreb_districts.geojson` | -| **Effort** | High | -| **Complexity** | High | -| **Note** | HUMAN TASK — not automatable | - -#### 2.1.5 Extract CMHC zone boundaries -| Attribute | Value | -|-----------|-------| -| **What** | GeoJSON with ~20 zone polygons | -| **Tool** | R with cmhc and sf packages | -| **Process** | `get_cmhc_geography(geography_type="ZONE", cma="Toronto")` → `st_write()` to GeoJSON | -| **Output** | `data/toronto/raw/geo/cmhc_zones.geojson` | - -#### 2.1.9 Research Tier 1 policy events -| Attribute | Value | -|-----------|-------| -| **What** | Federal/provincial policy events with dates, descriptions, expected direction | -| **Sources** | Bank of Canada, OSFI, Ontario Legislature | -| **Schema** | event_date, effective_date, level, category, title, description, expected_direction, source_url, confidence | -| **Acceptance** | Minimum 10 events, maximum 20 | -| **Examples** | BoC rate decisions, OSFI B-20, Ontario Fair Housing Plan, foreign buyer tax | - -#### 2.1.13 Test TRREB parser across year boundaries -| Attribute | Value | -|-----------|-------| -| **What** | Verify parser handles PDFs from different years | -| **Test Cases** | 2020 Q1, 2022 Q1, 2024 Q1 (minimum) | -| **Check For** | Table structure changes, column naming variations, page number shifts | -| **Output** | Documented format variations, parser fallbacks if needed | - -### 2.2 Data Processing - -#### 2.2.3 Build TRREB 
PDF parser -| Attribute | Value | -|-----------|-------| -| **What** | Extract summary tables from TRREB PDFs | -| **Tool** | pdfplumber or camelot-py | -| **Location** | Pages 3-4 (Summary by Area) | -| **Fields** | report_date, area_code, area_name, area_type, sales, dollar_volume, avg_price, median_price, new_listings, active_listings, avg_sp_lp, avg_dom | -| **Output** | `portfolio_app/toronto/parsers/trreb.py` | - -#### 2.2.7 CMHC reliability code handling -| Attribute | Value | -|-----------|-------| -| **What** | Parse reliability codes, handle suppression | -| **Codes** | a (excellent), b (good), c (fair), d (poor/caution), ** (suppressed → NULL) | -| **Implementation** | Pydantic validators, enum type | - -### 2.3 Database Layer - -#### 2.3.8 Create dimension models -| Attribute | Value | -|-----------|-------| -| **What** | SQLAlchemy 2.0 models for dimensions | -| **Tables** | `dim_time`, `dim_trreb_district`, `dim_cmhc_zone`, `dim_neighbourhood`, `dim_policy_event` | -| **Geometry** | PostGIS geometry columns for districts, zones, neighbourhoods | -| **Note** | `dim_neighbourhood` has no FK to facts in V1 | - -#### 2.3.9 Create fact models -| Attribute | Value | -|-----------|-------| -| **What** | SQLAlchemy 2.0 models for facts | -| **Tables** | `fact_purchases`, `fact_rentals` | -| **FKs** | fact_purchases → dim_time, dim_trreb_district; fact_rentals → dim_time, dim_cmhc_zone | - -### 2.4 dbt Transformation - -#### 2.4.8 Create staging models -| Attribute | Value | -|-----------|-------| -| **What** | 1:1 source mapping, cleaned and typed | -| **Models** | `stg_trreb__monthly`, `stg_cmhc__rental` | -| **Naming** | `stg_{source}__{entity}` | - -#### 2.4.11 Create dbt schema tests -| Attribute | Value | -|-----------|-------| -| **What** | Data quality tests | -| **Tests** | `unique` (PKs), `not_null` (required), `accepted_values` (reliability codes, area_type), `relationships` (FK integrity) | - -#### 2.4.12 Create custom dbt tests -| Attribute | 
Value | -|-----------|-------| -| **What** | Anomaly detection rules | -| **Rules** | Price MoM change >30% → flag; missing districts → fail; duplicate records → fail | - -### 2.5 Visualization - -#### 2.5.2 Build choropleth factory -| Attribute | Value | -|-----------|-------| -| **What** | Reusable choropleth_mapbox figure generator | -| **Inputs** | GeoDataFrame, metric column, color config | -| **Output** | Plotly figure | -| **Location** | `portfolio_app/figures/choropleth.py` | - -#### 2.5.4—2.5.6 Statistical chart factories -| Attribute | Value | -|-----------|-------| -| **What** | Statistical analysis visualizations | -| **Charts** | YoY change with variance bands, seasonality decomposition, district correlation matrix | -| **Location** | `portfolio_app/figures/statistical.py` | -| **Why** | Required skill demonstration per project plan | - -#### 2.5.8 Create dashboard layout -| Attribute | Value | -|-----------|-------| -| **What** | Toronto dashboard page structure | -| **File** | `portfolio_app/pages/toronto/dashboard.py` | -| **Pattern** | Layout only — no callbacks in this file | -| **Components** | Navbar, choropleth map, time controls, layer toggles, time series panel, statistics panel, footer | - -#### 2.5.13—2.5.16 Create callbacks -| Attribute | Value | -|-----------|-------| -| **What** | Dashboard interaction logic | -| **Location** | `portfolio_app/pages/toronto/callbacks/` | -| **Files** | `__init__.py`, `map_callbacks.py`, `filter_callbacks.py`, `timeseries_callbacks.py` | -| **Pattern** | Separate from layout per project plan callback separation pattern | -| **Registration** | Import callback modules in `callbacks/__init__.py`; import that package in `dashboard.py`. Dash Pages auto-discovers callbacks when module is imported. | - -#### 2.5.22 Test dashboard renders with sample data -| Attribute | Value | -|-----------|-------| -| **What** | Verify dashboard works end-to-end | -| **Sample Data** | Use output from task 2.3.12 (fact loaders). 
Run loaders with subset of parsed data before this task. | -| **Verify** | Choropleth renders, time controls work, tooltips display, no console errors | - ---- - -## Sprint Plan - -### Sprint 1: Project Bootstrap + Start TRREB Digitization - -**Goal**: Dev environment working, repo initialized, TRREB digitization started - -| Task ID | Task | Effort | -|---------|------|--------| -| 1.1.1 | Git repo init | Low | -| 1.1.2 | .gitignore | Low | -| 1.1.3 | pyproject.toml | Low | -| 1.1.4 | .python-version | Low | -| 1.1.5 | .env.example | Low | -| 1.1.6 | README.md (initial) | Low | -| 1.1.7 | CLAUDE.md | Low | -| 1.1.8 | Makefile | Low | -| 1.2.1 | Python env setup | Low | -| 1.2.2 | .pre-commit-config.yaml | Low | -| 1.2.3 | Install pre-commit | Low | -| 1.2.4 | docker-compose.yml | Low | -| 1.2.5 | scripts/ directory structure | Low | -| 1.2.6—1.2.9 | Docker scripts | Low | -| 1.2.10 | scripts/db/init.sh | Low | -| 1.2.11 | scripts/dev/setup.sh | Low | -| 1.2.12 | Verify Docker + PostGIS | Low | -| 1.3.1 | portfolio_app/ directory structure | Low | -| 1.3.2—1.3.6 | App foundation files | Low | -| 1.3.14—1.3.17 | Test infrastructure | Low | -| 2.1.1 | Download TRREB PDFs | Low | -| 2.1.2 | **START** TRREB boundaries (HUMAN) | High | -| 2.1.9 | **START** Policy events research | Mid | - ---- - -### Sprint 2: Bio Page + Data Acquisition - -**Goal**: Bio live, all raw data downloaded - -| Task ID | Task | Effort | -|---------|------|--------| -| 1.3.7 | app.py with Pages | Low | -| 1.3.8 | Theme config | Low | -| 1.3.9—1.3.13 | Assets directory + files | Low | -| 1.4.1—1.4.4 | Components | Low | -| 1.4.5—1.4.10 | Bio page | Low | -| 1.5.1—1.5.3 | VPS setup | Low | -| 1.5.4—1.5.6 | Gunicorn/Nginx/SSL | Low | -| 1.5.7—1.5.8 | Deploy scripts | Low | -| 1.5.9—1.5.10 | Deploy + verify | Low | -| 2.1.2 | **CONTINUE** TRREB boundaries | High | -| 2.1.3—2.1.4 | CMHC registration + export | Low | -| 2.1.5 | CMHC zone boundaries (R) | Low | -| 2.1.6 | Neighbourhoods GeoJSON | Low 
| -| 2.1.7 | Neighbourhood Profiles download | Low | -| 2.1.9 | **CONTINUE** Policy events research | Mid | -| 2.1.10 | policy_events.csv | Low | -| 2.1.11—2.1.12 | data/ directory + organize | Low | - -**Milestone**: **Launch 1 — Bio Live** - ---- - -### Sprint 3: Parsers + Schemas + Models - -**Goal**: ETL pipeline working, database layer complete - -| Task ID | Task | Effort | -|---------|------|--------| -| 2.1.2 | **COMPLETE** TRREB boundaries | High | -| 2.1.8 | CRS validation | Low | -| 2.2.1—2.2.2 | Toronto module init | Low | -| 2.2.3—2.2.5 | TRREB parser + tests | Mid | -| 2.2.6—2.2.8 | CMHC processor + tests | Low | -| 2.2.9 | Neighbourhood Profiles parser | Low | -| 2.2.10 | Policy events loader | Low | -| 2.3.1—2.3.5 | Pydantic schemas | Low | -| 2.3.6—2.3.9 | SQLAlchemy models | Low | - ---- - -### Sprint 4: Loaders + dbt - -**Goal**: Data loaded, transformation layer ready - -| Task ID | Task | Effort | -|---------|------|--------| -| 2.3.10—2.3.13 | Loaders + tests | Mid | -| 2.3.14 | SQL views | Low | -| 2.4.1—2.4.7 | dbt setup + scripts | Low | -| 2.4.8—2.4.10 | dbt models | Low | -| 2.4.11—2.4.12 | dbt tests | Low | -| 2.4.13 | dbt documentation | Low | -| 2.7.1—2.7.3 | DB backup/restore scripts | Low | - ---- - -### Sprint 5: Visualization - -**Goal**: Dashboard functional - -| Task ID | Task | Effort | -|---------|------|--------| -| 2.5.1—2.5.6 | Figure factories | Mid | -| 2.5.7—2.5.12 | Dashboard layout + controls | Mid | -| 2.5.13—2.5.16 | Callbacks | Mid | -| 2.5.17—2.5.21 | Tooltips + overlays + markers | Low | -| 2.5.22 | Test dashboard | Low | - ---- - -### Sprint 6: Polish + Launch 2 - -**Goal**: Dashboard deployed - -| Task ID | Task | Effort | -|---------|------|--------| -| 2.6.1—2.6.6 | Documentation | Low | -| 2.7.4—2.7.5 | Rollback script + retention | Low | -| 2.7.6—2.7.7 | Health endpoint + monitoring | Low | -| 2.7.8—2.7.9 | Deploy + verify | Low | - -**Milestone**: **Launch 2 — Toronto Dashboard Live** - ---- - -### Sprint 7: 
Buffer - -**Goal**: Contingency for slippage, bug fixes - -| Task ID | Task | Effort | -|---------|------|--------| -| — | Overflow from previous sprints | Varies | -| — | Bug fixes | Varies | -| — | UX polish | Low | - ---- - -## Sprint Summary - -| Sprint | Focus | Key Risk | Milestone | -|--------|-------|----------|-----------| -| 1 | Bootstrap + start boundaries | — | — | -| 2 | Bio + data acquisition | TRREB digitization | Launch 1 | -| 3 | Parsers + DB layer | PDF parser, boundaries | — | -| 4 | Loaders + dbt | — | — | -| 5 | Visualization | Choropleth complexity | — | -| 6 | Polish + deploy | — | Launch 2 | -| 7 | Buffer | — | — | - ---- - -## Dependency Graph - -### Launch 1 Critical Path -``` -1.1.1 → 1.1.3 → 1.2.1 → 1.3.1 → 1.3.7 → 1.4.6 → 1.4.10 → 1.5.9 → 1.5.10 -``` - -### Launch 2 Critical Path -``` -2.1.2 (TRREB boundaries) ─┬→ 2.1.8 (CRS) → 2.5.2 (choropleth) → 2.5.8 (layout) → 2.5.22 (test) → 2.7.8 (deploy) - │ -2.1.1 → 2.2.3 (parser) → 2.2.4 → 2.3.12 (loaders) → 2.4.8 (dbt) ─┘ -``` - -### Parallel Tracks (can run simultaneously) - -| Track | Tasks | Can Start | -|-------|-------|-----------| -| **A: TRREB Boundaries** | 2.1.1 → 2.1.2 | Sprint 1 | -| **B: TRREB Parser** | 2.2.3—2.2.5 | Sprint 2 (after PDFs) | -| **C: CMHC** | 2.1.3—2.1.5 → 2.2.6—2.2.8 | Sprint 2 | -| **D: Enrichment** | 2.1.6—2.1.7 → 2.2.9 | Sprint 2 | -| **E: Policy Events** | 2.1.9—2.1.10 → 2.2.10 | Sprint 1—2 | -| **F: Schemas/Models** | 2.3.1—2.3.9 | Sprint 3 (after parsers) | -| **G: dbt** | 2.4.* | Sprint 4 (after loaders) | -| **H: Ops Scripts** | 2.7.1—2.7.5 | Sprint 4 | - ---- - -## Risk Register - -| Risk | Likelihood | Impact | Mitigation | -|------|------------|--------|------------| -| TRREB digitization slips | Medium | High | Start Sprint 1; timebox; accept lower precision initially | -| PDF parser breaks on older years | Medium | Medium | Test multiple years early; build fallbacks | -| PostGIS geometry issues | Low | Medium | Validate CRS before load (2.1.8) | -| 
Choropleth performance | Low | Medium | Pre-aggregate; simplify geometries | -| Policy events research takes too long | Medium | Low | Cap at 10 events minimum; expand post-launch | - ---- - -## Acceptance Criteria - -### Launch 1 -- [ ] Bio page accessible via HTTPS -- [ ] All content from `bio_content_v2.md` rendered -- [ ] No placeholder text ([USERNAME]) visible -- [ ] Mobile responsive -- [ ] Social links functional - -### Launch 2 -- [ ] Choropleth renders TRREB districts -- [ ] Choropleth renders CMHC zones -- [ ] Purchase/rental mode toggle works -- [ ] Time navigation works (monthly for TRREB, annual for CMHC) -- [ ] Policy event markers visible on time series -- [ ] Neighbourhood overlay toggleable -- [ ] Methodology documentation published -- [ ] Data sources cited -- [ ] Health endpoint responds - ---- - -## Effort Legend - -| Level | Meaning | -|-------|---------| -| **Low** | Straightforward; minimal iteration expected | -| **Mid** | Requires debugging or multi-step coordination | -| **High** | Complex logic, external tools, or human intervention required | - ---- - -*Document Version: 4.1* -*Created: January 2026* diff --git a/portfolio_app/figures/__init__.py b/portfolio_app/figures/__init__.py index 73000e8..bf9dabc 100644 --- a/portfolio_app/figures/__init__.py +++ b/portfolio_app/figures/__init__.py @@ -2,7 +2,6 @@ from .choropleth import ( create_choropleth_figure, - create_district_map, create_zone_map, ) from .summary_cards import create_metric_card_figure, create_summary_metrics @@ -17,7 +16,6 @@ from .time_series import ( __all__ = [ # Choropleth "create_choropleth_figure", - "create_district_map", "create_zone_map", # Time series "create_price_time_series", diff --git a/portfolio_app/figures/choropleth.py b/portfolio_app/figures/choropleth.py index 8c7be7d..a8c8d55 100644 --- a/portfolio_app/figures/choropleth.py +++ b/portfolio_app/figures/choropleth.py @@ -115,34 +115,6 @@ def create_choropleth_figure( return fig -def create_district_map( 
- districts_geojson: dict[str, Any] | None, - purchase_data: list[dict[str, Any]], - metric: str = "avg_price", -) -> go.Figure: - """Create choropleth map for TRREB districts. - - Args: - districts_geojson: GeoJSON for TRREB district boundaries. - purchase_data: Purchase statistics by district. - metric: Metric to display (avg_price, sales_count, etc.). - - Returns: - Plotly Figure object. - """ - hover_columns = ["district_name", "sales_count", "avg_price", "median_price"] - - return create_choropleth_figure( - geojson=districts_geojson, - data=purchase_data, - location_key="district_code", - color_column=metric, - hover_data=[c for c in hover_columns if c != metric], - color_scale="Blues" if "price" in metric else "Greens", - title="Toronto Purchase Market by District", - ) - - def create_zone_map( zones_geojson: dict[str, Any] | None, rental_data: list[dict[str, Any]], diff --git a/portfolio_app/pages/toronto/callbacks/__init__.py b/portfolio_app/pages/toronto/callbacks/__init__.py index fdfbc8f..20e7941 100644 --- a/portfolio_app/pages/toronto/callbacks/__init__.py +++ b/portfolio_app/pages/toronto/callbacks/__init__.py @@ -18,8 +18,7 @@ _CMHC_ZONES_PATH = Path("data/toronto/raw/geo/cmhc_zones.geojson") _cmhc_parser = CMHCZoneParser(_CMHC_ZONES_PATH) if _CMHC_ZONES_PATH.exists() else None CMHC_ZONES_GEOJSON = _cmhc_parser.get_geojson_for_choropleth() if _cmhc_parser else None -# Load Toronto neighbourhoods GeoJSON for purchase choropleth maps -# Note: This is a temporary proxy until TRREB district boundaries are digitized +# Load Toronto neighbourhoods GeoJSON for choropleth maps _NEIGHBOURHOODS_PATH = Path("data/toronto/raw/geo/toronto_neighbourhoods.geojson") _neighbourhood_parser = ( NeighbourhoodParser(_NEIGHBOURHOODS_PATH) if _NEIGHBOURHOODS_PATH.exists() else None @@ -30,9 +29,7 @@ NEIGHBOURHOODS_GEOJSON = ( else None ) -# Sample purchase data for all 158 City of Toronto neighbourhoods -# Note: This is SAMPLE DATA until TRREB district boundaries are 
digitized (Issue #25) -# Once TRREB boundaries are available, this will be replaced with real TRREB data by district +# Sample data for all 158 City of Toronto neighbourhoods SAMPLE_PURCHASE_DATA = [ { "neighbourhood_id": 1, @@ -1486,11 +1483,7 @@ SAMPLE_TIME_SERIES_DATA = [ Input("toronto-year-selector", "value"), ) def update_purchase_choropleth(metric: str, year: str) -> go.Figure: - """Update the purchase market choropleth map. - - Note: Currently using City of Toronto neighbourhoods as a proxy. - Will switch to TRREB districts when boundaries are digitized. - """ + """Update the neighbourhood choropleth map.""" return create_choropleth_figure( geojson=NEIGHBOURHOODS_GEOJSON, data=SAMPLE_PURCHASE_DATA, diff --git a/portfolio_app/pages/toronto/dashboard.py b/portfolio_app/pages/toronto/dashboard.py index 204db0f..85d1e45 100644 --- a/portfolio_app/pages/toronto/dashboard.py +++ b/portfolio_app/pages/toronto/dashboard.py @@ -257,9 +257,8 @@ def create_data_notice() -> dmc.Alert: return dmc.Alert( children=[ dmc.Text( - "This dashboard uses TRREB and CMHC data. " - "Geographic boundaries require QGIS digitization to enable choropleth maps. " - "Sample data is shown below.", + "This dashboard displays Toronto neighbourhood and CMHC rental data. 
" + "Sample data is shown for demonstration purposes.", size="sm", ), ], diff --git a/portfolio_app/pages/toronto/methodology.py b/portfolio_app/pages/toronto/methodology.py index e77e869..9264619 100644 --- a/portfolio_app/pages/toronto/methodology.py +++ b/portfolio_app/pages/toronto/methodology.py @@ -46,42 +46,8 @@ def layout() -> dmc.Container: mb="lg", children=[ dmc.Title("Data Sources", order=2, mb="md"), - # TRREB - dmc.Title("Purchase Data: TRREB", order=3, size="h4", mb="sm"), - dmc.Text( - [ - "The Toronto Regional Real Estate Board (TRREB) publishes monthly ", - html.Strong("Market Watch"), - " reports containing aggregate statistics for residential real estate " - "transactions across the Greater Toronto Area.", - ], - mb="sm", - ), - dmc.List( - [ - dmc.ListItem("Source: TRREB Market Watch Reports (PDF)"), - dmc.ListItem("Geographic granularity: ~35 TRREB Districts"), - dmc.ListItem("Temporal granularity: Monthly"), - dmc.ListItem("Coverage: 2021-present"), - dmc.ListItem( - [ - "Metrics: Sales count, average/median price, new listings, ", - "active listings, days on market, sale-to-list ratio", - ] - ), - ], - mb="md", - ), - dmc.Anchor( - "TRREB Market Watch Archive", - href="https://trreb.ca/market-data/market-watch/market-watch-archive/", - target="_blank", - mb="lg", - ), # CMHC - dmc.Title( - "Rental Data: CMHC", order=3, size="h4", mb="sm", mt="md" - ), + dmc.Title("Rental Data: CMHC", order=3, size="h4", mb="sm"), dmc.Text( [ "Canada Mortgage and Housing Corporation (CMHC) conducts the annual ", @@ -124,28 +90,17 @@ def layout() -> dmc.Container: mb="lg", children=[ dmc.Title("Geographic Considerations", order=2, mb="md"), - dmc.Alert( - title="Important: Non-Aligned Geographies", - color="yellow", - mb="md", - children=[ - "TRREB Districts and CMHC Zones do ", - html.Strong("not"), - " align geographically. 
They are displayed as separate layers and " - "should not be directly compared at the sub-regional level.", - ], - ), dmc.Text( - "The dashboard presents three geographic layers:", + "The dashboard presents two geographic layers:", mb="sm", ), dmc.List( [ dmc.ListItem( [ - html.Strong("TRREB Districts (~35): "), - "Used for purchase/sales data visualization. " - "Districts are defined by TRREB and labeled with codes like W01, C01, E01.", + html.Strong("City Neighbourhoods (158): "), + "Official City of Toronto neighbourhood boundaries, " + "used for neighbourhood-level analysis.", ] ), dmc.ListItem( @@ -155,13 +110,6 @@ def layout() -> dmc.Container: "Zones are aligned with Census Tract boundaries.", ] ), - dmc.ListItem( - [ - html.Strong("City Neighbourhoods (158): "), - "Reference overlay only. " - "These are official City of Toronto neighbourhood boundaries.", - ] - ), ], ), ], @@ -212,22 +160,15 @@ def layout() -> dmc.Container: dmc.ListItem( [ html.Strong("Reporting Lag: "), - "TRREB data reflects closed transactions, which may lag market " - "conditions by 1-3 months. CMHC data is annual.", - ] - ), - dmc.ListItem( - [ - html.Strong("Geographic Boundaries: "), - "TRREB district boundaries were manually digitized from reference maps " - "and may contain minor inaccuracies.", + "CMHC rental data is annual (October survey). 
" + "Other data sources may have different update frequencies.", ] ), dmc.ListItem( [ html.Strong("Data Suppression: "), - "Some cells may be suppressed for confidentiality when transaction " - "counts are below thresholds.", + "Some cells may be suppressed for confidentiality when counts " + "are below thresholds.", ] ), ], diff --git a/portfolio_app/toronto/demo_data.py b/portfolio_app/toronto/demo_data.py index 3f76524..27b56ba 100644 --- a/portfolio_app/toronto/demo_data.py +++ b/portfolio_app/toronto/demo_data.py @@ -8,98 +8,6 @@ from datetime import date from typing import Any -def get_demo_districts() -> list[dict[str, Any]]: - """Return sample TRREB district data.""" - return [ - {"district_code": "W01", "district_name": "Long Branch", "area_type": "West"}, - {"district_code": "W02", "district_name": "Mimico", "area_type": "West"}, - { - "district_code": "W03", - "district_name": "Kingsway South", - "area_type": "West", - }, - {"district_code": "W04", "district_name": "Edenbridge", "area_type": "West"}, - {"district_code": "W05", "district_name": "Islington", "area_type": "West"}, - {"district_code": "W06", "district_name": "Rexdale", "area_type": "West"}, - {"district_code": "W07", "district_name": "Willowdale", "area_type": "West"}, - {"district_code": "W08", "district_name": "York", "area_type": "West"}, - { - "district_code": "C01", - "district_name": "Downtown Core", - "area_type": "Central", - }, - {"district_code": "C02", "district_name": "Annex", "area_type": "Central"}, - { - "district_code": "C03", - "district_name": "Forest Hill", - "area_type": "Central", - }, - { - "district_code": "C04", - "district_name": "Lawrence Park", - "area_type": "Central", - }, - { - "district_code": "C06", - "district_name": "Willowdale East", - "area_type": "Central", - }, - {"district_code": "C07", "district_name": "Thornhill", "area_type": "Central"}, - {"district_code": "C08", "district_name": "Waterfront", "area_type": "Central"}, - {"district_code": "E01", 
"district_name": "Leslieville", "area_type": "East"}, - {"district_code": "E02", "district_name": "The Beaches", "area_type": "East"}, - {"district_code": "E03", "district_name": "Danforth", "area_type": "East"}, - {"district_code": "E04", "district_name": "Birch Cliff", "area_type": "East"}, - {"district_code": "E05", "district_name": "Scarborough", "area_type": "East"}, - ] - - -def get_demo_purchase_data() -> list[dict[str, Any]]: - """Return sample purchase data for time series visualization.""" - import random - - random.seed(42) - data = [] - - base_prices = { - "W01": 850000, - "C01": 1200000, - "E01": 950000, - } - - for year in [2024, 2025]: - for month in range(1, 13): - if year == 2025 and month > 12: - break - - for district, base_price in base_prices.items(): - # Add some randomness and trend - trend = (year - 2024) * 12 + month - price_variation = random.uniform(-0.05, 0.05) - trend_factor = 1 + (trend * 0.002) # Slight upward trend - - avg_price = int(base_price * trend_factor * (1 + price_variation)) - sales = random.randint(50, 200) - - data.append( - { - "district_code": district, - "full_date": date(year, month, 1), - "year": year, - "month": month, - "avg_price": avg_price, - "median_price": int(avg_price * 0.95), - "sales_count": sales, - "new_listings": int(sales * random.uniform(1.2, 1.8)), - "active_listings": int(sales * random.uniform(2.0, 3.5)), - "days_on_market": random.randint(15, 45), - "sale_to_list_ratio": round(random.uniform(0.95, 1.05), 2), - } - ) - - return data - - def get_demo_rental_data() -> list[dict[str, Any]]: """Return sample rental data for visualization.""" data = [] @@ -219,23 +127,6 @@ def get_demo_policy_events() -> list[dict[str, Any]]: def get_demo_summary_metrics() -> dict[str, dict[str, Any]]: """Return summary metrics for KPI cards.""" return { - "avg_price": { - "value": 1067968, - "title": "Avg. 
Price (2025)", - "delta": -4.7, - "delta_suffix": "%", - "prefix": "$", - "format_spec": ",.0f", - "positive_is_good": True, - }, - "total_sales": { - "value": 67610, - "title": "Total Sales (2024)", - "delta": 2.6, - "delta_suffix": "%", - "format_spec": ",.0f", - "positive_is_good": True, - }, "avg_rent": { "value": 2450, "title": "Avg. Rent (2025)", diff --git a/portfolio_app/toronto/loaders/__init__.py b/portfolio_app/toronto/loaders/__init__.py index 1b47b25..3574070 100644 --- a/portfolio_app/toronto/loaders/__init__.py +++ b/portfolio_app/toronto/loaders/__init__.py @@ -8,9 +8,7 @@ from .dimensions import ( load_neighbourhoods, load_policy_events, load_time_dimension, - load_trreb_districts, ) -from .trreb import load_trreb_purchases, load_trreb_record __all__ = [ # Base utilities @@ -20,13 +18,10 @@ __all__ = [ # Dimension loaders "generate_date_key", "load_time_dimension", - "load_trreb_districts", "load_cmhc_zones", "load_neighbourhoods", "load_policy_events", # Fact loaders - "load_trreb_purchases", - "load_trreb_record", "load_cmhc_rentals", "load_cmhc_record", ] diff --git a/portfolio_app/toronto/loaders/dimensions.py b/portfolio_app/toronto/loaders/dimensions.py index c69424f..092fd05 100644 --- a/portfolio_app/toronto/loaders/dimensions.py +++ b/portfolio_app/toronto/loaders/dimensions.py @@ -9,13 +9,11 @@ from portfolio_app.toronto.models import ( DimNeighbourhood, DimPolicyEvent, DimTime, - DimTRREBDistrict, ) from portfolio_app.toronto.schemas import ( CMHCZone, Neighbourhood, PolicyEvent, - TRREBDistrict, ) from .base import get_session, upsert_by_key @@ -97,42 +95,6 @@ def load_time_dimension( return _load(sess) -def load_trreb_districts( - districts: list[TRREBDistrict], - session: Session | None = None, -) -> int: - """Load TRREB district dimension. - - Args: - districts: List of validated district schemas. - session: Optional existing session. - - Returns: - Number of records loaded. 
- """ - - def _load(sess: Session) -> int: - records = [] - for d in districts: - dim = DimTRREBDistrict( - district_code=d.district_code, - district_name=d.district_name, - area_type=d.area_type.value, - geometry=d.geometry_wkt, - ) - records.append(dim) - - inserted, updated = upsert_by_key( - sess, DimTRREBDistrict, records, ["district_code"] - ) - return inserted + updated - - if session: - return _load(session) - with get_session() as sess: - return _load(sess) - - def load_cmhc_zones( zones: list[CMHCZone], session: Session | None = None, diff --git a/portfolio_app/toronto/loaders/trreb.py b/portfolio_app/toronto/loaders/trreb.py deleted file mode 100644 index 06e4c8b..0000000 --- a/portfolio_app/toronto/loaders/trreb.py +++ /dev/null @@ -1,129 +0,0 @@ -"""Loader for TRREB purchase data into fact_purchases.""" - -from sqlalchemy.orm import Session - -from portfolio_app.toronto.models import DimTime, DimTRREBDistrict, FactPurchases -from portfolio_app.toronto.schemas import TRREBMonthlyRecord, TRREBMonthlyReport - -from .base import get_session, upsert_by_key -from .dimensions import generate_date_key - - -def load_trreb_purchases( - report: TRREBMonthlyReport, - session: Session | None = None, -) -> int: - """Load TRREB monthly report data into fact_purchases. - - Args: - report: Validated TRREB monthly report containing records. - session: Optional existing session. - - Returns: - Number of records loaded. - """ - - def _load(sess: Session) -> int: - # Get district key mapping - districts = sess.query(DimTRREBDistrict).all() - district_map = {d.district_code: d.district_key for d in districts} - - # Build date key from report date - date_key = generate_date_key(report.report_date) - - # Verify time dimension exists - time_dim = sess.query(DimTime).filter_by(date_key=date_key).first() - if not time_dim: - raise ValueError( - f"Time dimension not found for date_key {date_key}. " - "Load time dimension first." 
- ) - - records = [] - for record in report.records: - district_key = district_map.get(record.area_code) - if not district_key: - # Skip records for unknown districts (e.g., aggregate rows) - continue - - fact = FactPurchases( - date_key=date_key, - district_key=district_key, - sales_count=record.sales, - dollar_volume=record.dollar_volume, - avg_price=record.avg_price, - median_price=record.median_price, - new_listings=record.new_listings, - active_listings=record.active_listings, - avg_dom=record.avg_dom, - avg_sp_lp=record.avg_sp_lp, - ) - records.append(fact) - - inserted, updated = upsert_by_key( - sess, FactPurchases, records, ["date_key", "district_key"] - ) - return inserted + updated - - if session: - return _load(session) - with get_session() as sess: - return _load(sess) - - -def load_trreb_record( - record: TRREBMonthlyRecord, - session: Session | None = None, -) -> int: - """Load a single TRREB record into fact_purchases. - - Args: - record: Single validated TRREB monthly record. - session: Optional existing session. - - Returns: - Number of records loaded (0 or 1). - """ - - def _load(sess: Session) -> int: - # Get district key - district = ( - sess.query(DimTRREBDistrict) - .filter_by(district_code=record.area_code) - .first() - ) - if not district: - return 0 - - date_key = generate_date_key(record.report_date) - - # Verify time dimension exists - time_dim = sess.query(DimTime).filter_by(date_key=date_key).first() - if not time_dim: - raise ValueError( - f"Time dimension not found for date_key {date_key}. " - "Load time dimension first." 
- ) - - fact = FactPurchases( - date_key=date_key, - district_key=district.district_key, - sales_count=record.sales, - dollar_volume=record.dollar_volume, - avg_price=record.avg_price, - median_price=record.median_price, - new_listings=record.new_listings, - active_listings=record.active_listings, - avg_dom=record.avg_dom, - avg_sp_lp=record.avg_sp_lp, - ) - - inserted, updated = upsert_by_key( - sess, FactPurchases, [fact], ["date_key", "district_key"] - ) - return inserted + updated - - if session: - return _load(session) - with get_session() as sess: - return _load(sess) diff --git a/portfolio_app/toronto/models/__init__.py b/portfolio_app/toronto/models/__init__.py index e09a8a7..44c5ceb 100644 --- a/portfolio_app/toronto/models/__init__.py +++ b/portfolio_app/toronto/models/__init__.py @@ -6,9 +6,8 @@ from .dimensions import ( DimNeighbourhood, DimPolicyEvent, DimTime, - DimTRREBDistrict, ) -from .facts import FactPurchases, FactRentals +from .facts import FactRentals __all__ = [ # Base @@ -18,11 +17,9 @@ __all__ = [ "create_tables", # Dimensions "DimTime", - "DimTRREBDistrict", "DimCMHCZone", "DimNeighbourhood", "DimPolicyEvent", # Facts - "FactPurchases", "FactRentals", ] diff --git a/portfolio_app/toronto/models/dimensions.py b/portfolio_app/toronto/models/dimensions.py index a8f8bef..7e86265 100644 --- a/portfolio_app/toronto/models/dimensions.py +++ b/portfolio_app/toronto/models/dimensions.py @@ -23,20 +23,6 @@ class DimTime(Base): is_month_start: Mapped[bool] = mapped_column(Boolean, default=True) -class DimTRREBDistrict(Base): - """TRREB district dimension table with PostGIS geometry.""" - - __tablename__ = "dim_trreb_district" - - district_key: Mapped[int] = mapped_column( - Integer, primary_key=True, autoincrement=True - ) - district_code: Mapped[str] = mapped_column(String(3), nullable=False, unique=True) - district_name: Mapped[str] = mapped_column(String(100), nullable=False) - area_type: Mapped[str] = mapped_column(String(10), nullable=False) - 
geometry = mapped_column(Geometry("POLYGON", srid=4326), nullable=True) - - class DimCMHCZone(Base): """CMHC zone dimension table with PostGIS geometry.""" diff --git a/portfolio_app/toronto/models/facts.py b/portfolio_app/toronto/models/facts.py index 3a072a8..38e660e 100644 --- a/portfolio_app/toronto/models/facts.py +++ b/portfolio_app/toronto/models/facts.py @@ -6,37 +6,6 @@ from sqlalchemy.orm import Mapped, mapped_column, relationship from .base import Base -class FactPurchases(Base): - """Fact table for TRREB purchase/sales data. - - Grain: One row per district per month. - """ - - __tablename__ = "fact_purchases" - - id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) - date_key: Mapped[int] = mapped_column( - Integer, ForeignKey("dim_time.date_key"), nullable=False - ) - district_key: Mapped[int] = mapped_column( - Integer, ForeignKey("dim_trreb_district.district_key"), nullable=False - ) - sales_count: Mapped[int] = mapped_column(Integer, nullable=False) - dollar_volume: Mapped[float] = mapped_column(Numeric(15, 2), nullable=False) - avg_price: Mapped[float] = mapped_column(Numeric(12, 2), nullable=False) - median_price: Mapped[float] = mapped_column(Numeric(12, 2), nullable=False) - new_listings: Mapped[int] = mapped_column(Integer, nullable=False) - active_listings: Mapped[int] = mapped_column(Integer, nullable=False) - avg_dom: Mapped[int] = mapped_column(Integer, nullable=False) # Days on market - avg_sp_lp: Mapped[float] = mapped_column( - Numeric(5, 2), nullable=False - ) # Sale/List ratio - - # Relationships - time = relationship("DimTime", backref="purchases") - district = relationship("DimTRREBDistrict", backref="purchases") - - class FactRentals(Base): """Fact table for CMHC rental market data. 
diff --git a/portfolio_app/toronto/parsers/__init__.py b/portfolio_app/toronto/parsers/__init__.py index 2d33037..02ea39d 100644 --- a/portfolio_app/toronto/parsers/__init__.py +++ b/portfolio_app/toronto/parsers/__init__.py @@ -4,17 +4,13 @@ from .cmhc import CMHCParser from .geo import ( CMHCZoneParser, NeighbourhoodParser, - TRREBDistrictParser, load_geojson, ) -from .trreb import TRREBParser __all__ = [ - "TRREBParser", "CMHCParser", # GeoJSON parsers "CMHCZoneParser", - "TRREBDistrictParser", "NeighbourhoodParser", "load_geojson", ] diff --git a/portfolio_app/toronto/parsers/geo.py b/portfolio_app/toronto/parsers/geo.py index 6640914..0e0f979 100644 --- a/portfolio_app/toronto/parsers/geo.py +++ b/portfolio_app/toronto/parsers/geo.py @@ -13,8 +13,7 @@ from pyproj import Transformer from shapely.geometry import mapping, shape from shapely.ops import transform -from portfolio_app.toronto.schemas import CMHCZone, Neighbourhood, TRREBDistrict -from portfolio_app.toronto.schemas.dimensions import AreaType +from portfolio_app.toronto.schemas import CMHCZone, Neighbourhood # Transformer for reprojecting from Web Mercator to WGS84 _TRANSFORMER_3857_TO_4326 = Transformer.from_crs( @@ -221,135 +220,6 @@ class CMHCZoneParser: return {"type": "FeatureCollection", "features": features} -class TRREBDistrictParser: - """Parser for TRREB district boundary GeoJSON files. - - TRREB district boundaries are manually digitized from the TRREB PDF map - using QGIS. - - Expected GeoJSON properties: - - district_code: District code (W01, C01, E01, etc.) 
- - district_name: District name - - area_type: West, Central, East, or North - """ - - CODE_PROPERTIES = [ - "district_code", - "District_Code", - "DISTRICT_CODE", - "districtcode", - "code", - ] - NAME_PROPERTIES = [ - "district_name", - "District_Name", - "DISTRICT_NAME", - "districtname", - "name", - "NAME", - ] - AREA_PROPERTIES = [ - "area_type", - "Area_Type", - "AREA_TYPE", - "areatype", - "area", - "type", - ] - - def __init__(self, geojson_path: Path) -> None: - """Initialize parser with path to GeoJSON file.""" - self.geojson_path = geojson_path - self._geojson: dict[str, Any] | None = None - - @property - def geojson(self) -> dict[str, Any]: - """Lazy-load and return raw GeoJSON data.""" - if self._geojson is None: - self._geojson = load_geojson(self.geojson_path) - return self._geojson - - def _find_property( - self, properties: dict[str, Any], candidates: list[str] - ) -> str | None: - """Find a property value by checking multiple candidate names.""" - for name in candidates: - if name in properties and properties[name] is not None: - return str(properties[name]) - return None - - def _infer_area_type(self, district_code: str) -> AreaType: - """Infer area type from district code prefix.""" - prefix = district_code[0].upper() - mapping = {"W": AreaType.WEST, "C": AreaType.CENTRAL, "E": AreaType.EAST} - return mapping.get(prefix, AreaType.NORTH) - - def parse(self) -> list[TRREBDistrict]: - """Parse GeoJSON and return list of TRREBDistrict schemas.""" - districts = [] - for feature in self.geojson.get("features", []): - props = feature.get("properties", {}) - geom = feature.get("geometry") - - district_code = self._find_property(props, self.CODE_PROPERTIES) - district_name = self._find_property(props, self.NAME_PROPERTIES) - area_type_str = self._find_property(props, self.AREA_PROPERTIES) - - if not district_code: - raise ValueError( - f"District code not found in properties: {list(props.keys())}" - ) - if not district_name: - district_name = 
district_code - - # Infer or parse area type - if area_type_str: - try: - area_type = AreaType(area_type_str) - except ValueError: - area_type = self._infer_area_type(district_code) - else: - area_type = self._infer_area_type(district_code) - - geometry_wkt = geometry_to_wkt(geom) if geom else None - - districts.append( - TRREBDistrict( - district_code=district_code, - district_name=district_name, - area_type=area_type, - geometry_wkt=geometry_wkt, - ) - ) - - return districts - - def get_geojson_for_choropleth( - self, key_property: str = "district_code" - ) -> dict[str, Any]: - """Get GeoJSON formatted for Plotly choropleth maps.""" - features = [] - for feature in self.geojson.get("features", []): - props = feature.get("properties", {}) - new_props = dict(props) - - district_code = self._find_property(props, self.CODE_PROPERTIES) - district_name = self._find_property(props, self.NAME_PROPERTIES) - - new_props["district_code"] = district_code - new_props["district_name"] = district_name or district_code - - features.append( - { - "type": "Feature", - "properties": new_props, - "geometry": feature.get("geometry"), - } - ) - - return {"type": "FeatureCollection", "features": features} - - class NeighbourhoodParser: """Parser for City of Toronto neighbourhood boundary GeoJSON files. diff --git a/portfolio_app/toronto/parsers/trreb.py b/portfolio_app/toronto/parsers/trreb.py deleted file mode 100644 index fad5869..0000000 --- a/portfolio_app/toronto/parsers/trreb.py +++ /dev/null @@ -1,82 +0,0 @@ -"""TRREB PDF parser for monthly market watch reports. - -This module provides the structure for parsing TRREB (Toronto Regional Real Estate Board) -monthly Market Watch PDF reports into structured data. -""" - -from pathlib import Path -from typing import Any - -from portfolio_app.toronto.schemas import TRREBMonthlyRecord, TRREBMonthlyReport - - -class TRREBParser: - """Parser for TRREB Market Watch PDF reports. 
- - TRREB publishes monthly Market Watch reports as PDFs containing: - - Summary statistics by area (416, 905, Total) - - District-level breakdowns - - Year-over-year comparisons - - The parser extracts tabular data from these PDFs and validates - against the TRREBMonthlyRecord schema. - """ - - def __init__(self, pdf_path: Path) -> None: - """Initialize parser with path to PDF file. - - Args: - pdf_path: Path to the TRREB Market Watch PDF file. - """ - self.pdf_path = pdf_path - self._validate_path() - - def _validate_path(self) -> None: - """Validate that the PDF path exists and is readable.""" - if not self.pdf_path.exists(): - raise FileNotFoundError(f"PDF not found: {self.pdf_path}") - if not self.pdf_path.suffix.lower() == ".pdf": - raise ValueError(f"Expected PDF file, got: {self.pdf_path.suffix}") - - def parse(self) -> TRREBMonthlyReport: - """Parse the PDF and return structured data. - - Returns: - TRREBMonthlyReport containing all extracted records. - - Raises: - NotImplementedError: PDF parsing not yet implemented. - """ - raise NotImplementedError( - "PDF parsing requires pdfplumber/tabula-py. " - "Implementation pending Sprint 4 data ingestion." - ) - - def _extract_tables(self) -> list[dict[str, Any]]: - """Extract raw tables from PDF pages. - - Returns: - List of dictionaries representing table data. - """ - raise NotImplementedError("Table extraction not yet implemented.") - - def _parse_district_table( - self, table_data: list[dict[str, Any]] - ) -> list[TRREBMonthlyRecord]: - """Parse district-level statistics table. - - Args: - table_data: Raw table data extracted from PDF. - - Returns: - List of validated TRREBMonthlyRecord objects. - """ - raise NotImplementedError("District table parsing not yet implemented.") - - def _infer_report_date(self) -> tuple[int, int]: - """Infer report year and month from PDF filename or content. - - Returns: - Tuple of (year, month). 
- """ - raise NotImplementedError("Date inference not yet implemented.") diff --git a/portfolio_app/toronto/schemas/__init__.py b/portfolio_app/toronto/schemas/__init__.py index 1d33f3e..0a470a8 100644 --- a/portfolio_app/toronto/schemas/__init__.py +++ b/portfolio_app/toronto/schemas/__init__.py @@ -2,7 +2,6 @@ from .cmhc import BedroomType, CMHCAnnualSurvey, CMHCRentalRecord, ReliabilityCode from .dimensions import ( - AreaType, CMHCZone, Confidence, ExpectedDirection, @@ -11,14 +10,9 @@ from .dimensions import ( PolicyEvent, PolicyLevel, TimeDimension, - TRREBDistrict, ) -from .trreb import TRREBMonthlyRecord, TRREBMonthlyReport __all__ = [ - # TRREB - "TRREBMonthlyRecord", - "TRREBMonthlyReport", # CMHC "CMHCRentalRecord", "CMHCAnnualSurvey", @@ -26,12 +20,10 @@ __all__ = [ "ReliabilityCode", # Dimensions "TimeDimension", - "TRREBDistrict", "CMHCZone", "Neighbourhood", "PolicyEvent", # Enums - "AreaType", "PolicyLevel", "PolicyCategory", "ExpectedDirection", diff --git a/portfolio_app/toronto/schemas/dimensions.py b/portfolio_app/toronto/schemas/dimensions.py index 66fd509..1eb71b8 100644 --- a/portfolio_app/toronto/schemas/dimensions.py +++ b/portfolio_app/toronto/schemas/dimensions.py @@ -41,15 +41,6 @@ class Confidence(str, Enum): LOW = "low" -class AreaType(str, Enum): - """TRREB area type.""" - - WEST = "West" - CENTRAL = "Central" - EAST = "East" - NORTH = "North" - - class TimeDimension(BaseModel): """Schema for time dimension record.""" @@ -62,15 +53,6 @@ class TimeDimension(BaseModel): is_month_start: bool = True -class TRREBDistrict(BaseModel): - """Schema for TRREB district dimension.""" - - district_code: str = Field(max_length=3, description="W01, C01, E01, etc.") - district_name: str = Field(max_length=100) - area_type: AreaType - geometry_wkt: str | None = Field(default=None, description="WKT geometry string") - - class CMHCZone(BaseModel): """Schema for CMHC zone dimension.""" diff --git a/portfolio_app/toronto/schemas/trreb.py 
b/portfolio_app/toronto/schemas/trreb.py deleted file mode 100644 index e972ff6..0000000 --- a/portfolio_app/toronto/schemas/trreb.py +++ /dev/null @@ -1,52 +0,0 @@ -"""Pydantic schemas for TRREB monthly market data.""" - -from datetime import date -from decimal import Decimal - -from pydantic import BaseModel, Field - - -class TRREBMonthlyRecord(BaseModel): - """Schema for a single TRREB monthly summary record. - - Represents aggregated sales data for one district in one month. - """ - - report_date: date = Field(description="First of month (YYYY-MM-01)") - area_code: str = Field( - max_length=3, description="District code (W01, C01, E01, etc.)" - ) - area_name: str = Field(max_length=100, description="District name") - area_type: str = Field(max_length=10, description="West / Central / East / North") - sales: int = Field(ge=0, description="Number of transactions") - dollar_volume: Decimal = Field(ge=0, description="Total sales volume ($)") - avg_price: Decimal = Field(ge=0, description="Average sale price ($)") - median_price: Decimal = Field(ge=0, description="Median sale price ($)") - new_listings: int = Field(ge=0, description="New listings count") - active_listings: int = Field(ge=0, description="Active listings at month end") - avg_sp_lp: Decimal = Field( - ge=0, le=200, description="Avg sale price / list price ratio (%)" - ) - avg_dom: int = Field(ge=0, description="Average days on market") - - model_config = {"str_strip_whitespace": True} - - -class TRREBMonthlyReport(BaseModel): - """Schema for a complete TRREB monthly report. - - Contains all district records for a single month. - """ - - report_date: date - records: list[TRREBMonthlyRecord] - - @property - def total_sales(self) -> int: - """Total sales across all districts.""" - return sum(r.sales for r in self.records) - - @property - def district_count(self) -> int: - """Number of districts in report.""" - return len(self.records)