feat: add Pydantic schemas, SQLAlchemy models, and parser structure

Sprint 3 implementation:
- Pydantic schemas for TRREB, CMHC, and dimension data validation
- SQLAlchemy models with PostGIS geometry for fact and dimension tables
- Parser structure (stubs) for TRREB PDF and CMHC CSV processing

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-11 14:58:31 -05:00
parent 549e1fcbaf
commit ead6d91a28
11 changed files with 760 additions and 1 deletions

View File

@@ -1 +1,39 @@
"""Pydantic schemas for Toronto housing data validation."""
from .cmhc import BedroomType, CMHCAnnualSurvey, CMHCRentalRecord, ReliabilityCode
from .dimensions import (
AreaType,
CMHCZone,
Confidence,
ExpectedDirection,
Neighbourhood,
PolicyCategory,
PolicyEvent,
PolicyLevel,
TimeDimension,
TRREBDistrict,
)
from .trreb import TRREBMonthlyRecord, TRREBMonthlyReport
# Names re-exported as the public API of this package, grouped by the
# submodule they are imported from at the top of this file.
__all__ = [
    # TRREB monthly market schemas (from .trreb)
    "TRREBMonthlyRecord",
    "TRREBMonthlyReport",
    # CMHC rental survey schemas and their enums (from .cmhc)
    "CMHCRentalRecord",
    "CMHCAnnualSurvey",
    "BedroomType",
    "ReliabilityCode",
    # Dimension-table schemas (from .dimensions)
    "TimeDimension",
    "TRREBDistrict",
    "CMHCZone",
    "Neighbourhood",
    "PolicyEvent",
    # Enums used by the dimension schemas (from .dimensions)
    "AreaType",
    "PolicyLevel",
    "PolicyCategory",
    "ExpectedDirection",
    "Confidence",
]

View File

@@ -0,0 +1,81 @@
"""Pydantic schemas for CMHC rental market data."""
from decimal import Decimal
from enum import Enum
from pydantic import BaseModel, Field
class BedroomType(str, Enum):
    """Bedroom-type categories used in CMHC rental data.

    Members are ``str`` subclasses, so each one compares equal to its
    label (for example ``BedroomType.ONE_BED == "1 Bedroom"``).
    """

    BACHELOR = "Bachelor"
    ONE_BED = "1 Bedroom"
    TWO_BED = "2 Bedroom"
    THREE_BED_PLUS = "3 Bedroom+"
    TOTAL = "Total"
class ReliabilityCode(str, Enum):
    """Reliability codes attached to CMHC survey figures.

    The codes grade an estimate by its coefficient of variation (CV).

    NOTE(review): the CV bands noted beside each member are the ones
    recorded when this schema was written; confirm them against CMHC's
    published methodology before relying on the exact thresholds.
    """

    EXCELLENT = "a"  # CV <= 2.5%
    GOOD = "b"  # 2.5% < CV <= 5%
    FAIR = "c"  # 5% < CV <= 10%
    POOR = "d"  # CV > 10%
    SUPPRESSED = "**"  # Sample too small
class CMHCRentalRecord(BaseModel):
    """One row of CMHC rental survey data.

    A record carries the rental figures for a single zone and bedroom
    type in a single survey year. The four identifying fields are
    required; every measure is optional and defaults to ``None``.
    """

    # String inputs have surrounding whitespace stripped during validation.
    model_config = {"str_strip_whitespace": True}

    # Required identifying fields.
    survey_year: int = Field(description="Survey year (October snapshot)", ge=1990)
    zone_code: str = Field(description="CMHC zone identifier", max_length=10)
    zone_name: str = Field(description="Zone name", max_length=100)
    bedroom_type: BedroomType = Field(description="Bedroom category")

    # Optional measures; percentages are bounded to the 0-100 range.
    universe: int | None = Field(
        description="Total rental units in zone",
        default=None,
        ge=0,
    )
    vacancy_rate: Decimal | None = Field(
        description="Vacancy rate (%)",
        default=None,
        ge=0,
        le=100,
    )
    vacancy_rate_reliability: ReliabilityCode | None = None
    availability_rate: Decimal | None = Field(
        description="Availability rate (%)",
        default=None,
        ge=0,
        le=100,
    )
    average_rent: Decimal | None = Field(
        description="Average monthly rent ($)",
        default=None,
        ge=0,
    )
    average_rent_reliability: ReliabilityCode | None = None
    median_rent: Decimal | None = Field(
        description="Median monthly rent ($)",
        default=None,
        ge=0,
    )
    # No bounds here: a year-over-year change may be negative.
    rent_change_pct: Decimal | None = Field(
        description="YoY rent change (%)",
        default=None,
    )
    turnover_rate: Decimal | None = Field(
        description="Unit turnover rate (%)",
        default=None,
        ge=0,
        le=100,
    )
class CMHCAnnualSurvey(BaseModel):
    """A full CMHC annual survey for Toronto.

    Bundles the zone / bedroom-type records that belong to one survey
    year.
    """

    survey_year: int
    records: list[CMHCRentalRecord]

    @property
    def zone_count(self) -> int:
        """Return how many distinct zone codes appear in ``records``."""
        distinct_zone_codes = {record.zone_code for record in self.records}
        return len(distinct_zone_codes)

View File

@@ -0,0 +1,121 @@
"""Pydantic schemas for dimension tables."""
from datetime import date
from decimal import Decimal
from enum import Enum
from pydantic import BaseModel, Field, HttpUrl
class PolicyLevel(str, Enum):
    """Level of government a policy event is attributed to."""

    FEDERAL = "federal"
    PROVINCIAL = "provincial"
    MUNICIPAL = "municipal"
class PolicyCategory(str, Enum):
    """Category a policy event is filed under."""

    MONETARY = "monetary"
    TAX = "tax"
    REGULATORY = "regulatory"
    SUPPLY = "supply"
    ECONOMIC = "economic"
class ExpectedDirection(str, Enum):
    """Direction in which a policy event is expected to move prices."""

    BULLISH = "bullish"  # Prices expected to rise
    BEARISH = "bearish"  # Prices expected to fall
    NEUTRAL = "neutral"  # Impact uncertain or mixed
class Confidence(str, Enum):
    """How much confidence is placed in a policy event's data."""

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
class AreaType(str, Enum):
    """Area groupings used to classify TRREB districts."""

    WEST = "West"
    CENTRAL = "Central"
    EAST = "East"
    NORTH = "North"
class TimeDimension(BaseModel):
    """A single row of the time dimension.

    ``year``, ``month`` and ``quarter`` are range-checked on their own;
    nothing in this schema cross-checks them against ``full_date`` or
    ``date_key``.
    """

    date_key: int = Field(description="Date key in YYYYMMDD format")
    full_date: date
    year: int = Field(le=2100, ge=2000)
    month: int = Field(le=12, ge=1)
    quarter: int = Field(le=4, ge=1)
    month_name: str = Field(max_length=20)
    # Defaults to True when the caller does not supply a value.
    is_month_start: bool = True
class TRREBDistrict(BaseModel):
    """A single row of the TRREB district dimension."""

    district_code: str = Field(description="W01, C01, E01, etc.", max_length=3)
    district_name: str = Field(max_length=100)
    area_type: AreaType
    # Geometry is optional and is carried as WKT text.
    geometry_wkt: str | None = Field(
        description="WKT geometry string",
        default=None,
    )
class CMHCZone(BaseModel):
    """A single row of the CMHC zone dimension."""

    zone_code: str = Field(max_length=10)
    zone_name: str = Field(max_length=100)
    # Geometry is optional and is carried as WKT text.
    geometry_wkt: str | None = Field(
        description="WKT geometry string",
        default=None,
    )
class Neighbourhood(BaseModel):
    """A single row of the City of Toronto neighbourhood dimension.

    Note: in V1 this dimension has no foreign key to the fact tables; it
    serves only as a reference overlay.
    """

    # Identity and geometry.
    neighbourhood_id: int = Field(le=200, ge=1)
    name: str = Field(max_length=100)
    geometry_wkt: str | None = None

    # Optional profile measures; all are non-negative when present.
    population: int | None = Field(ge=0, default=None)
    land_area_sqkm: Decimal | None = Field(ge=0, default=None)
    pop_density_per_sqkm: Decimal | None = Field(ge=0, default=None)
    pct_bachelors_or_higher: Decimal | None = Field(ge=0, le=100, default=None)
    median_household_income: Decimal | None = Field(ge=0, default=None)
    pct_owner_occupied: Decimal | None = Field(ge=0, le=100, default=None)
    pct_renter_occupied: Decimal | None = Field(ge=0, le=100, default=None)

    census_year: int = Field(
        description="Census year for SCD tracking",
        default=2021,
    )
class PolicyEvent(BaseModel):
    """A single row of the policy event dimension.

    Events annotate time series; they are not a claim of causation.
    """

    # String inputs have surrounding whitespace stripped during validation.
    model_config = {"str_strip_whitespace": True}

    event_date: date = Field(description="Date event was announced/occurred")
    effective_date: date | None = Field(
        description="Date policy took effect",
        default=None,
    )
    level: PolicyLevel
    category: PolicyCategory
    title: str = Field(description="Short event title for display", max_length=200)
    description: str | None = Field(
        description="Longer description for tooltip",
        default=None,
    )
    expected_direction: ExpectedDirection
    source_url: HttpUrl | None = None
    # Falls back to MEDIUM when the caller gives no confidence.
    confidence: Confidence = Confidence.MEDIUM

View File

@@ -0,0 +1,52 @@
"""Pydantic schemas for TRREB monthly market data."""
from datetime import date
from decimal import Decimal
from pydantic import BaseModel, Field
class TRREBMonthlyRecord(BaseModel):
    """One row of a TRREB monthly summary.

    Holds the aggregated sales figures for a single district in a single
    month. Every field is required.
    """

    # String inputs have surrounding whitespace stripped during validation.
    model_config = {"str_strip_whitespace": True}

    # Which month and district the row describes.
    report_date: date = Field(description="First of month (YYYY-MM-01)")
    area_code: str = Field(
        description="District code (W01, C01, E01, etc.)",
        max_length=3,
    )
    area_name: str = Field(description="District name", max_length=100)
    area_type: str = Field(
        description="West / Central / East / North",
        max_length=10,
    )

    # Sales, price and listing measures; all must be non-negative.
    sales: int = Field(description="Number of transactions", ge=0)
    dollar_volume: Decimal = Field(description="Total sales volume ($)", ge=0)
    avg_price: Decimal = Field(description="Average sale price ($)", ge=0)
    median_price: Decimal = Field(description="Median sale price ($)", ge=0)
    new_listings: int = Field(description="New listings count", ge=0)
    active_listings: int = Field(description="Active listings at month end", ge=0)
    avg_sp_lp: Decimal = Field(
        description="Avg sale price / list price ratio (%)",
        ge=0,
        le=200,
    )
    avg_dom: int = Field(description="Average days on market", ge=0)
class TRREBMonthlyReport(BaseModel):
    """A full TRREB monthly report.

    Bundles the district records that belong to one month.
    """

    report_date: date
    records: list[TRREBMonthlyRecord]

    @property
    def total_sales(self) -> int:
        """Return the sum of ``sales`` over every record in the report."""
        per_district_sales = (record.sales for record in self.records)
        return sum(per_district_sales)

    @property
    def district_count(self) -> int:
        """Return the number of records held in the report."""
        district_records = self.records
        return len(district_records)