feat: add Pydantic schemas, SQLAlchemy models, and parser structure
Sprint 3 implementation:
- Pydantic schemas for TRREB, CMHC, and dimension data validation
- SQLAlchemy models with PostGIS geometry for fact and dimension tables
- Parser structure (stubs) for TRREB PDF and CMHC CSV processing

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1 +1,39 @@
|
||||
"""Pydantic schemas for Toronto housing data validation."""
|
||||
|
||||
from .cmhc import BedroomType, CMHCAnnualSurvey, CMHCRentalRecord, ReliabilityCode
|
||||
from .dimensions import (
|
||||
AreaType,
|
||||
CMHCZone,
|
||||
Confidence,
|
||||
ExpectedDirection,
|
||||
Neighbourhood,
|
||||
PolicyCategory,
|
||||
PolicyEvent,
|
||||
PolicyLevel,
|
||||
TimeDimension,
|
||||
TRREBDistrict,
|
||||
)
|
||||
from .trreb import TRREBMonthlyRecord, TRREBMonthlyReport
|
||||
|
||||
# Names re-exported by this package, grouped by the kind of object exported.
__all__ = [
    # Models from .trreb
    "TRREBMonthlyRecord",
    "TRREBMonthlyReport",
    # Models and enums from .cmhc
    "CMHCRentalRecord",
    "CMHCAnnualSurvey",
    "BedroomType",
    "ReliabilityCode",
    # Models from .dimensions
    "TimeDimension",
    "TRREBDistrict",
    "CMHCZone",
    "Neighbourhood",
    "PolicyEvent",
    # Enums from .dimensions
    "AreaType",
    "PolicyLevel",
    "PolicyCategory",
    "ExpectedDirection",
    "Confidence",
]
|
||||
|
||||
81
portfolio_app/toronto/schemas/cmhc.py
Normal file
81
portfolio_app/toronto/schemas/cmhc.py
Normal file
@@ -0,0 +1,81 @@
|
||||
"""Pydantic schemas for CMHC rental market data."""
|
||||
|
||||
from decimal import Decimal
|
||||
from enum import Enum
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class BedroomType(str, Enum):
    """Bedroom-count categories used in CMHC rental data.

    Members are ``str`` instances, so each compares equal to its label text.
    """

    BACHELOR = "Bachelor"
    ONE_BED = "1 Bedroom"
    TWO_BED = "2 Bedroom"
    THREE_BED_PLUS = "3 Bedroom+"
    TOTAL = "Total"
|
||||
|
||||
|
||||
class ReliabilityCode(str, Enum):
    """Reliability grades attached to CMHC estimates.

    Grades are based on the coefficient of variation (CV):

    * ``a`` - CV <= 2.5%
    * ``b`` - 2.5% < CV <= 5%
    * ``c`` - 5% < CV <= 10%
    * ``d`` - CV > 10%
    * ``**`` - sample too small
    """

    EXCELLENT = "a"
    GOOD = "b"
    FAIR = "c"
    POOR = "d"
    SUPPRESSED = "**"
|
||||
|
||||
|
||||
class CMHCRentalRecord(BaseModel):
    """One CMHC rental survey row: a zone and bedroom type in one survey year.

    The four identifying fields are required. Every measure is optional and
    defaults to ``None``. Rates are validated to lie in 0-100 and dollar
    amounts to be non-negative.
    """

    # Leading/trailing whitespace is stripped from string inputs.
    model_config = {"str_strip_whitespace": True}

    # --- Identification ---
    survey_year: int = Field(ge=1990, description="Survey year (October snapshot)")
    zone_code: str = Field(max_length=10, description="CMHC zone identifier")
    zone_name: str = Field(max_length=100, description="Zone name")
    bedroom_type: BedroomType = Field(description="Bedroom category")

    # --- Measures ---
    universe: int | None = Field(
        None, ge=0, description="Total rental units in zone"
    )
    vacancy_rate: Decimal | None = Field(
        None, ge=0, le=100, description="Vacancy rate (%)"
    )
    vacancy_rate_reliability: ReliabilityCode | None = None
    availability_rate: Decimal | None = Field(
        None, ge=0, le=100, description="Availability rate (%)"
    )
    average_rent: Decimal | None = Field(
        None, ge=0, description="Average monthly rent ($)"
    )
    average_rent_reliability: ReliabilityCode | None = None
    median_rent: Decimal | None = Field(
        None, ge=0, description="Median monthly rent ($)"
    )
    # No bounds are set here, so negative changes validate.
    rent_change_pct: Decimal | None = Field(None, description="YoY rent change (%)")
    turnover_rate: Decimal | None = Field(
        None, ge=0, le=100, description="Unit turnover rate (%)"
    )
|
||||
|
||||
|
||||
class CMHCAnnualSurvey(BaseModel):
    """A full CMHC annual survey for Toronto.

    Holds the zone / bedroom-type records for one survey year. The model does
    not check that each record's ``survey_year`` matches the survey's own.
    """

    survey_year: int
    records: list[CMHCRentalRecord]

    @property
    def zone_count(self) -> int:
        """Count of distinct ``zone_code`` values among the records."""
        distinct_zones = set()
        for record in self.records:
            distinct_zones.add(record.zone_code)
        return len(distinct_zones)
|
||||
121
portfolio_app/toronto/schemas/dimensions.py
Normal file
121
portfolio_app/toronto/schemas/dimensions.py
Normal file
@@ -0,0 +1,121 @@
|
||||
"""Pydantic schemas for dimension tables."""
|
||||
|
||||
from datetime import date
|
||||
from decimal import Decimal
|
||||
from enum import Enum
|
||||
|
||||
from pydantic import BaseModel, Field, HttpUrl
|
||||
|
||||
|
||||
class PolicyLevel(str, Enum):
    """Level of government a policy event belongs to."""

    FEDERAL = "federal"
    PROVINCIAL = "provincial"
    MUNICIPAL = "municipal"
|
||||
|
||||
|
||||
class PolicyCategory(str, Enum):
    """Category a policy event is filed under."""

    MONETARY = "monetary"
    TAX = "tax"
    REGULATORY = "regulatory"
    SUPPLY = "supply"
    ECONOMIC = "economic"
|
||||
|
||||
|
||||
class ExpectedDirection(str, Enum):
    """Direction in which an event is expected to move prices."""

    # Expected to increase prices.
    BULLISH = "bullish"
    # Expected to decrease prices.
    BEARISH = "bearish"
    # Uncertain or mixed impact.
    NEUTRAL = "neutral"
|
||||
|
||||
|
||||
class Confidence(str, Enum):
    """How much confidence is placed in a policy event's data."""

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
|
||||
|
||||
|
||||
class AreaType(str, Enum):
    """Area grouping assigned to a TRREB district."""

    WEST = "West"
    CENTRAL = "Central"
    EAST = "East"
    NORTH = "North"
|
||||
|
||||
|
||||
class TimeDimension(BaseModel):
    """One row of the time dimension.

    ``year`` is limited to 2000-2100, ``month`` to 1-12 and ``quarter`` to
    1-4. Agreement between ``date_key``, ``full_date`` and the split-out
    parts is not checked by this schema.
    """

    date_key: int = Field(description="Date key in YYYYMMDD format")
    full_date: date
    year: int = Field(ge=2000, le=2100)
    month: int = Field(ge=1, le=12)
    quarter: int = Field(ge=1, le=4)
    month_name: str = Field(max_length=20)
    # Defaults to True when the caller does not supply a value.
    is_month_start: bool = Field(default=True)
|
||||
|
||||
|
||||
class TRREBDistrict(BaseModel):
    """A TRREB district in the district dimension."""

    district_code: str = Field(max_length=3, description="W01, C01, E01, etc.")
    district_name: str = Field(max_length=100)
    area_type: AreaType
    # Optional geometry held as text; the WKT content itself is not validated.
    geometry_wkt: str | None = Field(None, description="WKT geometry string")
|
||||
|
||||
|
||||
class CMHCZone(BaseModel):
    """A CMHC zone in the zone dimension."""

    zone_code: str = Field(max_length=10)
    zone_name: str = Field(max_length=100)
    # Optional geometry held as text; the WKT content itself is not validated.
    geometry_wkt: str | None = Field(None, description="WKT geometry string")
|
||||
|
||||
|
||||
class Neighbourhood(BaseModel):
    """A City of Toronto neighbourhood in the neighbourhood dimension.

    Note: No FK to fact tables in V1 - reference overlay only.

    Only ``neighbourhood_id`` and ``name`` are required. Every other field
    except ``census_year`` defaults to ``None``.
    """

    neighbourhood_id: int = Field(ge=1, le=200)
    name: str = Field(max_length=100)
    geometry_wkt: str | None = None

    # Non-negative counts and measures.
    population: int | None = Field(None, ge=0)
    land_area_sqkm: Decimal | None = Field(None, ge=0)
    pop_density_per_sqkm: Decimal | None = Field(None, ge=0)

    # Percentages are validated to lie in 0-100.
    pct_bachelors_or_higher: Decimal | None = Field(None, ge=0, le=100)
    median_household_income: Decimal | None = Field(None, ge=0)
    pct_owner_occupied: Decimal | None = Field(None, ge=0, le=100)
    pct_renter_occupied: Decimal | None = Field(None, ge=0, le=100)

    census_year: int = Field(2021, description="Census year for SCD tracking")
|
||||
|
||||
|
||||
class PolicyEvent(BaseModel):
    """A policy event in the policy-event dimension.

    Used for time-series annotation. No causation claims.
    """

    # Leading/trailing whitespace is stripped from string inputs.
    model_config = {"str_strip_whitespace": True}

    # --- When ---
    event_date: date = Field(description="Date event was announced/occurred")
    effective_date: date | None = Field(None, description="Date policy took effect")

    # --- Classification ---
    level: PolicyLevel
    category: PolicyCategory

    # --- Display text ---
    title: str = Field(max_length=200, description="Short event title for display")
    description: str | None = Field(None, description="Longer description for tooltip")

    # --- Assessment ---
    expected_direction: ExpectedDirection
    source_url: HttpUrl | None = None
    # Defaults to MEDIUM when the caller does not supply a value.
    confidence: Confidence = Confidence.MEDIUM
|
||||
52
portfolio_app/toronto/schemas/trreb.py
Normal file
52
portfolio_app/toronto/schemas/trreb.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""Pydantic schemas for TRREB monthly market data."""
|
||||
|
||||
from datetime import date
|
||||
from decimal import Decimal
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class TRREBMonthlyRecord(BaseModel):
    """One TRREB monthly summary row: aggregated sales for a district in a month.

    Every field is required. Counts and dollar amounts must be non-negative.
    """

    # Leading/trailing whitespace is stripped from string inputs.
    model_config = {"str_strip_whitespace": True}

    # --- Identification ---
    # The description asks for the first of the month; the day is not validated.
    report_date: date = Field(description="First of month (YYYY-MM-01)")
    area_code: str = Field(
        max_length=3,
        description="District code (W01, C01, E01, etc.)",
    )
    area_name: str = Field(max_length=100, description="District name")
    # NOTE(review): free-form string here, while the dimension schemas define
    # an AreaType enum with these same four labels - confirm whether this
    # field should be validated against it.
    area_type: str = Field(max_length=10, description="West / Central / East / North")

    # --- Sales ---
    sales: int = Field(ge=0, description="Number of transactions")
    dollar_volume: Decimal = Field(ge=0, description="Total sales volume ($)")
    avg_price: Decimal = Field(ge=0, description="Average sale price ($)")
    median_price: Decimal = Field(ge=0, description="Median sale price ($)")

    # --- Listings ---
    new_listings: int = Field(ge=0, description="New listings count")
    active_listings: int = Field(ge=0, description="Active listings at month end")

    # --- Market pace ---
    # Ratio is capped at 200.
    avg_sp_lp: Decimal = Field(
        ge=0,
        le=200,
        description="Avg sale price / list price ratio (%)",
    )
    avg_dom: int = Field(ge=0, description="Average days on market")
|
||||
|
||||
|
||||
class TRREBMonthlyReport(BaseModel):
    """A complete TRREB monthly report.

    Holds the district records for a single month. The model does not check
    that each record's ``report_date`` matches the report's own.

    Attributes:
        report_date: Date identifying the report.
        records: District-level monthly records.
    """

    report_date: date
    records: list[TRREBMonthlyRecord]

    @property
    def total_sales(self) -> int:
        """Sum of ``sales`` over every record (0 when there are no records)."""
        return sum(record.sales for record in self.records)

    @property
    def district_count(self) -> int:
        """Number of distinct districts in the report, keyed by ``area_code``.

        Distinct codes are counted rather than rows, so the result stays a
        district count even if ``records`` holds more than one row for the
        same district. With one row per district it equals ``len(records)``.
        """
        return len({record.area_code for record in self.records})
|
||||
Reference in New Issue
Block a user