feat: Sprint 3 - Pydantic schemas, SQLAlchemy models, and parser structure #14

Merged
lmiranda merged 1 commits from feature/sprint3-schemas-models into development 2026-01-11 20:00:20 +00:00
11 changed files with 760 additions and 1 deletions
Showing only changes of commit ead6d91a28 - Show all commits

View File

@@ -1 +1,28 @@
"""SQLAlchemy models for Toronto housing data."""
from .base import Base, create_tables, get_engine, get_session_factory
from .dimensions import (
DimCMHCZone,
DimNeighbourhood,
DimPolicyEvent,
DimTime,
DimTRREBDistrict,
)
from .facts import FactPurchases, FactRentals
__all__ = [
# Base
"Base",
"get_engine",
"get_session_factory",
"create_tables",
# Dimensions
"DimTime",
"DimTRREBDistrict",
"DimCMHCZone",
"DimNeighbourhood",
"DimPolicyEvent",
# Facts
"FactPurchases",
"FactRentals",
]

View File

@@ -0,0 +1,30 @@
"""SQLAlchemy base configuration and engine setup."""
from sqlalchemy import Engine, create_engine
from sqlalchemy.orm import DeclarativeBase, Session, sessionmaker
from portfolio_app.config import get_settings
class Base(DeclarativeBase):  # type: ignore[misc]
    """Declarative base class that every SQLAlchemy model in this package inherits from."""
# Process-wide engine, created lazily by get_engine().
_ENGINE: Engine | None = None


def get_engine() -> Engine:
    """Return the shared database engine, creating it on first use.

    An ``Engine`` owns a connection pool, so building a new one on every call
    (as ``get_session_factory`` and ``create_tables`` would otherwise trigger)
    multiplies pools and connections. The engine is therefore created once
    from ``settings.database_url`` and reused for the life of the process.

    Returns:
        The cached SQLAlchemy ``Engine``.
    """
    global _ENGINE
    if _ENGINE is None:
        settings = get_settings()
        _ENGINE = create_engine(settings.database_url, echo=False)
    return _ENGINE
def get_session_factory() -> sessionmaker[Session]:
    """Build a ``sessionmaker`` bound to the engine returned by ``get_engine()``.

    Returns:
        A session factory; call it to open a new ``Session``.
    """
    return sessionmaker(bind=get_engine())
def create_tables() -> None:
    """Create every table registered on ``Base.metadata`` in the database."""
    Base.metadata.create_all(get_engine())

View File

@@ -0,0 +1,104 @@
"""SQLAlchemy models for dimension tables."""
from datetime import date
from geoalchemy2 import Geometry
from sqlalchemy import Boolean, Date, Integer, Numeric, String, Text
from sqlalchemy.orm import Mapped, mapped_column
from .base import Base
class DimTime(Base):
    """Time dimension table (``dim_time``).

    One row per calendar date, keyed by an integer ``date_key`` and carrying
    the derived year / month / quarter attributes used for grouping.
    """

    __tablename__ = "dim_time"

    # Integer primary key supplied by the loader (the matching Pydantic schema
    # describes it as YYYYMMDD).
    date_key: Mapped[int] = mapped_column(Integer, primary_key=True)
    # The calendar date itself; unique, so each date appears at most once.
    full_date: Mapped[date] = mapped_column(Date, nullable=False, unique=True)
    year: Mapped[int] = mapped_column(Integer, nullable=False)
    month: Mapped[int] = mapped_column(Integer, nullable=False)
    quarter: Mapped[int] = mapped_column(Integer, nullable=False)
    month_name: Mapped[str] = mapped_column(String(20), nullable=False)
    # Defaults to True when the loader does not set it.
    is_month_start: Mapped[bool] = mapped_column(Boolean, default=True)
class DimTRREBDistrict(Base):
    """TRREB district dimension table with PostGIS geometry.

    Each row is identified by an auto-incrementing ``district_key`` and a
    unique ``district_code``.
    """

    __tablename__ = "dim_trreb_district"

    # Auto-incrementing surrogate key referenced by fact_purchases.
    district_key: Mapped[int] = mapped_column(
        Integer, primary_key=True, autoincrement=True
    )
    # Natural key of at most 3 characters; enforced unique.
    district_code: Mapped[str] = mapped_column(String(3), nullable=False, unique=True)
    district_name: Mapped[str] = mapped_column(String(100), nullable=False)
    area_type: Mapped[str] = mapped_column(String(10), nullable=False)
    # Optional POLYGON boundary stored with SRID 4326.
    geometry = mapped_column(Geometry("POLYGON", srid=4326), nullable=True)
class DimCMHCZone(Base):
    """CMHC zone dimension table with PostGIS geometry.

    Each row is identified by an auto-incrementing ``zone_key`` and a unique
    ``zone_code``.
    """

    __tablename__ = "dim_cmhc_zone"

    # Auto-incrementing surrogate key referenced by fact_rentals.
    zone_key: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    # Natural key of at most 10 characters; enforced unique.
    zone_code: Mapped[str] = mapped_column(String(10), nullable=False, unique=True)
    zone_name: Mapped[str] = mapped_column(String(100), nullable=False)
    # Optional POLYGON boundary stored with SRID 4326.
    geometry = mapped_column(Geometry("POLYGON", srid=4326), nullable=True)
class DimNeighbourhood(Base):
    """City of Toronto neighbourhood dimension.

    Holds the neighbourhood boundary plus a set of optional census-style
    attributes (population, income, tenure percentages).

    Note: No FK to fact tables in V1 - reference overlay only.
    """

    __tablename__ = "dim_neighbourhood"

    # Primary key; no autoincrement is requested, so the loader supplies it.
    neighbourhood_id: Mapped[int] = mapped_column(Integer, primary_key=True)
    name: Mapped[str] = mapped_column(String(100), nullable=False)
    # Optional POLYGON boundary stored with SRID 4326.
    geometry = mapped_column(Geometry("POLYGON", srid=4326), nullable=True)
    population: Mapped[int | None] = mapped_column(Integer, nullable=True)
    # NOTE(review): SQLAlchemy's Numeric returns decimal.Decimal by default, so
    # the ``float`` annotations on the Numeric columns below may not match the
    # values seen at runtime - confirm before relying on them.
    land_area_sqkm: Mapped[float | None] = mapped_column(Numeric(10, 4), nullable=True)
    pop_density_per_sqkm: Mapped[float | None] = mapped_column(
        Numeric(10, 2), nullable=True
    )
    pct_bachelors_or_higher: Mapped[float | None] = mapped_column(
        Numeric(5, 2), nullable=True
    )
    median_household_income: Mapped[float | None] = mapped_column(
        Numeric(12, 2), nullable=True
    )
    pct_owner_occupied: Mapped[float | None] = mapped_column(
        Numeric(5, 2), nullable=True
    )
    pct_renter_occupied: Mapped[float | None] = mapped_column(
        Numeric(5, 2), nullable=True
    )
    # Census vintage of the attributes above; defaults to 2021.
    census_year: Mapped[int] = mapped_column(Integer, default=2021)
class DimPolicyEvent(Base):
    """Policy event dimension for time-series annotation.

    The categorical columns (``level``, ``category``, ``expected_direction``,
    ``confidence``) are stored as plain strings; the allowed values listed in
    the trailing comments are not enforced by this table definition.
    """

    __tablename__ = "dim_policy_event"

    # Auto-incrementing surrogate key.
    event_id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    event_date: Mapped[date] = mapped_column(Date, nullable=False)
    # Optional; may differ from event_date.
    effective_date: Mapped[date | None] = mapped_column(Date, nullable=True)
    level: Mapped[str] = mapped_column(
        String(20), nullable=False
    )  # federal/provincial/municipal
    category: Mapped[str] = mapped_column(
        String(20), nullable=False
    )  # monetary/tax/regulatory/supply/economic
    title: Mapped[str] = mapped_column(String(200), nullable=False)
    description: Mapped[str | None] = mapped_column(Text, nullable=True)
    expected_direction: Mapped[str] = mapped_column(
        String(10), nullable=False
    )  # bearish/bullish/neutral
    source_url: Mapped[str | None] = mapped_column(String(500), nullable=True)
    # Defaults to "medium" when not provided.
    confidence: Mapped[str] = mapped_column(
        String(10), default="medium"
    )  # high/medium/low

View File

@@ -0,0 +1,69 @@
"""SQLAlchemy models for fact tables."""
from sqlalchemy import ForeignKey, Integer, Numeric, String
from sqlalchemy.orm import Mapped, mapped_column, relationship
from .base import Base
class FactPurchases(Base):
    """Fact table for TRREB purchase/sales data.

    Grain: One row per district per month.

    NOTE(review): the grain is not enforced by this class - no unique
    constraint on (date_key, district_key) is declared here.
    """

    __tablename__ = "fact_purchases"

    # Auto-incrementing surrogate key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    # Foreign key into the time dimension.
    date_key: Mapped[int] = mapped_column(
        Integer, ForeignKey("dim_time.date_key"), nullable=False
    )
    # Foreign key into the TRREB district dimension.
    district_key: Mapped[int] = mapped_column(
        Integer, ForeignKey("dim_trreb_district.district_key"), nullable=False
    )
    # Measures - all required.
    sales_count: Mapped[int] = mapped_column(Integer, nullable=False)
    dollar_volume: Mapped[float] = mapped_column(Numeric(15, 2), nullable=False)
    avg_price: Mapped[float] = mapped_column(Numeric(12, 2), nullable=False)
    median_price: Mapped[float] = mapped_column(Numeric(12, 2), nullable=False)
    new_listings: Mapped[int] = mapped_column(Integer, nullable=False)
    active_listings: Mapped[int] = mapped_column(Integer, nullable=False)
    avg_dom: Mapped[int] = mapped_column(Integer, nullable=False)  # Days on market
    avg_sp_lp: Mapped[float] = mapped_column(
        Numeric(5, 2), nullable=False
    )  # Sale/List ratio
    # Relationships; each backref adds a ``purchases`` collection on the
    # dimension class.
    time = relationship("DimTime", backref="purchases")
    district = relationship("DimTRREBDistrict", backref="purchases")
class FactRentals(Base):
    """Fact table for CMHC rental market data.

    Grain: One row per zone per bedroom type per survey year.

    Only the keys and ``bedroom_type`` are required; every measure is nullable.
    """

    __tablename__ = "fact_rentals"

    # Auto-incrementing surrogate key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    # Foreign key into the time dimension.
    date_key: Mapped[int] = mapped_column(
        Integer, ForeignKey("dim_time.date_key"), nullable=False
    )
    # Foreign key into the CMHC zone dimension.
    zone_key: Mapped[int] = mapped_column(
        Integer, ForeignKey("dim_cmhc_zone.zone_key"), nullable=False
    )
    bedroom_type: Mapped[str] = mapped_column(String(20), nullable=False)
    # Measures - all optional.
    universe: Mapped[int | None] = mapped_column(Integer, nullable=True)
    avg_rent: Mapped[float | None] = mapped_column(Numeric(10, 2), nullable=True)
    median_rent: Mapped[float | None] = mapped_column(Numeric(10, 2), nullable=True)
    vacancy_rate: Mapped[float | None] = mapped_column(Numeric(5, 2), nullable=True)
    availability_rate: Mapped[float | None] = mapped_column(
        Numeric(5, 2), nullable=True
    )
    turnover_rate: Mapped[float | None] = mapped_column(Numeric(5, 2), nullable=True)
    rent_change_pct: Mapped[float | None] = mapped_column(Numeric(5, 2), nullable=True)
    reliability_code: Mapped[str | None] = mapped_column(String(2), nullable=True)
    # Relationships; each backref adds a ``rentals`` collection on the
    # dimension class.
    time = relationship("DimTime", backref="rentals")
    zone = relationship("DimCMHCZone", backref="rentals")

View File

@@ -1 +1,9 @@
"""Data parsers for Toronto housing data sources."""
"""Parsers for Toronto housing data sources."""
from .cmhc import CMHCParser
from .trreb import TRREBParser
__all__ = [
"TRREBParser",
"CMHCParser",
]

View File

@@ -0,0 +1,147 @@
"""CMHC CSV processor for rental market survey data.
This module provides the structure for processing CMHC (Canada Mortgage and Housing
Corporation) rental market survey data from CSV exports.
"""
from pathlib import Path
from typing import Any, cast
import pandas as pd
from portfolio_app.toronto.schemas import CMHCAnnualSurvey, CMHCRentalRecord
class CMHCParser:
    """Parser for CMHC Rental Market Survey CSV data.

    CMHC conducts annual rental market surveys and publishes data including:
    - Average and median rents by zone and bedroom type
    - Vacancy rates
    - Universe (total rental units)
    - Year-over-year rent changes

    Data is available via the Housing Market Information Portal as CSV exports.
    """

    # Columns that must be present after header normalization.
    REQUIRED_COLUMNS = {
        "zone_code",
        "zone_name",
        "bedroom_type",
        "survey_year",
    }

    # CMHC export header -> CMHCRentalRecord field name.
    COLUMN_MAPPINGS = {
        "Zone Code": "zone_code",
        "Zone Name": "zone_name",
        "Bedroom Type": "bedroom_type",
        "Survey Year": "survey_year",
        "Universe": "universe",
        # Must be the schema's field name (``average_rent``): keys that match no
        # field are ignored by the Pydantic model, which would silently drop
        # the average rent from every record.
        "Average Rent ($)": "average_rent",
        "Median Rent ($)": "median_rent",
        "Vacancy Rate (%)": "vacancy_rate",
        "Availability Rate (%)": "availability_rate",
        "Turnover Rate (%)": "turnover_rate",
        "% Change in Rent": "rent_change_pct",
        # NOTE(review): CMHCRentalRecord has no ``reliability_code`` field (it
        # defines ``vacancy_rate_reliability`` and ``average_rent_reliability``),
        # so this column is not carried into the records. Confirm which measure
        # the export's code qualifies before remapping it.
        "Reliability Code": "reliability_code",
    }

    def __init__(self, csv_path: Path) -> None:
        """Initialize parser with path to CSV file.

        Args:
            csv_path: Path to the CMHC CSV export file.

        Raises:
            FileNotFoundError: If ``csv_path`` does not exist.
            ValueError: If ``csv_path`` does not have a ``.csv`` suffix.
        """
        self.csv_path = csv_path
        self._validate_path()

    def _validate_path(self) -> None:
        """Validate that the CSV path exists and has a ``.csv`` suffix.

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the suffix is not ``.csv`` (case-insensitive).
        """
        if not self.csv_path.exists():
            raise FileNotFoundError(f"CSV not found: {self.csv_path}")
        if self.csv_path.suffix.lower() != ".csv":
            raise ValueError(f"Expected CSV file, got: {self.csv_path.suffix}")

    def parse(self) -> CMHCAnnualSurvey:
        """Parse the CSV and return structured data.

        Returns:
            CMHCAnnualSurvey containing all extracted records.

        Raises:
            ValueError: If required columns are missing or the file has no
                data rows to take the survey year from.
        """
        df = self._load_csv()
        df = self._normalize_columns(df)
        self._validate_columns(df)
        records = self._convert_to_records(df)
        survey_year = self._infer_survey_year(df)
        return CMHCAnnualSurvey(survey_year=survey_year, records=records)

    def _load_csv(self) -> pd.DataFrame:
        """Load CSV file into DataFrame.

        Returns:
            Raw DataFrame from CSV.
        """
        return pd.read_csv(self.csv_path)

    def _normalize_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Rename CMHC export headers to schema field names.

        Columns that are not listed in ``COLUMN_MAPPINGS`` are left unchanged.

        Args:
            df: DataFrame with original column names.

        Returns:
            DataFrame with normalized column names.
        """
        rename_map = {k: v for k, v in self.COLUMN_MAPPINGS.items() if k in df.columns}
        return df.rename(columns=rename_map)

    def _validate_columns(self, df: pd.DataFrame) -> None:
        """Validate that all required columns are present.

        Args:
            df: DataFrame to validate.

        Raises:
            ValueError: If required columns are missing.
        """
        missing = self.REQUIRED_COLUMNS - set(df.columns)
        if missing:
            raise ValueError(f"Missing required columns: {missing}")

    def _convert_to_records(self, df: pd.DataFrame) -> list[CMHCRentalRecord]:
        """Convert DataFrame rows to validated schema records.

        Args:
            df: Normalized DataFrame.

        Returns:
            List of validated CMHCRentalRecord objects.
        """
        records = []
        for _, row in df.iterrows():
            # Missing cells arrive as NaN; the schema expects None instead.
            record_data = {
                k: (None if pd.isna(v) else v) for k, v in row.to_dict().items()
            }
            records.append(CMHCRentalRecord(**cast(dict[str, Any], record_data)))
        return records

    def _infer_survey_year(self, df: pd.DataFrame) -> int:
        """Infer survey year from the first data row.

        Args:
            df: DataFrame with survey_year column.

        Returns:
            Survey year as integer.

        Raises:
            ValueError: If the column is absent or the DataFrame has no rows.
        """
        if "survey_year" not in df.columns:
            raise ValueError("Cannot infer survey year from data.")
        # A header-only CSV would otherwise surface as an IndexError from iloc.
        if df.empty:
            raise ValueError(
                "Cannot infer survey year from data: CSV contains no rows."
            )
        return int(df["survey_year"].iloc[0])

View File

@@ -0,0 +1,82 @@
"""TRREB PDF parser for monthly market watch reports.
This module provides the structure for parsing TRREB (Toronto Regional Real Estate Board)
monthly Market Watch PDF reports into structured data.
"""
from pathlib import Path
from typing import Any
from portfolio_app.toronto.schemas import TRREBMonthlyRecord, TRREBMonthlyReport
class TRREBParser:
    """Reader for TRREB Market Watch PDF reports.

    The monthly Market Watch PDFs published by TRREB contain:
    - Summary statistics by area (416, 905, Total)
    - District-level breakdowns
    - Year-over-year comparisons

    Only path validation is implemented so far; every extraction step raises
    ``NotImplementedError``. Parsed rows are meant to be validated against the
    TRREBMonthlyRecord schema.
    """

    def __init__(self, pdf_path: Path) -> None:
        """Remember the report location and validate it immediately.

        Args:
            pdf_path: Path to the TRREB Market Watch PDF file.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
            ValueError: If ``pdf_path`` does not have a ``.pdf`` suffix.
        """
        self.pdf_path = pdf_path
        self._validate_path()

    def _validate_path(self) -> None:
        """Check that the stored path exists and ends in ``.pdf``.

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the suffix is not ``.pdf`` (case-insensitive).
        """
        source = self.pdf_path
        if not source.exists():
            raise FileNotFoundError(f"PDF not found: {source}")
        extension = source.suffix
        if extension.lower() != ".pdf":
            raise ValueError(f"Expected PDF file, got: {extension}")

    def parse(self) -> TRREBMonthlyReport:
        """Parse the PDF into a structured monthly report.

        Returns:
            TRREBMonthlyReport containing all extracted records.

        Raises:
            NotImplementedError: Always; PDF parsing is not implemented yet.
        """
        message = (
            "PDF parsing requires pdfplumber/tabula-py. "
            "Implementation pending Sprint 4 data ingestion."
        )
        raise NotImplementedError(message)

    def _extract_tables(self) -> list[dict[str, Any]]:
        """Pull the raw tables out of the PDF pages.

        Returns:
            List of dictionaries representing table data.

        Raises:
            NotImplementedError: Always; extraction is not implemented yet.
        """
        raise NotImplementedError("Table extraction not yet implemented.")

    def _parse_district_table(
        self, table_data: list[dict[str, Any]]
    ) -> list[TRREBMonthlyRecord]:
        """Turn the district-level statistics table into schema records.

        Args:
            table_data: Raw table data extracted from PDF.

        Returns:
            List of validated TRREBMonthlyRecord objects.

        Raises:
            NotImplementedError: Always; table parsing is not implemented yet.
        """
        raise NotImplementedError("District table parsing not yet implemented.")

    def _infer_report_date(self) -> tuple[int, int]:
        """Work out the report year and month from the PDF filename or content.

        Returns:
            Tuple of (year, month).

        Raises:
            NotImplementedError: Always; date inference is not implemented yet.
        """
        raise NotImplementedError("Date inference not yet implemented.")

View File

@@ -1 +1,39 @@
"""Pydantic schemas for Toronto housing data validation."""
from .cmhc import BedroomType, CMHCAnnualSurvey, CMHCRentalRecord, ReliabilityCode
from .dimensions import (
AreaType,
CMHCZone,
Confidence,
ExpectedDirection,
Neighbourhood,
PolicyCategory,
PolicyEvent,
PolicyLevel,
TimeDimension,
TRREBDistrict,
)
from .trreb import TRREBMonthlyRecord, TRREBMonthlyReport
__all__ = [
# TRREB
"TRREBMonthlyRecord",
"TRREBMonthlyReport",
# CMHC
"CMHCRentalRecord",
"CMHCAnnualSurvey",
"BedroomType",
"ReliabilityCode",
# Dimensions
"TimeDimension",
"TRREBDistrict",
"CMHCZone",
"Neighbourhood",
"PolicyEvent",
# Enums
"AreaType",
"PolicyLevel",
"PolicyCategory",
"ExpectedDirection",
"Confidence",
]

View File

@@ -0,0 +1,81 @@
"""Pydantic schemas for CMHC rental market data."""
from decimal import Decimal
from enum import Enum
from pydantic import BaseModel, Field
class BedroomType(str, Enum):
    """Bedroom-count categories used in CMHC rental survey data.

    Members are also ``str`` instances, so they compare equal to their labels.
    """

    BACHELOR = "Bachelor"
    ONE_BED = "1 Bedroom"
    TWO_BED = "2 Bedroom"
    THREE_BED_PLUS = "3 Bedroom+"
    # Aggregate across all bedroom categories.
    TOTAL = "Total"
class ReliabilityCode(str, Enum):
    """Reliability grades attached to CMHC survey figures.

    Grades are based on the coefficient of variation (CV) of the estimate.
    """

    # CV <= 2.5%
    EXCELLENT = "a"
    # 2.5% < CV <= 5%
    GOOD = "b"
    # 5% < CV <= 10%
    FAIR = "c"
    # CV > 10%
    POOR = "d"
    # Sample too small
    SUPPRESSED = "**"
class CMHCRentalRecord(BaseModel):
    """Schema for a single CMHC rental survey record.

    Represents rental data for one zone and bedroom type in one survey year.
    The four identifying fields are required; every measure defaults to None.
    String inputs are stripped of surrounding whitespace before validation.
    """

    # Identifying fields (required).
    survey_year: int = Field(ge=1990, description="Survey year (October snapshot)")
    zone_code: str = Field(max_length=10, description="CMHC zone identifier")
    zone_name: str = Field(max_length=100, description="Zone name")
    bedroom_type: BedroomType = Field(description="Bedroom category")
    # Measures (optional). Rates are bounded to 0-100; counts and rents to >= 0.
    universe: int | None = Field(
        default=None, ge=0, description="Total rental units in zone"
    )
    vacancy_rate: Decimal | None = Field(
        default=None, ge=0, le=100, description="Vacancy rate (%)"
    )
    vacancy_rate_reliability: ReliabilityCode | None = Field(default=None)
    availability_rate: Decimal | None = Field(
        default=None, ge=0, le=100, description="Availability rate (%)"
    )
    average_rent: Decimal | None = Field(
        default=None, ge=0, description="Average monthly rent ($)"
    )
    average_rent_reliability: ReliabilityCode | None = Field(default=None)
    median_rent: Decimal | None = Field(
        default=None, ge=0, description="Median monthly rent ($)"
    )
    # Unbounded: a year-over-year change may be negative.
    rent_change_pct: Decimal | None = Field(
        default=None, description="YoY rent change (%)"
    )
    turnover_rate: Decimal | None = Field(
        default=None, ge=0, le=100, description="Unit turnover rate (%)"
    )
    model_config = {"str_strip_whitespace": True}
class CMHCAnnualSurvey(BaseModel):
    """One CMHC annual survey for Toronto.

    Bundles the survey year with every zone / bedroom-type record for that year.
    """

    survey_year: int
    records: list[CMHCRentalRecord]

    @property
    def zone_count(self) -> int:
        """Count the distinct zone codes appearing in ``records``."""
        zone_codes = {record.zone_code for record in self.records}
        return len(zone_codes)

View File

@@ -0,0 +1,121 @@
"""Pydantic schemas for dimension tables."""
from datetime import date
from decimal import Decimal
from enum import Enum
from pydantic import BaseModel, Field, HttpUrl
class PolicyLevel(str, Enum):
    """Level of government a policy event originates from.

    Members are also ``str`` instances, so they compare equal to their values.
    """

    FEDERAL = "federal"
    PROVINCIAL = "provincial"
    MUNICIPAL = "municipal"
class PolicyCategory(str, Enum):
    """Kind of policy a policy event represents.

    Members are also ``str`` instances, so they compare equal to their values.
    """

    MONETARY = "monetary"
    TAX = "tax"
    REGULATORY = "regulatory"
    SUPPLY = "supply"
    ECONOMIC = "economic"
class ExpectedDirection(str, Enum):
    """Direction in which a policy event is expected to move prices."""

    # Expected to increase prices
    BULLISH = "bullish"
    # Expected to decrease prices
    BEARISH = "bearish"
    # Uncertain or mixed impact
    NEUTRAL = "neutral"
class Confidence(str, Enum):
    """How much confidence is placed in a policy event's data.

    Members are also ``str`` instances, so they compare equal to their values.
    """

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
class AreaType(str, Enum):
    """Area grouping a TRREB district belongs to.

    Members are also ``str`` instances, so they compare equal to their values.
    """

    WEST = "West"
    CENTRAL = "Central"
    EAST = "East"
    NORTH = "North"
class TimeDimension(BaseModel):
    """Schema for time dimension record.

    Validates the calendar attributes of one date: year within 2000-2100,
    month within 1-12 and quarter within 1-4.
    """

    # The YYYYMMDD format is documented only; no range or format check is applied.
    date_key: int = Field(description="Date key in YYYYMMDD format")
    full_date: date
    year: int = Field(ge=2000, le=2100)
    month: int = Field(ge=1, le=12)
    quarter: int = Field(ge=1, le=4)
    month_name: str = Field(max_length=20)
    # Defaults to True when omitted.
    is_month_start: bool = True
class TRREBDistrict(BaseModel):
    """Schema for TRREB district dimension.

    Geometry is carried as an optional WKT string rather than a geometry object.
    """

    district_code: str = Field(max_length=3, description="W01, C01, E01, etc.")
    district_name: str = Field(max_length=100)
    # Restricted to the AreaType enum values.
    area_type: AreaType
    geometry_wkt: str | None = Field(default=None, description="WKT geometry string")
class CMHCZone(BaseModel):
    """Schema for CMHC zone dimension.

    Geometry is carried as an optional WKT string rather than a geometry object.
    """

    zone_code: str = Field(max_length=10)
    zone_name: str = Field(max_length=100)
    geometry_wkt: str | None = Field(default=None, description="WKT geometry string")
class Neighbourhood(BaseModel):
    """Schema for City of Toronto neighbourhood dimension.

    Only the id and name are required; geometry and all statistical attributes
    default to None.

    Note: No FK to fact tables in V1 - reference overlay only.
    """

    # Accepted ids are limited to 1-200.
    neighbourhood_id: int = Field(ge=1, le=200)
    name: str = Field(max_length=100)
    geometry_wkt: str | None = Field(default=None)
    # Optional attributes: non-negative, with percentages capped at 100.
    population: int | None = Field(default=None, ge=0)
    land_area_sqkm: Decimal | None = Field(default=None, ge=0)
    pop_density_per_sqkm: Decimal | None = Field(default=None, ge=0)
    pct_bachelors_or_higher: Decimal | None = Field(default=None, ge=0, le=100)
    median_household_income: Decimal | None = Field(default=None, ge=0)
    pct_owner_occupied: Decimal | None = Field(default=None, ge=0, le=100)
    pct_renter_occupied: Decimal | None = Field(default=None, ge=0, le=100)
    census_year: int = Field(default=2021, description="Census year for SCD tracking")
class PolicyEvent(BaseModel):
    """Schema for policy event dimension.

    Used for time-series annotation. No causation claims.
    String inputs are stripped of surrounding whitespace before validation.
    """

    event_date: date = Field(description="Date event was announced/occurred")
    effective_date: date | None = Field(
        default=None, description="Date policy took effect"
    )
    # Categorical fields are restricted to their enum values.
    level: PolicyLevel
    category: PolicyCategory
    title: str = Field(max_length=200, description="Short event title for display")
    description: str | None = Field(
        default=None, description="Longer description for tooltip"
    )
    expected_direction: ExpectedDirection
    # Validated as an HTTP(S) URL when provided.
    source_url: HttpUrl | None = Field(default=None)
    # Defaults to Confidence.MEDIUM when omitted.
    confidence: Confidence = Field(default=Confidence.MEDIUM)
    model_config = {"str_strip_whitespace": True}

View File

@@ -0,0 +1,52 @@
"""Pydantic schemas for TRREB monthly market data."""
from datetime import date
from decimal import Decimal
from pydantic import BaseModel, Field
class TRREBMonthlyRecord(BaseModel):
    """Schema for a single TRREB monthly summary record.

    Represents aggregated sales data for one district in one month.
    All fields are required; counts and dollar amounts must be non-negative.
    String inputs are stripped of surrounding whitespace before validation.
    """

    # The first-of-month convention is documented only, not validated.
    report_date: date = Field(description="First of month (YYYY-MM-01)")
    area_code: str = Field(
        max_length=3, description="District code (W01, C01, E01, etc.)"
    )
    area_name: str = Field(max_length=100, description="District name")
    # Free-form string here (length-limited only), not an enum.
    area_type: str = Field(max_length=10, description="West / Central / East / North")
    sales: int = Field(ge=0, description="Number of transactions")
    dollar_volume: Decimal = Field(ge=0, description="Total sales volume ($)")
    avg_price: Decimal = Field(ge=0, description="Average sale price ($)")
    median_price: Decimal = Field(ge=0, description="Median sale price ($)")
    new_listings: int = Field(ge=0, description="New listings count")
    active_listings: int = Field(ge=0, description="Active listings at month end")
    # Percentage, accepted in the range 0-200.
    avg_sp_lp: Decimal = Field(
        ge=0, le=200, description="Avg sale price / list price ratio (%)"
    )
    avg_dom: int = Field(ge=0, description="Average days on market")
    model_config = {"str_strip_whitespace": True}
class TRREBMonthlyReport(BaseModel):
    """One complete TRREB monthly report.

    Bundles the report date with every district record for that month.
    """

    report_date: date
    records: list[TRREBMonthlyRecord]

    @property
    def total_sales(self) -> int:
        """Sum the ``sales`` of every record in the report."""
        per_district = (record.sales for record in self.records)
        return sum(per_district)

    @property
    def district_count(self) -> int:
        """Count the records held in the report (one per district)."""
        record_total = len(self.records)
        return record_total