Merge pull request 'feat: Sprint 3 - Pydantic schemas, SQLAlchemy models, and parser structure' (#14) from feature/sprint3-schemas-models into development
This commit was merged in pull request #14.
This commit is contained in:
@@ -1 +1,28 @@
|
|||||||
"""SQLAlchemy models for Toronto housing data."""
|
"""SQLAlchemy models for Toronto housing data."""
|
||||||
|
|
||||||
|
from .base import Base, create_tables, get_engine, get_session_factory
|
||||||
|
from .dimensions import (
|
||||||
|
DimCMHCZone,
|
||||||
|
DimNeighbourhood,
|
||||||
|
DimPolicyEvent,
|
||||||
|
DimTime,
|
||||||
|
DimTRREBDistrict,
|
||||||
|
)
|
||||||
|
from .facts import FactPurchases, FactRentals
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
# Base
|
||||||
|
"Base",
|
||||||
|
"get_engine",
|
||||||
|
"get_session_factory",
|
||||||
|
"create_tables",
|
||||||
|
# Dimensions
|
||||||
|
"DimTime",
|
||||||
|
"DimTRREBDistrict",
|
||||||
|
"DimCMHCZone",
|
||||||
|
"DimNeighbourhood",
|
||||||
|
"DimPolicyEvent",
|
||||||
|
# Facts
|
||||||
|
"FactPurchases",
|
||||||
|
"FactRentals",
|
||||||
|
]
|
||||||
|
|||||||
30
portfolio_app/toronto/models/base.py
Normal file
30
portfolio_app/toronto/models/base.py
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
"""SQLAlchemy base configuration and engine setup."""
|
||||||
|
|
||||||
|
from sqlalchemy import Engine, create_engine
|
||||||
|
from sqlalchemy.orm import DeclarativeBase, Session, sessionmaker
|
||||||
|
|
||||||
|
from portfolio_app.config import get_settings
|
||||||
|
|
||||||
|
|
||||||
|
class Base(DeclarativeBase):  # type: ignore[misc]
    """Shared declarative base for the ORM models in this package.

    Model classes inherit from this class so that their tables are
    registered on ``Base.metadata``.
    """
|
||||||
|
|
||||||
|
|
||||||
|
def get_engine() -> Engine:
    """Build a SQLAlchemy engine for the configured database.

    Returns:
        An ``Engine`` created from ``settings.database_url`` with SQL
        echoing turned off. Each call constructs a new engine.
    """
    database_url = get_settings().database_url
    return create_engine(database_url, echo=False)
|
||||||
|
|
||||||
|
|
||||||
|
def get_session_factory() -> sessionmaker[Session]:
    """Build a session factory bound to an engine from ``get_engine()``.

    Returns:
        A ``sessionmaker`` whose sessions are bound to that engine.
    """
    return sessionmaker(bind=get_engine())
|
||||||
|
|
||||||
|
|
||||||
|
def create_tables() -> None:
    """Create the tables of every model registered on ``Base.metadata``."""
    Base.metadata.create_all(get_engine())
|
||||||
104
portfolio_app/toronto/models/dimensions.py
Normal file
104
portfolio_app/toronto/models/dimensions.py
Normal file
@@ -0,0 +1,104 @@
|
|||||||
|
"""SQLAlchemy models for dimension tables."""
|
||||||
|
|
||||||
|
from datetime import date
|
||||||
|
|
||||||
|
from geoalchemy2 import Geometry
|
||||||
|
from sqlalchemy import Boolean, Date, Integer, Numeric, String, Text
|
||||||
|
from sqlalchemy.orm import Mapped, mapped_column
|
||||||
|
|
||||||
|
from .base import Base
|
||||||
|
|
||||||
|
|
||||||
|
class DimTime(Base):
    """Time dimension: one row per calendar date tracked by the warehouse."""

    __tablename__ = "dim_time"

    # Caller-supplied integer surrogate key (no autoincrement requested).
    date_key: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Each calendar date may appear only once.
    full_date: Mapped[date] = mapped_column(Date, unique=True, nullable=False)

    # Calendar attributes stored alongside the date.
    year: Mapped[int] = mapped_column(Integer, nullable=False)
    month: Mapped[int] = mapped_column(Integer, nullable=False)
    quarter: Mapped[int] = mapped_column(Integer, nullable=False)
    month_name: Mapped[str] = mapped_column(String(20), nullable=False)

    # Defaults to True when no value is supplied on insert.
    is_month_start: Mapped[bool] = mapped_column(Boolean, default=True)
|
||||||
|
|
||||||
|
|
||||||
|
class DimTRREBDistrict(Base):
    """TRREB district dimension, with an optional PostGIS polygon."""

    __tablename__ = "dim_trreb_district"

    # Database-generated surrogate key.
    district_key: Mapped[int] = mapped_column(
        Integer,
        primary_key=True,
        autoincrement=True,
    )
    # Business key: at most 3 characters, unique across districts.
    district_code: Mapped[str] = mapped_column(String(3), unique=True, nullable=False)
    district_name: Mapped[str] = mapped_column(String(100), nullable=False)
    area_type: Mapped[str] = mapped_column(String(10), nullable=False)
    # Polygon in SRID 4326; nullable so a district can exist without a boundary.
    geometry = mapped_column(Geometry("POLYGON", srid=4326), nullable=True)
|
||||||
|
|
||||||
|
|
||||||
|
class DimCMHCZone(Base):
    """CMHC zone dimension, with an optional PostGIS polygon."""

    __tablename__ = "dim_cmhc_zone"

    # Database-generated surrogate key.
    zone_key: Mapped[int] = mapped_column(
        Integer,
        primary_key=True,
        autoincrement=True,
    )
    # Business key: at most 10 characters, unique across zones.
    zone_code: Mapped[str] = mapped_column(String(10), unique=True, nullable=False)
    zone_name: Mapped[str] = mapped_column(String(100), nullable=False)
    # Polygon in SRID 4326; nullable so a zone can exist without a boundary.
    geometry = mapped_column(Geometry("POLYGON", srid=4326), nullable=True)
|
||||||
|
|
||||||
|
|
||||||
|
class DimNeighbourhood(Base):
    """City of Toronto neighbourhood dimension.

    Note: in V1 no fact table has a foreign key to this table; it is a
    reference overlay only.
    """

    __tablename__ = "dim_neighbourhood"

    # Identity and shape.
    neighbourhood_id: Mapped[int] = mapped_column(Integer, primary_key=True)
    name: Mapped[str] = mapped_column(String(100), nullable=False)
    geometry = mapped_column(Geometry("POLYGON", srid=4326), nullable=True)

    # Optional attributes; every column below accepts NULL.
    population: Mapped[int | None] = mapped_column(Integer, nullable=True)
    land_area_sqkm: Mapped[float | None] = mapped_column(
        Numeric(10, 4),
        nullable=True,
    )
    pop_density_per_sqkm: Mapped[float | None] = mapped_column(
        Numeric(10, 2),
        nullable=True,
    )
    pct_bachelors_or_higher: Mapped[float | None] = mapped_column(
        Numeric(5, 2),
        nullable=True,
    )
    median_household_income: Mapped[float | None] = mapped_column(
        Numeric(12, 2),
        nullable=True,
    )
    pct_owner_occupied: Mapped[float | None] = mapped_column(
        Numeric(5, 2),
        nullable=True,
    )
    pct_renter_occupied: Mapped[float | None] = mapped_column(
        Numeric(5, 2),
        nullable=True,
    )

    # Census vintage of the attributes above; defaults to 2021 on insert.
    census_year: Mapped[int] = mapped_column(Integer, default=2021)
|
||||||
|
|
||||||
|
|
||||||
|
class DimPolicyEvent(Base):
    """Policy event dimension used to annotate time series."""

    __tablename__ = "dim_policy_event"

    # Database-generated surrogate key.
    event_id: Mapped[int] = mapped_column(
        Integer,
        primary_key=True,
        autoincrement=True,
    )

    # Timing: event_date is mandatory, effective_date may be NULL.
    event_date: Mapped[date] = mapped_column(Date, nullable=False)
    effective_date: Mapped[date | None] = mapped_column(Date, nullable=True)

    # Classification, stored as plain strings.
    # level: federal/provincial/municipal
    level: Mapped[str] = mapped_column(String(20), nullable=False)
    # category: monetary/tax/regulatory/supply/economic
    category: Mapped[str] = mapped_column(String(20), nullable=False)

    # Display text.
    title: Mapped[str] = mapped_column(String(200), nullable=False)
    description: Mapped[str | None] = mapped_column(Text, nullable=True)

    # expected_direction: bearish/bullish/neutral
    expected_direction: Mapped[str] = mapped_column(String(10), nullable=False)
    source_url: Mapped[str | None] = mapped_column(String(500), nullable=True)
    # confidence: high/medium/low; defaults to "medium" on insert.
    confidence: Mapped[str] = mapped_column(String(10), default="medium")
|
||||||
69
portfolio_app/toronto/models/facts.py
Normal file
69
portfolio_app/toronto/models/facts.py
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
"""SQLAlchemy models for fact tables."""
|
||||||
|
|
||||||
|
from sqlalchemy import ForeignKey, Integer, Numeric, String
|
||||||
|
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||||
|
|
||||||
|
from .base import Base
|
||||||
|
|
||||||
|
|
||||||
|
class FactPurchases(Base):
    """Fact table holding TRREB purchase/sales measures.

    Grain: one row per district per month.
    """

    __tablename__ = "fact_purchases"

    # Database-generated surrogate key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Mandatory foreign keys to the time and district dimensions.
    date_key: Mapped[int] = mapped_column(
        Integer,
        ForeignKey("dim_time.date_key"),
        nullable=False,
    )
    district_key: Mapped[int] = mapped_column(
        Integer,
        ForeignKey("dim_trreb_district.district_key"),
        nullable=False,
    )

    # Measures; all are required.
    sales_count: Mapped[int] = mapped_column(Integer, nullable=False)
    dollar_volume: Mapped[float] = mapped_column(Numeric(15, 2), nullable=False)
    avg_price: Mapped[float] = mapped_column(Numeric(12, 2), nullable=False)
    median_price: Mapped[float] = mapped_column(Numeric(12, 2), nullable=False)
    new_listings: Mapped[int] = mapped_column(Integer, nullable=False)
    active_listings: Mapped[int] = mapped_column(Integer, nullable=False)
    # Days on market.
    avg_dom: Mapped[int] = mapped_column(Integer, nullable=False)
    # Sale/List ratio.
    avg_sp_lp: Mapped[float] = mapped_column(Numeric(5, 2), nullable=False)

    # Relationships; each backref adds a ``purchases`` attribute on the dimension.
    time = relationship("DimTime", backref="purchases")
    district = relationship("DimTRREBDistrict", backref="purchases")
|
||||||
|
|
||||||
|
|
||||||
|
class FactRentals(Base):
    """Fact table holding CMHC rental market measures.

    Grain: one row per zone per bedroom type per survey year.
    """

    __tablename__ = "fact_rentals"

    # Database-generated surrogate key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)

    # Mandatory foreign keys to the time and zone dimensions.
    date_key: Mapped[int] = mapped_column(
        Integer,
        ForeignKey("dim_time.date_key"),
        nullable=False,
    )
    zone_key: Mapped[int] = mapped_column(
        Integer,
        ForeignKey("dim_cmhc_zone.zone_key"),
        nullable=False,
    )
    bedroom_type: Mapped[str] = mapped_column(String(20), nullable=False)

    # Measures; every one of them accepts NULL.
    universe: Mapped[int | None] = mapped_column(Integer, nullable=True)
    avg_rent: Mapped[float | None] = mapped_column(Numeric(10, 2), nullable=True)
    median_rent: Mapped[float | None] = mapped_column(Numeric(10, 2), nullable=True)
    vacancy_rate: Mapped[float | None] = mapped_column(Numeric(5, 2), nullable=True)
    availability_rate: Mapped[float | None] = mapped_column(
        Numeric(5, 2),
        nullable=True,
    )
    turnover_rate: Mapped[float | None] = mapped_column(Numeric(5, 2), nullable=True)
    rent_change_pct: Mapped[float | None] = mapped_column(
        Numeric(5, 2),
        nullable=True,
    )
    reliability_code: Mapped[str | None] = mapped_column(String(2), nullable=True)

    # Relationships; each backref adds a ``rentals`` attribute on the dimension.
    time = relationship("DimTime", backref="rentals")
    zone = relationship("DimCMHCZone", backref="rentals")
|
||||||
@@ -1 +1,9 @@
|
|||||||
"""Data parsers for Toronto housing data sources."""
|
"""Parsers for Toronto housing data sources."""
|
||||||
|
|
||||||
|
from .cmhc import CMHCParser
|
||||||
|
from .trreb import TRREBParser
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"TRREBParser",
|
||||||
|
"CMHCParser",
|
||||||
|
]
|
||||||
|
|||||||
147
portfolio_app/toronto/parsers/cmhc.py
Normal file
147
portfolio_app/toronto/parsers/cmhc.py
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
"""CMHC CSV processor for rental market survey data.
|
||||||
|
|
||||||
|
This module provides the structure for processing CMHC (Canada Mortgage and Housing
|
||||||
|
Corporation) rental market survey data from CSV exports.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, cast
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from portfolio_app.toronto.schemas import CMHCAnnualSurvey, CMHCRentalRecord
|
||||||
|
|
||||||
|
|
||||||
|
class CMHCParser:
    """Parser for CMHC Rental Market Survey CSV data.

    CMHC conducts annual rental market surveys and publishes data including:
    - Average and median rents by zone and bedroom type
    - Vacancy rates
    - Universe (total rental units)
    - Year-over-year rent changes

    Data is available via the Housing Market Information Portal as CSV exports.
    """

    # Columns that must be present after header normalization.
    REQUIRED_COLUMNS = {
        "zone_code",
        "zone_name",
        "bedroom_type",
        "survey_year",
    }

    # CMHC export header -> CMHCRentalRecord field name.
    COLUMN_MAPPINGS = {
        "Zone Code": "zone_code",
        "Zone Name": "zone_name",
        "Bedroom Type": "bedroom_type",
        "Survey Year": "survey_year",
        "Universe": "universe",
        # The schema field is ``average_rent``; the previous target
        # ``avg_rent`` does not exist on CMHCRentalRecord.
        "Average Rent ($)": "average_rent",
        "Median Rent ($)": "median_rent",
        "Vacancy Rate (%)": "vacancy_rate",
        "Availability Rate (%)": "availability_rate",
        "Turnover Rate (%)": "turnover_rate",
        "% Change in Rent": "rent_change_pct",
        # NOTE(review): CMHCRentalRecord declares ``vacancy_rate_reliability``
        # and ``average_rent_reliability`` but no ``reliability_code``, so
        # this column presumably never reaches the record - confirm which
        # field the export's reliability code should populate.
        "Reliability Code": "reliability_code",
    }

    def __init__(self, csv_path: Path) -> None:
        """Initialize parser with path to CSV file.

        Args:
            csv_path: Path to the CMHC CSV export file.

        Raises:
            FileNotFoundError: If ``csv_path`` does not exist.
            ValueError: If ``csv_path`` does not have a ``.csv`` suffix.
        """
        self.csv_path = csv_path
        self._validate_path()

    def _validate_path(self) -> None:
        """Validate that the path exists and has a ``.csv`` suffix.

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the suffix is not ``.csv`` (case-insensitive).
        """
        if not self.csv_path.exists():
            raise FileNotFoundError(f"CSV not found: {self.csv_path}")
        if self.csv_path.suffix.lower() != ".csv":
            raise ValueError(f"Expected CSV file, got: {self.csv_path.suffix}")

    def parse(self) -> CMHCAnnualSurvey:
        """Parse the CSV and return structured data.

        Returns:
            CMHCAnnualSurvey containing all extracted records.

        Raises:
            ValueError: If required columns are missing or the file has no
                data rows to infer the survey year from.
        """
        df = self._load_csv()
        df = self._normalize_columns(df)
        self._validate_columns(df)
        records = self._convert_to_records(df)
        survey_year = self._infer_survey_year(df)

        return CMHCAnnualSurvey(survey_year=survey_year, records=records)

    def _load_csv(self) -> pd.DataFrame:
        """Load CSV file into DataFrame.

        Returns:
            Raw DataFrame from CSV.
        """
        return pd.read_csv(self.csv_path)

    def _normalize_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Rename known CMHC export headers to schema field names.

        Headers that are not keys of ``COLUMN_MAPPINGS`` are left untouched.

        Args:
            df: DataFrame with original column names.

        Returns:
            DataFrame with normalized column names.
        """
        rename_map = {k: v for k, v in self.COLUMN_MAPPINGS.items() if k in df.columns}
        return df.rename(columns=rename_map)

    def _validate_columns(self, df: pd.DataFrame) -> None:
        """Validate that all required columns are present.

        Args:
            df: DataFrame to validate.

        Raises:
            ValueError: If required columns are missing.
        """
        missing = self.REQUIRED_COLUMNS - set(df.columns)
        if missing:
            raise ValueError(f"Missing required columns: {missing}")

    def _convert_to_records(self, df: pd.DataFrame) -> list[CMHCRentalRecord]:
        """Convert DataFrame rows to validated schema records.

        Args:
            df: Normalized DataFrame.

        Returns:
            List of validated CMHCRentalRecord objects, one per row.
        """
        records: list[CMHCRentalRecord] = []
        for raw_row in df.to_dict(orient="records"):
            # pandas marks missing cells with NaN/NA; the schema expects None.
            record_data = {
                key: (None if pd.isna(value) else value)
                for key, value in raw_row.items()
            }
            records.append(CMHCRentalRecord(**cast(dict[str, Any], record_data)))
        return records

    def _infer_survey_year(self, df: pd.DataFrame) -> int:
        """Infer survey year from the first data row.

        Args:
            df: DataFrame with survey_year column.

        Returns:
            Survey year as integer.

        Raises:
            ValueError: If the column is absent or the frame has no rows.
        """
        # Guard the empty frame explicitly: ``iloc[0]`` would otherwise
        # surface as an IndexError instead of the documented ValueError.
        if "survey_year" not in df.columns or df.empty:
            raise ValueError("Cannot infer survey year from data.")
        return int(df["survey_year"].iloc[0])
|
||||||
82
portfolio_app/toronto/parsers/trreb.py
Normal file
82
portfolio_app/toronto/parsers/trreb.py
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
"""TRREB PDF parser for monthly market watch reports.
|
||||||
|
|
||||||
|
This module provides the structure for parsing TRREB (Toronto Regional Real Estate Board)
|
||||||
|
monthly Market Watch PDF reports into structured data.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from portfolio_app.toronto.schemas import TRREBMonthlyRecord, TRREBMonthlyReport
|
||||||
|
|
||||||
|
|
||||||
|
class TRREBParser:
    """Parser skeleton for TRREB Market Watch PDF reports.

    TRREB publishes monthly Market Watch reports as PDFs containing:
    - Summary statistics by area (416, 905, Total)
    - District-level breakdowns
    - Year-over-year comparisons

    Only path validation is implemented here; every extraction step
    currently raises ``NotImplementedError``.
    """

    def __init__(self, pdf_path: Path) -> None:
        """Store the report path and validate it immediately.

        Args:
            pdf_path: Path to the TRREB Market Watch PDF file.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
            ValueError: If ``pdf_path`` does not have a ``.pdf`` suffix.
        """
        self.pdf_path = pdf_path
        self._validate_path()

    def _validate_path(self) -> None:
        """Check that the path exists and has a ``.pdf`` suffix (any case)."""
        path = self.pdf_path
        if not path.exists():
            raise FileNotFoundError(f"PDF not found: {self.pdf_path}")
        suffix = path.suffix
        if suffix.lower() != ".pdf":
            raise ValueError(f"Expected PDF file, got: {self.pdf_path.suffix}")

    def parse(self) -> TRREBMonthlyReport:
        """Parse the PDF into a structured monthly report.

        Returns:
            TRREBMonthlyReport containing all extracted records.

        Raises:
            NotImplementedError: Always; PDF parsing is not implemented yet.
        """
        message = (
            "PDF parsing requires pdfplumber/tabula-py. "
            "Implementation pending Sprint 4 data ingestion."
        )
        raise NotImplementedError(message)

    def _extract_tables(self) -> list[dict[str, Any]]:
        """Extract raw tables from the PDF pages.

        Returns:
            List of dictionaries representing table data.

        Raises:
            NotImplementedError: Always; extraction is not implemented yet.
        """
        raise NotImplementedError("Table extraction not yet implemented.")

    def _parse_district_table(
        self,
        table_data: list[dict[str, Any]],
    ) -> list[TRREBMonthlyRecord]:
        """Turn the district-level statistics table into records.

        Args:
            table_data: Raw table data extracted from PDF.

        Returns:
            List of validated TRREBMonthlyRecord objects.

        Raises:
            NotImplementedError: Always; parsing is not implemented yet.
        """
        raise NotImplementedError("District table parsing not yet implemented.")

    def _infer_report_date(self) -> tuple[int, int]:
        """Work out the report year and month from the PDF name or content.

        Returns:
            Tuple of (year, month).

        Raises:
            NotImplementedError: Always; inference is not implemented yet.
        """
        raise NotImplementedError("Date inference not yet implemented.")
|
||||||
@@ -1 +1,39 @@
|
|||||||
"""Pydantic schemas for Toronto housing data validation."""
|
"""Pydantic schemas for Toronto housing data validation."""
|
||||||
|
|
||||||
|
from .cmhc import BedroomType, CMHCAnnualSurvey, CMHCRentalRecord, ReliabilityCode
|
||||||
|
from .dimensions import (
|
||||||
|
AreaType,
|
||||||
|
CMHCZone,
|
||||||
|
Confidence,
|
||||||
|
ExpectedDirection,
|
||||||
|
Neighbourhood,
|
||||||
|
PolicyCategory,
|
||||||
|
PolicyEvent,
|
||||||
|
PolicyLevel,
|
||||||
|
TimeDimension,
|
||||||
|
TRREBDistrict,
|
||||||
|
)
|
||||||
|
from .trreb import TRREBMonthlyRecord, TRREBMonthlyReport
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
# TRREB
|
||||||
|
"TRREBMonthlyRecord",
|
||||||
|
"TRREBMonthlyReport",
|
||||||
|
# CMHC
|
||||||
|
"CMHCRentalRecord",
|
||||||
|
"CMHCAnnualSurvey",
|
||||||
|
"BedroomType",
|
||||||
|
"ReliabilityCode",
|
||||||
|
# Dimensions
|
||||||
|
"TimeDimension",
|
||||||
|
"TRREBDistrict",
|
||||||
|
"CMHCZone",
|
||||||
|
"Neighbourhood",
|
||||||
|
"PolicyEvent",
|
||||||
|
# Enums
|
||||||
|
"AreaType",
|
||||||
|
"PolicyLevel",
|
||||||
|
"PolicyCategory",
|
||||||
|
"ExpectedDirection",
|
||||||
|
"Confidence",
|
||||||
|
]
|
||||||
|
|||||||
81
portfolio_app/toronto/schemas/cmhc.py
Normal file
81
portfolio_app/toronto/schemas/cmhc.py
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
"""Pydantic schemas for CMHC rental market data."""
|
||||||
|
|
||||||
|
from decimal import Decimal
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
|
||||||
|
class BedroomType(str, Enum):
    """Bedroom categories accepted for CMHC rental records.

    Members are also ``str`` instances, so they compare equal to their
    label text.
    """

    BACHELOR = "Bachelor"
    ONE_BED = "1 Bedroom"
    TWO_BED = "2 Bedroom"
    THREE_BED_PLUS = "3 Bedroom+"
    TOTAL = "Total"
|
||||||
|
|
||||||
|
|
||||||
|
class ReliabilityCode(str, Enum):
    """Reliability codes attached to CMHC estimates.

    The grades are based on the coefficient of variation (CV).
    """

    # CV <= 2.5%
    EXCELLENT = "a"
    # 2.5% < CV <= 5%
    GOOD = "b"
    # 5% < CV <= 10%
    FAIR = "c"
    # CV > 10%
    POOR = "d"
    # Sample too small
    SUPPRESSED = "**"
|
||||||
|
|
||||||
|
|
||||||
|
class CMHCRentalRecord(BaseModel):
    """One CMHC rental survey record.

    Holds the rental measures for a single zone and bedroom type in a
    single survey year. Only the four identifying fields are required;
    every measure defaults to ``None``.
    """

    # Leading/trailing whitespace is stripped from string inputs.
    model_config = {"str_strip_whitespace": True}

    # Identifying fields (required).
    survey_year: int = Field(ge=1990, description="Survey year (October snapshot)")
    zone_code: str = Field(max_length=10, description="CMHC zone identifier")
    zone_name: str = Field(max_length=100, description="Zone name")
    bedroom_type: BedroomType = Field(description="Bedroom category")

    # Optional measures; percentages are bounded to 0-100.
    universe: int | None = Field(
        default=None,
        ge=0,
        description="Total rental units in zone",
    )
    vacancy_rate: Decimal | None = Field(
        default=None,
        ge=0,
        le=100,
        description="Vacancy rate (%)",
    )
    vacancy_rate_reliability: ReliabilityCode | None = Field(default=None)
    availability_rate: Decimal | None = Field(
        default=None,
        ge=0,
        le=100,
        description="Availability rate (%)",
    )
    average_rent: Decimal | None = Field(
        default=None,
        ge=0,
        description="Average monthly rent ($)",
    )
    average_rent_reliability: ReliabilityCode | None = Field(default=None)
    median_rent: Decimal | None = Field(
        default=None,
        ge=0,
        description="Median monthly rent ($)",
    )
    # Unbounded: a year-over-year change may be negative.
    rent_change_pct: Decimal | None = Field(
        default=None,
        description="YoY rent change (%)",
    )
    turnover_rate: Decimal | None = Field(
        default=None,
        ge=0,
        le=100,
        description="Unit turnover rate (%)",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class CMHCAnnualSurvey(BaseModel):
    """A complete CMHC annual survey for Toronto.

    Bundles the zone and bedroom type records of one survey year.
    """

    survey_year: int
    records: list[CMHCRentalRecord]

    @property
    def zone_count(self) -> int:
        """Return how many distinct zone codes occur in ``records``."""
        distinct_codes = set()
        for record in self.records:
            distinct_codes.add(record.zone_code)
        return len(distinct_codes)
|
||||||
121
portfolio_app/toronto/schemas/dimensions.py
Normal file
121
portfolio_app/toronto/schemas/dimensions.py
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
"""Pydantic schemas for dimension tables."""
|
||||||
|
|
||||||
|
from datetime import date
|
||||||
|
from decimal import Decimal
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
from pydantic import BaseModel, Field, HttpUrl
|
||||||
|
|
||||||
|
|
||||||
|
class PolicyLevel(str, Enum):
    """Level of government a policy event originates from.

    Members are also ``str`` instances equal to their lowercase label.
    """

    FEDERAL = "federal"
    PROVINCIAL = "provincial"
    MUNICIPAL = "municipal"
|
||||||
|
|
||||||
|
|
||||||
|
class PolicyCategory(str, Enum):
    """Category a policy event is filed under.

    Members are also ``str`` instances equal to their lowercase label.
    """

    MONETARY = "monetary"
    TAX = "tax"
    REGULATORY = "regulatory"
    SUPPLY = "supply"
    ECONOMIC = "economic"
|
||||||
|
|
||||||
|
|
||||||
|
class ExpectedDirection(str, Enum):
    """Direction in which an event is expected to move prices."""

    # Expected to increase prices
    BULLISH = "bullish"
    # Expected to decrease prices
    BEARISH = "bearish"
    # Uncertain or mixed impact
    NEUTRAL = "neutral"
|
||||||
|
|
||||||
|
|
||||||
|
class Confidence(str, Enum):
    """How much confidence is placed in a policy event's data.

    Members are also ``str`` instances equal to their lowercase label.
    """

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
|
||||||
|
|
||||||
|
|
||||||
|
class AreaType(str, Enum):
    """Area grouping a TRREB district belongs to.

    Members are also ``str`` instances equal to their capitalised label.
    """

    WEST = "West"
    CENTRAL = "Central"
    EAST = "East"
    NORTH = "North"
|
||||||
|
|
||||||
|
|
||||||
|
class TimeDimension(BaseModel):
    """Validated input for one time dimension row."""

    date_key: int = Field(description="Date key in YYYYMMDD format")
    full_date: date

    # Calendar parts, range-checked independently of ``full_date``.
    year: int = Field(ge=2000, le=2100)
    month: int = Field(ge=1, le=12)
    quarter: int = Field(ge=1, le=4)
    month_name: str = Field(max_length=20)

    # Defaults to True when omitted.
    is_month_start: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
class TRREBDistrict(BaseModel):
    """Validated input for one TRREB district dimension row."""

    district_code: str = Field(max_length=3, description="W01, C01, E01, etc.")
    district_name: str = Field(max_length=100)
    area_type: AreaType
    # Optional boundary, carried as text.
    geometry_wkt: str | None = Field(
        default=None,
        description="WKT geometry string",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class CMHCZone(BaseModel):
    """Validated input for one CMHC zone dimension row."""

    zone_code: str = Field(max_length=10)
    zone_name: str = Field(max_length=100)
    # Optional boundary, carried as text.
    geometry_wkt: str | None = Field(
        default=None,
        description="WKT geometry string",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class Neighbourhood(BaseModel):
    """Validated input for one City of Toronto neighbourhood row.

    Note: in V1 no fact table has a foreign key to neighbourhoods; the
    dimension is a reference overlay only.
    """

    # Identity and shape.
    neighbourhood_id: int = Field(ge=1, le=200)
    name: str = Field(max_length=100)
    geometry_wkt: str | None = Field(default=None)

    # Optional attributes; all non-negative, percentages capped at 100.
    population: int | None = Field(default=None, ge=0)
    land_area_sqkm: Decimal | None = Field(default=None, ge=0)
    pop_density_per_sqkm: Decimal | None = Field(default=None, ge=0)
    pct_bachelors_or_higher: Decimal | None = Field(default=None, ge=0, le=100)
    median_household_income: Decimal | None = Field(default=None, ge=0)
    pct_owner_occupied: Decimal | None = Field(default=None, ge=0, le=100)
    pct_renter_occupied: Decimal | None = Field(default=None, ge=0, le=100)

    census_year: int = Field(
        default=2021,
        description="Census year for SCD tracking",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class PolicyEvent(BaseModel):
    """Validated input for one policy event dimension row.

    Events annotate time series; they make no causation claims.
    """

    # Leading/trailing whitespace is stripped from string inputs.
    model_config = {"str_strip_whitespace": True}

    # Timing.
    event_date: date = Field(description="Date event was announced/occurred")
    effective_date: date | None = Field(
        default=None,
        description="Date policy took effect",
    )

    # Classification.
    level: PolicyLevel
    category: PolicyCategory

    # Display text.
    title: str = Field(max_length=200, description="Short event title for display")
    description: str | None = Field(
        default=None,
        description="Longer description for tooltip",
    )

    expected_direction: ExpectedDirection
    source_url: HttpUrl | None = Field(default=None)
    # Falls back to MEDIUM when no confidence is given.
    confidence: Confidence = Field(default=Confidence.MEDIUM)
|
||||||
52
portfolio_app/toronto/schemas/trreb.py
Normal file
52
portfolio_app/toronto/schemas/trreb.py
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
"""Pydantic schemas for TRREB monthly market data."""
|
||||||
|
|
||||||
|
from datetime import date
|
||||||
|
from decimal import Decimal
|
||||||
|
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
|
||||||
|
class TRREBMonthlyRecord(BaseModel):
    """One TRREB monthly summary record.

    Holds the aggregated sales measures for a single district in a single
    month. Every field is required.
    """

    # Leading/trailing whitespace is stripped from string inputs.
    model_config = {"str_strip_whitespace": True}

    # Identifying fields.
    report_date: date = Field(description="First of month (YYYY-MM-01)")
    area_code: str = Field(
        max_length=3,
        description="District code (W01, C01, E01, etc.)",
    )
    area_name: str = Field(max_length=100, description="District name")
    area_type: str = Field(
        max_length=10,
        description="West / Central / East / North",
    )

    # Measures; all must be non-negative.
    sales: int = Field(ge=0, description="Number of transactions")
    dollar_volume: Decimal = Field(ge=0, description="Total sales volume ($)")
    avg_price: Decimal = Field(ge=0, description="Average sale price ($)")
    median_price: Decimal = Field(ge=0, description="Median sale price ($)")
    new_listings: int = Field(ge=0, description="New listings count")
    active_listings: int = Field(ge=0, description="Active listings at month end")
    # Ratio expressed as a percentage, capped at 200.
    avg_sp_lp: Decimal = Field(
        ge=0,
        le=200,
        description="Avg sale price / list price ratio (%)",
    )
    avg_dom: int = Field(ge=0, description="Average days on market")
|
||||||
|
|
||||||
|
|
||||||
|
class TRREBMonthlyReport(BaseModel):
    """A complete TRREB monthly report.

    Bundles the district records of a single month.
    """

    report_date: date
    records: list[TRREBMonthlyRecord]

    @property
    def total_sales(self) -> int:
        """Return the sum of ``sales`` over all records (0 when empty)."""
        running_total = 0
        for record in self.records:
            running_total += record.sales
        return running_total

    @property
    def district_count(self) -> int:
        """Return the number of records held by the report."""
        record_list = self.records
        return len(record_list)
|
||||||
Reference in New Issue
Block a user