feat: add Pydantic schemas, SQLAlchemy models, and parser structure

Sprint 3 implementation:
- Pydantic schemas for TRREB, CMHC, and dimension data validation
- SQLAlchemy models with PostGIS geometry for fact and dimension tables
- Parser structure (stubs) for TRREB PDF and CMHC CSV processing

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-11 14:58:31 -05:00
parent 549e1fcbaf
commit ead6d91a28
11 changed files with 760 additions and 1 deletions

View File

@@ -1 +1,28 @@
"""SQLAlchemy models for Toronto housing data."""
from .base import Base, create_tables, get_engine, get_session_factory
from .dimensions import (
DimCMHCZone,
DimNeighbourhood,
DimPolicyEvent,
DimTime,
DimTRREBDistrict,
)
from .facts import FactPurchases, FactRentals
# Public API of the models package: every name listed here is re-exported
# from the ``base``, ``dimensions`` or ``facts`` submodule imported above.
__all__ = [
    # Base: declarative base class plus engine / session / table helpers
    "Base",
    "get_engine",
    "get_session_factory",
    "create_tables",
    # Dimensions: ORM classes defined in ``.dimensions``
    "DimTime",
    "DimTRREBDistrict",
    "DimCMHCZone",
    "DimNeighbourhood",
    "DimPolicyEvent",
    # Facts: ORM classes defined in ``.facts``
    "FactPurchases",
    "FactRentals",
]

View File

@@ -0,0 +1,30 @@
"""SQLAlchemy base configuration and engine setup."""
from sqlalchemy import Engine, create_engine
from sqlalchemy.orm import DeclarativeBase, Session, sessionmaker
from portfolio_app.config import get_settings
class Base(DeclarativeBase):  # type: ignore[misc]
    """Declarative root that every ORM model in this package inherits from.

    Tables declared on subclasses are registered on ``Base.metadata``, which
    is what ``create_tables`` uses to create the schema.
    """
def get_engine() -> Engine:
    """Build a SQLAlchemy engine for the configured database.

    Returns:
        An ``Engine`` created from ``get_settings().database_url`` with SQL
        echo disabled. ``create_engine`` is invoked on every call, so each
        call yields a separate engine object.
    """
    database_url = get_settings().database_url
    return create_engine(database_url, echo=False)
def get_session_factory() -> sessionmaker[Session]:
    """Return a ``sessionmaker`` bound to an engine from ``get_engine()``.

    Returns:
        A session factory; calling it produces ``Session`` objects that use
        the engine obtained at the time this function was called.
    """
    return sessionmaker(bind=get_engine())
def create_tables() -> None:
    """Create every table registered on ``Base.metadata``.

    The engine comes from ``get_engine()``; only models whose modules have
    been imported (and are therefore registered on the metadata) are created.
    """
    Base.metadata.create_all(bind=get_engine())

View File

@@ -0,0 +1,104 @@
"""SQLAlchemy models for dimension tables."""
from datetime import date
from geoalchemy2 import Geometry
from sqlalchemy import Boolean, Date, Integer, Numeric, String, Text
from sqlalchemy.orm import Mapped, mapped_column
from .base import Base
class DimTime(Base):
    """Calendar dimension stored in ``dim_time``.

    ``full_date`` is unique, so each calendar date appears at most once.
    """

    __tablename__ = "dim_time"

    # Integer primary key referenced by the fact tables.
    # NOTE(review): presumably a YYYYMMDD-style key - confirm with the loader.
    date_key: Mapped[int] = mapped_column(Integer, primary_key=True)
    full_date: Mapped[date] = mapped_column(Date, unique=True, nullable=False)

    # Calendar attributes derived from ``full_date``; all are required.
    year: Mapped[int] = mapped_column(Integer, nullable=False)
    month: Mapped[int] = mapped_column(Integer, nullable=False)
    quarter: Mapped[int] = mapped_column(Integer, nullable=False)
    month_name: Mapped[str] = mapped_column(String(20), nullable=False)

    # Falls back to True on insert when no value is supplied.
    is_month_start: Mapped[bool] = mapped_column(Boolean, default=True)
class DimTRREBDistrict(Base):
    """TRREB district dimension (``dim_trreb_district``) with a PostGIS shape."""

    __tablename__ = "dim_trreb_district"

    # Auto-incrementing surrogate key referenced by ``fact_purchases``.
    district_key: Mapped[int] = mapped_column(
        Integer, autoincrement=True, primary_key=True
    )

    # Natural identifier: at most 3 characters and unique across districts.
    district_code: Mapped[str] = mapped_column(String(3), unique=True, nullable=False)
    district_name: Mapped[str] = mapped_column(String(100), nullable=False)
    area_type: Mapped[str] = mapped_column(String(10), nullable=False)

    # Optional district boundary: POLYGON in SRID 4326.
    geometry = mapped_column(Geometry("POLYGON", srid=4326), nullable=True)
class DimCMHCZone(Base):
    """CMHC zone dimension (``dim_cmhc_zone``) with a PostGIS shape."""

    __tablename__ = "dim_cmhc_zone"

    # Auto-incrementing surrogate key referenced by ``fact_rentals``.
    zone_key: Mapped[int] = mapped_column(Integer, autoincrement=True, primary_key=True)

    # Natural identifier: at most 10 characters and unique across zones.
    zone_code: Mapped[str] = mapped_column(String(10), unique=True, nullable=False)
    zone_name: Mapped[str] = mapped_column(String(100), nullable=False)

    # Optional zone boundary: POLYGON in SRID 4326.
    geometry = mapped_column(Geometry("POLYGON", srid=4326), nullable=True)
class DimNeighbourhood(Base):
    """City of Toronto neighbourhood dimension (``dim_neighbourhood``).

    Reference overlay only: in V1 no fact table holds a foreign key to it.

    NOTE(review): the ``Numeric`` columns below are annotated as ``float``,
    but SQLAlchemy's ``Numeric`` yields ``decimal.Decimal`` unless
    ``asdecimal=False`` is set - confirm what callers expect.
    """

    __tablename__ = "dim_neighbourhood"

    # Integer primary key.
    # NOTE(review): presumably the City's own neighbourhood number - confirm.
    neighbourhood_id: Mapped[int] = mapped_column(Integer, primary_key=True)
    name: Mapped[str] = mapped_column(String(100), nullable=False)

    # Optional neighbourhood boundary: POLYGON in SRID 4326.
    geometry = mapped_column(Geometry("POLYGON", srid=4326), nullable=True)

    # Census-style attributes; every one of them may be NULL.
    population: Mapped[int | None] = mapped_column(Integer, nullable=True)
    land_area_sqkm: Mapped[float | None] = mapped_column(
        Numeric(10, 4), nullable=True
    )
    pop_density_per_sqkm: Mapped[float | None] = mapped_column(
        Numeric(10, 2), nullable=True
    )
    pct_bachelors_or_higher: Mapped[float | None] = mapped_column(
        Numeric(5, 2), nullable=True
    )
    median_household_income: Mapped[float | None] = mapped_column(
        Numeric(12, 2), nullable=True
    )
    pct_owner_occupied: Mapped[float | None] = mapped_column(
        Numeric(5, 2), nullable=True
    )
    pct_renter_occupied: Mapped[float | None] = mapped_column(
        Numeric(5, 2), nullable=True
    )

    # Falls back to 2021 on insert when no value is supplied.
    census_year: Mapped[int] = mapped_column(Integer, default=2021)
class DimPolicyEvent(Base):
    """Policy events (``dim_policy_event``) used to annotate time series.

    The value lists in the comments below are documentation only; no CHECK
    constraint or enum type is declared on these string columns.
    """

    __tablename__ = "dim_policy_event"

    event_id: Mapped[int] = mapped_column(Integer, autoincrement=True, primary_key=True)

    # ``event_date`` is required; ``effective_date`` may be NULL.
    event_date: Mapped[date] = mapped_column(Date, nullable=False)
    effective_date: Mapped[date | None] = mapped_column(Date, nullable=True)

    # federal/provincial/municipal
    level: Mapped[str] = mapped_column(String(20), nullable=False)
    # monetary/tax/regulatory/supply/economic
    category: Mapped[str] = mapped_column(String(20), nullable=False)

    title: Mapped[str] = mapped_column(String(200), nullable=False)
    description: Mapped[str | None] = mapped_column(Text, nullable=True)

    # bearish/bullish/neutral
    expected_direction: Mapped[str] = mapped_column(String(10), nullable=False)
    source_url: Mapped[str | None] = mapped_column(String(500), nullable=True)

    # high/medium/low; falls back to "medium" on insert when not supplied.
    confidence: Mapped[str] = mapped_column(String(10), default="medium")

View File

@@ -0,0 +1,69 @@
"""SQLAlchemy models for fact tables."""
from sqlalchemy import ForeignKey, Integer, Numeric, String
from sqlalchemy.orm import Mapped, mapped_column, relationship
from .base import Base
class FactPurchases(Base):
    """TRREB purchase/sales facts stored in ``fact_purchases``.

    Grain: one row per district per month. The grain is a convention only;
    this class declares no unique constraint on ``(date_key, district_key)``.

    NOTE(review): the ``Numeric`` measures are annotated as ``float``, but
    SQLAlchemy's ``Numeric`` yields ``decimal.Decimal`` unless
    ``asdecimal=False`` is set - confirm what callers expect.
    """

    __tablename__ = "fact_purchases"

    id: Mapped[int] = mapped_column(Integer, autoincrement=True, primary_key=True)

    # Required foreign keys into the time and district dimensions.
    date_key: Mapped[int] = mapped_column(
        Integer,
        ForeignKey("dim_time.date_key"),
        nullable=False,
    )
    district_key: Mapped[int] = mapped_column(
        Integer,
        ForeignKey("dim_trreb_district.district_key"),
        nullable=False,
    )

    # Measures; every one is required.
    sales_count: Mapped[int] = mapped_column(Integer, nullable=False)
    dollar_volume: Mapped[float] = mapped_column(Numeric(15, 2), nullable=False)
    avg_price: Mapped[float] = mapped_column(Numeric(12, 2), nullable=False)
    median_price: Mapped[float] = mapped_column(Numeric(12, 2), nullable=False)
    new_listings: Mapped[int] = mapped_column(Integer, nullable=False)
    active_listings: Mapped[int] = mapped_column(Integer, nullable=False)
    # Days on market
    avg_dom: Mapped[int] = mapped_column(Integer, nullable=False)
    # Sale/List ratio
    avg_sp_lp: Mapped[float] = mapped_column(Numeric(5, 2), nullable=False)

    # Relationships; each backref adds a ``purchases`` attribute to the
    # related dimension class.
    time = relationship("DimTime", backref="purchases")
    district = relationship("DimTRREBDistrict", backref="purchases")
class FactRentals(Base):
    """CMHC rental market facts stored in ``fact_rentals``.

    Grain: one row per zone per bedroom type per survey year. The grain is a
    convention only; this class declares no unique constraint over
    ``(date_key, zone_key, bedroom_type)``.

    NOTE(review): the ``Numeric`` measures are annotated as ``float``, but
    SQLAlchemy's ``Numeric`` yields ``decimal.Decimal`` unless
    ``asdecimal=False`` is set - confirm what callers expect.
    """

    __tablename__ = "fact_rentals"

    id: Mapped[int] = mapped_column(Integer, autoincrement=True, primary_key=True)

    # Required foreign keys into the time and zone dimensions.
    date_key: Mapped[int] = mapped_column(
        Integer,
        ForeignKey("dim_time.date_key"),
        nullable=False,
    )
    zone_key: Mapped[int] = mapped_column(
        Integer,
        ForeignKey("dim_cmhc_zone.zone_key"),
        nullable=False,
    )
    bedroom_type: Mapped[str] = mapped_column(String(20), nullable=False)

    # Measures; every one may be NULL.
    # NOTE(review): ``universe`` is presumably the number of units in the
    # survey universe - confirm against the CMHC source.
    universe: Mapped[int | None] = mapped_column(Integer, nullable=True)
    avg_rent: Mapped[float | None] = mapped_column(Numeric(10, 2), nullable=True)
    median_rent: Mapped[float | None] = mapped_column(Numeric(10, 2), nullable=True)
    vacancy_rate: Mapped[float | None] = mapped_column(Numeric(5, 2), nullable=True)
    availability_rate: Mapped[float | None] = mapped_column(
        Numeric(5, 2), nullable=True
    )
    turnover_rate: Mapped[float | None] = mapped_column(Numeric(5, 2), nullable=True)
    rent_change_pct: Mapped[float | None] = mapped_column(Numeric(5, 2), nullable=True)
    reliability_code: Mapped[str | None] = mapped_column(String(2), nullable=True)

    # Relationships; each backref adds a ``rentals`` attribute to the
    # related dimension class.
    time = relationship("DimTime", backref="rentals")
    zone = relationship("DimCMHCZone", backref="rentals")