From cb877df9e134e1bb7d3300739bdbd4b238abf2da Mon Sep 17 00:00:00 2001 From: lmiranda Date: Fri, 16 Jan 2026 10:00:47 -0500 Subject: [PATCH] refactor: Delete legacy TRREB Python modules (#47) - Delete portfolio_app/toronto/schemas/trreb.py - Delete portfolio_app/toronto/parsers/trreb.py - Delete portfolio_app/toronto/loaders/trreb.py - Remove TRREB imports from __init__.py files Part of Sprint 9: Toronto Neighbourhood Dashboard transition See docs/changes/Change-Toronto-Analysis-Reviewed.md Co-Authored-By: Claude Opus 4.5 --- portfolio_app/toronto/loaders/__init__.py | 3 - portfolio_app/toronto/loaders/trreb.py | 129 ---------------------- portfolio_app/toronto/parsers/__init__.py | 2 - portfolio_app/toronto/parsers/trreb.py | 82 -------------- portfolio_app/toronto/schemas/__init__.py | 4 - portfolio_app/toronto/schemas/trreb.py | 52 --------- 6 files changed, 272 deletions(-) delete mode 100644 portfolio_app/toronto/loaders/trreb.py delete mode 100644 portfolio_app/toronto/parsers/trreb.py delete mode 100644 portfolio_app/toronto/schemas/trreb.py diff --git a/portfolio_app/toronto/loaders/__init__.py b/portfolio_app/toronto/loaders/__init__.py index 1b47b25..7b14f5f 100644 --- a/portfolio_app/toronto/loaders/__init__.py +++ b/portfolio_app/toronto/loaders/__init__.py @@ -10,7 +10,6 @@ from .dimensions import ( load_time_dimension, load_trreb_districts, ) -from .trreb import load_trreb_purchases, load_trreb_record __all__ = [ # Base utilities @@ -25,8 +24,6 @@ __all__ = [ "load_neighbourhoods", "load_policy_events", # Fact loaders - "load_trreb_purchases", - "load_trreb_record", "load_cmhc_rentals", "load_cmhc_record", ] diff --git a/portfolio_app/toronto/loaders/trreb.py b/portfolio_app/toronto/loaders/trreb.py deleted file mode 100644 index 06e4c8b..0000000 --- a/portfolio_app/toronto/loaders/trreb.py +++ /dev/null @@ -1,129 +0,0 @@ -"""Loader for TRREB purchase data into fact_purchases.""" - -from sqlalchemy.orm import Session - -from portfolio_app.toronto.models import DimTime, DimTRREBDistrict, FactPurchases -from portfolio_app.toronto.schemas import TRREBMonthlyRecord, TRREBMonthlyReport - -from .base import get_session, upsert_by_key -from .dimensions import generate_date_key - - -def load_trreb_purchases( - report: TRREBMonthlyReport, - session: Session | None = None, -) -> int: - """Load TRREB monthly report data into fact_purchases. - - Args: - report: Validated TRREB monthly report containing records. - session: Optional existing session. - - Returns: - Number of records loaded. - """ - - def _load(sess: Session) -> int: - # Get district key mapping - districts = sess.query(DimTRREBDistrict).all() - district_map = {d.district_code: d.district_key for d in districts} - - # Build date key from report date - date_key = generate_date_key(report.report_date) - - # Verify time dimension exists - time_dim = sess.query(DimTime).filter_by(date_key=date_key).first() - if not time_dim: - raise ValueError( - f"Time dimension not found for date_key {date_key}. " - "Load time dimension first." - ) - - records = [] - for record in report.records: - district_key = district_map.get(record.area_code) - if not district_key: - # Skip records for unknown districts (e.g., aggregate rows) - continue - - fact = FactPurchases( - date_key=date_key, - district_key=district_key, - sales_count=record.sales, - dollar_volume=record.dollar_volume, - avg_price=record.avg_price, - median_price=record.median_price, - new_listings=record.new_listings, - active_listings=record.active_listings, - avg_dom=record.avg_dom, - avg_sp_lp=record.avg_sp_lp, - ) - records.append(fact) - - inserted, updated = upsert_by_key( - sess, FactPurchases, records, ["date_key", "district_key"] - ) - return inserted + updated - - if session: - return _load(session) - with get_session() as sess: - return _load(sess) - - -def load_trreb_record( - record: TRREBMonthlyRecord, - session: Session | None = None, -) -> int: - """Load a single TRREB record into fact_purchases. - - Args: - record: Single validated TRREB monthly record. - session: Optional existing session. - - Returns: - Number of records loaded (0 or 1). - """ - - def _load(sess: Session) -> int: - # Get district key - district = ( - sess.query(DimTRREBDistrict) - .filter_by(district_code=record.area_code) - .first() - ) - if not district: - return 0 - - date_key = generate_date_key(record.report_date) - - # Verify time dimension exists - time_dim = sess.query(DimTime).filter_by(date_key=date_key).first() - if not time_dim: - raise ValueError( - f"Time dimension not found for date_key {date_key}. " - "Load time dimension first." - ) - - fact = FactPurchases( - date_key=date_key, - district_key=district.district_key, - sales_count=record.sales, - dollar_volume=record.dollar_volume, - avg_price=record.avg_price, - median_price=record.median_price, - new_listings=record.new_listings, - active_listings=record.active_listings, - avg_dom=record.avg_dom, - avg_sp_lp=record.avg_sp_lp, - ) - - inserted, updated = upsert_by_key( - sess, FactPurchases, [fact], ["date_key", "district_key"] - ) - return inserted + updated - - if session: - return _load(session) - with get_session() as sess: - return _load(sess) diff --git a/portfolio_app/toronto/parsers/__init__.py b/portfolio_app/toronto/parsers/__init__.py index 2d33037..2c3284a 100644 --- a/portfolio_app/toronto/parsers/__init__.py +++ b/portfolio_app/toronto/parsers/__init__.py @@ -7,10 +7,8 @@ from .geo import ( TRREBDistrictParser, load_geojson, ) -from .trreb import TRREBParser __all__ = [ - "TRREBParser", "CMHCParser", # GeoJSON parsers "CMHCZoneParser", diff --git a/portfolio_app/toronto/parsers/trreb.py b/portfolio_app/toronto/parsers/trreb.py deleted file mode 100644 index fad5869..0000000 --- a/portfolio_app/toronto/parsers/trreb.py +++ /dev/null @@ -1,82 +0,0 @@ -"""TRREB PDF parser for monthly market watch reports. - -This module provides the structure for parsing TRREB (Toronto Regional Real Estate Board) -monthly Market Watch PDF reports into structured data. -""" - -from pathlib import Path -from typing import Any - -from portfolio_app.toronto.schemas import TRREBMonthlyRecord, TRREBMonthlyReport - - -class TRREBParser: - """Parser for TRREB Market Watch PDF reports. - - TRREB publishes monthly Market Watch reports as PDFs containing: - - Summary statistics by area (416, 905, Total) - - District-level breakdowns - - Year-over-year comparisons - - The parser extracts tabular data from these PDFs and validates - against the TRREBMonthlyRecord schema. - """ - - def __init__(self, pdf_path: Path) -> None: - """Initialize parser with path to PDF file. - - Args: - pdf_path: Path to the TRREB Market Watch PDF file. - """ - self.pdf_path = pdf_path - self._validate_path() - - def _validate_path(self) -> None: - """Validate that the PDF path exists and is readable.""" - if not self.pdf_path.exists(): - raise FileNotFoundError(f"PDF not found: {self.pdf_path}") - if not self.pdf_path.suffix.lower() == ".pdf": - raise ValueError(f"Expected PDF file, got: {self.pdf_path.suffix}") - - def parse(self) -> TRREBMonthlyReport: - """Parse the PDF and return structured data. - - Returns: - TRREBMonthlyReport containing all extracted records. - - Raises: - NotImplementedError: PDF parsing not yet implemented. - """ - raise NotImplementedError( - "PDF parsing requires pdfplumber/tabula-py. " - "Implementation pending Sprint 4 data ingestion." - ) - - def _extract_tables(self) -> list[dict[str, Any]]: - """Extract raw tables from PDF pages. - - Returns: - List of dictionaries representing table data. - """ - raise NotImplementedError("Table extraction not yet implemented.") - - def _parse_district_table( - self, table_data: list[dict[str, Any]] - ) -> list[TRREBMonthlyRecord]: - """Parse district-level statistics table. - - Args: - table_data: Raw table data extracted from PDF. - - Returns: - List of validated TRREBMonthlyRecord objects. - """ - raise NotImplementedError("District table parsing not yet implemented.") - - def _infer_report_date(self) -> tuple[int, int]: - """Infer report year and month from PDF filename or content. - - Returns: - Tuple of (year, month). - """ - raise NotImplementedError("Date inference not yet implemented.") diff --git a/portfolio_app/toronto/schemas/__init__.py b/portfolio_app/toronto/schemas/__init__.py index 1d33f3e..ae5dda0 100644 --- a/portfolio_app/toronto/schemas/__init__.py +++ b/portfolio_app/toronto/schemas/__init__.py @@ -13,12 +13,8 @@ from .dimensions import ( TimeDimension, TRREBDistrict, ) -from .trreb import TRREBMonthlyRecord, TRREBMonthlyReport __all__ = [ - # TRREB - "TRREBMonthlyRecord", - "TRREBMonthlyReport", # CMHC "CMHCRentalRecord", "CMHCAnnualSurvey", diff --git a/portfolio_app/toronto/schemas/trreb.py b/portfolio_app/toronto/schemas/trreb.py deleted file mode 100644 index e972ff6..0000000 --- a/portfolio_app/toronto/schemas/trreb.py +++ /dev/null @@ -1,52 +0,0 @@ -"""Pydantic schemas for TRREB monthly market data.""" - -from datetime import date -from decimal import Decimal - -from pydantic import BaseModel, Field - - -class TRREBMonthlyRecord(BaseModel): - """Schema for a single TRREB monthly summary record. - - Represents aggregated sales data for one district in one month. - """ - - report_date: date = Field(description="First of month (YYYY-MM-01)") - area_code: str = Field( - max_length=3, description="District code (W01, C01, E01, etc.)" - ) - area_name: str = Field(max_length=100, description="District name") - area_type: str = Field(max_length=10, description="West / Central / East / North") - sales: int = Field(ge=0, description="Number of transactions") - dollar_volume: Decimal = Field(ge=0, description="Total sales volume ($)") - avg_price: Decimal = Field(ge=0, description="Average sale price ($)") - median_price: Decimal = Field(ge=0, description="Median sale price ($)") - new_listings: int = Field(ge=0, description="New listings count") - active_listings: int = Field(ge=0, description="Active listings at month end") - avg_sp_lp: Decimal = Field( - ge=0, le=200, description="Avg sale price / list price ratio (%)" - ) - avg_dom: int = Field(ge=0, description="Average days on market") - - model_config = {"str_strip_whitespace": True} - - -class TRREBMonthlyReport(BaseModel): - """Schema for a complete TRREB monthly report. - - Contains all district records for a single month. - """ - - report_date: date - records: list[TRREBMonthlyRecord] - - @property - def total_sales(self) -> int: - """Total sales across all districts.""" - return sum(r.sales for r in self.records) - - @property - def district_count(self) -> int: - """Number of districts in report.""" - return len(self.records)