refactor: Delete legacy TRREB Python modules (#47)
- Delete portfolio_app/toronto/schemas/trreb.py
- Delete portfolio_app/toronto/parsers/trreb.py
- Delete portfolio_app/toronto/loaders/trreb.py
- Remove TRREB imports from __init__.py files

Part of Sprint 9: Toronto Neighbourhood Dashboard transition
See docs/changes/Change-Toronto-Analysis-Reviewed.md

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -10,7 +10,6 @@ from .dimensions import (
|
|||||||
load_time_dimension,
|
load_time_dimension,
|
||||||
load_trreb_districts,
|
load_trreb_districts,
|
||||||
)
|
)
|
||||||
from .trreb import load_trreb_purchases, load_trreb_record
|
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
# Base utilities
|
# Base utilities
|
||||||
@@ -25,8 +24,6 @@ __all__ = [
|
|||||||
"load_neighbourhoods",
|
"load_neighbourhoods",
|
||||||
"load_policy_events",
|
"load_policy_events",
|
||||||
# Fact loaders
|
# Fact loaders
|
||||||
"load_trreb_purchases",
|
|
||||||
"load_trreb_record",
|
|
||||||
"load_cmhc_rentals",
|
"load_cmhc_rentals",
|
||||||
"load_cmhc_record",
|
"load_cmhc_record",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -1,129 +0,0 @@
|
|||||||
"""Loader for TRREB purchase data into fact_purchases."""
|
|
||||||
|
|
||||||
from sqlalchemy.orm import Session
|
|
||||||
|
|
||||||
from portfolio_app.toronto.models import DimTime, DimTRREBDistrict, FactPurchases
|
|
||||||
from portfolio_app.toronto.schemas import TRREBMonthlyRecord, TRREBMonthlyReport
|
|
||||||
|
|
||||||
from .base import get_session, upsert_by_key
|
|
||||||
from .dimensions import generate_date_key
|
|
||||||
|
|
||||||
|
|
||||||
def load_trreb_purchases(
    report: TRREBMonthlyReport,
    session: Session | None = None,
) -> int:
    """Load TRREB monthly report data into fact_purchases.

    All records are stored under the single date key derived from
    ``report.report_date``. Records whose ``area_code`` has no matching
    district row are skipped rather than treated as errors.

    Args:
        report: Validated TRREB monthly report containing records.
        session: Optional existing session. When omitted, a session is
            obtained from ``get_session()`` for the duration of the load.

    Returns:
        Number of records loaded (inserted plus updated).

    Raises:
        ValueError: If no time dimension row exists for the report's
            date key.
    """

    def _load(sess: Session) -> int:
        # Map district codes to their dimension keys with a single query.
        district_map = {
            d.district_code: d.district_key
            for d in sess.query(DimTRREBDistrict).all()
        }

        # Build date key from report date
        date_key = generate_date_key(report.report_date)

        # Verify time dimension exists before building any facts.
        time_dim = sess.query(DimTime).filter_by(date_key=date_key).first()
        if time_dim is None:
            raise ValueError(
                f"Time dimension not found for date_key {date_key}. "
                "Load time dimension first."
            )

        facts = []
        for record in report.records:
            district_key = district_map.get(record.area_code)
            # Compare against None explicitly so a valid but falsy key
            # (e.g. 0) is not mistaken for an unknown district.
            if district_key is None:
                # Skip records for unknown districts (e.g., aggregate rows)
                continue

            facts.append(
                FactPurchases(
                    date_key=date_key,
                    district_key=district_key,
                    sales_count=record.sales,
                    dollar_volume=record.dollar_volume,
                    avg_price=record.avg_price,
                    median_price=record.median_price,
                    new_listings=record.new_listings,
                    active_listings=record.active_listings,
                    avg_dom=record.avg_dom,
                    avg_sp_lp=record.avg_sp_lp,
                )
            )

        inserted, updated = upsert_by_key(
            sess, FactPurchases, facts, ["date_key", "district_key"]
        )
        return inserted + updated

    # Reuse the caller's session when one was supplied; test for None
    # rather than relying on the session object's truthiness.
    if session is not None:
        return _load(session)
    with get_session() as sess:
        return _load(sess)
|
|
||||||
|
|
||||||
|
|
||||||
def load_trreb_record(
    record: TRREBMonthlyRecord,
    session: Session | None = None,
) -> int:
    """Load a single TRREB record into fact_purchases.

    Args:
        record: Single validated TRREB monthly record.
        session: Optional existing session. When omitted, a session is
            obtained from ``get_session()`` for the duration of the load.

    Returns:
        Number of records loaded (inserted plus updated), or 0 when the
        record's ``area_code`` matches no district row.

    Raises:
        ValueError: If no time dimension row exists for the record's
            date key.
    """

    def _load(sess: Session) -> int:
        # Resolve the district row; unknown districts are skipped.
        district = (
            sess.query(DimTRREBDistrict)
            .filter_by(district_code=record.area_code)
            .first()
        )
        if district is None:
            return 0

        date_key = generate_date_key(record.report_date)

        # Verify time dimension exists
        time_dim = sess.query(DimTime).filter_by(date_key=date_key).first()
        if time_dim is None:
            raise ValueError(
                f"Time dimension not found for date_key {date_key}. "
                "Load time dimension first."
            )

        fact = FactPurchases(
            date_key=date_key,
            district_key=district.district_key,
            sales_count=record.sales,
            dollar_volume=record.dollar_volume,
            avg_price=record.avg_price,
            median_price=record.median_price,
            new_listings=record.new_listings,
            active_listings=record.active_listings,
            avg_dom=record.avg_dom,
            avg_sp_lp=record.avg_sp_lp,
        )

        inserted, updated = upsert_by_key(
            sess, FactPurchases, [fact], ["date_key", "district_key"]
        )
        return inserted + updated

    # Reuse the caller's session when one was supplied; test for None
    # rather than relying on the session object's truthiness.
    if session is not None:
        return _load(session)
    with get_session() as sess:
        return _load(sess)
|
|
||||||
@@ -7,10 +7,8 @@ from .geo import (
|
|||||||
TRREBDistrictParser,
|
TRREBDistrictParser,
|
||||||
load_geojson,
|
load_geojson,
|
||||||
)
|
)
|
||||||
from .trreb import TRREBParser
|
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"TRREBParser",
|
|
||||||
"CMHCParser",
|
"CMHCParser",
|
||||||
# GeoJSON parsers
|
# GeoJSON parsers
|
||||||
"CMHCZoneParser",
|
"CMHCZoneParser",
|
||||||
|
|||||||
@@ -1,82 +0,0 @@
|
|||||||
"""TRREB PDF parser for monthly market watch reports.
|
|
||||||
|
|
||||||
This module provides the structure for parsing TRREB (Toronto Regional Real Estate Board)
|
|
||||||
monthly Market Watch PDF reports into structured data.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
from portfolio_app.toronto.schemas import TRREBMonthlyRecord, TRREBMonthlyReport
|
|
||||||
|
|
||||||
|
|
||||||
class TRREBParser:
    """Parser for TRREB Market Watch PDF reports.

    TRREB publishes monthly Market Watch reports as PDFs containing:
    - Summary statistics by area (416, 905, Total)
    - District-level breakdowns
    - Year-over-year comparisons

    The parser is intended to extract tabular data from these PDFs and
    validate it against the TRREBMonthlyRecord schema. Only path validation
    is implemented so far; every parsing method raises NotImplementedError.
    """

    def __init__(self, pdf_path: Path | str) -> None:
        """Initialize parser with path to PDF file.

        Args:
            pdf_path: Path to the TRREB Market Watch PDF file. A plain
                string is accepted and converted to a ``Path``.

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the path does not have a ``.pdf`` suffix.
        """
        # Coerce so string paths work with the Path API used below.
        self.pdf_path = Path(pdf_path)
        self._validate_path()

    def _validate_path(self) -> None:
        """Validate that the path exists and has a ``.pdf`` suffix.

        The suffix comparison is case-insensitive.

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the suffix is not ``.pdf``.
        """
        if not self.pdf_path.exists():
            raise FileNotFoundError(f"PDF not found: {self.pdf_path}")
        if self.pdf_path.suffix.lower() != ".pdf":
            raise ValueError(f"Expected PDF file, got: {self.pdf_path.suffix}")

    def parse(self) -> TRREBMonthlyReport:
        """Parse the PDF and return structured data.

        Returns:
            TRREBMonthlyReport containing all extracted records.

        Raises:
            NotImplementedError: PDF parsing not yet implemented.
        """
        raise NotImplementedError(
            "PDF parsing requires pdfplumber/tabula-py. "
            "Implementation pending Sprint 4 data ingestion."
        )

    def _extract_tables(self) -> list[dict[str, Any]]:
        """Extract raw tables from PDF pages.

        Returns:
            List of dictionaries representing table data.

        Raises:
            NotImplementedError: Table extraction not yet implemented.
        """
        raise NotImplementedError("Table extraction not yet implemented.")

    def _parse_district_table(
        self, table_data: list[dict[str, Any]]
    ) -> list[TRREBMonthlyRecord]:
        """Parse district-level statistics table.

        Args:
            table_data: Raw table data extracted from PDF.

        Returns:
            List of validated TRREBMonthlyRecord objects.

        Raises:
            NotImplementedError: District table parsing not yet implemented.
        """
        raise NotImplementedError("District table parsing not yet implemented.")

    def _infer_report_date(self) -> tuple[int, int]:
        """Infer report year and month from PDF filename or content.

        Returns:
            Tuple of (year, month).

        Raises:
            NotImplementedError: Date inference not yet implemented.
        """
        raise NotImplementedError("Date inference not yet implemented.")
|
|
||||||
@@ -13,12 +13,8 @@ from .dimensions import (
|
|||||||
TimeDimension,
|
TimeDimension,
|
||||||
TRREBDistrict,
|
TRREBDistrict,
|
||||||
)
|
)
|
||||||
from .trreb import TRREBMonthlyRecord, TRREBMonthlyReport
|
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
# TRREB
|
|
||||||
"TRREBMonthlyRecord",
|
|
||||||
"TRREBMonthlyReport",
|
|
||||||
# CMHC
|
# CMHC
|
||||||
"CMHCRentalRecord",
|
"CMHCRentalRecord",
|
||||||
"CMHCAnnualSurvey",
|
"CMHCAnnualSurvey",
|
||||||
|
|||||||
@@ -1,52 +0,0 @@
|
|||||||
"""Pydantic schemas for TRREB monthly market data."""
|
|
||||||
|
|
||||||
from datetime import date
|
|
||||||
from decimal import Decimal
|
|
||||||
|
|
||||||
from pydantic import BaseModel, Field
|
|
||||||
|
|
||||||
|
|
||||||
class TRREBMonthlyRecord(BaseModel):
    """One row of a TRREB monthly summary.

    Holds the aggregated sales figures reported for a single district in a
    single month. String fields are stripped of surrounding whitespace
    during validation (see ``model_config``).
    """

    model_config = {"str_strip_whitespace": True}

    # --- Identification -------------------------------------------------
    report_date: date = Field(
        description="First of month (YYYY-MM-01)",
    )
    area_code: str = Field(
        description="District code (W01, C01, E01, etc.)",
        max_length=3,
    )
    area_name: str = Field(
        description="District name",
        max_length=100,
    )
    area_type: str = Field(
        description="West / Central / East / North",
        max_length=10,
    )

    # --- Sales and pricing ----------------------------------------------
    sales: int = Field(
        description="Number of transactions",
        ge=0,
    )
    dollar_volume: Decimal = Field(
        description="Total sales volume ($)",
        ge=0,
    )
    avg_price: Decimal = Field(
        description="Average sale price ($)",
        ge=0,
    )
    median_price: Decimal = Field(
        description="Median sale price ($)",
        ge=0,
    )

    # --- Listings and market pace ---------------------------------------
    new_listings: int = Field(
        description="New listings count",
        ge=0,
    )
    active_listings: int = Field(
        description="Active listings at month end",
        ge=0,
    )
    avg_sp_lp: Decimal = Field(
        description="Avg sale price / list price ratio (%)",
        ge=0,
        le=200,
    )
    avg_dom: int = Field(
        description="Average days on market",
        ge=0,
    )
|
|
||||||
|
|
||||||
|
|
||||||
class TRREBMonthlyReport(BaseModel):
    """A full TRREB monthly report.

    Bundles the report date with the list of per-district records for
    that month.
    """

    report_date: date
    records: list[TRREBMonthlyRecord]

    @property
    def total_sales(self) -> int:
        """Sum of ``sales`` over every record in the report."""
        running_total = 0
        for entry in self.records:
            running_total += entry.sales
        return running_total

    @property
    def district_count(self) -> int:
        """How many district records the report holds."""
        record_list = self.records
        return len(record_list)
|
|
||||||
Reference in New Issue
Block a user