staging #96
@@ -10,7 +10,6 @@ from .dimensions import (
|
||||
load_time_dimension,
|
||||
load_trreb_districts,
|
||||
)
|
||||
from .trreb import load_trreb_purchases, load_trreb_record
|
||||
|
||||
__all__ = [
|
||||
# Base utilities
|
||||
@@ -25,8 +24,6 @@ __all__ = [
|
||||
"load_neighbourhoods",
|
||||
"load_policy_events",
|
||||
# Fact loaders
|
||||
"load_trreb_purchases",
|
||||
"load_trreb_record",
|
||||
"load_cmhc_rentals",
|
||||
"load_cmhc_record",
|
||||
]
|
||||
|
||||
@@ -1,129 +0,0 @@
|
||||
"""Loader for TRREB purchase data into fact_purchases."""
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from portfolio_app.toronto.models import DimTime, DimTRREBDistrict, FactPurchases
|
||||
from portfolio_app.toronto.schemas import TRREBMonthlyRecord, TRREBMonthlyReport
|
||||
|
||||
from .base import get_session, upsert_by_key
|
||||
from .dimensions import generate_date_key
|
||||
|
||||
|
||||
def load_trreb_purchases(
|
||||
report: TRREBMonthlyReport,
|
||||
session: Session | None = None,
|
||||
) -> int:
|
||||
"""Load TRREB monthly report data into fact_purchases.
|
||||
|
||||
Args:
|
||||
report: Validated TRREB monthly report containing records.
|
||||
session: Optional existing session.
|
||||
|
||||
Returns:
|
||||
Number of records loaded.
|
||||
"""
|
||||
|
||||
def _load(sess: Session) -> int:
|
||||
# Get district key mapping
|
||||
districts = sess.query(DimTRREBDistrict).all()
|
||||
district_map = {d.district_code: d.district_key for d in districts}
|
||||
|
||||
# Build date key from report date
|
||||
date_key = generate_date_key(report.report_date)
|
||||
|
||||
# Verify time dimension exists
|
||||
time_dim = sess.query(DimTime).filter_by(date_key=date_key).first()
|
||||
if not time_dim:
|
||||
raise ValueError(
|
||||
f"Time dimension not found for date_key {date_key}. "
|
||||
"Load time dimension first."
|
||||
)
|
||||
|
||||
records = []
|
||||
for record in report.records:
|
||||
district_key = district_map.get(record.area_code)
|
||||
if not district_key:
|
||||
# Skip records for unknown districts (e.g., aggregate rows)
|
||||
continue
|
||||
|
||||
fact = FactPurchases(
|
||||
date_key=date_key,
|
||||
district_key=district_key,
|
||||
sales_count=record.sales,
|
||||
dollar_volume=record.dollar_volume,
|
||||
avg_price=record.avg_price,
|
||||
median_price=record.median_price,
|
||||
new_listings=record.new_listings,
|
||||
active_listings=record.active_listings,
|
||||
avg_dom=record.avg_dom,
|
||||
avg_sp_lp=record.avg_sp_lp,
|
||||
)
|
||||
records.append(fact)
|
||||
|
||||
inserted, updated = upsert_by_key(
|
||||
sess, FactPurchases, records, ["date_key", "district_key"]
|
||||
)
|
||||
return inserted + updated
|
||||
|
||||
if session:
|
||||
return _load(session)
|
||||
with get_session() as sess:
|
||||
return _load(sess)
|
||||
|
||||
|
||||
def load_trreb_record(
|
||||
record: TRREBMonthlyRecord,
|
||||
session: Session | None = None,
|
||||
) -> int:
|
||||
"""Load a single TRREB record into fact_purchases.
|
||||
|
||||
Args:
|
||||
record: Single validated TRREB monthly record.
|
||||
session: Optional existing session.
|
||||
|
||||
Returns:
|
||||
Number of records loaded (0 or 1).
|
||||
"""
|
||||
|
||||
def _load(sess: Session) -> int:
|
||||
# Get district key
|
||||
district = (
|
||||
sess.query(DimTRREBDistrict)
|
||||
.filter_by(district_code=record.area_code)
|
||||
.first()
|
||||
)
|
||||
if not district:
|
||||
return 0
|
||||
|
||||
date_key = generate_date_key(record.report_date)
|
||||
|
||||
# Verify time dimension exists
|
||||
time_dim = sess.query(DimTime).filter_by(date_key=date_key).first()
|
||||
if not time_dim:
|
||||
raise ValueError(
|
||||
f"Time dimension not found for date_key {date_key}. "
|
||||
"Load time dimension first."
|
||||
)
|
||||
|
||||
fact = FactPurchases(
|
||||
date_key=date_key,
|
||||
district_key=district.district_key,
|
||||
sales_count=record.sales,
|
||||
dollar_volume=record.dollar_volume,
|
||||
avg_price=record.avg_price,
|
||||
median_price=record.median_price,
|
||||
new_listings=record.new_listings,
|
||||
active_listings=record.active_listings,
|
||||
avg_dom=record.avg_dom,
|
||||
avg_sp_lp=record.avg_sp_lp,
|
||||
)
|
||||
|
||||
inserted, updated = upsert_by_key(
|
||||
sess, FactPurchases, [fact], ["date_key", "district_key"]
|
||||
)
|
||||
return inserted + updated
|
||||
|
||||
if session:
|
||||
return _load(session)
|
||||
with get_session() as sess:
|
||||
return _load(sess)
|
||||
@@ -7,10 +7,8 @@ from .geo import (
|
||||
TRREBDistrictParser,
|
||||
load_geojson,
|
||||
)
|
||||
from .trreb import TRREBParser
|
||||
|
||||
__all__ = [
|
||||
"TRREBParser",
|
||||
"CMHCParser",
|
||||
# GeoJSON parsers
|
||||
"CMHCZoneParser",
|
||||
|
||||
@@ -1,82 +0,0 @@
|
||||
"""TRREB PDF parser for monthly market watch reports.
|
||||
|
||||
This module provides the structure for parsing TRREB (Toronto Regional Real Estate Board)
|
||||
monthly Market Watch PDF reports into structured data.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from portfolio_app.toronto.schemas import TRREBMonthlyRecord, TRREBMonthlyReport
|
||||
|
||||
|
||||
class TRREBParser:
|
||||
"""Parser for TRREB Market Watch PDF reports.
|
||||
|
||||
TRREB publishes monthly Market Watch reports as PDFs containing:
|
||||
- Summary statistics by area (416, 905, Total)
|
||||
- District-level breakdowns
|
||||
- Year-over-year comparisons
|
||||
|
||||
The parser extracts tabular data from these PDFs and validates
|
||||
against the TRREBMonthlyRecord schema.
|
||||
"""
|
||||
|
||||
def __init__(self, pdf_path: Path) -> None:
|
||||
"""Initialize parser with path to PDF file.
|
||||
|
||||
Args:
|
||||
pdf_path: Path to the TRREB Market Watch PDF file.
|
||||
"""
|
||||
self.pdf_path = pdf_path
|
||||
self._validate_path()
|
||||
|
||||
def _validate_path(self) -> None:
|
||||
"""Validate that the PDF path exists and is readable."""
|
||||
if not self.pdf_path.exists():
|
||||
raise FileNotFoundError(f"PDF not found: {self.pdf_path}")
|
||||
if not self.pdf_path.suffix.lower() == ".pdf":
|
||||
raise ValueError(f"Expected PDF file, got: {self.pdf_path.suffix}")
|
||||
|
||||
def parse(self) -> TRREBMonthlyReport:
|
||||
"""Parse the PDF and return structured data.
|
||||
|
||||
Returns:
|
||||
TRREBMonthlyReport containing all extracted records.
|
||||
|
||||
Raises:
|
||||
NotImplementedError: PDF parsing not yet implemented.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
"PDF parsing requires pdfplumber/tabula-py. "
|
||||
"Implementation pending Sprint 4 data ingestion."
|
||||
)
|
||||
|
||||
def _extract_tables(self) -> list[dict[str, Any]]:
|
||||
"""Extract raw tables from PDF pages.
|
||||
|
||||
Returns:
|
||||
List of dictionaries representing table data.
|
||||
"""
|
||||
raise NotImplementedError("Table extraction not yet implemented.")
|
||||
|
||||
def _parse_district_table(
|
||||
self, table_data: list[dict[str, Any]]
|
||||
) -> list[TRREBMonthlyRecord]:
|
||||
"""Parse district-level statistics table.
|
||||
|
||||
Args:
|
||||
table_data: Raw table data extracted from PDF.
|
||||
|
||||
Returns:
|
||||
List of validated TRREBMonthlyRecord objects.
|
||||
"""
|
||||
raise NotImplementedError("District table parsing not yet implemented.")
|
||||
|
||||
def _infer_report_date(self) -> tuple[int, int]:
|
||||
"""Infer report year and month from PDF filename or content.
|
||||
|
||||
Returns:
|
||||
Tuple of (year, month).
|
||||
"""
|
||||
raise NotImplementedError("Date inference not yet implemented.")
|
||||
@@ -13,12 +13,8 @@ from .dimensions import (
|
||||
TimeDimension,
|
||||
TRREBDistrict,
|
||||
)
|
||||
from .trreb import TRREBMonthlyRecord, TRREBMonthlyReport
|
||||
|
||||
__all__ = [
|
||||
# TRREB
|
||||
"TRREBMonthlyRecord",
|
||||
"TRREBMonthlyReport",
|
||||
# CMHC
|
||||
"CMHCRentalRecord",
|
||||
"CMHCAnnualSurvey",
|
||||
|
||||
@@ -1,52 +0,0 @@
|
||||
"""Pydantic schemas for TRREB monthly market data."""
|
||||
|
||||
from datetime import date
|
||||
from decimal import Decimal
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class TRREBMonthlyRecord(BaseModel):
|
||||
"""Schema for a single TRREB monthly summary record.
|
||||
|
||||
Represents aggregated sales data for one district in one month.
|
||||
"""
|
||||
|
||||
report_date: date = Field(description="First of month (YYYY-MM-01)")
|
||||
area_code: str = Field(
|
||||
max_length=3, description="District code (W01, C01, E01, etc.)"
|
||||
)
|
||||
area_name: str = Field(max_length=100, description="District name")
|
||||
area_type: str = Field(max_length=10, description="West / Central / East / North")
|
||||
sales: int = Field(ge=0, description="Number of transactions")
|
||||
dollar_volume: Decimal = Field(ge=0, description="Total sales volume ($)")
|
||||
avg_price: Decimal = Field(ge=0, description="Average sale price ($)")
|
||||
median_price: Decimal = Field(ge=0, description="Median sale price ($)")
|
||||
new_listings: int = Field(ge=0, description="New listings count")
|
||||
active_listings: int = Field(ge=0, description="Active listings at month end")
|
||||
avg_sp_lp: Decimal = Field(
|
||||
ge=0, le=200, description="Avg sale price / list price ratio (%)"
|
||||
)
|
||||
avg_dom: int = Field(ge=0, description="Average days on market")
|
||||
|
||||
model_config = {"str_strip_whitespace": True}
|
||||
|
||||
|
||||
class TRREBMonthlyReport(BaseModel):
|
||||
"""Schema for a complete TRREB monthly report.
|
||||
|
||||
Contains all district records for a single month.
|
||||
"""
|
||||
|
||||
report_date: date
|
||||
records: list[TRREBMonthlyRecord]
|
||||
|
||||
@property
|
||||
def total_sales(self) -> int:
|
||||
"""Total sales across all districts."""
|
||||
return sum(r.sales for r in self.records)
|
||||
|
||||
@property
|
||||
def district_count(self) -> int:
|
||||
"""Number of districts in report."""
|
||||
return len(self.records)
|
||||
Reference in New Issue
Block a user