feat: Implement Phase 3 neighbourhood data model

Add schemas, parsers, loaders, and models for Toronto neighbourhood-centric data including census profiles, crime statistics, and amenities.

Schemas:
- NeighbourhoodRecord, CensusRecord, CrimeRecord, CrimeType
- AmenityType, AmenityRecord, AmenityCount

Models:
- BridgeCMHCNeighbourhood (zone-to-neighbourhood mapping with weights)
- FactCensus, FactCrime, FactAmenities

Parsers:
- TorontoOpenDataParser (CKAN API for neighbourhoods, census, amenities)
- TorontoPoliceParser (crime rates, MCI data)

Loaders:
- load_census_data, load_crime_data, load_amenities
- build_cmhc_neighbourhood_crosswalk (PostGIS area weights)

Also updates CLAUDE.md with projman plugin workflow documentation.

Closes #53, #54, #55, #56, #57, #58, #59

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-16 11:07:13 -05:00
parent f69d0c15a7
commit 053acf6436
14 changed files with 1466 additions and 2 deletions
--- a/portfolio_app/toronto/loaders/__init__.py
+++ b/portfolio_app/toronto/loaders/__init__.py
@@ -1,7 +1,15 @@
 """Database loaders for Toronto housing data."""

+from .amenities import load_amenities, load_amenity_counts
 from .base import bulk_insert, get_session, upsert_by_key
+from .census import load_census_data
 from .cmhc import load_cmhc_record, load_cmhc_rentals
+from .cmhc_crosswalk import (
+    build_cmhc_neighbourhood_crosswalk,
+    disaggregate_zone_value,
+    get_neighbourhood_weights_for_zone,
+)
+from .crime import load_crime_data
 from .dimensions import (
    generate_date_key,
    load_cmhc_zones,
@@ -24,4 +32,13 @@ __all__ = [
    # Fact loaders
    "load_cmhc_rentals",
    "load_cmhc_record",
+    # Phase 3 loaders
+    "load_census_data",
+    "load_crime_data",
+    "load_amenities",
+    "load_amenity_counts",
+    # CMHC crosswalk
+    "build_cmhc_neighbourhood_crosswalk",
+    "get_neighbourhood_weights_for_zone",
+    "disaggregate_zone_value",
 ]
--- a/portfolio_app/toronto/loaders/amenities.py
+++ b/portfolio_app/toronto/loaders/amenities.py
@@ -0,0 +1,93 @@
+"""Loader for amenities data to fact_amenities table."""
+
+from collections import Counter
+
+from sqlalchemy.orm import Session
+
+from portfolio_app.toronto.models import FactAmenities
+from portfolio_app.toronto.schemas import AmenityCount, AmenityRecord
+
+from .base import get_session, upsert_by_key
+
+
+def load_amenities(
+    records: list[AmenityRecord],
+    year: int,
+    session: Session | None = None,
+) -> int:
+    """Aggregate raw amenity records and upsert the totals into fact_amenities.
+
+    Each record adds one to the tally for its
+    (neighbourhood_id, amenity type value) pair; every tally then becomes a
+    single FactAmenities row stamped with ``year``.
+
+    Args:
+        records: List of validated AmenityRecord schemas.
+        year: Year stored on every aggregated row.
+        session: Optional existing session. When omitted, a session is
+            obtained from get_session() for the duration of the load.
+
+    Returns:
+        Sum of the inserted and updated counts reported by upsert_by_key.
+    """
+    # Tally occurrences per (neighbourhood, amenity type) before touching the DB.
+    tallies: Counter[tuple[int, str]] = Counter(
+        (rec.neighbourhood_id, rec.amenity_type.value) for rec in records
+    )
+
+    def _persist(db: Session) -> int:
+        # One fact row per aggregated (neighbourhood, amenity type) pair.
+        rows = [
+            FactAmenities(
+                neighbourhood_id=hood_id,
+                amenity_type=kind,
+                count=total,
+                year=year,
+            )
+            for (hood_id, kind), total in tallies.items()
+        ]
+        n_inserted, n_updated = upsert_by_key(
+            db, FactAmenities, rows, ["neighbourhood_id", "amenity_type", "year"]
+        )
+        return n_inserted + n_updated
+
+    if not session:
+        with get_session() as db:
+            return _persist(db)
+    return _persist(session)
+
+
+def load_amenity_counts(
+    records: list[AmenityCount],
+    session: Session | None = None,
+) -> int:
+    """Upsert already-aggregated amenity counts into fact_amenities.
+
+    Unlike load_amenities, no counting happens here: each AmenityCount is
+    copied field-for-field into one FactAmenities row.
+
+    Args:
+        records: List of validated AmenityCount schemas.
+        session: Optional existing session. When omitted, a session is
+            obtained from get_session() for the duration of the load.
+
+    Returns:
+        Sum of the inserted and updated counts reported by upsert_by_key.
+    """
+
+    def _persist(db: Session) -> int:
+        rows = [
+            FactAmenities(
+                neighbourhood_id=entry.neighbourhood_id,
+                amenity_type=entry.amenity_type.value,
+                count=entry.count,
+                year=entry.year,
+            )
+            for entry in records
+        ]
+        n_inserted, n_updated = upsert_by_key(
+            db, FactAmenities, rows, ["neighbourhood_id", "amenity_type", "year"]
+        )
+        return n_inserted + n_updated
+
+    if not session:
+        with get_session() as db:
+            return _persist(db)
+    return _persist(session)
--- a/portfolio_app/toronto/loaders/census.py
+++ b/portfolio_app/toronto/loaders/census.py
@@ -0,0 +1,68 @@
+"""Loader for census data to fact_census table."""
+
+from sqlalchemy.orm import Session
+
+from portfolio_app.toronto.models import FactCensus
+from portfolio_app.toronto.schemas import CensusRecord
+
+from .base import get_session, upsert_by_key
+
+
+def load_census_data(
+    records: list[CensusRecord],
+    session: Session | None = None,
+) -> int:
+    """Load census records to fact_census table.
+
+    Optional numeric fields are converted to ``float`` when present and
+    stored as ``None`` only when the source value is actually ``None``.
+    A legitimate zero (e.g. an unemployment rate of 0) is preserved rather
+    than being turned into NULL.
+
+    Args:
+        records: List of validated CensusRecord schemas.
+        session: Optional existing session. When omitted, a session is
+            obtained from get_session() for the duration of the load.
+
+    Returns:
+        Number of records loaded (inserted + updated).
+    """
+
+    def _load(sess: Session) -> int:
+        models = []
+        for r in records:
+            # NOTE: compare against None explicitly; a truthiness test would
+            # silently discard valid zero values.
+            model = FactCensus(
+                neighbourhood_id=r.neighbourhood_id,
+                census_year=r.census_year,
+                population=r.population,
+                population_density=float(r.population_density)
+                if r.population_density is not None
+                else None,
+                median_household_income=float(r.median_household_income)
+                if r.median_household_income is not None
+                else None,
+                average_household_income=float(r.average_household_income)
+                if r.average_household_income is not None
+                else None,
+                unemployment_rate=float(r.unemployment_rate)
+                if r.unemployment_rate is not None
+                else None,
+                pct_bachelors_or_higher=float(r.pct_bachelors_or_higher)
+                if r.pct_bachelors_or_higher is not None
+                else None,
+                pct_owner_occupied=float(r.pct_owner_occupied)
+                if r.pct_owner_occupied is not None
+                else None,
+                pct_renter_occupied=float(r.pct_renter_occupied)
+                if r.pct_renter_occupied is not None
+                else None,
+                median_age=float(r.median_age) if r.median_age is not None else None,
+                average_dwelling_value=float(r.average_dwelling_value)
+                if r.average_dwelling_value is not None
+                else None,
+            )
+            models.append(model)
+
+        # Rows are matched on (neighbourhood_id, census_year) for the upsert.
+        inserted, updated = upsert_by_key(
+            sess, FactCensus, models, ["neighbourhood_id", "census_year"]
+        )
+        return inserted + updated
+
+    if session:
+        return _load(session)
+    with get_session() as sess:
+        return _load(sess)
--- a/portfolio_app/toronto/loaders/cmhc_crosswalk.py
+++ b/portfolio_app/toronto/loaders/cmhc_crosswalk.py
@@ -0,0 +1,131 @@
+"""Loader for CMHC zone to neighbourhood crosswalk with area weights."""
+
+from sqlalchemy import text
+from sqlalchemy.orm import Session
+
+from .base import get_session
+
+
+def build_cmhc_neighbourhood_crosswalk(
+    session: Session | None = None,
+) -> int:
+    """Rebuild bridge_cmhc_neighbourhood from zone/neighbourhood geometry overlap.
+
+    All existing bridge rows are deleted first, then one row is inserted for
+    every (CMHC zone, neighbourhood) pair whose geometries intersect with a
+    positive area. The stored weight is the intersection area divided by the
+    total area of the CMHC zone, computed with PostGIS ST_Intersection and
+    ST_Area on geography casts.
+
+    Args:
+        session: Optional existing session. When omitted, a session is
+            obtained from get_session() for the duration of the rebuild.
+
+    Returns:
+        Row count of bridge_cmhc_neighbourhood after the rebuild.
+
+    Note:
+        Rows of dim_cmhc_zone or dim_neighbourhood with a NULL geometry are
+        ignored, so both tables need populated geometry columns for the
+        crosswalk to be meaningful.
+    """
+
+    def _rebuild(db: Session) -> int:
+        wipe_stmt = text("DELETE FROM bridge_cmhc_neighbourhood")
+
+        # weight = area(zone ∩ neighbourhood) / area(zone)
+        fill_stmt = text(
+            """
+            INSERT INTO bridge_cmhc_neighbourhood (cmhc_zone_code, neighbourhood_id, weight)
+            SELECT
+                z.zone_code,
+                n.neighbourhood_id,
+                CASE
+                    WHEN ST_Area(z.geometry::geography) > 0 THEN
+                        ST_Area(ST_Intersection(z.geometry, n.geometry)::geography) /
+                        ST_Area(z.geometry::geography)
+                    ELSE 0
+                END as weight
+            FROM dim_cmhc_zone z
+            JOIN dim_neighbourhood n
+                ON ST_Intersects(z.geometry, n.geometry)
+            WHERE
+                z.geometry IS NOT NULL
+                AND n.geometry IS NOT NULL
+                AND ST_Area(ST_Intersection(z.geometry, n.geometry)::geography) > 0
+        """
+        )
+
+        tally_stmt = text("SELECT COUNT(*) FROM bridge_cmhc_neighbourhood")
+
+        # Order matters: empty the table, repopulate it, then count the result.
+        db.execute(wipe_stmt)
+        db.execute(fill_stmt)
+        total = db.execute(tally_stmt).scalar()
+
+        return int(total or 0)
+
+    if not session:
+        with get_session() as db:
+            return _rebuild(db)
+    return _rebuild(session)
+
+
+def get_neighbourhood_weights_for_zone(
+    zone_code: str,
+    session: Session | None = None,
+) -> list[tuple[int, float]]:
+    """Fetch the crosswalk weights recorded for one CMHC zone.
+
+    Args:
+        zone_code: CMHC zone code to look up in bridge_cmhc_neighbourhood.
+        session: Optional existing session. When omitted, a session is
+            obtained from get_session() for the duration of the query.
+
+    Returns:
+        List of (neighbourhood_id, weight) tuples, largest weight first.
+    """
+
+    def _fetch(db: Session) -> list[tuple[int, float]]:
+        bind_params = {"zone_code": zone_code}
+        rows = db.execute(
+            text(
+                """
+                SELECT neighbourhood_id, weight
+                FROM bridge_cmhc_neighbourhood
+                WHERE cmhc_zone_code = :zone_code
+                ORDER BY weight DESC
+            """
+            ),
+            bind_params,
+        )
+        # Coerce driver-returned values to plain int/float pairs.
+        pairs: list[tuple[int, float]] = []
+        for hood_id, share in rows:
+            pairs.append((int(hood_id), float(share)))
+        return pairs
+
+    if not session:
+        with get_session() as db:
+            return _fetch(db)
+    return _fetch(session)
+
+
+def disaggregate_zone_value(
+    zone_code: str,
+    value: float,
+    session: Session | None = None,
+) -> dict[int, float]:
+    """Split a CMHC zone-level value across its overlapping neighbourhoods.
+
+    Looks up the zone's weights via get_neighbourhood_weights_for_zone and
+    multiplies ``value`` by each one.
+
+    Args:
+        zone_code: CMHC zone code.
+        value: Zone-level value to distribute (e.g., average rent).
+        session: Optional existing session, passed through to the weight lookup.
+
+    Returns:
+        Dictionary mapping neighbourhood_id to ``value * weight``.
+
+    Note:
+        Each entry is only this zone's contribution. For averages (like
+        rent), a neighbourhood's total is the sum of the contributions from
+        every zone that overlaps it.
+    """
+    contributions: dict[int, float] = {}
+    for hood_id, share in get_neighbourhood_weights_for_zone(zone_code, session):
+        contributions[hood_id] = value * share
+    return contributions
--- a/portfolio_app/toronto/loaders/crime.py
+++ b/portfolio_app/toronto/loaders/crime.py
@@ -0,0 +1,45 @@
+"""Loader for crime data to fact_crime table."""
+
+from sqlalchemy.orm import Session
+
+from portfolio_app.toronto.models import FactCrime
+from portfolio_app.toronto.schemas import CrimeRecord
+
+from .base import get_session, upsert_by_key
+
+
+def load_crime_data(
+    records: list[CrimeRecord],
+    session: Session | None = None,
+) -> int:
+    """Load crime records to fact_crime table.
+
+    ``rate_per_100k`` is converted to ``float`` when present and stored as
+    ``None`` only when the source value is actually ``None``, so a rate of
+    zero is kept as 0.0 instead of being turned into NULL.
+
+    Args:
+        records: List of validated CrimeRecord schemas.
+        session: Optional existing session. When omitted, a session is
+            obtained from get_session() for the duration of the load.
+
+    Returns:
+        Number of records loaded (inserted + updated).
+    """
+
+    def _load(sess: Session) -> int:
+        models = [
+            FactCrime(
+                neighbourhood_id=r.neighbourhood_id,
+                year=r.year,
+                crime_type=r.crime_type.value,
+                count=r.count,
+                # Explicit None check: a truthiness test would drop a valid 0 rate.
+                rate_per_100k=float(r.rate_per_100k)
+                if r.rate_per_100k is not None
+                else None,
+            )
+            for r in records
+        ]
+
+        # Rows are matched on (neighbourhood_id, year, crime_type) for the upsert.
+        inserted, updated = upsert_by_key(
+            sess, FactCrime, models, ["neighbourhood_id", "year", "crime_type"]
+        )
+        return inserted + updated
+
+    if session:
+        return _load(session)
+    with get_session() as sess:
+        return _load(sess)