feat: Implement Phase 3 neighbourhood data model

Add schemas, parsers, loaders, and models for Toronto neighbourhood-centric
data including census profiles, crime statistics, and amenities.

Schemas:
- NeighbourhoodRecord, CensusRecord, CrimeRecord, CrimeType
- AmenityType, AmenityRecord, AmenityCount

Models:
- BridgeCMHCNeighbourhood (zone-to-neighbourhood mapping with weights)
- FactCensus, FactCrime, FactAmenities

Parsers:
- TorontoOpenDataParser (CKAN API for neighbourhoods, census, amenities)
- TorontoPoliceParser (crime rates, MCI data)

Loaders:
- load_census_data, load_crime_data, load_amenities
- build_cmhc_neighbourhood_crosswalk (PostGIS area weights)

Also updates CLAUDE.md with projman plugin workflow documentation.

Closes #53, #54, #55, #56, #57, #58, #59

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-16 11:07:13 -05:00
parent f69d0c15a7
commit 053acf6436
14 changed files with 1466 additions and 2 deletions

View File

@@ -1,7 +1,15 @@
"""Database loaders for Toronto housing data."""
from .amenities import load_amenities, load_amenity_counts
from .base import bulk_insert, get_session, upsert_by_key
from .census import load_census_data
from .cmhc import load_cmhc_record, load_cmhc_rentals
from .cmhc_crosswalk import (
build_cmhc_neighbourhood_crosswalk,
disaggregate_zone_value,
get_neighbourhood_weights_for_zone,
)
from .crime import load_crime_data
from .dimensions import (
generate_date_key,
load_cmhc_zones,
@@ -24,4 +32,13 @@ __all__ = [
# Fact loaders
"load_cmhc_rentals",
"load_cmhc_record",
# Phase 3 loaders
"load_census_data",
"load_crime_data",
"load_amenities",
"load_amenity_counts",
# CMHC crosswalk
"build_cmhc_neighbourhood_crosswalk",
"get_neighbourhood_weights_for_zone",
"disaggregate_zone_value",
]

View File

@@ -0,0 +1,93 @@
"""Loader for amenities data to fact_amenities table."""
from collections import Counter
from sqlalchemy.orm import Session
from portfolio_app.toronto.models import FactAmenities
from portfolio_app.toronto.schemas import AmenityCount, AmenityRecord
from .base import get_session, upsert_by_key
def load_amenities(
    records: list[AmenityRecord],
    year: int,
    session: Session | None = None,
) -> int:
    """Aggregate amenity records and upsert the totals into fact_amenities.

    Individual records are tallied per (neighbourhood, amenity type) pair and
    one FactAmenities row per pair is upserted for the supplied year.

    Args:
        records: Validated AmenityRecord schemas, one per amenity.
        year: Year stamped on every aggregated row.
        session: Session to reuse. When omitted, a session is obtained from
            get_session() for the duration of the load.

    Returns:
        Sum of the inserted and updated counts reported by upsert_by_key.
    """
    # One hit per record, keyed by (neighbourhood_id, amenity type value).
    tally: Counter[tuple[int, str]] = Counter(
        (rec.neighbourhood_id, rec.amenity_type.value) for rec in records
    )

    def _persist(db: Session) -> int:
        # Build one fact row per aggregated key, then upsert on the
        # (neighbourhood, amenity type, year) natural key.
        rows = [
            FactAmenities(
                neighbourhood_id=hood_id,
                amenity_type=kind,
                count=total,
                year=year,
            )
            for (hood_id, kind), total in tally.items()
        ]
        n_inserted, n_updated = upsert_by_key(
            db, FactAmenities, rows, ["neighbourhood_id", "amenity_type", "year"]
        )
        return n_inserted + n_updated

    if not session:
        with get_session() as db:
            return _persist(db)
    return _persist(session)
def load_amenity_counts(
    records: list[AmenityCount],
    session: Session | None = None,
) -> int:
    """Upsert already-aggregated amenity counts into fact_amenities.

    Args:
        records: Validated AmenityCount schemas; each one becomes a single
            FactAmenities row.
        session: Session to reuse. When omitted, a session is obtained from
            get_session() for the duration of the load.

    Returns:
        Sum of the inserted and updated counts reported by upsert_by_key.
    """

    def _persist(db: Session) -> int:
        # Map each schema straight onto a fact row; the enum is stored by value.
        rows = [
            FactAmenities(
                neighbourhood_id=rec.neighbourhood_id,
                amenity_type=rec.amenity_type.value,
                count=rec.count,
                year=rec.year,
            )
            for rec in records
        ]
        n_inserted, n_updated = upsert_by_key(
            db, FactAmenities, rows, ["neighbourhood_id", "amenity_type", "year"]
        )
        return n_inserted + n_updated

    if not session:
        with get_session() as db:
            return _persist(db)
    return _persist(session)

View File

@@ -0,0 +1,68 @@
"""Loader for census data to fact_census table."""
from sqlalchemy.orm import Session
from portfolio_app.toronto.models import FactCensus
from portfolio_app.toronto.schemas import CensusRecord
from .base import get_session, upsert_by_key
def load_census_data(
    records: list[CensusRecord],
    session: Session | None = None,
) -> int:
    """Load census records to fact_census table.

    Each record is converted to a FactCensus row and upserted on the
    (neighbourhood_id, census_year) key.

    Optional numeric metrics are converted with ``float()`` only when they are
    not ``None``. A legitimate zero (for example a 0% unemployment rate) is
    therefore stored as ``0.0`` rather than being turned into NULL.

    Args:
        records: List of validated CensusRecord schemas.
        session: Optional existing session. When omitted, a session is
            obtained from get_session() for the duration of the load.

    Returns:
        Number of records loaded (inserted + updated).
    """

    def _load(sess: Session) -> int:
        models = []
        for r in records:
            # NOTE: compare against None explicitly; a truthiness test would
            # discard valid zero values (0, 0.0, Decimal("0")).
            model = FactCensus(
                neighbourhood_id=r.neighbourhood_id,
                census_year=r.census_year,
                population=r.population,
                population_density=(
                    None
                    if r.population_density is None
                    else float(r.population_density)
                ),
                median_household_income=(
                    None
                    if r.median_household_income is None
                    else float(r.median_household_income)
                ),
                average_household_income=(
                    None
                    if r.average_household_income is None
                    else float(r.average_household_income)
                ),
                unemployment_rate=(
                    None
                    if r.unemployment_rate is None
                    else float(r.unemployment_rate)
                ),
                pct_bachelors_or_higher=(
                    None
                    if r.pct_bachelors_or_higher is None
                    else float(r.pct_bachelors_or_higher)
                ),
                pct_owner_occupied=(
                    None
                    if r.pct_owner_occupied is None
                    else float(r.pct_owner_occupied)
                ),
                pct_renter_occupied=(
                    None
                    if r.pct_renter_occupied is None
                    else float(r.pct_renter_occupied)
                ),
                median_age=(
                    None if r.median_age is None else float(r.median_age)
                ),
                average_dwelling_value=(
                    None
                    if r.average_dwelling_value is None
                    else float(r.average_dwelling_value)
                ),
            )
            models.append(model)

        inserted, updated = upsert_by_key(
            sess, FactCensus, models, ["neighbourhood_id", "census_year"]
        )
        return inserted + updated

    # Explicit None check: only a missing session triggers creating a new one.
    if session is not None:
        return _load(session)
    with get_session() as sess:
        return _load(sess)

View File

@@ -0,0 +1,131 @@
"""Loader for CMHC zone to neighbourhood crosswalk with area weights."""
from sqlalchemy import text
from sqlalchemy.orm import Session
from .base import get_session
def build_cmhc_neighbourhood_crosswalk(
    session: Session | None = None,
) -> int:
    """Rebuild the CMHC zone to neighbourhood area-weight crosswalk.

    Every row of bridge_cmhc_neighbourhood is deleted and the table is then
    repopulated with one row per (zone, neighbourhood) pair whose geometries
    intersect with a non-zero area. Because the table is wiped first, running
    the function repeatedly yields the same contents.

    The stored weight is the intersection area divided by the total area of
    the CMHC zone, with both areas measured after casting to ``geography``.
    A weight is thus the share of the *zone* falling in that neighbourhood.

    Args:
        session: Session to reuse. When omitted, a session is obtained from
            get_session() for the duration of the rebuild.

    Returns:
        Row count of bridge_cmhc_neighbourhood after the rebuild.

    Note:
        Rows of dim_cmhc_zone or dim_neighbourhood with a NULL geometry are
        skipped by the query. No commit is issued in this function.
    """

    def _rebuild(db: Session) -> int:
        wipe_stmt = text("DELETE FROM bridge_cmhc_neighbourhood")
        # weight = area(zone ∩ neighbourhood) / area(zone)
        populate_stmt = text(
            """
            INSERT INTO bridge_cmhc_neighbourhood (cmhc_zone_code, neighbourhood_id, weight)
            SELECT
            z.zone_code,
            n.neighbourhood_id,
            CASE
            WHEN ST_Area(z.geometry::geography) > 0 THEN
            ST_Area(ST_Intersection(z.geometry, n.geometry)::geography) /
            ST_Area(z.geometry::geography)
            ELSE 0
            END as weight
            FROM dim_cmhc_zone z
            JOIN dim_neighbourhood n
            ON ST_Intersects(z.geometry, n.geometry)
            WHERE
            z.geometry IS NOT NULL
            AND n.geometry IS NOT NULL
            AND ST_Area(ST_Intersection(z.geometry, n.geometry)::geography) > 0
            """
        )
        tally_stmt = text("SELECT COUNT(*) FROM bridge_cmhc_neighbourhood")

        # Order matters: wipe, repopulate, then count what is now in the table.
        db.execute(wipe_stmt)
        db.execute(populate_stmt)
        total = db.execute(tally_stmt).scalar() or 0
        return int(total)

    if not session:
        with get_session() as db:
            return _rebuild(db)
    return _rebuild(session)
def get_neighbourhood_weights_for_zone(
    zone_code: str,
    session: Session | None = None,
) -> list[tuple[int, float]]:
    """Look up the crosswalk weights recorded for one CMHC zone.

    Args:
        zone_code: CMHC zone code to match against cmhc_zone_code.
        session: Session to reuse. When omitted, a session is obtained from
            get_session() for the duration of the query.

    Returns:
        (neighbourhood_id, weight) tuples read from bridge_cmhc_neighbourhood,
        ordered by weight from largest to smallest. Empty when the zone has no
        rows in the bridge table.
    """

    def _fetch(db: Session) -> list[tuple[int, float]]:
        # zone_code is passed as a bound parameter, never interpolated.
        stmt = text(
            """
            SELECT neighbourhood_id, weight
            FROM bridge_cmhc_neighbourhood
            WHERE cmhc_zone_code = :zone_code
            ORDER BY weight DESC
            """
        )
        rows = db.execute(stmt, {"zone_code": zone_code})
        pairs: list[tuple[int, float]] = []
        for hood_id, weight in rows:
            # Coerce driver-returned values to plain Python int/float.
            pairs.append((int(hood_id), float(weight)))
        return pairs

    if not session:
        with get_session() as db:
            return _fetch(db)
    return _fetch(session)
def disaggregate_zone_value(
    zone_code: str,
    value: float,
    session: Session | None = None,
) -> dict[int, float]:
    """Spread a CMHC zone-level value across its overlapping neighbourhoods.

    Args:
        zone_code: CMHC zone code whose weights are looked up.
        value: Zone-level value to distribute (e.g., average rent).
        session: Optional existing session, forwarded to the weight lookup.

    Returns:
        Mapping of neighbourhood_id to ``value * weight``. Empty when the zone
        has no crosswalk rows.

    Note:
        Each entry is only this zone's contribution; a neighbourhood covered
        by several zones needs the contributions from all of them combined.
        NOTE(review): the weights appear to be shares of the zone's area (see
        the crosswalk builder in this module), so plain summation suits
        additive quantities; for averages such as rent, confirm whether a
        per-neighbourhood re-normalisation is required.
    """
    contributions: dict[int, float] = {}
    for hood_id, share in get_neighbourhood_weights_for_zone(zone_code, session):
        contributions[hood_id] = value * share
    return contributions

View File

@@ -0,0 +1,45 @@
"""Loader for crime data to fact_crime table."""
from sqlalchemy.orm import Session
from portfolio_app.toronto.models import FactCrime
from portfolio_app.toronto.schemas import CrimeRecord
from .base import get_session, upsert_by_key
def load_crime_data(
    records: list[CrimeRecord],
    session: Session | None = None,
) -> int:
    """Load crime records to fact_crime table.

    Each record is converted to a FactCrime row and upserted on the
    (neighbourhood_id, year, crime_type) key. The crime type enum is stored
    by its ``.value``.

    ``rate_per_100k`` is converted with ``float()`` whenever it is not
    ``None``, so a genuine rate of zero is stored as ``0.0`` instead of NULL.

    Args:
        records: List of validated CrimeRecord schemas.
        session: Optional existing session. When omitted, a session is
            obtained from get_session() for the duration of the load.

    Returns:
        Number of records loaded (inserted + updated).
    """

    def _load(sess: Session) -> int:
        models = [
            FactCrime(
                neighbourhood_id=r.neighbourhood_id,
                year=r.year,
                crime_type=r.crime_type.value,
                count=r.count,
                # Explicit None check: a truthiness test would turn a valid
                # zero rate into NULL.
                rate_per_100k=(
                    None if r.rate_per_100k is None else float(r.rate_per_100k)
                ),
            )
            for r in records
        ]
        inserted, updated = upsert_by_key(
            sess, FactCrime, models, ["neighbourhood_id", "year", "crime_type"]
        )
        return inserted + updated

    # Explicit None check: only a missing session triggers creating a new one.
    if session is not None:
        return _load(session)
    with get_session() as sess:
        return _load(sess)