feat: Implement Phase 3 neighbourhood data model
Add schemas, parsers, loaders, and models for Toronto neighbourhood-centric
data including census profiles, crime statistics, and amenities.

Schemas:
- NeighbourhoodRecord, CensusRecord, CrimeRecord, CrimeType
- AmenityType, AmenityRecord, AmenityCount

Models:
- BridgeCMHCNeighbourhood (zone-to-neighbourhood mapping with weights)
- FactCensus, FactCrime, FactAmenities

Parsers:
- TorontoOpenDataParser (CKAN API for neighbourhoods, census, amenities)
- TorontoPoliceParser (crime rates, MCI data)

Loaders:
- load_census_data, load_crime_data, load_amenities
- build_cmhc_neighbourhood_crosswalk (PostGIS area weights)

Also updates CLAUDE.md with projman plugin workflow documentation.

Closes #53, #54, #55, #56, #57, #58, #59

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,7 +1,15 @@
|
||||
"""Database loaders for Toronto housing data."""
|
||||
|
||||
from .amenities import load_amenities, load_amenity_counts
|
||||
from .base import bulk_insert, get_session, upsert_by_key
|
||||
from .census import load_census_data
|
||||
from .cmhc import load_cmhc_record, load_cmhc_rentals
|
||||
from .cmhc_crosswalk import (
|
||||
build_cmhc_neighbourhood_crosswalk,
|
||||
disaggregate_zone_value,
|
||||
get_neighbourhood_weights_for_zone,
|
||||
)
|
||||
from .crime import load_crime_data
|
||||
from .dimensions import (
|
||||
generate_date_key,
|
||||
load_cmhc_zones,
|
||||
@@ -24,4 +32,13 @@ __all__ = [
|
||||
# Fact loaders
|
||||
"load_cmhc_rentals",
|
||||
"load_cmhc_record",
|
||||
# Phase 3 loaders
|
||||
"load_census_data",
|
||||
"load_crime_data",
|
||||
"load_amenities",
|
||||
"load_amenity_counts",
|
||||
# CMHC crosswalk
|
||||
"build_cmhc_neighbourhood_crosswalk",
|
||||
"get_neighbourhood_weights_for_zone",
|
||||
"disaggregate_zone_value",
|
||||
]
|
||||
|
||||
93
portfolio_app/toronto/loaders/amenities.py
Normal file
93
portfolio_app/toronto/loaders/amenities.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""Loader for amenities data to fact_amenities table."""
|
||||
|
||||
from collections import Counter
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from portfolio_app.toronto.models import FactAmenities
|
||||
from portfolio_app.toronto.schemas import AmenityCount, AmenityRecord
|
||||
|
||||
from .base import get_session, upsert_by_key
|
||||
|
||||
|
||||
def load_amenities(
    records: list[AmenityRecord],
    year: int,
    session: Session | None = None,
) -> int:
    """Load amenity records to fact_amenities table.

    Aggregates individual amenity records into counts by neighbourhood
    and amenity type before loading, then upserts one row per
    (neighbourhood_id, amenity_type, year).

    Args:
        records: List of validated AmenityRecord schemas.
        year: Year to associate with the amenity counts.
        session: Optional existing session. When omitted, a session is
            obtained from ``get_session()`` for the duration of the load.

    Returns:
        Number of records loaded (inserted + updated).
    """
    # Tally occurrences of each (neighbourhood_id, amenity type value) pair.
    counts: Counter[tuple[int, str]] = Counter(
        (r.neighbourhood_id, r.amenity_type.value) for r in records
    )

    def _load(sess: Session) -> int:
        # One FactAmenities row per aggregated pair; the models are built
        # directly from the tallies.
        models = [
            FactAmenities(
                neighbourhood_id=neighbourhood_id,
                amenity_type=amenity_type,
                count=count,
                year=year,
            )
            for (neighbourhood_id, amenity_type), count in counts.items()
        ]

        inserted, updated = upsert_by_key(
            sess, FactAmenities, models, ["neighbourhood_id", "amenity_type", "year"]
        )
        return inserted + updated

    # Test for None explicitly so the caller's session is used whenever one
    # was supplied, regardless of how that object evaluates in a bool context.
    if session is not None:
        return _load(session)
    with get_session() as sess:
        return _load(sess)
|
||||
|
||||
|
||||
def load_amenity_counts(
    records: list[AmenityCount],
    session: Session | None = None,
) -> int:
    """Load pre-aggregated amenity counts to fact_amenities table.

    Each AmenityCount becomes one FactAmenities row, upserted on
    (neighbourhood_id, amenity_type, year).

    Args:
        records: List of validated AmenityCount schemas.
        session: Optional existing session. When omitted, a session is
            obtained from ``get_session()`` for the duration of the load.

    Returns:
        Number of records loaded (inserted + updated).
    """

    def _load(sess: Session) -> int:
        models = [
            FactAmenities(
                neighbourhood_id=r.neighbourhood_id,
                # Store the enum's underlying value, not the enum member.
                amenity_type=r.amenity_type.value,
                count=r.count,
                year=r.year,
            )
            for r in records
        ]

        inserted, updated = upsert_by_key(
            sess, FactAmenities, models, ["neighbourhood_id", "amenity_type", "year"]
        )
        return inserted + updated

    # Test for None explicitly so the caller's session is used whenever one
    # was supplied, regardless of how that object evaluates in a bool context.
    if session is not None:
        return _load(session)
    with get_session() as sess:
        return _load(sess)
|
||||
68
portfolio_app/toronto/loaders/census.py
Normal file
68
portfolio_app/toronto/loaders/census.py
Normal file
@@ -0,0 +1,68 @@
|
||||
"""Loader for census data to fact_census table."""
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from portfolio_app.toronto.models import FactCensus
|
||||
from portfolio_app.toronto.schemas import CensusRecord
|
||||
|
||||
from .base import get_session, upsert_by_key
|
||||
|
||||
|
||||
def load_census_data(
    records: list[CensusRecord],
    session: Session | None = None,
) -> int:
    """Load census records to fact_census table.

    Each CensusRecord becomes one FactCensus row, upserted on
    (neighbourhood_id, census_year). Optional numeric fields are converted
    to ``float``; only a missing value (``None``) is stored as ``None`` --
    a genuine zero (e.g. a 0.0 rate or percentage) is preserved.

    Args:
        records: List of validated CensusRecord schemas.
        session: Optional existing session. When omitted, a session is
            obtained from ``get_session()`` for the duration of the load.

    Returns:
        Number of records loaded (inserted + updated).
    """
    from typing import SupportsFloat

    def _optional_float(value: SupportsFloat | None) -> float | None:
        # Explicit None check: a truthiness test would also discard 0 / 0.0.
        return None if value is None else float(value)

    def _load(sess: Session) -> int:
        models = [
            FactCensus(
                neighbourhood_id=r.neighbourhood_id,
                census_year=r.census_year,
                population=r.population,
                population_density=_optional_float(r.population_density),
                median_household_income=_optional_float(r.median_household_income),
                average_household_income=_optional_float(
                    r.average_household_income
                ),
                unemployment_rate=_optional_float(r.unemployment_rate),
                pct_bachelors_or_higher=_optional_float(r.pct_bachelors_or_higher),
                pct_owner_occupied=_optional_float(r.pct_owner_occupied),
                pct_renter_occupied=_optional_float(r.pct_renter_occupied),
                median_age=_optional_float(r.median_age),
                average_dwelling_value=_optional_float(r.average_dwelling_value),
            )
            for r in records
        ]

        inserted, updated = upsert_by_key(
            sess, FactCensus, models, ["neighbourhood_id", "census_year"]
        )
        return inserted + updated

    # Test for None explicitly so the caller's session is used whenever one
    # was supplied, regardless of how that object evaluates in a bool context.
    if session is not None:
        return _load(session)
    with get_session() as sess:
        return _load(sess)
|
||||
131
portfolio_app/toronto/loaders/cmhc_crosswalk.py
Normal file
131
portfolio_app/toronto/loaders/cmhc_crosswalk.py
Normal file
@@ -0,0 +1,131 @@
|
||||
"""Loader for CMHC zone to neighbourhood crosswalk with area weights."""
|
||||
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from .base import get_session
|
||||
|
||||
|
||||
def build_cmhc_neighbourhood_crosswalk(
    session: Session | None = None,
) -> int:
    """Calculate area overlap weights between CMHC zones and neighbourhoods.

    Uses PostGIS ST_Intersection and ST_Area functions to compute the
    proportion of each CMHC zone that overlaps with each neighbourhood.
    This enables disaggregation of CMHC zone-level data to neighbourhood level.

    The function is idempotent - it clears existing crosswalk data before
    rebuilding.

    Args:
        session: Optional existing session. When omitted, a session is
            obtained from ``get_session()`` for the duration of the rebuild.

    Returns:
        Number of rows in bridge_cmhc_neighbourhood after the rebuild.

    Note:
        Requires both dim_cmhc_zone and dim_neighbourhood tables to have
        geometry columns populated with valid PostGIS geometries.
    """

    def _build(sess: Session) -> int:
        # Clear existing crosswalk data so a re-run does not duplicate rows.
        sess.execute(text("DELETE FROM bridge_cmhc_neighbourhood"))

        # Calculate overlap weights using PostGIS.
        # Weight = area of intersection / total area of CMHC zone.
        # The CASE guards the division against zero-area zones, and the final
        # WHERE condition drops pairs that intersect with no overlapping area.
        crosswalk_query = text(
            """
            INSERT INTO bridge_cmhc_neighbourhood (cmhc_zone_code, neighbourhood_id, weight)
            SELECT
                z.zone_code,
                n.neighbourhood_id,
                CASE
                    WHEN ST_Area(z.geometry::geography) > 0 THEN
                        ST_Area(ST_Intersection(z.geometry, n.geometry)::geography) /
                        ST_Area(z.geometry::geography)
                    ELSE 0
                END as weight
            FROM dim_cmhc_zone z
            JOIN dim_neighbourhood n
                ON ST_Intersects(z.geometry, n.geometry)
            WHERE
                z.geometry IS NOT NULL
                AND n.geometry IS NOT NULL
                AND ST_Area(ST_Intersection(z.geometry, n.geometry)::geography) > 0
            """
        )

        sess.execute(crosswalk_query)

        # Count records created (the table was emptied above, so every row
        # present now comes from this rebuild).
        count_result = sess.execute(
            text("SELECT COUNT(*) FROM bridge_cmhc_neighbourhood")
        )
        count = count_result.scalar() or 0

        return int(count)

    # Test for None explicitly so the caller's session is used whenever one
    # was supplied, regardless of how that object evaluates in a bool context.
    if session is not None:
        return _build(session)
    with get_session() as sess:
        return _build(sess)
|
||||
|
||||
|
||||
def get_neighbourhood_weights_for_zone(
    zone_code: str,
    session: Session | None = None,
) -> list[tuple[int, float]]:
    """Get neighbourhood weights for a specific CMHC zone.

    Reads the rows of bridge_cmhc_neighbourhood whose cmhc_zone_code matches
    ``zone_code``.

    Args:
        zone_code: CMHC zone code.
        session: Optional existing session. When omitted, a session is
            obtained from ``get_session()`` for the duration of the query.

    Returns:
        List of (neighbourhood_id, weight) tuples, ordered by weight
        descending. Empty when the zone has no crosswalk rows.
    """

    def _get(sess: Session) -> list[tuple[int, float]]:
        # zone_code is passed as a bound parameter, never interpolated.
        result = sess.execute(
            text(
                """
                SELECT neighbourhood_id, weight
                FROM bridge_cmhc_neighbourhood
                WHERE cmhc_zone_code = :zone_code
                ORDER BY weight DESC
                """
            ),
            {"zone_code": zone_code},
        )
        # Normalise database values to plain Python int/float.
        return [(int(row[0]), float(row[1])) for row in result]

    # Test for None explicitly so the caller's session is used whenever one
    # was supplied, regardless of how that object evaluates in a bool context.
    if session is not None:
        return _get(session)
    with get_session() as sess:
        return _get(sess)
|
||||
|
||||
|
||||
def disaggregate_zone_value(
    zone_code: str,
    value: float,
    session: Session | None = None,
) -> dict[int, float]:
    """Spread a CMHC zone-level value across its neighbourhoods by weight.

    Looks up the zone's (neighbourhood_id, weight) pairs and multiplies
    ``value`` by each weight.

    Args:
        zone_code: CMHC zone code.
        value: Value to disaggregate (e.g., average rent).
        session: Optional existing session, forwarded to the weight lookup.

    Returns:
        Dictionary mapping neighbourhood_id to ``value * weight``. Empty when
        the zone has no weights.

    Note:
        Each entry is this zone's contribution to the neighbourhood. To get a
        neighbourhood's total, sum contributions from all overlapping zones.

        NOTE(review): the crosswalk weights appear to be normalised per zone
        (share of the zone's area), not per neighbourhood -- confirm that
        summing contributions is the intended aggregation for averages such
        as rent.
    """
    contributions: dict[int, float] = {}
    for nbhd_id, share in get_neighbourhood_weights_for_zone(zone_code, session):
        contributions[nbhd_id] = value * share
    return contributions
|
||||
45
portfolio_app/toronto/loaders/crime.py
Normal file
45
portfolio_app/toronto/loaders/crime.py
Normal file
@@ -0,0 +1,45 @@
|
||||
"""Loader for crime data to fact_crime table."""
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from portfolio_app.toronto.models import FactCrime
|
||||
from portfolio_app.toronto.schemas import CrimeRecord
|
||||
|
||||
from .base import get_session, upsert_by_key
|
||||
|
||||
|
||||
def load_crime_data(
    records: list[CrimeRecord],
    session: Session | None = None,
) -> int:
    """Load crime records to fact_crime table.

    Each CrimeRecord becomes one FactCrime row, upserted on
    (neighbourhood_id, year, crime_type). ``rate_per_100k`` is converted to
    ``float``; only a missing rate (``None``) is stored as ``None`` -- a
    genuine rate of zero is preserved.

    Args:
        records: List of validated CrimeRecord schemas.
        session: Optional existing session. When omitted, a session is
            obtained from ``get_session()`` for the duration of the load.

    Returns:
        Number of records loaded (inserted + updated).
    """

    def _load(sess: Session) -> int:
        models = [
            FactCrime(
                neighbourhood_id=r.neighbourhood_id,
                year=r.year,
                # Store the enum's underlying value, not the enum member.
                crime_type=r.crime_type.value,
                count=r.count,
                # Explicit None check: a truthiness test would also turn a
                # legitimate 0 rate into None.
                rate_per_100k=(
                    float(r.rate_per_100k) if r.rate_per_100k is not None else None
                ),
            )
            for r in records
        ]

        inserted, updated = upsert_by_key(
            sess, FactCrime, models, ["neighbourhood_id", "year", "crime_type"]
        )
        return inserted + updated

    # Test for None explicitly so the caller's session is used whenever one
    # was supplied, regardless of how that object evaluates in a bool context.
    if session is not None:
        return _load(session)
    with get_session() as sess:
        return _load(sess)
|
||||
Reference in New Issue
Block a user