Merge pull request 'feat: Add GeoJSON parsers and choropleth map visualization' (#26) from feature/geo-parsers-choropleth into development

This commit was merged in pull request #26.
This commit is contained in:
2026-01-14 23:02:21 +00:00
8 changed files with 1923 additions and 49 deletions

View File

@@ -7,7 +7,7 @@ repos:
- id: check-yaml
- id: check-added-large-files
args: ['--maxkb=1000']
exclude: ^data/raw/
exclude: ^data/(raw/|toronto/raw/geo/)
- id: check-merge-conflict
- repo: https://github.com/astral-sh/ruff-pre-commit

View File

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@@ -1,9 +1,20 @@
"""Parsers for Toronto housing data sources."""
from .cmhc import CMHCParser
from .geo import (
CMHCZoneParser,
NeighbourhoodParser,
TRREBDistrictParser,
load_geojson,
)
from .trreb import TRREBParser
__all__ = [
"TRREBParser",
"CMHCParser",
# GeoJSON parsers
"CMHCZoneParser",
"TRREBDistrictParser",
"NeighbourhoodParser",
"load_geojson",
]

View File

@@ -0,0 +1,463 @@
"""GeoJSON parser for geographic boundary files.
This module provides parsers for loading geographic boundary files
(GeoJSON format) and converting them to Pydantic schemas for database
loading or direct use in Plotly choropleth maps.
"""
import json
from pathlib import Path
from typing import Any
from pyproj import Transformer
from shapely.geometry import mapping, shape
from shapely.ops import transform
from portfolio_app.toronto.schemas import CMHCZone, Neighbourhood, TRREBDistrict
from portfolio_app.toronto.schemas.dimensions import AreaType
# Transformer for reprojecting from Web Mercator to WGS84
_TRANSFORMER_3857_TO_4326 = Transformer.from_crs(
"EPSG:3857", "EPSG:4326", always_xy=True
)
def load_geojson(path: Path) -> dict[str, Any]:
"""Load a GeoJSON file and return as dictionary.
Args:
path: Path to the GeoJSON file.
Returns:
GeoJSON as dictionary (FeatureCollection).
Raises:
FileNotFoundError: If file does not exist.
ValueError: If file is not valid GeoJSON.
"""
if not path.exists():
raise FileNotFoundError(f"GeoJSON file not found: {path}")
if path.suffix.lower() not in (".geojson", ".json"):
raise ValueError(f"Expected GeoJSON file, got: {path.suffix}")
with open(path, encoding="utf-8") as f:
data = json.load(f)
if data.get("type") != "FeatureCollection":
raise ValueError("GeoJSON must be a FeatureCollection")
return dict(data)
def geometry_to_wkt(geometry: dict[str, Any]) -> str:
"""Convert GeoJSON geometry to WKT string.
Args:
geometry: GeoJSON geometry dictionary.
Returns:
WKT representation of the geometry.
"""
return str(shape(geometry).wkt)
def reproject_geometry(
geometry: dict[str, Any], source_crs: str = "EPSG:3857"
) -> dict[str, Any]:
"""Reproject a GeoJSON geometry to WGS84 (EPSG:4326).
Args:
geometry: GeoJSON geometry dictionary.
source_crs: Source CRS (default EPSG:3857 Web Mercator).
Returns:
GeoJSON geometry in WGS84 coordinates.
"""
if source_crs == "EPSG:3857":
transformer = _TRANSFORMER_3857_TO_4326
else:
transformer = Transformer.from_crs(source_crs, "EPSG:4326", always_xy=True)
geom = shape(geometry)
reprojected = transform(transformer.transform, geom)
return dict(mapping(reprojected))
class CMHCZoneParser:
"""Parser for CMHC zone boundary GeoJSON files.
CMHC zone boundaries are extracted from the R `cmhc` package using
`get_cmhc_geography(geography_type="ZONE", cma="Toronto")`.
Expected GeoJSON properties:
- zone_code or Zone_Code: Zone identifier
- zone_name or Zone_Name: Zone name
"""
# Property name mappings for different GeoJSON formats
CODE_PROPERTIES = ["zone_code", "Zone_Code", "ZONE_CODE", "zonecode", "code"]
NAME_PROPERTIES = [
"zone_name",
"Zone_Name",
"ZONE_NAME",
"ZONE_NAME_EN",
"NAME_EN",
"zonename",
"name",
"NAME",
]
def __init__(self, geojson_path: Path) -> None:
"""Initialize parser with path to GeoJSON file.
Args:
geojson_path: Path to the CMHC zones GeoJSON file.
"""
self.geojson_path = geojson_path
self._geojson: dict[str, Any] | None = None
@property
def geojson(self) -> dict[str, Any]:
"""Lazy-load and return raw GeoJSON data."""
if self._geojson is None:
self._geojson = load_geojson(self.geojson_path)
return self._geojson
def _find_property(
self, properties: dict[str, Any], candidates: list[str]
) -> str | None:
"""Find a property value by checking multiple candidate names."""
for name in candidates:
if name in properties and properties[name] is not None:
return str(properties[name])
return None
def parse(self) -> list[CMHCZone]:
"""Parse GeoJSON and return list of CMHCZone schemas.
Returns:
List of validated CMHCZone objects.
Raises:
ValueError: If required properties are missing.
"""
zones = []
for feature in self.geojson.get("features", []):
props = feature.get("properties", {})
geom = feature.get("geometry")
zone_code = self._find_property(props, self.CODE_PROPERTIES)
zone_name = self._find_property(props, self.NAME_PROPERTIES)
if not zone_code:
raise ValueError(
f"Zone code not found in properties: {list(props.keys())}"
)
if not zone_name:
zone_name = zone_code # Fallback to code if name missing
geometry_wkt = geometry_to_wkt(geom) if geom else None
zones.append(
CMHCZone(
zone_code=zone_code,
zone_name=zone_name,
geometry_wkt=geometry_wkt,
)
)
return zones
def _needs_reprojection(self) -> bool:
"""Check if GeoJSON needs reprojection to WGS84."""
crs = self.geojson.get("crs", {})
crs_name = crs.get("properties", {}).get("name", "")
# EPSG:3857 or Web Mercator needs reprojection
return "3857" in crs_name or "900913" in crs_name
def get_geojson_for_choropleth(
self, key_property: str = "zone_code"
) -> dict[str, Any]:
"""Get GeoJSON formatted for Plotly choropleth maps.
Ensures the feature properties include a standardized key for
joining with data. Automatically reprojects from EPSG:3857 to
WGS84 if needed.
Args:
key_property: Property name to use as feature identifier.
Returns:
GeoJSON FeatureCollection with standardized properties in WGS84.
"""
needs_reproject = self._needs_reprojection()
features = []
for feature in self.geojson.get("features", []):
props = feature.get("properties", {})
new_props = dict(props)
# Ensure standardized property names exist
zone_code = self._find_property(props, self.CODE_PROPERTIES)
zone_name = self._find_property(props, self.NAME_PROPERTIES)
new_props["zone_code"] = zone_code
new_props["zone_name"] = zone_name or zone_code
# Reproject geometry if needed
geometry = feature.get("geometry")
if needs_reproject and geometry:
geometry = reproject_geometry(geometry)
features.append(
{
"type": "Feature",
"properties": new_props,
"geometry": geometry,
}
)
return {"type": "FeatureCollection", "features": features}
class TRREBDistrictParser:
"""Parser for TRREB district boundary GeoJSON files.
TRREB district boundaries are manually digitized from the TRREB PDF map
using QGIS.
Expected GeoJSON properties:
- district_code: District code (W01, C01, E01, etc.)
- district_name: District name
- area_type: West, Central, East, or North
"""
CODE_PROPERTIES = [
"district_code",
"District_Code",
"DISTRICT_CODE",
"districtcode",
"code",
]
NAME_PROPERTIES = [
"district_name",
"District_Name",
"DISTRICT_NAME",
"districtname",
"name",
"NAME",
]
AREA_PROPERTIES = [
"area_type",
"Area_Type",
"AREA_TYPE",
"areatype",
"area",
"type",
]
def __init__(self, geojson_path: Path) -> None:
"""Initialize parser with path to GeoJSON file."""
self.geojson_path = geojson_path
self._geojson: dict[str, Any] | None = None
@property
def geojson(self) -> dict[str, Any]:
"""Lazy-load and return raw GeoJSON data."""
if self._geojson is None:
self._geojson = load_geojson(self.geojson_path)
return self._geojson
def _find_property(
self, properties: dict[str, Any], candidates: list[str]
) -> str | None:
"""Find a property value by checking multiple candidate names."""
for name in candidates:
if name in properties and properties[name] is not None:
return str(properties[name])
return None
def _infer_area_type(self, district_code: str) -> AreaType:
"""Infer area type from district code prefix."""
prefix = district_code[0].upper()
mapping = {"W": AreaType.WEST, "C": AreaType.CENTRAL, "E": AreaType.EAST}
return mapping.get(prefix, AreaType.NORTH)
def parse(self) -> list[TRREBDistrict]:
"""Parse GeoJSON and return list of TRREBDistrict schemas."""
districts = []
for feature in self.geojson.get("features", []):
props = feature.get("properties", {})
geom = feature.get("geometry")
district_code = self._find_property(props, self.CODE_PROPERTIES)
district_name = self._find_property(props, self.NAME_PROPERTIES)
area_type_str = self._find_property(props, self.AREA_PROPERTIES)
if not district_code:
raise ValueError(
f"District code not found in properties: {list(props.keys())}"
)
if not district_name:
district_name = district_code
# Infer or parse area type
if area_type_str:
try:
area_type = AreaType(area_type_str)
except ValueError:
area_type = self._infer_area_type(district_code)
else:
area_type = self._infer_area_type(district_code)
geometry_wkt = geometry_to_wkt(geom) if geom else None
districts.append(
TRREBDistrict(
district_code=district_code,
district_name=district_name,
area_type=area_type,
geometry_wkt=geometry_wkt,
)
)
return districts
def get_geojson_for_choropleth(
self, key_property: str = "district_code"
) -> dict[str, Any]:
"""Get GeoJSON formatted for Plotly choropleth maps."""
features = []
for feature in self.geojson.get("features", []):
props = feature.get("properties", {})
new_props = dict(props)
district_code = self._find_property(props, self.CODE_PROPERTIES)
district_name = self._find_property(props, self.NAME_PROPERTIES)
new_props["district_code"] = district_code
new_props["district_name"] = district_name or district_code
features.append(
{
"type": "Feature",
"properties": new_props,
"geometry": feature.get("geometry"),
}
)
return {"type": "FeatureCollection", "features": features}
class NeighbourhoodParser:
"""Parser for City of Toronto neighbourhood boundary GeoJSON files.
Neighbourhood boundaries are from the City of Toronto Open Data portal.
Expected GeoJSON properties:
- neighbourhood_id or AREA_ID: Neighbourhood ID (1-158)
- name or AREA_NAME: Neighbourhood name
"""
ID_PROPERTIES = [
"neighbourhood_id",
"AREA_SHORT_CODE", # City of Toronto 158 neighbourhoods
"AREA_LONG_CODE",
"AREA_ID",
"area_id",
"id",
"ID",
"HOOD_ID",
]
NAME_PROPERTIES = [
"AREA_NAME", # City of Toronto 158 neighbourhoods
"name",
"NAME",
"area_name",
"neighbourhood_name",
]
def __init__(self, geojson_path: Path) -> None:
"""Initialize parser with path to GeoJSON file."""
self.geojson_path = geojson_path
self._geojson: dict[str, Any] | None = None
@property
def geojson(self) -> dict[str, Any]:
"""Lazy-load and return raw GeoJSON data."""
if self._geojson is None:
self._geojson = load_geojson(self.geojson_path)
return self._geojson
def _find_property(
self, properties: dict[str, Any], candidates: list[str]
) -> str | None:
"""Find a property value by checking multiple candidate names."""
for name in candidates:
if name in properties and properties[name] is not None:
return str(properties[name])
return None
def parse(self) -> list[Neighbourhood]:
"""Parse GeoJSON and return list of Neighbourhood schemas.
Note: This parser only extracts ID, name, and geometry.
Census enrichment data (population, income, etc.) should be
loaded separately and merged.
"""
neighbourhoods = []
for feature in self.geojson.get("features", []):
props = feature.get("properties", {})
geom = feature.get("geometry")
neighbourhood_id_str = self._find_property(props, self.ID_PROPERTIES)
name = self._find_property(props, self.NAME_PROPERTIES)
if not neighbourhood_id_str:
raise ValueError(
f"Neighbourhood ID not found in properties: {list(props.keys())}"
)
neighbourhood_id = int(neighbourhood_id_str)
if not name:
name = f"Neighbourhood {neighbourhood_id}"
geometry_wkt = geometry_to_wkt(geom) if geom else None
neighbourhoods.append(
Neighbourhood(
neighbourhood_id=neighbourhood_id,
name=name,
geometry_wkt=geometry_wkt,
)
)
return neighbourhoods
def get_geojson_for_choropleth(
self, key_property: str = "neighbourhood_id"
) -> dict[str, Any]:
"""Get GeoJSON formatted for Plotly choropleth maps."""
features = []
for feature in self.geojson.get("features", []):
props = feature.get("properties", {})
new_props = dict(props)
neighbourhood_id = self._find_property(props, self.ID_PROPERTIES)
name = self._find_property(props, self.NAME_PROPERTIES)
new_props["neighbourhood_id"] = (
int(neighbourhood_id) if neighbourhood_id else None
)
new_props["name"] = name
features.append(
{
"type": "Feature",
"properties": new_props,
"geometry": feature.get("geometry"),
}
)
return {"type": "FeatureCollection", "features": features}

View File

@@ -0,0 +1,6 @@
"""Placeholder test to ensure pytest collection succeeds."""
def test_placeholder():
"""Remove this once real tests are added."""
assert True