feat: add GeoJSON parsers and choropleth map visualization
- Add geo.py parser module with CMHCZoneParser, TRREBDistrictParser, and NeighbourhoodParser for loading geographic boundaries
- Add coordinate reprojection support (EPSG:3857 to WGS84)
- Organize geo data in data/toronto/raw/geo/ directory
- Add CMHC zones GeoJSON (31 zones) for rental market choropleth
- Add Toronto neighbourhoods GeoJSON (158) as purchase market proxy
- Update callbacks with real CMHC 2024 rental data
- Add sample purchase data for all 158 neighbourhoods
- Update pre-commit config to exclude geo data files

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -7,7 +7,7 @@ repos:
|
||||
- id: check-yaml
|
||||
- id: check-added-large-files
|
||||
args: ['--maxkb=1000']
|
||||
exclude: ^data/raw/
|
||||
exclude: ^data/(raw/|toronto/raw/geo/)
|
||||
- id: check-merge-conflict
|
||||
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
|
||||
0
data/toronto/raw/geo/.gitkeep
Normal file
0
data/toronto/raw/geo/.gitkeep
Normal file
1
data/toronto/raw/geo/toronto_neighbourhoods.geojson
Normal file
1
data/toronto/raw/geo/toronto_neighbourhoods.geojson
Normal file
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
@@ -1,9 +1,20 @@
|
||||
"""Parsers for Toronto housing data sources."""
|
||||
|
||||
from .cmhc import CMHCParser
|
||||
from .geo import (
|
||||
CMHCZoneParser,
|
||||
NeighbourhoodParser,
|
||||
TRREBDistrictParser,
|
||||
load_geojson,
|
||||
)
|
||||
from .trreb import TRREBParser
|
||||
|
||||
# Public API of the parsers package: every name imported above is re-exported.
__all__ = [
    # Housing data source parsers
    "TRREBParser",
    "CMHCParser",
    # GeoJSON boundary parsers and the raw GeoJSON loader
    "CMHCZoneParser",
    "TRREBDistrictParser",
    "NeighbourhoodParser",
    "load_geojson",
]
|
||||
|
||||
463
portfolio_app/toronto/parsers/geo.py
Normal file
463
portfolio_app/toronto/parsers/geo.py
Normal file
@@ -0,0 +1,463 @@
|
||||
"""GeoJSON parser for geographic boundary files.
|
||||
|
||||
This module provides parsers for loading geographic boundary files
|
||||
(GeoJSON format) and converting them to Pydantic schemas for database
|
||||
loading or direct use in Plotly choropleth maps.
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from pyproj import Transformer
|
||||
from shapely.geometry import mapping, shape
|
||||
from shapely.ops import transform
|
||||
|
||||
from portfolio_app.toronto.schemas import CMHCZone, Neighbourhood, TRREBDistrict
|
||||
from portfolio_app.toronto.schemas.dimensions import AreaType
|
||||
|
||||
# Shared transformer for reprojecting Web Mercator (EPSG:3857) coordinates to
# WGS84 (EPSG:4326). It is built once at import time so reproject_geometry()
# can reuse it for its default source CRS instead of constructing a new
# Transformer on every call. always_xy=True keeps coordinates in x/y
# (longitude, latitude) order on both sides of the transform.
_TRANSFORMER_3857_TO_4326 = Transformer.from_crs(
    "EPSG:3857", "EPSG:4326", always_xy=True
)
|
||||
|
||||
|
||||
def load_geojson(path: Path) -> dict[str, Any]:
    """Load a GeoJSON FeatureCollection from disk.

    Args:
        path: Path to the GeoJSON file (``.geojson`` or ``.json`` suffix,
            compared case-insensitively).

    Returns:
        The parsed FeatureCollection as a dictionary (a shallow copy of the
        decoded JSON object).

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the suffix is not ``.geojson``/``.json``, the file is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``
            subclass), or the top-level value is not a FeatureCollection
            object.
    """
    if not path.exists():
        raise FileNotFoundError(f"GeoJSON file not found: {path}")

    if path.suffix.lower() not in (".geojson", ".json"):
        raise ValueError(f"Expected GeoJSON file, got: {path.suffix}")

    # "utf-8-sig" reads plain UTF-8 unchanged and additionally strips a
    # leading byte-order mark, which json.load() would otherwise reject.
    with open(path, encoding="utf-8-sig") as f:
        data = json.load(f)

    # Valid JSON can be a list, string, number, ... at the top level; only an
    # object supports .get(), so check the type before inspecting "type".
    if not isinstance(data, dict) or data.get("type") != "FeatureCollection":
        raise ValueError("GeoJSON must be a FeatureCollection")

    return dict(data)
|
||||
|
||||
|
||||
def geometry_to_wkt(geometry: dict[str, Any]) -> str:
    """Return the WKT text for a GeoJSON geometry.

    Args:
        geometry: GeoJSON geometry dictionary.

    Returns:
        WKT representation of the geometry.
    """
    # Build the shapely geometry first, then serialise it via its .wkt attribute.
    shapely_geom = shape(geometry)
    wkt_text = shapely_geom.wkt
    return str(wkt_text)
|
||||
|
||||
|
||||
def reproject_geometry(
    geometry: dict[str, Any], source_crs: str = "EPSG:3857"
) -> dict[str, Any]:
    """Convert a GeoJSON geometry from ``source_crs`` to WGS84 (EPSG:4326).

    Args:
        geometry: GeoJSON geometry dictionary.
        source_crs: CRS the input coordinates are expressed in. Defaults to
            EPSG:3857 (Web Mercator).

    Returns:
        GeoJSON geometry dictionary in WGS84 coordinates.
    """
    # The default CRS reuses the module-level transformer; any other source
    # CRS gets a transformer built for this call.
    transformer = (
        _TRANSFORMER_3857_TO_4326
        if source_crs == "EPSG:3857"
        else Transformer.from_crs(source_crs, "EPSG:4326", always_xy=True)
    )

    source_shape = shape(geometry)
    wgs84_shape = transform(transformer.transform, source_shape)
    return dict(mapping(wgs84_shape))
|
||||
|
||||
|
||||
class CMHCZoneParser:
    """Parser for CMHC zone boundary GeoJSON files.

    CMHC zone boundaries are extracted from the R `cmhc` package using
    `get_cmhc_geography(geography_type="ZONE", cma="Toronto")`.

    Property names differ between exports, so the zone code and zone name
    are looked up under several candidate keys (``CODE_PROPERTIES`` and
    ``NAME_PROPERTIES``). Candidates are tried in list order and the first
    one holding a non-null value wins.
    """

    # Candidate property names for the zone identifier, in lookup order.
    CODE_PROPERTIES = ["zone_code", "Zone_Code", "ZONE_CODE", "zonecode", "code"]
    # Candidate property names for the zone display name, in lookup order.
    NAME_PROPERTIES = [
        "zone_name",
        "Zone_Name",
        "ZONE_NAME",
        "ZONE_NAME_EN",
        "NAME_EN",
        "zonename",
        "name",
        "NAME",
    ]

    def __init__(self, geojson_path: Path) -> None:
        """Initialize parser with path to GeoJSON file.

        The file is not read here; it is loaded on first access to
        :attr:`geojson`.

        Args:
            geojson_path: Path to the CMHC zones GeoJSON file.
        """
        self.geojson_path = geojson_path
        self._geojson: dict[str, Any] | None = None

    @property
    def geojson(self) -> dict[str, Any]:
        """Lazy-load and return raw GeoJSON data (cached after first load)."""
        if self._geojson is None:
            self._geojson = load_geojson(self.geojson_path)
        return self._geojson

    def _find_property(
        self, properties: dict[str, Any], candidates: list[str]
    ) -> str | None:
        """Return the first non-null candidate property as a string.

        Args:
            properties: Feature ``properties`` dictionary.
            candidates: Property names to try, in priority order.

        Returns:
            ``str(value)`` of the first candidate that is present and not
            ``None``, or ``None`` when no candidate matches.
        """
        for name in candidates:
            if name in properties and properties[name] is not None:
                return str(properties[name])
        return None

    def parse(self) -> list[CMHCZone]:
        """Parse GeoJSON and return list of CMHCZone schemas.

        The zone name falls back to the zone code when no name property is
        found. Features without a geometry get ``geometry_wkt=None``.

        Returns:
            List of CMHCZone objects, one per feature, in file order.

        Raises:
            ValueError: If a feature has no usable zone code property.
        """
        # NOTE(review): geometries are converted to WKT as-is, without the
        # reprojection applied in get_geojson_for_choropleth() — confirm the
        # consumer expects coordinates in the file's own CRS.
        zones = []
        for feature in self.geojson.get("features") or []:
            # GeoJSON permits "properties": null; treat that as "no properties".
            props = feature.get("properties") or {}
            geom = feature.get("geometry")

            zone_code = self._find_property(props, self.CODE_PROPERTIES)
            if not zone_code:
                raise ValueError(
                    f"Zone code not found in properties: {list(props.keys())}"
                )
            # Fallback to code if name missing
            zone_name = self._find_property(props, self.NAME_PROPERTIES) or zone_code

            zones.append(
                CMHCZone(
                    zone_code=zone_code,
                    zone_name=zone_name,
                    geometry_wkt=geometry_to_wkt(geom) if geom else None,
                )
            )

        return zones

    def _needs_reprojection(self) -> bool:
        """Check if GeoJSON needs reprojection to WGS84.

        Returns:
            ``True`` when the collection's ``crs.properties.name`` mentions
            EPSG code 3857 or 900913 (Web Mercator); ``False`` otherwise,
            including when the ``crs`` member is missing, null or malformed.
        """
        crs = self.geojson.get("crs")
        if not isinstance(crs, dict):
            return False
        crs_props = crs.get("properties")
        if not isinstance(crs_props, dict):
            return False
        crs_name = str(crs_props.get("name") or "")
        # EPSG:3857 or Web Mercator needs reprojection
        return "3857" in crs_name or "900913" in crs_name

    def get_geojson_for_choropleth(
        self, key_property: str = "zone_code"
    ) -> dict[str, Any]:
        """Get GeoJSON formatted for Plotly choropleth maps.

        Every output feature keeps its original properties and additionally
        carries the standardized ``zone_code`` and ``zone_name`` keys
        (``zone_name`` falls back to the code; both are ``None`` when the
        feature has no matching property). Geometries are reprojected from
        EPSG:3857 to WGS84 when the file declares a Web Mercator CRS.

        Args:
            key_property: Property name to use as feature identifier.
                NOTE(review): this argument is currently not read; the
                identifier is always written to ``zone_code``. It is kept so
                existing callers continue to work.

        Returns:
            GeoJSON FeatureCollection with standardized properties in WGS84.
        """
        needs_reproject = self._needs_reprojection()
        features = []

        for feature in self.geojson.get("features") or []:
            # GeoJSON permits "properties": null; treat that as "no properties".
            props = feature.get("properties") or {}
            new_props = dict(props)

            # Ensure standardized property names exist
            zone_code = self._find_property(props, self.CODE_PROPERTIES)
            zone_name = self._find_property(props, self.NAME_PROPERTIES)
            new_props["zone_code"] = zone_code
            new_props["zone_name"] = zone_name or zone_code

            # Reproject geometry if needed
            geometry = feature.get("geometry")
            if needs_reproject and geometry:
                geometry = reproject_geometry(geometry)

            features.append(
                {
                    "type": "Feature",
                    "properties": new_props,
                    "geometry": geometry,
                }
            )

        return {"type": "FeatureCollection", "features": features}
|
||||
|
||||
|
||||
class TRREBDistrictParser:
    """Parser for TRREB district boundary GeoJSON files.

    TRREB district boundaries are manually digitized from the TRREB PDF map
    using QGIS.

    Expected GeoJSON properties:
        - district_code: District code (W01, C01, E01, etc.)
        - district_name: District name
        - area_type: West, Central, East, or North

    Each property is looked up under several candidate keys (see the
    ``*_PROPERTIES`` lists); the first candidate with a non-null value wins.
    """

    # Candidate property names for the district code, in lookup order.
    CODE_PROPERTIES = [
        "district_code",
        "District_Code",
        "DISTRICT_CODE",
        "districtcode",
        "code",
    ]
    # Candidate property names for the district display name, in lookup order.
    NAME_PROPERTIES = [
        "district_name",
        "District_Name",
        "DISTRICT_NAME",
        "districtname",
        "name",
        "NAME",
    ]
    # Candidate property names for the area type label, in lookup order.
    AREA_PROPERTIES = [
        "area_type",
        "Area_Type",
        "AREA_TYPE",
        "areatype",
        "area",
        "type",
    ]

    def __init__(self, geojson_path: Path) -> None:
        """Initialize parser with path to GeoJSON file.

        The file is not read here; it is loaded on first access to
        :attr:`geojson`.

        Args:
            geojson_path: Path to the TRREB districts GeoJSON file.
        """
        self.geojson_path = geojson_path
        self._geojson: dict[str, Any] | None = None

    @property
    def geojson(self) -> dict[str, Any]:
        """Lazy-load and return raw GeoJSON data (cached after first load)."""
        if self._geojson is None:
            self._geojson = load_geojson(self.geojson_path)
        return self._geojson

    def _find_property(
        self, properties: dict[str, Any], candidates: list[str]
    ) -> str | None:
        """Return the first non-null candidate property as a string.

        Args:
            properties: Feature ``properties`` dictionary.
            candidates: Property names to try, in priority order.

        Returns:
            ``str(value)`` of the first candidate that is present and not
            ``None``, or ``None`` when no candidate matches.
        """
        for name in candidates:
            if name in properties and properties[name] is not None:
                return str(properties[name])
        return None

    def _infer_area_type(self, district_code: str) -> AreaType:
        """Infer area type from district code prefix.

        Args:
            district_code: Non-empty district code such as ``"W01"``.

        Returns:
            WEST, CENTRAL or EAST for the prefixes ``W``, ``C`` and ``E``
            (case-insensitive); NORTH for any other prefix.
        """
        prefix = district_code[0].upper()
        # Named to avoid shadowing the shapely ``mapping`` import of this module.
        area_by_prefix = {
            "W": AreaType.WEST,
            "C": AreaType.CENTRAL,
            "E": AreaType.EAST,
        }
        return area_by_prefix.get(prefix, AreaType.NORTH)

    def _needs_reprojection(self) -> bool:
        """Check if GeoJSON needs reprojection to WGS84.

        Returns:
            ``True`` when the collection's ``crs.properties.name`` mentions
            EPSG code 3857 or 900913 (Web Mercator); ``False`` otherwise,
            including when the ``crs`` member is missing, null or malformed.
        """
        crs = self.geojson.get("crs")
        if not isinstance(crs, dict):
            return False
        crs_props = crs.get("properties")
        if not isinstance(crs_props, dict):
            return False
        crs_name = str(crs_props.get("name") or "")
        return "3857" in crs_name or "900913" in crs_name

    def parse(self) -> list[TRREBDistrict]:
        """Parse GeoJSON and return list of TRREBDistrict schemas.

        The district name falls back to the district code when no name
        property is found. The area type is taken from the feature when it is
        a valid ``AreaType`` value and is otherwise inferred from the code
        prefix. Features without a geometry get ``geometry_wkt=None``.

        Returns:
            List of TRREBDistrict objects, one per feature, in file order.

        Raises:
            ValueError: If a feature has no usable district code property.
        """
        districts = []
        for feature in self.geojson.get("features") or []:
            # GeoJSON permits "properties": null; treat that as "no properties".
            props = feature.get("properties") or {}
            geom = feature.get("geometry")

            district_code = self._find_property(props, self.CODE_PROPERTIES)
            if not district_code:
                raise ValueError(
                    f"District code not found in properties: {list(props.keys())}"
                )
            district_name = (
                self._find_property(props, self.NAME_PROPERTIES) or district_code
            )

            # Infer or parse area type
            area_type_str = self._find_property(props, self.AREA_PROPERTIES)
            if area_type_str:
                try:
                    area_type = AreaType(area_type_str)
                except ValueError:
                    # Label is not an AreaType value: fall back to the prefix.
                    area_type = self._infer_area_type(district_code)
            else:
                area_type = self._infer_area_type(district_code)

            districts.append(
                TRREBDistrict(
                    district_code=district_code,
                    district_name=district_name,
                    area_type=area_type,
                    geometry_wkt=geometry_to_wkt(geom) if geom else None,
                )
            )

        return districts

    def get_geojson_for_choropleth(
        self, key_property: str = "district_code"
    ) -> dict[str, Any]:
        """Get GeoJSON formatted for Plotly choropleth maps.

        Every output feature keeps its original properties and additionally
        carries the standardized ``district_code`` and ``district_name`` keys
        (``district_name`` falls back to the code; both are ``None`` when the
        feature has no matching property). Geometries are reprojected from
        EPSG:3857 to WGS84 when the file declares a Web Mercator CRS, matching
        the behaviour of ``CMHCZoneParser``.

        Args:
            key_property: Property name to use as feature identifier.
                NOTE(review): this argument is currently not read; the
                identifier is always written to ``district_code``. It is kept
                so existing callers continue to work.

        Returns:
            GeoJSON FeatureCollection with standardized properties.
        """
        needs_reproject = self._needs_reprojection()
        features = []

        for feature in self.geojson.get("features") or []:
            # GeoJSON permits "properties": null; treat that as "no properties".
            props = feature.get("properties") or {}
            new_props = dict(props)

            district_code = self._find_property(props, self.CODE_PROPERTIES)
            district_name = self._find_property(props, self.NAME_PROPERTIES)
            new_props["district_code"] = district_code
            new_props["district_name"] = district_name or district_code

            geometry = feature.get("geometry")
            if needs_reproject and geometry:
                geometry = reproject_geometry(geometry)

            features.append(
                {
                    "type": "Feature",
                    "properties": new_props,
                    "geometry": geometry,
                }
            )

        return {"type": "FeatureCollection", "features": features}
|
||||
|
||||
|
||||
class NeighbourhoodParser:
    """Parser for City of Toronto neighbourhood boundary GeoJSON files.

    Neighbourhood boundaries are from the City of Toronto Open Data portal.

    Expected GeoJSON properties:
        - An integer neighbourhood ID under one of ``ID_PROPERTIES``
          (``neighbourhood_id`` is tried first, then ``AREA_SHORT_CODE``,
          ``AREA_LONG_CODE``, ``AREA_ID`` and further fallbacks).
        - A neighbourhood name under one of ``NAME_PROPERTIES``
          (``AREA_NAME`` is tried first).

    Candidates are tried in list order and the first one holding a non-null
    value wins.
    """

    # Candidate property names for the neighbourhood ID, in lookup order.
    ID_PROPERTIES = [
        "neighbourhood_id",
        "AREA_SHORT_CODE",  # City of Toronto 158 neighbourhoods
        "AREA_LONG_CODE",
        "AREA_ID",
        "area_id",
        "id",
        "ID",
        "HOOD_ID",
    ]
    # Candidate property names for the neighbourhood name, in lookup order.
    NAME_PROPERTIES = [
        "AREA_NAME",  # City of Toronto 158 neighbourhoods
        "name",
        "NAME",
        "area_name",
        "neighbourhood_name",
    ]

    def __init__(self, geojson_path: Path) -> None:
        """Initialize parser with path to GeoJSON file.

        The file is not read here; it is loaded on first access to
        :attr:`geojson`.

        Args:
            geojson_path: Path to the neighbourhoods GeoJSON file.
        """
        self.geojson_path = geojson_path
        self._geojson: dict[str, Any] | None = None

    @property
    def geojson(self) -> dict[str, Any]:
        """Lazy-load and return raw GeoJSON data (cached after first load)."""
        if self._geojson is None:
            self._geojson = load_geojson(self.geojson_path)
        return self._geojson

    def _find_property(
        self, properties: dict[str, Any], candidates: list[str]
    ) -> str | None:
        """Return the first non-null candidate property as a string.

        Args:
            properties: Feature ``properties`` dictionary.
            candidates: Property names to try, in priority order.

        Returns:
            ``str(value)`` of the first candidate that is present and not
            ``None``, or ``None`` when no candidate matches.
        """
        for name in candidates:
            if name in properties and properties[name] is not None:
                return str(properties[name])
        return None

    @staticmethod
    def _parse_neighbourhood_id(raw: str) -> int:
        """Convert a stringified ID property to ``int``.

        Accepts plain integers (``"7"``, ``"007"``) and integral floats
        (``"7.0"``), which arise when the ID is stored as a JSON float and
        then stringified by ``_find_property``.

        Args:
            raw: ID text as returned by ``_find_property``.

        Returns:
            The ID as an integer.

        Raises:
            ValueError: If ``raw`` does not represent a whole number.
        """
        try:
            return int(raw)
        except ValueError:
            pass
        try:
            as_float = float(raw)
        except ValueError:
            raise ValueError(f"Neighbourhood ID is not an integer: {raw!r}") from None
        # is_integer() is False for nan/inf as well as fractional values.
        if not as_float.is_integer():
            raise ValueError(f"Neighbourhood ID is not an integer: {raw!r}")
        return int(as_float)

    def _needs_reprojection(self) -> bool:
        """Check if GeoJSON needs reprojection to WGS84.

        Returns:
            ``True`` when the collection's ``crs.properties.name`` mentions
            EPSG code 3857 or 900913 (Web Mercator); ``False`` otherwise,
            including when the ``crs`` member is missing, null or malformed.
        """
        crs = self.geojson.get("crs")
        if not isinstance(crs, dict):
            return False
        crs_props = crs.get("properties")
        if not isinstance(crs_props, dict):
            return False
        crs_name = str(crs_props.get("name") or "")
        return "3857" in crs_name or "900913" in crs_name

    def parse(self) -> list[Neighbourhood]:
        """Parse GeoJSON and return list of Neighbourhood schemas.

        Note: This parser only extracts ID, name, and geometry.
        Census enrichment data (population, income, etc.) should be
        loaded separately and merged.

        A missing name is replaced by ``"Neighbourhood <id>"``. Features
        without a geometry get ``geometry_wkt=None``.

        Returns:
            List of Neighbourhood objects, one per feature, in file order.

        Raises:
            ValueError: If a feature has no ID property or the ID is not a
                whole number.
        """
        neighbourhoods = []
        for feature in self.geojson.get("features") or []:
            # GeoJSON permits "properties": null; treat that as "no properties".
            props = feature.get("properties") or {}
            geom = feature.get("geometry")

            neighbourhood_id_str = self._find_property(props, self.ID_PROPERTIES)
            if not neighbourhood_id_str:
                raise ValueError(
                    f"Neighbourhood ID not found in properties: {list(props.keys())}"
                )
            neighbourhood_id = self._parse_neighbourhood_id(neighbourhood_id_str)

            name = self._find_property(props, self.NAME_PROPERTIES)
            if not name:
                name = f"Neighbourhood {neighbourhood_id}"

            neighbourhoods.append(
                Neighbourhood(
                    neighbourhood_id=neighbourhood_id,
                    name=name,
                    geometry_wkt=geometry_to_wkt(geom) if geom else None,
                )
            )

        return neighbourhoods

    def get_geojson_for_choropleth(
        self, key_property: str = "neighbourhood_id"
    ) -> dict[str, Any]:
        """Get GeoJSON formatted for Plotly choropleth maps.

        Every output feature keeps its original properties and additionally
        carries an integer ``neighbourhood_id`` (``None`` when no ID property
        is found) and ``name`` (``None`` when no name property is found).
        Geometries are reprojected from EPSG:3857 to WGS84 when the file
        declares a Web Mercator CRS, matching the behaviour of
        ``CMHCZoneParser``.

        Args:
            key_property: Property name to use as feature identifier.
                NOTE(review): this argument is currently not read; the
                identifier is always written to ``neighbourhood_id``. It is
                kept so existing callers continue to work.

        Returns:
            GeoJSON FeatureCollection with standardized properties.

        Raises:
            ValueError: If a feature's ID property is not a whole number.
        """
        needs_reproject = self._needs_reprojection()
        features = []

        for feature in self.geojson.get("features") or []:
            # GeoJSON permits "properties": null; treat that as "no properties".
            props = feature.get("properties") or {}
            new_props = dict(props)

            raw_id = self._find_property(props, self.ID_PROPERTIES)
            new_props["neighbourhood_id"] = (
                self._parse_neighbourhood_id(raw_id) if raw_id else None
            )
            new_props["name"] = self._find_property(props, self.NAME_PROPERTIES)

            geometry = feature.get("geometry")
            if needs_reproject and geometry:
                geometry = reproject_geometry(geometry)

            features.append(
                {
                    "type": "Feature",
                    "properties": new_props,
                    "geometry": geometry,
                }
            )

        return {"type": "FeatureCollection", "features": features}
|
||||
6
tests/test_placeholder.py
Normal file
6
tests/test_placeholder.py
Normal file
@@ -0,0 +1,6 @@
|
||||
"""Placeholder test to ensure pytest collection succeeds."""
|
||||
|
||||
|
||||
def test_placeholder() -> None:
    """Trivially passing test so that pytest collects at least one test.

    Remove this once real tests are added.
    """
    assert True
|
||||
Reference in New Issue
Block a user