personal-portfolio/scripts/data/load_toronto_data.py
commit 6872aa510b by lmiranda, 2026-01-18 12:14:48 -05:00

fix: Use explicit path for .env file loading

load_dotenv() was searching from cwd, which may not be the project root.
Now explicitly passes PROJECT_ROOT / ".env" path.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

#!/usr/bin/env python3
"""Load Toronto neighbourhood data into the database.
Usage:
python scripts/data/load_toronto_data.py [OPTIONS]
Options:
--skip-fetch Skip API fetching, only run dbt
--skip-dbt Skip dbt run, only load data
--dry-run Show what would be done without executing
-v, --verbose Enable verbose logging
This script orchestrates:
1. Fetching data from Toronto Open Data and CMHC APIs
2. Loading data into PostgreSQL fact tables
3. Running dbt to transform staging -> intermediate -> marts
Exit codes:
0 = Success
1 = Error
"""
import argparse
import logging
import subprocess
import sys
from datetime import date
from pathlib import Path
from typing import Any
from dotenv import load_dotenv

# Add project root to path
PROJECT_ROOT = Path(__file__).parent.parent.parent

# Load .env file so dbt can access POSTGRES_* environment variables
load_dotenv(PROJECT_ROOT / ".env")

sys.path.insert(0, str(PROJECT_ROOT))
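
# The portfolio_app imports must come after the sys.path tweak above, which
# is why they carry `noqa: E402` (module-level import not at top of file).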
from portfolio_app.toronto.loaders import (  # noqa: E402
    get_session,
    load_amenities,
    load_census_data,
    load_crime_data,
    load_neighbourhoods,
    load_statcan_cmhc_data,
    load_time_dimension,
)
from portfolio_app.toronto.parsers import (  # noqa: E402
    TorontoOpenDataParser,
    TorontoPoliceParser,
)
from portfolio_app.toronto.parsers.statcan_cmhc import (  # noqa: E402
    fetch_toronto_rental_data,
)
from portfolio_app.toronto.schemas import Neighbourhood  # noqa: E402

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)
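# Note: the -v/--verbose flag later raises the root logger to DEBUG in
# DataPipeline.__init__.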


class DataPipeline:
    """Orchestrates data loading from APIs to database to dbt."""

    def __init__(self, dry_run: bool = False, verbose: bool = False):
        self.dry_run = dry_run
        self.verbose = verbose
        self.stats: dict[str, int] = {}
        if verbose:
            logging.getLogger().setLevel(logging.DEBUG)

    def fetch_and_load(self) -> bool:
        """Fetch data from APIs and load into database.

        Returns:
            True if successful, False otherwise.
        """
        logger.info("Starting data fetch and load pipeline...")
        try:
            with get_session() as session:
                # 1. Load time dimension first (for date keys)
                self._load_time_dimension(session)
                # 2. Load neighbourhoods (required for foreign keys)
                self._load_neighbourhoods(session)
                # 3. Load census data
                self._load_census(session)
                # 4. Load crime data
                self._load_crime(session)
                # 5. Load amenities
                self._load_amenities(session)
                # 6. Load CMHC rental data from StatCan
                self._load_rentals(session)
                session.commit()
                logger.info("All data committed to database")
            self._print_stats()
            return True
        except Exception as e:
            logger.error(f"Pipeline failed: {e}")
            if self.verbose:
                import traceback

                traceback.print_exc()
            return False
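
    # Each _load_* helper below follows the same pattern: log intent,
    # short-circuit on --dry-run, call the loader, and record row counts
    # in self.stats.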

    def _load_time_dimension(self, session: Any) -> None:
        """Load time dimension with date range for dashboard."""
        logger.info("Loading time dimension...")
        if self.dry_run:
            logger.info(
                " [DRY RUN] Would load time dimension 2019-01-01 to 2025-12-01"
            )
            return
        count = load_time_dimension(
            start_date=date(2019, 1, 1),
            end_date=date(2025, 12, 1),
            session=session,
        )
        self.stats["time_dimension"] = count
        logger.info(f" Loaded {count} time dimension records")

    def _load_neighbourhoods(self, session: Any) -> None:
        """Fetch and load neighbourhood boundaries."""
        logger.info("Fetching neighbourhoods from Toronto Open Data...")
        if self.dry_run:
            logger.info(" [DRY RUN] Would fetch and load neighbourhoods")
            return
        import json

        parser = TorontoOpenDataParser()
        raw_neighbourhoods = parser.get_neighbourhoods()

        # Convert NeighbourhoodRecord to Neighbourhood schema
        neighbourhoods = []
        for n in raw_neighbourhoods:
            # Despite the field name, geometry is stored as a GeoJSON string
            # (not WKT) so PostGIS can ingest it via ST_GeomFromGeoJSON
            geometry_wkt = None
            if n.geometry:
                geometry_wkt = json.dumps(n.geometry)
            neighbourhood = Neighbourhood(
                neighbourhood_id=n.area_id,
                name=n.area_name,
                geometry_wkt=geometry_wkt,
                population=None,  # Will be filled from census data
                land_area_sqkm=None,
                pop_density_per_sqkm=None,
                census_year=2021,
            )
            neighbourhoods.append(neighbourhood)

        count = load_neighbourhoods(neighbourhoods, session)
        self.stats["neighbourhoods"] = count
        logger.info(f" Loaded {count} neighbourhoods")

    def _load_census(self, session: Any) -> None:
        """Fetch and load census profile data."""
        logger.info("Fetching census profiles from Toronto Open Data...")
        if self.dry_run:
            logger.info(" [DRY RUN] Would fetch and load census data")
            return
        parser = TorontoOpenDataParser()
        census_records = parser.get_census_profiles(year=2021)
        if not census_records:
            logger.warning(" No census records fetched")
            return
        count = load_census_data(census_records, session)
        self.stats["census"] = count
        logger.info(f" Loaded {count} census records")

    def _load_crime(self, session: Any) -> None:
        """Fetch and load crime statistics."""
        logger.info("Fetching crime data from Toronto Police Service...")
        if self.dry_run:
            logger.info(" [DRY RUN] Would fetch and load crime data")
            return
        parser = TorontoPoliceParser()
        crime_records = parser.get_crime_rates()
        if not crime_records:
            logger.warning(" No crime records fetched")
            return
        count = load_crime_data(crime_records, session)
        self.stats["crime"] = count
        logger.info(f" Loaded {count} crime records")

    def _load_amenities(self, session: Any) -> None:
        """Fetch and load amenity data (parks, schools, childcare)."""
        logger.info("Fetching amenities from Toronto Open Data...")
        if self.dry_run:
            logger.info(" [DRY RUN] Would fetch and load amenity data")
            return
        parser = TorontoOpenDataParser()
        total_count = 0
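
        # Each amenity dataset is wrapped in its own try/except so a failure
        # in one feed (e.g. parks) does not abort the others.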

        # Fetch parks
        try:
            parks = parser.get_parks()
            if parks:
                count = load_amenities(parks, year=2024, session=session)
                total_count += count
                logger.info(f" Loaded {count} park amenities")
        except Exception as e:
            logger.warning(f" Failed to load parks: {e}")

        # Fetch schools
        try:
            schools = parser.get_schools()
            if schools:
                count = load_amenities(schools, year=2024, session=session)
                total_count += count
                logger.info(f" Loaded {count} school amenities")
        except Exception as e:
            logger.warning(f" Failed to load schools: {e}")

        # Fetch childcare centres
        try:
            childcare = parser.get_childcare_centres()
            if childcare:
                count = load_amenities(childcare, year=2024, session=session)
                total_count += count
                logger.info(f" Loaded {count} childcare amenities")
        except Exception as e:
            logger.warning(f" Failed to load childcare: {e}")

        self.stats["amenities"] = total_count

    def _load_rentals(self, session: Any) -> None:
        """Fetch and load CMHC rental data from StatCan."""
        logger.info("Fetching CMHC rental data from Statistics Canada...")
        if self.dry_run:
            logger.info(" [DRY RUN] Would fetch and load CMHC rental data")
            return
        try:
            # Fetch rental data (2014-present)
            rental_records = fetch_toronto_rental_data(start_year=2014)
            if not rental_records:
                logger.warning(" No rental records fetched")
                return
            count = load_statcan_cmhc_data(rental_records, session)
            self.stats["rentals"] = count
            logger.info(f" Loaded {count} CMHC rental records")
        except Exception as e:
            logger.warning(f" Failed to load CMHC rental data: {e}")
            if self.verbose:
                import traceback

                traceback.print_exc()

    def run_dbt(self) -> bool:
        """Run dbt to transform data.

        Returns:
            True if successful, False otherwise.
        """
        logger.info("Running dbt transformations...")
        dbt_project_dir = PROJECT_ROOT / "dbt"
        if not dbt_project_dir.exists():
            logger.error(f"dbt project directory not found: {dbt_project_dir}")
            return False
        if self.dry_run:
            logger.info(" [DRY RUN] Would run: dbt run")
            logger.info(" [DRY RUN] Would run: dbt test")
            return True
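
        # Both dbt invocations capture stdout/stderr rather than streaming
        # them live; output is only surfaced in the logs when a command fails.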
        try:
            # Run dbt models
            logger.info(" Running dbt run...")
            result = subprocess.run(
                ["dbt", "run", "--profiles-dir", str(dbt_project_dir)],
                cwd=dbt_project_dir,
                capture_output=True,
                text=True,
            )
            if result.returncode != 0:
                logger.error(f"dbt run failed:\n{result.stderr}")
                if self.verbose:
                    logger.debug(f"dbt output:\n{result.stdout}")
                return False
            logger.info(" dbt run completed successfully")

            # Run dbt tests
            logger.info(" Running dbt test...")
            result = subprocess.run(
                ["dbt", "test", "--profiles-dir", str(dbt_project_dir)],
                cwd=dbt_project_dir,
                capture_output=True,
                text=True,
            )
            if result.returncode != 0:
                # Don't fail the pipeline on test failures, just warn
                logger.warning(f"dbt test had failures:\n{result.stderr}")
            else:
                logger.info(" dbt test completed successfully")
            return True
        except FileNotFoundError:
            logger.error(
                "dbt not found in PATH. Install with: pip install dbt-postgres"
            )
            return False
        except Exception as e:
            logger.error(f"dbt execution failed: {e}")
            return False

    def _print_stats(self) -> None:
        """Print loading statistics."""
        if not self.stats:
            return
        logger.info("Loading statistics:")
        for key, count in self.stats.items():
            logger.info(f" {key}: {count} records")


def main() -> int:
    """Main entry point for the data loading script."""
    parser = argparse.ArgumentParser(
        description="Load Toronto neighbourhood data into the database",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--skip-fetch",
        action="store_true",
        help="Skip API fetching, only run dbt",
    )
    parser.add_argument(
        "--skip-dbt",
        action="store_true",
        help="Skip dbt run, only load data",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be done without executing",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable verbose logging",
    )
    args = parser.parse_args()

    if args.skip_fetch and args.skip_dbt:
        logger.error("Cannot skip both fetch and dbt - nothing to do")
        return 1

    pipeline = DataPipeline(dry_run=args.dry_run, verbose=args.verbose)

    # Execute pipeline stages
    if not args.skip_fetch and not pipeline.fetch_and_load():
        return 1
    if not args.skip_dbt and not pipeline.run_dbt():
        return 1

    logger.info("Pipeline completed successfully!")
    return 0


if __name__ == "__main__":
    sys.exit(main())
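
# Example cron entry (hypothetical path) to refresh the data nightly:
#   0 3 * * * cd /path/to/personal-portfolio && python scripts/data/load_toronto_data.py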