From 1eba95d4d15437b4431b342015ab8b08ad8a3851 Mon Sep 17 00:00:00 2001 From: lmiranda Date: Sat, 17 Jan 2026 12:10:46 -0500 Subject: [PATCH] docs: Complete Phase 6 notebooks and Phase 7 documentation review Phase 6 - Jupyter Notebooks (15 total): - Overview tab: livability_choropleth, top_bottom_10_bar, income_safety_scatter - Housing tab: affordability_choropleth, rent_trend_line, tenure_breakdown_bar - Safety tab: crime_rate_choropleth, crime_breakdown_bar, crime_trend_line - Demographics tab: income_choropleth, age_distribution, population_density_bar - Amenities tab: amenity_index_choropleth, amenity_radar, transit_accessibility_bar Phase 7 - Documentation: - Updated CLAUDE.md with Sprint 9 completion status - Added notebooks directory to application structure - Expanded figures directory listing Closes #71, #72, #73, #74, #75, #76, #77 Co-Authored-By: Claude Opus 4.5 --- CLAUDE.md | 20 +- notebooks/README.md | 69 +++++++ notebooks/amenities/.gitkeep | 0 .../amenities/amenity_index_choropleth.ipynb | 170 ++++++++++++++++ notebooks/amenities/amenity_radar.ipynb | 173 ++++++++++++++++ .../amenities/transit_accessibility_bar.ipynb | 161 +++++++++++++++ notebooks/demographics/.gitkeep | 0 notebooks/demographics/age_distribution.ipynb | 173 ++++++++++++++++ .../demographics/income_choropleth.ipynb | 173 ++++++++++++++++ .../demographics/population_density_bar.ipynb | 161 +++++++++++++++ .../housing/affordability_choropleth.ipynb | 174 ++++++++++++++++ notebooks/housing/rent_trend_line.ipynb | 183 +++++++++++++++++ notebooks/housing/tenure_breakdown_bar.ipynb | 188 ++++++++++++++++++ .../overview/income_safety_scatter.ipynb | 183 +++++++++++++++++ .../overview/livability_choropleth.ipynb | 184 +++++++++++++++++ notebooks/overview/top_bottom_10_bar.ipynb | 167 ++++++++++++++++ notebooks/safety/.gitkeep | 0 notebooks/safety/crime_breakdown_bar.ipynb | 178 +++++++++++++++++ notebooks/safety/crime_rate_choropleth.ipynb | 172 ++++++++++++++++ 
notebooks/safety/crime_trend_line.ipynb | 186 +++++++++++++++++ 20 files changed, 2711 insertions(+), 4 deletions(-) create mode 100644 notebooks/README.md create mode 100644 notebooks/amenities/.gitkeep create mode 100644 notebooks/amenities/amenity_index_choropleth.ipynb create mode 100644 notebooks/amenities/amenity_radar.ipynb create mode 100644 notebooks/amenities/transit_accessibility_bar.ipynb create mode 100644 notebooks/demographics/.gitkeep create mode 100644 notebooks/demographics/age_distribution.ipynb create mode 100644 notebooks/demographics/income_choropleth.ipynb create mode 100644 notebooks/demographics/population_density_bar.ipynb create mode 100644 notebooks/housing/affordability_choropleth.ipynb create mode 100644 notebooks/housing/rent_trend_line.ipynb create mode 100644 notebooks/housing/tenure_breakdown_bar.ipynb create mode 100644 notebooks/overview/income_safety_scatter.ipynb create mode 100644 notebooks/overview/livability_choropleth.ipynb create mode 100644 notebooks/overview/top_bottom_10_bar.ipynb create mode 100644 notebooks/safety/.gitkeep create mode 100644 notebooks/safety/crime_breakdown_bar.ipynb create mode 100644 notebooks/safety/crime_rate_choropleth.ipynb create mode 100644 notebooks/safety/crime_trend_line.ipynb diff --git a/CLAUDE.md b/CLAUDE.md index 1d4604a..8a51f15 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,8 +6,8 @@ Working context for Claude Code on the Analytics Portfolio project. 
## Project Status -**Current Sprint**: 9 (Neighbourhood Dashboard Transition) -**Phase**: Toronto Neighbourhood Dashboard +**Current Sprint**: 9 (Neighbourhood Dashboard Transition) - **COMPLETE** +**Phase**: Toronto Neighbourhood Dashboard - Phase 6 & 7 Done **Branch**: `development` (feature branches merge here) --- @@ -129,8 +129,12 @@ portfolio_app/ │ └── time_slider.py # Time range selector ├── figures/ # Shared chart factories │ ├── choropleth.py # Map visualizations -│ ├── summary_cards.py # KPI figures -│ └── time_series.py # Trend charts +│ ├── bar_charts.py # Ranking, stacked, horizontal bars +│ ├── scatter.py # Scatter and bubble plots +│ ├── radar.py # Radar/spider charts +│ ├── demographics.py # Age pyramids, donut charts +│ ├── time_series.py # Trend lines +│ └── summary_cards.py # KPI figures ├── content/ # Markdown content │ └── blog/ # Blog articles ├── toronto/ # Toronto data logic @@ -142,6 +146,14 @@ portfolio_app/ ├── utils/ # Utilities │ └── markdown_loader.py # Markdown processing └── errors/ + +notebooks/ # Data documentation (Phase 6) +├── README.md # Template and usage guide +├── overview/ # Overview tab notebooks (3) +├── housing/ # Housing tab notebooks (3) +├── safety/ # Safety tab notebooks (3) +├── demographics/ # Demographics tab notebooks (3) +└── amenities/ # Amenities tab notebooks (3) ``` ### URL Routing diff --git a/notebooks/README.md b/notebooks/README.md new file mode 100644 index 0000000..6354a9c --- /dev/null +++ b/notebooks/README.md @@ -0,0 +1,69 @@ +# Toronto Neighbourhood Dashboard - Notebooks + +Documentation notebooks for the Toronto Neighbourhood Dashboard visualizations. Each notebook documents how data is queried, transformed, and visualized using the figure factory pattern. 
+ +## Directory Structure + +``` +notebooks/ +├── README.md # This file +├── overview/ # Overview tab visualizations +├── housing/ # Housing tab visualizations +├── safety/ # Safety tab visualizations +├── demographics/ # Demographics tab visualizations +└── amenities/ # Amenities tab visualizations +``` + +## Notebook Template + +Each notebook follows a standard two-section structure: + +### Section 1: Data Reference + +Documents the data pipeline: +- **Source Tables**: List of dbt marts/tables used +- **SQL Query**: The exact query to fetch data +- **Transformation Steps**: Any pandas/python transformations +- **Sample Output**: First 10 rows of the result + +### Section 2: Data Visualization + +Documents the figure creation: +- **Figure Factory**: Import from `portfolio_app.figures` +- **Parameters**: Key configuration options +- **Rendered Output**: The actual visualization + +## Available Figure Factories + +| Factory | Module | Use Case | +|---------|--------|----------| +| `create_choropleth_figure` | `figures.choropleth` | Map visualizations | +| `create_ranking_bar` | `figures.bar_charts` | Top/bottom N rankings | +| `create_stacked_bar` | `figures.bar_charts` | Category breakdowns | +| `create_scatter` | `figures.scatter` | Correlation plots | +| `create_radar_figure` | `figures.radar` | Multi-metric comparisons | +| `create_age_pyramid` | `figures.demographics` | Age distributions | +| `create_time_series` | `figures.time_series` | Trend lines | + +## Usage + +1. Start Jupyter from project root: + ```bash + jupyter notebook notebooks/ + ``` + +2. Ensure database is running: + ```bash + make docker-up + ``` + +3. Each notebook is self-contained - run all cells top to bottom. 
+ +## Notebook Naming Convention + +`{metric}_{chart_type}.ipynb` + +Examples: +- `livability_choropleth.ipynb` +- `crime_trend_line.ipynb` +- `amenity_radar.ipynb` diff --git a/notebooks/amenities/.gitkeep b/notebooks/amenities/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/notebooks/amenities/amenity_index_choropleth.ipynb b/notebooks/amenities/amenity_index_choropleth.ipynb new file mode 100644 index 0000000..befda7c --- /dev/null +++ b/notebooks/amenities/amenity_index_choropleth.ipynb @@ -0,0 +1,170 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Amenity Index Choropleth Map\n", + "\n", + "Displays total amenities per 1,000 residents across Toronto's 158 neighbourhoods." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Data Reference\n", + "\n", + "### Source Tables\n", + "\n", + "| Table | Grain | Key Columns |\n", + "|-------|-------|-------------|\n", + "| `mart_neighbourhood_amenities` | neighbourhood × year | amenity_index, total_amenities_per_1000, amenity_tier, geometry |\n", + "\n", + "### SQL Query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "import os\n", + "\n", + "engine = create_engine(os.environ.get('DATABASE_URL', 'postgresql://portfolio:portfolio@localhost:5432/portfolio'))\n", + "\n", + "query = \"\"\"\n", + "SELECT\n", + " neighbourhood_id,\n", + " neighbourhood_name,\n", + " geometry,\n", + " year,\n", + " total_amenities_per_1000,\n", + " amenity_index,\n", + " amenity_tier,\n", + " parks_per_1000,\n", + " schools_per_1000,\n", + " transit_per_1000,\n", + " total_amenities,\n", + " population\n", + "FROM mart_neighbourhood_amenities\n", + "WHERE year = (SELECT MAX(year) FROM mart_neighbourhood_amenities)\n", + "ORDER BY total_amenities_per_1000 DESC\n", + "\"\"\"\n", + "\n", + "df = pd.read_sql(query, engine)\n", + 
"print(f\"Loaded {len(df)} neighbourhoods\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transformation Steps\n", + "\n", + "1. Filter to most recent year\n", + "2. Convert geometry to GeoJSON" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import geopandas as gpd\n", + "import json\n", + "\n", + "gdf = gpd.GeoDataFrame(\n", + " df,\n", + " geometry=gpd.GeoSeries.from_wkb(df['geometry']),\n", + " crs='EPSG:4326'\n", + ")\n", + "\n", + "geojson = json.loads(gdf.to_json())\n", + "data = df.drop(columns=['geometry']).to_dict('records')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sample Output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[['neighbourhood_name', 'total_amenities_per_1000', 'amenity_index', 'amenity_tier']].head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Data Visualization\n", + "\n", + "### Figure Factory\n", + "\n", + "Uses `create_choropleth_figure` from `portfolio_app.figures.choropleth`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, '../..')\n", + "\n", + "from portfolio_app.figures.choropleth import create_choropleth_figure\n", + "\n", + "fig = create_choropleth_figure(\n", + " geojson=geojson,\n", + " data=data,\n", + " location_key='neighbourhood_id',\n", + " color_column='total_amenities_per_1000',\n", + " hover_data=['neighbourhood_name', 'amenity_index', 'parks_per_1000', 'schools_per_1000'],\n", + " color_scale='Greens',\n", + " title='Toronto Amenities per 1,000 Population',\n", + " zoom=10,\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Amenity Tier Interpretation\n", + "\n", + "| Tier | Meaning |\n", + "|------|--------|\n", + "| 1 | Best served (top 20%) |\n", + "| 2-4 | Middle tiers |\n", + "| 5 | Underserved (bottom 20%) |" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/amenities/amenity_radar.ipynb b/notebooks/amenities/amenity_radar.ipynb new file mode 100644 index 0000000..35861aa --- /dev/null +++ b/notebooks/amenities/amenity_radar.ipynb @@ -0,0 +1,173 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Amenity Radar Chart\n", + "\n", + "Spider/radar chart comparing amenity categories for selected neighbourhoods." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Data Reference\n", + "\n", + "### Source Tables\n", + "\n", + "| Table | Grain | Key Columns |\n", + "|-------|-------|-------------|\n", + "| `mart_neighbourhood_amenities` | neighbourhood × year | parks_index, schools_index, transit_index |\n", + "\n", + "### SQL Query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "import os\n", + "\n", + "engine = create_engine(os.environ.get('DATABASE_URL', 'postgresql://portfolio:portfolio@localhost:5432/portfolio'))\n", + "\n", + "query = \"\"\"\n", + "SELECT\n", + " neighbourhood_name,\n", + " parks_index,\n", + " schools_index,\n", + " transit_index,\n", + " amenity_index,\n", + " amenity_tier\n", + "FROM mart_neighbourhood_amenities\n", + "WHERE year = (SELECT MAX(year) FROM mart_neighbourhood_amenities)\n", + "ORDER BY amenity_index DESC\n", + "\"\"\"\n", + "\n", + "df = pd.read_sql(query, engine)\n", + "print(f\"Loaded {len(df)} neighbourhoods\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transformation Steps\n", + "\n", + "1. Select top 5 and bottom 5 neighbourhoods by amenity index\n", + "2. 
Reshape for radar chart format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Select representative neighbourhoods\n", + "top_5 = df.head(5)\n", + "bottom_5 = df.tail(5)\n", + "\n", + "# Prepare radar data\n", + "categories = ['Parks', 'Schools', 'Transit']\n", + "index_columns = ['parks_index', 'schools_index', 'transit_index']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sample Output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Top 5 Amenity-Rich Neighbourhoods:\")\n", + "display(top_5[['neighbourhood_name', 'parks_index', 'schools_index', 'transit_index', 'amenity_index']])\n", + "print(\"\\nBottom 5 Underserved Neighbourhoods:\")\n", + "display(bottom_5[['neighbourhood_name', 'parks_index', 'schools_index', 'transit_index', 'amenity_index']])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Data Visualization\n", + "\n", + "### Figure Factory\n", + "\n", + "Uses `create_radar_figure` from `portfolio_app.figures.radar`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, '../..')\n", + "\n", + "from portfolio_app.figures.radar import create_radar_figure\n", + "\n", + "# Compare top neighbourhood vs city average (100)\n", + "top_hood = top_5.iloc[0]\n", + "\n", + "data = [\n", + " {\n", + " 'name': top_hood['neighbourhood_name'],\n", + " 'values': [top_hood['parks_index'], top_hood['schools_index'], top_hood['transit_index']],\n", + " 'categories': categories\n", + " },\n", + " {\n", + " 'name': 'City Average',\n", + " 'values': [100, 100, 100],\n", + " 'categories': categories\n", + " }\n", + "]\n", + "\n", + "fig = create_radar_figure(\n", + " data=data,\n", + " title=f\"Amenity Profile: {top_hood['neighbourhood_name']} vs City Average\",\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Index Interpretation\n", + "\n", + "| Value | Meaning |\n", + "|-------|--------|\n", + "| < 100 | Below city average |\n", + "| = 100 | City average |\n", + "| > 100 | Above city average |" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/amenities/transit_accessibility_bar.ipynb b/notebooks/amenities/transit_accessibility_bar.ipynb new file mode 100644 index 0000000..9613589 --- /dev/null +++ b/notebooks/amenities/transit_accessibility_bar.ipynb @@ -0,0 +1,161 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Transit Accessibility Bar Chart\n", + "\n", + "Shows transit stops per 1,000 residents across Toronto neighbourhoods." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Data Reference\n", + "\n", + "### Source Tables\n", + "\n", + "| Table | Grain | Key Columns |\n", + "|-------|-------|-------------|\n", + "| `mart_neighbourhood_amenities` | neighbourhood × year | transit_per_1000, transit_index, transit_count |\n", + "\n", + "### SQL Query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "import os\n", + "\n", + "engine = create_engine(os.environ.get('DATABASE_URL', 'postgresql://portfolio:portfolio@localhost:5432/portfolio'))\n", + "\n", + "query = \"\"\"\n", + "SELECT\n", + " neighbourhood_name,\n", + " transit_per_1000,\n", + " transit_index,\n", + " transit_count,\n", + " population,\n", + " amenity_tier\n", + "FROM mart_neighbourhood_amenities\n", + "WHERE year = (SELECT MAX(year) FROM mart_neighbourhood_amenities)\n", + " AND transit_per_1000 IS NOT NULL\n", + "ORDER BY transit_per_1000 DESC\n", + "\"\"\"\n", + "\n", + "df = pd.read_sql(query, engine)\n", + "print(f\"Loaded {len(df)} neighbourhoods\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transformation Steps\n", + "\n", + "1. Sort by transit accessibility\n", + "2. Select top 20 for visualization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = df.head(20).to_dict('records')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sample Output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[['neighbourhood_name', 'transit_per_1000', 'transit_index', 'transit_count']].head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Data Visualization\n", + "\n", + "### Figure Factory\n", + "\n", + "Uses `create_horizontal_bar` from `portfolio_app.figures.bar_charts`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, '../..')\n", + "\n", + "from portfolio_app.figures.bar_charts import create_horizontal_bar\n", + "\n", + "fig = create_horizontal_bar(\n", + " data=data,\n", + " name_column='neighbourhood_name',\n", + " value_column='transit_per_1000',\n", + " title='Top 20 Neighbourhoods by Transit Accessibility',\n", + " color='#00BCD4',\n", + " value_format='.2f',\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transit Statistics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"City-wide Transit Statistics:\")\n", + "print(f\" Total Transit Stops: {df['transit_count'].sum():,.0f}\")\n", + "print(f\" Average per 1,000 pop: {df['transit_per_1000'].mean():.2f}\")\n", + "print(f\" Median per 1,000 pop: {df['transit_per_1000'].median():.2f}\")\n", + "print(f\" Best Access: {df['transit_per_1000'].max():.2f} per 1,000\")\n", + "print(f\" Worst Access: {df['transit_per_1000'].min():.2f} per 1,000\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/demographics/.gitkeep b/notebooks/demographics/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/notebooks/demographics/age_distribution.ipynb b/notebooks/demographics/age_distribution.ipynb new file mode 100644 index 0000000..f05c1e5 --- /dev/null +++ b/notebooks/demographics/age_distribution.ipynb @@ -0,0 +1,173 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Age Distribution Analysis\n", + "\n", + "Compares median age and age index across Toronto neighbourhoods." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Data Reference\n", + "\n", + "### Source Tables\n", + "\n", + "| Table | Grain | Key Columns |\n", + "|-------|-------|-------------|\n", + "| `mart_neighbourhood_demographics` | neighbourhood × year | median_age, age_index, city_avg_age |\n", + "\n", + "### SQL Query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "import os\n", + "\n", + "engine = create_engine(os.environ.get('DATABASE_URL', 'postgresql://portfolio:portfolio@localhost:5432/portfolio'))\n", + "\n", + "query = \"\"\"\n", + "SELECT\n", + " neighbourhood_name,\n", + " median_age,\n", + " age_index,\n", + " city_avg_age,\n", + " population,\n", + " income_quintile,\n", + " pct_renter_occupied\n", + "FROM mart_neighbourhood_demographics\n", + "WHERE year = (SELECT MAX(year) FROM mart_neighbourhood_demographics)\n", + " AND median_age IS NOT NULL\n", + "ORDER BY median_age DESC\n", + "\"\"\"\n", + "\n", + "df = pd.read_sql(query, engine)\n", + "print(f\"Loaded {len(df)} neighbourhoods with age data\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transformation Steps\n", + "\n", + "1. Filter to most recent census year\n", + "2. Calculate deviation from city average\n", + "3. 
Classify as younger/older than average" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "city_avg = df['city_avg_age'].iloc[0]\n", + "df['age_category'] = df['median_age'].apply(\n", + " lambda x: 'Younger' if x < city_avg else 'Older'\n", + ")\n", + "df['age_deviation'] = df['median_age'] - city_avg\n", + "\n", + "data = df.to_dict('records')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sample Output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"City Average Age: {city_avg:.1f}\")\n", + "print(\"\\nYoungest Neighbourhoods:\")\n", + "display(df.tail(5)[['neighbourhood_name', 'median_age', 'age_index', 'pct_renter_occupied']])\n", + "print(\"\\nOldest Neighbourhoods:\")\n", + "display(df.head(5)[['neighbourhood_name', 'median_age', 'age_index', 'pct_renter_occupied']])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Data Visualization\n", + "\n", + "### Figure Factory\n", + "\n", + "Uses `create_ranking_bar` from `portfolio_app.figures.bar_charts`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, '../..')\n", + "\n", + "from portfolio_app.figures.bar_charts import create_ranking_bar\n", + "\n", + "fig = create_ranking_bar(\n", + " data=data,\n", + " name_column='neighbourhood_name',\n", + " value_column='median_age',\n", + " title='Youngest & Oldest Neighbourhoods (Median Age)',\n", + " top_n=10,\n", + " bottom_n=10,\n", + " color_top='#FF9800', # Orange for older\n", + " color_bottom='#2196F3', # Blue for younger\n", + " value_format='.1f',\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Age vs Income Correlation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Age by income quintile\n", + "print(\"Median Age by Income Quintile:\")\n", + "df.groupby('income_quintile')['median_age'].mean().round(1)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/demographics/income_choropleth.ipynb b/notebooks/demographics/income_choropleth.ipynb new file mode 100644 index 0000000..8af755a --- /dev/null +++ b/notebooks/demographics/income_choropleth.ipynb @@ -0,0 +1,173 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Median Income Choropleth Map\n", + "\n", + "Displays median household income across Toronto's 158 neighbourhoods." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Data Reference\n", + "\n", + "### Source Tables\n", + "\n", + "| Table | Grain | Key Columns |\n", + "|-------|-------|-------------|\n", + "| `mart_neighbourhood_demographics` | neighbourhood × year | median_household_income, income_index, income_quintile, geometry |\n", + "\n", + "### SQL Query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "import os\n", + "\n", + "engine = create_engine(os.environ.get('DATABASE_URL', 'postgresql://portfolio:portfolio@localhost:5432/portfolio'))\n", + "\n", + "query = \"\"\"\n", + "SELECT\n", + " neighbourhood_id,\n", + " neighbourhood_name,\n", + " geometry,\n", + " year,\n", + " median_household_income,\n", + " income_index,\n", + " income_quintile,\n", + " population,\n", + " unemployment_rate\n", + "FROM mart_neighbourhood_demographics\n", + "WHERE year = (SELECT MAX(year) FROM mart_neighbourhood_demographics)\n", + "ORDER BY median_household_income DESC\n", + "\"\"\"\n", + "\n", + "df = pd.read_sql(query, engine)\n", + "print(f\"Loaded {len(df)} neighbourhoods\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transformation Steps\n", + "\n", + "1. Filter to most recent census year\n", + "2. Convert geometry to GeoJSON\n", + "3. 
Scale income to thousands for readability" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import geopandas as gpd\n", + "import json\n", + "\n", + "df['income_thousands'] = df['median_household_income'] / 1000\n", + "\n", + "gdf = gpd.GeoDataFrame(\n", + " df,\n", + " geometry=gpd.GeoSeries.from_wkb(df['geometry']),\n", + " crs='EPSG:4326'\n", + ")\n", + "\n", + "geojson = json.loads(gdf.to_json())\n", + "data = df.drop(columns=['geometry']).to_dict('records')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sample Output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[['neighbourhood_name', 'median_household_income', 'income_index', 'income_quintile']].head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Data Visualization\n", + "\n", + "### Figure Factory\n", + "\n", + "Uses `create_choropleth_figure` from `portfolio_app.figures.choropleth`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, '../..')\n", + "\n", + "from portfolio_app.figures.choropleth import create_choropleth_figure\n", + "\n", + "fig = create_choropleth_figure(\n", + " geojson=geojson,\n", + " data=data,\n", + " location_key='neighbourhood_id',\n", + " color_column='median_household_income',\n", + " hover_data=['neighbourhood_name', 'income_index', 'income_quintile'],\n", + " color_scale='Viridis',\n", + " title='Toronto Median Household Income by Neighbourhood',\n", + " zoom=10,\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Income Quintile Distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.groupby('income_quintile')['median_household_income'].agg(['count', 'mean', 'min', 'max']).round(0)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/demographics/population_density_bar.ipynb b/notebooks/demographics/population_density_bar.ipynb new file mode 100644 index 0000000..24e2a2e --- /dev/null +++ b/notebooks/demographics/population_density_bar.ipynb @@ -0,0 +1,161 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Population Density Bar Chart\n", + "\n", + "Shows population density (people per sq km) across Toronto neighbourhoods." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Data Reference\n", + "\n", + "### Source Tables\n", + "\n", + "| Table | Grain | Key Columns |\n", + "|-------|-------|-------------|\n", + "| `mart_neighbourhood_demographics` | neighbourhood × year | population_density, population, land_area_sqkm |\n", + "\n", + "### SQL Query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "import os\n", + "\n", + "engine = create_engine(os.environ.get('DATABASE_URL', 'postgresql://portfolio:portfolio@localhost:5432/portfolio'))\n", + "\n", + "query = \"\"\"\n", + "SELECT\n", + " neighbourhood_name,\n", + " population_density,\n", + " population,\n", + " land_area_sqkm,\n", + " median_household_income,\n", + " pct_renter_occupied\n", + "FROM mart_neighbourhood_demographics\n", + "WHERE year = (SELECT MAX(year) FROM mart_neighbourhood_demographics)\n", + " AND population_density IS NOT NULL\n", + "ORDER BY population_density DESC\n", + "\"\"\"\n", + "\n", + "df = pd.read_sql(query, engine)\n", + "print(f\"Loaded {len(df)} neighbourhoods\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transformation Steps\n", + "\n", + "1. Sort by population density\n", + "2. Select top 20 most dense neighbourhoods" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = df.head(20).to_dict('records')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sample Output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[['neighbourhood_name', 'population_density', 'population', 'land_area_sqkm']].head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Data Visualization\n", + "\n", + "### Figure Factory\n", + "\n", + "Uses `create_horizontal_bar` from `portfolio_app.figures.bar_charts`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, '../..')\n", + "\n", + "from portfolio_app.figures.bar_charts import create_horizontal_bar\n", + "\n", + "fig = create_horizontal_bar(\n", + " data=data,\n", + " name_column='neighbourhood_name',\n", + " value_column='population_density',\n", + " title='Top 20 Most Dense Neighbourhoods',\n", + " color='#9C27B0',\n", + " value_format=',.0f',\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Density Statistics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"City-wide Statistics:\")\n", + "print(f\" Total Population: {df['population'].sum():,.0f}\")\n", + "print(f\" Total Area: {df['land_area_sqkm'].sum():,.1f} sq km\")\n", + "print(f\" Average Density: {df['population_density'].mean():,.0f} per sq km\")\n", + "print(f\" Max Density: {df['population_density'].max():,.0f} per sq km\")\n", + "print(f\" Min Density: {df['population_density'].min():,.0f} per sq km\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/housing/affordability_choropleth.ipynb b/notebooks/housing/affordability_choropleth.ipynb new file mode 100644 index 0000000..70af4cb --- /dev/null +++ b/notebooks/housing/affordability_choropleth.ipynb @@ -0,0 +1,174 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Affordability Index Choropleth Map\n", + "\n", + "Displays housing affordability across Toronto's 158 neighbourhoods. Index of 100 = city average." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Data Reference\n", + "\n", + "### Source Tables\n", + "\n", + "| Table | Grain | Key Columns |\n", + "|-------|-------|-------------|\n", + "| `mart_neighbourhood_housing` | neighbourhood × year | affordability_index, rent_to_income_pct, avg_rent_2bed, geometry |\n", + "\n", + "### SQL Query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "import os\n", + "\n", + "engine = create_engine(os.environ.get('DATABASE_URL', 'postgresql://portfolio:portfolio@localhost:5432/portfolio'))\n", + "\n", + "query = \"\"\"\n", + "SELECT\n", + " neighbourhood_id,\n", + " neighbourhood_name,\n", + " geometry,\n", + " year,\n", + " affordability_index,\n", + " rent_to_income_pct,\n", + " avg_rent_2bed,\n", + " median_household_income,\n", + " is_affordable\n", + "FROM mart_neighbourhood_housing\n", + "WHERE year = (SELECT MAX(year) FROM mart_neighbourhood_housing)\n", + "ORDER BY affordability_index ASC\n", + "\"\"\"\n", + "\n", + "df = pd.read_sql(query, engine)\n", + "print(f\"Loaded {len(df)} neighbourhoods\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transformation Steps\n", + "\n", + "1. Filter to most recent year\n", + "2. Convert geometry to GeoJSON\n", + "3. 
Lower index = more affordable (rendered green by the reversed color scale in the figure below)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import geopandas as gpd\n", + "import json\n", + "\n", + "gdf = gpd.GeoDataFrame(\n", + " df,\n", + " geometry=gpd.GeoSeries.from_wkb(df['geometry']),\n", + " crs='EPSG:4326'\n", + ")\n", + "\n", + "geojson = json.loads(gdf.to_json())\n", + "data = df.drop(columns=['geometry']).to_dict('records')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sample Output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[['neighbourhood_name', 'affordability_index', 'rent_to_income_pct', 'avg_rent_2bed', 'is_affordable']].head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Data Visualization\n", + "\n", + "### Figure Factory\n", + "\n", + "Uses `create_choropleth_figure` from `portfolio_app.figures.choropleth`.\n", + "\n", + "**Key Parameters:**\n", + "- `color_column`: 'affordability_index'\n", + "- `color_scale`: 'RdYlGn_r' (reversed: green=affordable, red=expensive)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, '../..')\n", + "\n", + "from portfolio_app.figures.choropleth import create_choropleth_figure\n", + "\n", + "fig = create_choropleth_figure(\n", + " geojson=geojson,\n", + " data=data,\n", + " location_key='neighbourhood_id',\n", + " color_column='affordability_index',\n", + " hover_data=['neighbourhood_name', 'rent_to_income_pct', 'avg_rent_2bed'],\n", + " color_scale='RdYlGn_r', # Reversed: lower index (affordable) = green\n", + " title='Toronto Housing Affordability Index',\n", + " zoom=10,\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Index Interpretation\n", + "\n", + "| 
Index | Meaning |\n", + "|-------|--------|\n", + "| < 100 | More affordable than city average |\n", + "| = 100 | City average affordability |\n", + "| > 100 | Less affordable than city average |\n", + "\n", + "Affordability calculated as: `rent_to_income_pct / city_avg_rent_to_income * 100`" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/housing/rent_trend_line.ipynb b/notebooks/housing/rent_trend_line.ipynb new file mode 100644 index 0000000..780b4e1 --- /dev/null +++ b/notebooks/housing/rent_trend_line.ipynb @@ -0,0 +1,183 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Rent Trend Line Chart\n", + "\n", + "Shows 5-year rental price trends across Toronto neighbourhoods." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Data Reference\n", + "\n", + "### Source Tables\n", + "\n", + "| Table | Grain | Key Columns |\n", + "|-------|-------|-------------|\n", + "| `mart_neighbourhood_housing` | neighbourhood × year | year, avg_rent_bachelor, avg_rent_1bed, avg_rent_2bed, avg_rent_3bed, rent_yoy_change_pct |\n", + "\n", + "### SQL Query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "import os\n", + "\n", + "engine = create_engine(os.environ.get('DATABASE_URL', 'postgresql://portfolio:portfolio@localhost:5432/portfolio'))\n", + "\n", + "# City-wide average rent by year\n", + "query = \"\"\"\n", + "SELECT\n", + " year,\n", + " AVG(avg_rent_bachelor) as avg_rent_bachelor,\n", + " AVG(avg_rent_1bed) as avg_rent_1bed,\n", + " AVG(avg_rent_2bed) as avg_rent_2bed,\n", + " AVG(avg_rent_3bed) as avg_rent_3bed,\n", + " AVG(rent_yoy_change_pct) as avg_yoy_change\n", + "FROM mart_neighbourhood_housing\n", + "WHERE year >= (SELECT MAX(year) - 5 FROM mart_neighbourhood_housing)\n", + "GROUP BY year\n", + "ORDER BY year\n", + "\"\"\"\n", + "\n", + "df = pd.read_sql(query, engine)\n", + "print(f\"Loaded {len(df)} years of rent data\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transformation Steps\n", + "\n", + "1. Aggregate rent by year in SQL (unweighted mean across neighbourhoods; the window is the latest year plus the five preceding years)\n", + "2. Convert year to datetime for proper x-axis\n", + "3. 
Reshape for multi-line chart by bedroom type" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create date column from year\n", + "df['date'] = pd.to_datetime(df['year'].astype(str) + '-01-01')\n", + "\n", + "# Melt for multi-line chart\n", + "df_melted = df.melt(\n", + " id_vars=['year', 'date'],\n", + " value_vars=['avg_rent_bachelor', 'avg_rent_1bed', 'avg_rent_2bed', 'avg_rent_3bed'],\n", + " var_name='bedroom_type',\n", + " value_name='avg_rent'\n", + ")\n", + "\n", + "# Clean labels\n", + "df_melted['bedroom_type'] = df_melted['bedroom_type'].map({\n", + " 'avg_rent_bachelor': 'Bachelor',\n", + " 'avg_rent_1bed': '1 Bedroom',\n", + " 'avg_rent_2bed': '2 Bedroom',\n", + " 'avg_rent_3bed': '3 Bedroom'\n", + "})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sample Output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[['year', 'avg_rent_bachelor', 'avg_rent_1bed', 'avg_rent_2bed', 'avg_rent_3bed', 'avg_yoy_change']]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 
Data Visualization\n", + "\n", + "### Figure Factory\n", + "\n", + "Uses `create_price_time_series` from `portfolio_app.figures.time_series`.\n", + "\n", + "**Key Parameters:**\n", + "- `date_column`: 'date'\n", + "- `price_column`: 'avg_rent'\n", + "- `group_column`: 'bedroom_type' (for multi-line)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, '../..')\n", + "\n", + "from portfolio_app.figures.time_series import create_price_time_series\n", + "\n", + "data = df_melted.to_dict('records')\n", + "\n", + "fig = create_price_time_series(\n", + " data=data,\n", + " date_column='date',\n", + " price_column='avg_rent',\n", + " group_column='bedroom_type',\n", + " title='Toronto Average Rent Trend (5 Years)',\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### YoY Change Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show year-over-year changes\n", + "print(\"Year-over-Year Rent Change (%)\")\n", + "df[['year', 'avg_yoy_change']].dropna()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/housing/tenure_breakdown_bar.ipynb b/notebooks/housing/tenure_breakdown_bar.ipynb new file mode 100644 index 0000000..547bc1a --- /dev/null +++ b/notebooks/housing/tenure_breakdown_bar.ipynb @@ -0,0 +1,188 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Housing Tenure Breakdown Bar Chart\n", + "\n", + "Shows the distribution of owner-occupied vs renter-occupied dwellings across neighbourhoods." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Data Reference\n", + "\n", + "### Source Tables\n", + "\n", + "| Table | Grain | Key Columns |\n", + "|-------|-------|-------------|\n", + "| `mart_neighbourhood_housing` | neighbourhood × year | pct_owner_occupied, pct_renter_occupied, income_quintile |\n", + "\n", + "### SQL Query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "import os\n", + "\n", + "engine = create_engine(os.environ.get('DATABASE_URL', 'postgresql://portfolio:portfolio@localhost:5432/portfolio'))\n", + "\n", + "query = \"\"\"\n", + "SELECT\n", + " neighbourhood_name,\n", + " pct_owner_occupied,\n", + " pct_renter_occupied,\n", + " income_quintile,\n", + " total_rental_units,\n", + " average_dwelling_value\n", + "FROM mart_neighbourhood_housing\n", + "WHERE year = (SELECT MAX(year) FROM mart_neighbourhood_housing)\n", + " AND pct_owner_occupied IS NOT NULL\n", + "ORDER BY pct_renter_occupied DESC\n", + "\"\"\"\n", + "\n", + "df = pd.read_sql(query, engine)\n", + "print(f\"Loaded {len(df)} neighbourhoods with tenure data\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transformation Steps\n", + "\n", + "1. Filter to the most recent year in the mart, keeping rows where `pct_owner_occupied` is not null\n", + "2. Melt owner/renter columns for stacked bar\n", + "3. 
Sort by renter percentage (highest first)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare for stacked bar\n", + "df_stacked = df.melt(\n", + " id_vars=['neighbourhood_name', 'income_quintile'],\n", + " value_vars=['pct_owner_occupied', 'pct_renter_occupied'],\n", + " var_name='tenure_type',\n", + " value_name='percentage'\n", + ")\n", + "\n", + "df_stacked['tenure_type'] = df_stacked['tenure_type'].map({\n", + " 'pct_owner_occupied': 'Owner',\n", + " 'pct_renter_occupied': 'Renter'\n", + "})\n", + "\n", + "data = df_stacked.to_dict('records')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sample Output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Highest Renter Neighbourhoods:\")\n", + "df[['neighbourhood_name', 'pct_renter_occupied', 'pct_owner_occupied', 'income_quintile']].head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 
Data Visualization\n", + "\n", + "### Figure Factory\n", + "\n", + "Uses `create_stacked_bar` from `portfolio_app.figures.bar_charts`.\n", + "\n", + "**Key Parameters:**\n", + "- `x_column`: 'neighbourhood_name'\n", + "- `value_column`: 'percentage'\n", + "- `category_column`: 'tenure_type'\n", + "- `show_percentages`: True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, '../..')\n", + "\n", + "from portfolio_app.figures.bar_charts import create_stacked_bar\n", + "\n", + "# Show top 20 by renter percentage\n", + "top_20_names = df.head(20)['neighbourhood_name'].tolist()\n", + "data_filtered = [d for d in data if d['neighbourhood_name'] in top_20_names]\n", + "\n", + "fig = create_stacked_bar(\n", + " data=data_filtered,\n", + " x_column='neighbourhood_name',\n", + " value_column='percentage',\n", + " category_column='tenure_type',\n", + " title='Housing Tenure Mix - Top 20 Renter Neighbourhoods',\n", + " color_map={'Owner': '#4CAF50', 'Renter': '#2196F3'},\n", + " show_percentages=True,\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### City-Wide Distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# City-wide averages\n", + "print(f\"City Average Owner-Occupied: {df['pct_owner_occupied'].mean():.1f}%\")\n", + "print(f\"City Average Renter-Occupied: {df['pct_renter_occupied'].mean():.1f}%\")\n", + "\n", + "# By income quintile\n", + "print(\"\\nTenure by Income Quintile:\")\n", + "df.groupby('income_quintile')[['pct_owner_occupied', 'pct_renter_occupied']].mean().round(1)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git 
a/notebooks/overview/income_safety_scatter.ipynb b/notebooks/overview/income_safety_scatter.ipynb new file mode 100644 index 0000000..d2b6c3c --- /dev/null +++ b/notebooks/overview/income_safety_scatter.ipynb @@ -0,0 +1,183 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Income vs Safety Scatter Plot\n", + "\n", + "Explores the correlation between median household income and safety score across Toronto neighbourhoods." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Data Reference\n", + "\n", + "### Source Tables\n", + "\n", + "| Table | Grain | Key Columns |\n", + "|-------|-------|-------------|\n", + "| `mart_neighbourhood_overview` | neighbourhood × year | neighbourhood_name, median_household_income, safety_score, population |\n", + "\n", + "### SQL Query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "import os\n", + "\n", + "engine = create_engine(os.environ.get('DATABASE_URL', 'postgresql://portfolio:portfolio@localhost:5432/portfolio'))\n", + "\n", + "query = \"\"\"\n", + "SELECT\n", + " neighbourhood_name,\n", + " median_household_income,\n", + " safety_score,\n", + " population,\n", + " livability_score,\n", + " crime_rate_per_100k\n", + "FROM mart_neighbourhood_overview\n", + "WHERE year = (SELECT MAX(year) FROM mart_neighbourhood_overview)\n", + " AND median_household_income IS NOT NULL\n", + " AND safety_score IS NOT NULL\n", + "ORDER BY median_household_income DESC\n", + "\"\"\"\n", + "\n", + "df = pd.read_sql(query, engine)\n", + "print(f\"Loaded {len(df)} neighbourhoods with income and safety data\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transformation Steps\n", + "\n", + "1. Filter out null values for income and safety (done in the SQL `WHERE` clause)\n", + "2. Scale income to thousands (`income_thousands`) for axis readability\n", + "3. 
Pass to scatter figure factory with optional trendline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Scale income to thousands for better axis readability\n", + "df['income_thousands'] = df['median_household_income'] / 1000\n", + "\n", + "# Prepare data for figure factory\n", + "data = df.to_dict('records')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sample Output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[['neighbourhood_name', 'median_household_income', 'safety_score', 'crime_rate_per_100k']].head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Data Visualization\n", + "\n", + "### Figure Factory\n", + "\n", + "Uses `create_scatter_figure` from `portfolio_app.figures.scatter`.\n", + "\n", + "**Key Parameters:**\n", + "- `x_column`: 'income_thousands' (median household income in $K)\n", + "- `y_column`: 'safety_score' (0-100 percentile rank)\n", + "- `name_column`: 'neighbourhood_name' (hover label)\n", + "- `size_column`: 'population' (optional, bubble size)\n", + "- `trendline`: True (adds OLS regression line)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, '../..')\n", + "\n", + "from portfolio_app.figures.scatter import create_scatter_figure\n", + "\n", + "fig = create_scatter_figure(\n", + " data=data,\n", + " x_column='income_thousands',\n", + " y_column='safety_score',\n", + " name_column='neighbourhood_name',\n", + " size_column='population',\n", + " title='Income vs Safety by Neighbourhood',\n", + " x_title='Median Household Income ($K)',\n", + " y_title='Safety Score (0-100)',\n", + " trendline=True,\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Interpretation\n", + 
"\n", + "This scatter plot reveals the relationship between income and safety:\n", + "\n", + "- **Positive correlation**: Higher income neighbourhoods tend to have higher safety scores\n", + "- **Bubble size**: Represents population (larger = more people)\n", + "- **Trendline**: Orange dashed line shows the overall trend\n", + "- **Outliers**: Neighbourhoods far from the trendline are interesting cases\n", + " - Above line: Safer than income would predict\n", + " - Below line: Less safe than income would predict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate correlation coefficient\n", + "correlation = df['median_household_income'].corr(df['safety_score'])\n", + "print(f\"Correlation coefficient (Income vs Safety): {correlation:.3f}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/overview/livability_choropleth.ipynb b/notebooks/overview/livability_choropleth.ipynb new file mode 100644 index 0000000..c02024f --- /dev/null +++ b/notebooks/overview/livability_choropleth.ipynb @@ -0,0 +1,184 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Livability Score Choropleth Map\n", + "\n", + "Displays neighbourhood livability scores on an interactive map of Toronto's 158 neighbourhoods." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Data Reference\n", + "\n", + "### Source Tables\n", + "\n", + "| Table | Grain | Key Columns |\n", + "|-------|-------|-------------|\n", + "| `mart_neighbourhood_overview` | neighbourhood × year | livability_score, safety_score, affordability_score, amenity_score, geometry |\n", + "\n", + "### SQL Query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "import os\n", + "\n", + "# Connect to database\n", + "engine = create_engine(os.environ.get('DATABASE_URL', 'postgresql://portfolio:portfolio@localhost:5432/portfolio'))\n", + "\n", + "query = \"\"\"\n", + "SELECT\n", + " neighbourhood_id,\n", + " neighbourhood_name,\n", + " geometry,\n", + " year,\n", + " livability_score,\n", + " safety_score,\n", + " affordability_score,\n", + " amenity_score,\n", + " population,\n", + " median_household_income\n", + "FROM mart_neighbourhood_overview\n", + "WHERE year = (SELECT MAX(year) FROM mart_neighbourhood_overview)\n", + "ORDER BY livability_score DESC\n", + "\"\"\"\n", + "\n", + "df = pd.read_sql(query, engine)\n", + "print(f\"Loaded {len(df)} neighbourhoods\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transformation Steps\n", + "\n", + "1. Filter to most recent year of data\n", + "2. Extract GeoJSON from PostGIS geometry column\n", + "3. 
Pass to choropleth figure factory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Transform geometry to GeoJSON\n", + "import geopandas as gpd\n", + "import json\n", + "\n", + "# Convert WKB geometry to GeoDataFrame\n", + "gdf = gpd.GeoDataFrame(\n", + " df,\n", + " geometry=gpd.GeoSeries.from_wkb(df['geometry']),\n", + " crs='EPSG:4326'\n", + ")\n", + "\n", + "# Create GeoJSON FeatureCollection\n", + "geojson = json.loads(gdf.to_json())\n", + "\n", + "# Prepare data for figure factory\n", + "data = df.drop(columns=['geometry']).to_dict('records')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sample Output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[['neighbourhood_name', 'livability_score', 'safety_score', 'affordability_score', 'amenity_score']].head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 
Data Visualization\n", + "\n", + "### Figure Factory\n", + "\n", + "Uses `create_choropleth_figure` from `portfolio_app.figures.choropleth`.\n", + "\n", + "**Key Parameters:**\n", + "- `geojson`: GeoJSON FeatureCollection with neighbourhood boundaries\n", + "- `data`: List of dicts with neighbourhood_id and scores\n", + "- `location_key`: 'neighbourhood_id'\n", + "- `color_column`: 'livability_score' (or safety_score, etc.)\n", + "- `color_scale`: 'RdYlGn' (red=low, yellow=mid, green=high)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, '../..')\n", + "\n", + "from portfolio_app.figures.choropleth import create_choropleth_figure\n", + "\n", + "fig = create_choropleth_figure(\n", + " geojson=geojson,\n", + " data=data,\n", + " location_key='neighbourhood_id',\n", + " color_column='livability_score',\n", + " hover_data=['neighbourhood_name', 'safety_score', 'affordability_score', 'amenity_score'],\n", + " color_scale='RdYlGn',\n", + " title='Toronto Neighbourhood Livability Score',\n", + " zoom=10,\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Score Components\n", + "\n", + "The livability score is a weighted composite:\n", + "\n", + "| Component | Weight | Source |\n", + "|-----------|--------|--------|\n", + "| Safety | 30% | Inverse of crime rate per 100K |\n", + "| Affordability | 40% | Inverse of rent-to-income ratio |\n", + "| Amenities | 30% | Amenities per 1,000 residents |" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/overview/top_bottom_10_bar.ipynb b/notebooks/overview/top_bottom_10_bar.ipynb new file mode 100644 index 0000000..01da858 --- /dev/null +++ 
b/notebooks/overview/top_bottom_10_bar.ipynb @@ -0,0 +1,167 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Top & Bottom 10 Neighbourhoods Bar Chart\n", + "\n", + "Horizontal bar chart showing the highest and lowest scoring neighbourhoods by livability." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Data Reference\n", + "\n", + "### Source Tables\n", + "\n", + "| Table | Grain | Key Columns |\n", + "|-------|-------|-------------|\n", + "| `mart_neighbourhood_overview` | neighbourhood × year | neighbourhood_name, livability_score |\n", + "\n", + "### SQL Query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "import os\n", + "\n", + "engine = create_engine(os.environ.get('DATABASE_URL', 'postgresql://portfolio:portfolio@localhost:5432/portfolio'))\n", + "\n", + "query = \"\"\"\n", + "SELECT\n", + " neighbourhood_name,\n", + " livability_score,\n", + " safety_score,\n", + " affordability_score,\n", + " amenity_score\n", + "FROM mart_neighbourhood_overview\n", + "WHERE year = (SELECT MAX(year) FROM mart_neighbourhood_overview)\n", + " AND livability_score IS NOT NULL\n", + "ORDER BY livability_score DESC\n", + "\"\"\"\n", + "\n", + "df = pd.read_sql(query, engine)\n", + "print(f\"Loaded {len(df)} neighbourhoods with scores\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transformation Steps\n", + "\n", + "1. Sort by livability_score descending (done by the SQL `ORDER BY`)\n", + "2. Take top 10 and bottom 10 (selection happens inside the figure factory via `top_n` / `bottom_n`)\n", + "3. 
Pass to ranking bar figure factory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The figure factory handles top/bottom selection internally\n", + "# Just prepare as list of dicts\n", + "data = df.to_dict('records')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sample Output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Top 5:\")\n", + "display(df.head(5))\n", + "print(\"\\nBottom 5:\")\n", + "display(df.tail(5))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Data Visualization\n", + "\n", + "### Figure Factory\n", + "\n", + "Uses `create_ranking_bar` from `portfolio_app.figures.bar_charts`.\n", + "\n", + "**Key Parameters:**\n", + "- `data`: List of dicts with all neighbourhoods\n", + "- `name_column`: 'neighbourhood_name'\n", + "- `value_column`: 'livability_score'\n", + "- `top_n`: 10 (green bars)\n", + "- `bottom_n`: 10 (red bars)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, '../..')\n", + "\n", + "from portfolio_app.figures.bar_charts import create_ranking_bar\n", + "\n", + "fig = create_ranking_bar(\n", + " data=data,\n", + " name_column='neighbourhood_name',\n", + " value_column='livability_score',\n", + " title='Top & Bottom 10 Neighbourhoods by Livability',\n", + " top_n=10,\n", + " bottom_n=10,\n", + " color_top='#4CAF50', # Green for top performers\n", + " color_bottom='#F44336', # Red for bottom performers\n", + " value_format='.1f',\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Interpretation\n", + "\n", + "- **Green bars**: Highest livability scores (best combination of safety, affordability, and amenities)\n", + "- **Red bars**: Lowest livability scores (areas 
that may need targeted investment)\n", + "\n", + "The ranking bar chart provides quick context for which neighbourhoods stand out at either extreme." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/safety/.gitkeep b/notebooks/safety/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/notebooks/safety/crime_breakdown_bar.ipynb b/notebooks/safety/crime_breakdown_bar.ipynb new file mode 100644 index 0000000..5376226 --- /dev/null +++ b/notebooks/safety/crime_breakdown_bar.ipynb @@ -0,0 +1,178 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Crime Type Breakdown Bar Chart\n", + "\n", + "Stacked bar chart showing crime composition by Major Crime Indicator (MCI) categories." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Data Reference\n", + "\n", + "### Source Tables\n", + "\n", + "| Table | Grain | Key Columns |\n", + "|-------|-------|-------------|\n", + "| `mart_neighbourhood_safety` | neighbourhood × year | assault_count, auto_theft_count, break_enter_count, robbery_count, etc. 
|\n", + "\n", + "### SQL Query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "import os\n", + "\n", + "engine = create_engine(os.environ.get('DATABASE_URL', 'postgresql://portfolio:portfolio@localhost:5432/portfolio'))\n", + "\n", + "query = \"\"\"\n", + "SELECT\n", + " neighbourhood_name,\n", + " assault_count,\n", + " auto_theft_count,\n", + " break_enter_count,\n", + " robbery_count,\n", + " theft_over_count,\n", + " homicide_count,\n", + " total_incidents,\n", + " crime_rate_per_100k\n", + "FROM mart_neighbourhood_safety\n", + "WHERE year = (SELECT MAX(year) FROM mart_neighbourhood_safety)\n", + "ORDER BY total_incidents DESC\n", + "LIMIT 15\n", + "\"\"\"\n", + "\n", + "df = pd.read_sql(query, engine)\n", + "print(f\"Loaded top {len(df)} neighbourhoods by crime volume\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transformation Steps\n", + "\n", + "1. Select top 15 neighbourhoods by total incidents\n", + "2. Melt crime type columns into rows\n", + "3. 
Pass to stacked bar figure factory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_melted = df.melt(\n", + " id_vars=['neighbourhood_name', 'total_incidents'],\n", + " value_vars=['assault_count', 'auto_theft_count', 'break_enter_count', \n", + " 'robbery_count', 'theft_over_count', 'homicide_count'],\n", + " var_name='crime_type',\n", + " value_name='count'\n", + ")\n", + "\n", + "# Clean labels\n", + "df_melted['crime_type'] = df_melted['crime_type'].str.replace('_count', '').str.replace('_', ' ').str.title()\n", + "\n", + "data = df_melted.to_dict('records')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sample Output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[['neighbourhood_name', 'assault_count', 'auto_theft_count', 'break_enter_count', 'total_incidents']].head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Data Visualization\n", + "\n", + "### Figure Factory\n", + "\n", + "Uses `create_stacked_bar` from `portfolio_app.figures.bar_charts`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, '../..')\n", + "\n", + "from portfolio_app.figures.bar_charts import create_stacked_bar\n", + "\n", + "fig = create_stacked_bar(\n", + " data=data,\n", + " x_column='neighbourhood_name',\n", + " value_column='count',\n", + " category_column='crime_type',\n", + " title='Crime Type Breakdown - Top 15 Neighbourhoods',\n", + " color_map={\n", + " 'Assault': '#d62728',\n", + " 'Auto Theft': '#ff7f0e',\n", + " 'Break Enter': '#9467bd',\n", + " 'Robbery': '#8c564b',\n", + " 'Theft Over': '#e377c2',\n", + " 'Homicide': '#1f77b4'\n", + " },\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### MCI Categories\n", + "\n", + "| Category | Description |\n", + "|----------|------------|\n", + "| Assault | Physical attacks |\n", + "| Auto Theft | Vehicle theft |\n", + "| Break & Enter | Burglary |\n", + "| Robbery | Theft with force/threat |\n", + "| Theft Over | Theft > $5,000 |\n", + "| Homicide | Murder/manslaughter |" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/safety/crime_rate_choropleth.ipynb b/notebooks/safety/crime_rate_choropleth.ipynb new file mode 100644 index 0000000..9f35180 --- /dev/null +++ b/notebooks/safety/crime_rate_choropleth.ipynb @@ -0,0 +1,172 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Crime Rate Choropleth Map\n", + "\n", + "Displays crime rates per 100,000 population across Toronto's 158 neighbourhoods." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Data Reference\n", + "\n", + "### Source Tables\n", + "\n", + "| Table | Grain | Key Columns |\n", + "|-------|-------|-------------|\n", + "| `mart_neighbourhood_safety` | neighbourhood × year | crime_rate_per_100k, crime_index, safety_tier, geometry |\n", + "\n", + "### SQL Query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "import os\n", + "\n", + "engine = create_engine(os.environ.get('DATABASE_URL', 'postgresql://portfolio:portfolio@localhost:5432/portfolio'))\n", + "\n", + "query = \"\"\"\n", + "SELECT\n", + " neighbourhood_id,\n", + " neighbourhood_name,\n", + " geometry,\n", + " year,\n", + " crime_rate_per_100k,\n", + " crime_index,\n", + " safety_tier,\n", + " total_incidents,\n", + " population\n", + "FROM mart_neighbourhood_safety\n", + "WHERE year = (SELECT MAX(year) FROM mart_neighbourhood_safety)\n", + "ORDER BY crime_rate_per_100k DESC\n", + "\"\"\"\n", + "\n", + "df = pd.read_sql(query, engine)\n", + "print(f\"Loaded {len(df)} neighbourhoods\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transformation Steps\n", + "\n", + "1. Filter to most recent year\n", + "2. Convert geometry to GeoJSON\n", + "3. 
Use reversed color scale (green=low crime, red=high crime)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import geopandas as gpd\n", + "import json\n", + "\n", + "gdf = gpd.GeoDataFrame(\n", + " df,\n", + " geometry=gpd.GeoSeries.from_wkb(df['geometry']),\n", + " crs='EPSG:4326'\n", + ")\n", + "\n", + "geojson = json.loads(gdf.to_json())\n", + "data = df.drop(columns=['geometry']).to_dict('records')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sample Output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[['neighbourhood_name', 'crime_rate_per_100k', 'crime_index', 'safety_tier', 'total_incidents']].head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Data Visualization\n", + "\n", + "### Figure Factory\n", + "\n", + "Uses `create_choropleth_figure` from `portfolio_app.figures.choropleth`.\n", + "\n", + "**Key Parameters:**\n", + "- `color_column`: 'crime_rate_per_100k'\n", + "- `color_scale`: 'RdYlGn_r' (red=high crime, green=low crime)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, '../..')\n", + "\n", + "from portfolio_app.figures.choropleth import create_choropleth_figure\n", + "\n", + "fig = create_choropleth_figure(\n", + " geojson=geojson,\n", + " data=data,\n", + " location_key='neighbourhood_id',\n", + " color_column='crime_rate_per_100k',\n", + " hover_data=['neighbourhood_name', 'crime_index', 'total_incidents'],\n", + " color_scale='RdYlGn_r',\n", + " title='Toronto Crime Rate per 100,000 Population',\n", + " zoom=10,\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Safety Tier Interpretation\n", + "\n", + "| Tier | Meaning |\n", + "|------|--------|\n", + "| 1 | Highest 
crime (top 20%) |\n", + "| 2-4 | Middle tiers |\n", + "| 5 | Lowest crime (bottom 20%) |" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/safety/crime_trend_line.ipynb b/notebooks/safety/crime_trend_line.ipynb new file mode 100644 index 0000000..2f84a10 --- /dev/null +++ b/notebooks/safety/crime_trend_line.ipynb @@ -0,0 +1,186 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Crime Trend Line Chart\n", + "\n", + "Shows crime rate trends across Toronto for the most recent year and the five years before it (up to six yearly data points)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Data Reference\n", + "\n", + "### Source Tables\n", + "\n", + "| Table | Grain | Key Columns |\n", + "|-------|-------|-------------|\n", + "| `mart_neighbourhood_safety` | neighbourhood × year | year, crime_rate_per_100k, assault_rate_per_100k, auto_theft_rate_per_100k, break_enter_rate_per_100k, total_incidents, crime_yoy_change_pct |\n", + "\n", + "### SQL Query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "import os\n", + "\n", + "engine = create_engine(os.environ.get('DATABASE_URL', 'postgresql://portfolio:portfolio@localhost:5432/portfolio'))\n", + "\n", + "query = \"\"\"\n", + "SELECT\n", + " year,\n", + " AVG(crime_rate_per_100k) as avg_crime_rate,\n", + " AVG(assault_rate_per_100k) as avg_assault_rate,\n", + " AVG(auto_theft_rate_per_100k) as avg_auto_theft_rate,\n", + " AVG(break_enter_rate_per_100k) as avg_break_enter_rate,\n", + " SUM(total_incidents) as total_city_incidents,\n", + " AVG(crime_yoy_change_pct) as avg_yoy_change\n", + "FROM mart_neighbourhood_safety\n", + "WHERE year >= (SELECT MAX(year) - 5 FROM mart_neighbourhood_safety)\n", + "GROUP BY year\n", + "ORDER BY year\n", + "\"\"\"\n", + "\n", + "df = 
pd.read_sql(query, engine)\n", + "print(f\"Loaded {len(df)} years of crime data\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transformation Steps\n", + "\n", + "1. Aggregate by year (city-wide)\n", + "2. Convert year to datetime\n", + "3. Melt for multi-line by crime type" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df['date'] = pd.to_datetime(df['year'].astype(str) + '-01-01')\n", + "\n", + "# Melt for multi-line\n", + "df_melted = df.melt(\n", + " id_vars=['year', 'date'],\n", + " value_vars=['avg_assault_rate', 'avg_auto_theft_rate', 'avg_break_enter_rate'],\n", + " var_name='crime_type',\n", + " value_name='rate_per_100k'\n", + ")\n", + "\n", + "df_melted['crime_type'] = df_melted['crime_type'].map({\n", + " 'avg_assault_rate': 'Assault',\n", + " 'avg_auto_theft_rate': 'Auto Theft',\n", + " 'avg_break_enter_rate': 'Break & Enter'\n", + "})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sample Output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[['year', 'avg_crime_rate', 'total_city_incidents', 'avg_yoy_change']]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Data Visualization\n", + "\n", + "### Figure Factory\n", + "\n", + "Uses `create_price_time_series` (reused for any numeric trend)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, '../..')\n", + "\n", + "from portfolio_app.figures.time_series import create_price_time_series\n", + "\n", + "data = df_melted.to_dict('records')\n", + "\n", + "fig = create_price_time_series(\n", + " data=data,\n", + " date_column='date',\n", + " price_column='rate_per_100k',\n", + " group_column='crime_type',\n", + " title='Toronto Crime Trends by Type (5 Years)',\n", + ")\n", + "\n", + "# Remove dollar sign formatting since this is rate data\n", + "fig.update_layout(yaxis_tickprefix='', yaxis_title='Rate per 100K')\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Overall Trend" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Total crime rate trend\n", + "total_data = df[['date', 'avg_crime_rate']].rename(columns={'avg_crime_rate': 'total_rate'}).to_dict('records')\n", + "\n", + "fig2 = create_price_time_series(\n", + " data=total_data,\n", + " date_column='date',\n", + " price_column='total_rate',\n", + " title='Toronto Overall Crime Rate Trend',\n", + ")\n", + "fig2.update_layout(yaxis_tickprefix='', yaxis_title='Rate per 100K')\n", + "fig2.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}