development #95

Merged
lmiranda merged 89 commits from development into staging 2026-02-01 21:32:42 +00:00
12 changed files with 1067 additions and 1 deletion
Showing only changes of commit bf6e392002

.gitea/workflows/ci.yml Normal file

@@ -0,0 +1,35 @@
name: CI

on:
  push:
    branches:
      - development
      - staging
      - main
  pull_request:
    branches:
      - development

jobs:
  lint-and-test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install ruff pytest
      - name: Run linter
        run: ruff check .
      - name: Run tests
        run: pytest tests/ -v --tb=short

.gitea/workflows/deploy-production.yml Normal file

@@ -0,0 +1,44 @@
name: Deploy to Production

on:
  push:
    branches:
      - main

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Deploy to Production Server
        uses: appleboy/ssh-action@v1.0.3
        with:
          host: ${{ secrets.PROD_HOST }}
          username: ${{ secrets.PROD_USER }}
          key: ${{ secrets.PROD_SSH_KEY }}
          script: |
            set -euo pipefail
            cd ~/apps/personal-portfolio
            echo "Pulling latest changes..."
            git fetch origin main
            git reset --hard origin/main
            echo "Activating virtual environment..."
            source .venv/bin/activate
            echo "Installing dependencies..."
            pip install -r requirements.txt --quiet
            echo "Running dbt models..."
            cd dbt && dbt run --profiles-dir . && cd ..
            echo "Restarting application..."
            docker compose down
            docker compose up -d
            echo "Waiting for health check..."
            sleep 10
            curl -f http://localhost:8050/health || exit 1
            echo "Production deployment complete!"

.gitea/workflows/deploy-staging.yml Normal file

@@ -0,0 +1,44 @@
name: Deploy to Staging

on:
  push:
    branches:
      - staging

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Deploy to Staging Server
        uses: appleboy/ssh-action@v1.0.3
        with:
          host: ${{ secrets.STAGING_HOST }}
          username: ${{ secrets.STAGING_USER }}
          key: ${{ secrets.STAGING_SSH_KEY }}
          script: |
            set -euo pipefail
            cd ~/apps/personal-portfolio
            echo "Pulling latest changes..."
            git fetch origin staging
            git reset --hard origin/staging
            echo "Activating virtual environment..."
            source .venv/bin/activate
            echo "Installing dependencies..."
            pip install -r requirements.txt --quiet
            echo "Running dbt models..."
            cd dbt && dbt run --profiles-dir . && cd ..
            echo "Restarting application..."
            docker compose down
            docker compose up -d
            echo "Waiting for health check..."
            sleep 10
            curl -f http://localhost:8050/health || exit 1
            echo "Staging deployment complete!"

LICENSE Normal file

@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024-2025 Leo Miranda

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Makefile

@@ -1,4 +1,4 @@
.PHONY: setup docker-up docker-down db-init load-data run test dbt-run dbt-test lint format ci deploy clean help
.PHONY: setup docker-up docker-down db-init load-data run test dbt-run dbt-test lint format ci deploy clean help logs run-detached etl-toronto

# Default target
.DEFAULT_GOAL := help

@@ -151,6 +151,19 @@ ci: ## Run all checks (lint, typecheck, test)
	$(MAKE) test
	@echo "$(GREEN)All checks passed!$(NC)"

# =============================================================================
# Operations
# =============================================================================
logs: ## Follow docker compose logs (usage: make logs or make logs SERVICE=postgres)
	@./scripts/logs.sh $(SERVICE)

run-detached: ## Start containers and wait for health check
	@./scripts/run-detached.sh

etl-toronto: ## Run Toronto ETL pipeline (usage: make etl-toronto MODE=--full)
	@./scripts/etl/toronto.sh $(MODE)

# =============================================================================
# Deployment
# =============================================================================

README.md

@@ -1,5 +1,9 @@
# Analytics Portfolio
[![CI](https://gitea.hotserv.cloud/lmiranda/personal-portfolio/actions/workflows/ci.yml/badge.svg)](https://gitea.hotserv.cloud/lmiranda/personal-portfolio/actions)
**Live Demo:** [leodata.science](https://leodata.science)
A personal portfolio website showcasing data engineering and visualization capabilities, featuring an interactive Toronto Neighbourhood Dashboard.
## Live Pages
@@ -32,6 +36,42 @@ An interactive choropleth dashboard analyzing Toronto's 158 official neighbourhoods
- Toronto Police Service (crime statistics)
- CMHC Rental Market Survey (rental data by zone)
## Architecture
```mermaid
flowchart LR
    subgraph Sources
        A1[City of Toronto API]
        A2[Toronto Police API]
        A3[CMHC Data]
    end
    subgraph ETL
        B1[Parsers]
        B2[Loaders]
    end
    subgraph Database
        C1[(PostgreSQL/PostGIS)]
        C2[dbt Models]
    end
    subgraph Application
        D1[Dash App]
        D2[Plotly Figures]
    end
    A1 & A2 & A3 --> B1 --> B2 --> C1 --> C2 --> D1 --> D2
```
**Pipeline Stages:**
- **Sources**: External APIs and data files (City of Toronto, Toronto Police, CMHC)
- **ETL**: Python parsers extract and validate data; loaders persist to database
- **Database**: PostgreSQL with PostGIS for geospatial; dbt transforms raw → staging → marts
- **Application**: Dash serves interactive dashboards with Plotly visualizations
For detailed database schema, see [docs/DATABASE_SCHEMA.md](docs/DATABASE_SCHEMA.md).
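As a rough sketch of the parser → loader hand-off described above (all names here are hypothetical, not the project's actual modules):

```python
# Hypothetical parser/loader pair illustrating the ETL stage above.
from dataclasses import dataclass

@dataclass
class NeighbourhoodRecord:
    neighbourhood_id: int
    name: str

def parse(raw: list[dict]) -> list[NeighbourhoodRecord]:
    # Extract and validate one record per neighbourhood
    return [NeighbourhoodRecord(int(r["id"]), r["name"]) for r in raw]

def load(records: list[NeighbourhoodRecord]) -> None:
    # Persist to PostgreSQL (stubbed as a print here)
    for rec in records:
        print(f"upsert dim_neighbourhood: {rec.neighbourhood_id}, {rec.name}")

load(parse([{"id": "1", "name": "West Humber-Clairville"}]))
```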
## Quick Start
```bash

docs/DATABASE_SCHEMA.md Normal file

@@ -0,0 +1,307 @@
# Database Schema
This document describes the PostgreSQL/PostGIS database schema for the Toronto Neighbourhood Dashboard.
## Entity Relationship Diagram
```mermaid
erDiagram
    dim_time {
        int date_key PK
        date full_date UK
        int year
        int month
        int quarter
        string month_name
        bool is_month_start
    }
    dim_cmhc_zone {
        int zone_key PK
        string zone_code UK
        string zone_name
        geometry geometry
    }
    dim_neighbourhood {
        int neighbourhood_id PK
        string name
        geometry geometry
        int population
        numeric land_area_sqkm
        numeric pop_density_per_sqkm
        numeric pct_bachelors_or_higher
        numeric median_household_income
        numeric pct_owner_occupied
        numeric pct_renter_occupied
        int census_year
    }
    dim_policy_event {
        int event_id PK
        date event_date
        date effective_date
        string level
        string category
        string title
        text description
        string expected_direction
        string source_url
        string confidence
    }
    fact_rentals {
        int id PK
        int date_key FK
        int zone_key FK
        string bedroom_type
        int universe
        numeric avg_rent
        numeric median_rent
        numeric vacancy_rate
        numeric availability_rate
        numeric turnover_rate
        numeric rent_change_pct
        string reliability_code
    }
    fact_census {
        int id PK
        int neighbourhood_id FK
        int census_year
        int population
        numeric population_density
        numeric median_household_income
        numeric average_household_income
        numeric unemployment_rate
        numeric pct_bachelors_or_higher
        numeric pct_owner_occupied
        numeric pct_renter_occupied
        numeric median_age
        numeric average_dwelling_value
    }
    fact_crime {
        int id PK
        int neighbourhood_id FK
        int year
        string crime_type
        int count
        numeric rate_per_100k
    }
    fact_amenities {
        int id PK
        int neighbourhood_id FK
        string amenity_type
        int count
        int year
    }
    bridge_cmhc_neighbourhood {
        int id PK
        string cmhc_zone_code FK
        int neighbourhood_id FK
        numeric weight
    }

    dim_time ||--o{ fact_rentals : "date_key"
    dim_cmhc_zone ||--o{ fact_rentals : "zone_key"
    dim_neighbourhood ||--o{ fact_census : "neighbourhood_id"
    dim_neighbourhood ||--o{ fact_crime : "neighbourhood_id"
    dim_neighbourhood ||--o{ fact_amenities : "neighbourhood_id"
    dim_cmhc_zone ||--o{ bridge_cmhc_neighbourhood : "zone_code"
    dim_neighbourhood ||--o{ bridge_cmhc_neighbourhood : "neighbourhood_id"
```
## Schema Layers
### Raw Schema
Raw data is loaded directly from external sources without transformation:
| Table | Source | Description |
|-------|--------|-------------|
| `raw.neighbourhoods` | City of Toronto API | GeoJSON neighbourhood boundaries |
| `raw.census_profiles` | City of Toronto API | Census profile data |
| `raw.crime_data` | Toronto Police API | Crime statistics by neighbourhood |
| `raw.cmhc_rentals` | CMHC Data Files | Rental market survey data |
### Staging Schema (dbt)
Staging models provide 1:1 cleaned representations of source data:
| Model | Source Table | Purpose |
|-------|-------------|---------|
| `stg_toronto__neighbourhoods` | raw.neighbourhoods | Cleaned boundaries with standardized names |
| `stg_toronto__census` | raw.census_profiles | Typed census metrics |
| `stg_cmhc__rentals` | raw.cmhc_rentals | Validated rental data |
| `stg_police__crimes` | raw.crime_data | Standardized crime categories |
### Marts Schema (dbt)
Analytical tables ready for dashboard consumption:
| Model | Grain | Purpose |
|-------|-------|---------|
| `mart_neighbourhood_summary` | neighbourhood | Composite livability scores |
| `mart_rental_trends` | zone × month | Time-series rental analysis |
| `mart_crime_rates` | neighbourhood × year | Crime rate calculations |
| `mart_amenity_density` | neighbourhood | Amenity accessibility scores |
## Table Details
### Dimension Tables
#### dim_time
Time dimension for date-based analysis. Grain: one row per month.
| Column | Type | Constraints | Description |
|--------|------|-------------|-------------|
| date_key | INTEGER | PK | Surrogate key (YYYYMM format) |
| full_date | DATE | UNIQUE, NOT NULL | First day of month |
| year | INTEGER | NOT NULL | Calendar year |
| month | INTEGER | NOT NULL | Month number (1-12) |
| quarter | INTEGER | NOT NULL | Quarter (1-4) |
| month_name | VARCHAR(20) | NOT NULL | Month name |
| is_month_start | BOOLEAN | DEFAULT TRUE | Always true (monthly grain) |
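As a sketch, the monthly grain and the YYYYMM surrogate key can be generated like this (illustrative only, not the project's actual loader):

```python
# Illustrative generation of dim_time rows at monthly grain.
from datetime import date

def dim_time_rows(start_year: int, end_year: int) -> list[dict]:
    rows = []
    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            rows.append({
                "date_key": year * 100 + month,  # e.g. 2024-01 -> 202401
                "full_date": date(year, month, 1),
                "year": year,
                "month": month,
                "quarter": (month - 1) // 3 + 1,
                "month_name": date(year, month, 1).strftime("%B"),
                "is_month_start": True,
            })
    return rows

print(dim_time_rows(2024, 2024)[0])  # {'date_key': 202401, ...}
```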
#### dim_cmhc_zone
CMHC rental market zones (~20 zones covering Toronto).
| Column | Type | Constraints | Description |
|--------|------|-------------|-------------|
| zone_key | INTEGER | PK, AUTO | Surrogate key |
| zone_code | VARCHAR(10) | UNIQUE, NOT NULL | CMHC zone identifier |
| zone_name | VARCHAR(100) | NOT NULL | Zone display name |
| geometry | GEOMETRY(POLYGON) | SRID 4326 | PostGIS zone boundary |
#### dim_neighbourhood
Toronto's 158 official neighbourhoods.
| Column | Type | Constraints | Description |
|--------|------|-------------|-------------|
| neighbourhood_id | INTEGER | PK | City-assigned ID |
| name | VARCHAR(100) | NOT NULL | Neighbourhood name |
| geometry | GEOMETRY(POLYGON) | SRID 4326 | PostGIS boundary |
| population | INTEGER | | Total population |
| land_area_sqkm | NUMERIC(10,4) | | Area in km² |
| pop_density_per_sqkm | NUMERIC(10,2) | | Population density |
| pct_bachelors_or_higher | NUMERIC(5,2) | | Education rate |
| median_household_income | NUMERIC(12,2) | | Median income |
| pct_owner_occupied | NUMERIC(5,2) | | Owner occupancy rate |
| pct_renter_occupied | NUMERIC(5,2) | | Renter occupancy rate |
| census_year | INTEGER | DEFAULT 2021 | Census reference year |
#### dim_policy_event
Policy events for time-series annotation (rent control, interest rates, etc.).
| Column | Type | Constraints | Description |
|--------|------|-------------|-------------|
| event_id | INTEGER | PK, AUTO | Surrogate key |
| event_date | DATE | NOT NULL | Announcement date |
| effective_date | DATE | | Implementation date |
| level | VARCHAR(20) | NOT NULL | federal/provincial/municipal |
| category | VARCHAR(20) | NOT NULL | monetary/tax/regulatory/supply/economic |
| title | VARCHAR(200) | NOT NULL | Event title |
| description | TEXT | | Detailed description |
| expected_direction | VARCHAR(10) | NOT NULL | bearish/bullish/neutral |
| source_url | VARCHAR(500) | | Reference link |
| confidence | VARCHAR(10) | DEFAULT 'medium' | high/medium/low |
### Fact Tables
#### fact_rentals
CMHC rental market survey data. Grain: zone × bedroom type × survey date.
| Column | Type | Constraints | Description |
|--------|------|-------------|-------------|
| id | INTEGER | PK, AUTO | Surrogate key |
| date_key | INTEGER | FK → dim_time | Survey date reference |
| zone_key | INTEGER | FK → dim_cmhc_zone | CMHC zone reference |
| bedroom_type | VARCHAR(20) | NOT NULL | bachelor/1-bed/2-bed/3+bed/total |
| universe | INTEGER | | Total rental units |
| avg_rent | NUMERIC(10,2) | | Average rent |
| median_rent | NUMERIC(10,2) | | Median rent |
| vacancy_rate | NUMERIC(5,2) | | Vacancy percentage |
| availability_rate | NUMERIC(5,2) | | Availability percentage |
| turnover_rate | NUMERIC(5,2) | | Turnover percentage |
| rent_change_pct | NUMERIC(5,2) | | Year-over-year change |
| reliability_code | VARCHAR(2) | | CMHC data quality code |
#### fact_census
Census statistics. Grain: neighbourhood × census year.
| Column | Type | Constraints | Description |
|--------|------|-------------|-------------|
| id | INTEGER | PK, AUTO | Surrogate key |
| neighbourhood_id | INTEGER | FK → dim_neighbourhood | Neighbourhood reference |
| census_year | INTEGER | NOT NULL | 2016, 2021, etc. |
| population | INTEGER | | Total population |
| population_density | NUMERIC(10,2) | | People per km² |
| median_household_income | NUMERIC(12,2) | | Median income |
| average_household_income | NUMERIC(12,2) | | Average income |
| unemployment_rate | NUMERIC(5,2) | | Unemployment % |
| pct_bachelors_or_higher | NUMERIC(5,2) | | Education rate |
| pct_owner_occupied | NUMERIC(5,2) | | Owner rate |
| pct_renter_occupied | NUMERIC(5,2) | | Renter rate |
| median_age | NUMERIC(5,2) | | Median resident age |
| average_dwelling_value | NUMERIC(12,2) | | Average home value |
#### fact_crime
Crime statistics. Grain: neighbourhood × year × crime type.
| Column | Type | Constraints | Description |
|--------|------|-------------|-------------|
| id | INTEGER | PK, AUTO | Surrogate key |
| neighbourhood_id | INTEGER | FK → dim_neighbourhood | Neighbourhood reference |
| year | INTEGER | NOT NULL | Calendar year |
| crime_type | VARCHAR(50) | NOT NULL | Crime category |
| count | INTEGER | NOT NULL | Number of incidents |
| rate_per_100k | NUMERIC(10,2) | | Rate per 100k population |
#### fact_amenities
Amenity counts. Grain: neighbourhood × amenity type × year.
| Column | Type | Constraints | Description |
|--------|------|-------------|-------------|
| id | INTEGER | PK, AUTO | Surrogate key |
| neighbourhood_id | INTEGER | FK → dim_neighbourhood | Neighbourhood reference |
| amenity_type | VARCHAR(50) | NOT NULL | parks/schools/transit/etc. |
| count | INTEGER | NOT NULL | Number of amenities |
| year | INTEGER | NOT NULL | Reference year |
### Bridge Tables
#### bridge_cmhc_neighbourhood
Maps CMHC zones to neighbourhoods with area-based weights for data disaggregation.
| Column | Type | Constraints | Description |
|--------|------|-------------|-------------|
| id | INTEGER | PK, AUTO | Surrogate key |
| cmhc_zone_code | VARCHAR(10) | FK → dim_cmhc_zone | Zone reference |
| neighbourhood_id | INTEGER | FK → dim_neighbourhood | Neighbourhood reference |
| weight | NUMERIC(5,4) | NOT NULL | Proportional weight (0-1) |
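A minimal sketch of how these weights can disaggregate zone-level rent to neighbourhoods (hypothetical figures; the real transformation lives in the dbt models):

```python
# Hypothetical weight-based disaggregation of zone-level average rent.
zone_avg_rent = {"Z01": 1850.0, "Z02": 2100.0}

# (cmhc_zone_code, neighbourhood_id, weight) rows from bridge_cmhc_neighbourhood
bridge = [("Z01", 1, 0.75), ("Z02", 1, 0.25), ("Z02", 2, 1.0)]

weighted_sum: dict[int, float] = {}
weight_total: dict[int, float] = {}
for zone_code, nbhd_id, weight in bridge:
    weighted_sum[nbhd_id] = weighted_sum.get(nbhd_id, 0.0) + weight * zone_avg_rent[zone_code]
    weight_total[nbhd_id] = weight_total.get(nbhd_id, 0.0) + weight

for nbhd_id in weighted_sum:
    # Weighted-average rent estimate per neighbourhood
    print(nbhd_id, round(weighted_sum[nbhd_id] / weight_total[nbhd_id], 2))
```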
## Indexes
| Table | Index | Columns | Purpose |
|-------|-------|---------|---------|
| fact_rentals | ix_fact_rentals_date_zone | date_key, zone_key | Time-series queries |
| fact_census | ix_fact_census_neighbourhood_year | neighbourhood_id, census_year | Census lookups |
| fact_crime | ix_fact_crime_neighbourhood_year | neighbourhood_id, year | Crime trends |
| fact_crime | ix_fact_crime_type | crime_type | Crime filtering |
| fact_amenities | ix_fact_amenities_neighbourhood_year | neighbourhood_id, year | Amenity queries |
| fact_amenities | ix_fact_amenities_type | amenity_type | Amenity filtering |
| bridge_cmhc_neighbourhood | ix_bridge_cmhc_zone | cmhc_zone_code | Zone lookups |
| bridge_cmhc_neighbourhood | ix_bridge_neighbourhood | neighbourhood_id | Neighbourhood lookups |
## PostGIS Extensions
The database requires PostGIS for geospatial operations:
```sql
CREATE EXTENSION IF NOT EXISTS postgis;
```
All geometry columns use SRID 4326 (WGS84) for compatibility with web mapping libraries.
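For example, source geometries can be reprojected to WGS84 with GeoPandas before loading (a sketch; the input filename is an assumption):

```python
# Sketch: reproject geometries to EPSG:4326 (WGS84) before loading into PostGIS.
import geopandas as gpd

gdf = gpd.read_file("neighbourhoods.geojson")  # hypothetical input file
if gdf.crs is not None and gdf.crs.to_epsg() != 4326:
    gdf = gdf.to_crs(epsg=4326)  # reproject from the source CRS
print(gdf.crs)
```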


@@ -0,0 +1,200 @@
# Runbook: Adding a New Dashboard
This runbook describes how to add a new data dashboard to the portfolio application.
## Prerequisites
- [ ] Data sources identified and accessible
- [ ] Database schema designed
- [ ] Basic Dash/Plotly familiarity
## Directory Structure
Create the following structure under `portfolio_app/`:
```
portfolio_app/
├── pages/
│   └── {dashboard_name}/
│       ├── dashboard.py          # Main layout with tabs
│       ├── methodology.py        # Data sources and methods page
│       ├── tabs/
│       │   ├── __init__.py
│       │   ├── overview.py       # Overview tab layout
│       │   └── ...               # Additional tab layouts
│       └── callbacks/
│           ├── __init__.py
│           └── ...               # Callback modules
├── {dashboard_name}/             # Data logic (outside pages/)
│   ├── __init__.py
│   ├── parsers/                  # API/CSV extraction
│   │   └── __init__.py
│   ├── loaders/                  # Database operations
│   │   └── __init__.py
│   ├── schemas/                  # Pydantic models
│   │   └── __init__.py
│   └── models/                   # SQLAlchemy ORM
│       └── __init__.py
```
## Step-by-Step Checklist
### 1. Data Layer
- [ ] Create Pydantic schemas in `{dashboard_name}/schemas/` (see the sketch after this checklist)
- [ ] Create SQLAlchemy models in `{dashboard_name}/models/`
- [ ] Create parsers in `{dashboard_name}/parsers/`
- [ ] Create loaders in `{dashboard_name}/loaders/`
- [ ] Add database migrations if needed
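A minimal sketch of a paired Pydantic schema and SQLAlchemy model, as referenced above (the names `AmenitySchema` and `FactAmenity` are illustrative assumptions, not existing project code):

```python
# Illustrative Pydantic schema + SQLAlchemy 2.0 model pair.
from pydantic import BaseModel, Field
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column

class Base(DeclarativeBase):
    pass

class AmenitySchema(BaseModel):
    """Validates one parsed record before it touches the database."""
    neighbourhood_id: int
    amenity_type: str = Field(min_length=1, max_length=50)
    count: int = Field(ge=0)
    year: int

class FactAmenity(Base):
    """ORM row mirroring the validated schema."""
    __tablename__ = "fact_amenities"
    id: Mapped[int] = mapped_column(primary_key=True)
    neighbourhood_id: Mapped[int]
    amenity_type: Mapped[str]
    count: Mapped[int]
    year: Mapped[int]

record = AmenitySchema(neighbourhood_id=1, amenity_type="parks", count=12, year=2024)
row = FactAmenity(**record.model_dump())  # ready for session.add(row)
```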
### 2. dbt Models
Create dbt models in `dbt/models/`:
- [ ] `staging/stg_{source}__{entity}.sql` - Raw data cleaning
- [ ] `intermediate/int_{domain}__{transform}.sql` - Business logic
- [ ] `marts/mart_{domain}.sql` - Final analytical tables
Follow naming conventions:
- Staging: `stg_{source}__{entity}`
- Intermediate: `int_{domain}__{transform}`
- Marts: `mart_{domain}`
### 3. Visualization Layer
- [ ] Create figure factories in `figures/` (or reuse existing)
- [ ] Follow the factory pattern: `create_{chart_type}_figure(data, **kwargs)`
### 4. Dashboard Pages
#### Main Dashboard (`pages/{dashboard_name}/dashboard.py`)
```python
import dash
from dash import html, dcc
import dash_mantine_components as dmc

# Assumed import: tabs/overview.py exports the overview tab layout
from .tabs.overview import overview_tab

dash.register_page(
    __name__,
    path="/{dashboard_name}",
    title="{Dashboard Title}",
    description="{Description}",
)

def layout():
    return dmc.Container([
        # Header
        dmc.Title("{Dashboard Title}", order=1),
        # Tabs
        dmc.Tabs([
            dmc.TabsList([
                dmc.TabsTab("Overview", value="overview"),
                # Add more tabs
            ]),
            dmc.TabsPanel(overview_tab(), value="overview"),
            # Add more panels
        ], value="overview"),
    ])
```
#### Tab Layouts (`pages/{dashboard_name}/tabs/`)
- [ ] Create one file per tab
- [ ] Export layout function from each
#### Callbacks (`pages/{dashboard_name}/callbacks/`)
- [ ] Create callback modules for interactivity
- [ ] Import and register in dashboard.py
### 5. Navigation
Add to sidebar in `components/sidebar.py`:
```python
dmc.NavLink(
    label="{Dashboard Name}",
    href="/{dashboard_name}",
    icon=DashIconify(icon="..."),
)
```
### 6. Documentation
- [ ] Create methodology page (`pages/{dashboard_name}/methodology.py`)
- [ ] Document data sources
- [ ] Document transformation logic
- [ ] Add notebooks to `notebooks/{dashboard_name}/` if needed
### 7. Testing
- [ ] Add unit tests for parsers
- [ ] Add unit tests for loaders
- [ ] Add integration tests for callbacks
- [ ] Run `make test`
### 8. Final Verification
- [ ] All pages render without errors
- [ ] All callbacks respond correctly
- [ ] Data loads successfully
- [ ] dbt models run cleanly (`make dbt-run`)
- [ ] Linting passes (`make lint`)
- [ ] Tests pass (`make test`)
## Example: Toronto Dashboard
Reference implementation: `portfolio_app/pages/toronto/`
Key files:
- `dashboard.py` - Main layout with 5 tabs
- `tabs/overview.py` - Livability scores, scatter plots
- `callbacks/map_callbacks.py` - Choropleth interactions
- `toronto/models/dimensions.py` - Dimension tables
- `toronto/models/facts.py` - Fact tables
## Common Patterns
### Figure Factories
```python
# figures/choropleth.py
import geopandas as gpd
import plotly.graph_objects as go

def create_choropleth_figure(
    gdf: gpd.GeoDataFrame,
    value_column: str,
    title: str,
    **kwargs,
) -> go.Figure:
    ...
```
### Callbacks
```python
# callbacks/map_callbacks.py
from dash import Input, Output, callback

@callback(
    Output("neighbourhood-details", "children"),
    Input("choropleth-map", "clickData"),
)
def update_details(click_data):
    ...
```
### Data Loading
```python
# {dashboard_name}/loaders/load.py
from sqlalchemy.orm import Session

# Schema, Model, and parse_source_data are placeholders for the
# dashboard's own Pydantic schema, ORM model, and parser.
def load_data(session: Session) -> None:
    # Parse from source
    records = parse_source_data()
    # Validate with Pydantic
    validated = [Schema(**r) for r in records]
    # Load to database
    for record in validated:
        session.add(Model(**record.model_dump()))
    session.commit()
```
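Typical invocation might look like this (a sketch; the engine URL shape is an assumption, not the project's configured DSN):

```python
# Sketch: invoking the loader with a session bound to an engine.
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

# Assumed URL shape; the real DSN comes from the app's .env / DATABASE_URL
engine = create_engine("postgresql+psycopg2://portfolio:portfolio@localhost:5432/portfolio")

with Session(engine) as session:
    load_data(session)
```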

docs/runbooks/deployment.md Normal file

@@ -0,0 +1,232 @@
# Runbook: Deployment
This runbook covers deployment procedures for the Analytics Portfolio application.
## Environments
| Environment | Branch | Server | URL |
|-------------|--------|--------|-----|
| Development | `development` | Local | http://localhost:8050 |
| Staging | `staging` | Homelab (hotserv) | Internal |
| Production | `main` | Bandit Labs VPS | https://leodata.science |
## CI/CD Pipeline
### Automatic Deployment
Deployments are triggered automatically via Gitea Actions:
1. **Push to `staging`** → Deploys to staging server
2. **Push to `main`** → Deploys to production server
### Workflow Files
- `.gitea/workflows/ci.yml` - Runs linting and tests on all branches
- `.gitea/workflows/deploy-staging.yml` - Staging deployment
- `.gitea/workflows/deploy-production.yml` - Production deployment
### Required Secrets
Configure these in Gitea repository settings:
| Secret | Description |
|--------|-------------|
| `STAGING_HOST` | Staging server hostname/IP |
| `STAGING_USER` | SSH username for staging |
| `STAGING_SSH_KEY` | Private key for staging SSH |
| `PROD_HOST` | Production server hostname/IP |
| `PROD_USER` | SSH username for production |
| `PROD_SSH_KEY` | Private key for production SSH |
## Manual Deployment
### Prerequisites
- SSH access to target server
- Repository cloned at `~/apps/personal-portfolio`
- Virtual environment created at `.venv`
- Docker and Docker Compose installed
- PostgreSQL container running
### Steps
```bash
# 1. SSH to server
ssh user@server

# 2. Navigate to app directory
cd ~/apps/personal-portfolio

# 3. Pull latest changes
git fetch origin {branch}
git reset --hard origin/{branch}

# 4. Activate virtual environment
source .venv/bin/activate

# 5. Install dependencies
pip install -r requirements.txt

# 6. Run database migrations (if any)
# python -m alembic upgrade head

# 7. Run dbt models
cd dbt && dbt run --profiles-dir . && cd ..

# 8. Restart application
docker compose down
docker compose up -d

# 9. Verify health
curl http://localhost:8050/health
```
## Rollback Procedure
### Quick Rollback
If deployment fails, rollback to previous commit:
```bash
# 1. Find previous working commit
git log --oneline -10

# 2. Reset to that commit
git reset --hard {commit_hash}

# 3. Restart services
docker compose down
docker compose up -d

# 4. Verify
curl http://localhost:8050/health
```
### Full Rollback (Database)
If database changes need to be reverted:
```bash
# 1. Stop application
docker compose down

# 2. Restore database backup
docker compose exec -T postgres pg_restore -U portfolio -d portfolio < backup.dump

# 3. Revert code
git reset --hard {commit_hash}

# 4. Run dbt at that version
cd dbt && dbt run --profiles-dir . && cd ..

# 5. Restart
docker compose up -d
```
## Health Checks
### Application Health
```bash
curl http://localhost:8050/health
```
Expected response:
```json
{"status": "healthy"}
```
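A Dash app typically serves such an endpoint from its underlying Flask server. A minimal sketch of that pattern (the project's actual route registration may differ):

```python
# Sketch: registering a /health route on the Flask server behind a Dash app.
import dash
from flask import jsonify

app = dash.Dash(__name__)

@app.server.route("/health")
def health():
    # Mirrors the expected response shown above
    return jsonify(status="healthy")

if __name__ == "__main__":
    app.run(port=8050)  # Dash 2.x development server
```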
### Database Health
```bash
docker compose exec postgres pg_isready -U portfolio
```
### Container Status
```bash
docker compose ps
```
## Monitoring
### View Logs
```bash
# All services
make logs

# Specific service
make logs SERVICE=postgres

# Or directly
docker compose logs -f
```
### Check Resource Usage
```bash
docker stats
```
## Troubleshooting
### Application Won't Start
1. Check container logs: `docker compose logs app`
2. Verify environment variables: `cat .env`
3. Check database connectivity: `docker compose exec postgres pg_isready`
4. Verify port availability: `lsof -i :8050`
### Database Connection Errors
1. Check postgres container: `docker compose ps postgres`
2. Verify DATABASE_URL in `.env`
3. Check postgres logs: `docker compose logs postgres`
4. Test connection: `docker compose exec postgres psql -U portfolio -c '\l'`
### dbt Failures
1. Check dbt configuration and connectivity: `cd dbt && dbt debug`
2. Verify profiles.yml: `cat dbt/profiles.yml`
3. Run with verbose output: `dbt run --debug`
### Out of Memory
1. Check memory usage: `free -h`
2. Review container limits in docker-compose.yml
3. Consider increasing swap or server resources
## Backup Procedures
### Database Backup
```bash
# Create backup
docker compose exec postgres pg_dump -U portfolio portfolio > backup_$(date +%Y%m%d).sql

# Compressed backup
docker compose exec postgres pg_dump -U portfolio -Fc portfolio > backup_$(date +%Y%m%d).dump
```
### Restore from Backup
```bash
# From SQL file
docker compose exec -T postgres psql -U portfolio portfolio < backup.sql

# From dump file
docker compose exec -T postgres pg_restore -U portfolio -d portfolio < backup.dump
```
## Deployment Checklist
Before deploying to production:
- [ ] All tests pass (`make test`)
- [ ] Linting passes (`make lint`)
- [ ] Staging deployment successful
- [ ] Manual testing on staging complete
- [ ] Database backup taken
- [ ] Rollback plan confirmed
- [ ] Team notified of deployment window

scripts/etl/toronto.sh Executable file

@@ -0,0 +1,72 @@
#!/usr/bin/env bash
# scripts/etl/toronto.sh - Run Toronto data pipeline
#
# Usage:
#   ./scripts/etl/toronto.sh --full         # Complete reload of all data
#   ./scripts/etl/toronto.sh --incremental  # Only new data since last run
#   ./scripts/etl/toronto.sh                # Default: incremental
#
# Logs are written to .dev/logs/etl/

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
LOG_DIR="$PROJECT_ROOT/.dev/logs/etl"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
LOG_FILE="$LOG_DIR/toronto_${TIMESTAMP}.log"
MODE="${1:---incremental}"

mkdir -p "$LOG_DIR"

log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

log "Starting Toronto ETL pipeline (mode: $MODE)"
log "Log file: $LOG_FILE"

cd "$PROJECT_ROOT"

# Activate virtual environment if it exists
if [ -d ".venv" ]; then
    source .venv/bin/activate
    log "Activated virtual environment"
fi

case "$MODE" in
    --full)
        log "Running FULL data reload..."
        log "Step 1/4: Parsing neighbourhood data..."
        python -m portfolio_app.toronto.parsers.neighbourhoods 2>&1 | tee -a "$LOG_FILE"
        log "Step 2/4: Parsing census data..."
        python -m portfolio_app.toronto.parsers.census 2>&1 | tee -a "$LOG_FILE"
        log "Step 3/4: Parsing crime data..."
        python -m portfolio_app.toronto.parsers.crime 2>&1 | tee -a "$LOG_FILE"
        log "Step 4/4: Running dbt transformations..."
        cd dbt && dbt run --full-refresh --profiles-dir . 2>&1 | tee -a "$LOG_FILE" && cd ..
        ;;
    --incremental)
        log "Running INCREMENTAL update..."
        log "Step 1/2: Checking for new data..."
        # Add incremental logic here when implemented
        log "Step 2/2: Running dbt transformations..."
        cd dbt && dbt run --profiles-dir . 2>&1 | tee -a "$LOG_FILE" && cd ..
        ;;
    *)
        log "ERROR: Unknown mode '$MODE'. Use --full or --incremental"
        exit 1
        ;;
esac

log "Toronto ETL pipeline completed successfully"
log "Full log available at: $LOG_FILE"

scripts/logs.sh Executable file

@@ -0,0 +1,20 @@
#!/usr/bin/env bash
# scripts/logs.sh - Follow docker compose logs
#
# Usage:
#   ./scripts/logs.sh              # All services
#   ./scripts/logs.sh postgres     # Specific service
#   ./scripts/logs.sh -n 100       # Last 100 lines

set -euo pipefail

SERVICE="${1:-}"

if [[ -n "$SERVICE" && "$SERVICE" != -* ]]; then
    echo "Following logs for service: $SERVICE"
    shift
    # Pass any remaining flags through verbatim
    docker compose logs -f "$SERVICE" "$@"
else
    echo "Following logs for all services"
    docker compose logs -f "$@"
fi

scripts/run-detached.sh Executable file

@@ -0,0 +1,38 @@
#!/usr/bin/env bash
# scripts/run-detached.sh - Start containers and wait for health
#
# Usage:
#   ./scripts/run-detached.sh

set -euo pipefail

TIMEOUT=60
INTERVAL=5

echo "Starting containers in detached mode..."
docker compose up -d

echo "Waiting for services to become healthy..."
elapsed=0
while [ $elapsed -lt $TIMEOUT ]; do
    # Check if postgres is ready
    if docker compose exec -T postgres pg_isready -U portfolio > /dev/null 2>&1; then
        echo "PostgreSQL is ready!"
        # Check if app health endpoint responds (if running)
        if curl -sf http://localhost:8050/health > /dev/null 2>&1; then
            echo "Application health check passed!"
            echo "All services are healthy."
            exit 0
        fi
    fi
    echo "Waiting... ($elapsed/$TIMEOUT seconds)"
    sleep $INTERVAL
    elapsed=$((elapsed + INTERVAL))
done

echo "ERROR: Health check timed out after $TIMEOUT seconds"
docker compose ps
exit 1