diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml
new file mode 100644
index 0000000..2e69fed
--- /dev/null
+++ b/.gitea/workflows/ci.yml
@@ -0,0 +1,35 @@
+name: CI
+
+on:
+  push:
+    branches:
+      - development
+      - staging
+      - main
+  pull_request:
+    branches:
+      - development
+
+jobs:
+  lint-and-test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install ruff pytest
+
+      - name: Run linter
+        run: ruff check .
+
+      - name: Run tests
+        run: pytest tests/ -v --tb=short
diff --git a/.gitea/workflows/deploy-production.yml b/.gitea/workflows/deploy-production.yml
new file mode 100644
index 0000000..6cb8837
--- /dev/null
+++ b/.gitea/workflows/deploy-production.yml
@@ -0,0 +1,55 @@
+name: Deploy to Production
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Deploy to Production Server
+        uses: appleboy/ssh-action@v1.0.3
+        with:
+          host: ${{ secrets.PROD_HOST }}
+          username: ${{ secrets.PROD_USER }}
+          key: ${{ secrets.PROD_SSH_KEY }}
+          script: |
+            set -euo pipefail
+
+            cd ~/apps/personal-portfolio
+
+            echo "Pulling latest changes..."
+            git fetch origin main
+            git reset --hard origin/main
+
+            echo "Activating virtual environment..."
+            source .venv/bin/activate
+
+            echo "Installing dependencies..."
+            pip install -r requirements.txt --quiet
+
+            echo "Running dbt models..."
+            cd dbt && dbt run --profiles-dir . && cd ..
+
+            echo "Restarting application..."
+            docker compose down
+            docker compose up -d
+
+            echo "Waiting for health check..."
+            # Poll instead of a fixed 10s sleep: app cold starts can exceed
+            # an arbitrary wait, and a single curl then fails the deploy spuriously.
+            for i in $(seq 1 12); do
+              if curl -sf http://localhost:8050/health > /dev/null; then
+                echo "Health check passed."
+                break
+              fi
+              if [ "$i" -eq 12 ]; then
+                echo "ERROR: health check failed after 60s" >&2
+                exit 1
+              fi
+              sleep 5
+            done
+
+            echo "Production deployment complete!"
diff --git a/.gitea/workflows/deploy-staging.yml b/.gitea/workflows/deploy-staging.yml
new file mode 100644
index 0000000..5a26df2
--- /dev/null
+++ b/.gitea/workflows/deploy-staging.yml
@@ -0,0 +1,55 @@
+name: Deploy to Staging
+
+on:
+  push:
+    branches:
+      - staging
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Deploy to Staging Server
+        uses: appleboy/ssh-action@v1.0.3
+        with:
+          host: ${{ secrets.STAGING_HOST }}
+          username: ${{ secrets.STAGING_USER }}
+          key: ${{ secrets.STAGING_SSH_KEY }}
+          script: |
+            set -euo pipefail
+
+            cd ~/apps/personal-portfolio
+
+            echo "Pulling latest changes..."
+            git fetch origin staging
+            git reset --hard origin/staging
+
+            echo "Activating virtual environment..."
+            source .venv/bin/activate
+
+            echo "Installing dependencies..."
+            pip install -r requirements.txt --quiet
+
+            echo "Running dbt models..."
+            cd dbt && dbt run --profiles-dir . && cd ..
+
+            echo "Restarting application..."
+            docker compose down
+            docker compose up -d
+
+            echo "Waiting for health check..."
+            # Poll instead of a fixed 10s sleep: app cold starts can exceed
+            # an arbitrary wait, and a single curl then fails the deploy spuriously.
+            for i in $(seq 1 12); do
+              if curl -sf http://localhost:8050/health > /dev/null; then
+                echo "Health check passed."
+                break
+              fi
+              if [ "$i" -eq 12 ]; then
+                echo "ERROR: health check failed after 60s" >&2
+                exit 1
+              fi
+              sleep 5
+            done
+
+            echo "Staging deployment complete!"
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..0bd39ba
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024-2025 Leo Miranda
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Makefile b/Makefile index 4d51b4c..0d78a06 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: setup docker-up docker-down db-init load-data run test dbt-run dbt-test lint format ci deploy clean help +.PHONY: setup docker-up docker-down db-init load-data run test dbt-run dbt-test lint format ci deploy clean help logs run-detached etl-toronto # Default target .DEFAULT_GOAL := help @@ -151,6 +151,19 @@ ci: ## Run all checks (lint, typecheck, test) $(MAKE) test @echo "$(GREEN)All checks passed!$(NC)" +# ============================================================================= +# Operations +# ============================================================================= + +logs: ## Follow docker compose logs (usage: make logs or make logs SERVICE=postgres) + @./scripts/logs.sh $(SERVICE) + +run-detached: ## Start containers and wait for health check + @./scripts/run-detached.sh + +etl-toronto: ## Run Toronto ETL pipeline (usage: make etl-toronto MODE=--full) + @./scripts/etl/toronto.sh $(MODE) + # ============================================================================= # Deployment # ============================================================================= diff --git a/README.md b/README.md index 2f5c1b7..1f156b7 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,9 @@ # Analytics Portfolio +[![CI](https://gitea.hotserv.cloud/lmiranda/personal-portfolio/actions/workflows/ci.yml/badge.svg)](https://gitea.hotserv.cloud/lmiranda/personal-portfolio/actions) + +**Live Demo:** 
[leodata.science](https://leodata.science) + A personal portfolio website showcasing data engineering and visualization capabilities, featuring an interactive Toronto Neighbourhood Dashboard. ## Live Pages @@ -32,6 +36,42 @@ An interactive choropleth dashboard analyzing Toronto's 158 official neighbourho - Toronto Police Service (crime statistics) - CMHC Rental Market Survey (rental data by zone) +## Architecture + +```mermaid +flowchart LR + subgraph Sources + A1[City of Toronto API] + A2[Toronto Police API] + A3[CMHC Data] + end + + subgraph ETL + B1[Parsers] + B2[Loaders] + end + + subgraph Database + C1[(PostgreSQL/PostGIS)] + C2[dbt Models] + end + + subgraph Application + D1[Dash App] + D2[Plotly Figures] + end + + A1 & A2 & A3 --> B1 --> B2 --> C1 --> C2 --> D1 --> D2 +``` + +**Pipeline Stages:** +- **Sources**: External APIs and data files (City of Toronto, Toronto Police, CMHC) +- **ETL**: Python parsers extract and validate data; loaders persist to database +- **Database**: PostgreSQL with PostGIS for geospatial; dbt transforms raw → staging → marts +- **Application**: Dash serves interactive dashboards with Plotly visualizations + +For detailed database schema, see [docs/DATABASE_SCHEMA.md](docs/DATABASE_SCHEMA.md). + ## Quick Start ```bash diff --git a/docs/DATABASE_SCHEMA.md b/docs/DATABASE_SCHEMA.md new file mode 100644 index 0000000..7336820 --- /dev/null +++ b/docs/DATABASE_SCHEMA.md @@ -0,0 +1,307 @@ +# Database Schema + +This document describes the PostgreSQL/PostGIS database schema for the Toronto Neighbourhood Dashboard. 
+ +## Entity Relationship Diagram + +```mermaid +erDiagram + dim_time { + int date_key PK + date full_date UK + int year + int month + int quarter + string month_name + bool is_month_start + } + + dim_cmhc_zone { + int zone_key PK + string zone_code UK + string zone_name + geometry geometry + } + + dim_neighbourhood { + int neighbourhood_id PK + string name + geometry geometry + int population + numeric land_area_sqkm + numeric pop_density_per_sqkm + numeric pct_bachelors_or_higher + numeric median_household_income + numeric pct_owner_occupied + numeric pct_renter_occupied + int census_year + } + + dim_policy_event { + int event_id PK + date event_date + date effective_date + string level + string category + string title + text description + string expected_direction + string source_url + string confidence + } + + fact_rentals { + int id PK + int date_key FK + int zone_key FK + string bedroom_type + int universe + numeric avg_rent + numeric median_rent + numeric vacancy_rate + numeric availability_rate + numeric turnover_rate + numeric rent_change_pct + string reliability_code + } + + fact_census { + int id PK + int neighbourhood_id FK + int census_year + int population + numeric population_density + numeric median_household_income + numeric average_household_income + numeric unemployment_rate + numeric pct_bachelors_or_higher + numeric pct_owner_occupied + numeric pct_renter_occupied + numeric median_age + numeric average_dwelling_value + } + + fact_crime { + int id PK + int neighbourhood_id FK + int year + string crime_type + int count + numeric rate_per_100k + } + + fact_amenities { + int id PK + int neighbourhood_id FK + string amenity_type + int count + int year + } + + bridge_cmhc_neighbourhood { + int id PK + string cmhc_zone_code FK + int neighbourhood_id FK + numeric weight + } + + dim_time ||--o{ fact_rentals : "date_key" + dim_cmhc_zone ||--o{ fact_rentals : "zone_key" + dim_neighbourhood ||--o{ fact_census : "neighbourhood_id" + dim_neighbourhood ||--o{ 
fact_crime : "neighbourhood_id" + dim_neighbourhood ||--o{ fact_amenities : "neighbourhood_id" + dim_cmhc_zone ||--o{ bridge_cmhc_neighbourhood : "zone_code" + dim_neighbourhood ||--o{ bridge_cmhc_neighbourhood : "neighbourhood_id" +``` + +## Schema Layers + +### Raw Schema + +Raw data is loaded directly from external sources without transformation: + +| Table | Source | Description | +|-------|--------|-------------| +| `raw.neighbourhoods` | City of Toronto API | GeoJSON neighbourhood boundaries | +| `raw.census_profiles` | City of Toronto API | Census profile data | +| `raw.crime_data` | Toronto Police API | Crime statistics by neighbourhood | +| `raw.cmhc_rentals` | CMHC Data Files | Rental market survey data | + +### Staging Schema (dbt) + +Staging models provide 1:1 cleaned representations of source data: + +| Model | Source Table | Purpose | +|-------|-------------|---------| +| `stg_toronto__neighbourhoods` | raw.neighbourhoods | Cleaned boundaries with standardized names | +| `stg_toronto__census` | raw.census_profiles | Typed census metrics | +| `stg_cmhc__rentals` | raw.cmhc_rentals | Validated rental data | +| `stg_police__crimes` | raw.crime_data | Standardized crime categories | + +### Marts Schema (dbt) + +Analytical tables ready for dashboard consumption: + +| Model | Grain | Purpose | +|-------|-------|---------| +| `mart_neighbourhood_summary` | neighbourhood | Composite livability scores | +| `mart_rental_trends` | zone × month | Time-series rental analysis | +| `mart_crime_rates` | neighbourhood × year | Crime rate calculations | +| `mart_amenity_density` | neighbourhood | Amenity accessibility scores | + +## Table Details + +### Dimension Tables + +#### dim_time +Time dimension for date-based analysis. Grain: one row per month. 
+ +| Column | Type | Constraints | Description | +|--------|------|-------------|-------------| +| date_key | INTEGER | PK | Surrogate key (YYYYMM format) | +| full_date | DATE | UNIQUE, NOT NULL | First day of month | +| year | INTEGER | NOT NULL | Calendar year | +| month | INTEGER | NOT NULL | Month number (1-12) | +| quarter | INTEGER | NOT NULL | Quarter (1-4) | +| month_name | VARCHAR(20) | NOT NULL | Month name | +| is_month_start | BOOLEAN | DEFAULT TRUE | Always true (monthly grain) | + +#### dim_cmhc_zone +CMHC rental market zones (~20 zones covering Toronto). + +| Column | Type | Constraints | Description | +|--------|------|-------------|-------------| +| zone_key | INTEGER | PK, AUTO | Surrogate key | +| zone_code | VARCHAR(10) | UNIQUE, NOT NULL | CMHC zone identifier | +| zone_name | VARCHAR(100) | NOT NULL | Zone display name | +| geometry | GEOMETRY(POLYGON) | SRID 4326 | PostGIS zone boundary | + +#### dim_neighbourhood +Toronto's 158 official neighbourhoods. + +| Column | Type | Constraints | Description | +|--------|------|-------------|-------------| +| neighbourhood_id | INTEGER | PK | City-assigned ID | +| name | VARCHAR(100) | NOT NULL | Neighbourhood name | +| geometry | GEOMETRY(POLYGON) | SRID 4326 | PostGIS boundary | +| population | INTEGER | | Total population | +| land_area_sqkm | NUMERIC(10,4) | | Area in km² | +| pop_density_per_sqkm | NUMERIC(10,2) | | Population density | +| pct_bachelors_or_higher | NUMERIC(5,2) | | Education rate | +| median_household_income | NUMERIC(12,2) | | Median income | +| pct_owner_occupied | NUMERIC(5,2) | | Owner occupancy rate | +| pct_renter_occupied | NUMERIC(5,2) | | Renter occupancy rate | +| census_year | INTEGER | DEFAULT 2021 | Census reference year | + +#### dim_policy_event +Policy events for time-series annotation (rent control, interest rates, etc.). 
+ +| Column | Type | Constraints | Description | +|--------|------|-------------|-------------| +| event_id | INTEGER | PK, AUTO | Surrogate key | +| event_date | DATE | NOT NULL | Announcement date | +| effective_date | DATE | | Implementation date | +| level | VARCHAR(20) | NOT NULL | federal/provincial/municipal | +| category | VARCHAR(20) | NOT NULL | monetary/tax/regulatory/supply/economic | +| title | VARCHAR(200) | NOT NULL | Event title | +| description | TEXT | | Detailed description | +| expected_direction | VARCHAR(10) | NOT NULL | bearish/bullish/neutral | +| source_url | VARCHAR(500) | | Reference link | +| confidence | VARCHAR(10) | DEFAULT 'medium' | high/medium/low | + +### Fact Tables + +#### fact_rentals +CMHC rental market survey data. Grain: zone × bedroom type × survey date. + +| Column | Type | Constraints | Description | +|--------|------|-------------|-------------| +| id | INTEGER | PK, AUTO | Surrogate key | +| date_key | INTEGER | FK → dim_time | Survey date reference | +| zone_key | INTEGER | FK → dim_cmhc_zone | CMHC zone reference | +| bedroom_type | VARCHAR(20) | NOT NULL | bachelor/1-bed/2-bed/3+bed/total | +| universe | INTEGER | | Total rental units | +| avg_rent | NUMERIC(10,2) | | Average rent | +| median_rent | NUMERIC(10,2) | | Median rent | +| vacancy_rate | NUMERIC(5,2) | | Vacancy percentage | +| availability_rate | NUMERIC(5,2) | | Availability percentage | +| turnover_rate | NUMERIC(5,2) | | Turnover percentage | +| rent_change_pct | NUMERIC(5,2) | | Year-over-year change | +| reliability_code | VARCHAR(2) | | CMHC data quality code | + +#### fact_census +Census statistics. Grain: neighbourhood × census year. + +| Column | Type | Constraints | Description | +|--------|------|-------------|-------------| +| id | INTEGER | PK, AUTO | Surrogate key | +| neighbourhood_id | INTEGER | FK → dim_neighbourhood | Neighbourhood reference | +| census_year | INTEGER | NOT NULL | 2016, 2021, etc. 
| +| population | INTEGER | | Total population | +| population_density | NUMERIC(10,2) | | People per km² | +| median_household_income | NUMERIC(12,2) | | Median income | +| average_household_income | NUMERIC(12,2) | | Average income | +| unemployment_rate | NUMERIC(5,2) | | Unemployment % | +| pct_bachelors_or_higher | NUMERIC(5,2) | | Education rate | +| pct_owner_occupied | NUMERIC(5,2) | | Owner rate | +| pct_renter_occupied | NUMERIC(5,2) | | Renter rate | +| median_age | NUMERIC(5,2) | | Median resident age | +| average_dwelling_value | NUMERIC(12,2) | | Average home value | + +#### fact_crime +Crime statistics. Grain: neighbourhood × year × crime type. + +| Column | Type | Constraints | Description | +|--------|------|-------------|-------------| +| id | INTEGER | PK, AUTO | Surrogate key | +| neighbourhood_id | INTEGER | FK → dim_neighbourhood | Neighbourhood reference | +| year | INTEGER | NOT NULL | Calendar year | +| crime_type | VARCHAR(50) | NOT NULL | Crime category | +| count | INTEGER | NOT NULL | Number of incidents | +| rate_per_100k | NUMERIC(10,2) | | Rate per 100k population | + +#### fact_amenities +Amenity counts. Grain: neighbourhood × amenity type × year. + +| Column | Type | Constraints | Description | +|--------|------|-------------|-------------| +| id | INTEGER | PK, AUTO | Surrogate key | +| neighbourhood_id | INTEGER | FK → dim_neighbourhood | Neighbourhood reference | +| amenity_type | VARCHAR(50) | NOT NULL | parks/schools/transit/etc. | +| count | INTEGER | NOT NULL | Number of amenities | +| year | INTEGER | NOT NULL | Reference year | + +### Bridge Tables + +#### bridge_cmhc_neighbourhood +Maps CMHC zones to neighbourhoods with area-based weights for data disaggregation. 
+ +| Column | Type | Constraints | Description | +|--------|------|-------------|-------------| +| id | INTEGER | PK, AUTO | Surrogate key | +| cmhc_zone_code | VARCHAR(10) | FK → dim_cmhc_zone | Zone reference | +| neighbourhood_id | INTEGER | FK → dim_neighbourhood | Neighbourhood reference | +| weight | NUMERIC(5,4) | NOT NULL | Proportional weight (0-1) | + +## Indexes + +| Table | Index | Columns | Purpose | +|-------|-------|---------|---------| +| fact_rentals | ix_fact_rentals_date_zone | date_key, zone_key | Time-series queries | +| fact_census | ix_fact_census_neighbourhood_year | neighbourhood_id, census_year | Census lookups | +| fact_crime | ix_fact_crime_neighbourhood_year | neighbourhood_id, year | Crime trends | +| fact_crime | ix_fact_crime_type | crime_type | Crime filtering | +| fact_amenities | ix_fact_amenities_neighbourhood_year | neighbourhood_id, year | Amenity queries | +| fact_amenities | ix_fact_amenities_type | amenity_type | Amenity filtering | +| bridge_cmhc_neighbourhood | ix_bridge_cmhc_zone | cmhc_zone_code | Zone lookups | +| bridge_cmhc_neighbourhood | ix_bridge_neighbourhood | neighbourhood_id | Neighbourhood lookups | + +## PostGIS Extensions + +The database requires PostGIS for geospatial operations: + +```sql +CREATE EXTENSION IF NOT EXISTS postgis; +``` + +All geometry columns use SRID 4326 (WGS84) for compatibility with web mapping libraries. diff --git a/docs/runbooks/adding-dashboard.md b/docs/runbooks/adding-dashboard.md new file mode 100644 index 0000000..d02e421 --- /dev/null +++ b/docs/runbooks/adding-dashboard.md @@ -0,0 +1,200 @@ +# Runbook: Adding a New Dashboard + +This runbook describes how to add a new data dashboard to the portfolio application. 
+ +## Prerequisites + +- [ ] Data sources identified and accessible +- [ ] Database schema designed +- [ ] Basic Dash/Plotly familiarity + +## Directory Structure + +Create the following structure under `portfolio_app/`: + +``` +portfolio_app/ +├── pages/ +│ └── {dashboard_name}/ +│ ├── dashboard.py # Main layout with tabs +│ ├── methodology.py # Data sources and methods page +│ ├── tabs/ +│ │ ├── __init__.py +│ │ ├── overview.py # Overview tab layout +│ │ └── ... # Additional tab layouts +│ └── callbacks/ +│ ├── __init__.py +│ └── ... # Callback modules +├── {dashboard_name}/ # Data logic (outside pages/) +│ ├── __init__.py +│ ├── parsers/ # API/CSV extraction +│ │ └── __init__.py +│ ├── loaders/ # Database operations +│ │ └── __init__.py +│ ├── schemas/ # Pydantic models +│ │ └── __init__.py +│ └── models/ # SQLAlchemy ORM +│ └── __init__.py +``` + +## Step-by-Step Checklist + +### 1. Data Layer + +- [ ] Create Pydantic schemas in `{dashboard_name}/schemas/` +- [ ] Create SQLAlchemy models in `{dashboard_name}/models/` +- [ ] Create parsers in `{dashboard_name}/parsers/` +- [ ] Create loaders in `{dashboard_name}/loaders/` +- [ ] Add database migrations if needed + +### 2. dbt Models + +Create dbt models in `dbt/models/`: + +- [ ] `staging/stg_{source}__{entity}.sql` - Raw data cleaning +- [ ] `intermediate/int_{domain}__{transform}.sql` - Business logic +- [ ] `marts/mart_{domain}.sql` - Final analytical tables + +Follow naming conventions: +- Staging: `stg_{source}__{entity}` +- Intermediate: `int_{domain}__{transform}` +- Marts: `mart_{domain}` + +### 3. Visualization Layer + +- [ ] Create figure factories in `figures/` (or reuse existing) +- [ ] Follow the factory pattern: `create_{chart_type}_figure(data, **kwargs)` + +### 4. 
Dashboard Pages + +#### Main Dashboard (`pages/{dashboard_name}/dashboard.py`) + +```python +import dash +from dash import html, dcc +import dash_mantine_components as dmc + +dash.register_page( + __name__, + path="/{dashboard_name}", + title="{Dashboard Title}", + description="{Description}" +) + +def layout(): + return dmc.Container([ + # Header + dmc.Title("{Dashboard Title}", order=1), + + # Tabs + dmc.Tabs([ + dmc.TabsList([ + dmc.TabsTab("Overview", value="overview"), + # Add more tabs + ]), + dmc.TabsPanel(overview_tab(), value="overview"), + # Add more panels + ], value="overview"), + ]) +``` + +#### Tab Layouts (`pages/{dashboard_name}/tabs/`) + +- [ ] Create one file per tab +- [ ] Export layout function from each + +#### Callbacks (`pages/{dashboard_name}/callbacks/`) + +- [ ] Create callback modules for interactivity +- [ ] Import and register in dashboard.py + +### 5. Navigation + +Add to sidebar in `components/sidebar.py`: + +```python +dmc.NavLink( + label="{Dashboard Name}", + href="/{dashboard_name}", + icon=DashIconify(icon="..."), +) +``` + +### 6. Documentation + +- [ ] Create methodology page (`pages/{dashboard_name}/methodology.py`) +- [ ] Document data sources +- [ ] Document transformation logic +- [ ] Add notebooks to `notebooks/{dashboard_name}/` if needed + +### 7. Testing + +- [ ] Add unit tests for parsers +- [ ] Add unit tests for loaders +- [ ] Add integration tests for callbacks +- [ ] Run `make test` + +### 8. 
Final Verification + +- [ ] All pages render without errors +- [ ] All callbacks respond correctly +- [ ] Data loads successfully +- [ ] dbt models run cleanly (`make dbt-run`) +- [ ] Linting passes (`make lint`) +- [ ] Tests pass (`make test`) + +## Example: Toronto Dashboard + +Reference implementation: `portfolio_app/pages/toronto/` + +Key files: +- `dashboard.py` - Main layout with 5 tabs +- `tabs/overview.py` - Livability scores, scatter plots +- `callbacks/map_callbacks.py` - Choropleth interactions +- `toronto/models/dimensions.py` - Dimension tables +- `toronto/models/facts.py` - Fact tables + +## Common Patterns + +### Figure Factories + +```python +# figures/choropleth.py +def create_choropleth_figure( + gdf: gpd.GeoDataFrame, + value_column: str, + title: str, + **kwargs +) -> go.Figure: + ... +``` + +### Callbacks + +```python +# callbacks/map_callbacks.py +@callback( + Output("neighbourhood-details", "children"), + Input("choropleth-map", "clickData"), +) +def update_details(click_data): + ... +``` + +### Data Loading + +```python +# {dashboard_name}/loaders/load.py +def load_data(session: Session) -> None: + # Parse from source + records = parse_source_data() + + # Validate with Pydantic + validated = [Schema(**r) for r in records] + + # Load to database + for record in validated: + session.add(Model(**record.model_dump())) + + session.commit() +``` diff --git a/docs/runbooks/deployment.md b/docs/runbooks/deployment.md new file mode 100644 index 0000000..3babddc --- /dev/null +++ b/docs/runbooks/deployment.md @@ -0,0 +1,232 @@ +# Runbook: Deployment + +This runbook covers deployment procedures for the Analytics Portfolio application. 
+ +## Environments + +| Environment | Branch | Server | URL | +|-------------|--------|--------|-----| +| Development | `development` | Local | http://localhost:8050 | +| Staging | `staging` | Homelab (hotserv) | Internal | +| Production | `main` | Bandit Labs VPS | https://leodata.science | + +## CI/CD Pipeline + +### Automatic Deployment + +Deployments are triggered automatically via Gitea Actions: + +1. **Push to `staging`** → Deploys to staging server +2. **Push to `main`** → Deploys to production server + +### Workflow Files + +- `.gitea/workflows/ci.yml` - Runs linting and tests on all branches +- `.gitea/workflows/deploy-staging.yml` - Staging deployment +- `.gitea/workflows/deploy-production.yml` - Production deployment + +### Required Secrets + +Configure these in Gitea repository settings: + +| Secret | Description | +|--------|-------------| +| `STAGING_HOST` | Staging server hostname/IP | +| `STAGING_USER` | SSH username for staging | +| `STAGING_SSH_KEY` | Private key for staging SSH | +| `PROD_HOST` | Production server hostname/IP | +| `PROD_USER` | SSH username for production | +| `PROD_SSH_KEY` | Private key for production SSH | + +## Manual Deployment + +### Prerequisites + +- SSH access to target server +- Repository cloned at `~/apps/personal-portfolio` +- Virtual environment created at `.venv` +- Docker and Docker Compose installed +- PostgreSQL container running + +### Steps + +```bash +# 1. SSH to server +ssh user@server + +# 2. Navigate to app directory +cd ~/apps/personal-portfolio + +# 3. Pull latest changes +git fetch origin {branch} +git reset --hard origin/{branch} + +# 4. Activate virtual environment +source .venv/bin/activate + +# 5. Install dependencies +pip install -r requirements.txt + +# 6. Run database migrations (if any) +# python -m alembic upgrade head + +# 7. Run dbt models +cd dbt && dbt run --profiles-dir . && cd .. + +# 8. Restart application +docker compose down +docker compose up -d + +# 9. 
Verify health +curl http://localhost:8050/health +``` + +## Rollback Procedure + +### Quick Rollback + +If deployment fails, rollback to previous commit: + +```bash +# 1. Find previous working commit +git log --oneline -10 + +# 2. Reset to that commit +git reset --hard {commit_hash} + +# 3. Restart services +docker compose down +docker compose up -d + +# 4. Verify +curl http://localhost:8050/health +``` + +### Full Rollback (Database) + +If database changes need to be reverted: + +```bash +# 1. Stop application +docker compose down + +# 2. Restore database backup +pg_restore -h localhost -U portfolio -d portfolio backup.dump + +# 3. Revert code +git reset --hard {commit_hash} + +# 4. Run dbt at that version +cd dbt && dbt run --profiles-dir . && cd .. + +# 5. Restart +docker compose up -d +``` + +## Health Checks + +### Application Health + +```bash +curl http://localhost:8050/health +``` + +Expected response: +```json +{"status": "healthy"} +``` + +### Database Health + +```bash +docker compose exec postgres pg_isready -U portfolio +``` + +### Container Status + +```bash +docker compose ps +``` + +## Monitoring + +### View Logs + +```bash +# All services +make logs + +# Specific service +make logs SERVICE=postgres + +# Or directly +docker compose logs -f +``` + +### Check Resource Usage + +```bash +docker stats +``` + +## Troubleshooting + +### Application Won't Start + +1. Check container logs: `docker compose logs app` +2. Verify environment variables: `cat .env` +3. Check database connectivity: `docker compose exec postgres pg_isready` +4. Verify port availability: `lsof -i :8050` + +### Database Connection Errors + +1. Check postgres container: `docker compose ps postgres` +2. Verify DATABASE_URL in `.env` +3. Check postgres logs: `docker compose logs postgres` +4. Test connection: `docker compose exec postgres psql -U portfolio -c '\l'` + +### dbt Failures + +1. Check dbt logs: `cd dbt && dbt debug` +2. Verify profiles.yml: `cat dbt/profiles.yml` +3. 
Run with verbose output: `dbt run --debug` + +### Out of Memory + +1. Check memory usage: `free -h` +2. Review container limits in docker-compose.yml +3. Consider increasing swap or server resources + +## Backup Procedures + +### Database Backup + +```bash +# Create backup +docker compose exec postgres pg_dump -U portfolio portfolio > backup_$(date +%Y%m%d).sql + +# Compressed backup +docker compose exec postgres pg_dump -U portfolio -Fc portfolio > backup_$(date +%Y%m%d).dump +``` + +### Restore from Backup + +```bash +# From SQL file +docker compose exec -T postgres psql -U portfolio portfolio < backup.sql + +# From dump file +docker compose exec -T postgres pg_restore -U portfolio -d portfolio < backup.dump +``` + +## Deployment Checklist + +Before deploying to production: + +- [ ] All tests pass (`make test`) +- [ ] Linting passes (`make lint`) +- [ ] Staging deployment successful +- [ ] Manual testing on staging complete +- [ ] Database backup taken +- [ ] Rollback plan confirmed +- [ ] Team notified of deployment window diff --git a/scripts/etl/toronto.sh b/scripts/etl/toronto.sh new file mode 100755 index 0000000..6019693 --- /dev/null +++ b/scripts/etl/toronto.sh @@ -0,0 +1,72 @@ +#!/usr/bin/env bash +# scripts/etl/toronto.sh - Run Toronto data pipeline +# +# Usage: +# ./scripts/etl/toronto.sh --full # Complete reload of all data +# ./scripts/etl/toronto.sh --incremental # Only new data since last run +# ./scripts/etl/toronto.sh # Default: incremental +# +# Logs are written to .dev/logs/etl/ + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." 
&& pwd)" +LOG_DIR="$PROJECT_ROOT/.dev/logs/etl" +TIMESTAMP=$(date +%Y%m%d_%H%M%S) +LOG_FILE="$LOG_DIR/toronto_${TIMESTAMP}.log" + +MODE="${1:---incremental}" + +mkdir -p "$LOG_DIR" + +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE" +} + +log "Starting Toronto ETL pipeline (mode: $MODE)" +log "Log file: $LOG_FILE" + +cd "$PROJECT_ROOT" + +# Activate virtual environment if it exists +if [ -d ".venv" ]; then + source .venv/bin/activate + log "Activated virtual environment" +fi + +case "$MODE" in + --full) + log "Running FULL data reload..." + + log "Step 1/4: Parsing neighbourhood data..." + python -m portfolio_app.toronto.parsers.neighbourhoods 2>&1 | tee -a "$LOG_FILE" + + log "Step 2/4: Parsing census data..." + python -m portfolio_app.toronto.parsers.census 2>&1 | tee -a "$LOG_FILE" + + log "Step 3/4: Parsing crime data..." + python -m portfolio_app.toronto.parsers.crime 2>&1 | tee -a "$LOG_FILE" + + log "Step 4/4: Running dbt transformations..." + cd dbt && dbt run --full-refresh --profiles-dir . 2>&1 | tee -a "$LOG_FILE" && cd .. + ;; + + --incremental) + log "Running INCREMENTAL update..." + + log "Step 1/2: Checking for new data..." + # Add incremental logic here when implemented + + log "Step 2/2: Running dbt transformations..." + cd dbt && dbt run --profiles-dir . 2>&1 | tee -a "$LOG_FILE" && cd .. + ;; + + *) + log "ERROR: Unknown mode '$MODE'. 
Use --full or --incremental"
+        exit 1
+        ;;
+esac
+
+log "Toronto ETL pipeline completed successfully"
+log "Full log available at: $LOG_FILE"
diff --git a/scripts/logs.sh b/scripts/logs.sh
new file mode 100755
index 0000000..8d53153
--- /dev/null
+++ b/scripts/logs.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# scripts/logs.sh - Follow docker compose logs
+#
+# Usage:
+#   ./scripts/logs.sh            # All services
+#   ./scripts/logs.sh postgres   # Specific service
+#   ./scripts/logs.sh -n 100     # Last 100 lines
+
+set -euo pipefail
+
+SERVICE="${1:-}"
+# Keep pass-through flags as an array so quoted arguments survive intact;
+# flattening "${@:2}" into a string re-splits on whitespace at expansion time.
+EXTRA_ARGS=("${@:2}")
+
+if [[ -n "$SERVICE" && "$SERVICE" != -* ]]; then
+  echo "Following logs for service: $SERVICE"
+  docker compose logs -f "$SERVICE" ${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"}
+else
+  echo "Following logs for all services"
+  docker compose logs -f "$@"
+fi
diff --git a/scripts/run-detached.sh b/scripts/run-detached.sh
new file mode 100755
index 0000000..e4de1d5
--- /dev/null
+++ b/scripts/run-detached.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# scripts/run-detached.sh - Start containers and wait for health
+#
+# Usage:
+#   ./scripts/run-detached.sh
+
+set -euo pipefail
+
+TIMEOUT=60
+INTERVAL=5
+
+echo "Starting containers in detached mode..."
+docker compose up -d
+
+echo "Waiting for services to become healthy..."
+elapsed=0
+
+while [ $elapsed -lt $TIMEOUT ]; do
+  # Check if postgres is ready
+  if docker compose exec -T postgres pg_isready -U portfolio > /dev/null 2>&1; then
+    echo "PostgreSQL is ready!"
+
+    # Check if app health endpoint responds (if running)
+    if curl -sf http://localhost:8050/health > /dev/null 2>&1; then
+      echo "Application health check passed!"
+      echo "All services are healthy."
+      exit 0
+    fi
+  fi
+
+  echo "Waiting... ($elapsed/$TIMEOUT seconds)"
+  sleep $INTERVAL
+  elapsed=$((elapsed + INTERVAL))
+done
+
+echo "ERROR: Health check timed out after $TIMEOUT seconds"
+docker compose ps
+exit 1