feat: project bootstrap and structure #4

Merged
lmiranda merged 1 commit from feature/sprint1-bootstrap into development 2026-01-11 18:58:42 +00:00
38 changed files with 709 additions and 1 deletion
Showing only changes of commit c7e9b88adb

15
.env.example Normal file
View File

@@ -0,0 +1,15 @@
# Database Configuration
DATABASE_URL=postgresql://portfolio:portfolio_dev@localhost:5432/portfolio
POSTGRES_USER=portfolio
POSTGRES_PASSWORD=portfolio_dev
POSTGRES_DB=portfolio
# Application Settings
DASH_DEBUG=true
SECRET_KEY=change-me-in-production
# Logging
LOG_LEVEL=INFO
# Optional: dbt profile (defaults to profiles.yml)
# DBT_PROFILES_DIR=.

26
.gitignore vendored
View File

@@ -1,4 +1,28 @@
# ---> Python
# ====================
# Project-Specific
# ====================
# Processed data (generated, not source)
data/*/processed/
# Reports (generated)
reports/
# Backups
backups/
# Notebook exports
notebooks/*.html
# dbt
dbt/target/
dbt/dbt_packages/
dbt/logs/
# ====================
# Python
# ====================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

32
.pre-commit-config.yaml Normal file
View File

@@ -0,0 +1,32 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
        args: ['--maxkb=1000']
      - id: check-merge-conflict
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.1.9
    hooks:
      - id: ruff
        args: [--fix, --exit-non-zero-on-fix]
      - id: ruff-format
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.8.0
    hooks:
      - id: mypy
        additional_dependencies:
          - pydantic>=2.0
          - pandas-stubs
          - types-requests
        args: [--ignore-missing-imports]
        exclude: ^(tests/|dbt/)

ci:
  autofix_commit_msg: "style: auto-fix by pre-commit hooks"
  autoupdate_commit_msg: "chore: update pre-commit hooks"

1
.python-version Normal file
View File

@@ -0,0 +1 @@
3.11

258
CLAUDE.md Normal file
View File

@@ -0,0 +1,258 @@
# CLAUDE.md
Working context for Claude Code on the Analytics Portfolio project.
---
## Project Status
**Current Sprint**: 1 (Project Bootstrap)
**Phase**: 1 - Toronto Housing Dashboard
**Branch**: `development` (feature branches merge here)
---
## Quick Reference
### Run Commands
```bash
make setup # Install deps, create .env, init pre-commit
make docker-up # Start PostgreSQL + PostGIS
make docker-down # Stop containers
make db-init # Initialize database schema
make run # Start Dash dev server
make test # Run pytest
make lint # Run ruff linter
make format # Run ruff formatter
make ci # Run all checks
```
### Branch Workflow
1. Create feature branch FROM `development`: `git checkout -b feature/{sprint}-{description}`
2. Work and commit on feature branch
3. Merge INTO `development` when complete
4. `development` -> `staging` -> `main` for releases
---
## Code Conventions
### Import Style
| Context | Style | Example |
|---------|-------|---------|
| Same directory | Single dot | `from .trreb import TRREBParser` |
| Sibling directory | Double dot | `from ..schemas.trreb import TRREBRecord` |
| External packages | Absolute | `import pandas as pd` |
### Module Responsibilities
| Directory | Contains | Purpose |
|-----------|----------|---------|
| `schemas/` | Pydantic models | Data validation |
| `models/` | SQLAlchemy ORM | Database persistence |
| `parsers/` | PDF/CSV extraction | Raw data ingestion |
| `loaders/` | Database operations | Data loading |
| `figures/` | Chart factories | Plotly figure generation |
| `callbacks/` | Dash callbacks | In `pages/{dashboard}/callbacks/` |
| `errors/` | Exceptions + handlers | Error handling |
### Type Hints
Use Python 3.10+ style:
```python
def process(items: list[str], config: dict[str, int] | None = None) -> bool:
    ...
```
### Error Handling
```python
# errors/exceptions.py
class PortfolioError(Exception):
    """Base exception."""

class ParseError(PortfolioError):
    """PDF/CSV parsing failed."""

class ValidationError(PortfolioError):
    """Pydantic or business rule validation failed."""

class LoadError(PortfolioError):
    """Database load operation failed."""
```
### Code Standards
- Single responsibility functions with verb naming
- Early returns over deep nesting
- Google-style docstrings only for non-obvious behavior
- Module-level constants for magic values
- Pydantic BaseSettings for runtime config
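A minimal illustration of these conventions (the function name and threshold are invented for the example):
```python
MAX_PRICE_CAD = 50_000_000  # module-level constant instead of a magic value

def filter_valid_prices(prices: list[float]) -> list[float]:
    """Drop non-positive or implausibly large prices."""
    if not prices:  # early return instead of nested branches
        return []
    return [p for p in prices if 0 < p <= MAX_PRICE_CAD]
```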
---
## Application Structure
```
portfolio_app/
├── app.py                 # Dash app factory with Pages routing
├── config.py              # Pydantic BaseSettings
├── assets/                # CSS, images (auto-served)
├── pages/
│   ├── home.py            # Bio landing page -> /
│   └── toronto/
│       ├── dashboard.py   # Layout only -> /toronto
│       └── callbacks/     # Interaction logic
├── components/            # Shared UI (navbar, footer, cards)
├── figures/               # Shared chart factories
├── toronto/               # Toronto data logic
│   ├── parsers/
│   ├── loaders/
│   ├── schemas/           # Pydantic
│   └── models/            # SQLAlchemy
└── errors/
```
### URL Routing
| URL | Page | Sprint |
|-----|------|--------|
| `/` | Bio landing page | 2 |
| `/toronto` | Toronto Housing Dashboard | 6 |
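A sketch of the app factory this layout implies (component choices and option values are assumptions, not the final `app.py`):
```python
# portfolio_app/app.py (sketch)
import dash
import dash_mantine_components as dmc

def create_app() -> dash.Dash:
    """Create the Dash app with Pages routing enabled."""
    app = dash.Dash(__name__, use_pages=True, suppress_callback_exceptions=True)
    # Shared navbar/footer components would wrap page_container.
    app.layout = dmc.MantineProvider([dash.page_container])
    return app

def main() -> None:
    create_app().run(debug=True)

if __name__ == "__main__":
    main()
```
Each page module then registers its own route, e.g. `dash.register_page(__name__, path="/toronto")` in `pages/toronto/dashboard.py`.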
---
## Tech Stack (Locked)
| Layer | Technology | Version |
|-------|------------|---------|
| Database | PostgreSQL + PostGIS | 16.x |
| Validation | Pydantic | >=2.0 |
| ORM | SQLAlchemy | >=2.0 (2.0-style API only) |
| Transformation | dbt-postgres | >=1.7 |
| Data Processing | Pandas | >=2.1 |
| Geospatial | GeoPandas + Shapely | >=0.14 |
| Visualization | Dash + Plotly | >=2.14 |
| UI Components | dash-mantine-components | Latest stable |
| Testing | pytest | >=7.0 |
| Python | 3.11+ | Via pyenv |
**Notes**:
- SQLAlchemy 2.0 + Pydantic 2.0 only (never mix 1.x APIs)
- PostGIS extension required in database
- Docker Compose V2 format (no `version` field)
---
## Data Model Overview
### Geographic Reality (Toronto Housing)
```
TRREB Districts (~35) - Purchase data (W01, C01, E01...)
CMHC Zones (~20) - Rental data (Census Tract aligned)
City Neighbourhoods (158) - Enrichment/overlay only
```
**Critical**: These geographies do NOT align. Display as separate layers—do not force crosswalks.
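For example (file paths are assumptions), each geography stays in its own GeoDataFrame and is only reprojected for display:
```python
import geopandas as gpd

# Load each geography as an independent layer -- no spatial joins across them.
trreb_districts = gpd.read_file("data/toronto/raw/trreb_districts.geojson")
cmhc_zones = gpd.read_file("data/toronto/raw/cmhc_zones.geojson")
neighbourhoods = gpd.read_file("data/toronto/raw/neighbourhoods_158.geojson")

display_layers = {
    "trreb": trreb_districts.to_crs(4326),
    "cmhc": cmhc_zones.to_crs(4326),
    "neighbourhood": neighbourhoods.to_crs(4326),  # overlay only
}
```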
### Star Schema
| Table | Type | Keys |
|-------|------|------|
| `fact_purchases` | Fact | -> dim_time, dim_trreb_district |
| `fact_rentals` | Fact | -> dim_time, dim_cmhc_zone |
| `dim_time` | Dimension | date_key (PK) |
| `dim_trreb_district` | Dimension | district_key (PK), geometry |
| `dim_cmhc_zone` | Dimension | zone_key (PK), geometry |
| `dim_neighbourhood` | Dimension | neighbourhood_id (PK), geometry |
| `dim_policy_event` | Dimension | event_id (PK) |
**V1 Rule**: `dim_neighbourhood` has NO FK to fact tables—reference overlay only.
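A sketch of two of these tables in 2.0-style SQLAlchemy with GeoAlchemy2 (column details are assumptions until the real models land in `toronto/models/`):
```python
from datetime import date
from decimal import Decimal

from geoalchemy2 import Geometry
from sqlalchemy import ForeignKey, Integer, Numeric, String
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column

class Base(DeclarativeBase):
    pass

class DimTrrebDistrict(Base):
    __tablename__ = "dim_trreb_district"
    district_key: Mapped[int] = mapped_column(Integer, primary_key=True)
    district_code: Mapped[str] = mapped_column(String(3))  # e.g. "W01"
    geometry = mapped_column(Geometry("MULTIPOLYGON", srid=4326))

class FactPurchases(Base):
    __tablename__ = "fact_purchases"
    purchase_key: Mapped[int] = mapped_column(Integer, primary_key=True)
    date_key: Mapped[date] = mapped_column(ForeignKey("dim_time.date_key"))  # dim_time omitted from sketch
    district_key: Mapped[int] = mapped_column(ForeignKey("dim_trreb_district.district_key"))
    avg_price: Mapped[Decimal] = mapped_column(Numeric(12, 2))
```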
### dbt Layers
| Layer | Naming | Purpose |
|-------|--------|---------|
| Staging | `stg_{source}__{entity}` | 1:1 source, cleaned, typed |
| Intermediate | `int_{domain}__{transform}` | Business logic |
| Marts | `mart_{domain}` | Final analytical tables |
---
## DO NOT BUILD (Phase 1)
**Stop and flag if a task seems to require these**:
| Feature | Reason |
|---------|--------|
| `bridge_district_neighbourhood` table | Area-weighted aggregation is Phase 4 |
| Crime data integration | Deferred to Phase 4 |
| Historical boundary reconciliation (140->158) | 2021+ data only for V1 |
| ML prediction models | Energy project scope (Phase 3) |
| Multi-project shared infrastructure | Build first, abstract second (Phase 2) |
---
## Sprint 1 Deliverables
| Category | Tasks |
|----------|-------|
| **Bootstrap** | Git init, pyproject.toml, .env.example, Makefile, CLAUDE.md |
| **Infrastructure** | Docker Compose (PostgreSQL + PostGIS), scripts/ directory |
| **App Foundation** | portfolio_app/ structure, config.py, error handling |
| **Tests** | tests/ directory, conftest.py, pytest config |
| **Data Acquisition** | Download TRREB PDFs, START boundary digitization (HUMAN task) |
### Human Tasks (Cannot Automate)
| Task | Tool | Effort |
|------|------|--------|
| Digitize TRREB district boundaries | QGIS | 3-4 hours |
| Research policy events (10-20) | Manual | 2-3 hours |
| Replace social link placeholders | Manual | 5 minutes |
---
## Environment Variables
Required in `.env`:
```bash
DATABASE_URL=postgresql://user:pass@localhost:5432/portfolio
POSTGRES_USER=portfolio
POSTGRES_PASSWORD=<secure>
POSTGRES_DB=portfolio
DASH_DEBUG=true
SECRET_KEY=<random>
LOG_LEVEL=INFO
```
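`config.py` is expected to read these through pydantic-settings; a minimal sketch (defaults mirror `.env.example`, field names are not final):
```python
# portfolio_app/config.py (sketch)
from pydantic_settings import BaseSettings, SettingsConfigDict

class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    database_url: str = "postgresql://portfolio:portfolio_dev@localhost:5432/portfolio"
    dash_debug: bool = True
    secret_key: str = "change-me-in-production"
    log_level: str = "INFO"

settings = Settings()
```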
---
## Script Standards
All scripts in `scripts/`:
- Include usage comments at top
- Idempotent where possible
- Exit codes: 0 = success, 1 = error
- Use `set -euo pipefail` for bash
- Log to stdout, errors to stderr
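Bash scripts use `set -euo pipefail`; a Python script in `scripts/` might follow the same standards like this (filename and check are hypothetical):
```python
#!/usr/bin/env python3
"""Usage: python scripts/dev/check_env.py

Verifies that required environment variables are set. Idempotent; exits 0 on
success, 1 on error; logs to stdout and errors to stderr.
"""
import os
import sys

REQUIRED = ("DATABASE_URL", "POSTGRES_USER", "POSTGRES_PASSWORD", "POSTGRES_DB")

def main() -> int:
    missing = [name for name in REQUIRED if not os.environ.get(name)]
    if missing:
        print(f"error: missing environment variables: {', '.join(missing)}", file=sys.stderr)
        return 1
    print("environment OK")
    return 0

if __name__ == "__main__":
    sys.exit(main())
```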
---
## Reference Documents
| Document | Location | Use When |
|----------|----------|----------|
| Full specification | `docs/PROJECT_REFERENCE.md` | Architecture decisions |
| Data schemas | `docs/toronto_housing_spec.md` | Parser/model tasks |
| WBS details | `docs/wbs.md` | Sprint planning |
| Bio content | `docs/bio_content.md` | Building home.py |
---
*Last Updated: Sprint 1*

157
Makefile Normal file
View File

@@ -0,0 +1,157 @@
.PHONY: setup docker-up docker-down docker-logs db-init db-reset run test test-cov dbt-run dbt-test dbt-docs lint format typecheck ci deploy clean help
# Default target
.DEFAULT_GOAL := help
# Environment
PYTHON := python3
PIP := pip
DOCKER_COMPOSE := docker compose
# Colors for output
BLUE := \033[0;34m
GREEN := \033[0;32m
YELLOW := \033[0;33m
NC := \033[0m
help: ## Show this help message
	@echo "Usage: make [target]"
	@echo ""
	@echo "Targets:"
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf " $(BLUE)%-15s$(NC) %s\n", $$1, $$2}'

# =============================================================================
# Setup
# =============================================================================
setup: ## Install dependencies, create .env, init pre-commit
	@echo "$(GREEN)Installing dependencies...$(NC)"
	$(PIP) install -e ".[dev,dbt]"
	@echo "$(GREEN)Setting up environment...$(NC)"
	@if [ ! -f .env ]; then cp .env.example .env; echo "$(YELLOW)Created .env from .env.example - please update values$(NC)"; fi
	@echo "$(GREEN)Installing pre-commit hooks...$(NC)"
	pre-commit install
	@echo "$(GREEN)Setup complete!$(NC)"

# =============================================================================
# Docker
# =============================================================================
docker-up: ## Start PostgreSQL + PostGIS containers
	@echo "$(GREEN)Starting database containers...$(NC)"
	$(DOCKER_COMPOSE) up -d
	@echo "$(GREEN)Waiting for database to be ready...$(NC)"
	@sleep 3
	@echo "$(GREEN)Database containers started!$(NC)"

docker-down: ## Stop containers
	@echo "$(YELLOW)Stopping containers...$(NC)"
	$(DOCKER_COMPOSE) down

docker-logs: ## View container logs
	$(DOCKER_COMPOSE) logs -f

# =============================================================================
# Database
# =============================================================================
db-init: ## Initialize database schema
	@echo "$(GREEN)Initializing database schema...$(NC)"
	@if [ -f scripts/db/init.sh ]; then \
		bash scripts/db/init.sh; \
	else \
		echo "$(YELLOW)scripts/db/init.sh not found - skipping$(NC)"; \
	fi

db-reset: ## Drop and recreate database (DESTRUCTIVE)
	@echo "$(YELLOW)WARNING: This will delete all data!$(NC)"
	@read -p "Are you sure? [y/N] " confirm && [ "$$confirm" = "y" ] || exit 1
	$(DOCKER_COMPOSE) down -v
	$(DOCKER_COMPOSE) up -d
	@sleep 3
	$(MAKE) db-init

# =============================================================================
# Application
# =============================================================================
run: ## Start Dash development server
	@echo "$(GREEN)Starting Dash server...$(NC)"
	$(PYTHON) -m portfolio_app.app

# =============================================================================
# Testing
# =============================================================================
test: ## Run pytest
	@echo "$(GREEN)Running tests...$(NC)"
	pytest

test-cov: ## Run pytest with coverage
	@echo "$(GREEN)Running tests with coverage...$(NC)"
	pytest --cov=portfolio_app --cov-report=html --cov-report=term

# =============================================================================
# dbt
# =============================================================================
dbt-run: ## Run dbt models
	@echo "$(GREEN)Running dbt models...$(NC)"
	cd dbt && dbt run

dbt-test: ## Run dbt tests
	@echo "$(GREEN)Running dbt tests...$(NC)"
	cd dbt && dbt test

dbt-docs: ## Generate dbt documentation
	@echo "$(GREEN)Generating dbt docs...$(NC)"
	cd dbt && dbt docs generate && dbt docs serve

# =============================================================================
# Code Quality
# =============================================================================
lint: ## Run ruff linter
	@echo "$(GREEN)Running linter...$(NC)"
	ruff check .

format: ## Run ruff formatter
	@echo "$(GREEN)Formatting code...$(NC)"
	ruff format .
	ruff check --fix .

typecheck: ## Run mypy type checker
	@echo "$(GREEN)Running type checker...$(NC)"
	mypy portfolio_app

ci: ## Run all checks (lint, typecheck, test)
	@echo "$(GREEN)Running CI checks...$(NC)"
	$(MAKE) lint
	$(MAKE) typecheck
	$(MAKE) test
	@echo "$(GREEN)All checks passed!$(NC)"

# =============================================================================
# Deployment
# =============================================================================
deploy: ## Deploy to production
	@echo "$(YELLOW)Deployment not yet configured$(NC)"
	@echo "TODO: Add deployment script"

# =============================================================================
# Cleanup
# =============================================================================
clean: ## Remove build artifacts and caches
	@echo "$(YELLOW)Cleaning up...$(NC)"
	rm -rf build/
	rm -rf dist/
	rm -rf *.egg-info/
	rm -rf .pytest_cache/
	rm -rf .ruff_cache/
	rm -rf .mypy_cache/
	rm -rf htmlcov/
	rm -rf .coverage
	find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
	@echo "$(GREEN)Clean complete!$(NC)"

0
dbt/macros/.gitkeep Normal file
View File

0
dbt/tests/.gitkeep Normal file
View File

22
docker-compose.yml Normal file
View File

@@ -0,0 +1,22 @@
services:
  db:
    image: postgis/postgis:16-3.4
    container_name: portfolio-db
    restart: unless-stopped
    ports:
      - "5432:5432"
    environment:
      POSTGRES_USER: ${POSTGRES_USER:-portfolio}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-portfolio_dev}
      POSTGRES_DB: ${POSTGRES_DB:-portfolio}
    volumes:
      - postgres_data:/var/lib/postgresql/data
      - ./scripts/db/init-postgis.sql:/docker-entrypoint-initdb.d/init-postgis.sql:ro
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-portfolio} -d ${POSTGRES_DB:-portfolio}"]
      interval: 10s
      timeout: 5s
      retries: 5

volumes:
  postgres_data:

0
notebooks/.gitkeep Normal file
View File

View File

@@ -0,0 +1,3 @@
"""Analytics Portfolio Application."""
__version__ = "0.1.0"

View File

@@ -0,0 +1,5 @@
"""Error handling for the portfolio application."""
from .exceptions import LoadError, ParseError, PortfolioError, ValidationError
__all__ = ["PortfolioError", "ParseError", "ValidationError", "LoadError"]

View File

@@ -0,0 +1,17 @@
"""Custom exceptions for the portfolio application."""
class PortfolioError(Exception):
    """Base exception for all portfolio errors."""


class ParseError(PortfolioError):
    """PDF/CSV parsing failed."""


class ValidationError(PortfolioError):
    """Pydantic or business rule validation failed."""


class LoadError(PortfolioError):
    """Database load operation failed."""

View File

@@ -0,0 +1 @@
"""Dash pages."""

View File

@@ -0,0 +1 @@
"""Toronto Housing Dashboard page."""

View File

@@ -0,0 +1 @@
"""Toronto dashboard callbacks."""

View File

@@ -0,0 +1 @@
"""Toronto housing data logic."""

View File

@@ -0,0 +1 @@
"""Database loaders for Toronto housing data."""

View File

@@ -0,0 +1 @@
"""SQLAlchemy models for Toronto housing data."""

View File

@@ -0,0 +1 @@
"""Data parsers for Toronto housing data sources."""

View File

@@ -0,0 +1 @@
"""Pydantic schemas for Toronto housing data validation."""

148
pyproject.toml Normal file
View File

@@ -0,0 +1,148 @@
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "portfolio"
version = "0.1.0"
description = "Analytics Portfolio - Data engineering and visualization showcase"
readme = "README.md"
license = {text = "MIT"}
requires-python = ">=3.11"
authors = [
{name = "Leo Miranda"}
]
classifiers = [
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
]
dependencies = [
# Database
"sqlalchemy>=2.0",
"psycopg2-binary>=2.9",
"geoalchemy2>=0.14",
# Validation
"pydantic>=2.0",
"pydantic-settings>=2.0",
# Data Processing
"pandas>=2.1",
"geopandas>=0.14",
"shapely>=2.0",
# Visualization
"dash>=2.14",
"plotly>=5.18",
"dash-mantine-components>=0.14",
# PDF Parsing
"pdfplumber>=0.10",
"tabula-py>=2.9",
# Utilities
"python-dotenv>=1.0",
"httpx>=0.25",
]
[project.optional-dependencies]
dev = [
# Testing
"pytest>=7.0",
"pytest-cov>=4.0",
"pytest-asyncio>=0.21",
# Linting & Formatting
"ruff>=0.1",
"mypy>=1.7",
# Pre-commit
"pre-commit>=3.5",
# Type stubs
"pandas-stubs",
"types-requests",
]
dbt = [
"dbt-postgres>=1.7",
]
[project.scripts]
portfolio = "portfolio_app.app:main"
[tool.setuptools.packages.find]
where = ["."]
include = ["portfolio_app*"]
[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]
python_functions = ["test_*"]
addopts = [
"-v",
"--tb=short",
"--strict-markers",
]
markers = [
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
"integration: marks tests as integration tests",
]
[tool.ruff]
target-version = "py311"
line-length = 88
exclude = [
".git",
".venv",
"__pycache__",
"build",
"dist",
".ruff_cache",
"dbt/target",
]
[tool.ruff.lint]
select = [
"E", # pycodestyle errors
"W", # pycodestyle warnings
"F", # pyflakes
"I", # isort
"B", # flake8-bugbear
"C4", # flake8-comprehensions
"UP", # pyupgrade
"SIM", # flake8-simplify
]
ignore = [
"E501", # line too long (handled by formatter)
]
[tool.ruff.lint.isort]
known-first-party = ["portfolio_app"]
[tool.ruff.format]
quote-style = "double"
indent-style = "space"
skip-magic-trailing-comma = false
[tool.mypy]
python_version = "3.11"
strict = true
warn_return_any = true
warn_unused_ignores = true
disallow_untyped_defs = true
plugins = ["pydantic.mypy"]
[[tool.mypy.overrides]]
module = [
"dash.*",
"plotly.*",
"geopandas.*",
"shapely.*",
"pdfplumber.*",
"tabula.*",
]
ignore_missing_imports = true

0
scripts/db/.gitkeep Normal file
View File

View File

@@ -0,0 +1,8 @@
-- Initialize PostGIS extension
-- This script runs automatically on first container start
-- Enable PostGIS extension
CREATE EXTENSION IF NOT EXISTS postgis;
-- Verify installation
SELECT PostGIS_Version();

0
scripts/dbt/.gitkeep Normal file
View File

0
scripts/deploy/.gitkeep Normal file
View File

0
scripts/dev/.gitkeep Normal file
View File

0
scripts/docker/.gitkeep Normal file
View File

1
tests/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""Test suite for analytics portfolio."""

9
tests/conftest.py Normal file
View File

@@ -0,0 +1,9 @@
"""Pytest configuration and fixtures."""
import pytest
@pytest.fixture
def sample_fixture():
"""Example fixture - replace with actual fixtures as needed."""
return {}