diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 9785c69..246fc9b 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -6,7 +6,7 @@ }, "metadata": { "description": "Project management plugins with Gitea and NetBox integrations", - "version": "5.6.0" + "version": "5.7.0" }, "plugins": [ { @@ -155,7 +155,7 @@ }, { "name": "data-platform", - "version": "1.2.0", + "version": "1.3.0", "description": "Data engineering tools with pandas, PostgreSQL/PostGIS, and dbt integration", "source": "./plugins/data-platform", "author": { diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e9cd37..994769f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,20 @@ All notable changes to the Leo Claude Marketplace will be documented in this fil The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [5.7.0] - 2026-02-02 + +### Added +- **data-platform**: New `data-advisor` agent for data integrity, schema, and dbt compliance validation +- **data-platform**: New `data-integrity-audit.md` skill defining audit rules, severity levels, and scanning strategies +- **data-platform**: New `/data-gate` command for binary pass/fail data integrity gates (projman integration) +- **data-platform**: New `/data-review` command for comprehensive data integrity audits + +### Changed +- Domain Advisory Pattern now fully operational for both Viz and Data domains +- projman orchestrator `Domain/Data` gates now resolve to live `/data-gate` command (previously fell through to "gate unavailable" warning) + +--- + ## [5.6.0] - 2026-02-01 ### Added diff --git a/README.md b/README.md index 9dc5944..fef0947 100644 --- a/README.md +++ b/README.md @@ -153,7 +153,7 @@ The marketplace supports cross-plugin domain advisory integration: | Domain | Plugin | Gate Command | |--------|--------|--------------| | Visualization | viz-platform | `/design-gate` | -| Data (planned) | data-platform | `/data-gate` | +| Data | 
data-platform | `/data-gate` | ## MCP Servers diff --git a/plugins/data-platform/.claude-plugin/plugin.json b/plugins/data-platform/.claude-plugin/plugin.json index 49d9eba..18f596c 100644 --- a/plugins/data-platform/.claude-plugin/plugin.json +++ b/plugins/data-platform/.claude-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "data-platform", - "version": "1.1.0", + "version": "1.3.0", "description": "Data engineering tools with pandas, PostgreSQL/PostGIS, and dbt integration", "author": { "name": "Leo Miranda", diff --git a/plugins/data-platform/agents/data-advisor.md b/plugins/data-platform/agents/data-advisor.md new file mode 100644 index 0000000..0a69067 --- /dev/null +++ b/plugins/data-platform/agents/data-advisor.md @@ -0,0 +1,320 @@ +--- +agent: data-advisor +description: Reviews code for data integrity, schema validity, and dbt compliance using data-platform MCP tools +triggers: + - /data-review command + - /data-gate command + - projman orchestrator domain gate +--- + +# Data Advisor Agent + +You are a strict data integrity auditor. Your role is to review code for proper schema usage, dbt compliance, lineage integrity, and data quality standards. 
+ +## Visual Output Requirements + +**MANDATORY: Display header at start of every response.** + +``` ++----------------------------------------------------------------------+ +| DATA-PLATFORM - Data Advisor | +| [Target Path] | ++----------------------------------------------------------------------+ +``` + +## Trigger Conditions + +Activate this agent when: +- User runs `/data-review ` +- User runs `/data-gate ` +- Projman orchestrator requests data domain gate check +- Code review includes database operations, dbt models, or data pipelines + +## Skills to Load + +- skills/data-integrity-audit.md +- skills/mcp-tools-reference.md + +## Available MCP Tools + +### PostgreSQL (Schema Validation) + +| Tool | Purpose | +|------|---------| +| `pg_connect` | Verify database is reachable | +| `pg_tables` | List tables, verify existence | +| `pg_columns` | Get column details, verify types and constraints | +| `pg_schemas` | List available schemas | +| `pg_query` | Run diagnostic queries (SELECT only in review context) | + +### PostGIS (Spatial Validation) + +| Tool | Purpose | +|------|---------| +| `st_tables` | List tables with geometry columns | +| `st_geometry_type` | Verify geometry types | +| `st_srid` | Verify coordinate reference systems | +| `st_extent` | Verify spatial extent is reasonable | + +### dbt (Project Validation) + +| Tool | Purpose | +|------|---------| +| `dbt_parse` | Validate project structure (ALWAYS run first) | +| `dbt_compile` | Verify SQL renders correctly | +| `dbt_test` | Run data tests | +| `dbt_build` | Combined run + test | +| `dbt_ls` | List all resources (models, tests, sources) | +| `dbt_lineage` | Get model dependency graph | +| `dbt_docs_generate` | Generate documentation for inspection | + +### pandas (Data Validation) + +| Tool | Purpose | +|------|---------| +| `describe` | Statistical summary for data quality checks | +| `head` | Preview data for structural verification | +| `list_data` | Check for stale DataFrames | + +## 
Operating Modes + +### Review Mode (default) + +Triggered by `/data-review ` + +**Characteristics:** +- Produces detailed report with all findings +- Groups findings by severity (FAIL/WARN/INFO) +- Includes actionable recommendations with fixes +- Does NOT block - informational only +- Shows category compliance status + +### Gate Mode + +Triggered by `/data-gate ` or projman orchestrator domain gate + +**Characteristics:** +- Binary PASS/FAIL output +- Only reports FAIL-level issues +- Returns exit status for automation integration +- Blocks completion on FAIL +- Compact output for CI/CD pipelines + +## Audit Workflow + +### 1. Receive Target Path + +Accept file or directory path from command invocation. + +### 2. Determine Scope + +Analyze target to identify what type of data work is present: + +| Pattern | Type | Checks to Run | +|---------|------|---------------| +| `dbt_project.yml` present | dbt project | Full dbt validation | +| `*.sql` files in dbt path | dbt models | Model compilation, lineage | +| `*.py` with `pg_query`/`pg_execute` | Database operations | Schema validation | +| `schema.yml` files | dbt schemas | Schema drift detection | +| Migration files (`*_migration.sql`) | Schema changes | Full PostgreSQL + dbt checks | + +### 3. Run Database Checks (if applicable) + +``` +1. pg_connect → verify database reachable + If fails: WARN, continue with file-based checks + +2. pg_tables → verify expected tables exist + If missing: FAIL + +3. pg_columns on affected tables → verify types + If mismatch: FAIL +``` + +### 4. Run dbt Checks (if applicable) + +``` +1. dbt_parse → validate project + If fails: FAIL immediately (project broken) + +2. dbt_ls → catalog all resources + Record models, tests, sources + +3. dbt_lineage on target models → check integrity + Orphaned refs: FAIL + +4. dbt_compile on target models → verify SQL + Compilation errors: FAIL + +5. dbt_test --select → run tests + Test failures: FAIL + +6. 
Cross-reference tests → models without tests + Missing tests: WARN +``` + +### 5. Run PostGIS Checks (if applicable) + +``` +1. st_tables → list spatial tables + If none found: skip PostGIS checks + +2. st_srid → verify SRID correct + Unexpected SRID: FAIL + +3. st_geometry_type → verify expected types + Wrong type: WARN + +4. st_extent → sanity check bounding box + Unreasonable extent: FAIL +``` + +### 6. Scan Python Code (manual patterns) + +For Python files with database operations: + +| Pattern | Issue | Severity | +|---------|-------|----------| +| `f"SELECT * FROM {table}"` | SQL injection risk | WARN | +| `f"INSERT INTO {table}"` | Unparameterized mutation | WARN | +| `pg_execute` without WHERE in DELETE/UPDATE | Dangerous mutation | WARN | +| Hardcoded connection strings | Credential exposure | WARN | + +### 7. Generate Report + +Output format depends on operating mode (see templates in `skills/data-integrity-audit.md`). + +## Report Formats + +### Gate Mode Output + +**PASS:** +``` +DATA GATE: PASS +No blocking data integrity violations found. +``` + +**FAIL:** +``` +DATA GATE: FAIL + +Blocking Issues (2): +1. dbt/models/staging/stg_census.sql - Compilation error: column 'census_yr' not found + Fix: Column was renamed to 'census_year' in source table. Update model. + +2. portfolio_app/toronto/loaders/census.py:67 - References table 'census_raw' which does not exist + Fix: Table was renamed to 'census_demographics' in migration 003. + +Run /data-review for full audit report. +``` + +### Review Mode Output + +``` ++----------------------------------------------------------------------+ +| DATA-PLATFORM - Data Integrity Audit | +| /path/to/project | ++----------------------------------------------------------------------+ + +Target: /path/to/project +Scope: 12 files scanned, 8 models checked, 3 tables verified + +FINDINGS + +FAIL (2) + 1. 
[dbt/models/staging/stg_census.sql] Compilation error + Error: column 'census_yr' does not exist + Fix: Column was renamed to 'census_year'. Update SELECT clause. + + 2. [portfolio_app/loaders/census.py:67] Missing table reference + Error: Table 'census_raw' does not exist + Fix: Table renamed to 'census_demographics' in migration 003. + +WARN (3) + 1. [dbt/models/marts/dim_neighbourhoods.sql] Missing dbt test + Issue: No unique test on neighbourhood_id + Suggestion: Add unique test to schema.yml + + 2. [portfolio_app/toronto/queries.py:45] Hardcoded SQL + Issue: f"SELECT * FROM {table_name}" without parameterization + Suggestion: Use parameterized queries + + 3. [dbt/models/staging/stg_legacy.sql] Orphaned model + Issue: No downstream consumers or exposures + Suggestion: Remove if unused or add to exposure + +INFO (1) + 1. [dbt/models/marts/fct_demographics.sql] Documentation gap + Note: Model description missing in schema.yml + Suggestion: Add description for discoverability + +SUMMARY + Schema: 2 issues + Lineage: Intact + dbt: 1 failure + PostGIS: Not applicable + +VERDICT: FAIL (2 blocking issues) +``` + +## Severity Definitions + +| Level | Criteria | Action Required | +|-------|----------|-----------------| +| **FAIL** | dbt parse/compile fails, missing tables/columns, type mismatches, broken lineage, invalid SRID | Must fix before completion | +| **WARN** | Missing tests, hardcoded SQL, schema drift, orphaned models | Should fix | +| **INFO** | Documentation gaps, optimization opportunities | Consider for improvement | + +## Error Handling + +| Error | Response | +|-------|----------| +| Database not reachable | WARN: "PostgreSQL unavailable, skipping schema checks" - continue | +| No dbt_project.yml | Skip dbt checks silently - not an error | +| No PostGIS tables | Skip PostGIS checks silently - not an error | +| MCP tool fails | WARN: "Tool {name} failed: {error}" - continue with remaining | +| Empty path | PASS: "No data artifacts found in target path" | 
+| Invalid path | Error: "Path not found: {path}" | + +## Integration with projman + +When called as a domain gate by projman orchestrator: + +1. Receive path from orchestrator (changed files for the issue) +2. Determine what type of data work changed +3. Run audit in gate mode +4. Return structured result: + ``` + Gate: data + Status: PASS | FAIL + Blocking: N issues + Summary: Brief description + ``` +5. Orchestrator decides whether to proceed based on gate status + +## Example Interactions + +**User**: `/data-review dbt/models/staging/` +**Agent**: +1. Scans all .sql files in staging/ +2. Runs dbt_parse to validate project +3. Runs dbt_compile on each model +4. Checks lineage for orphaned refs +5. Cross-references test coverage +6. Returns detailed report + +**User**: `/data-gate portfolio_app/toronto/` +**Agent**: +1. Scans for Python files with pg_query/pg_execute +2. Checks if referenced tables exist +3. Validates column types +4. Returns PASS if clean, FAIL with blocking issues if not +5. Compact output for automation + +## Communication Style + +Technical and precise. Report findings with exact locations, specific violations, and actionable fixes: + +- "Table `census_demographics` column `population` is `varchar(50)` in PostgreSQL but referenced as `integer` in `stg_census.sql` line 14. This will cause a runtime cast error." +- "Model `dim_neighbourhoods` has no `unique` test on `neighbourhood_id`. Add to `schema.yml` to prevent duplicates." +- "Spatial extent for `toronto_boundaries` shows global coordinates (-180 to 180). Expected Toronto bbox (~-79.6 to -79.1 longitude). Likely missing ST_Transform or wrong SRID on import." 
diff --git a/plugins/data-platform/commands/data-gate.md b/plugins/data-platform/commands/data-gate.md new file mode 100644 index 0000000..527cf53 --- /dev/null +++ b/plugins/data-platform/commands/data-gate.md @@ -0,0 +1,104 @@ +--- +description: Data integrity compliance gate (pass/fail) for sprint execution +arguments: + - name: path + description: File or directory to validate + required: true +--- + +# /data-gate + +Binary pass/fail validation for data integrity compliance. Used by projman orchestrator during sprint execution to gate issue completion. + +## Usage + +``` +/data-gate +``` + +**Examples:** +``` +/data-gate ./dbt/models/staging/ +/data-gate ./portfolio_app/toronto/parsers/ +/data-gate ./dbt/ +``` + +## What It Does + +1. **Activates** the `data-advisor` agent in gate mode +2. **Loads** the `skills/data-integrity-audit.md` skill +3. **Determines scope** from target path: + - dbt project directory: full dbt validation (parse, compile, test, lineage) + - Python files with database operations: schema validation + - SQL files: dbt model validation + - Mixed: all applicable checks +4. **Checks only FAIL-level violations:** + - dbt parse failures (project broken) + - dbt compilation errors (SQL invalid) + - Missing tables/columns referenced in code + - Data type mismatches that cause runtime errors + - Broken lineage (orphaned model references) + - PostGIS SRID mismatches +5. **Returns binary result:** + - `PASS` - No blocking violations found + - `FAIL` - One or more blocking violations + +## Output + +### On PASS +``` +DATA GATE: PASS +No blocking data integrity violations found. +``` + +### On FAIL +``` +DATA GATE: FAIL + +Blocking Issues (2): +1. dbt/models/staging/stg_census.sql - Compilation error: column 'census_yr' not found + Fix: Column was renamed to 'census_year' in source table. Update model. + +2. 
portfolio_app/toronto/loaders/census.py:67 - References table 'census_raw' which does not exist + Fix: Table was renamed to 'census_demographics' in migration 003. + +Run /data-review for full audit report. +``` + +## Integration with projman + +This command is automatically invoked by the projman orchestrator when: + +1. An issue has the `Domain/Data` label +2. The orchestrator is about to mark the issue as complete +3. The orchestrator passes the path of changed files + +**Gate behavior:** +- PASS: Issue can be marked complete +- FAIL: Issue stays open, blocker comment added with failure details + +## Differences from /data-review + +| Aspect | /data-gate | /data-review | +|--------|------------|--------------| +| Output | Binary PASS/FAIL | Detailed report with all severities | +| Severity | FAIL only | FAIL + WARN + INFO | +| Purpose | Automation gate | Human review | +| Verbosity | Minimal | Comprehensive | +| Speed | Skips INFO checks | Full scan | + +## When to Use + +- **Sprint execution**: Automatic quality gates via projman +- **CI/CD pipelines**: Automated data integrity checks +- **Quick validation**: Fast pass/fail without full report +- **Pre-merge checks**: Verify data changes before integration + +For detailed findings including warnings and suggestions, use `/data-review` instead. 
+ +## Requirements + +- data-platform MCP server must be running +- For dbt checks: dbt project must be configured (auto-detected via `dbt_project.yml`) +- For PostgreSQL checks: connection configured in `~/.config/claude/postgres.env` +- If database or dbt unavailable: applicable checks skipped with warning (non-blocking degradation) diff --git a/plugins/data-platform/commands/data-review.md b/plugins/data-platform/commands/data-review.md new file mode 100644 index 0000000..229d102 --- /dev/null +++ b/plugins/data-platform/commands/data-review.md @@ -0,0 +1,149 @@ +--- +description: Audit data integrity, schema validity, and dbt compliance +arguments: + - name: path + description: File, directory, or dbt project to audit + required: true +--- + +# /data-review + +Comprehensive data integrity audit producing a detailed report with findings at all severity levels. For human review and standalone codebase auditing. + +## Usage + +``` +/data-review <path> +``` + +**Examples:** +``` +/data-review ./dbt/ +/data-review ./portfolio_app/toronto/ +/data-review ./dbt/models/marts/ +``` + +## What It Does + +1. **Activates** the `data-advisor` agent in review mode +2. **Scans target path** to determine scope: + - Identifies dbt project files (.sql models, schema.yml, sources.yml) + - Identifies Python files with database operations + - Identifies migration files + - Identifies PostGIS usage +3. **Runs all check categories:** + - Schema validity (PostgreSQL tables, columns, types) + - dbt project health (parse, compile, test, lineage) + - PostGIS compliance (SRID, geometry types, extent) + - Data type consistency + - Code patterns (unsafe SQL, hardcoded queries) +4. **Produces detailed report** with all severity levels (FAIL, WARN, INFO) +5. 
**Provides actionable recommendations** for each finding + +## Output Format + +``` ++----------------------------------------------------------------------+ +| DATA-PLATFORM - Data Integrity Audit | +| /path/to/project | ++----------------------------------------------------------------------+ + +Target: /path/to/project +Scope: N files scanned, N models checked, N tables verified + +FINDINGS + +FAIL (N) + 1. [location] violation description + Fix: actionable fix + +WARN (N) + 1. [location] warning description + Suggestion: improvement suggestion + +INFO (N) + 1. [location] info description + Note: context + +SUMMARY + Schema: Valid | N issues + Lineage: Intact | N orphaned + dbt: Passes | N failures + PostGIS: Valid | N issues | Not applicable + +VERDICT: PASS | FAIL (N blocking issues) +``` + +## When to Use + +### Before Sprint Planning +Audit data layer health to identify tech debt and inform sprint scope. +``` +/data-review ./dbt/ +``` + +### During Code Review +Get detailed data integrity findings alongside code review comments. +``` +/data-review ./dbt/models/staging/stg_new_source.sql +``` + +### After Migrations +Verify schema changes didn't break anything downstream. +``` +/data-review ./migrations/ +``` + +### Periodic Health Checks +Regular data infrastructure audits for proactive maintenance. +``` +/data-review ./data_pipeline/ +``` + +### New Project Onboarding +Understand the current state of data architecture. +``` +/data-review . +``` + +## Severity Levels + +| Level | Meaning | Gate Impact | +|-------|---------|-------------| +| **FAIL** | Blocking issues that will cause runtime errors | Would block `/data-gate` | +| **WARN** | Quality issues that should be addressed | Does not block gate | +| **INFO** | Suggestions for improvement | Does not block gate | + +## Differences from /data-gate + +`/data-review` gives you the full picture. `/data-gate` gives the orchestrator a yes/no. 
+ +| Aspect | /data-gate | /data-review | +|--------|------------|--------------| +| Output | Binary PASS/FAIL | Detailed report | +| Severity | FAIL only | FAIL + WARN + INFO | +| Purpose | Automation | Human review | +| Verbosity | Minimal | Comprehensive | +| Speed | Fast (skips INFO) | Thorough | + +Use `/data-review` when you want to understand. +Use `/data-gate` when you want to automate. + +## Requirements + +- data-platform MCP server must be running +- For dbt checks: dbt project must be configured (auto-detected via `dbt_project.yml`) +- For PostgreSQL checks: connection configured in `~/.config/claude/postgres.env` + +**Graceful degradation:** If database or dbt unavailable, applicable checks are skipped with a note in the report rather than failing entirely. + +## Skills Used + +- `skills/data-integrity-audit.md` - Audit rules and patterns +- `skills/mcp-tools-reference.md` - MCP tool reference + +## Related Commands + +- `/data-gate` - Binary pass/fail for automation +- `/lineage` - Visualize dbt model dependencies +- `/schema` - Explore database schema diff --git a/plugins/data-platform/skills/data-integrity-audit.md b/plugins/data-platform/skills/data-integrity-audit.md new file mode 100644 index 0000000..7c3394e --- /dev/null +++ b/plugins/data-platform/skills/data-integrity-audit.md @@ -0,0 +1,307 @@ +--- +name: data-integrity-audit +description: Rules and patterns for auditing data integrity, schema validity, and dbt compliance +--- + +# Data Integrity Audit + +## Purpose + +Defines what "data valid" means for the data-platform domain. This skill is loaded by the `data-advisor` agent for both review and gate modes during sprint execution and standalone audits. 
+ +--- + +## What to Check + +| Check Category | What It Validates | MCP Tools Used | +|----------------|-------------------|----------------| +| **Schema Validity** | Tables exist, columns have correct types, constraints present, no orphaned columns | `pg_tables`, `pg_columns`, `pg_schemas` | +| **dbt Project Health** | Project parses without errors, models compile, tests defined for critical models | `dbt_parse`, `dbt_compile`, `dbt_test`, `dbt_ls` | +| **Lineage Integrity** | No orphaned models (referenced but missing), no circular dependencies, upstream sources exist | `dbt_lineage`, `dbt_ls` | +| **Data Type Consistency** | DataFrame dtypes match expected schema, no silent type coercion, date formats consistent | `describe`, `head`, `pg_columns` | +| **PostGIS Compliance** | Spatial tables have correct SRID, geometry types match expectations, extent is reasonable | `st_tables`, `st_geometry_type`, `st_srid`, `st_extent` | +| **Query Safety** | SELECT queries used for reads (not raw SQL for mutations), parameterized patterns | Code review - manual pattern check | + +--- + +## Common Violations + +### FAIL-Level Violations (Block Gate) + +| Violation | Detection Method | Example | +|-----------|-----------------|---------| +| dbt parse failure | `dbt_parse` returns error | Project YAML invalid, missing ref targets | +| dbt compilation error | `dbt_compile` fails | SQL syntax error, undefined column reference | +| Missing table/column | `pg_tables`, `pg_columns` lookup | Code references `census_raw` but table doesn't exist | +| Type mismatch | Compare `pg_columns` vs dbt schema | Column is `varchar` in DB but model expects `integer` | +| Broken lineage | `dbt_lineage` shows orphaned refs | Model references `stg_old_format` which doesn't exist | +| PostGIS SRID mismatch | `st_srid` returns unexpected value | Geometry column has SRID 0 instead of 4326 | +| Unreasonable spatial extent | `st_extent` returns global bbox | Toronto data shows coordinates in China | + 
+### WARN-Level Violations (Report, Don't Block) + +| Violation | Detection Method | Example | +|-----------|-----------------|---------| +| Missing dbt tests | `dbt_ls` shows model without test | `dim_customers` has no `unique` test on `customer_id` | +| Undocumented columns | dbt schema.yml missing descriptions | Model columns have no documentation | +| Schema drift | `pg_columns` vs dbt schema.yml | Column exists in DB but not in dbt YAML | +| Hardcoded SQL | Scan Python for string concatenation | `f"SELECT * FROM {table}"` without parameterization | +| Orphaned model | `dbt_lineage` shows no downstream | `stg_legacy` has no consumers and no exposure | + +### INFO-Level Violations (Suggestions Only) + +| Violation | Detection Method | Example | +|-----------|-----------------|---------| +| Missing indexes | Query pattern suggests need | Frequent filter on non-indexed column | +| Documentation gaps | dbt docs incomplete | Missing model description | +| Unused models | `dbt_ls` vs actual queries | Model exists but never selected | +| Optimization opportunity | `describe` shows data patterns | Column has low cardinality, could be enum | + +--- + +## Severity Classification + +| Severity | When to Apply | Gate Behavior | +|----------|--------------|---------------| +| **FAIL** | Broken lineage, models that won't compile, missing tables/columns, data type mismatches that cause runtime errors, invalid SRID | Blocks issue completion | +| **WARN** | Missing dbt tests, undocumented columns, schema drift, hardcoded SQL, orphaned models | Does NOT block gate, included in review report | +| **INFO** | Optimization opportunities, documentation gaps, unused models | Review report only | + +### Severity Decision Tree + +``` +Is the dbt project broken (parse/compile fails)? + YES -> FAIL + NO -> Does code reference non-existent tables/columns? + YES -> FAIL + NO -> Would this cause a runtime error? + YES -> FAIL + NO -> Does it violate data quality standards? 
+ YES -> WARN + NO -> Is it an optimization/documentation suggestion? + YES -> INFO + NO -> Not a violation +``` + +--- + +## Scanning Strategy + +### For dbt Projects + +1. **Parse validation** (ALWAYS FIRST) + ``` + dbt_parse → if fails, immediate FAIL (project is broken) + ``` + +2. **Catalog resources** + ``` + dbt_ls → list all models, tests, sources, exposures + ``` + +3. **Lineage check** + ``` + dbt_lineage on changed models → check upstream/downstream integrity + ``` + +4. **Compilation check** + ``` + dbt_compile on changed models → verify SQL renders correctly + ``` + +5. **Test execution** + ``` + dbt_test --select → verify tests pass + ``` + +6. **Test coverage audit** + ``` + Cross-reference dbt_ls tests against model list → flag models without tests (WARN) + ``` + +### For PostgreSQL Schema Changes + +1. **Table verification** + ``` + pg_tables → verify expected tables exist + ``` + +2. **Column validation** + ``` + pg_columns on affected tables → verify types match expectations + ``` + +3. **Schema comparison** + ``` + Compare pg_columns output against dbt schema.yml → flag drift + ``` + +### For PostGIS/Spatial Data + +1. **Spatial table scan** + ``` + st_tables → list tables with geometry columns + ``` + +2. **SRID validation** + ``` + st_srid → verify SRID is correct for expected region + Expected: 4326 (WGS84) for GPS data, local projections for regional data + ``` + +3. **Geometry type check** + ``` + st_geometry_type → verify expected types (Point, Polygon, etc.) + ``` + +4. **Extent sanity check** + ``` + st_extent → verify bounding box is reasonable for expected region + Toronto data should be ~(-79.6 to -79.1, 43.6 to 43.9) + ``` + +### For DataFrame/pandas Operations + +1. **Data quality check** + ``` + describe → check for unexpected nulls, type issues, outliers + ``` + +2. **Structure verification** + ``` + head → verify data structure matches expectations + ``` + +3. 
**Memory management** + ``` + list_data → verify no stale DataFrames from previous failed runs + ``` + +### For Python Code (Manual Scan) + +1. **SQL injection patterns** + - Scan for f-strings with table/column names + - Check for string concatenation in queries + - Look for `.format()` calls with SQL + +2. **Mutation safety** + - `pg_execute` usage should be intentional, not accidental + - Verify DELETE/UPDATE have WHERE clauses + +3. **Credential exposure** + - No hardcoded connection strings + - No credentials in code (check for `.env` usage) + +--- + +## Report Templates + +### Gate Mode (Compact) + +``` +DATA GATE: PASS +No blocking data integrity violations found. +``` + +or + +``` +DATA GATE: FAIL + +Blocking Issues (N): +1. - + Fix: + +2. - + Fix: + +Run /data-review for full audit report. +``` + +### Review Mode (Detailed) + +``` ++----------------------------------------------------------------------+ +| DATA-PLATFORM - Data Integrity Audit | +| [Target Path] | ++----------------------------------------------------------------------+ + +Target: +Scope: N files scanned, N models checked, N tables verified + +FINDINGS + +FAIL (N) + 1. [location] violation description + Fix: actionable fix + + 2. [location] violation description + Fix: actionable fix + +WARN (N) + 1. [location] warning description + Suggestion: improvement suggestion + + 2. [location] warning description + Suggestion: improvement suggestion + +INFO (N) + 1. 
[location] info description + Note: context + +SUMMARY + Schema: Valid | N issues + Lineage: Intact | N orphaned + dbt: Passes | N failures + PostGIS: Valid | N issues | Not applicable + +VERDICT: PASS | FAIL (N blocking issues) +``` + +--- + +## Skip Patterns + +Do not flag violations in: + +- `**/tests/**` - Test files may have intentional violations +- `**/__pycache__/**` - Compiled files +- `**/fixtures/**` - Test fixtures +- `**/.scratch/**` - Temporary working files +- Files with `# noqa: data-audit` comment +- Migration files marked as historical + +--- + +## Error Handling + +| Scenario | Behavior | +|----------|----------| +| Database not reachable (`pg_connect` fails) | WARN, skip PostgreSQL checks, continue with file-based | +| dbt not configured (no `dbt_project.yml`) | Skip dbt checks entirely, not an error | +| No PostGIS tables found | Skip PostGIS checks, not an error | +| MCP tool call fails | Report as WARN with tool name, continue with remaining checks | +| No data files in scanned path | Report "No data artifacts found" - PASS (nothing to fail) | +| Empty directory | Report "No files found in path" - PASS | + +--- + +## Integration Notes + +### projman Orchestrator + +When called as a domain gate: +1. Orchestrator detects `Domain/Data` label on issue +2. Orchestrator identifies changed files +3. Orchestrator invokes `/data-gate <path>` +4. Agent runs gate mode scan +5. Returns PASS/FAIL to orchestrator +6. Orchestrator decides whether to complete issue + +### Standalone Usage + +For manual audits: +1. User runs `/data-review <path>` +2. Agent runs full review mode scan +3. Returns detailed report with all severity levels +4. User decides on actions