feat: add data-platform plugin (v4.0.0)

Add new data-platform plugin for data engineering workflows with: MCP Server (32 tools): - pandas operations (14 tools): read_csv, read_parquet, read_json, to_csv, to_parquet, describe, head, tail, filter, select, groupby, join, list_data, drop_data - PostgreSQL/PostGIS (10 tools): pg_connect, pg_query, pg_execute, pg_tables, pg_columns, pg_schemas, st_tables, st_geometry_type, st_srid, st_extent - dbt integration (8 tools): dbt_parse, dbt_run, dbt_test, dbt_build, dbt_compile, dbt_ls, dbt_docs_generate, dbt_lineage Plugin Features: - Arrow IPC data_ref system for DataFrame persistence across tool calls - Pre-execution validation for dbt with `dbt parse` - SessionStart hook for PostgreSQL connectivity check (non-blocking) - Hybrid configuration (system ~/.config/claude/postgres.env + project .env) - Memory management with 100k row limit and chunking support Commands: /initial-setup, /ingest, /profile, /schema, /explain, /lineage, /run Agents: data-ingestion, data-analysis Test suite: 71 tests covering config, data store, pandas, postgres, dbt tools Addresses data workflow issues from personal-portfolio project: - Lost data after multiple interactions (solved by Arrow IPC data_ref) - dbt 1.9+ syntax deprecation (solved by pre-execution validation) - Ungraceful PostgreSQL error handling (solved by SessionStart hook) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-25 14:24:03 -05:00
parent 6a267d074b
commit 89f0354ccc
39 changed files with 5413 additions and 6 deletions
--- a/mcp-servers/data-platform/mcp_server/dbt_tools.py
+++ b/mcp-servers/data-platform/mcp_server/dbt_tools.py
@@ -0,0 +1,387 @@
+"""
+dbt MCP Tools.
+
+Provides dbt CLI wrapper with pre-execution validation.
+"""
+import subprocess
+import json
+import logging
+import os
+from pathlib import Path
+from typing import Dict, List, Optional, Any
+
+from .config import load_config
+
+logger = logging.getLogger(__name__)
+
+
+class DbtTools:
+    """dbt CLI wrapper tools with pre-validation"""
+
+    def __init__(self):
+        self.config = load_config()
+        self.project_dir = self.config.get('dbt_project_dir')
+        self.profiles_dir = self.config.get('dbt_profiles_dir')
+
+    def _get_dbt_command(self, cmd: List[str]) -> List[str]:
+        """Build dbt command with project and profiles directories"""
+        base = ['dbt']
+        if self.project_dir:
+            base.extend(['--project-dir', self.project_dir])
+        if self.profiles_dir:
+            base.extend(['--profiles-dir', self.profiles_dir])
+        base.extend(cmd)
+        return base
+
+    def _run_dbt(
+        self,
+        cmd: List[str],
+        timeout: int = 300,
+        capture_json: bool = False
+    ) -> Dict:
+        """
+        Run dbt command and return result.
+
+        Args:
+            cmd: dbt subcommand and arguments
+            timeout: Command timeout in seconds
+            capture_json: If True, parse JSON output
+
+        Returns:
+            Dict with command result
+        """
+        if not self.project_dir:
+            return {
+                'error': 'dbt project not found',
+                'suggestion': 'Set DBT_PROJECT_DIR in project .env or ensure dbt_project.yml exists'
+            }
+
+        full_cmd = self._get_dbt_command(cmd)
+        logger.info(f"Running: {' '.join(full_cmd)}")
+
+        try:
+            env = os.environ.copy()
+            # Disable dbt analytics/tracking
+            env['DBT_SEND_ANONYMOUS_USAGE_STATS'] = 'false'
+
+            result = subprocess.run(
+                full_cmd,
+                capture_output=True,
+                text=True,
+                timeout=timeout,
+                cwd=self.project_dir,
+                env=env
+            )
+
+            output = {
+                'success': result.returncode == 0,
+                'command': ' '.join(cmd),
+                'stdout': result.stdout,
+                'stderr': result.stderr if result.returncode != 0 else None
+            }
+
+            if capture_json and result.returncode == 0:
+                try:
+                    output['data'] = json.loads(result.stdout)
+                except json.JSONDecodeError:
+                    pass
+
+            return output
+
+        except subprocess.TimeoutExpired:
+            return {
+                'error': f'Command timed out after {timeout}s',
+                'command': ' '.join(cmd)
+            }
+        except FileNotFoundError:
+            return {
+                'error': 'dbt not found in PATH',
+                'suggestion': 'Install dbt: pip install dbt-core dbt-postgres'
+            }
+        except Exception as e:
+            logger.error(f"dbt command failed: {e}")
+            return {'error': str(e)}
+
+    async def dbt_parse(self) -> Dict:
+        """
+        Validate dbt project without executing (pre-flight check).
+
+        Returns:
+            Dict with validation result and any errors
+        """
+        result = self._run_dbt(['parse'])
+
+        # Check if _run_dbt returned an error (e.g., project not found, timeout, dbt not installed)
+        if 'error' in result:
+            return result
+
+        if not result.get('success'):
+            # Extract useful error info from stderr
+            stderr = result.get('stderr', '') or result.get('stdout', '')
+            errors = []
+
+            # Look for common dbt 1.9+ deprecation warnings
+            if 'deprecated' in stderr.lower():
+                errors.append({
+                    'type': 'deprecation',
+                    'message': 'Deprecated syntax found - check dbt 1.9+ migration guide'
+                })
+
+            # Look for compilation errors
+            if 'compilation error' in stderr.lower():
+                errors.append({
+                    'type': 'compilation',
+                    'message': 'SQL compilation error - check model syntax'
+                })
+
+            return {
+                'valid': False,
+                'errors': errors,
+                'details': stderr[:2000] if stderr else None,
+                'suggestion': 'Fix issues before running dbt models'
+            }
+
+        return {
+            'valid': True,
+            'message': 'dbt project validation passed'
+        }
+
+    async def dbt_run(
+        self,
+        select: Optional[str] = None,
+        exclude: Optional[str] = None,
+        full_refresh: bool = False
+    ) -> Dict:
+        """
+        Run dbt models with pre-validation.
+
+        Args:
+            select: Model selection (e.g., "model_name", "+model_name", "tag:daily")
+            exclude: Models to exclude
+            full_refresh: If True, rebuild incremental models
+
+        Returns:
+            Dict with run result
+        """
+        # ALWAYS validate first
+        parse_result = await self.dbt_parse()
+        if not parse_result.get('valid'):
+            return {
+                'error': 'Pre-validation failed',
+                **parse_result
+            }
+
+        cmd = ['run']
+        if select:
+            cmd.extend(['--select', select])
+        if exclude:
+            cmd.extend(['--exclude', exclude])
+        if full_refresh:
+            cmd.append('--full-refresh')
+
+        return self._run_dbt(cmd)
+
+    async def dbt_test(
+        self,
+        select: Optional[str] = None,
+        exclude: Optional[str] = None
+    ) -> Dict:
+        """
+        Run dbt tests.
+
+        Args:
+            select: Test selection
+            exclude: Tests to exclude
+
+        Returns:
+            Dict with test results
+        """
+        cmd = ['test']
+        if select:
+            cmd.extend(['--select', select])
+        if exclude:
+            cmd.extend(['--exclude', exclude])
+
+        return self._run_dbt(cmd)
+
+    async def dbt_build(
+        self,
+        select: Optional[str] = None,
+        exclude: Optional[str] = None,
+        full_refresh: bool = False
+    ) -> Dict:
+        """
+        Run dbt build (run + test) with pre-validation.
+
+        Args:
+            select: Model/test selection
+            exclude: Resources to exclude
+            full_refresh: If True, rebuild incremental models
+
+        Returns:
+            Dict with build result
+        """
+        # ALWAYS validate first
+        parse_result = await self.dbt_parse()
+        if not parse_result.get('valid'):
+            return {
+                'error': 'Pre-validation failed',
+                **parse_result
+            }
+
+        cmd = ['build']
+        if select:
+            cmd.extend(['--select', select])
+        if exclude:
+            cmd.extend(['--exclude', exclude])
+        if full_refresh:
+            cmd.append('--full-refresh')
+
+        return self._run_dbt(cmd)
+
+    async def dbt_compile(
+        self,
+        select: Optional[str] = None
+    ) -> Dict:
+        """
+        Compile dbt models to SQL without executing.
+
+        Args:
+            select: Model selection
+
+        Returns:
+            Dict with compiled SQL info
+        """
+        cmd = ['compile']
+        if select:
+            cmd.extend(['--select', select])
+
+        return self._run_dbt(cmd)
+
+    async def dbt_ls(
+        self,
+        select: Optional[str] = None,
+        resource_type: Optional[str] = None,
+        output: str = 'name'
+    ) -> Dict:
+        """
+        List dbt resources.
+
+        Args:
+            select: Resource selection
+            resource_type: Filter by type (model, test, seed, snapshot, source)
+            output: Output format ('name', 'path', 'json')
+
+        Returns:
+            Dict with list of resources
+        """
+        cmd = ['ls', '--output', output]
+        if select:
+            cmd.extend(['--select', select])
+        if resource_type:
+            cmd.extend(['--resource-type', resource_type])
+
+        result = self._run_dbt(cmd)
+
+        if result.get('success') and result.get('stdout'):
+            lines = [l.strip() for l in result['stdout'].split('\n') if l.strip()]
+            result['resources'] = lines
+            result['count'] = len(lines)
+
+        return result
+
+    async def dbt_docs_generate(self) -> Dict:
+        """
+        Generate dbt documentation.
+
+        Returns:
+            Dict with generation result
+        """
+        result = self._run_dbt(['docs', 'generate'])
+
+        if result.get('success') and self.project_dir:
+            # Check for generated catalog
+            catalog_path = Path(self.project_dir) / 'target' / 'catalog.json'
+            manifest_path = Path(self.project_dir) / 'target' / 'manifest.json'
+            result['catalog_generated'] = catalog_path.exists()
+            result['manifest_generated'] = manifest_path.exists()
+
+        return result
+
+    async def dbt_lineage(self, model: str) -> Dict:
+        """
+        Get model dependencies and lineage.
+
+        Args:
+            model: Model name to analyze
+
+        Returns:
+            Dict with upstream and downstream dependencies
+        """
+        if not self.project_dir:
+            return {'error': 'dbt project not found'}
+
+        manifest_path = Path(self.project_dir) / 'target' / 'manifest.json'
+
+        # Generate manifest if not exists
+        if not manifest_path.exists():
+            compile_result = await self.dbt_compile(select=model)
+            if not compile_result.get('success'):
+                return {
+                    'error': 'Failed to compile manifest',
+                    'details': compile_result
+                }
+
+        if not manifest_path.exists():
+            return {
+                'error': 'Manifest not found',
+                'suggestion': 'Run dbt compile first'
+            }
+
+        try:
+            with open(manifest_path) as f:
+                manifest = json.load(f)
+
+            # Find the model node
+            model_key = None
+            for key in manifest.get('nodes', {}):
+                if key.endswith(f'.{model}') or manifest['nodes'][key].get('name') == model:
+                    model_key = key
+                    break
+
+            if not model_key:
+                return {
+                    'error': f'Model not found: {model}',
+                    'available_models': [
+                        n.get('name') for n in manifest.get('nodes', {}).values()
+                        if n.get('resource_type') == 'model'
+                    ][:20]
+                }
+
+            node = manifest['nodes'][model_key]
+
+            # Get upstream (depends_on)
+            upstream = node.get('depends_on', {}).get('nodes', [])
+
+            # Get downstream (find nodes that depend on this one)
+            downstream = []
+            for key, other_node in manifest.get('nodes', {}).items():
+                deps = other_node.get('depends_on', {}).get('nodes', [])
+                if model_key in deps:
+                    downstream.append(key)
+
+            return {
+                'model': model,
+                'unique_id': model_key,
+                'materialization': node.get('config', {}).get('materialized'),
+                'schema': node.get('schema'),
+                'database': node.get('database'),
+                'upstream': upstream,
+                'downstream': downstream,
+                'description': node.get('description'),
+                'tags': node.get('tags', [])
+            }
+
+        except Exception as e:
+            logger.error(f"dbt_lineage failed: {e}")
+            return {'error': str(e)}