feat: add data-platform plugin (v4.0.0)

Add new data-platform plugin for data engineering workflows with:

MCP Server (32 tools):
- pandas operations (14 tools): read_csv, read_parquet, read_json,
  to_csv, to_parquet, describe, head, tail, filter, select, groupby,
  join, list_data, drop_data
- PostgreSQL/PostGIS (10 tools): pg_connect, pg_query, pg_execute,
  pg_tables, pg_columns, pg_schemas, st_tables, st_geometry_type,
  st_srid, st_extent
- dbt integration (8 tools): dbt_parse, dbt_run, dbt_test, dbt_build,
  dbt_compile, dbt_ls, dbt_docs_generate, dbt_lineage

Plugin Features:
- Arrow IPC data_ref system for DataFrame persistence across tool calls
- Pre-execution validation for dbt with `dbt parse`
- SessionStart hook for PostgreSQL connectivity check (non-blocking)
- Hybrid configuration (system ~/.config/claude/postgres.env + project .env)
- Memory management with 100k row limit and chunking support

Commands: /initial-setup, /ingest, /profile, /schema, /explain, /lineage, /run
Agents: data-ingestion, data-analysis

Test suite: 71 tests covering config, data store, pandas, postgres, dbt tools

Addresses data workflow issues from personal-portfolio project:
- Lost data after multiple interactions (solved by Arrow IPC data_ref)
- dbt 1.9+ syntax deprecation (solved by pre-execution validation)
- Ungraceful PostgreSQL error handling (solved by SessionStart hook)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-25 14:24:03 -05:00
parent 6a267d074b
commit 89f0354ccc
39 changed files with 5413 additions and 6 deletions

View File

@@ -0,0 +1,195 @@
"""
Configuration loader for Data Platform MCP Server.
Implements hybrid configuration system:
- System-level: ~/.config/claude/postgres.env (credentials)
- Project-level: .env (dbt project paths, overrides)
- Auto-detection: dbt_project.yml discovery
"""
import logging
import os
from pathlib import Path
from typing import Any, Dict, Optional

from dotenv import load_dotenv
# NOTE(review): basicConfig at import time configures the root logger for the
# whole process; consider moving this into the server entry point so importing
# this module has no side effects.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class DataPlatformConfig:
    """Hybrid configuration loader for data platform tools.

    Configuration precedence (lowest to highest):
      1. System-level: ~/.config/claude/postgres.env (PostgreSQL credentials)
      2. Project-level: <project>/.env (loaded with override=True)
      3. Auto-detection: dbt_project.yml discovery when DBT_PROJECT_DIR unset
    """

    # Filesystem markers that identify a directory as a user project root.
    _PROJECT_MARKERS = ('.git', '.env', 'dbt_project.yml')
    # Default cap on rows loaded into memory by the data tools.
    _DEFAULT_MAX_ROWS = 100_000

    def __init__(self):
        # Populated by load(); None means "not configured / not detected".
        self.postgres_url: Optional[str] = None
        self.dbt_project_dir: Optional[str] = None
        self.dbt_profiles_dir: Optional[str] = None
        self.max_rows: int = self._DEFAULT_MAX_ROWS

    def load(self) -> Dict[str, object]:
        """
        Load configuration from system and project levels.

        Returns:
            Dict containing postgres_url, dbt_project_dir, dbt_profiles_dir,
            max_rows, plus postgres_available / dbt_available booleans.

        Note:
            PostgreSQL credentials are optional - server can run in pandas-only mode.
        """
        # Load system config (PostgreSQL credentials)
        system_config = Path.home() / '.config' / 'claude' / 'postgres.env'
        if system_config.exists():
            load_dotenv(system_config)
            logger.info(f"Loaded system configuration from {system_config}")
        else:
            logger.info(
                f"System config not found: {system_config} - "
                "PostgreSQL tools will be unavailable"
            )

        # Find project directory
        project_dir = self._find_project_directory()

        # Load project config; override=True lets project values win over system ones.
        if project_dir:
            project_config = project_dir / '.env'
            if project_config.exists():
                load_dotenv(project_config, override=True)
                logger.info(f"Loaded project configuration from {project_config}")

        # Extract values from the (now merged) environment.
        self.postgres_url = os.getenv('POSTGRES_URL')
        self.dbt_project_dir = os.getenv('DBT_PROJECT_DIR')
        self.dbt_profiles_dir = os.getenv('DBT_PROFILES_DIR')
        self.max_rows = self._read_max_rows()

        # Auto-detect dbt project if not specified explicitly.
        if not self.dbt_project_dir and project_dir:
            self.dbt_project_dir = self._find_dbt_project(project_dir)
            if self.dbt_project_dir:
                logger.info(f"Auto-detected dbt project: {self.dbt_project_dir}")

        # Default dbt profiles dir to ~/.dbt when it exists.
        if not self.dbt_profiles_dir:
            default_profiles = Path.home() / '.dbt'
            if default_profiles.exists():
                self.dbt_profiles_dir = str(default_profiles)

        return {
            'postgres_url': self.postgres_url,
            'dbt_project_dir': self.dbt_project_dir,
            'dbt_profiles_dir': self.dbt_profiles_dir,
            'max_rows': self.max_rows,
            'postgres_available': self.postgres_url is not None,
            'dbt_available': self.dbt_project_dir is not None
        }

    def _read_max_rows(self) -> int:
        """Parse DATA_PLATFORM_MAX_ROWS, falling back to the default on bad input.

        The original code called int() unguarded, so a malformed env value
        (e.g. "100k") crashed the whole config load.
        """
        raw = os.getenv('DATA_PLATFORM_MAX_ROWS', str(self._DEFAULT_MAX_ROWS))
        try:
            return int(raw)
        except ValueError:
            logger.warning(
                f"Invalid DATA_PLATFORM_MAX_ROWS value {raw!r}; "
                f"using default {self._DEFAULT_MAX_ROWS}"
            )
            return self._DEFAULT_MAX_ROWS

    @staticmethod
    def _is_project_root(path: Path) -> bool:
        """Return True if *path* contains any recognized project marker."""
        return any(
            (path / marker).exists()
            for marker in DataPlatformConfig._PROJECT_MARKERS
        )

    def _find_project_directory(self) -> Optional[Path]:
        """
        Find the user's project directory.

        Strategies, in order: CLAUDE_PROJECT_DIR env var (trusted as-is),
        PWD (must look like a project root), then the current working
        directory (must look like a project root).

        Returns:
            Path to project directory, or None if not found
        """
        # Strategy 1: CLAUDE_PROJECT_DIR is authoritative - no marker check.
        project_dir = os.getenv('CLAUDE_PROJECT_DIR')
        if project_dir:
            path = Path(project_dir)
            if path.exists():
                logger.info(f"Found project directory from CLAUDE_PROJECT_DIR: {path}")
                return path

        # Strategy 2: PWD, only if it looks like a project root.
        pwd = os.getenv('PWD')
        if pwd:
            path = Path(pwd)
            if path.exists() and self._is_project_root(path):
                logger.info(f"Found project directory from PWD: {path}")
                return path

        # Strategy 3: current working directory, same marker check.
        cwd = Path.cwd()
        if self._is_project_root(cwd):
            logger.info(f"Found project directory from cwd: {cwd}")
            return cwd

        logger.debug("Could not determine project directory")
        return None

    def _find_dbt_project(self, start_dir: Path) -> Optional[str]:
        """
        Find dbt_project.yml in the project or its subdirectories.

        Search order: project root, then conventional subdirectory names,
        then any non-hidden directory one level deep.

        Args:
            start_dir: Directory to start searching from

        Returns:
            Path to dbt project directory, or None if not found
        """
        # Check root
        if (start_dir / 'dbt_project.yml').exists():
            return str(start_dir)

        # Check common subdirectories first so conventional layouts win.
        for subdir in ['dbt', 'transform', 'analytics', 'models']:
            candidate = start_dir / subdir
            if (candidate / 'dbt_project.yml').exists():
                return str(candidate)

        # Fall back: search one level deep, skipping hidden directories.
        for item in start_dir.iterdir():
            if item.is_dir() and not item.name.startswith('.'):
                if (item / 'dbt_project.yml').exists():
                    return str(item)

        return None
def load_config() -> Dict[str, Optional[str]]:
    """
    Convenience function to load configuration.

    Returns:
        Configuration dictionary
    """
    return DataPlatformConfig().load()
def check_postgres_connection() -> Dict[str, Any]:
    """
    Check PostgreSQL connection status for SessionStart hook.

    Best-effort by design: every failure mode (missing config, missing
    driver, network, auth) is reported in the result rather than raised,
    so the hook never blocks session startup.

    Returns:
        Dict with keys:
            connected (bool): True if the test query succeeded.
            message (str): Human-readable status.
            version (str): Server version (present only when connected).
    """
    import asyncio

    config = load_config()
    if not config.get('postgres_url'):
        return {
            'connected': False,
            'message': 'PostgreSQL not configured (POSTGRES_URL not set)'
        }

    async def test_connection():
        try:
            # Imported lazily so the server can run without asyncpg installed.
            import asyncpg
            conn = await asyncpg.connect(config['postgres_url'], timeout=5)
            try:
                # finally-close fixes a leak: the original never closed the
                # connection when fetchval() raised after a successful connect.
                version = await conn.fetchval('SELECT version()')
            finally:
                await conn.close()
            return {
                'connected': True,
                'message': 'Connected to PostgreSQL',
                'version': version.split(',')[0] if version else 'Unknown'
            }
        except Exception as e:
            # Broad catch is deliberate: this hook must never raise.
            return {
                'connected': False,
                'message': f'PostgreSQL connection failed: {str(e)}'
            }

    return asyncio.run(test_connection())