feat: add data-platform plugin (v4.0.0)
Add new data-platform plugin for data engineering workflows with: MCP Server (32 tools): - pandas operations (14 tools): read_csv, read_parquet, read_json, to_csv, to_parquet, describe, head, tail, filter, select, groupby, join, list_data, drop_data - PostgreSQL/PostGIS (10 tools): pg_connect, pg_query, pg_execute, pg_tables, pg_columns, pg_schemas, st_tables, st_geometry_type, st_srid, st_extent - dbt integration (8 tools): dbt_parse, dbt_run, dbt_test, dbt_build, dbt_compile, dbt_ls, dbt_docs_generate, dbt_lineage Plugin Features: - Arrow IPC data_ref system for DataFrame persistence across tool calls - Pre-execution validation for dbt with `dbt parse` - SessionStart hook for PostgreSQL connectivity check (non-blocking) - Hybrid configuration (system ~/.config/claude/postgres.env + project .env) - Memory management with 100k row limit and chunking support Commands: /initial-setup, /ingest, /profile, /schema, /explain, /lineage, /run Agents: data-ingestion, data-analysis Test suite: 71 tests covering config, data store, pandas, postgres, dbt tools Addresses data workflow issues from personal-portfolio project: - Lost data after multiple interactions (solved by Arrow IPC data_ref) - dbt 1.9+ syntax deprecation (solved by pre-execution validation) - Ungraceful PostgreSQL error handling (solved by SessionStart hook) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
195
mcp-servers/data-platform/mcp_server/config.py
Normal file
195
mcp-servers/data-platform/mcp_server/config.py
Normal file
@@ -0,0 +1,195 @@
|
||||
"""
|
||||
Configuration loader for Data Platform MCP Server.
|
||||
|
||||
Implements hybrid configuration system:
|
||||
- System-level: ~/.config/claude/postgres.env (credentials)
|
||||
- Project-level: .env (dbt project paths, overrides)
|
||||
- Auto-detection: dbt_project.yml discovery
|
||||
"""
|
||||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
import logging
|
||||
from typing import Dict, Optional
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DataPlatformConfig:
|
||||
"""Hybrid configuration loader for data platform tools"""
|
||||
|
||||
def __init__(self):
|
||||
self.postgres_url: Optional[str] = None
|
||||
self.dbt_project_dir: Optional[str] = None
|
||||
self.dbt_profiles_dir: Optional[str] = None
|
||||
self.max_rows: int = 100_000
|
||||
|
||||
def load(self) -> Dict[str, Optional[str]]:
|
||||
"""
|
||||
Load configuration from system and project levels.
|
||||
|
||||
Returns:
|
||||
Dict containing postgres_url, dbt_project_dir, dbt_profiles_dir, max_rows
|
||||
|
||||
Note:
|
||||
PostgreSQL credentials are optional - server can run in pandas-only mode.
|
||||
"""
|
||||
# Load system config (PostgreSQL credentials)
|
||||
system_config = Path.home() / '.config' / 'claude' / 'postgres.env'
|
||||
if system_config.exists():
|
||||
load_dotenv(system_config)
|
||||
logger.info(f"Loaded system configuration from {system_config}")
|
||||
else:
|
||||
logger.info(
|
||||
f"System config not found: {system_config} - "
|
||||
"PostgreSQL tools will be unavailable"
|
||||
)
|
||||
|
||||
# Find project directory
|
||||
project_dir = self._find_project_directory()
|
||||
|
||||
# Load project config (overrides system)
|
||||
if project_dir:
|
||||
project_config = project_dir / '.env'
|
||||
if project_config.exists():
|
||||
load_dotenv(project_config, override=True)
|
||||
logger.info(f"Loaded project configuration from {project_config}")
|
||||
|
||||
# Extract values
|
||||
self.postgres_url = os.getenv('POSTGRES_URL')
|
||||
self.dbt_project_dir = os.getenv('DBT_PROJECT_DIR')
|
||||
self.dbt_profiles_dir = os.getenv('DBT_PROFILES_DIR')
|
||||
self.max_rows = int(os.getenv('DATA_PLATFORM_MAX_ROWS', '100000'))
|
||||
|
||||
# Auto-detect dbt project if not specified
|
||||
if not self.dbt_project_dir and project_dir:
|
||||
self.dbt_project_dir = self._find_dbt_project(project_dir)
|
||||
if self.dbt_project_dir:
|
||||
logger.info(f"Auto-detected dbt project: {self.dbt_project_dir}")
|
||||
|
||||
# Default dbt profiles dir to ~/.dbt
|
||||
if not self.dbt_profiles_dir:
|
||||
default_profiles = Path.home() / '.dbt'
|
||||
if default_profiles.exists():
|
||||
self.dbt_profiles_dir = str(default_profiles)
|
||||
|
||||
return {
|
||||
'postgres_url': self.postgres_url,
|
||||
'dbt_project_dir': self.dbt_project_dir,
|
||||
'dbt_profiles_dir': self.dbt_profiles_dir,
|
||||
'max_rows': self.max_rows,
|
||||
'postgres_available': self.postgres_url is not None,
|
||||
'dbt_available': self.dbt_project_dir is not None
|
||||
}
|
||||
|
||||
def _find_project_directory(self) -> Optional[Path]:
|
||||
"""
|
||||
Find the user's project directory.
|
||||
|
||||
Returns:
|
||||
Path to project directory, or None if not found
|
||||
"""
|
||||
# Strategy 1: Check CLAUDE_PROJECT_DIR environment variable
|
||||
project_dir = os.getenv('CLAUDE_PROJECT_DIR')
|
||||
if project_dir:
|
||||
path = Path(project_dir)
|
||||
if path.exists():
|
||||
logger.info(f"Found project directory from CLAUDE_PROJECT_DIR: {path}")
|
||||
return path
|
||||
|
||||
# Strategy 2: Check PWD
|
||||
pwd = os.getenv('PWD')
|
||||
if pwd:
|
||||
path = Path(pwd)
|
||||
if path.exists() and (
|
||||
(path / '.git').exists() or
|
||||
(path / '.env').exists() or
|
||||
(path / 'dbt_project.yml').exists()
|
||||
):
|
||||
logger.info(f"Found project directory from PWD: {path}")
|
||||
return path
|
||||
|
||||
# Strategy 3: Check current working directory
|
||||
cwd = Path.cwd()
|
||||
if (cwd / '.git').exists() or (cwd / '.env').exists() or (cwd / 'dbt_project.yml').exists():
|
||||
logger.info(f"Found project directory from cwd: {cwd}")
|
||||
return cwd
|
||||
|
||||
logger.debug("Could not determine project directory")
|
||||
return None
|
||||
|
||||
def _find_dbt_project(self, start_dir: Path) -> Optional[str]:
|
||||
"""
|
||||
Find dbt_project.yml in the project or its subdirectories.
|
||||
|
||||
Args:
|
||||
start_dir: Directory to start searching from
|
||||
|
||||
Returns:
|
||||
Path to dbt project directory, or None if not found
|
||||
"""
|
||||
# Check root
|
||||
if (start_dir / 'dbt_project.yml').exists():
|
||||
return str(start_dir)
|
||||
|
||||
# Check common subdirectories
|
||||
for subdir in ['dbt', 'transform', 'analytics', 'models']:
|
||||
candidate = start_dir / subdir
|
||||
if (candidate / 'dbt_project.yml').exists():
|
||||
return str(candidate)
|
||||
|
||||
# Search one level deep
|
||||
for item in start_dir.iterdir():
|
||||
if item.is_dir() and not item.name.startswith('.'):
|
||||
if (item / 'dbt_project.yml').exists():
|
||||
return str(item)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def load_config() -> Dict[str, Optional[str]]:
|
||||
"""
|
||||
Convenience function to load configuration.
|
||||
|
||||
Returns:
|
||||
Configuration dictionary
|
||||
"""
|
||||
config = DataPlatformConfig()
|
||||
return config.load()
|
||||
|
||||
|
||||
def check_postgres_connection() -> Dict[str, any]:
|
||||
"""
|
||||
Check PostgreSQL connection status for SessionStart hook.
|
||||
|
||||
Returns:
|
||||
Dict with connection status and message
|
||||
"""
|
||||
import asyncio
|
||||
|
||||
config = load_config()
|
||||
if not config.get('postgres_url'):
|
||||
return {
|
||||
'connected': False,
|
||||
'message': 'PostgreSQL not configured (POSTGRES_URL not set)'
|
||||
}
|
||||
|
||||
async def test_connection():
|
||||
try:
|
||||
import asyncpg
|
||||
conn = await asyncpg.connect(config['postgres_url'], timeout=5)
|
||||
version = await conn.fetchval('SELECT version()')
|
||||
await conn.close()
|
||||
return {
|
||||
'connected': True,
|
||||
'message': f'Connected to PostgreSQL',
|
||||
'version': version.split(',')[0] if version else 'Unknown'
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
'connected': False,
|
||||
'message': f'PostgreSQL connection failed: {str(e)}'
|
||||
}
|
||||
|
||||
return asyncio.run(test_connection())
|
||||
Reference in New Issue
Block a user