feat: add data-platform plugin (v4.0.0)

Add new data-platform plugin for data engineering workflows with:

MCP Server (32 tools):
- pandas operations (14 tools): read_csv, read_parquet, read_json,
  to_csv, to_parquet, describe, head, tail, filter, select, groupby,
  join, list_data, drop_data
- PostgreSQL/PostGIS (10 tools): pg_connect, pg_query, pg_execute,
  pg_tables, pg_columns, pg_schemas, st_tables, st_geometry_type,
  st_srid, st_extent
- dbt integration (8 tools): dbt_parse, dbt_run, dbt_test, dbt_build,
  dbt_compile, dbt_ls, dbt_docs_generate, dbt_lineage

Plugin Features:
- Arrow IPC data_ref system for DataFrame persistence across tool calls
- Pre-execution validation for dbt with `dbt parse`
- SessionStart hook for PostgreSQL connectivity check (non-blocking)
- Hybrid configuration (system ~/.config/claude/postgres.env + project .env)
- Memory management with 100k row limit and chunking support

Commands: /initial-setup, /ingest, /profile, /schema, /explain, /lineage, /run
Agents: data-ingestion, data-analysis

Test suite: 71 tests covering config, data store, pandas, postgres, dbt tools

Addresses data workflow issues from personal-portfolio project:
- Lost data after multiple interactions (solved by Arrow IPC data_ref)
- dbt 1.9+ syntax deprecation (solved by pre-execution validation)
- Ungraceful PostgreSQL error handling (solved by SessionStart hook)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-25 14:24:03 -05:00
parent 6a267d074b
commit 89f0354ccc
39 changed files with 5413 additions and 6 deletions

View File

@@ -0,0 +1,795 @@
"""
MCP Server entry point for Data Platform integration.
Provides pandas, PostgreSQL/PostGIS, and dbt tools to Claude Code via JSON-RPC 2.0 over stdio.
"""
import asyncio
import logging
import json
from mcp.server import Server
from mcp.server.stdio import stdio_server
from mcp.types import Tool, TextContent
from .config import DataPlatformConfig
from .data_store import DataStore
from .pandas_tools import PandasTools
from .postgres_tools import PostgresTools
from .dbt_tools import DbtTools
# Suppress noisy MCP validation warnings on stderr
logging.basicConfig(level=logging.INFO)
logging.getLogger("root").setLevel(logging.ERROR)
logging.getLogger("mcp").setLevel(logging.ERROR)
logger = logging.getLogger(__name__)
class DataPlatformMCPServer:
"""MCP Server for data platform integration"""
def __init__(self):
self.server = Server("data-platform-mcp")
self.config = None
self.pandas_tools = None
self.postgres_tools = None
self.dbt_tools = None
async def initialize(self):
"""Initialize server and load configuration."""
try:
config_loader = DataPlatformConfig()
self.config = config_loader.load()
self.pandas_tools = PandasTools()
self.postgres_tools = PostgresTools()
self.dbt_tools = DbtTools()
# Log available capabilities
caps = []
caps.append("pandas")
if self.config.get('postgres_available'):
caps.append("PostgreSQL")
if self.config.get('dbt_available'):
caps.append("dbt")
logger.info(f"Data Platform MCP Server initialized with: {', '.join(caps)}")
except Exception as e:
logger.error(f"Failed to initialize: {e}")
raise
def setup_tools(self):
"""Register all available tools with the MCP server"""
@self.server.list_tools()
async def list_tools() -> list[Tool]:
"""Return list of available tools"""
tools = [
# pandas tools - always available
Tool(
name="read_csv",
description="Load CSV file into DataFrame",
inputSchema={
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to CSV file"
},
"name": {
"type": "string",
"description": "Optional name for data_ref"
},
"chunk_size": {
"type": "integer",
"description": "Process in chunks of this size"
}
},
"required": ["file_path"]
}
),
Tool(
name="read_parquet",
description="Load Parquet file into DataFrame",
inputSchema={
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to Parquet file"
},
"name": {
"type": "string",
"description": "Optional name for data_ref"
},
"columns": {
"type": "array",
"items": {"type": "string"},
"description": "Optional list of columns to load"
}
},
"required": ["file_path"]
}
),
Tool(
name="read_json",
description="Load JSON/JSONL file into DataFrame",
inputSchema={
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to JSON file"
},
"name": {
"type": "string",
"description": "Optional name for data_ref"
},
"lines": {
"type": "boolean",
"default": False,
"description": "Read as JSON Lines format"
}
},
"required": ["file_path"]
}
),
Tool(
name="to_csv",
description="Export DataFrame to CSV file",
inputSchema={
"type": "object",
"properties": {
"data_ref": {
"type": "string",
"description": "Reference to stored DataFrame"
},
"file_path": {
"type": "string",
"description": "Output file path"
},
"index": {
"type": "boolean",
"default": False,
"description": "Include index column"
}
},
"required": ["data_ref", "file_path"]
}
),
Tool(
name="to_parquet",
description="Export DataFrame to Parquet file",
inputSchema={
"type": "object",
"properties": {
"data_ref": {
"type": "string",
"description": "Reference to stored DataFrame"
},
"file_path": {
"type": "string",
"description": "Output file path"
},
"compression": {
"type": "string",
"default": "snappy",
"description": "Compression codec"
}
},
"required": ["data_ref", "file_path"]
}
),
Tool(
name="describe",
description="Get statistical summary of DataFrame",
inputSchema={
"type": "object",
"properties": {
"data_ref": {
"type": "string",
"description": "Reference to stored DataFrame"
}
},
"required": ["data_ref"]
}
),
Tool(
name="head",
description="Get first N rows of DataFrame",
inputSchema={
"type": "object",
"properties": {
"data_ref": {
"type": "string",
"description": "Reference to stored DataFrame"
},
"n": {
"type": "integer",
"default": 10,
"description": "Number of rows"
}
},
"required": ["data_ref"]
}
),
Tool(
name="tail",
description="Get last N rows of DataFrame",
inputSchema={
"type": "object",
"properties": {
"data_ref": {
"type": "string",
"description": "Reference to stored DataFrame"
},
"n": {
"type": "integer",
"default": 10,
"description": "Number of rows"
}
},
"required": ["data_ref"]
}
),
Tool(
name="filter",
description="Filter DataFrame rows by condition",
inputSchema={
"type": "object",
"properties": {
"data_ref": {
"type": "string",
"description": "Reference to stored DataFrame"
},
"condition": {
"type": "string",
"description": "pandas query string (e.g., 'age > 30 and city == \"NYC\"')"
},
"name": {
"type": "string",
"description": "Optional name for result data_ref"
}
},
"required": ["data_ref", "condition"]
}
),
Tool(
name="select",
description="Select specific columns from DataFrame",
inputSchema={
"type": "object",
"properties": {
"data_ref": {
"type": "string",
"description": "Reference to stored DataFrame"
},
"columns": {
"type": "array",
"items": {"type": "string"},
"description": "List of column names to select"
},
"name": {
"type": "string",
"description": "Optional name for result data_ref"
}
},
"required": ["data_ref", "columns"]
}
),
Tool(
name="groupby",
description="Group DataFrame and aggregate",
inputSchema={
"type": "object",
"properties": {
"data_ref": {
"type": "string",
"description": "Reference to stored DataFrame"
},
"by": {
"oneOf": [
{"type": "string"},
{"type": "array", "items": {"type": "string"}}
],
"description": "Column(s) to group by"
},
"agg": {
"type": "object",
"description": "Aggregation dict (e.g., {\"sales\": \"sum\", \"count\": \"mean\"})"
},
"name": {
"type": "string",
"description": "Optional name for result data_ref"
}
},
"required": ["data_ref", "by", "agg"]
}
),
Tool(
name="join",
description="Join two DataFrames",
inputSchema={
"type": "object",
"properties": {
"left_ref": {
"type": "string",
"description": "Reference to left DataFrame"
},
"right_ref": {
"type": "string",
"description": "Reference to right DataFrame"
},
"on": {
"oneOf": [
{"type": "string"},
{"type": "array", "items": {"type": "string"}}
],
"description": "Column(s) to join on (if same name in both)"
},
"left_on": {
"oneOf": [
{"type": "string"},
{"type": "array", "items": {"type": "string"}}
],
"description": "Left join column(s)"
},
"right_on": {
"oneOf": [
{"type": "string"},
{"type": "array", "items": {"type": "string"}}
],
"description": "Right join column(s)"
},
"how": {
"type": "string",
"enum": ["inner", "left", "right", "outer"],
"default": "inner",
"description": "Join type"
},
"name": {
"type": "string",
"description": "Optional name for result data_ref"
}
},
"required": ["left_ref", "right_ref"]
}
),
Tool(
name="list_data",
description="List all stored DataFrames",
inputSchema={
"type": "object",
"properties": {}
}
),
Tool(
name="drop_data",
description="Remove a DataFrame from storage",
inputSchema={
"type": "object",
"properties": {
"data_ref": {
"type": "string",
"description": "Reference to drop"
}
},
"required": ["data_ref"]
}
),
# PostgreSQL tools
Tool(
name="pg_connect",
description="Test PostgreSQL connection and return status",
inputSchema={
"type": "object",
"properties": {}
}
),
Tool(
name="pg_query",
description="Execute SELECT query and return results as data_ref",
inputSchema={
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "SQL SELECT query"
},
"params": {
"type": "array",
"items": {},
"description": "Query parameters (use $1, $2, etc.)"
},
"name": {
"type": "string",
"description": "Optional name for result data_ref"
}
},
"required": ["query"]
}
),
Tool(
name="pg_execute",
description="Execute INSERT/UPDATE/DELETE query",
inputSchema={
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "SQL DML query"
},
"params": {
"type": "array",
"items": {},
"description": "Query parameters"
}
},
"required": ["query"]
}
),
Tool(
name="pg_tables",
description="List all tables in schema",
inputSchema={
"type": "object",
"properties": {
"schema": {
"type": "string",
"default": "public",
"description": "Schema name"
}
}
}
),
Tool(
name="pg_columns",
description="Get column information for a table",
inputSchema={
"type": "object",
"properties": {
"table": {
"type": "string",
"description": "Table name"
},
"schema": {
"type": "string",
"default": "public",
"description": "Schema name"
}
},
"required": ["table"]
}
),
Tool(
name="pg_schemas",
description="List all schemas in database",
inputSchema={
"type": "object",
"properties": {}
}
),
# PostGIS tools
Tool(
name="st_tables",
description="List PostGIS-enabled tables",
inputSchema={
"type": "object",
"properties": {
"schema": {
"type": "string",
"default": "public",
"description": "Schema name"
}
}
}
),
Tool(
name="st_geometry_type",
description="Get geometry type of a column",
inputSchema={
"type": "object",
"properties": {
"table": {
"type": "string",
"description": "Table name"
},
"column": {
"type": "string",
"description": "Geometry column name"
},
"schema": {
"type": "string",
"default": "public",
"description": "Schema name"
}
},
"required": ["table", "column"]
}
),
Tool(
name="st_srid",
description="Get SRID of geometry column",
inputSchema={
"type": "object",
"properties": {
"table": {
"type": "string",
"description": "Table name"
},
"column": {
"type": "string",
"description": "Geometry column name"
},
"schema": {
"type": "string",
"default": "public",
"description": "Schema name"
}
},
"required": ["table", "column"]
}
),
Tool(
name="st_extent",
description="Get bounding box of all geometries",
inputSchema={
"type": "object",
"properties": {
"table": {
"type": "string",
"description": "Table name"
},
"column": {
"type": "string",
"description": "Geometry column name"
},
"schema": {
"type": "string",
"default": "public",
"description": "Schema name"
}
},
"required": ["table", "column"]
}
),
# dbt tools
Tool(
name="dbt_parse",
description="Validate dbt project (pre-flight check)",
inputSchema={
"type": "object",
"properties": {}
}
),
Tool(
name="dbt_run",
description="Run dbt models with pre-validation",
inputSchema={
"type": "object",
"properties": {
"select": {
"type": "string",
"description": "Model selection (e.g., 'model_name', '+model_name', 'tag:daily')"
},
"exclude": {
"type": "string",
"description": "Models to exclude"
},
"full_refresh": {
"type": "boolean",
"default": False,
"description": "Rebuild incremental models"
}
}
}
),
Tool(
name="dbt_test",
description="Run dbt tests",
inputSchema={
"type": "object",
"properties": {
"select": {
"type": "string",
"description": "Test selection"
},
"exclude": {
"type": "string",
"description": "Tests to exclude"
}
}
}
),
Tool(
name="dbt_build",
description="Run dbt build (run + test) with pre-validation",
inputSchema={
"type": "object",
"properties": {
"select": {
"type": "string",
"description": "Model/test selection"
},
"exclude": {
"type": "string",
"description": "Resources to exclude"
},
"full_refresh": {
"type": "boolean",
"default": False,
"description": "Rebuild incremental models"
}
}
}
),
Tool(
name="dbt_compile",
description="Compile dbt models to SQL without executing",
inputSchema={
"type": "object",
"properties": {
"select": {
"type": "string",
"description": "Model selection"
}
}
}
),
Tool(
name="dbt_ls",
description="List dbt resources",
inputSchema={
"type": "object",
"properties": {
"select": {
"type": "string",
"description": "Resource selection"
},
"resource_type": {
"type": "string",
"enum": ["model", "test", "seed", "snapshot", "source"],
"description": "Filter by type"
},
"output": {
"type": "string",
"enum": ["name", "path", "json"],
"default": "name",
"description": "Output format"
}
}
}
),
Tool(
name="dbt_docs_generate",
description="Generate dbt documentation",
inputSchema={
"type": "object",
"properties": {}
}
),
Tool(
name="dbt_lineage",
description="Get model dependencies and lineage",
inputSchema={
"type": "object",
"properties": {
"model": {
"type": "string",
"description": "Model name to analyze"
}
},
"required": ["model"]
}
)
]
return tools
@self.server.call_tool()
async def call_tool(name: str, arguments: dict) -> list[TextContent]:
"""Handle tool invocation."""
try:
# Route to appropriate tool handler
# pandas tools
if name == "read_csv":
result = await self.pandas_tools.read_csv(**arguments)
elif name == "read_parquet":
result = await self.pandas_tools.read_parquet(**arguments)
elif name == "read_json":
result = await self.pandas_tools.read_json(**arguments)
elif name == "to_csv":
result = await self.pandas_tools.to_csv(**arguments)
elif name == "to_parquet":
result = await self.pandas_tools.to_parquet(**arguments)
elif name == "describe":
result = await self.pandas_tools.describe(**arguments)
elif name == "head":
result = await self.pandas_tools.head(**arguments)
elif name == "tail":
result = await self.pandas_tools.tail(**arguments)
elif name == "filter":
result = await self.pandas_tools.filter(**arguments)
elif name == "select":
result = await self.pandas_tools.select(**arguments)
elif name == "groupby":
result = await self.pandas_tools.groupby(**arguments)
elif name == "join":
result = await self.pandas_tools.join(**arguments)
elif name == "list_data":
result = await self.pandas_tools.list_data()
elif name == "drop_data":
result = await self.pandas_tools.drop_data(**arguments)
# PostgreSQL tools
elif name == "pg_connect":
result = await self.postgres_tools.pg_connect()
elif name == "pg_query":
result = await self.postgres_tools.pg_query(**arguments)
elif name == "pg_execute":
result = await self.postgres_tools.pg_execute(**arguments)
elif name == "pg_tables":
result = await self.postgres_tools.pg_tables(**arguments)
elif name == "pg_columns":
result = await self.postgres_tools.pg_columns(**arguments)
elif name == "pg_schemas":
result = await self.postgres_tools.pg_schemas()
# PostGIS tools
elif name == "st_tables":
result = await self.postgres_tools.st_tables(**arguments)
elif name == "st_geometry_type":
result = await self.postgres_tools.st_geometry_type(**arguments)
elif name == "st_srid":
result = await self.postgres_tools.st_srid(**arguments)
elif name == "st_extent":
result = await self.postgres_tools.st_extent(**arguments)
# dbt tools
elif name == "dbt_parse":
result = await self.dbt_tools.dbt_parse()
elif name == "dbt_run":
result = await self.dbt_tools.dbt_run(**arguments)
elif name == "dbt_test":
result = await self.dbt_tools.dbt_test(**arguments)
elif name == "dbt_build":
result = await self.dbt_tools.dbt_build(**arguments)
elif name == "dbt_compile":
result = await self.dbt_tools.dbt_compile(**arguments)
elif name == "dbt_ls":
result = await self.dbt_tools.dbt_ls(**arguments)
elif name == "dbt_docs_generate":
result = await self.dbt_tools.dbt_docs_generate()
elif name == "dbt_lineage":
result = await self.dbt_tools.dbt_lineage(**arguments)
else:
raise ValueError(f"Unknown tool: {name}")
return [TextContent(
type="text",
text=json.dumps(result, indent=2, default=str)
)]
except Exception as e:
logger.error(f"Tool {name} failed: {e}")
return [TextContent(
type="text",
text=json.dumps({"error": str(e)}, indent=2)
)]
async def run(self):
"""Run the MCP server"""
await self.initialize()
self.setup_tools()
async with stdio_server() as (read_stream, write_stream):
await self.server.run(
read_stream,
write_stream,
self.server.create_initialization_options()
)
async def main():
"""Main entry point"""
server = DataPlatformMCPServer()
await server.run()
if __name__ == "__main__":
asyncio.run(main())