feat: add data-platform plugin (v4.0.0)
Add new data-platform plugin for data engineering workflows with: MCP Server (32 tools): - pandas operations (14 tools): read_csv, read_parquet, read_json, to_csv, to_parquet, describe, head, tail, filter, select, groupby, join, list_data, drop_data - PostgreSQL/PostGIS (10 tools): pg_connect, pg_query, pg_execute, pg_tables, pg_columns, pg_schemas, st_tables, st_geometry_type, st_srid, st_extent - dbt integration (8 tools): dbt_parse, dbt_run, dbt_test, dbt_build, dbt_compile, dbt_ls, dbt_docs_generate, dbt_lineage Plugin Features: - Arrow IPC data_ref system for DataFrame persistence across tool calls - Pre-execution validation for dbt with `dbt parse` - SessionStart hook for PostgreSQL connectivity check (non-blocking) - Hybrid configuration (system ~/.config/claude/postgres.env + project .env) - Memory management with 100k row limit and chunking support Commands: /initial-setup, /ingest, /profile, /schema, /explain, /lineage, /run Agents: data-ingestion, data-analysis Test suite: 71 tests covering config, data store, pandas, postgres, dbt tools Addresses data workflow issues from personal-portfolio project: - Lost data after multiple interactions (solved by Arrow IPC data_ref) - dbt 1.9+ syntax deprecation (solved by pre-execution validation) - Ungraceful PostgreSQL error handling (solved by SessionStart hook) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
219
mcp-servers/data-platform/mcp_server/data_store.py
Normal file
219
mcp-servers/data-platform/mcp_server/data_store.py
Normal file
@@ -0,0 +1,219 @@
|
||||
"""
|
||||
Arrow IPC DataFrame Registry.
|
||||
|
||||
Provides persistent storage for DataFrames across tool calls using Apache Arrow
|
||||
for efficient memory management and serialization.
|
||||
"""
|
||||
import pyarrow as pa
|
||||
import pandas as pd
|
||||
import uuid
|
||||
import logging
|
||||
from typing import Dict, Optional, List, Union
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class DataFrameInfo:
|
||||
"""Metadata about a stored DataFrame"""
|
||||
ref: str
|
||||
rows: int
|
||||
columns: int
|
||||
column_names: List[str]
|
||||
dtypes: Dict[str, str]
|
||||
memory_bytes: int
|
||||
created_at: datetime
|
||||
source: Optional[str] = None
|
||||
|
||||
|
||||
class DataStore:
|
||||
"""
|
||||
Singleton registry for Arrow Tables (DataFrames).
|
||||
|
||||
Uses Arrow IPC format for efficient memory usage and supports
|
||||
data_ref based retrieval across multiple tool calls.
|
||||
"""
|
||||
_instance = None
|
||||
_dataframes: Dict[str, pa.Table] = {}
|
||||
_metadata: Dict[str, DataFrameInfo] = {}
|
||||
_max_rows: int = 100_000
|
||||
|
||||
def __new__(cls):
|
||||
if cls._instance is None:
|
||||
cls._instance = super().__new__(cls)
|
||||
cls._dataframes = {}
|
||||
cls._metadata = {}
|
||||
return cls._instance
|
||||
|
||||
@classmethod
|
||||
def get_instance(cls) -> 'DataStore':
|
||||
"""Get the singleton instance"""
|
||||
if cls._instance is None:
|
||||
cls._instance = cls()
|
||||
return cls._instance
|
||||
|
||||
@classmethod
|
||||
def set_max_rows(cls, max_rows: int):
|
||||
"""Set the maximum rows limit"""
|
||||
cls._max_rows = max_rows
|
||||
|
||||
def store(
|
||||
self,
|
||||
data: Union[pa.Table, pd.DataFrame],
|
||||
name: Optional[str] = None,
|
||||
source: Optional[str] = None
|
||||
) -> str:
|
||||
"""
|
||||
Store a DataFrame and return its reference.
|
||||
|
||||
Args:
|
||||
data: Arrow Table or pandas DataFrame
|
||||
name: Optional name for the reference (auto-generated if not provided)
|
||||
source: Optional source description (e.g., file path, query)
|
||||
|
||||
Returns:
|
||||
data_ref string to retrieve the DataFrame later
|
||||
"""
|
||||
# Convert pandas to Arrow if needed
|
||||
if isinstance(data, pd.DataFrame):
|
||||
table = pa.Table.from_pandas(data)
|
||||
else:
|
||||
table = data
|
||||
|
||||
# Generate reference
|
||||
data_ref = name or f"df_{uuid.uuid4().hex[:8]}"
|
||||
|
||||
# Ensure unique reference
|
||||
if data_ref in self._dataframes and name is None:
|
||||
data_ref = f"{data_ref}_{uuid.uuid4().hex[:4]}"
|
||||
|
||||
# Store table
|
||||
self._dataframes[data_ref] = table
|
||||
|
||||
# Store metadata
|
||||
schema = table.schema
|
||||
self._metadata[data_ref] = DataFrameInfo(
|
||||
ref=data_ref,
|
||||
rows=table.num_rows,
|
||||
columns=table.num_columns,
|
||||
column_names=[f.name for f in schema],
|
||||
dtypes={f.name: str(f.type) for f in schema},
|
||||
memory_bytes=table.nbytes,
|
||||
created_at=datetime.now(),
|
||||
source=source
|
||||
)
|
||||
|
||||
logger.info(f"Stored DataFrame '{data_ref}': {table.num_rows} rows, {table.num_columns} cols")
|
||||
return data_ref
|
||||
|
||||
def get(self, data_ref: str) -> Optional[pa.Table]:
|
||||
"""
|
||||
Retrieve an Arrow Table by reference.
|
||||
|
||||
Args:
|
||||
data_ref: Reference string from store()
|
||||
|
||||
Returns:
|
||||
Arrow Table or None if not found
|
||||
"""
|
||||
return self._dataframes.get(data_ref)
|
||||
|
||||
def get_pandas(self, data_ref: str) -> Optional[pd.DataFrame]:
|
||||
"""
|
||||
Retrieve a DataFrame as pandas.
|
||||
|
||||
Args:
|
||||
data_ref: Reference string from store()
|
||||
|
||||
Returns:
|
||||
pandas DataFrame or None if not found
|
||||
"""
|
||||
table = self.get(data_ref)
|
||||
if table is not None:
|
||||
return table.to_pandas()
|
||||
return None
|
||||
|
||||
def get_info(self, data_ref: str) -> Optional[DataFrameInfo]:
|
||||
"""
|
||||
Get metadata about a stored DataFrame.
|
||||
|
||||
Args:
|
||||
data_ref: Reference string
|
||||
|
||||
Returns:
|
||||
DataFrameInfo or None if not found
|
||||
"""
|
||||
return self._metadata.get(data_ref)
|
||||
|
||||
def list_refs(self) -> List[Dict]:
|
||||
"""
|
||||
List all stored DataFrame references with metadata.
|
||||
|
||||
Returns:
|
||||
List of dicts with ref, rows, columns, memory info
|
||||
"""
|
||||
result = []
|
||||
for ref, info in self._metadata.items():
|
||||
result.append({
|
||||
'ref': ref,
|
||||
'rows': info.rows,
|
||||
'columns': info.columns,
|
||||
'column_names': info.column_names,
|
||||
'memory_mb': round(info.memory_bytes / (1024 * 1024), 2),
|
||||
'source': info.source,
|
||||
'created_at': info.created_at.isoformat()
|
||||
})
|
||||
return result
|
||||
|
||||
def drop(self, data_ref: str) -> bool:
|
||||
"""
|
||||
Remove a DataFrame from the store.
|
||||
|
||||
Args:
|
||||
data_ref: Reference string
|
||||
|
||||
Returns:
|
||||
True if removed, False if not found
|
||||
"""
|
||||
if data_ref in self._dataframes:
|
||||
del self._dataframes[data_ref]
|
||||
del self._metadata[data_ref]
|
||||
logger.info(f"Dropped DataFrame '{data_ref}'")
|
||||
return True
|
||||
return False
|
||||
|
||||
def clear(self):
|
||||
"""Remove all stored DataFrames"""
|
||||
count = len(self._dataframes)
|
||||
self._dataframes.clear()
|
||||
self._metadata.clear()
|
||||
logger.info(f"Cleared {count} DataFrames from store")
|
||||
|
||||
def total_memory_bytes(self) -> int:
|
||||
"""Get total memory used by all stored DataFrames"""
|
||||
return sum(info.memory_bytes for info in self._metadata.values())
|
||||
|
||||
def total_memory_mb(self) -> float:
|
||||
"""Get total memory in MB"""
|
||||
return round(self.total_memory_bytes() / (1024 * 1024), 2)
|
||||
|
||||
def check_row_limit(self, row_count: int) -> Dict:
|
||||
"""
|
||||
Check if row count exceeds limit.
|
||||
|
||||
Args:
|
||||
row_count: Number of rows
|
||||
|
||||
Returns:
|
||||
Dict with 'exceeded' bool and 'message' if exceeded
|
||||
"""
|
||||
if row_count > self._max_rows:
|
||||
return {
|
||||
'exceeded': True,
|
||||
'message': f"Row count ({row_count:,}) exceeds limit ({self._max_rows:,})",
|
||||
'suggestion': f"Use chunked processing or filter data first",
|
||||
'limit': self._max_rows
|
||||
}
|
||||
return {'exceeded': False}
|
||||
Reference in New Issue
Block a user