feat: add data-platform plugin (v4.0.0)

Add new data-platform plugin for data engineering workflows with:

MCP Server (32 tools):
- pandas operations (14 tools): read_csv, read_parquet, read_json,
  to_csv, to_parquet, describe, head, tail, filter, select, groupby,
  join, list_data, drop_data
- PostgreSQL/PostGIS (10 tools): pg_connect, pg_query, pg_execute,
  pg_tables, pg_columns, pg_schemas, st_tables, st_geometry_type,
  st_srid, st_extent
- dbt integration (8 tools): dbt_parse, dbt_run, dbt_test, dbt_build,
  dbt_compile, dbt_ls, dbt_docs_generate, dbt_lineage

Plugin Features:
- Arrow IPC data_ref system for DataFrame persistence across tool calls
- Pre-execution validation for dbt with `dbt parse`
- SessionStart hook for PostgreSQL connectivity check (non-blocking)
- Hybrid configuration (system ~/.config/claude/postgres.env + project .env)
- Memory management with 100k row limit and chunking support

Commands: /initial-setup, /ingest, /profile, /schema, /explain, /lineage, /run
Agents: data-ingestion, data-analysis

Test suite: 71 tests covering config, data store, pandas, postgres, dbt tools

Addresses data workflow issues from personal-portfolio project:
- Lost data after multiple interactions (solved by Arrow IPC data_ref)
- dbt 1.9+ syntax deprecation (solved by pre-execution validation)
- Ungraceful PostgreSQL error handling (solved by SessionStart hook)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-25 14:24:03 -05:00
parent 6a267d074b
commit 89f0354ccc
39 changed files with 5413 additions and 6 deletions

View File

@@ -0,0 +1,219 @@
"""
Arrow IPC DataFrame Registry.
Provides persistent storage for DataFrames across tool calls using Apache Arrow
for efficient memory management and serialization.
"""
import pyarrow as pa
import pandas as pd
import uuid
import logging
from typing import Dict, Optional, List, Union
from dataclasses import dataclass
from datetime import datetime
logger = logging.getLogger(__name__)
@dataclass
class DataFrameInfo:
    """Metadata about a stored DataFrame"""
    # Reference string used to retrieve the table from the DataStore.
    ref: str
    # Number of rows in the stored Arrow Table.
    rows: int
    # Number of columns in the stored Arrow Table.
    columns: int
    # Column names, in schema order.
    column_names: List[str]
    # Mapping of column name -> Arrow type rendered as a string.
    dtypes: Dict[str, str]
    # Approximate in-memory size of the table (Arrow Table.nbytes).
    memory_bytes: int
    # Timestamp of when the table was stored (naive local time).
    created_at: datetime
    # Optional provenance description (e.g. file path or SQL query).
    source: Optional[str] = None
class DataStore:
    """
    Singleton registry for Arrow Tables (DataFrames).

    Uses Arrow IPC format for efficient memory usage and supports
    data_ref based retrieval across multiple tool calls.

    All registry state lives on the class, so every instance (there is
    only ever one) shares the same stored tables and metadata.
    """

    _instance = None                          # the lone instance, created lazily
    _dataframes: Dict[str, pa.Table] = {}     # data_ref -> Arrow Table
    _metadata: Dict[str, DataFrameInfo] = {}  # data_ref -> DataFrameInfo
    _max_rows: int = 100_000                  # advisory limit used by check_row_limit()

    def __new__(cls):
        # Reset the registries only on first construction so repeated
        # DataStore() calls return the same instance without wiping
        # previously stored tables.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._dataframes = {}
            cls._metadata = {}
        return cls._instance

    @classmethod
    def get_instance(cls) -> 'DataStore':
        """Get the singleton instance"""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    @classmethod
    def set_max_rows(cls, max_rows: int):
        """Set the maximum rows limit"""
        cls._max_rows = max_rows

    def store(
        self,
        data: Union[pa.Table, pd.DataFrame],
        name: Optional[str] = None,
        source: Optional[str] = None
    ) -> str:
        """
        Store a DataFrame and return its reference.

        Args:
            data: Arrow Table or pandas DataFrame
            name: Optional name for the reference (auto-generated if not provided).
                  NOTE: an explicitly provided name silently overwrites any
                  existing entry with the same name.
            source: Optional source description (e.g., file path, query)

        Returns:
            data_ref string to retrieve the DataFrame later
        """
        # Convert pandas to Arrow if needed
        if isinstance(data, pd.DataFrame):
            table = pa.Table.from_pandas(data)
        else:
            table = data

        # Generate reference
        data_ref = name or f"df_{uuid.uuid4().hex[:8]}"

        # Ensure unique reference only for auto-generated names; an
        # explicit name is allowed to replace an existing entry.
        if data_ref in self._dataframes and name is None:
            data_ref = f"{data_ref}_{uuid.uuid4().hex[:4]}"

        # Store table
        self._dataframes[data_ref] = table

        # Store metadata derived from the Arrow schema
        schema = table.schema
        self._metadata[data_ref] = DataFrameInfo(
            ref=data_ref,
            rows=table.num_rows,
            columns=table.num_columns,
            column_names=[f.name for f in schema],
            dtypes={f.name: str(f.type) for f in schema},
            memory_bytes=table.nbytes,
            created_at=datetime.now(),
            source=source
        )

        # Lazy %-style args: the message is only formatted if the
        # logger is enabled for INFO.
        logger.info(
            "Stored DataFrame '%s': %s rows, %s cols",
            data_ref, table.num_rows, table.num_columns
        )
        return data_ref

    def get(self, data_ref: str) -> Optional[pa.Table]:
        """
        Retrieve an Arrow Table by reference.

        Args:
            data_ref: Reference string from store()

        Returns:
            Arrow Table or None if not found
        """
        return self._dataframes.get(data_ref)

    def get_pandas(self, data_ref: str) -> Optional[pd.DataFrame]:
        """
        Retrieve a DataFrame as pandas.

        Args:
            data_ref: Reference string from store()

        Returns:
            pandas DataFrame or None if not found
        """
        table = self.get(data_ref)
        if table is not None:
            return table.to_pandas()
        return None

    def get_info(self, data_ref: str) -> Optional[DataFrameInfo]:
        """
        Get metadata about a stored DataFrame.

        Args:
            data_ref: Reference string

        Returns:
            DataFrameInfo or None if not found
        """
        return self._metadata.get(data_ref)

    def list_refs(self) -> List[Dict]:
        """
        List all stored DataFrame references with metadata.

        Returns:
            List of dicts with ref, rows, columns, memory info
        """
        result = []
        for ref, info in self._metadata.items():
            result.append({
                'ref': ref,
                'rows': info.rows,
                'columns': info.columns,
                'column_names': info.column_names,
                'memory_mb': round(info.memory_bytes / (1024 * 1024), 2),
                'source': info.source,
                'created_at': info.created_at.isoformat()
            })
        return result

    def drop(self, data_ref: str) -> bool:
        """
        Remove a DataFrame from the store.

        Args:
            data_ref: Reference string

        Returns:
            True if removed, False if not found
        """
        if data_ref in self._dataframes:
            del self._dataframes[data_ref]
            # Tolerate a missing metadata entry instead of raising
            # KeyError if the two registries ever get out of sync.
            self._metadata.pop(data_ref, None)
            logger.info("Dropped DataFrame '%s'", data_ref)
            return True
        return False

    def clear(self):
        """Remove all stored DataFrames"""
        count = len(self._dataframes)
        self._dataframes.clear()
        self._metadata.clear()
        logger.info("Cleared %s DataFrames from store", count)

    def total_memory_bytes(self) -> int:
        """Get total memory used by all stored DataFrames"""
        return sum(info.memory_bytes for info in self._metadata.values())

    def total_memory_mb(self) -> float:
        """Get total memory in MB"""
        return round(self.total_memory_bytes() / (1024 * 1024), 2)

    def check_row_limit(self, row_count: int) -> Dict:
        """
        Check if row count exceeds limit.

        Args:
            row_count: Number of rows

        Returns:
            Dict with 'exceeded' bool and 'message' if exceeded
        """
        if row_count > self._max_rows:
            return {
                'exceeded': True,
                'message': f"Row count ({row_count:,}) exceeds limit ({self._max_rows:,})",
                # Plain literal: the original used an f-string with no
                # placeholders (same value either way).
                'suggestion': "Use chunked processing or filter data first",
                'limit': self._max_rows
            }
        return {'exceeded': False}