feat: add data-platform plugin (v4.0.0)
Add new data-platform plugin for data engineering workflows with: MCP Server (32 tools): - pandas operations (14 tools): read_csv, read_parquet, read_json, to_csv, to_parquet, describe, head, tail, filter, select, groupby, join, list_data, drop_data - PostgreSQL/PostGIS (10 tools): pg_connect, pg_query, pg_execute, pg_tables, pg_columns, pg_schemas, st_tables, st_geometry_type, st_srid, st_extent - dbt integration (8 tools): dbt_parse, dbt_run, dbt_test, dbt_build, dbt_compile, dbt_ls, dbt_docs_generate, dbt_lineage Plugin Features: - Arrow IPC data_ref system for DataFrame persistence across tool calls - Pre-execution validation for dbt with `dbt parse` - SessionStart hook for PostgreSQL connectivity check (non-blocking) - Hybrid configuration (system ~/.config/claude/postgres.env + project .env) - Memory management with 100k row limit and chunking support Commands: /initial-setup, /ingest, /profile, /schema, /explain, /lineage, /run Agents: data-ingestion, data-analysis Test suite: 71 tests covering config, data store, pandas, postgres, dbt tools Addresses data workflow issues from personal-portfolio project: - Lost data after multiple interactions (solved by Arrow IPC data_ref) - dbt 1.9+ syntax deprecation (solved by pre-execution validation) - Ungraceful PostgreSQL error handling (solved by SessionStart hook) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
301
mcp-servers/data-platform/tests/test_pandas_tools.py
Normal file
301
mcp-servers/data-platform/tests/test_pandas_tools.py
Normal file
@@ -0,0 +1,301 @@
|
||||
"""
|
||||
Unit tests for pandas MCP tools.
|
||||
"""
|
||||
import pytest
|
||||
import pandas as pd
|
||||
import tempfile
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def temp_csv(tmp_path):
|
||||
"""Create a temporary CSV file for testing"""
|
||||
csv_path = tmp_path / 'test.csv'
|
||||
df = pd.DataFrame({
|
||||
'id': [1, 2, 3, 4, 5],
|
||||
'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
|
||||
'value': [10.5, 20.0, 30.5, 40.0, 50.5]
|
||||
})
|
||||
df.to_csv(csv_path, index=False)
|
||||
return str(csv_path)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def temp_parquet(tmp_path):
|
||||
"""Create a temporary Parquet file for testing"""
|
||||
parquet_path = tmp_path / 'test.parquet'
|
||||
df = pd.DataFrame({
|
||||
'id': [1, 2, 3],
|
||||
'data': ['a', 'b', 'c']
|
||||
})
|
||||
df.to_parquet(parquet_path)
|
||||
return str(parquet_path)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def temp_json(tmp_path):
|
||||
"""Create a temporary JSON file for testing"""
|
||||
json_path = tmp_path / 'test.json'
|
||||
df = pd.DataFrame({
|
||||
'x': [1, 2],
|
||||
'y': [3, 4]
|
||||
})
|
||||
df.to_json(json_path, orient='records')
|
||||
return str(json_path)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def pandas_tools():
|
||||
"""Create PandasTools instance with fresh store"""
|
||||
from mcp_server.pandas_tools import PandasTools
|
||||
from mcp_server.data_store import DataStore
|
||||
|
||||
# Reset store for test isolation
|
||||
store = DataStore.get_instance()
|
||||
store._dataframes = {}
|
||||
store._metadata = {}
|
||||
|
||||
return PandasTools()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_read_csv(pandas_tools, temp_csv):
|
||||
"""Test reading CSV file"""
|
||||
result = await pandas_tools.read_csv(temp_csv, name='csv_test')
|
||||
|
||||
assert 'data_ref' in result
|
||||
assert result['data_ref'] == 'csv_test'
|
||||
assert result['rows'] == 5
|
||||
assert 'id' in result['columns']
|
||||
assert 'name' in result['columns']
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_read_csv_nonexistent(pandas_tools):
|
||||
"""Test reading nonexistent CSV file"""
|
||||
result = await pandas_tools.read_csv('/nonexistent/path.csv')
|
||||
|
||||
assert 'error' in result
|
||||
assert 'not found' in result['error'].lower()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_read_parquet(pandas_tools, temp_parquet):
|
||||
"""Test reading Parquet file"""
|
||||
result = await pandas_tools.read_parquet(temp_parquet, name='parquet_test')
|
||||
|
||||
assert 'data_ref' in result
|
||||
assert result['rows'] == 3
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_read_json(pandas_tools, temp_json):
|
||||
"""Test reading JSON file"""
|
||||
result = await pandas_tools.read_json(temp_json, name='json_test')
|
||||
|
||||
assert 'data_ref' in result
|
||||
assert result['rows'] == 2
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_to_csv(pandas_tools, temp_csv, tmp_path):
|
||||
"""Test exporting to CSV"""
|
||||
# First load some data
|
||||
await pandas_tools.read_csv(temp_csv, name='export_test')
|
||||
|
||||
# Export to new file
|
||||
output_path = str(tmp_path / 'output.csv')
|
||||
result = await pandas_tools.to_csv('export_test', output_path)
|
||||
|
||||
assert result['success'] is True
|
||||
assert os.path.exists(output_path)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_to_parquet(pandas_tools, temp_csv, tmp_path):
|
||||
"""Test exporting to Parquet"""
|
||||
await pandas_tools.read_csv(temp_csv, name='parquet_export')
|
||||
|
||||
output_path = str(tmp_path / 'output.parquet')
|
||||
result = await pandas_tools.to_parquet('parquet_export', output_path)
|
||||
|
||||
assert result['success'] is True
|
||||
assert os.path.exists(output_path)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_describe(pandas_tools, temp_csv):
|
||||
"""Test describe statistics"""
|
||||
await pandas_tools.read_csv(temp_csv, name='describe_test')
|
||||
|
||||
result = await pandas_tools.describe('describe_test')
|
||||
|
||||
assert 'data_ref' in result
|
||||
assert 'shape' in result
|
||||
assert result['shape']['rows'] == 5
|
||||
assert 'statistics' in result
|
||||
assert 'null_counts' in result
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_head(pandas_tools, temp_csv):
|
||||
"""Test getting first N rows"""
|
||||
await pandas_tools.read_csv(temp_csv, name='head_test')
|
||||
|
||||
result = await pandas_tools.head('head_test', n=3)
|
||||
|
||||
assert result['returned_rows'] == 3
|
||||
assert len(result['data']) == 3
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tail(pandas_tools, temp_csv):
|
||||
"""Test getting last N rows"""
|
||||
await pandas_tools.read_csv(temp_csv, name='tail_test')
|
||||
|
||||
result = await pandas_tools.tail('tail_test', n=2)
|
||||
|
||||
assert result['returned_rows'] == 2
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_filter(pandas_tools, temp_csv):
|
||||
"""Test filtering rows"""
|
||||
await pandas_tools.read_csv(temp_csv, name='filter_test')
|
||||
|
||||
result = await pandas_tools.filter('filter_test', 'value > 25')
|
||||
|
||||
assert 'data_ref' in result
|
||||
assert result['rows'] == 3 # 30.5, 40.0, 50.5
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_filter_invalid_condition(pandas_tools, temp_csv):
|
||||
"""Test filter with invalid condition"""
|
||||
await pandas_tools.read_csv(temp_csv, name='filter_error')
|
||||
|
||||
result = await pandas_tools.filter('filter_error', 'invalid_column > 0')
|
||||
|
||||
assert 'error' in result
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_select(pandas_tools, temp_csv):
|
||||
"""Test selecting columns"""
|
||||
await pandas_tools.read_csv(temp_csv, name='select_test')
|
||||
|
||||
result = await pandas_tools.select('select_test', ['id', 'name'])
|
||||
|
||||
assert 'data_ref' in result
|
||||
assert result['columns'] == ['id', 'name']
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_select_invalid_column(pandas_tools, temp_csv):
|
||||
"""Test select with invalid column"""
|
||||
await pandas_tools.read_csv(temp_csv, name='select_error')
|
||||
|
||||
result = await pandas_tools.select('select_error', ['id', 'nonexistent'])
|
||||
|
||||
assert 'error' in result
|
||||
assert 'available_columns' in result
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_groupby(pandas_tools, tmp_path):
|
||||
"""Test groupby aggregation"""
|
||||
# Create test data with groups
|
||||
csv_path = tmp_path / 'groupby.csv'
|
||||
df = pd.DataFrame({
|
||||
'category': ['A', 'A', 'B', 'B'],
|
||||
'value': [10, 20, 30, 40]
|
||||
})
|
||||
df.to_csv(csv_path, index=False)
|
||||
|
||||
await pandas_tools.read_csv(str(csv_path), name='groupby_test')
|
||||
|
||||
result = await pandas_tools.groupby(
|
||||
'groupby_test',
|
||||
by='category',
|
||||
agg={'value': 'sum'}
|
||||
)
|
||||
|
||||
assert 'data_ref' in result
|
||||
assert result['rows'] == 2 # Two groups: A, B
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_join(pandas_tools, tmp_path):
|
||||
"""Test joining DataFrames"""
|
||||
# Create left table
|
||||
left_path = tmp_path / 'left.csv'
|
||||
pd.DataFrame({
|
||||
'id': [1, 2, 3],
|
||||
'name': ['A', 'B', 'C']
|
||||
}).to_csv(left_path, index=False)
|
||||
|
||||
# Create right table
|
||||
right_path = tmp_path / 'right.csv'
|
||||
pd.DataFrame({
|
||||
'id': [1, 2, 4],
|
||||
'value': [100, 200, 400]
|
||||
}).to_csv(right_path, index=False)
|
||||
|
||||
await pandas_tools.read_csv(str(left_path), name='left')
|
||||
await pandas_tools.read_csv(str(right_path), name='right')
|
||||
|
||||
result = await pandas_tools.join('left', 'right', on='id', how='inner')
|
||||
|
||||
assert 'data_ref' in result
|
||||
assert result['rows'] == 2 # Only id 1 and 2 match
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_list_data(pandas_tools, temp_csv):
|
||||
"""Test listing all DataFrames"""
|
||||
await pandas_tools.read_csv(temp_csv, name='list_test1')
|
||||
await pandas_tools.read_csv(temp_csv, name='list_test2')
|
||||
|
||||
result = await pandas_tools.list_data()
|
||||
|
||||
assert result['count'] == 2
|
||||
refs = [df['ref'] for df in result['dataframes']]
|
||||
assert 'list_test1' in refs
|
||||
assert 'list_test2' in refs
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_drop_data(pandas_tools, temp_csv):
|
||||
"""Test dropping DataFrame"""
|
||||
await pandas_tools.read_csv(temp_csv, name='drop_test')
|
||||
|
||||
result = await pandas_tools.drop_data('drop_test')
|
||||
|
||||
assert result['success'] is True
|
||||
|
||||
# Verify it's gone
|
||||
list_result = await pandas_tools.list_data()
|
||||
refs = [df['ref'] for df in list_result['dataframes']]
|
||||
assert 'drop_test' not in refs
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_drop_nonexistent(pandas_tools):
|
||||
"""Test dropping nonexistent DataFrame"""
|
||||
result = await pandas_tools.drop_data('nonexistent')
|
||||
|
||||
assert 'error' in result
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_operations_on_nonexistent(pandas_tools):
|
||||
"""Test operations on nonexistent data_ref"""
|
||||
result = await pandas_tools.describe('nonexistent')
|
||||
assert 'error' in result
|
||||
|
||||
result = await pandas_tools.head('nonexistent')
|
||||
assert 'error' in result
|
||||
|
||||
result = await pandas_tools.filter('nonexistent', 'x > 0')
|
||||
assert 'error' in result
|
||||
Reference in New Issue
Block a user