
JobForge MVP - Database Design & Schema

Version: 1.0.0 MVP
Database: PostgreSQL 16 with pgvector
Target Audience: Backend Developers
Last Updated: July 2025


🎯 Database Overview

Technology Stack

  • Database: PostgreSQL 16
  • Extensions: pgvector (for AI embeddings)
  • Security: Row Level Security (RLS) for multi-tenancy
  • Connection: AsyncPG with SQLAlchemy 2.0
  • Migrations: Direct SQL for MVP (Alembic in Phase 2)

Design Principles

  • User Isolation: Complete data separation between users
  • Data Integrity: Foreign key constraints and validation
  • Performance: Optimized indexes for common queries
  • Security: RLS policies prevent cross-user data access (see the sketch after this list)
  • Scalability: Schema designed for future SaaS features
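
As a concrete illustration of the isolation model, here is a minimal sketch of how a request-scoped transaction sees only its own rows once the RLS context is set (the UUID is the seed test user from the test data section, used purely for illustration):

-- Minimal RLS isolation check
BEGIN;
SET LOCAL app.current_user_id = '123e4567-e89b-12d3-a456-426614174000';
SELECT count(*) FROM applications;  -- returns only this user's applications
COMMIT;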

📊 Entity Relationship Diagram

erDiagram
    USERS ||--o{ APPLICATIONS : creates
    USERS ||--o{ USER_RESUMES : owns
    APPLICATIONS ||--o{ DOCUMENTS : contains
    DOCUMENTS ||--o| DOCUMENT_EMBEDDINGS : has_embedding
    
    USERS {
        uuid id PK
        varchar email UK
        varchar password_hash
        varchar full_name
        timestamp created_at
        timestamp updated_at
    }
    
    APPLICATIONS {
        uuid id PK
        uuid user_id FK
        varchar name
        varchar company_name
        varchar role_title
        text job_url
        text job_description
        varchar location
        varchar priority_level
        varchar status
        boolean research_completed
        boolean resume_optimized
        boolean cover_letter_generated
        timestamp created_at
        timestamp updated_at
    }
    
    DOCUMENTS {
        uuid id PK
        uuid application_id FK
        varchar document_type
        text content
        timestamp created_at
        timestamp updated_at
    }
    
    USER_RESUMES {
        uuid id PK
        uuid user_id FK
        varchar name
        text content
        varchar focus_area
        boolean is_primary
        timestamp created_at
        timestamp updated_at
    }
    
    DOCUMENT_EMBEDDINGS {
        uuid id PK
        uuid document_id FK
        vector embedding
        timestamp created_at
    }

🗄️ Complete Database Schema

Database Initialization

-- Enable required extensions
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS vector;

-- Create custom types
CREATE TYPE priority_level_type AS ENUM ('low', 'medium', 'high');
CREATE TYPE application_status_type AS ENUM (
    'draft', 
    'research_complete', 
    'resume_ready', 
    'cover_letter_ready'
);
CREATE TYPE document_type_enum AS ENUM (
    'research_report', 
    'optimized_resume', 
    'cover_letter'
);
CREATE TYPE focus_area_type AS ENUM (
    'software_development',
    'data_science', 
    'management',
    'consulting',
    'other'
);
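
Enum values can be appended later without a table rewrite, which keeps the Phase 2 door open. The 'submitted' status below is hypothetical, shown only to illustrate the syntax:

-- Hypothetical future status; ADD VALUE appends without rewriting the table
ALTER TYPE application_status_type ADD VALUE IF NOT EXISTS 'submitted';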

Core Tables

Users Table

CREATE TABLE users (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    email VARCHAR(255) UNIQUE NOT NULL,
    password_hash VARCHAR(255) NOT NULL,
    full_name VARCHAR(255) NOT NULL,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    
    -- Constraints
    CONSTRAINT email_format CHECK (email ~* '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'),
    CONSTRAINT name_not_empty CHECK (LENGTH(TRIM(full_name)) > 0)
);

-- Indexes (email already has a unique index from its UNIQUE constraint)
CREATE INDEX idx_users_created_at ON users(created_at);

-- Row Level Security
ALTER TABLE users ENABLE ROW LEVEL SECURITY;

-- Users can only see their own record
CREATE POLICY users_own_data ON users
    FOR ALL 
    USING (id = current_setting('app.current_user_id')::UUID);
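
One caveat: PostgreSQL table owners and superusers bypass RLS by default. If the application connects as the role that owns these tables, RLS should also be forced; the same applies to every RLS-protected table below:

-- Owners bypass RLS unless it is forced
ALTER TABLE users FORCE ROW LEVEL SECURITY;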

Applications Table

CREATE TABLE applications (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    user_id UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE,
    name VARCHAR(255) NOT NULL,
    company_name VARCHAR(255) NOT NULL,
    role_title VARCHAR(255) NOT NULL,
    job_url TEXT,
    job_description TEXT NOT NULL,
    location VARCHAR(255),
    priority_level priority_level_type DEFAULT 'medium',
    status application_status_type DEFAULT 'draft',
    
    -- Phase tracking
    research_completed BOOLEAN DEFAULT FALSE,
    resume_optimized BOOLEAN DEFAULT FALSE,
    cover_letter_generated BOOLEAN DEFAULT FALSE,
    
    -- Timestamps
    created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    
    -- Constraints
    CONSTRAINT job_description_min_length CHECK (LENGTH(job_description) >= 50),
    CONSTRAINT company_name_not_empty CHECK (LENGTH(TRIM(company_name)) > 0),
    CONSTRAINT role_title_not_empty CHECK (LENGTH(TRIM(role_title)) > 0),
    CONSTRAINT valid_job_url CHECK (
        job_url IS NULL OR 
        job_url ~* '^https?://[^\s/$.?#].[^\s]*$'
    ),
    
    -- Business logic constraints
    CONSTRAINT resume_requires_research CHECK (
        NOT resume_optimized OR research_completed
    ),
    CONSTRAINT cover_letter_requires_resume CHECK (
        NOT cover_letter_generated OR resume_optimized
    )
);

-- Indexes
CREATE INDEX idx_applications_user_id ON applications(user_id);
CREATE INDEX idx_applications_status ON applications(status);
CREATE INDEX idx_applications_priority ON applications(priority_level);
CREATE INDEX idx_applications_created_at ON applications(created_at);
CREATE INDEX idx_applications_company_name ON applications(company_name);

-- Full text search index for job descriptions
CREATE INDEX idx_applications_job_description_fts 
ON applications USING gin(to_tsvector('english', job_description));

-- Row Level Security
ALTER TABLE applications ENABLE ROW LEVEL SECURITY;

CREATE POLICY applications_user_access ON applications
    FOR ALL
    USING (user_id = current_setting('app.current_user_id')::UUID);

Documents Table

CREATE TABLE documents (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    application_id UUID NOT NULL REFERENCES applications(id) ON DELETE CASCADE,
    document_type document_type_enum NOT NULL,
    content TEXT NOT NULL,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    
    -- Constraints
    CONSTRAINT content_min_length CHECK (LENGTH(content) >= 10),
    CONSTRAINT unique_document_per_application UNIQUE (application_id, document_type)
);

-- Indexes
CREATE INDEX idx_documents_application_id ON documents(application_id);
CREATE INDEX idx_documents_type ON documents(document_type);
CREATE INDEX idx_documents_updated_at ON documents(updated_at);

-- Full text search index for document content
CREATE INDEX idx_documents_content_fts 
ON documents USING gin(to_tsvector('english', content));

-- Row Level Security
ALTER TABLE documents ENABLE ROW LEVEL SECURITY;

CREATE POLICY documents_user_access ON documents
    FOR ALL
    USING (
        application_id IN (
            SELECT id FROM applications 
            WHERE user_id = current_setting('app.current_user_id')::UUID
        )
    );

User Resumes Table

CREATE TABLE user_resumes (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    user_id UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE,
    name VARCHAR(255) NOT NULL,
    content TEXT NOT NULL,
    focus_area focus_area_type DEFAULT 'other',
    is_primary BOOLEAN DEFAULT FALSE,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    
    -- Constraints
    CONSTRAINT resume_name_not_empty CHECK (LENGTH(TRIM(name)) > 0),
    CONSTRAINT resume_content_min_length CHECK (LENGTH(content) >= 100)
);

-- Only one primary resume per user. A plain UNIQUE (user_id, is_primary)
-- constraint would also forbid a user having more than one non-primary
-- resume, so a partial unique index is used instead.
CREATE UNIQUE INDEX idx_user_resumes_one_primary
ON user_resumes(user_id) WHERE is_primary;

-- Indexes
CREATE INDEX idx_user_resumes_user_id ON user_resumes(user_id);
CREATE INDEX idx_user_resumes_focus_area ON user_resumes(focus_area);
CREATE INDEX idx_user_resumes_is_primary ON user_resumes(is_primary);

-- Full text search index for resume content
CREATE INDEX idx_user_resumes_content_fts 
ON user_resumes USING gin(to_tsvector('english', content));

-- Row Level Security
ALTER TABLE user_resumes ENABLE ROW LEVEL SECURITY;

CREATE POLICY user_resumes_access ON user_resumes
    FOR ALL
    USING (user_id = current_setting('app.current_user_id')::UUID);
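
Because only one resume per user may be primary, promoting a different resume is a two-statement transaction. A minimal sketch, where $1 is the user ID and $2 the resume being promoted:

-- Demote the current primary, then promote the new one
BEGIN;
UPDATE user_resumes SET is_primary = FALSE WHERE user_id = $1 AND is_primary;
UPDATE user_resumes SET is_primary = TRUE WHERE id = $2 AND user_id = $1;
COMMIT;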

Document Embeddings Table (AI Features)

CREATE TABLE document_embeddings (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    embedding vector(1536),  -- OpenAI text-embedding-3-small dimension (text-embedding-3-large is 3072)
    created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    
    -- Constraints
    CONSTRAINT unique_embedding_per_document UNIQUE (document_id)
);

-- Vector similarity index (ivfflat clusters existing rows, so it performs
-- best when created or rebuilt after data is loaded)
CREATE INDEX idx_document_embeddings_vector 
ON document_embeddings USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100);

-- Regular indexes
CREATE INDEX idx_document_embeddings_document_id ON document_embeddings(document_id);

-- Row Level Security
ALTER TABLE document_embeddings ENABLE ROW LEVEL SECURITY;

CREATE POLICY document_embeddings_access ON document_embeddings
    FOR ALL
    USING (
        document_id IN (
            SELECT d.id FROM documents d
            JOIN applications a ON d.application_id = a.id
            WHERE a.user_id = current_setting('app.current_user_id')::UUID
        )
    );
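
A typical similarity search against this index looks like the sketch below, where $1 is a 1536-dimension query embedding; <=> is pgvector's cosine distance operator, and ivfflat.probes trades recall for speed:

-- Top-5 most similar documents (RLS still restricts results to the current user)
SET LOCAL ivfflat.probes = 10;  -- optional recall/speed tuning

SELECT d.id, d.document_type,
       1 - (e.embedding <=> $1::vector) AS cosine_similarity
FROM document_embeddings e
JOIN documents d ON d.id = e.document_id
ORDER BY e.embedding <=> $1::vector
LIMIT 5;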

🔒 Security Policies

Row Level Security Overview

All tables with user data have RLS enabled to ensure complete data isolation:

-- Function to get current user ID from session
CREATE OR REPLACE FUNCTION get_current_user_id()
RETURNS UUID AS $$
BEGIN
    RETURN current_setting('app.current_user_id')::UUID;
EXCEPTION
    WHEN others THEN
        RETURN NULL;
END;
$$ LANGUAGE plpgsql SECURITY DEFINER;

-- Helper function to check if user owns application
CREATE OR REPLACE FUNCTION user_owns_application(app_id UUID)
RETURNS BOOLEAN AS $$
BEGIN
    RETURN EXISTS (
        SELECT 1 FROM applications 
        WHERE id = app_id 
        AND user_id = get_current_user_id()
    );
END;
$$ LANGUAGE plpgsql SECURITY DEFINER;
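
The documents policy shown earlier could equivalently be written through this helper instead of an inline subquery; an alternative formulation (the policy name here is illustrative):

-- Equivalent to the subquery-based documents policy above
CREATE POLICY documents_user_access_fn ON documents
    FOR ALL
    USING (user_owns_application(application_id));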

Setting User Context

The backend must set the user context on the same database connection that serves each request:

# In a FastAPI dependency. Two details matter: SET cannot take bind
# parameters (use set_config() instead), and set_config(..., true) is
# transaction-scoped, so it must run on the same connection/transaction
# that executes the request's queries -- not on a separate, short-lived one.
from sqlalchemy import text

async def set_user_context(
    user: User = Depends(get_current_user),
    conn: AsyncConnection = Depends(get_db_connection),  # request-scoped connection
):
    await conn.execute(
        text("SELECT set_config('app.current_user_id', :uid, true)"),
        {"uid": str(user.id)},
    )
    return user

🚀 Database Functions

Trigger Functions

-- Update timestamp trigger function
CREATE OR REPLACE FUNCTION update_updated_at_column()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = NOW();
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Apply to all tables with updated_at
CREATE TRIGGER update_users_updated_at
    BEFORE UPDATE ON users
    FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();

CREATE TRIGGER update_applications_updated_at
    BEFORE UPDATE ON applications  
    FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();

CREATE TRIGGER update_documents_updated_at
    BEFORE UPDATE ON documents
    FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();

CREATE TRIGGER update_user_resumes_updated_at
    BEFORE UPDATE ON user_resumes
    FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();

Business Logic Functions

-- Generate application name
CREATE OR REPLACE FUNCTION generate_application_name(
    p_company_name VARCHAR,
    p_role_title VARCHAR
) RETURNS VARCHAR AS $$
DECLARE
    clean_company VARCHAR;
    clean_role VARCHAR;
    date_suffix VARCHAR;
BEGIN
    -- Clean and normalize names
    clean_company := LOWER(REGEXP_REPLACE(p_company_name, '[^a-zA-Z0-9]', '_', 'g'));
    clean_role := LOWER(REGEXP_REPLACE(p_role_title, '[^a-zA-Z0-9]', '_', 'g'));
    date_suffix := TO_CHAR(NOW(), 'YYYY_MM_DD');
    
    RETURN clean_company || '_' || clean_role || '_' || date_suffix;
END;
$$ LANGUAGE plpgsql;
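
For reference, the seed application name in the test data section comes straight from this function:

-- Example call; the date suffix depends on NOW()
SELECT generate_application_name('Google', 'Senior Developer');
-- => 'google_senior_developer_2025_07_01' when run on 2025-07-01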

-- Update application phases trigger (also handles DELETE, where NEW is NULL)
CREATE OR REPLACE FUNCTION update_application_phases()
RETURNS TRIGGER AS $$
DECLARE
    app_id UUID;
BEGIN
    -- Resolve the application ID; on DELETE only OLD is populated
    IF TG_OP = 'DELETE' THEN
        app_id := OLD.application_id;
    ELSE
        app_id := NEW.application_id;
    END IF;

    -- Auto-update phase completion based on document existence
    UPDATE applications SET
        research_completed = EXISTS (
            SELECT 1 FROM documents 
            WHERE application_id = app_id 
            AND document_type = 'research_report'
        ),
        resume_optimized = EXISTS (
            SELECT 1 FROM documents 
            WHERE application_id = app_id 
            AND document_type = 'optimized_resume'
        ),
        cover_letter_generated = EXISTS (
            SELECT 1 FROM documents 
            WHERE application_id = app_id 
            AND document_type = 'cover_letter'
        )
    WHERE id = app_id;

    -- Update status based on completion
    UPDATE applications SET
        status = (CASE
            WHEN cover_letter_generated THEN 'cover_letter_ready'
            WHEN resume_optimized THEN 'resume_ready'
            WHEN research_completed THEN 'research_complete'
            ELSE 'draft'
        END)::application_status_type
    WHERE id = app_id;

    RETURN COALESCE(NEW, OLD);
END;
$$ LANGUAGE plpgsql;

CREATE TRIGGER documents_update_phases
    AFTER INSERT OR UPDATE OR DELETE ON documents
    FOR EACH ROW EXECUTE FUNCTION update_application_phases();
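
A quick sanity check of the trigger, assuming $1 is an existing application ID:

-- Inserting a research report should flip the flag and advance the status
INSERT INTO documents (application_id, document_type, content)
VALUES ($1, 'research_report', 'Initial company research findings...');

SELECT research_completed, status FROM applications WHERE id = $1;
-- expected: research_completed = true, status = 'research_complete'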

📈 Performance Optimization

Query Optimization

-- Most common query patterns with optimized indexes

-- 1. Get user applications (paginated)
-- Index: idx_applications_user_id, idx_applications_created_at
SELECT * FROM applications 
WHERE user_id = $1 
ORDER BY created_at DESC 
LIMIT $2 OFFSET $3;

-- 2. Get application with documents
-- Index: idx_documents_application_id
SELECT a.*, d.document_type, d.content
FROM applications a
LEFT JOIN documents d ON a.id = d.application_id
WHERE a.id = $1 AND a.user_id = $2;

-- 3. Search applications by company/role
-- Index: idx_applications_company_name, full-text search
SELECT * FROM applications
WHERE user_id = $1
AND (
    company_name ILIKE $2 
    OR role_title ILIKE $3
    OR to_tsvector('english', job_description) @@ plainto_tsquery('english', $4)
)
ORDER BY created_at DESC;
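
When a query pattern matters, it is worth confirming that the planner actually uses the intended index; a minimal sketch, with the seed user ID as a placeholder:

-- Verify the access path for the paginated listing
EXPLAIN (ANALYZE, BUFFERS)
SELECT * FROM applications
WHERE user_id = '123e4567-e89b-12d3-a456-426614174000'
ORDER BY created_at DESC
LIMIT 20;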

Connection Pooling

# SQLAlchemy async engine configuration
from sqlalchemy.ext.asyncio import create_async_engine

engine = create_async_engine(
    DATABASE_URL,
    pool_size=20,          # Connection pool size
    max_overflow=30,       # Additional connections beyond pool_size
    pool_pre_ping=True,    # Validate connections before use
    pool_recycle=3600,     # Recycle connections every hour
    echo=False             # Disable SQL logging in production
)

🧪 Test Data Setup

Development Seed Data

-- Insert test user (password: "testpass123")
INSERT INTO users (id, email, password_hash, full_name) VALUES (
    '123e4567-e89b-12d3-a456-426614174000',
    'test@example.com', 
    '$2b$12$LQv3c1yqBWVHxkd0LHAkCOYz6TtxMQJqhN8/LewgdyN8yF5V4M2kq',
    'Test User'
);

-- Insert test resume (E'' string syntax so \n renders as real newlines;
-- note content must be at least 100 characters to satisfy resume_content_min_length)
INSERT INTO user_resumes (user_id, name, content, focus_area, is_primary) VALUES (
    '123e4567-e89b-12d3-a456-426614174000',
    'Software Developer Resume',
    E'# Test User\n\n## Experience\n\nSoftware Developer at Tech Corp...',
    'software_development',
    true
);

-- Insert test application
INSERT INTO applications (
    user_id, name, company_name, role_title, 
    job_description, status, research_completed
) VALUES (
    '123e4567-e89b-12d3-a456-426614174000',
    'google_senior_developer_2025_07_01',
    'Google',
    'Senior Developer',
    'We are seeking an experienced software developer to join our team...',
    'research_complete',
    true
);

🔄 Database Migrations (Future)

Migration Strategy for Phase 2

When adding Alembic migrations:

# alembic/env.py configuration for RLS (sketch)
from alembic import context
from sqlalchemy import text

def run_migrations_online():
    # Migrations run as an admin role; turn row security off so
    # data migrations can see and modify all rows
    with engine.connect() as connection:
        connection.execute(text("SET row_security = off"))
        
        context.configure(
            connection=connection,
            target_metadata=target_metadata,
            compare_type=True,
            compare_server_default=True
        )
        
        with context.begin_transaction():
            context.run_migrations()

Planned Schema Changes

  • Usage tracking tables for SaaS billing
  • Subscription management tables
  • Audit log tables for compliance
  • Performance metrics tables
  • Additional indexes based on production usage

🛠️ Database Maintenance

Regular Maintenance Tasks

-- Vacuum and analyze (run weekly)
VACUUM ANALYZE;

-- Update table statistics
ANALYZE applications;
ANALYZE documents;
ANALYZE user_resumes;

-- Check index usage (pg_stat_user_indexes uses relname/indexrelname)
SELECT schemaname, relname, indexrelname, idx_tup_read, idx_tup_fetch
FROM pg_stat_user_indexes
ORDER BY idx_tup_read DESC;

-- Monitor vector index performance
SELECT * FROM pg_stat_user_indexes 
WHERE indexrelname LIKE '%vector%';

Backup Strategy

# Daily backup script
pg_dump -h localhost -U jobforge_user -d jobforge_mvp \
    --clean --if-exists --verbose \
    > backup_$(date +%Y%m%d).sql

# Restore from backup
psql -h localhost -U jobforge_user -d jobforge_mvp < backup_20250701.sql

📊 Monitoring Queries

Performance Monitoring

-- Slow queries (requires the pg_stat_statements extension; the columns
-- are mean_exec_time/total_exec_time since PostgreSQL 13)
SELECT query, mean_exec_time, calls, total_exec_time
FROM pg_stat_statements
WHERE mean_exec_time > 100  -- queries slower than 100ms
ORDER BY mean_exec_time DESC;

-- Table sizes
SELECT 
    schemaname,
    tablename,
    pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) as size,
    pg_total_relation_size(schemaname||'.'||tablename) as size_bytes
FROM pg_tables
WHERE schemaname = 'public'
ORDER BY size_bytes DESC;

-- Connection counts
SELECT state, count(*) 
FROM pg_stat_activity 
GROUP BY state;

This database design provides a solid foundation for the MVP while being prepared for future SaaS features. The RLS policies ensure complete user data isolation, and the schema is optimized for the expected query patterns.