#!/usr/bin/env python3
"""CODITECT Projects Database Initialization (ADR-118 TIER 4).

Initializes projects.db for project registration, indexing, and semantic search.

Usage:
    python3 scripts/init_projects_db.py           # Initialize
    python3 scripts/init_projects_db.py --stats   # Show stats
    python3 scripts/init_projects_db.py --reset   # Reset database (CAUTION)

Task ID: J.15.1.1
Created: 2026-01-28
ADR: ADR-118 Four-Tier Database Architecture
"""

import argparse
import json
import sqlite3
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# Add parent to path for imports so `scripts.core` resolves when run directly.
# BUG FIX: original had Path(file) (NameError); must be the __file__ dunder.
sys.path.insert(0, str(Path(__file__).parent.parent))
try:
    from scripts.core.paths import get_projects_db_path, get_context_storage_dir
except ImportError:
    # Fallback for standalone execution when the package layout is unavailable.
    def get_context_storage_dir() -> Path:
        """Return the first existing context-storage directory, or the preferred default."""
        home = Path.home()
        candidates = [
            home / "PROJECTS" / ".coditect-data" / "context-storage",
            home / ".coditect-data" / "context-storage",
        ]
        for c in candidates:
            if c.exists():
                return c
        # Neither exists yet: default to the preferred (first) location.
        return candidates[0]

    def get_projects_db_path() -> Path:
        """Return the path to projects.db inside the context-storage directory."""
        return get_context_storage_dir() / "projects.db"
# =============================================================================
# Schema Definitions
# =============================================================================
# DDL for all projects.db tables, the FTS5 virtual table, and its sync triggers.
# NOTE: line breaks matter inside this string — SQL `--` comments run to end of
# line, so each comment must sit on its own line or it swallows the statement.
SCHEMA_TABLES_SQL = """
-- CODITECT projects.db Schema (ADR-118 TIER 4)
-- Purpose: Project registration, content indexing, and semantic search
-- Backup: Optional (regenerable via /cx --index-project)

-- Projects table: Registered project metadata
CREATE TABLE IF NOT EXISTS projects (
    id INTEGER PRIMARY KEY AUTOINCREMENT,

    -- Identity
    project_uuid TEXT UNIQUE NOT NULL,      -- Cloud-assigned or local UUID
    name TEXT NOT NULL,                     -- Human-readable project name
    path TEXT UNIQUE NOT NULL,              -- Absolute path to project root

    -- Git metadata
    github_owner TEXT,                      -- GitHub owner/org (if applicable)
    github_repo TEXT,                       -- GitHub repository name
    github_url TEXT,                        -- Full GitHub URL
    default_branch TEXT DEFAULT 'main',     -- Default git branch

    -- Hierarchy (for monorepos/submodules)
    parent_project_id INTEGER REFERENCES projects(id) ON DELETE SET NULL,
    project_type TEXT DEFAULT 'standalone', -- standalone, monorepo, submodule

    -- Status
    status TEXT DEFAULT 'active',           -- active, archived, pending_sync
    last_indexed_at TEXT,                   -- Last successful indexing timestamp
    last_sync_at TEXT,                      -- Last cloud sync timestamp

    -- Metadata
    description TEXT,                       -- Project description
    primary_language TEXT,                  -- Primary programming language
    framework TEXT,                         -- Primary framework (react, django, etc.)

    -- Timestamps
    created_at TEXT DEFAULT CURRENT_TIMESTAMP,
    updated_at TEXT DEFAULT CURRENT_TIMESTAMP,

    -- Cloud sync
    cloud_registered INTEGER DEFAULT 0,     -- 1 if registered with cloud
    cloud_project_uuid TEXT,                -- Cloud-assigned UUID (if different)
    pending_sync INTEGER DEFAULT 0          -- 1 if awaiting sync
);

-- Content hashes: File change detection for incremental indexing
CREATE TABLE IF NOT EXISTS content_hashes (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    project_id INTEGER NOT NULL REFERENCES projects(id) ON DELETE CASCADE,

    -- File identity
    relative_path TEXT NOT NULL,            -- Path relative to project root
    file_hash TEXT NOT NULL,                -- SHA-256 hash of file content

    -- Metadata
    file_size INTEGER,                      -- File size in bytes
    content_type TEXT,                      -- code, document, config, test, etc.
    language TEXT,                          -- Detected programming language

    -- Indexing status
    last_hashed_at TEXT NOT NULL,           -- When hash was computed
    last_indexed_at TEXT,                   -- When content was indexed for embeddings
    needs_reindex INTEGER DEFAULT 0,        -- 1 if file changed since last index

    UNIQUE(project_id, relative_path)
);

-- Project embeddings: Semantic search vectors for project content
CREATE TABLE IF NOT EXISTS project_embeddings (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    project_id INTEGER NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
    content_hash_id INTEGER REFERENCES content_hashes(id) ON DELETE CASCADE,

    -- Content
    chunk_index INTEGER NOT NULL,           -- Chunk number within file (0-based)
    chunk_text TEXT NOT NULL,               -- The actual text chunk
    chunk_hash TEXT NOT NULL,               -- Hash of chunk_text for deduplication

    -- Embedding
    embedding BLOB,                         -- Vector embedding (numpy array as bytes)
    embedding_model TEXT DEFAULT 'all-MiniLM-L6-v2', -- Model used for embedding
    embedding_dim INTEGER DEFAULT 384,      -- Embedding dimension

    -- Metadata
    start_line INTEGER,                     -- Starting line number in source file
    end_line INTEGER,                       -- Ending line number in source file
    content_type TEXT,                      -- code, docstring, comment, markdown
    language TEXT,                          -- Programming language (for code chunks)

    -- Timestamps
    created_at TEXT DEFAULT CURRENT_TIMESTAMP,

    UNIQUE(content_hash_id, chunk_index)
);

-- Exclude patterns: Per-project ignore rules
CREATE TABLE IF NOT EXISTS exclude_patterns (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    project_id INTEGER REFERENCES projects(id) ON DELETE CASCADE, -- NULL = global

    -- Pattern
    pattern TEXT NOT NULL,                  -- Glob pattern (e.g., "*.pyc", "node_modules/")
    pattern_type TEXT DEFAULT 'glob',       -- glob, regex, literal

    -- Scope
    scope TEXT DEFAULT 'directory',         -- file, directory, both
    reason TEXT,                            -- Why this pattern is excluded

    -- Source
    source TEXT DEFAULT 'gitignore',        -- gitignore, manual, framework

    UNIQUE(project_id, pattern, pattern_type)
);

-- Project tags: Flexible tagging for categorization
CREATE TABLE IF NOT EXISTS project_tags (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    project_id INTEGER NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
    tag TEXT NOT NULL,
    tag_type TEXT DEFAULT 'user',           -- user, auto, system

    UNIQUE(project_id, tag)
);

-- Project activity: Track project access patterns
CREATE TABLE IF NOT EXISTS project_activity (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    project_id INTEGER NOT NULL REFERENCES projects(id) ON DELETE CASCADE,

    -- Activity
    activity_type TEXT NOT NULL,            -- session, indexing, query, sync
    session_uuid TEXT,                      -- Claude Code session UUID if applicable

    -- Details
    details TEXT,                           -- JSON details of activity
    files_touched INTEGER DEFAULT 0,        -- Number of files involved

    -- Timestamp
    timestamp TEXT DEFAULT CURRENT_TIMESTAMP
);

-- FTS5 virtual table for project search
CREATE VIRTUAL TABLE IF NOT EXISTS projects_fts USING fts5(
    name,
    description,
    path,
    primary_language,
    framework,
    content='projects',
    content_rowid='id'
);

-- FTS5 triggers to keep index synchronized (external-content tables require
-- explicit 'delete' commands on row removal/update).
CREATE TRIGGER IF NOT EXISTS projects_ai AFTER INSERT ON projects BEGIN
    INSERT INTO projects_fts(rowid, name, description, path, primary_language, framework)
    VALUES (new.id, new.name, new.description, new.path, new.primary_language, new.framework);
END;

CREATE TRIGGER IF NOT EXISTS projects_ad AFTER DELETE ON projects BEGIN
    INSERT INTO projects_fts(projects_fts, rowid, name, description, path, primary_language, framework)
    VALUES ('delete', old.id, old.name, old.description, old.path, old.primary_language, old.framework);
END;

CREATE TRIGGER IF NOT EXISTS projects_au AFTER UPDATE ON projects BEGIN
    INSERT INTO projects_fts(projects_fts, rowid, name, description, path, primary_language, framework)
    VALUES ('delete', old.id, old.name, old.description, old.path, old.primary_language, old.framework);
    INSERT INTO projects_fts(rowid, name, description, path, primary_language, framework)
    VALUES (new.id, new.name, new.description, new.path, new.primary_language, new.framework);
END;
"""
# Secondary indexes, kept separate from the table DDL so they can be
# (re)applied independently of table creation. All are IF NOT EXISTS, so
# running this script repeatedly is idempotent.
SCHEMA_INDEXES_SQL = """
-- Indexes for fast lookups
CREATE INDEX IF NOT EXISTS idx_projects_status ON projects(status);
CREATE INDEX IF NOT EXISTS idx_projects_parent ON projects(parent_project_id);
CREATE INDEX IF NOT EXISTS idx_projects_type ON projects(project_type);
CREATE INDEX IF NOT EXISTS idx_projects_cloud ON projects(cloud_registered);
CREATE INDEX IF NOT EXISTS idx_projects_pending ON projects(pending_sync);

CREATE INDEX IF NOT EXISTS idx_content_hashes_project ON content_hashes(project_id);
CREATE INDEX IF NOT EXISTS idx_content_hashes_type ON content_hashes(content_type);
CREATE INDEX IF NOT EXISTS idx_content_hashes_needs_reindex ON content_hashes(needs_reindex);
CREATE INDEX IF NOT EXISTS idx_content_hashes_hash ON content_hashes(file_hash);

CREATE INDEX IF NOT EXISTS idx_embeddings_project ON project_embeddings(project_id);
CREATE INDEX IF NOT EXISTS idx_embeddings_content_hash ON project_embeddings(content_hash_id);
CREATE INDEX IF NOT EXISTS idx_embeddings_type ON project_embeddings(content_type);
CREATE INDEX IF NOT EXISTS idx_embeddings_chunk_hash ON project_embeddings(chunk_hash);

CREATE INDEX IF NOT EXISTS idx_exclude_patterns_project ON exclude_patterns(project_id);
CREATE INDEX IF NOT EXISTS idx_project_tags_project ON project_tags(project_id);
CREATE INDEX IF NOT EXISTS idx_project_tags_tag ON project_tags(tag);

CREATE INDEX IF NOT EXISTS idx_activity_project ON project_activity(project_id);
CREATE INDEX IF NOT EXISTS idx_activity_type ON project_activity(activity_type);
CREATE INDEX IF NOT EXISTS idx_activity_session ON project_activity(session_uuid);
CREATE INDEX IF NOT EXISTS idx_activity_timestamp ON project_activity(timestamp);
"""
# Default global exclude patterns seeded on first init (project_id IS NULL).
# Tuples are (pattern, scope, reason) matching the exclude_patterns columns.
DEFAULT_EXCLUDE_PATTERNS = [
    # Version control
    (".git/", "directory", "Version control"),
    (".svn/", "directory", "Version control"),
    (".hg/", "directory", "Version control"),
    # Package managers
    ("node_modules/", "directory", "NPM packages"),
    ("vendor/", "directory", "Vendor packages"),
    ("__pycache__/", "directory", "Python cache"),
    (".venv/", "directory", "Python virtual environment"),
    ("venv/", "directory", "Python virtual environment"),
    (".env/", "directory", "Environment"),
    # Build outputs
    ("dist/", "directory", "Build output"),
    ("build/", "directory", "Build output"),
    ("target/", "directory", "Build output (Rust/Java)"),
    ("*.egg-info/", "directory", "Python egg info"),
    # IDE
    (".idea/", "directory", "JetBrains IDE"),
    (".vscode/", "directory", "VS Code"),
    ("*.swp", "file", "Vim swap"),
    ("*.swo", "file", "Vim swap"),
    # OS
    (".DS_Store", "file", "macOS metadata"),
    ("Thumbs.db", "file", "Windows thumbnails"),
    # Compiled/binary
    ("*.pyc", "file", "Compiled Python"),
    ("*.pyo", "file", "Optimized Python"),
    ("*.so", "file", "Shared object"),
    ("*.dll", "file", "Windows DLL"),
    ("*.dylib", "file", "macOS dynamic library"),
    ("*.o", "file", "Object file"),
    ("*.a", "file", "Static library"),
    # Large files
    ("*.zip", "file", "Archive"),
    ("*.tar", "file", "Archive"),
    ("*.tar.gz", "file", "Archive"),
    ("*.rar", "file", "Archive"),
    ("*.7z", "file", "Archive"),
    ("*.mp4", "file", "Video"),
    ("*.mov", "file", "Video"),
    ("*.avi", "file", "Video"),
    # Logs
    ("*.log", "file", "Log file"),
    ("logs/", "directory", "Log directory"),
    # Secrets (should never be indexed!)
    (".env", "file", "Environment secrets"),
    (".env.*", "file", "Environment secrets"),
    ("*.pem", "file", "Private key"),
    ("*.key", "file", "Private key"),
    ("credentials.json", "file", "Credentials"),
    ("secrets.json", "file", "Secrets"),
]
# =============================================================================
# Database Functions
# =============================================================================
def init_database(db_path: Path, reset: bool = False) -> sqlite3.Connection:
    """Initialize projects.db with schema.

    Creates parent directories as needed, applies the table and index DDL
    (idempotent via IF NOT EXISTS), and seeds the global exclude patterns
    on first run only.

    Args:
        db_path: Location of the SQLite database file.
        reset: When True, back up any existing database (timestamped copy
            next to the original) and start from an empty file.

    Returns:
        An open sqlite3.Connection with foreign keys enabled and
        row_factory set to sqlite3.Row. Caller is responsible for closing.
    """
    db_path.parent.mkdir(parents=True, exist_ok=True)

    if reset and db_path.exists():
        # Backup before reset so existing data is never lost irrecoverably.
        stamp = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
        backup_path = db_path.with_suffix(f".backup-{stamp}.db")
        db_path.rename(backup_path)
        print(f" Backed up existing database to: {backup_path}")

    conn = sqlite3.connect(str(db_path))
    conn.row_factory = sqlite3.Row

    # Enforce referential integrity (OFF by default in SQLite).
    conn.execute("PRAGMA foreign_keys = ON")

    # Create tables (plus FTS table and triggers), then secondary indexes.
    conn.executescript(SCHEMA_TABLES_SQL)
    conn.commit()
    conn.executescript(SCHEMA_INDEXES_SQL)
    conn.commit()

    # Seed default global exclude patterns (project_id IS NULL) only when
    # none exist, so user edits to the global set survive re-runs.
    cursor = conn.cursor()
    cursor.execute("SELECT COUNT(*) FROM exclude_patterns WHERE project_id IS NULL")
    if cursor.fetchone()[0] == 0:
        cursor.executemany(
            """
            INSERT OR IGNORE INTO exclude_patterns (project_id, pattern, scope, reason, source)
            VALUES (NULL, ?, ?, ?, 'framework')
            """,
            DEFAULT_EXCLUDE_PATTERNS,
        )
        conn.commit()
        print(f" Inserted {len(DEFAULT_EXCLUDE_PATTERNS)} default exclude patterns")

    return conn
def get_stats(db_path: Path) -> dict:
    """Get database statistics.

    Args:
        db_path: Location of the SQLite database file.

    Returns:
        A dict with the database path, size in bytes, per-table row counts
        (value "table missing" for absent tables), and summary metrics
        (active_projects, files_needing_reindex, pending_cloud_sync).
        If the file does not exist, returns {"error": ..., "path": ...}.
    """
    if not db_path.exists():
        return {"error": "Database does not exist", "path": str(db_path)}

    conn = sqlite3.connect(str(db_path))
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()

    stats = {
        "path": str(db_path),
        "size_bytes": db_path.stat().st_size,
        "tables": {},
    }

    # Row counts per table; a missing table is reported, not fatal.
    tables = [
        "projects", "content_hashes", "project_embeddings",
        "exclude_patterns", "project_tags", "project_activity",
    ]
    for table in tables:
        try:
            cursor.execute(f"SELECT COUNT(*) FROM {table}")
            stats["tables"][table] = cursor.fetchone()[0]
        except sqlite3.OperationalError:
            stats["tables"][table] = "table missing"

    # Summary metrics. ROBUSTNESS FIX: tolerate missing tables here too,
    # so --stats never crashes on a partially-created database (the
    # per-table counts above already degrade gracefully).
    metric_queries = {
        "active_projects": "SELECT COUNT(*) FROM projects WHERE status = 'active'",
        "files_needing_reindex": "SELECT COUNT(*) FROM content_hashes WHERE needs_reindex = 1",
        "pending_cloud_sync": "SELECT COUNT(*) FROM projects WHERE pending_sync = 1",
    }
    for key, sql in metric_queries.items():
        try:
            cursor.execute(sql)
            stats[key] = cursor.fetchone()[0]
        except sqlite3.OperationalError:
            stats[key] = 0

    conn.close()
    return stats
def print_stats(stats: dict) -> None:
    """Print statistics from get_stats() in human-readable format.

    Args:
        stats: Dict produced by get_stats(); if it contains an "error" key,
            only the error and path are printed.
    """
    print("\n" + "=" * 60)
    print("CODITECT projects.db Statistics (ADR-118 TIER 4)")
    print("=" * 60)

    if "error" in stats:
        print(f"\nError: {stats['error']}")
        print(f"Path: {stats['path']}")
        return

    print(f"\nDatabase: {stats['path']}")
    print(f"Size: {stats['size_bytes']:,} bytes ({stats['size_bytes'] / 1024:.1f} KB)")

    print("\nTable Row Counts:")
    print("-" * 40)
    for table, count in stats["tables"].items():
        print(f"  {table:25} {count:>10}")

    print("\nSummary:")
    print("-" * 40)
    print(f"  Active projects:       {stats.get('active_projects', 0):>10}")
    print(f"  Files needing reindex: {stats.get('files_needing_reindex', 0):>10}")
    print(f"  Pending cloud sync:    {stats.get('pending_cloud_sync', 0):>10}")

    print("\n" + "=" * 60)
# =============================================================================
# Main
# =============================================================================
def main() -> int:
    """CLI entry point: initialize projects.db, or report statistics.

    Returns:
        Process exit code (0 on success).
    """
    parser = argparse.ArgumentParser(
        description="Initialize CODITECT projects.db (ADR-118 TIER 4)"
    )
    parser.add_argument(
        "--stats", action="store_true", help="Show database statistics"
    )
    parser.add_argument(
        "--reset", action="store_true", help="Reset database (creates backup first)"
    )
    parser.add_argument(
        "--json", action="store_true", help="Output stats as JSON"
    )
    parser.add_argument(
        "--db-path", type=Path, help="Override database path"
    )
    args = parser.parse_args()

    db_path = args.db_path or get_projects_db_path()

    # Stats-only mode: report and exit without creating or modifying the DB.
    if args.stats:
        stats = get_stats(db_path)
        if args.json:
            print(json.dumps(stats, indent=2))
        else:
            print_stats(stats)
        return 0

    # Initialize database
    print("\n" + "=" * 60)
    print("CODITECT projects.db Initialization (ADR-118 TIER 4)")
    print("=" * 60)
    print("\nTask ID: J.15.1.1")
    print(f"Database: {db_path}")

    if args.reset:
        print("\n⚠️  RESET MODE: Existing data will be backed up and cleared")

    conn = init_database(db_path, reset=args.reset)

    print("\n✅ Database initialized successfully")
    print("\nTables created:")
    print("  - projects (project metadata)")
    print("  - content_hashes (file change detection)")
    print("  - project_embeddings (semantic search vectors)")
    print("  - exclude_patterns (ignore rules)")
    print("  - project_tags (categorization)")
    print("  - project_activity (access patterns)")
    print("  - projects_fts (full-text search)")

    # Show stats after init
    stats = get_stats(db_path)
    print(f"\nGlobal exclude patterns: {stats['tables'].get('exclude_patterns', 0)}")

    conn.close()

    print("\n" + "=" * 60)
    print("Next steps:")
    print("  /cx --register-project <path>   # Register a project")
    print("  /cx --index-project <path>      # Index project content")
    print("  /cxq --projects                 # List registered projects")
    print("=" * 60 + "\n")

    return 0
if name == "main": sys.exit(main())