scripts-file-extractor
#!/usr/bin/env python3
"""
CP-08: File Node Extractor (ADR-151)

Extracts file entities from call_graph_functions in sessions.db:
- node_type: 'file'
- Properties: path, language, function_count, last_indexed, estimated_lines

Note: There is no dedicated call_graph_files table. File nodes are derived
from the distinct file_path values in call_graph_functions.

Source: sessions.db call_graph_functions table
Target: org.db kg_nodes table

Created: 2026-02-03
Track: J (Memory Intelligence)
Task: J.3.4.6
"""
import logging from pathlib import Path from typing import Any, Dict, Generator, Optional, Tuple
from .base_extractor import SQLiteSourceExtractor
# Module-level logger named after this module. The special variable is
# ``__name__``; a bare ``name`` is undefined and raises NameError on import.
logger = logging.getLogger(__name__)
class FileExtractor(SQLiteSourceExtractor):
    """
    Extract file entities from call_graph_functions into kg_nodes.

    Files are derived from distinct file_path values in the call graph;
    each distinct, non-empty path yields exactly one entity.
    """

    # Extension -> subtype fallback, used only when a file has no language
    # recorded. Kept at class level so it is built once, not on every call.
    _EXTENSION_SUBTYPES: Dict[str, str] = {
        ".py": "python",
        ".js": "javascript",
        ".ts": "typescript",
        ".tsx": "typescript",
        ".md": "markdown",
        ".yaml": "yaml",
        ".yml": "yaml",
        ".json": "json",
        ".sh": "shell",
        ".rs": "rust",
        ".go": "go",
        ".sql": "sql",
    }

    @property
    def node_type(self) -> str:
        """Return the node type for entities from this extractor: "file"."""
        return "file"

    def extract_entities(self) -> Generator[Tuple[str, str, Optional[str], Dict[str, Any], Optional[str], Optional[str]], None, None]:
        """
        Extract files from call_graph_functions.

        Runs one aggregate query over call_graph_functions and yields one
        entity per distinct file_path. Rows whose file_path is NULL or empty
        are skipped, since no file name or node id can be derived from them.

        Yields:
            Tuple of (node_id, name, subtype, properties, source_table, source_id)
            where properties may contain "path", "language",
            "function_count", "last_indexed" and "estimated_lines"
            (keys whose value is None are omitted).
        """
        # NOTE(review): rows are read by column name below, so the connection
        # returned by connect_source() is assumed to use a mapping-style row
        # factory (e.g. sqlite3.Row) -- confirm in the base class.
        conn = self.connect_source()

        # Group by file_path only. node_id is derived from the path alone, so
        # also grouping by language would emit the same node_id several times
        # for a file whose functions carry differing (or partly NULL)
        # language values. MAX(language) ignores NULLs and gives a
        # deterministic pick when more than one value is present.
        cursor = conn.execute("""
            SELECT
                file_path,
                MAX(language) AS language,
                COUNT(*) AS function_count,
                MAX(updated_at) AS last_indexed,
                MAX(end_line) AS max_line
            FROM call_graph_functions
            WHERE file_path IS NOT NULL AND file_path != ''
            GROUP BY file_path
            ORDER BY file_path
        """)

        for row in cursor:
            file_path = row['file_path']
            language = row['language']

            # Node id is produced by generate_node_id (not defined in this
            # class) from the file path, e.g. file:scripts/core/paths.py
            node_id = self.generate_node_id(file_path)

            # Display name is the final path component.
            name = Path(file_path).name

            # Subtype comes from the language, else from the extension.
            subtype = self._get_file_subtype(file_path, language)

            properties = {
                "path": file_path,
                "language": language,
                "function_count": row['function_count'],
                "last_indexed": row['last_indexed'],
                # Highest end_line seen is a lower bound on the file length;
                # a missing or zero value is treated as unknown.
                "estimated_lines": row['max_line'] or None,
            }
            # Drop keys whose value is unknown rather than storing None.
            properties = {k: v for k, v in properties.items() if v is not None}

            yield (
                node_id,
                name,
                subtype,
                properties,
                "call_graph_functions",  # source_table (derived from)
                file_path,  # source_id
            )

    def _get_file_subtype(self, file_path: str, language: Optional[str]) -> str:
        """
        Determine file subtype from language or, failing that, the extension.

        Args:
            file_path: Path of the file; only its suffix is inspected.
            language: Language recorded for the file, if any.

        Returns:
            The lower-cased language when one is given; otherwise the subtype
            mapped from the file extension (python, javascript, typescript,
            markdown, yaml, json, shell, rust, go, sql), or "unknown" when
            the extension is not recognised.
        """
        if language:
            return language.lower()

        # Fallback to extension (matched case-insensitively).
        suffix = Path(file_path).suffix.lower()
        return self._EXTENSION_SUBTYPES.get(suffix, "unknown")