Skip to main content

scripts-test-extractors

#!/usr/bin/env python3
"""
title: "Session Extractor Tests"
component_type: test
version: "1.0.0"
audience: contributor
status: stable
summary: "Unit tests for LLM session extractors"
keywords: ['test', 'extractor', 'claude', 'codex', 'gemini', 'cusf']
tokens: ~500
created: 2026-01-28
updated: 2026-01-28

Unit tests for LLM session extractors (J.13.4.2).

Tests ClaudeExtractor, CodexExtractor, GeminiExtractor, and ExtractorFactory.

Track: J.13 (Memory - Generic Session Export)
Task: J.13.4.2
"""

import json import os import sys import tempfile import unittest from datetime import datetime, timezone from pathlib import Path from unittest.mock import patch, MagicMock

# Add parent paths for imports.
#
# The directory layout is derived from this file's own location, so the
# ``core`` and ``extractors`` packages resolve no matter which working
# directory the tests are launched from.
_test_dir = Path(__file__).resolve().parent
_scripts_dir = _test_dir.parent
_coditect_root = _scripts_dir.parent

# Each path is prepended only when missing, so re-importing this module does
# not keep growing sys.path. The scripts directory is inserted last and
# therefore takes precedence over the repository root.
if str(_coditect_root) not in sys.path:
    sys.path.insert(0, str(_coditect_root))
if str(_scripts_dir) not in sys.path:
    sys.path.insert(0, str(_scripts_dir))

from core.session_extractor import ( SessionExtractor, SessionMetadata, ExtractedEntry, ExtractionResult ) from extractors.claude_extractor import ClaudeExtractor from extractors.codex_extractor import CodexExtractor from extractors.gemini_extractor import GeminiExtractor from core.extractor_factory import ExtractorFactory, get_extractor, detect_extractor

class TestSessionMetadata(unittest.TestCase):
    """Exercise construction and serialization of ``SessionMetadata``."""

    def test_metadata_creation(self):
        """Constructor arguments are stored; an omitted model reads as ``None``."""
        metadata = SessionMetadata(session_id="test-123", llm_source="claude")

        self.assertEqual(metadata.session_id, "test-123")
        self.assertEqual(metadata.llm_source, "claude")
        self.assertIsNone(metadata.llm_model)

    def test_metadata_to_dict(self):
        """``to_dict`` reports every value that was passed to the constructor."""
        fields = {
            "session_id": "test-456",
            "llm_source": "codex",
            "llm_model": "gpt-4",
            "total_messages": 10,
        }

        serialized = SessionMetadata(**fields).to_dict()

        # Checked in declaration order, one key at a time, so a failure
        # names the exact field that did not round-trip.
        for key, expected in fields.items():
            self.assertEqual(serialized[key], expected)

class TestExtractedEntry(unittest.TestCase):
    """Exercise construction and serialization of ``ExtractedEntry``."""

    def test_message_entry(self):
        """A message entry keeps its type and its payload fields."""
        payload = {
            "message_id": "msg-1",
            "role": "user",
            "content": "Hello world",
        }
        message = ExtractedEntry(
            type="message",
            timestamp=datetime.now(timezone.utc),
            data=payload,
        )

        self.assertEqual(message.type, "message")
        self.assertEqual(message.data["role"], "user")
        self.assertEqual(message.data["content"], "Hello world")

    def test_tool_use_entry(self):
        """A tool-use entry keeps its type and the tool name."""
        payload = {
            "tool_id": "tool-1",
            "tool_name": "Read",
            "tool_input": {"file_path": "/test.txt"},
        }
        tool_call = ExtractedEntry(
            type="tool_use",
            timestamp=datetime.now(timezone.utc),
            data=payload,
        )

        self.assertEqual(tool_call.type, "tool_use")
        self.assertEqual(tool_call.data["tool_name"], "Read")

    def test_entry_to_dict(self):
        """``to_dict`` exposes the type, the payload role and a timestamp key."""
        created_at = datetime.now(timezone.utc)
        reply = ExtractedEntry(
            type="message",
            timestamp=created_at,
            data={"role": "assistant", "content": "Hi there"},
        )

        serialized = reply.to_dict()

        self.assertEqual(serialized["type"], "message")
        # "role" is read from the top level of the serialized dict.
        self.assertEqual(serialized["role"], "assistant")
        self.assertIn("timestamp", serialized)

class TestExtractionResult(unittest.TestCase):
    """Exercise the success and failure shapes of ``ExtractionResult``."""

    def test_successful_result(self):
        """A successful result reports one entry when given one entry."""
        only_entry = ExtractedEntry(
            type="message",
            timestamp=datetime.now(timezone.utc),
            data={"role": "user", "content": "Test"},
        )
        outcome = ExtractionResult(
            success=True,
            metadata=SessionMetadata(session_id="s1", llm_source="claude"),
            entries=[only_entry],
        )

        self.assertTrue(outcome.success)
        self.assertEqual(outcome.entry_count, 1)

    def test_failed_result(self):
        """A failed result carries the error messages it was built with."""
        outcome = ExtractionResult(
            success=False,
            metadata=SessionMetadata(session_id="s2", llm_source="codex"),
            errors=["File not found"],
        )

        self.assertFalse(outcome.success)
        self.assertEqual(len(outcome.errors), 1)

class TestClaudeExtractor(unittest.TestCase):
    """Tests for ClaudeExtractor."""

    def setUp(self):
        """Create a fresh extractor for every test."""
        self.extractor = ClaudeExtractor()

    def _write_jsonl(self, records):
        """Write *records* as JSON Lines to a temporary file and return its path.

        The file is closed before this method returns, so the extractor can
        reopen it by name on every platform (an open ``NamedTemporaryFile``
        cannot be reopened or unlinked on Windows). Removal is registered with
        ``addCleanup`` right after creation, so the file is deleted even when
        writing or a later assertion fails.

        Args:
            records: Iterable of JSON-serializable objects, one per line.

        Returns:
            ``Path`` of the closed temporary ``.jsonl`` file.
        """
        with tempfile.NamedTemporaryFile(
            suffix=".jsonl", delete=False, mode="w"
        ) as handle:
            self.addCleanup(os.unlink, handle.name)
            handle.writelines(json.dumps(record) + "\n" for record in records)
        return Path(handle.name)

    def test_llm_name(self):
        """Test LLM name property."""
        self.assertEqual(self.extractor.llm_name, "claude")

    def test_can_extract_jsonl(self):
        """Test can_extract with valid JSONL."""
        session_file = self._write_jsonl([
            {
                "type": "user",
                "message": {"role": "user", "content": "Hello"},
            },
        ])

        result = self.extractor.can_extract(session_file)

        # Should return True for Claude-format JSONL.
        # NOTE(review): only the return type is pinned here; tightening this
        # to assertTrue needs confirmation of ClaudeExtractor's detection
        # rules, which are not visible from this file.
        self.assertIsInstance(result, bool)

    def test_can_extract_nonexistent(self):
        """Test can_extract with non-existent file."""
        result = self.extractor.can_extract(Path("/nonexistent/file.jsonl"))
        self.assertFalse(result)

    def test_extract_nonexistent(self):
        """Test extract with non-existent file."""
        result = self.extractor.extract(Path("/nonexistent/file.jsonl"))
        self.assertFalse(result.success)
        # Guard the index below: an empty error list should read as a test
        # failure with a clear message, not as an IndexError.
        self.assertTrue(result.errors, "expected at least one error message")
        self.assertIn("not found", result.errors[0].lower())

    def test_extract_valid_jsonl(self):
        """Test extract with valid Claude JSONL."""
        # Claude-format entries: one user turn, one assistant turn.
        session_file = self._write_jsonl([
            {
                "type": "user",
                "message": {"role": "user", "content": "Hello Claude"},
                "uuid": "msg-1",
            },
            {
                "type": "assistant",
                "message": {
                    "role": "assistant",
                    "content": "Hello! How can I help?",
                },
                "uuid": "msg-2",
                "model": "claude-opus-4-5",
            },
        ])

        result = self.extractor.extract(session_file)

        self.assertTrue(result.success)
        self.assertEqual(result.metadata.llm_source, "claude")
        # Should have extracted messages
        self.assertGreater(result.entry_count, 0)

class TestCodexExtractor(unittest.TestCase):
    """Tests for CodexExtractor."""

    def setUp(self):
        """Create a fresh extractor for every test."""
        self.extractor = CodexExtractor()

    def _write_jsonl(self, records):
        """Write *records* as JSON Lines to a temporary file and return its path.

        The file is closed before this method returns, so the extractor can
        reopen it by name on every platform (an open ``NamedTemporaryFile``
        cannot be reopened or unlinked on Windows). Removal is registered with
        ``addCleanup`` right after creation, so the file is deleted even when
        writing or a later assertion fails.

        Args:
            records: Iterable of JSON-serializable objects, one per line.

        Returns:
            ``Path`` of the closed temporary ``.jsonl`` file.
        """
        with tempfile.NamedTemporaryFile(
            suffix=".jsonl", delete=False, mode="w"
        ) as handle:
            self.addCleanup(os.unlink, handle.name)
            handle.writelines(json.dumps(record) + "\n" for record in records)
        return Path(handle.name)

    def test_llm_name(self):
        """Test LLM name property."""
        self.assertEqual(self.extractor.llm_name, "codex")

    def test_can_extract_codex_format(self):
        """Test can_extract with Codex format."""
        session_file = self._write_jsonl([
            {
                "session_id": "sess-1",
                "role": "user",
                "content": "Hello Codex",
            },
        ])

        self.assertTrue(self.extractor.can_extract(session_file))

    def test_extract_valid_codex(self):
        """Test extract with valid Codex JSONL."""
        session_file = self._write_jsonl([
            {
                "session_id": "codex-sess-1",
                "role": "user",
                "content": "Write a function",
                "timestamp": "2026-01-28T10:00:00Z",
            },
            {
                "session_id": "codex-sess-1",
                "role": "assistant",
                "content": "def hello(): pass",
                "timestamp": "2026-01-28T10:00:05Z",
                "model": "gpt-4",
            },
        ])

        result = self.extractor.extract(session_file)

        self.assertTrue(result.success)
        # The session id is expected to come from the records themselves.
        self.assertEqual(result.metadata.session_id, "codex-sess-1")

class TestGeminiExtractor(unittest.TestCase):
    """Tests for GeminiExtractor."""

    def setUp(self):
        """Create a fresh extractor for every test."""
        self.extractor = GeminiExtractor()

    def _write_jsonl(self, records):
        """Write *records* as JSON Lines to a temporary file and return its path.

        The file is closed before this method returns, so the extractor can
        reopen it by name on every platform (an open ``NamedTemporaryFile``
        cannot be reopened or unlinked on Windows). Removal is registered with
        ``addCleanup`` right after creation, so the file is deleted even when
        writing or a later assertion fails.

        Args:
            records: Iterable of JSON-serializable objects, one per line.

        Returns:
            ``Path`` of the closed temporary ``.jsonl`` file.
        """
        with tempfile.NamedTemporaryFile(
            suffix=".jsonl", delete=False, mode="w"
        ) as handle:
            self.addCleanup(os.unlink, handle.name)
            handle.writelines(json.dumps(record) + "\n" for record in records)
        return Path(handle.name)

    def test_llm_name(self):
        """Test LLM name property."""
        self.assertEqual(self.extractor.llm_name, "gemini")

    def test_can_extract_gemini_format(self):
        """Test can_extract with Gemini format."""
        session_file = self._write_jsonl([
            {
                "role": "user",
                "parts": [{"text": "Hello Gemini"}],
            },
        ])

        self.assertTrue(self.extractor.can_extract(session_file))

    def test_extract_valid_gemini(self):
        """Test extract with valid Gemini JSONL."""
        session_file = self._write_jsonl([
            {
                "role": "user",
                "parts": [{"text": "Hello Gemini"}],
                "timestamp": 1706432400000,  # milliseconds
            },
            {
                "role": "model",
                "parts": [{"text": "Hello! How can I help?"}],
                "timestamp": 1706432410000,
                "modelVersion": "gemini-1.5-pro",
            },
        ])

        result = self.extractor.extract(session_file)

        self.assertTrue(result.success)
        self.assertEqual(result.metadata.llm_source, "gemini")

class TestExtractorFactory(unittest.TestCase):
    """Tests for ExtractorFactory."""

    def _write_jsonl(self, records):
        """Write *records* as JSON Lines to a temporary file and return its path.

        The file is closed before this method returns, so the factory can
        reopen it by name on every platform (an open ``NamedTemporaryFile``
        cannot be reopened or unlinked on Windows). Removal is registered with
        ``addCleanup`` right after creation, so the file is deleted even when
        writing or a later assertion fails.

        Args:
            records: Iterable of JSON-serializable objects, one per line.

        Returns:
            ``Path`` of the closed temporary ``.jsonl`` file.
        """
        with tempfile.NamedTemporaryFile(
            suffix=".jsonl", delete=False, mode="w"
        ) as handle:
            self.addCleanup(os.unlink, handle.name)
            handle.writelines(json.dumps(record) + "\n" for record in records)
        return Path(handle.name)

    def test_list_supported(self):
        """Test list_supported returns all extractors."""
        supported = ExtractorFactory.list_supported()
        self.assertIn("claude", supported)
        self.assertIn("codex", supported)
        self.assertIn("gemini", supported)

    def test_is_supported(self):
        """Test is_supported for known LLMs."""
        self.assertTrue(ExtractorFactory.is_supported("claude"))
        self.assertTrue(ExtractorFactory.is_supported("codex"))
        self.assertTrue(ExtractorFactory.is_supported("gemini"))
        self.assertFalse(ExtractorFactory.is_supported("unknown"))

    def test_get_extractor_claude(self):
        """Test get_extractor returns ClaudeExtractor."""
        extractor = ExtractorFactory.get_extractor("claude")
        self.assertIsInstance(extractor, ClaudeExtractor)

    def test_get_extractor_codex(self):
        """Test get_extractor returns CodexExtractor."""
        extractor = ExtractorFactory.get_extractor("codex")
        self.assertIsInstance(extractor, CodexExtractor)

    def test_get_extractor_gemini(self):
        """Test get_extractor returns GeminiExtractor."""
        extractor = ExtractorFactory.get_extractor("gemini")
        self.assertIsInstance(extractor, GeminiExtractor)

    def test_get_extractor_unknown(self):
        """Test get_extractor raises for unknown LLM."""
        with self.assertRaises(ValueError):
            ExtractorFactory.get_extractor("unknown_llm")

    def test_detect_extractor_claude(self):
        """Test detect_extractor for Claude JSONL."""
        session_file = self._write_jsonl([
            {
                "type": "user",
                "message": {"role": "user", "content": "Hello"},
            },
        ])

        extractor = ExtractorFactory.detect_extractor(session_file)

        # Should detect as Claude format.
        # NOTE(review): a falsy result is tolerated and any supported
        # extractor is accepted; the detection rules are not visible from
        # this file, so the check is deliberately left this loose.
        if extractor:
            self.assertIn(extractor.llm_name, ["claude", "codex", "gemini"])

    def test_get_or_detect_explicit(self):
        """Test get_or_detect with explicit LLM."""
        # A single empty JSON object: the file content carries no format
        # hints, so the returned extractor must come from the ``llm`` argument.
        session_file = self._write_jsonl([{}])

        extractor = ExtractorFactory.get_or_detect(session_file, llm="claude")

        self.assertEqual(extractor.llm_name, "claude")

class TestConvenienceFunctions(unittest.TestCase):
    """Exercise the module-level ``get_extractor`` / ``detect_extractor`` helpers."""

    def test_get_extractor_function(self):
        """``get_extractor("claude")`` yields a ``ClaudeExtractor`` instance."""
        self.assertIsInstance(get_extractor("claude"), ClaudeExtractor)

    def test_detect_extractor_function(self):
        """``detect_extractor`` yields ``None`` for a path that does not exist."""
        missing_file = Path("/nonexistent/file.jsonl")
        self.assertIsNone(detect_extractor(missing_file))

# Run the suite only when this file is executed as a script; importing it
# (for example through a test runner's discovery) must not start a run.
if __name__ == "__main__":
    unittest.main()