Skip to main content

UDOM Pipeline — 1-2-3 Detailed Quick Start

Version: 1.0 | Date: 2026-02-09
Audience: Senior engineers with Python, Docker, and cloud-native background
Assumes: Familiarity with PDF processing, LaTeX, and structured document models


Overview

The UDOM (Universal Document Object Model) Pipeline converts scientific papers into machine-readable, high-fidelity markdown through 3-source extraction and fusion:

  • 62× faster than pymupdf4llm using Docling PDF engine (~5–7s/paper)
  • 100% Grade A quality across 135+ papers (0.87–0.94 on 9-dimension scoring)
  • 25 typed components (heading, paragraph, equation, figure, table, citation, etc.) fused from 3 independent sources
  • 9-dimension QA — structure, tables, math, citations, images, content density, LaTeX residual, heading hierarchy, bibliography
  • 10–35s end-to-end per paper including all 3 sources + fusion + scoring

Step 1: Local Setup — Single Paper Extraction

Prerequisites

# Python 3.11+ with scientific stack
python -m venv udom-env && source udom-env/bin/activate

# Core extraction engines
pip install docling>=2.0 # IBM's document understanding engine (62x faster than pymupdf4llm)
pip install requests beautifulsoup4 lxml # ar5iv HTML fetching
pip install pandoc-python # LaTeX source conversion

# Quality scoring
pip install numpy scikit-learn # Scoring dimensions
pip install katex # Math validation (optional)

# Verify Docling installation
python -c "from docling.document_converter import DocumentConverter; print('Docling OK')"

Minimal Extraction — Hello World

#!/usr/bin/env python3
"""udom_single.py — Extract one arXiv paper through 3-source UDOM pipeline."""

import json
import time
from dataclasses import dataclass, field, asdict
from enum import Enum
from typing import Optional

# --- UDOM Component Types ---
class ComponentType(str, Enum):
HEADING = "heading"
PARAGRAPH = "paragraph"
EQUATION = "equation"
FIGURE = "figure"
TABLE = "table"
CITATION = "citation"
CODE = "code"
LIST = "list"
ABSTRACT = "abstract"
BIBLIOGRAPHY = "bibliography"
# ... 15 more typed components in full UDOM spec

@dataclass
class UDOMComponent:
"""Single typed component in the Universal Document Object Model."""
type: ComponentType
content: str
source: str # "docling" | "ar5iv" | "latex"
confidence: float # 0.0–1.0 extraction confidence
position: int # ordinal position in document
metadata: dict = field(default_factory=dict)

@dataclass
class UDOMDocument:
"""Complete document in UDOM format — fused from 3 sources."""
arxiv_id: str
title: str
components: list[UDOMComponent] = field(default_factory=list)
quality_score: Optional[dict] = None
source_stats: dict = field(default_factory=dict)

# --- Source 1: Docling PDF Extraction ---
def extract_docling(arxiv_id: str) -> list[UDOMComponent]:
"""Extract components via Docling PDF engine. ~5-7s per paper."""
from docling.document_converter import DocumentConverter

pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
converter = DocumentConverter()
result = converter.convert(pdf_url)

components = []
for i, element in enumerate(result.document.iterate_items()):
comp_type = _map_docling_type(element) # Map Docling types → UDOM types
components.append(UDOMComponent(
type=comp_type,
content=element.export_to_markdown(),
source="docling",
confidence=0.85, # Docling baseline confidence
position=i,
metadata={"docling_type": str(type(element).__name__)}
))
return components

# --- Source 2: ar5iv HTML Extraction ---
def extract_ar5iv(arxiv_id: str) -> list[UDOMComponent]:
"""Extract components from ar5iv LaTeXML-rendered HTML. Best for tables + math."""
import requests
from bs4 import BeautifulSoup

url = f"https://ar5iv.labs.arxiv.org/html/{arxiv_id}"
resp = requests.get(url, timeout=30)
if resp.status_code != 200:
return [] # Graceful degradation — ar5iv not available for all papers

soup = BeautifulSoup(resp.text, "lxml")
components = []

# Extract math with alttext (preserves LaTeX semantics)
for i, math_el in enumerate(soup.find_all("math")):
alttext = math_el.get("alttext", "")
if alttext:
components.append(UDOMComponent(
type=ComponentType.EQUATION,
content=f"${alttext}$",
source="ar5iv",
confidence=0.95, # ar5iv math is high-fidelity
position=i,
metadata={"display": math_el.get("display", "inline")}
))

# Extract tables (ar5iv preserves structure better than PDF)
for i, table_el in enumerate(soup.find_all("table")):
components.append(UDOMComponent(
type=ComponentType.TABLE,
content=_html_table_to_markdown(table_el),
source="ar5iv",
confidence=0.92,
position=1000 + i, # Will be reordered during fusion
metadata={"caption": _find_caption(table_el)}
))

return components

# --- Source 3: LaTeX Source Extraction ---
def extract_latex_source(arxiv_id: str) -> list[UDOMComponent]:
"""Extract from arXiv LaTeX source via pandoc. Best math fidelity."""
import subprocess, tempfile, tarfile, requests

source_url = f"https://arxiv.org/e-print/{arxiv_id}"
resp = requests.get(source_url, timeout=30)
if resp.status_code != 200:
return []

with tempfile.TemporaryDirectory() as tmpdir:
# arXiv source is typically .tar.gz
tar_path = f"{tmpdir}/source.tar.gz"
with open(tar_path, "wb") as f:
f.write(resp.content)

try:
with tarfile.open(tar_path) as tar:
tar.extractall(tmpdir)
except tarfile.ReadError:
return [] # Not a tar file, skip

# Find main .tex file and convert via pandoc
tex_files = list(Path(tmpdir).glob("**/*.tex"))
if not tex_files:
return []

main_tex = _find_main_tex(tex_files)
md_output = subprocess.run(
["pandoc", str(main_tex), "-f", "latex", "-t", "markdown", "--wrap=none"],
capture_output=True, text=True, cwd=tmpdir
)

return _parse_pandoc_markdown(md_output.stdout, source="latex")

# --- UDOM Fusion Engine ---
def fuse_components(
docling: list[UDOMComponent],
ar5iv: list[UDOMComponent],
latex: list[UDOMComponent]
) -> list[UDOMComponent]:
"""
Fuse 3 sources into canonical UDOM document.

Strategy:
- Docling provides document structure (headings, paragraphs, ordering)
- ar5iv provides best tables and inline math
- LaTeX source provides best display math and citations
- Highest-confidence component wins per position
"""
fused = []

# Use Docling as structural backbone
for comp in docling:
best = comp

# For equations: prefer LaTeX source > ar5iv > Docling
if comp.type == ComponentType.EQUATION:
latex_match = _find_matching(latex, comp, ComponentType.EQUATION)
ar5iv_match = _find_matching(ar5iv, comp, ComponentType.EQUATION)
if latex_match and latex_match.confidence > best.confidence:
best = latex_match
elif ar5iv_match and ar5iv_match.confidence > best.confidence:
best = ar5iv_match

# For tables: prefer ar5iv > Docling > LaTeX
elif comp.type == ComponentType.TABLE:
ar5iv_match = _find_matching(ar5iv, comp, ComponentType.TABLE)
if ar5iv_match and ar5iv_match.confidence > best.confidence:
best = ar5iv_match

fused.append(best)

return fused

# --- Quality Scoring (9 dimensions) ---
def score_quality(components: list[UDOMComponent]) -> dict:
"""Score document across 9 quality dimensions. Grade A ≥ 0.85."""
scores = {
"structure": _score_structure(components),
"tables": _score_tables(components),
"math": _score_math(components),
"citations": _score_citations(components),
"images": _score_images(components),
"content_density": _score_density(components),
"latex_residual": _score_latex_residual(components),
"heading_hierarchy": _score_heading_hierarchy(components),
"bibliography": _score_bibliography(components),
}
scores["overall"] = sum(scores.values()) / len(scores)
scores["grade"] = "A" if scores["overall"] >= 0.85 else "B" if scores["overall"] >= 0.70 else "C"
return scores

# --- Main Pipeline ---
def process_paper(arxiv_id: str) -> UDOMDocument:
"""Full 3-source UDOM pipeline for a single paper."""
start = time.time()

print(f"[UDOM] Processing {arxiv_id}...")
docling_components = extract_docling(arxiv_id)
print(f" Docling: {len(docling_components)} components")

ar5iv_components = extract_ar5iv(arxiv_id)
print(f" ar5iv: {len(ar5iv_components)} components")

latex_components = extract_latex_source(arxiv_id)
print(f" LaTeX: {len(latex_components)} components")

fused = fuse_components(docling_components, ar5iv_components, latex_components)
quality = score_quality(fused)

elapsed = time.time() - start
print(f" Fused: {len(fused)} components | Grade: {quality['grade']} ({quality['overall']:.2f}) | {elapsed:.1f}s")

return UDOMDocument(
arxiv_id=arxiv_id,
title=_extract_title(fused),
components=fused,
quality_score=quality,
source_stats={
"docling": len(docling_components),
"ar5iv": len(ar5iv_components),
"latex": len(latex_components),
"fused": len(fused),
"elapsed_seconds": round(elapsed, 1),
}
)

if __name__ == "__main__":
# Test with a well-known LeCun paper
doc = process_paper("2003.05991") # Momentum Contrast (MoCo v2)
print(json.dumps(asdict(doc), indent=2, default=str))

Expected output:

[UDOM] Processing 2003.05991...
Docling: 187 components
ar5iv: 94 components
LaTeX: 156 components
Fused: 312 components | Grade: A (0.91) | 12.4s

Step 2: Realistic Workflow — Batch Processing with Quality Gates

Batch Pipeline with Orchestrator-Workers Pattern

#!/usr/bin/env python3
"""udom_batch.py — Process multiple papers with quality gates and audit trail."""

import asyncio
import json
from pathlib import Path
from datetime import datetime

class UDOMBatchPipeline:
"""
Batch processor implementing CODITECT's orchestrator-workers pattern.

- Orchestrator: manages paper queue, quality gates, retry logic
- Workers: parallel extraction across 3 sources
- Evaluator: 9-dimension quality scoring with Grade A threshold
"""

def __init__(self, output_dir: str, max_concurrent: int = 5):
self.output_dir = Path(output_dir)
self.output_dir.mkdir(parents=True, exist_ok=True)
self.max_concurrent = max_concurrent
self.audit_log = []

# Quality gate: minimum Grade A (0.85)
self.quality_threshold = 0.85
self.max_retries = 2

async def process_batch(self, arxiv_ids: list[str]) -> dict:
"""Process a batch of papers with bounded concurrency."""
semaphore = asyncio.Semaphore(self.max_concurrent)

async def process_with_semaphore(arxiv_id):
async with semaphore:
return await self._process_with_retry(arxiv_id)

tasks = [process_with_semaphore(aid) for aid in arxiv_ids]
results = await asyncio.gather(*tasks, return_exceptions=True)

# Generate batch report
report = self._generate_batch_report(arxiv_ids, results)
self._write_audit_log()
return report

async def _process_with_retry(self, arxiv_id: str) -> UDOMDocument:
"""Process single paper with retry on quality gate failure."""
for attempt in range(1, self.max_retries + 1):
doc = await asyncio.to_thread(process_paper, arxiv_id)

self.audit_log.append({
"timestamp": datetime.utcnow().isoformat(),
"arxiv_id": arxiv_id,
"attempt": attempt,
"grade": doc.quality_score["grade"],
"score": doc.quality_score["overall"],
"components": doc.source_stats["fused"],
"elapsed_s": doc.source_stats["elapsed_seconds"],
})

if doc.quality_score["overall"] >= self.quality_threshold:
self._write_document(doc)
return doc

# Quality gate failed — retry with enhanced extraction
print(f" ⚠ {arxiv_id}: Grade {doc.quality_score['grade']} "
f"({doc.quality_score['overall']:.2f}), retry {attempt}/{self.max_retries}")

# Max retries exhausted — write with quality warning
self._write_document(doc, quality_warning=True)
return doc

def _write_document(self, doc: UDOMDocument, quality_warning: bool = False):
"""Write UDOM document to markdown + JSON."""
md_path = self.output_dir / f"{doc.arxiv_id.replace('/', '_')}.md"
json_path = self.output_dir / f"{doc.arxiv_id.replace('/', '_')}.udom.json"

# Markdown for agent consumption
md_content = self._render_markdown(doc)
if quality_warning:
md_content = f"<!-- QUALITY WARNING: Below Grade A threshold -->\n{md_content}"
md_path.write_text(md_content)

# JSON for programmatic access
json_path.write_text(json.dumps(asdict(doc), indent=2, default=str))

def _generate_batch_report(self, arxiv_ids, results) -> dict:
"""Generate batch processing report with quality statistics."""
successful = [r for r in results if isinstance(r, UDOMDocument)]
failed = [r for r in results if isinstance(r, Exception)]

grades = {"A": 0, "B": 0, "C": 0}
for doc in successful:
grades[doc.quality_score["grade"]] += 1

return {
"total": len(arxiv_ids),
"successful": len(successful),
"failed": len(failed),
"grades": grades,
"grade_a_rate": grades["A"] / max(len(successful), 1),
"avg_score": sum(d.quality_score["overall"] for d in successful) / max(len(successful), 1),
"avg_time_s": sum(d.source_stats["elapsed_seconds"] for d in successful) / max(len(successful), 1),
"total_components": sum(d.source_stats["fused"] for d in successful),
}

# --- Run batch ---
async def main():
# LeCun papers — self-supervised learning cluster
papers = [
"2003.05991", # MoCo v2
"2104.14294", # Barlow Twins
"2006.07733", # VICReg
"2303.15256", # Previously Grade B, now Grade A
# ... extend to full 218-paper list
]

pipeline = UDOMBatchPipeline(
output_dir="./udom-batch-runs/run-20260209",
max_concurrent=5
)
report = await pipeline.process_batch(papers)
print(json.dumps(report, indent=2))

if __name__ == "__main__":
asyncio.run(main())

Expected output:

{
"total": 4,
"successful": 4,
"failed": 0,
"grades": {"A": 4, "B": 0, "C": 0},
"grade_a_rate": 1.0,
"avg_score": 0.90,
"avg_time_s": 18.3,
"total_components": 1247
}

Step 3: Deploy — Production Pipeline with CODITECT Integration

Docker Compose — Production Topology

# docker-compose.udom.yml
version: "3.9"

services:
udom-orchestrator:
build: ./udom-pipeline
environment:
- UDOM_QUALITY_THRESHOLD=0.85
- UDOM_MAX_CONCURRENT=10
- UDOM_MAX_RETRIES=2
- POSTGRES_URL=postgresql://coditect:secret@postgres:5432/udom
- NATS_URL=nats://nats:4222
depends_on:
- postgres
- nats
volumes:
- udom-output:/data/output

udom-worker-docling:
build: ./udom-pipeline
command: ["python", "-m", "udom.workers.docling"]
deploy:
replicas: 3 # Scale Docling workers independently
environment:
- NATS_URL=nats://nats:4222

udom-worker-ar5iv:
build: ./udom-pipeline
command: ["python", "-m", "udom.workers.ar5iv"]
deploy:
replicas: 2
environment:
- NATS_URL=nats://nats:4222

udom-worker-latex:
build: ./udom-pipeline
command: ["python", "-m", "udom.workers.latex"]
deploy:
replicas: 2
environment:
- NATS_URL=nats://nats:4222

udom-quality-scorer:
build: ./udom-pipeline
command: ["python", "-m", "udom.workers.scorer"]
environment:
- NATS_URL=nats://nats:4222
- POSTGRES_URL=postgresql://coditect:secret@postgres:5432/udom

udom-navigator:
build: ./udom-navigator
ports:
- "8080:80"
volumes:
- udom-output:/data/output:ro
environment:
- UDOM_DATA_DIR=/data/output

postgres:
image: postgres:16
environment:
POSTGRES_DB: udom
POSTGRES_USER: coditect
POSTGRES_PASSWORD: secret
volumes:
- pgdata:/var/lib/postgresql/data

nats:
image: nats:2.10
ports:
- "4222:4222"
- "8222:8222" # Monitoring

volumes:
udom-output:
pgdata:

CODITECT Agent Integration

# coditect_udom_agent.py — Research agent consuming UDOM pipeline output

from coditect.agents import BaseAgent, tool
from coditect.orchestrator import TaskClassification

class ResearchDiscoveryAgent(BaseAgent):
"""
CODITECT research agent that leverages UDOM pipeline
for autonomous scientific literature analysis.
"""

classification = TaskClassification.RESEARCH
model_routing = "opus" # Research tasks route to highest-capability model

@tool("search_equations")
async def search_equations(self, query: str, corpus: str = "lecun-ssl") -> list[dict]:
"""Search UDOM corpus for equations matching semantic query."""
results = await self.udom_store.search(
component_type="equation",
query=query,
corpus=corpus,
limit=20,
)
return [{"paper": r.arxiv_id, "equation": r.content, "context": r.metadata} for r in results]

@tool("compare_results")
async def compare_results(self, metric: str, papers: list[str]) -> dict:
"""Extract and compare experimental results across papers."""
tables = []
for paper_id in papers:
paper_tables = await self.udom_store.get_components(
arxiv_id=paper_id,
component_type="table",
)
tables.extend([t for t in paper_tables if metric.lower() in t.content.lower()])

return {"metric": metric, "papers": len(papers), "relevant_tables": len(tables), "data": tables}

@tool("synthesize_findings")
async def synthesize_findings(self, topic: str, max_papers: int = 20) -> str:
"""Synthesize findings across UDOM corpus on a given topic."""
relevant = await self.udom_store.semantic_search(
query=topic,
limit=max_papers,
)

# Agent uses structured UDOM components for synthesis
# — equations are machine-readable, tables are structured, citations are linked
return await self.llm.synthesize(
prompt=f"Synthesize findings on '{topic}' from these {len(relevant)} papers.",
context=[r.to_agent_context() for r in relevant],
)

Verify Deployment

# Start the stack
docker compose -f docker-compose.udom.yml up -d

# Submit a batch
curl -X POST http://localhost:8080/api/batch \
-H "Content-Type: application/json" \
-d '{"arxiv_ids": ["2003.05991", "2104.14294"], "quality_threshold": 0.85}'

# Check batch status
curl http://localhost:8080/api/batch/latest/status

# Access UDOM Navigator
open http://localhost:8080/navigator

# Expected: all papers Grade A, 10-35s each, 300+ components per paper

Version-Specific Gotchas

IssueDetailsWorkaround
Docling v2.x API changesDocumentConverter constructor args changed from v1Pin docling>=2.0,<3.0
ar5iv rate limiting>100 req/min triggers 429Use asyncio.Semaphore(5) + exponential backoff
Pandoc LaTeX errorsSome .tex files use custom packages--wrap=none --from=latex+raw_tex allows passthrough
KaTeX macro expansionPaper-specific \newcommand not in standard KaTeXBuild custom macro dict from LaTeX source preamble
PostgreSQL JSONB indexingUDOM components stored as JSONB; slow without GIN indexCREATE INDEX idx_udom_components ON udom_docs USING GIN (components)

Quick start covers: single paper extraction, batch processing with quality gates, production deployment with CODITECT integration.