Technical Design Document (TDD)
Technical Overview — This document provides detailed technical specifications for implementing the CODITECT Audio2Text system, including code structure, API contracts, algorithms, and implementation guidelines.
Technical Design Document (TDD)
CODITECT Audio2Text
Version: 1.0 Date: 2025-11-07 Status: Draft
1. Technical Overview
This document provides detailed technical specifications for implementing the CODITECT Audio2Text system, including code structure, API contracts, algorithms, and implementation guidelines.
2. Backend Implementation
2.1 Project Structure
backend/
├── src/
│ ├── api/
│ │ ├── __init__.py
│ │ ├── routes/
│ │ │ ├── transcription.py
│ │ │ ├── health.py
│ │ │ └── batch.py
│ │ └── dependencies.py
│ ├── services/
│ │ ├── __init__.py
│ │ ├── download_service.py
│ │ ├── transcription_service.py
│ │ ├── processing_service.py
│ │ └── job_manager.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── requests.py
│ │ ├── responses.py
│ │ └── schemas.py
│ ├── config/
│ │ ├── __init__.py
│ │ ├── settings.py
│ │ └── logging_config.py
│ ├── utils/
│ │ ├── __init__.py
│ │ ├── validators.py
│ │ ├── file_utils.py
│ │ └── exceptions.py
│ └── main.py
├── requirements.txt
├── setup.py
└── pyproject.toml
2.2 Core Modules
2.2.1 Download Service Implementation
File: backend/src/services/download_service.py
import yt_dlp
from pathlib import Path
from typing import Dict, Optional
import logging
logger = logging.getLogger(__name__)
class DownloadService:
    """Service for downloading audio from YouTube URLs using yt-dlp."""

    def __init__(self, output_dir: Path, cache_dir: Path):
        # output_dir: destination for extracted audio files.
        # cache_dir: stored for later use; not read by the methods below yet.
        self.output_dir = output_dir
        self.cache_dir = cache_dir

    def download_audio(
        self,
        url: str,
        format: str = "mp3",  # NOTE: shadows the builtin; name kept for API compatibility
        quality: str = "bestaudio"
    ) -> Dict[str, str]:
        """
        Download audio from a YouTube URL and extract it via FFmpeg.

        Args:
            url: YouTube video URL
            format: Output audio format (mp3, wav, m4a)
            quality: yt-dlp format selector for the source stream

        Returns:
            Dict with 'file_path', 'title', 'duration', 'metadata'

        Raises:
            DownloadError: If download or extraction fails
        """
        ydl_opts = {
            'format': quality,
            # Name files by video id so re-downloads are deterministic.
            'outtmpl': str(self.output_dir / '%(id)s.%(ext)s'),
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': format,
                'preferredquality': '192',
            }],
            'quiet': False,
            'no_warnings': False,
            'extract_flat': False,
        }
        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(url, download=True)
                return {
                    # Post-processing re-muxes to the requested codec/extension.
                    'file_path': str(self.output_dir / f"{info['id']}.{format}"),
                    'title': info.get('title', 'Unknown'),
                    # yt-dlp may report duration/description as None even when
                    # the keys exist, so `or` is safer than a get() default.
                    'duration': info.get('duration') or 0,
                    'metadata': {
                        'uploader': info.get('uploader'),
                        'upload_date': info.get('upload_date'),
                        'description': (info.get('description') or '')[:500]
                    }
                }
        except Exception as e:
            logger.error(f"Download failed for {url}: {str(e)}")
            # Chain the original exception so the root cause stays visible.
            raise DownloadError(f"Failed to download: {str(e)}") from e

    def validate_url(self, url: str) -> bool:
        """Validate if URL is a resolvable video URL (performs a network probe)."""
        try:
            with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
                ydl.extract_info(url, download=False)
                return True
        except Exception:
            # Was a bare `except:`, which would also swallow KeyboardInterrupt
            # and SystemExit; Exception is the widest safe net here.
            return False
2.2.2 Transcription Service Implementation
File: backend/src/services/transcription_service.py
import whisper
from pathlib import Path
from typing import Dict, Optional, List
import logging
import torch
logger = logging.getLogger(__name__)
class TranscriptionService:
    """Service for transcribing audio using OpenAI Whisper."""

    # Approximate VRAM need (GB) and speed relative to 'large' (higher = faster).
    MODEL_CONFIGS = {
        'tiny': {'vram': 1, 'speed_factor': 10},
        'base': {'vram': 1, 'speed_factor': 7},
        'small': {'vram': 2, 'speed_factor': 4},
        'medium': {'vram': 5, 'speed_factor': 2},
        'large': {'vram': 10, 'speed_factor': 1}
    }

    def __init__(self, model_name: str = "base", device: Optional[str] = None):
        """
        Initialize transcription service.

        Args:
            model_name: Whisper model size (tiny, base, small, medium, large)
            device: Device to use ('cuda', 'cpu', or None for auto-detect)
        """
        self.model_name = model_name
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.model = None  # loaded lazily by load_model()

    def load_model(self):
        """Lazy-load the Whisper model; safe to call repeatedly."""
        if self.model is None:
            logger.info(f"Loading Whisper model '{self.model_name}' on {self.device}")
            self.model = whisper.load_model(self.model_name, device=self.device)
            logger.info("Model loaded successfully")

    def transcribe(
        self,
        audio_path: Path,
        language: Optional[str] = None,
        task: str = "transcribe",
        output_formats: Optional[List[str]] = None
    ) -> Dict:
        """
        Transcribe audio file to text.

        Args:
            audio_path: Path to audio file
            language: Language code (e.g. 'en', 'es') or None for auto-detect
            task: 'transcribe' or 'translate' (to English)
            output_formats: Output formats to generate; defaults to ["txt", "srt"].
                A None default replaces the original shared mutable default list.

        Returns:
            Dict with 'text', 'segments', 'language', 'output_files'

        Raises:
            TranscriptionError: if Whisper fails or an output format is unsupported
        """
        if output_formats is None:
            output_formats = ["txt", "srt"]
        self.load_model()
        options = {
            'task': task,
            'verbose': False,
        }
        if language:
            options['language'] = language
        try:
            logger.info(f"Transcribing {audio_path}")
            result = self.model.transcribe(str(audio_path), **options)
            # Persist one file per requested format, keyed by format name.
            output_files = {
                fmt: self._save_output(result, audio_path.stem, fmt)
                for fmt in output_formats
            }
            return {
                'text': result['text'],
                'segments': result['segments'],
                'language': result['language'],
                'output_files': output_files
            }
        except Exception as e:
            logger.error(f"Transcription failed: {str(e)}")
            # Chain the cause so debugging retains the original traceback.
            raise TranscriptionError(f"Transcription failed: {str(e)}") from e

    def _save_output(self, result: Dict, base_name: str, format: str) -> Path:
        """Save the transcription in the given format and return the file path.

        Raises:
            ValueError: for an unsupported format (the original fell through and
            raised a confusing UnboundLocalError on `output_path`).
        """
        output_dir = Path("data/output")  # TODO: source from settings.OUTPUT_DIR
        output_dir.mkdir(parents=True, exist_ok=True)
        if format == "txt":
            output_path = output_dir / f"{base_name}.txt"
            output_path.write_text(result['text'], encoding="utf-8")
        elif format == "srt":
            output_path = output_dir / f"{base_name}.srt"
            srt_content = self._generate_srt(result['segments'])
            output_path.write_text(srt_content, encoding="utf-8")
        elif format == "json":
            import json
            output_path = output_dir / f"{base_name}.json"
            output_path.write_text(json.dumps(result, indent=2), encoding="utf-8")
        elif format == "vtt":
            output_path = output_dir / f"{base_name}.vtt"
            vtt_content = self._generate_vtt(result['segments'])
            output_path.write_text(vtt_content, encoding="utf-8")
        else:
            raise ValueError(f"Unsupported output format: {format}")
        return output_path

    def _generate_srt(self, segments: List[Dict]) -> str:
        """Generate SRT subtitle content (1-based cues, comma millisecond sep)."""
        srt_lines = []
        for i, segment in enumerate(segments, start=1):
            start = self._format_timestamp(segment['start'])
            end = self._format_timestamp(segment['end'])
            text = segment['text'].strip()
            srt_lines.append(f"{i}")
            srt_lines.append(f"{start} --> {end}")
            srt_lines.append(text)
            srt_lines.append("")  # blank line terminates each cue
        return "\n".join(srt_lines)

    def _generate_vtt(self, segments: List[Dict]) -> str:
        """Generate WebVTT subtitle content (header + un-numbered cues)."""
        vtt_lines = ["WEBVTT", ""]
        for segment in segments:
            start = self._format_timestamp(segment['start'], vtt=True)
            end = self._format_timestamp(segment['end'], vtt=True)
            text = segment['text'].strip()
            vtt_lines.append(f"{start} --> {end}")
            vtt_lines.append(text)
            vtt_lines.append("")
        return "\n".join(vtt_lines)

    def _format_timestamp(self, seconds: float, vtt: bool = False) -> str:
        """Format seconds as HH:MM:SS,mmm (SRT) or HH:MM:SS.mmm (VTT)."""
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        millis = int((seconds % 1) * 1000)
        sep = "." if vtt else ","  # the only difference between the two formats
        return f"{hours:02d}:{minutes:02d}:{secs:02d}{sep}{millis:03d}"

    @classmethod
    def get_recommended_model(cls, available_vram_gb: float) -> str:
        """Return the largest model whose VRAM need fits the given budget."""
        for model in ['large', 'medium', 'small', 'base', 'tiny']:
            if cls.MODEL_CONFIGS[model]['vram'] <= available_vram_gb:
                return model
        return 'tiny'  # fallback: smallest model even if VRAM looks insufficient
2.2.3 Job Manager Implementation
File: backend/src/services/job_manager.py
import uuid
from typing import Dict, Optional, List
from enum import Enum
from datetime import datetime
from dataclasses import dataclass, asdict
import asyncio
from collections import deque
class JobStatus(Enum):
    """Lifecycle states for a transcription job."""
    PENDING = "pending"
    DOWNLOADING = "downloading"
    PROCESSING = "processing"
    TRANSCRIBING = "transcribing"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"


@dataclass
class Job:
    """A single transcription job and its mutable runtime state."""
    id: str
    url: str
    status: JobStatus
    created_at: datetime
    updated_at: datetime
    progress: float = 0.0
    error: Optional[str] = None
    result: Optional[Dict] = None
    metadata: Optional[Dict] = None

    def to_dict(self):
        """Serialize to a JSON-friendly dict (enum -> value, datetimes -> ISO)."""
        payload = asdict(self)
        payload.update(
            status=self.status.value,
            created_at=self.created_at.isoformat(),
            updated_at=self.updated_at.isoformat(),
        )
        return payload


class JobManager:
    """Manages transcription job queue and execution."""

    def __init__(self, max_concurrent: int = 3):
        self.jobs: Dict[str, Job] = {}        # job id -> Job
        self.max_concurrent = max_concurrent  # concurrency cap for executors
        self.active_jobs: int = 0             # currently running count
        self.queue = deque()                  # FIFO of pending job ids

    def create_job(self, url: str, metadata: Optional[Dict] = None) -> Job:
        """Register a new PENDING job and enqueue its id."""
        new_job = Job(
            id=str(uuid.uuid4()),
            url=url,
            status=JobStatus.PENDING,
            created_at=datetime.now(),
            updated_at=datetime.now(),
            metadata=metadata or {},
        )
        self.jobs[new_job.id] = new_job
        self.queue.append(new_job.id)
        return new_job

    def get_job(self, job_id: str) -> Optional[Job]:
        """Look up a job by id; None when unknown."""
        return self.jobs.get(job_id)

    def update_job_status(
        self,
        job_id: str,
        status: JobStatus,
        progress: Optional[float] = None,
        error: Optional[str] = None
    ):
        """Transition a job to `status`, optionally recording progress/error."""
        target = self.jobs.get(job_id)
        if target is None:
            return  # unknown ids are ignored, matching the other mutators
        target.status = status
        target.updated_at = datetime.now()
        if progress is not None:
            target.progress = progress
        if error:
            target.error = error

    def complete_job(self, job_id: str, result: Dict):
        """Mark a job COMPLETED, store its result, and pin progress at 100%."""
        target = self.jobs.get(job_id)
        if target is None:
            return
        target.status = JobStatus.COMPLETED
        target.progress = 100.0
        target.result = result
        target.updated_at = datetime.now()

    def fail_job(self, job_id: str, error: str):
        """Mark a job FAILED with the given error message."""
        target = self.jobs.get(job_id)
        if target is None:
            return
        target.status = JobStatus.FAILED
        target.error = error
        target.updated_at = datetime.now()

    def get_all_jobs(self) -> List[Job]:
        """Snapshot of every tracked job."""
        return [*self.jobs.values()]

    def get_pending_jobs(self) -> List[Job]:
        """All jobs still waiting to start."""
        return [job for job in self.jobs.values() if job.status is JobStatus.PENDING]
2.3 API Endpoints
2.3.1 REST API Specification
File: backend/src/api/routes/transcription.py
from fastapi import APIRouter, HTTPException, BackgroundTasks
from pydantic import BaseModel, HttpUrl
from typing import Optional, List
# All transcription endpoints are mounted under /api/transcribe.
router = APIRouter(prefix="/api/transcribe", tags=["transcription"])


class TranscribeRequest(BaseModel):
    # Request payload for creating a transcription job.
    url: HttpUrl  # YouTube video URL; format validated by pydantic
    model: str = "base"  # Whisper model size: tiny/base/small/medium/large
    language: Optional[str] = None  # language code; None means auto-detect
    # Pydantic copies field defaults per instance, so this mutable default is safe.
    output_formats: List[str] = ["txt", "srt"]


class TranscribeResponse(BaseModel):
    # Returned immediately on job creation; transcription runs in the background.
    job_id: str
    status: str
    message: str


class JobStatusResponse(BaseModel):
    # Polled status of an existing job.
    job_id: str
    status: str
    progress: float  # 0.0 - 100.0
    error: Optional[str] = None  # populated only for failed jobs
    result: Optional[dict] = None  # populated only for completed jobs


@router.post("/", response_model=TranscribeResponse)
async def create_transcription_job(
    request: TranscribeRequest,
    background_tasks: BackgroundTasks
):
    """
    Create a new transcription job

    - **url**: YouTube video URL
    - **model**: Whisper model size (tiny, base, small, medium, large)
    - **language**: Optional language code (auto-detect if not specified)
    - **output_formats**: List of output formats (txt, srt, json, vtt)
    """
    # Implementation here
    # NOTE(review): a bare `pass` returns None, which will fail response_model
    # validation at runtime — the implementation must return TranscribeResponse.
    pass


@router.get("/{job_id}", response_model=JobStatusResponse)
async def get_job_status(job_id: str):
    """Get the status of a transcription job"""
    # Implementation here
    # Expected to raise HTTPException(404) for unknown job ids (error code E006).
    pass


@router.delete("/{job_id}")
async def cancel_job(job_id: str):
    """Cancel a pending or running job"""
    # Implementation here
    pass
2.4 Configuration Management
File: backend/src/config/settings.py
from pydantic_settings import BaseSettings
from pathlib import Path
from typing import Optional
class Settings(BaseSettings):
    """Application configuration; every field can be overridden via environment
    variables or the .env file named in Config below."""

    # Application
    APP_NAME: str = "CODITECT Audio2Text"
    VERSION: str = "1.0.0"
    DEBUG: bool = False

    # Server
    HOST: str = "0.0.0.0"  # bind all interfaces (containerized deployment)
    PORT: int = 8000
    WORKERS: int = 1

    # Paths — derived paths are computed from DATA_DIR at class-definition time,
    # so overriding DATA_DIR via env does NOT move the subdirectories.
    DATA_DIR: Path = Path("data")
    INPUT_DIR: Path = DATA_DIR / "input"
    OUTPUT_DIR: Path = DATA_DIR / "output"
    CACHE_DIR: Path = DATA_DIR / "cache"
    MODELS_DIR: Path = DATA_DIR / "models"

    # Whisper
    DEFAULT_MODEL: str = "base"
    DEVICE: Optional[str] = None  # 'cuda', 'cpu', or None for auto

    # Processing
    MAX_CONCURRENT_JOBS: int = 3
    MAX_FILE_SIZE_MB: int = 2000
    CACHE_RETENTION_DAYS: int = 7

    # Security
    # NOTE(review): plain `list` skips element validation; presumably list[str]
    # is intended — confirm before tightening, as it changes pydantic behavior.
    ALLOWED_DOMAINS: list = ["youtube.com", "youtu.be"]
    MAX_RETRIES: int = 3

    class Config:
        # Local development overrides are read from a .env file.
        env_file = ".env"


# Module-level singleton consumed by the rest of the backend.
settings = Settings()
3. Frontend Implementation
3.1 Project Structure
frontend/
├── src/
│ ├── components/
│ │ ├── TranscriptionForm.tsx
│ │ ├── JobList.tsx
│ │ ├── JobDetail.tsx
│ │ ├── ProgressBar.tsx
│ │ └── Settings.tsx
│ ├── pages/
│ │ ├── home.tsx
│ │ ├── jobs.tsx
│ │ └── about.tsx
│ ├── services/
│ │ ├── api.ts
│ │ └── websocket.ts
│ ├── styles/
│ │ └── theme.ts
│ ├── utils/
│ │ └── helpers.ts
│ ├── app.tsx
│ └── main.tsx
├── package.json
└── tsconfig.json
3.2 Key Components
File: frontend/src/services/api.ts
import axios from 'axios';
// Backend base URL; configurable at build time via the Vite env variable.
const API_BASE_URL = import.meta.env.VITE_API_URL || 'http://localhost:8000';

// Shared axios instance with a 30 s request timeout.
const api = axios.create({
  baseURL: API_BASE_URL,
  timeout: 30000,
});

// Payload for creating a transcription job (mirrors the backend TranscribeRequest).
export interface TranscribeRequest {
  url: string;
  model?: string;           // Whisper model size; backend defaults to "base"
  language?: string;        // language code; omitted means auto-detect
  output_formats?: string[]; // e.g. ["txt", "srt"]
}

// Job record as returned by the backend status endpoints.
export interface Job {
  id: string;
  url: string;
  status: string;   // pending | downloading | processing | transcribing | completed | failed | cancelled
  progress: number; // 0 - 100
  error?: string;
  result?: any;
  created_at: string; // ISO-8601 timestamp
  updated_at: string; // ISO-8601 timestamp
}

export const transcriptionAPI = {
  // NOTE(review): the backend mounts POST at "/api/transcribe/" (router prefix
  // plus "/"); this call omits the trailing slash — confirm redirect handling.
  createJob: async (request: TranscribeRequest): Promise<{ job_id: string }> => {
    const response = await api.post('/api/transcribe', request);
    return response.data;
  },

  getJobStatus: async (jobId: string): Promise<Job> => {
    const response = await api.get(`/api/transcribe/${jobId}`);
    return response.data;
  },

  // NOTE(review): "/api/jobs" is not defined in the backend routes shown in
  // this document — verify the endpoint exists before relying on it.
  getAllJobs: async (): Promise<Job[]> => {
    const response = await api.get('/api/jobs');
    return response.data;
  },

  cancelJob: async (jobId: string): Promise<void> => {
    await api.delete(`/api/transcribe/${jobId}`);
  },
};

export default api;
4. Core Library Implementation
4.1 Core Module Structure
core/
├── src/
│ ├── transcription/
│ │ ├── __init__.py
│ │ ├── whisper_wrapper.py
│ │ └── model_manager.py
│ ├── download/
│ │ ├── __init__.py
│ │ └── ytdlp_wrapper.py
│ ├── processing/
│ │ ├── __init__.py
│ │ ├── audio_converter.py
│ │ └── format_handlers.py
│ └── shared/
│ ├── __init__.py
│ ├── types.py
│ └── utils.py
└── setup.py
5. Database Schema (Optional - for multi-user)
-- Job persistence table (optional; only needed for multi-user deployments).
-- Mirrors the in-memory Job dataclass used by the backend JobManager.
CREATE TABLE jobs (
    id UUID PRIMARY KEY,
    user_id INTEGER,                    -- owning user; no FK constraint declared here
    url TEXT NOT NULL,
    status VARCHAR(20) NOT NULL,        -- JobStatus enum value (pending/.../cancelled)
    progress FLOAT DEFAULT 0.0,         -- 0.0 - 100.0
    error TEXT,                         -- populated only for failed jobs
    result JSONB,                       -- transcription output for completed jobs
    metadata JSONB,
    created_at TIMESTAMP DEFAULT NOW(),
    -- NOTE(review): no trigger shown to refresh updated_at on UPDATE; the
    -- application layer must set it explicitly.
    updated_at TIMESTAMP DEFAULT NOW()
);

-- Indexes for the common lookup patterns: by owner, by state, by recency.
CREATE INDEX idx_jobs_user_id ON jobs(user_id);
CREATE INDEX idx_jobs_status ON jobs(status);
CREATE INDEX idx_jobs_created_at ON jobs(created_at);
6. Build and Deployment
6.1 Requirements Files
backend/requirements.txt:
fastapi==0.104.1
uvicorn[standard]==0.24.0
pydantic==2.5.0
pydantic-settings==2.1.0
openai-whisper==20231117
yt-dlp==2023.11.16
ffmpeg-python==0.2.0
python-multipart==0.0.6
websockets==12.0
torch==2.1.0
6.2 Docker Configuration
Dockerfile:
FROM python:3.10-slim

# Install system dependencies (ffmpeg is required by yt-dlp's audio extraction).
RUN apt-get update && apt-get install -y \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python dependencies first so this layer caches across code changes.
COPY backend/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application
COPY backend/src ./src
COPY config ./config

# Create data directories.
# FIX: Docker's default RUN shell is /bin/sh, which does not perform brace
# expansion — `mkdir -p data/{input,output,cache,models}` would create a single
# literal directory named "data/{input,output,cache,models}". Spell them out.
RUN mkdir -p data/input data/output data/cache data/models

EXPOSE 8000

CMD ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "8000"]
7. Testing Strategy
7.1 Unit Tests
- Test individual service methods
- Mock external dependencies (yt-dlp, whisper)
- Validate data models and schemas
7.2 Integration Tests
- Test API endpoints end-to-end
- Test full transcription workflow
- Test error handling and retries
7.3 Performance Tests
- Benchmark transcription speeds
- Test concurrent job handling
- Memory usage profiling
8. Error Codes
| Code | Description | HTTP Status |
|---|---|---|
| E001 | Invalid URL format | 400 |
| E002 | Download failed | 500 |
| E003 | Transcription failed | 500 |
| E004 | Model not available | 503 |
| E005 | Resource exhausted | 429 |
| E006 | Job not found | 404 |
| E007 | Invalid model name | 400 |
Revision History
| Version | Date | Author | Changes |
|---|---|---|---|
| 1.0 | 2025-11-07 | System | Initial draft |