Heartbeat Mechanism - CODITECT Licensing Platform
Overview
The heartbeat mechanism maintains active license sessions by periodically pinging the License API. This prevents sessions from expiring during active use and enables automatic cleanup of zombie sessions (crashed clients).
Sequence Diagram
Implementation
Client-Side (CODITECT CLI)
Python Implementation:
import asyncio
import aiohttp
import logging
from datetime import datetime, timedelta
from typing import Optional
class HeartbeatManager:
    """Manages license heartbeat to keep session alive.

    A background asyncio task sends a PUT to the License API every
    ``interval`` seconds.  Failed attempts are retried with exponential
    backoff (5s, 10s, 20s); once the retries are exhausted the manager keeps
    probing once a minute ("offline mode") until ``grace_period`` seconds
    have passed since the last successful heartbeat, at which point the
    session is treated as expired.

    All lifecycle methods are coroutines and must be awaited from a running
    event loop.
    """

    def __init__(
        self,
        session_id: str,
        api_url: str = "https://auth.coditect.ai",
        interval: int = 300,  # 5 minutes
        grace_period: int = 86400,  # 24 hours
    ):
        """Store configuration; no network activity happens until start().

        Args:
            session_id: License session identifier sent with every heartbeat.
            api_url: Base URL of the License API.
            interval: Seconds between regular heartbeats.
            grace_period: Seconds the client may keep running without a
                successful heartbeat before the license is considered invalid.
        """
        self.session_id = session_id
        self.api_url = api_url
        self.interval = interval
        self.grace_period = grace_period
        self.task: Optional[asyncio.Task] = None
        self.running = False
        self.last_success: Optional[datetime] = None
        self.retry_count = 0
        self.max_retries = 3

    async def start(self):
        """Start the heartbeat background task (no-op if already running)."""
        if self.running:
            logging.warning("Heartbeat already running")
            return
        self.running = True
        # The grace period is measured from here until the first real success.
        self.last_success = datetime.utcnow()
        self.task = asyncio.create_task(self._heartbeat_loop())
        logging.info(f"Heartbeat started (interval: {self.interval}s)")

    async def stop(self):
        """Stop the heartbeat background task and wait for it to finish."""
        if not self.running:
            return
        self.running = False
        task, self.task = self.task, None
        if task is not None:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass
        logging.info("Heartbeat stopped")

    async def _heartbeat_loop(self):
        """Main heartbeat loop - runs every `interval` seconds.

        After a failed attempt the backoff delay has already been awaited by
        the error handler, so the next attempt is made immediately instead of
        waiting another full interval (which would overshoot the server TTL).
        """
        retry_now = False
        while self.running:
            try:
                if not retry_now:
                    await asyncio.sleep(self.interval)
                # Only an explicit False means "failed, backoff already done".
                retry_now = (await self._send_heartbeat()) is False
            except asyncio.CancelledError:
                break
            except Exception as e:
                logging.error(f"Heartbeat loop error: {e}")
                await self._handle_error(e)
                retry_now = True

    async def _send_heartbeat(self):
        """Send heartbeat to License API.

        Returns:
            True when the server accepted the heartbeat, False when the
            attempt failed (the appropriate wait or expiry handling has
            already been performed before returning).
        """
        url = f"{self.api_url}/api/v1/licenses/heartbeat"
        data = None
        error_msg = ""
        try:
            async with aiohttp.ClientSession() as session:
                async with session.put(
                    url,
                    json={"session_id": self.session_id},
                    timeout=aiohttp.ClientTimeout(total=10),
                ) as resp:
                    status_code = resp.status
                    if status_code == 200:
                        data = await resp.json()
                    elif status_code not in (404, 429):
                        error_msg = await resp.text()
        except asyncio.TimeoutError:
            logging.warning("Heartbeat timeout")
            await self._handle_error(TimeoutError("Heartbeat timeout"))
            return False
        except aiohttp.ClientError as e:
            logging.warning(f"Heartbeat network error: {e}")
            await self._handle_error(e)
            return False

        # The HTTP session is closed at this point, so none of the waits
        # below keep a connection open.
        if status_code == 200:
            self.last_success = datetime.utcnow()
            self.retry_count = 0
            # A missing field must not turn a successful heartbeat into an error.
            expires_in = data.get("expires_in") if isinstance(data, dict) else None
            logging.debug(f"Heartbeat success (expires in {expires_in}s)")
            return True
        if status_code == 404:
            # Session expired on server
            logging.error("Session expired on server")
            await self._handle_session_expired()
            return False
        if status_code == 429:
            # Rate limited
            logging.warning("Heartbeat rate limited")
            await asyncio.sleep(60)  # Wait 1 minute
            return False
        logging.error(f"Heartbeat failed: {status_code} - {error_msg}")
        await self._handle_error(Exception(error_msg))
        return False

    async def _handle_error(self, error: Exception):
        """Handle heartbeat errors with exponential backoff.

        Sleeps for the backoff delay before returning.  Once ``max_retries``
        is exceeded, either enters offline mode (still inside the grace
        period) or declares the session expired.
        """
        self.retry_count += 1
        if self.retry_count <= self.max_retries:
            # Exponential backoff: 5s, 10s, 20s
            wait_time = min(5 * (2 ** (self.retry_count - 1)), 60)
            logging.info(f"Retrying heartbeat in {wait_time}s (attempt {self.retry_count})")
            await asyncio.sleep(wait_time)
            return

        # Max retries exceeded - check grace period
        time_since_success = (datetime.utcnow() - self.last_success).total_seconds()
        if time_since_success < self.grace_period:
            # Still within grace period - enter offline mode
            remaining = self.grace_period - time_since_success
            logging.warning(
                f"Entering offline mode ({remaining/3600:.1f} hours remaining)"
            )
            self.retry_count = 0  # Reset for next attempt
            await asyncio.sleep(60)  # Try again in 1 minute
        else:
            # Grace period expired
            logging.error("Grace period expired - license invalid")
            await self._handle_session_expired()

    async def _handle_session_expired(self):
        """Handle session expiration: stop the loop and notify the application."""
        self.running = False
        logging.error("Session expired - stopping CODITECT")
        # Emit event to main application
        from coditect.events import emit_event
        emit_event("LICENSE_EXPIRED", {
            "session_id": self.session_id,
            "timestamp": datetime.utcnow().isoformat()
        })
# Usage in CODITECT CLI
async def main():
    """Example CLI entry point: keep the license alive while CODITECT runs.

    Must be a coroutine because it awaits the heartbeat manager and the
    application; run it with ``asyncio.run(main())``.
    """
    # After acquiring license
    session_id = "abc123-session-id"
    heartbeat = HeartbeatManager(
        session_id=session_id,
        interval=300,  # 5 minutes
        grace_period=86400  # 24 hours offline
    )
    await heartbeat.start()
    # CODITECT runs...
    try:
        await run_coditect()
    finally:
        # Always stop the background task, even if the application crashes.
        await heartbeat.stop()
Server-Side (License API)
Django REST Framework Endpoint:
from rest_framework.decorators import api_view, permission_classes
from rest_framework.permissions import IsAuthenticated
from rest_framework.response import Response
from rest_framework import status
from django_redis import get_redis_connection
from django.utils import timezone
import logging
@api_view(['PUT'])
@permission_classes([IsAuthenticated])
def heartbeat(request):
    """
    Extend session TTL to keep license alive.

    Request body:
    {
        "session_id": "abc123-session-id"
    }

    Response:
    {
        "status": "alive",
        "expires_in": 360,
        "timestamp": "2025-11-23T20:00:00Z"
    }

    Returns 400 when ``session_id`` is missing and 404 when the session key
    does not exist (never created, or already expired).
    """
    # Get session_id from request
    session_id = request.data.get('session_id')
    if not session_id:
        return Response(
            {"detail": "session_id is required"},
            status=status.HTTP_400_BAD_REQUEST
        )

    # Get Redis connection
    redis_client = get_redis_connection()
    session_key = f"session:{session_id}"

    # Extend TTL to 6 minutes (360 seconds)
    # This gives a 1-minute buffer after the 5-minute heartbeat interval
    ttl = 360

    # EXPIRE reports whether the key existed, so it doubles as the existence
    # check.  A separate EXISTS call would leave a window in which the key
    # can expire, after which the HSET below would recreate it without any
    # TTL and the session would never be cleaned up.
    if not redis_client.expire(session_key, ttl):
        logging.warning(f"Heartbeat for non-existent session: {session_id}")
        return Response(
            {"detail": "Session not found or expired"},
            status=status.HTTP_404_NOT_FOUND
        )

    # Use one timestamp for both the stored value and the response.
    now = timezone.now().isoformat()

    # Optional: Update last_heartbeat timestamp
    # NOTE(review): HSET assumes the session key holds a Redis hash - confirm
    # against the code that creates the session.
    redis_client.hset(session_key, "last_heartbeat", now)

    logging.info(f"Heartbeat successful: {session_id}")
    return Response(
        {
            "status": "alive",
            "expires_in": ttl,
            "timestamp": now
        },
        status=status.HTTP_200_OK
    )
Redis State:
# Before heartbeat (TTL = 120s remaining)
session:abc123-session-id = "1" (TTL: 120s)
# After heartbeat (TTL reset to 360s)
session:abc123-session-id = "1" (TTL: 360s)
Timing Analysis
Normal Operation
Timeline:
┌─────────────────────────────────────────────────────────────┐
│ T=0 T=300s T=600s T=900s T=1200s T=1500s │
│ Acquire → HB1 → HB2 → HB3 → HB4 → HB5 │
│ (extend) (extend) (extend) (extend) (extend) │
│ │
│ Session TTL: 360s at each heartbeat │
└─────────────────────────────────────────────────────────────┘
Legend:
- Acquire: License acquisition (session created with 360s TTL)
- HB1-5: Heartbeat requests (reset TTL to 360s)
Key Timings:
- Heartbeat interval: 300s (5 minutes)
- Session TTL: 360s (6 minutes)
- Safety buffer: 60s (1 minute)
Why 60s buffer?
- Network delays: 1-5s typical
- Clock drift: 1-10s possible
- Retry attempts: Up to 35s of backoff (5s + 10s + 20s)
- Total buffer: 60s covers all edge cases
Missed Heartbeat Scenario
Timeline (Network Outage):
┌─────────────────────────────────────────────────────────────┐
│ T=0 T=300s T=305s T=315s T=335s T=360s │
│ Acquire → HB1 → Retry1 → Retry2 → Retry3 → Session │
│ (fail) (+5s) (+10s) (+20s) Expires │
│ │
│ After Retry3 fails: Enter offline mode (if within 24h grace) │
└─────────────────────────────────────────────────────────────┘
Max Offline Time:
- Grace period: 24 hours (86400s)
- After 24h: License expires, must reacquire
Error Handling
1. Session Not Found (404)
Cause: Session expired due to missed heartbeats or server cleanup
Response:
{
"detail": "Session not found or expired"
}
Client Behavior:
- Stop heartbeat loop
- Emit `LICENSE_EXPIRED` event
- Show user: "Your license session expired. Click to reacquire."
- Option to reacquire license or exit
2. Rate Limited (429)
Cause: Too many heartbeat requests (malicious or bug)
Response:
{
"detail": "Too many requests. Retry after 60 seconds.",
"retry_after": 60
}
Client Behavior:
- Wait 60 seconds before next attempt
- Log warning
- Continue normal operation (session not expired yet)
3. Network Timeout
Cause: Network connectivity issues
Client Behavior:
Retry 1: Wait 5s → Attempt 2
Retry 2: Wait 10s → Attempt 3
Retry 3: Wait 20s → Attempt 4
If all fail:
if (time_since_last_success < 24h):
Enter offline mode
Continue working
else:
Expire license
Stop CODITECT
4. Server Error (500)
Cause: Database or Redis failure on server
Response:
{
"detail": "Internal server error"
}
Client Behavior:
- Retry with exponential backoff
- Log error for debugging
- Enter offline mode if retries fail
Monitoring & Observability
Server-Side Metrics
Prometheus Metrics:
from prometheus_client import Counter, Histogram

# Heartbeat request counter
heartbeat_requests = Counter(
    'heartbeat_requests_total',
    'Total heartbeat requests',
    ['status']  # success, failed, expired
)

# Heartbeat latency histogram
heartbeat_latency = Histogram(
    'heartbeat_latency_seconds',
    'Heartbeat request latency'
)


@api_view(['PUT'])
@permission_classes([IsAuthenticated])
def heartbeat(request):
    """Heartbeat endpoint instrumented with Prometheus metrics.

    Every request is timed by ``heartbeat_latency`` and counted once in
    ``heartbeat_requests`` under one of the labels ``success``, ``expired``
    (session key missing) or ``failed`` (bad request or unexpected error).
    """
    with heartbeat_latency.time():
        try:
            # Get session_id from request
            session_id = request.data.get('session_id')
            if not session_id:
                outcome = 'failed'
                response = Response(
                    {"detail": "session_id is required"},
                    status=status.HTTP_400_BAD_REQUEST
                )
            else:
                redis_client = get_redis_connection()
                session_key = f"session:{session_id}"
                ttl = 360
                # EXPIRE reports whether the key existed.  The 404 is a
                # returned response, not an exception, so it has to be
                # counted here rather than in the except branch.
                if not redis_client.expire(session_key, ttl):
                    outcome = 'expired'
                    response = Response(
                        {"detail": "Session not found or expired"},
                        status=status.HTTP_404_NOT_FOUND
                    )
                else:
                    now = timezone.now().isoformat()
                    redis_client.hset(session_key, "last_heartbeat", now)
                    outcome = 'success'
                    response = Response(
                        {"status": "alive", "expires_in": ttl, "timestamp": now},
                        status=status.HTTP_200_OK
                    )
        except Exception:
            # Redis outage or any other unexpected error: count it, then let
            # the framework turn it into a 500.
            heartbeat_requests.labels(status='failed').inc()
            raise
        heartbeat_requests.labels(status=outcome).inc()
        return response
Logging:
# Successful heartbeat
# Timestamps use django.utils.timezone, the same clock the endpoint uses.
# NOTE(review): tenant_id is not defined in the endpoint shown in this
# document - confirm where it comes from.
logging.info(
    "Heartbeat successful",
    extra={
        "session_id": session_id,
        "tenant_id": tenant_id,
        "ttl_remaining": 360,
        "timestamp": timezone.now().isoformat()
    }
)
# Failed heartbeat
logging.warning(
    "Heartbeat failed - session not found",
    extra={
        "session_id": session_id,
        "timestamp": timezone.now().isoformat()
    }
)
Client-Side Telemetry
Local Logging:
[2025-11-23 20:00:00] INFO: Heartbeat started (session: abc123, interval: 300s)
[2025-11-23 20:05:00] DEBUG: Heartbeat success (expires in 360s)
[2025-11-23 20:10:00] DEBUG: Heartbeat success (expires in 360s)
[2025-11-23 20:15:00] WARNING: Heartbeat timeout - retrying in 5s
[2025-11-23 20:15:05] DEBUG: Heartbeat success (expires in 360s)
Testing
Unit Tests
import sys
from unittest.mock import Mock

from rest_framework import status
from rest_framework.test import APIRequestFactory, force_authenticate


def call_heartbeat(monkeypatch, redis_mock, session_id):
    """Invoke the heartbeat view with an authenticated PUT and a stubbed Redis."""
    # Patch get_redis_connection in whichever module defines the view, so
    # the test does not depend on the module's import path.
    view_module = sys.modules[heartbeat.__module__]
    monkeypatch.setattr(
        view_module, "get_redis_connection", lambda *args, **kwargs: redis_mock
    )
    request = APIRequestFactory().put(
        "/api/v1/licenses/heartbeat",
        {"session_id": session_id},
        format="json",
    )
    # The view requires IsAuthenticated; any authenticated user will do.
    force_authenticate(request, user=Mock(is_authenticated=True))
    return heartbeat(request)


def test_heartbeat_success(redis_mock, monkeypatch):
    """Test successful heartbeat extends TTL."""
    # Arrange
    session_id = "test-session-123"
    redis_mock.exists.return_value = True
    redis_mock.expire.return_value = True
    # Act
    response = call_heartbeat(monkeypatch, redis_mock, session_id)
    # Assert (the payload is in response.data; response[...] reads headers)
    assert response.status_code == status.HTTP_200_OK
    assert response.data["status"] == "alive"
    assert response.data["expires_in"] == 360
    redis_mock.expire.assert_called_once_with(f"session:{session_id}", 360)


def test_heartbeat_session_not_found(redis_mock, monkeypatch):
    """Test heartbeat for non-existent session returns 404."""
    # Arrange
    session_id = "nonexistent-session"
    redis_mock.exists.return_value = False
    redis_mock.expire.return_value = False
    # Act: the view returns the 404 as a response, it does not raise
    response = call_heartbeat(monkeypatch, redis_mock, session_id)
    # Assert
    assert response.status_code == status.HTTP_404_NOT_FOUND
Integration Tests
@pytest.mark.asyncio
async def test_heartbeat_prevents_expiry(client, redis):
    """Test heartbeat prevents session expiry."""
    # 1. Acquire license
    session_id = await acquire_license(client)
    session_key = f"session:{session_id}"
    # 2. Simulate 4 minutes passing by shrinking the TTL to the 120s that
    #    would remain, instead of really sleeping for 240s
    assert redis.expire(session_key, 120)
    # 3. Send heartbeat
    response = await client.put(
        "/api/v1/licenses/heartbeat",
        json={"session_id": session_id}
    )
    assert response.status_code == 200
    # 4. Verify session still exists in Redis
    assert redis.exists(session_key) == 1
    # 5. Check TTL was reset (it was 120s before the heartbeat)
    ttl = redis.ttl(session_key)
    assert 350 <= ttl <= 360  # Should be ~360s
@pytest.mark.asyncio
async def test_missed_heartbeat_expires_session(client, redis):
    """Test session expires if heartbeat not sent."""
    # 1. Acquire license
    session_id = await acquire_license(client)
    session_key = f"session:{session_id}"
    # The session must have been created with an expiring TTL (at most 6 minutes)
    assert 0 < redis.ttl(session_key) <= 360
    # 2. Fast-forward past the TTL without sending a heartbeat: shorten the
    #    remaining lifetime to 50ms rather than really waiting 7 minutes
    redis.pexpire(session_key, 50)
    await asyncio.sleep(0.2)
    # 3. Verify session expired
    assert redis.exists(session_key) == 0
    # 4. Heartbeat should return 404
    response = await client.put(
        "/api/v1/licenses/heartbeat",
        json={"session_id": session_id}
    )
    assert response.status_code == 404
Security Considerations
1. Session Hijacking Prevention
Problem: Attacker gets session_id and sends heartbeats
Mitigation:
# Option 1: Require JWT token with heartbeat
@api_view(['PUT'])
@permission_classes([IsAuthenticated])
def heartbeat(request):
    """Reject heartbeats for sessions that do not belong to the caller."""
    # Get session_id from request
    session_id = request.data.get('session_id')
    current_user = request.user  # Verify JWT (via IsAuthenticated)
    # Get Redis connection
    redis_client = get_redis_connection()
    # Verify session belongs to this user
    stored_user_id = redis_client.hget(f"session:{session_id}", "user_id")
    # Redis returns bytes unless the client decodes responses, and the user
    # id is usually not a string, so normalise both sides before comparing.
    if isinstance(stored_user_id, bytes):
        stored_user_id = stored_user_id.decode()
    if stored_user_id is None or str(stored_user_id) != str(current_user.id):
        return Response(
            {"detail": "Forbidden"},
            status=status.HTTP_403_FORBIDDEN
        )
    # ... continue with the normal heartbeat logic (extend TTL, respond) ...
# Option 2: Include hardware fingerprint
@api_view(['PUT'])
@permission_classes([IsAuthenticated])
def heartbeat(request):
    """Reject heartbeats whose hardware fingerprint does not match the session."""
    # Get data from request
    session_id = request.data.get('session_id')
    hardware_id = request.data.get('hardware_id')
    # Get Redis connection
    redis_client = get_redis_connection()
    # Verify hardware fingerprint matches
    stored_hw_id = redis_client.hget(f"session:{session_id}", "hardware_id")
    # Redis returns bytes unless the client decodes responses.
    if isinstance(stored_hw_id, bytes):
        stored_hw_id = stored_hw_id.decode()
    # Both values must be present: otherwise a missing session (None) and an
    # omitted hardware_id (None) would compare equal and pass the check.
    if not hardware_id or stored_hw_id is None or stored_hw_id != hardware_id:
        return Response(
            {"detail": "Hardware mismatch"},
            status=status.HTTP_403_FORBIDDEN
        )
    # ... continue with the normal heartbeat logic (extend TTL, respond) ...
2. Rate Limiting
Prevent heartbeat spam:
# Redis-based rate limiting
rate_limit_key = f"rate_limit:heartbeat:{session_id}"
count = redis.incr(rate_limit_key)
if count == 1:
redis.expire(rate_limit_key, 60) # 1-minute window
if count > 2: # Max 2 heartbeats per minute (one every 30s minimum)
raise Response(status=status.HTTP_400_BAD_REQUEST)(status_code=429, detail="Rate limit exceeded")
3. Replay Attack Prevention
Include timestamp in heartbeat:
from datetime import datetime
from django.utils import timezone
@api_view(['PUT'])
@permission_classes([IsAuthenticated])
def heartbeat(request):
    """Reject heartbeats whose client timestamp is missing, malformed or stale."""
    # Local alias: the name `timezone` refers to django.utils.timezone here.
    from datetime import timezone as dt_timezone

    # Get data from request
    session_id = request.data.get('session_id')
    timestamp_str = request.data.get('timestamp')  # ISO 8601 timestamp
    # Get Redis connection
    redis_client = get_redis_connection()
    # Parse the timestamp; a missing or malformed value is a client error
    try:
        # fromisoformat() only accepts a trailing "Z" from Python 3.11 on.
        heartbeat_time = datetime.fromisoformat(timestamp_str.replace("Z", "+00:00"))
    except (AttributeError, TypeError, ValueError):
        return Response(
            {"detail": "Invalid or missing timestamp"},
            status=status.HTTP_400_BAD_REQUEST
        )
    # Treat naive timestamps as UTC so both operands are timezone-aware;
    # subtracting an aware from a naive datetime raises TypeError.
    if heartbeat_time.tzinfo is None:
        heartbeat_time = heartbeat_time.replace(tzinfo=dt_timezone.utc)
    # Verify timestamp is recent (within 1 minute)
    time_diff = abs((datetime.now(dt_timezone.utc) - heartbeat_time).total_seconds())
    if time_diff > 60:
        return Response(
            {"detail": "Timestamp too old"},
            status=status.HTTP_400_BAD_REQUEST
        )
    # ... continue with the normal heartbeat logic (extend TTL, respond) ...
Status: Specification Complete ✅
Implementation: Pending (Phase 2)
Dependencies: Redis, License API
ETA: 2 hours
Last Updated: November 23, 2025
Owner: Backend Team
Reviewed By: Security Team