Skip to main content

Sequence Diagram: Zombie Session Cleanup Flow

Purpose: Automatic cleanup of abandoned sessions (crashed clients, network failures, laptop sleep) to prevent permanent seat waste.

Actors:

  • Celery Worker (periodic task scheduler)
  • License API (Django on GKE)
  • Redis (session TTL tracking)
  • PostgreSQL (session status updates)

Flow: Automated TTL-based cleanup running every 60 seconds to identify and remove expired sessions


Mermaid Sequence Diagram


Step-by-Step Breakdown

1. Celery Periodic Task (Step 1)

Server-side: Celery task for zombie cleanup:

# Server-side: Celery periodic task
from celery import shared_task
from celery.exceptions import SoftTimeLimitExceeded
from celery.utils.log import get_task_logger
import time

logger = get_task_logger(__name__)


@shared_task(
    bind=True,
    name='licenses.tasks.cleanup_zombie_sessions',
    soft_time_limit=50,  # SoftTimeLimitExceeded raised at 50 seconds
    time_limit=55,       # Hard kill at 55 seconds
    max_retries=3
)
def cleanup_zombie_sessions(self):
    """
    Cleanup zombie sessions across all active licenses.

    Schedule: Every 60 seconds (via Celery Beat)

    Process:
    1. Query all active licenses
    2. For each license, check Redis session TTLs
    3. Remove expired sessions from Redis
    4. Update session status in PostgreSQL
    5. Record cleanup events and metrics

    Returns:
        dict: Cleanup statistics from the cleaner plus 'duration' (seconds).
    """
    from .zombie_cleaner import ZombieSessionCleaner

    start_time = time.time()

    try:
        cleaner = ZombieSessionCleaner()
        stats = cleaner.cleanup_all_licenses()
    except SoftTimeLimitExceeded:
        # FIX: the original declared max_retries=3 but never retried, and an
        # unhandled soft-limit exception just failed the run. A transient
        # slowdown (e.g. Redis latency) should retry shortly, not leave
        # zombies until the next 60s tick.
        logger.error("Zombie cleanup hit the 50s soft time limit; retrying")
        raise self.retry(countdown=10)

    duration = time.time() - start_time

    # Lazy %-style args keep formatting off the hot path when INFO is disabled.
    logger.info(
        "Zombie cleanup complete: scanned=%s, cleaned=%s, duration=%.2fs",
        stats['licenses_scanned'],
        stats['zombies_cleaned'],
        duration,
    )

    return {
        **stats,
        'duration': duration
    }

Celery Beat configuration:

# Server-side: Celery Beat schedule
# NOTE(review): the original imported `crontab` but never used it — the
# schedule below is a plain float interval — so the unused import was removed.

CELERY_BEAT_SCHEDULE = {
    'cleanup-zombie-sessions': {
        'task': 'licenses.tasks.cleanup_zombie_sessions',
        'schedule': 60.0,  # Every 60 seconds
        'options': {
            # Drop the task if no worker picks it up within 50s, so a stale
            # run never overlaps the next scheduled one.
            'expires': 50,
        },
    },
}

2. Zombie Session Cleaner (Steps 2-9)

Server-side: Zombie cleaner implementation:

# Server-side: Zombie session cleaner
import redis
import logging
from typing import Dict, List
# FIX: `settings` was used in __init__ but never imported (NameError at runtime).
from django.conf import settings
# FIX: the original did `from datetime import timezone`, but the methods call
# `timezone.now()` — that is Django's timezone helper; datetime.timezone has
# no now().
from django.utils import timezone

logger = logging.getLogger(__name__)


class ZombieSessionCleaner:
    """
    Cleans up zombie sessions (expired TTL but still in sets).

    Zombie causes:
    - Client crashed without releasing
    - Network failure during release
    - Laptop sleep mode
    - Force quit (kill -9)
    - Power loss

    Cleanup strategy:
    - TTL-based detection (Redis TTL command)
    - Remove from sessions set
    - Update PostgreSQL status
    - Log cleanup events
    """

    def __init__(self):
        # decode_responses=True so Redis returns str session IDs, not bytes,
        # matching the f-string key construction used throughout.
        self.redis_client = redis.StrictRedis(
            host=settings.REDIS_HOST,
            port=6379,
            db=0,
            decode_responses=True
        )

def cleanup_all_licenses(self) -> Dict[str, int]:
    """
    Cleanup zombie sessions for all active licenses.

    Returns:
        Statistics dictionary with keys: licenses_scanned,
        sessions_checked, zombies_cleaned, licenses_with_zombies.
    """
    from .models import License

    # Query all active licenses. (The redundant `.all()` after `.filter()`
    # was dropped — filter() already yields the full queryset.)
    active_licenses = License.objects.filter(is_active=True)

    licenses_scanned = 0
    total_zombies = 0
    total_sessions_checked = 0
    licenses_with_zombies = 0

    for license_obj in active_licenses:
        licenses_scanned += 1

        # FIX: count the session set *before* cleanup. The original ran
        # SCARD after zombies were removed, so `sessions_checked`
        # undercounted by exactly the zombies cleaned.
        sessions_set_key = f"license:{license_obj.license_key}:sessions"
        total_sessions_checked += self.redis_client.scard(sessions_set_key)

        zombies_cleaned = self.cleanup_license(license_obj)

        if zombies_cleaned > 0:
            licenses_with_zombies += 1

        total_zombies += zombies_cleaned

    return {
        'licenses_scanned': licenses_scanned,
        'sessions_checked': total_sessions_checked,
        'zombies_cleaned': total_zombies,
        'licenses_with_zombies': licenses_with_zombies
    }

def cleanup_license(self, license: 'License') -> int:
    """
    Cleanup zombie sessions for a single license.

    Args:
        license: License object whose Redis session set is scanned.

    Returns:
        Number of zombies cleaned.
    """
    sessions_set_key = f"license:{license.license_key}:sessions"

    # Get all session IDs from the set (listed to pair with pipeline results).
    session_ids = list(self.redis_client.smembers(sessions_set_key))

    if not session_ids:
        return 0  # No sessions

    # PERF: batch all TTL checks into one pipeline round-trip instead of one
    # TTL command per session — at ~1000 sessions this removes ~1000 network
    # round-trips per run (see "Performance Optimization" below).
    pipeline = self.redis_client.pipeline()
    for session_id in session_ids:
        pipeline.ttl(f"session:{session_id}")
    ttls = pipeline.execute()

    zombies = []

    for session_id, ttl in zip(session_ids, ttls):
        if ttl == -2:
            # Key doesn't exist - zombie session!
            zombies.append(session_id)
            logger.warning(
                f"Zombie detected: {session_id} "
                f"(license: {license.license_key})"
            )

        elif ttl == -1:
            # Key exists but has no TTL - data inconsistency
            logger.error(
                f"Session without TTL: {session_id} "
                f"(license: {license.license_key})"
            )
            # Fix by setting TTL (360s — assumed to match the heartbeat TTL
            # window; TODO confirm against the seat-acquisition flow).
            self.redis_client.expire(f"session:{session_id}", 360)

    if not zombies:
        return 0  # No zombies found

    # Cleanup zombies one by one (DB update + events + metrics each).
    for session_id in zombies:
        self._cleanup_zombie_session(
            license=license,
            session_id=session_id,
            sessions_set_key=sessions_set_key
        )

    # Log cleanup stats
    logger.info(
        f"Cleaned {len(zombies)} zombies for license {license.license_key}"
    )

    return len(zombies)

def _cleanup_zombie_session(
    self,
    license: 'License',
    session_id: str,
    sessions_set_key: str
):
    """
    Cleanup a single zombie session.

    Steps:
    1. Remove from sessions set
    2. Update PostgreSQL record
    3. Record cleanup event
    4. Update metrics
    5. Audit log
    """
    # Step 1: Remove from sessions set
    removed = self.redis_client.srem(sessions_set_key, session_id)

    if not removed:
        logger.warning(f"Session {session_id} already removed from set")

    # Step 2: Update database record
    from .models import Session

    session_obj = Session.objects.filter(
        session_id=session_id
    ).first()

    if session_obj:
        session_obj.status = 'expired'
        session_obj.expired_at = timezone.now()
        session_obj.save()

        # Bill duration only if the session was never explicitly released
        # (a released session was presumably billed on release — TODO confirm
        # against the seat-release flow).
        if not session_obj.released_at:
            session_duration = (
                timezone.now() - session_obj.acquired_at
            ).total_seconds()

            # Record usage event
            from .models import UsageEvent

            UsageEvent.objects.create(
                license=license,
                tenant=license.tenant,
                event_type=UsageEvent.EventType.SESSION_DURATION,
                timestamp=timezone.now(),
                quantity=session_duration / 3600,  # Hours
                metadata={
                    'session_id': session_id,
                    'cleanup_type': 'zombie',
                    'duration_seconds': session_duration
                }
            )
    else:
        logger.warning(f"Session {session_id} not found in database")

    # Step 3: Record cleanup event
    from .models import CleanupEvent

    CleanupEvent.objects.create(
        license=license,
        session_id=session_id,
        cleanup_type='zombie',
        cleaned_at=timezone.now()
    )

    # Step 4: Update Prometheus metrics.
    # BUG FIX: the original constructed Counter/Gauge on *every* call;
    # prometheus_client registers collectors in the global REGISTRY, so the
    # second cleanup raised ValueError ("Duplicated timeseries in
    # CollectorRegistry"). Create the metrics once and cache them on the class.
    from prometheus_client import Counter, Gauge

    cls = type(self)
    if not hasattr(cls, '_zombie_cleanups_metric'):
        cls._zombie_cleanups_metric = Counter(
            'zombie_sessions_cleaned_total',
            'Total zombie sessions cleaned',
            ['license_key']
        )
        cls._active_sessions_metric = Gauge(
            'active_sessions',
            'Active sessions',
            ['license_key']
        )

    cls._zombie_cleanups_metric.labels(license_key=license.license_key).inc()

    # Update active sessions gauge with the post-removal count.
    active_count = self.redis_client.scard(sessions_set_key)
    cls._active_sessions_metric.labels(
        license_key=license.license_key
    ).set(active_count)

    # Step 5: Audit log
    from .audit import audit_logger

    audit_logger.log_event(
        event_type='system',
        user_id='system',
        resource_type='session',
        resource_id=session_id,
        action='cleanup',
        status='success',
        ip_address='127.0.0.1',
        user_agent='ZombieSessionCleaner',
        metadata={
            'license_key': license.license_key,
            'cleanup_type': 'zombie',
            'active_seats_after': active_count
        }
    )

3. Database Models (Supporting)

PostgreSQL models for tracking cleanup:

# Server-side: Cleanup event model
from django.db import models
from django.utils import timezone


class CleanupEvent(models.Model):
    """
    Records zombie session cleanup events.

    Used for:
    - Auditing automated cleanup
    - Analyzing zombie patterns
    - Troubleshooting cleanup issues
    """

    class CleanupType(models.TextChoices):
        # Closed set of cleanup triggers; the short value is what is stored.
        ZOMBIE = 'zombie', 'Zombie (TTL expired)'
        EXPIRED = 'expired', 'Expired (license expired)'
        MANUAL = 'manual', 'Manual (admin action)'

    # License whose session was cleaned; CASCADE so events never outlive it.
    license = models.ForeignKey('License', on_delete=models.CASCADE)
    # Cleaned session's ID; 36 chars matches canonical hyphenated UUID form.
    session_id = models.CharField(max_length=36)
    cleanup_type = models.CharField(
        max_length=20,
        choices=CleanupType.choices,
        default=CleanupType.ZOMBIE
    )
    # Callable default (not timezone.now()) so each row gets its own timestamp.
    cleaned_at = models.DateTimeField(default=timezone.now)

    # Metadata
    # Free-form context; default=dict avoids a shared mutable default.
    metadata = models.JSONField(default=dict, blank=True)

    class Meta:
        db_table = 'cleanup_events'
        # Backs the time-window and per-license analytics queries below.
        indexes = [
            models.Index(fields=['cleaned_at']),
            models.Index(fields=['license', 'cleaned_at']),
        ]

    def __str__(self):
        return f"Cleanup: {self.session_id} ({self.cleanup_type})"

Scenarios

Normal Zombie Cleanup

Scenario: Client crashes, heartbeat stops, TTL expires

Laptop Sleep (Heartbeat Missed)

Scenario: User closes laptop, sleeps for 8 hours, reopens

Cleanup During High Load

Scenario: 1000 active sessions, cleanup completes in <5 seconds

# Performance characteristics

# Example: 50 active licenses, avg 20 sessions each = 1000 sessions

# Cleanup operation breakdown:
# - Redis SMEMBERS: 50 licenses × 2ms = 100ms
# - Redis TTL checks: 1000 sessions × 0.5ms = 500ms
# - Redis SREM: 10 zombies × 1ms = 10ms
# - PostgreSQL updates: 10 zombies × 10ms = 100ms
# - Metrics updates: 50 licenses × 1ms = 50ms

# Total: ~760ms for 1000 sessions, 10 zombies cleaned

# CPU usage: <5%
# Memory usage: <50 MB
# Network: <100 KB

# Conclusion: Cleanup is lightweight, runs every 60s without impact

Monitoring and Alerts

Prometheus metrics for zombie cleanup:

# Server-side: Zombie cleanup metrics
from prometheus_client import Counter, Gauge, Histogram

# Zombie cleanup counter
# Monotonic total labelled per license; rate() over it yields zombies/sec
# (used by the HighZombieRate alert below).
zombie_cleanups = Counter(
    'zombie_sessions_cleaned_total',
    'Total zombie sessions cleaned',
    ['license_key']
)

# Cleanup duration histogram
# Buckets span 0.1s..60s: covers the normal sub-second run and the 50s
# soft-time-limit worst case of the Celery task.
cleanup_duration = Histogram(
    'zombie_cleanup_duration_seconds',
    'Zombie cleanup duration',
    buckets=[0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0]
)

# Zombie rate gauge (zombies per hour)
zombie_rate = Gauge(
    'zombie_rate_per_hour',
    'Zombie sessions per hour',
    ['license_key']
)

Grafana alert rules:

# Alert: High zombie rate
- alert: HighZombieRate
expr: rate(zombie_sessions_cleaned_total[1h]) > 10
for: 1h
labels:
severity: warning
annotations:
summary: "High zombie session rate detected"
description: "{{ $value }} zombies/hour - investigate client crashes"

# Alert: Cleanup taking too long
- alert: SlowZombieCleanup
expr: histogram_quantile(0.95, rate(zombie_cleanup_duration_seconds_bucket[5m])) > 10
for: 5m
labels:
severity: warning
annotations:
summary: "Zombie cleanup taking >10 seconds"
description: "Cleanup duration: {{ $value }}s - check Redis performance"

# Alert: Cleanup failures
- alert: ZombieCleanupFailures
expr: rate(celery_task_failure_total{task="cleanup_zombie_sessions"}[5m]) > 0.1
for: 5m
labels:
severity: critical
annotations:
summary: "Zombie cleanup task failing"
description: "Failure rate: {{ $value }}/sec - investigate Celery worker"

Analytics Queries

Zombie session analysis:

-- Zombie cleanup statistics (last 7 days)
-- Daily count of cleaned zombies and how many distinct licenses were hit;
-- served by the (cleaned_at) index on cleanup_events.
SELECT
    DATE(cleaned_at) as date,
    COUNT(*) as zombies_cleaned,
    COUNT(DISTINCT license_id) as affected_licenses
FROM cleanup_events
WHERE cleanup_type = 'zombie'
    AND cleaned_at >= NOW() - INTERVAL '7 days'
GROUP BY DATE(cleaned_at)
ORDER BY date;

-- Top licenses with zombie sessions (last 30 days)
-- FIX 1: cast to numeric before dividing — COUNT(*) is bigint, so the
--        original integer division truncated zombie_percentage to 0.
-- FIX 2: add l.id to GROUP BY — the correlated subquery references l.id,
--        which Postgres rejects when only license_key/tier are grouped.
SELECT
    l.license_key,
    l.tier,
    COUNT(*) as zombie_count,
    COUNT(*)::numeric / NULLIF(
        (SELECT COUNT(*) FROM sessions WHERE license_id = l.id),
        0
    ) * 100 as zombie_percentage
FROM cleanup_events ce
JOIN licenses l ON ce.license_id = l.id
WHERE ce.cleanup_type = 'zombie'
    AND ce.cleaned_at >= NOW() - INTERVAL '30 days'
GROUP BY l.id, l.license_key, l.tier
ORDER BY zombie_count DESC
LIMIT 20;

-- Average time until zombie cleanup
-- Detection latency in minutes: time between a session's last heartbeat and
-- the moment the cleaner removed it (zombie cleanups only, last 7 days).
SELECT
    AVG(EXTRACT(EPOCH FROM (ce.cleaned_at - s.last_heartbeat_at))) / 60 as avg_minutes
FROM cleanup_events ce
JOIN sessions s ON ce.session_id = s.session_id
WHERE ce.cleanup_type = 'zombie'
    AND s.last_heartbeat_at IS NOT NULL
    AND ce.cleaned_at >= NOW() - INTERVAL '7 days';

Performance Optimization

Optimization strategies:

  1. Batch Redis operations:

    # Use Redis pipeline for bulk TTL checks
    pipeline = redis_client.pipeline()
    for session_id in session_ids:
        pipeline.ttl(f"session:{session_id}")
    ttls = pipeline.execute()
  2. Parallel license processing:

    # Process multiple licenses concurrently
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(cleanup_license, lic)
            for lic in active_licenses
        ]
        results = [f.result() for f in futures]
  3. Database connection pooling:

    # Use connection pooling for PostgreSQL
    DATABASES = {
        'default': {
            'CONN_MAX_AGE': 600,  # Reuse connections
            'CONN_HEALTH_CHECKS': True
        }
    }

  • ADR-004: Session Management (TTL strategy)
  • ADR-011: Zombie Session Cleanup Strategy
  • 02-seat-acquisition-flow.md: Initial seat acquisition
  • 03-heartbeat-renewal-flow.md: Heartbeat mechanism
  • 04-seat-release-flow.md: Explicit seat release

Last Updated: 2025-11-30 Diagram Type: Sequence (Mermaid) Scope: Core licensing flow - Zombie cleanup