Monitoring and Observability

This guide covers monitoring, alerting, and observability strategies for the IRIS OCR platform in production environments.

Monitoring Architecture

Metrics Collection

Prometheus Configuration

# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "iris_rules.yml"

scrape_configs:
  - job_name: 'iris-api-gateway'
    static_configs:
      - targets: ['api-gateway:8000']
    metrics_path: '/metrics'
    scrape_interval: 10s

  - job_name: 'iris-image-processor'
    static_configs:
      - targets: ['image-processor:8001']
    metrics_path: '/metrics'

  - job_name: 'iris-ml-embeddings'
    static_configs:
      - targets: ['ml-embeddings:8002']
    metrics_path: '/metrics'

  - job_name: 'iris-ml-classifier'
    static_configs:
      - targets: ['ml-classifier:8003']
    metrics_path: '/metrics'

  - job_name: 'iris-ocr-extractor'
    static_configs:
      - targets: ['ocr-extractor:8004']
    metrics_path: '/metrics'

  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']

  - job_name: 'docker-daemon'
    static_configs:
      - targets: ['docker-host:9323']

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093
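
Each scrape job above assumes the service exposes Prometheus metrics at /metrics on its own application port. A minimal sketch of how a FastAPI service could do that with prometheus_client's ASGI app (the service name here is illustrative):

from fastapi import FastAPI
from prometheus_client import make_asgi_app

app = FastAPI(title="iris-api-gateway")  # illustrative service instance

# Serve Prometheus metrics on /metrics of the same port the service listens on,
# matching the metrics_path configured in prometheus.yml above.
app.mount("/metrics", make_asgi_app())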

Application Metrics Implementation

from prometheus_client import Counter, Histogram, Gauge, start_http_server
import time
import functools

# Define metrics
REQUEST_COUNT = Counter(
    'iris_requests_total',
    'Total number of requests',
    ['service', 'endpoint', 'method', 'status']
)

REQUEST_DURATION = Histogram(
    'iris_request_duration_seconds',
    'Request duration in seconds',
    ['service', 'endpoint', 'method']
)

PROCESSING_TIME = Histogram(
    'iris_processing_time_seconds',
    'Document processing time by phase',
    ['phase', 'document_type']
)

ACTIVE_REQUESTS = Gauge(
    'iris_active_requests',
    'Number of active requests',
    ['service']
)

MODEL_LOAD_TIME = Histogram(
    'iris_model_load_time_seconds',
    'Model loading time',
    ['model_type']
)

PIPELINE_SUCCESS_RATE = Counter(
    'iris_pipeline_results_total',
    'Pipeline execution results',
    ['result']  # success, failure, partial
)

OCR_CONFIDENCE = Histogram(
    'iris_ocr_confidence',
    'OCR confidence scores',
    ['document_type']
)

CLASSIFICATION_ACCURACY = Gauge(
    'iris_classification_accuracy',
    'Classification model accuracy',
    ['model_version']
)

def metrics_middleware(service_name: str):
    """Decorator for automatic metrics collection on FastAPI endpoints"""
    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            start_time = time.time()
            ACTIVE_REQUESTS.labels(service=service_name).inc()

            try:
                result = await func(*args, **kwargs)

                # Record success metrics
                REQUEST_COUNT.labels(
                    service=service_name,
                    endpoint=func.__name__,
                    method='POST',
                    status='200'
                ).inc()

                REQUEST_DURATION.labels(
                    service=service_name,
                    endpoint=func.__name__,
                    method='POST'
                ).observe(time.time() - start_time)

                return result

            except Exception:
                # Record error metrics
                REQUEST_COUNT.labels(
                    service=service_name,
                    endpoint=func.__name__,
                    method='POST',
                    status='500'
                ).inc()
                raise
            finally:
                ACTIVE_REQUESTS.labels(service=service_name).dec()

        return wrapper
    return decorator

# Usage in a FastAPI service; app, UploadFile and process_image_logic
# are assumed to be defined by the service itself
@app.post("/process")
@metrics_middleware("image-processor")
async def process_image(file: UploadFile):
    start_time = time.time()

    try:
        # Process image
        result = await process_image_logic(file)

        # Record processing time
        PROCESSING_TIME.labels(
            phase="image_processing",
            document_type=result.get('document_type', 'unknown')
        ).observe(time.time() - start_time)

        return result

    except Exception:
        PIPELINE_SUCCESS_RATE.labels(result='failure').inc()
        raise

# Start a standalone metrics server on a separate port
# (alternative to serving /metrics from the application itself)
start_http_server(8080)  # Expose metrics on port 8080

Custom Metrics for Business Logic

class IRISMetricsCollector:
    """Collect business-specific metrics for IRIS"""

    def __init__(self):
        self.document_type_counter = Counter(
            'iris_documents_by_type_total',
            'Documents processed by type',
            ['document_type']
        )

        self.quality_score_histogram = Histogram(
            'iris_quality_scores',
            'Document extraction quality scores',
            ['document_type', 'quality_level']
        )

        self.phase_execution_time = Histogram(
            'iris_phase_execution_time_seconds',
            'Execution time for each pipeline phase',
            ['phase', 'success']
        )

        self.model_cache_hits = Counter(
            'iris_model_cache_hits_total',
            'Model cache hit/miss statistics',
            ['model_type', 'result']  # hit, miss
        )

    def record_document_processing(self, document_type: str, quality_score: float,
                                   quality_level: str, phase_times: dict):
        """Record metrics for a complete document processing"""

        # Document type distribution
        self.document_type_counter.labels(document_type=document_type).inc()

        # Quality scores
        self.quality_score_histogram.labels(
            document_type=document_type,
            quality_level=quality_level
        ).observe(quality_score)

        # Phase execution times
        for phase, duration in phase_times.items():
            self.phase_execution_time.labels(
                phase=phase,
                success='true'
            ).observe(duration)

    def record_model_cache_usage(self, model_type: str, cache_hit: bool):
        """Record model cache usage"""
        result = 'hit' if cache_hit else 'miss'
        self.model_cache_hits.labels(
            model_type=model_type,
            result=result
        ).inc()

# Global metrics collector instance
metrics_collector = IRISMetricsCollector()
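
A short usage sketch for the collector; the phase names, quality level, and timings below are illustrative values, not figures produced by IRIS itself:

# Record one processed document; all values here are illustrative.
metrics_collector.record_document_processing(
    document_type="ficha_residencia",
    quality_score=0.87,
    quality_level="good",
    phase_times={"image_processing": 1.2, "classification": 0.4, "ocr_extraction": 3.1},
)

# Record whether the classifier model was served from the in-process cache.
metrics_collector.record_model_cache_usage(model_type="classifier", cache_hit=True)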

Grafana Dashboards

Main Operations Dashboard

{
  "dashboard": {
    "title": "IRIS OCR - Operations Overview",
    "panels": [
      {
        "title": "Request Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "rate(iris_requests_total[5m])",
            "legendFormat": "{{service}}"
          }
        ]
      },
      {
        "title": "Response Times",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(iris_request_duration_seconds_bucket[5m]))",
            "legendFormat": "95th percentile"
          },
          {
            "expr": "histogram_quantile(0.50, rate(iris_request_duration_seconds_bucket[5m]))",
            "legendFormat": "50th percentile"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "sum by (service) (rate(iris_requests_total{status!~\"2..\"}[5m])) / sum by (service) (rate(iris_requests_total[5m]))",
            "legendFormat": "Error Rate"
          }
        ]
      },
      {
        "title": "Pipeline Success Rate",
        "type": "piechart",
        "targets": [
          {
            "expr": "iris_pipeline_results_total",
            "legendFormat": "{{result}}"
          }
        ]
      },
      {
        "title": "Documents by Type",
        "type": "barchart",
        "targets": [
          {
            "expr": "increase(iris_documents_by_type_total[1h])",
            "legendFormat": "{{document_type}}"
          }
        ]
      },
      {
        "title": "Active Requests",
        "type": "graph",
        "targets": [
          {
            "expr": "iris_active_requests",
            "legendFormat": "{{service}}"
          }
        ]
      }
    ]
  }
}

Performance Dashboard

{
  "dashboard": {
    "title": "IRIS OCR - Performance Metrics",
    "panels": [
      {
        "title": "Processing Time by Phase",
        "type": "heatmap",
        "targets": [
          {
            "expr": "rate(iris_processing_time_seconds_bucket[5m])",
            "legendFormat": "{{phase}}"
          }
        ]
      },
      {
        "title": "Model Loading Times",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(iris_model_load_time_seconds_bucket[5m]))",
            "legendFormat": "{{model_type}}"
          }
        ]
      },
      {
        "title": "OCR Confidence Distribution",
        "type": "histogram",
        "targets": [
          {
            "expr": "rate(iris_ocr_confidence_bucket[5m])",
            "legendFormat": "{{document_type}}"
          }
        ]
      },
      {
        "title": "Cache Hit Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "sum by (model_type) (rate(iris_model_cache_hits_total{result=\"hit\"}[5m])) / sum by (model_type) (rate(iris_model_cache_hits_total[5m]))",
            "legendFormat": "{{model_type}}"
          }
        ]
      }
    ]
  }
}

Infrastructure Dashboard

{
  "dashboard": {
    "title": "IRIS OCR - Infrastructure",
    "panels": [
      {
        "title": "CPU Usage",
        "type": "graph",
        "targets": [
          {
            "expr": "100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
            "legendFormat": "{{instance}}"
          }
        ]
      },
      {
        "title": "Memory Usage",
        "type": "graph",
        "targets": [
          {
            "expr": "(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100",
            "legendFormat": "{{instance}}"
          }
        ]
      },
      {
        "title": "Disk I/O",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(node_disk_reads_completed_total[5m])",
            "legendFormat": "{{instance}} reads"
          },
          {
            "expr": "rate(node_disk_writes_completed_total[5m])",
            "legendFormat": "{{instance}} writes"
          }
        ]
      },
      {
        "title": "Network Traffic",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(node_network_receive_bytes_total[5m])",
            "legendFormat": "{{instance}} in"
          },
          {
            "expr": "rate(node_network_transmit_bytes_total[5m])",
            "legendFormat": "{{instance}} out"
          }
        ]
      }
    ]
  }
}

Alerting Rules

Prometheus Alert Rules

# iris_rules.yml
groups:
  - name: iris_service_alerts
    rules:
      - alert: IRISServiceDown
        expr: up{job=~"iris-.*"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "IRIS service {{ $labels.job }} is down"
          description: "Service {{ $labels.job }} has been down for more than 1 minute"

      - alert: IRISHighErrorRate
        expr: sum by (service) (rate(iris_requests_total{status!~"2.."}[5m])) / sum by (service) (rate(iris_requests_total[5m])) > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.service }}"
          description: "Error rate is {{ $value | humanizePercentage }} on service {{ $labels.service }}"

      - alert: IRISHighLatency
        expr: histogram_quantile(0.95, sum by (service, le) (rate(iris_request_duration_seconds_bucket[5m]))) > 30
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "High latency on {{ $labels.service }}"
          description: "95th percentile latency is {{ $value }}s on service {{ $labels.service }}"

      - alert: IRISPipelineFailureRate
        expr: sum(rate(iris_pipeline_results_total{result="failure"}[10m])) / sum(rate(iris_pipeline_results_total[10m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High pipeline failure rate"
          description: "Pipeline failure rate is {{ $value | humanizePercentage }} over the last 10 minutes"

  - name: iris_resource_alerts
    rules:
      - alert: IRISHighCPUUsage
        expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is {{ $value }}% on {{ $labels.instance }}"

      - alert: IRISHighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is {{ $value }}% on {{ $labels.instance }}"

      - alert: IRISLowDiskSpace
        expr: (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100 > 90
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Disk usage is {{ $value }}% on {{ $labels.instance }}"

  - name: iris_business_alerts
    rules:
      - alert: IRISLowQualityExtractions
        expr: sum(rate(iris_quality_scores_count{quality_level="poor"}[15m])) / sum(rate(iris_quality_scores_count[15m])) > 0.2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High rate of poor quality extractions"
          description: "{{ $value | humanizePercentage }} of extractions have poor quality"

      - alert: IRISModelCacheMissRate
        expr: sum by (model_type) (rate(iris_model_cache_hits_total{result="miss"}[10m])) / sum by (model_type) (rate(iris_model_cache_hits_total[10m])) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High model cache miss rate"
          description: "Cache miss rate is {{ $value | humanizePercentage }} for {{ $labels.model_type }}"

AlertManager Configuration

# alertmanager.yml
global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'iris-alerts@your-company.com'

route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'
  routes:
    - match:
        severity: critical
      receiver: 'critical-alerts'
    - match:
        severity: warning
      receiver: 'warning-alerts'

receivers:
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://localhost:5001/'

  - name: 'critical-alerts'
    slack_configs:
      - api_url: 'YOUR_SLACK_WEBHOOK_URL'
        channel: '#iris-critical'
        title: 'IRIS Critical Alert'
        text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
    pagerduty_configs:
      - routing_key: 'YOUR_PAGERDUTY_KEY'
        description: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'

  - name: 'warning-alerts'
    slack_configs:
      - api_url: 'YOUR_SLACK_WEBHOOK_URL'
        channel: '#iris-warnings'
        title: 'IRIS Warning'
        text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
    email_configs:
      - to: 'team@your-company.com'
        subject: 'IRIS Warning Alert'
        body: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          {{ end }}

Log Management

Centralized Logging with ELK Stack

Elasticsearch Configuration

# elasticsearch.yml
cluster.name: iris-logs
node.name: node-1
path.data: /var/lib/elasticsearch
path.logs: /var/log/elasticsearch
network.host: 0.0.0.0
http.port: 9200
discovery.type: single-node
xpack.security.enabled: false

Logstash Configuration

# logstash.conf
input {
  beats {
    port => 5044
  }
}

filter {
  if [fields][service] == "iris" {
    grok {
      match => {
        "message" => "%{TIMESTAMP_ISO8601:timestamp} \[%{LOGLEVEL:level}\] %{DATA:service} - %{GREEDYDATA:log_message}"
      }
    }

    date {
      match => [ "timestamp", "ISO8601" ]
    }

    if [level] == "ERROR" {
      mutate {
        add_tag => [ "error" ]
      }
    }

    # Parse structured JSON logs
    if [log_message] =~ /^\{.*\}$/ {
      json {
        source => "log_message"
        target => "structured_log"
      }
    }
  }
}

output {
  elasticsearch {
    hosts => ["elasticsearch:9200"]
    index => "iris-logs-%{+YYYY.MM.dd}"
  }

  # Also write error events to a local file for investigation
  if "error" in [tags] {
    file {
      path => "/var/log/iris-errors.log"
    }
  }
}

Filebeat Configuration

# filebeat.yml
filebeat.inputs:
  - type: log
    enabled: true
    paths:
      - /var/log/iris/*.log
    fields:
      service: iris
    # Keep custom fields nested under "fields" so the Logstash filter
    # above can match on [fields][service]
    fields_under_root: false
    multiline.pattern: '^\d{4}-\d{2}-\d{2}'
    multiline.negate: true
    multiline.match: after

output.logstash:
  hosts: ["logstash:5044"]

processors:
  - add_host_metadata:
      when.not.contains.tags: forwarded

Application Logging Standards

import logging
import json
import time
from typing import Dict, Any
from contextvars import ContextVar

# Request ID context for tracing
request_id_var: ContextVar[str] = ContextVar('request_id', default='')

class StructuredLogger:
    """Structured logger for IRIS services"""

    def __init__(self, service_name: str, log_level: str = "INFO"):
        self.service_name = service_name
        self.logger = logging.getLogger(service_name)
        self.logger.setLevel(getattr(logging, log_level.upper()))

        # Create structured formatter
        formatter = logging.Formatter(
            '%(asctime)s [%(levelname)s] %(name)s - %(message)s'
        )

        # Console handler
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        self.logger.addHandler(console_handler)

        # File handler for persistent logging
        file_handler = logging.FileHandler(f'/var/log/iris/{service_name}.log')
        file_handler.setFormatter(formatter)
        self.logger.addHandler(file_handler)

    def _create_log_entry(self, level: str, message: str,
                          extra_data: Dict[str, Any] = None) -> str:
        """Create structured log entry"""
        log_entry = {
            'timestamp': time.time(),
            'level': level,
            'service': self.service_name,
            'message': message,
            'request_id': request_id_var.get(),
        }

        if extra_data:
            log_entry.update(extra_data)

        return json.dumps(log_entry)

    def info(self, message: str, **kwargs):
        log_entry = self._create_log_entry('INFO', message, kwargs)
        self.logger.info(log_entry)

    def warning(self, message: str, **kwargs):
        log_entry = self._create_log_entry('WARNING', message, kwargs)
        self.logger.warning(log_entry)

    def error(self, message: str, **kwargs):
        log_entry = self._create_log_entry('ERROR', message, kwargs)
        self.logger.error(log_entry)

    def debug(self, message: str, **kwargs):
        log_entry = self._create_log_entry('DEBUG', message, kwargs)
        self.logger.debug(log_entry)

# Usage in services
logger = StructuredLogger("api-gateway")

# Log pipeline events
logger.info("Pipeline started",
            pipeline_id="12345",
            document_type="ficha_residencia",
            user_id="user123")

logger.error("Phase 3 failed",
             pipeline_id="12345",
             phase="classification",
             error_code="MODEL_LOAD_FAILED",
             model_path="/app/models/classifier.pth")
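
The request_id field is only useful once something sets the request_id_var context variable per request. A minimal sketch of a FastAPI middleware that does this (the app object and the X-Request-ID header name are assumptions, not IRIS-defined conventions):

import uuid
from fastapi import FastAPI, Request

app = FastAPI()  # in practice, the service's existing app instance

@app.middleware("http")
async def set_request_id(request: Request, call_next):
    # Reuse an upstream request ID if one was supplied, otherwise generate one.
    request_id = request.headers.get("X-Request-ID", str(uuid.uuid4()))
    token = request_id_var.set(request_id)
    try:
        response = await call_next(request)
        response.headers["X-Request-ID"] = request_id
        return response
    finally:
        request_id_var.reset(token)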

Distributed Tracing

Jaeger Implementation

import functools

from jaeger_client import Config

def initialize_tracer(service_name: str):
    """Initialize a Jaeger tracer for the given service"""
    config = Config(
        config={
            'sampler': {
                'type': 'const',
                'param': 1,
            },
            'logging': True,
        },
        service_name=service_name,
    )
    return config.initialize_tracer()

# Initialize the tracer for each service
tracer = initialize_tracer('iris-api-gateway')

class TracingMiddleware:
    """ASGI middleware for distributed tracing (usable with FastAPI)"""

    def __init__(self, app, tracer):
        self.app = app
        self.tracer = tracer

    async def __call__(self, scope, receive, send):
        if scope["type"] == "http":
            # Start a span for the request
            span = self.tracer.start_span(
                operation_name=f"{scope['method']} {scope['path']}",
                tags={
                    'http.method': scope['method'],
                    'http.url': scope['path'],
                    'component': 'http',
                }
            )

            # The span is finished automatically when the context manager exits
            with span:
                await self.app(scope, receive, send)
        else:
            await self.app(scope, receive, send)

# Use in pipeline processing
def trace_pipeline_phase(phase_name: str):
    """Decorator to trace pipeline phases"""
    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            with tracer.start_span(phase_name) as span:
                span.set_tag('phase', phase_name)

                try:
                    result = await func(*args, **kwargs)
                    span.set_tag('success', True)
                    return result
                except Exception as e:
                    span.set_tag('success', False)
                    span.set_tag('error', str(e))
                    raise
        return wrapper
    return decorator

# Usage in the pipeline
@trace_pipeline_phase("image_processing")
async def process_image(image_data):
    # Processing logic here
    pass
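
For traces to span multiple services, the span context has to travel with each HTTP call. A hedged sketch using the OpenTracing propagation API (the helper name, the downstream URL, and the use of the requests library are illustrative):

import requests
from opentracing.propagation import Format

def call_downstream(span, payload: dict):
    """Call another IRIS service while propagating the current span context."""
    headers = {}
    # Inject the active span context into HTTP headers so the next service
    # can continue the same trace.
    tracer.inject(span.context, Format.HTTP_HEADERS, headers)
    return requests.post("http://image-processor:8001/process",  # illustrative URL
                         json=payload, headers=headers, timeout=30)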

Health Checks and Service Discovery

Comprehensive Health Checks

from typing import Any, Dict

from fastapi import FastAPI, HTTPException
import asyncio
import time
import psutil

class HealthChecker:
    """Comprehensive health checking for IRIS services"""

    def __init__(self, service_name: str):
        self.service_name = service_name
        self.start_time = time.time()
        self.health_checks = {}

    async def check_database_connection(self) -> Dict[str, Any]:
        """Check database connectivity if applicable"""
        try:
            # Implement a service-specific database check here
            return {
                'status': 'healthy',
                'response_time': 0.05,
                'details': 'Database connection successful'
            }
        except Exception as e:
            return {
                'status': 'unhealthy',
                'error': str(e),
                'details': 'Database connection failed'
            }

    async def check_external_dependencies(self) -> Dict[str, Any]:
        """Check external service dependencies"""
        dependency_checks = {}

        # Check other IRIS services
        services_to_check = [
            ('image-processor', 'http://image-processor:8001/health'),
            ('ml-embeddings', 'http://ml-embeddings:8002/health'),
            ('ml-classifier', 'http://ml-classifier:8003/health'),
            ('ocr-extractor', 'http://ocr-extractor:8004/health')
        ]

        for service_name, health_url in services_to_check:
            try:
                # In a real implementation, call health_url asynchronously
                # (see the aiohttp sketch after this block)
                dependency_checks[service_name] = {
                    'status': 'healthy',
                    'response_time': 0.1
                }
            except Exception as e:
                dependency_checks[service_name] = {
                    'status': 'unhealthy',
                    'error': str(e)
                }

        return dependency_checks

    async def check_system_resources(self) -> Dict[str, Any]:
        """Check system resource usage"""
        try:
            cpu_percent = psutil.cpu_percent(interval=1)
            memory = psutil.virtual_memory()
            disk = psutil.disk_usage('/')

            return {
                'cpu_usage': cpu_percent,
                'memory_usage': memory.percent,
                'disk_usage': (disk.used / disk.total) * 100,
                'status': 'healthy' if cpu_percent < 90 and memory.percent < 90 else 'degraded'
            }
        except Exception as e:
            return {
                'status': 'unhealthy',
                'error': str(e)
            }

    async def check_model_availability(self) -> Dict[str, Any]:
        """Check if ML models are loaded and available"""
        try:
            # Check if models are accessible
            model_checks = {
                'classification_model': {'status': 'loaded', 'version': '1.0.0'},
                'ocr_models': {'status': 'loaded', 'languages': ['es', 'en']},
                'embedding_model': {'status': 'loaded', 'dimension': 768}
            }

            return {
                'status': 'healthy',
                'models': model_checks
            }
        except Exception as e:
            return {
                'status': 'unhealthy',
                'error': str(e)
            }

    async def perform_health_check(self) -> Dict[str, Any]:
        """Perform a comprehensive health check"""
        health_result = {
            'service': self.service_name,
            'status': 'healthy',
            'timestamp': time.time(),
            'uptime': time.time() - self.start_time,
            'version': '1.0.0',
            'checks': {}
        }

        # Run all health checks concurrently
        checks = await asyncio.gather(
            self.check_system_resources(),
            self.check_external_dependencies(),
            self.check_model_availability(),
            return_exceptions=True
        )

        health_result['checks'] = {
            'system_resources': checks[0],
            'dependencies': checks[1],
            'models': checks[2]
        }

        # Determine overall health status
        if any(check.get('status') == 'unhealthy' for check in checks if isinstance(check, dict)):
            health_result['status'] = 'unhealthy'
        elif any(check.get('status') == 'degraded' for check in checks if isinstance(check, dict)):
            health_result['status'] = 'degraded'

        return health_result

# FastAPI health endpoints
app = FastAPI()  # in practice, the service's existing app instance
health_checker = HealthChecker("api-gateway")

@app.get("/health")
async def health_check():
    health_result = await health_checker.perform_health_check()

    if health_result['status'] == 'unhealthy':
        raise HTTPException(status_code=503, detail=health_result)

    return health_result

@app.get("/health/live")
async def liveness_check():
    """Simple liveness check for Kubernetes"""
    return {"status": "alive", "timestamp": time.time()}

@app.get("/health/ready")
async def readiness_check():
    """Readiness check for Kubernetes"""
    # Check whether the service is ready to accept traffic
    models_ready = await health_checker.check_model_availability()

    if models_ready.get('status') != 'healthy':
        raise HTTPException(status_code=503, detail="Service not ready")

    return {"status": "ready", "timestamp": time.time()}

Monitoring Deployment

Docker Compose for Monitoring Stack

# docker-compose.monitoring.yml
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: iris-prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./monitoring/prometheus:/etc/prometheus
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=200h'
      - '--web.enable-lifecycle'

  grafana:
    image: grafana/grafana:latest
    container_name: iris-grafana
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
    volumes:
      - grafana_data:/var/lib/grafana
      - ./monitoring/grafana/provisioning:/etc/grafana/provisioning
      - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards

  alertmanager:
    image: prom/alertmanager:latest
    container_name: iris-alertmanager
    ports:
      - "9093:9093"
    volumes:
      - ./monitoring/alertmanager:/etc/alertmanager

  node-exporter:
    image: prom/node-exporter:latest
    container_name: iris-node-exporter
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'

  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:7.15.0
    container_name: iris-elasticsearch
    environment:
      - discovery.type=single-node
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    ports:
      - "9200:9200"
    volumes:
      - elasticsearch_data:/usr/share/elasticsearch/data

  logstash:
    image: docker.elastic.co/logstash/logstash:7.15.0
    container_name: iris-logstash
    ports:
      - "5044:5044"
    volumes:
      - ./monitoring/logstash:/usr/share/logstash/pipeline
    depends_on:
      - elasticsearch

  kibana:
    image: docker.elastic.co/kibana/kibana:7.15.0
    container_name: iris-kibana
    ports:
      - "5601:5601"
    environment:
      - ELASTICSEARCH_HOSTS=http://elasticsearch:9200
    depends_on:
      - elasticsearch

  jaeger:
    image: jaegertracing/all-in-one:latest
    container_name: iris-jaeger
    ports:
      - "16686:16686"
      - "14268:14268"
    environment:
      - COLLECTOR_ZIPKIN_HOST_PORT=9411

volumes:
  prometheus_data:
  grafana_data:
  elasticsearch_data:
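
The stack can be started alongside the application services with docker compose -f docker-compose.monitoring.yml up -d; Prometheus is then reachable on port 9090, Grafana on 3000, Kibana on 5601, and the Jaeger UI on 16686.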

This monitoring setup provides full observability into the IRIS OCR platform's performance, health, and business metrics, enabling proactive operations and rapid incident response.