Saltar al contenido principal

Procedimientos de Backup y Recuperación

Estrategia de Backup

Tipos de Backup

IRIS implementa una estrategia de backup multi-nivel que garantiza la recuperación de datos en diferentes escenarios de fallo.

1. Backup de Base de Datos

Configuración PostgreSQL:

#!/bin/bash
# scripts/backup/database-backup.sh
#
# Daily full backup of the IRIS PostgreSQL database. Produces two
# artifacts per run:
#   - iris_db_<ts>.sql.gz    plain SQL dump (for manual inspection)
#   - iris_db_<ts>.sql.dump  custom-format dump (for pg_restore)
# Backups older than RETENTION_DAYS are pruned at the end.

set -euo pipefail

# Configuration
DB_HOST="localhost"
DB_PORT="5432"
DB_NAME="iris_prod"
DB_USER="iris"
BACKUP_DIR="/backups/database"
RETENTION_DAYS=30

mkdir -p "$BACKUP_DIR"

# Timestamped target file for this run
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="$BACKUP_DIR/iris_db_$TIMESTAMP.sql"

echo "🗄️ Iniciando backup de base de datos..."
echo "Archivo: $BACKUP_FILE"

# Custom-format dump, created inside the container then copied out.
docker-compose exec -T postgres pg_dump \
  -h "$DB_HOST" \
  -p "$DB_PORT" \
  -U "$DB_USER" \
  -d "$DB_NAME" \
  --verbose \
  --no-password \
  --format=custom \
  --compress=9 \
  --file="/tmp/backup.dump"

POSTGRES_CID=$(docker-compose ps -q postgres)
docker cp "$POSTGRES_CID:/tmp/backup.dump" "$BACKUP_FILE.dump"
# Remove the temp dump so it does not accumulate inside the container.
docker-compose exec -T postgres rm -f /tmp/backup.dump

# Plain SQL dump — check pg_dump's exit status explicitly: the shell
# redirect creates $BACKUP_FILE even when pg_dump fails, so testing for
# the file's existence alone (as the original did) reports false success.
if docker-compose exec -T postgres pg_dump \
  -h "$DB_HOST" \
  -p "$DB_PORT" \
  -U "$DB_USER" \
  -d "$DB_NAME" \
  --verbose \
  --no-password \
  --clean \
  --if-exists > "$BACKUP_FILE"; then
  gzip "$BACKUP_FILE"
  echo "✅ Backup completado exitosamente"

  size=$(du -h "$BACKUP_FILE.gz" | cut -f1)
  echo "📦 Tamaño del backup: $size"

  echo "$(date): Backup exitoso - $BACKUP_FILE.gz ($size)" >> "$BACKUP_DIR/backup.log"
else
  echo "❌ Error en el backup"
  echo "$(date): Backup falló - $BACKUP_FILE.gz" >> "$BACKUP_DIR/backup.log"
  # Remove the partial file created by the redirect.
  rm -f "$BACKUP_FILE"
  exit 1
fi

# Prune backups older than the retention window.
echo "🧹 Limpiando backups antiguos (>$RETENTION_DAYS días)..."
find "$BACKUP_DIR" -name "iris_db_*.sql.gz" -mtime +"$RETENTION_DAYS" -delete
find "$BACKUP_DIR" -name "iris_db_*.dump" -mtime +"$RETENTION_DAYS" -delete

echo "✅ Proceso de backup completado"

Backup Incremental:

#!/bin/bash
# scripts/backup/incremental-backup.sh
#
# Incremental backup based on PostgreSQL WAL (Write-Ahead Logging):
#   - Mondays: full base backup via pg_basebackup (tar + gzip).
#   - Every run: copy out WAL segments flagged ".ready" by the archiver.

set -euo pipefail

BACKUP_DIR="/backups/incremental"
BASE_BACKUP_DIR="$BACKUP_DIR/base"
WAL_BACKUP_DIR="$BACKUP_DIR/wal"

mkdir -p "$BASE_BACKUP_DIR" "$WAL_BACKUP_DIR"

# Resolve the container id once instead of once per copied file.
POSTGRES_CID=$(docker-compose ps -q postgres)

# Weekly base backup (Monday = day 1 in `date +%u`).
if [ "$(date +%u)" -eq 1 ]; then
  echo "📅 Ejecutando backup base semanal..."

  docker-compose exec -T postgres pg_basebackup \
    -h localhost \
    -U iris \
    -D /tmp/base_backup \
    -Ft \
    -z \
    -P \
    -v

  # Copy the gzipped tarball to the host, then clean up in the container.
  docker cp "$POSTGRES_CID:/tmp/base_backup/base.tar.gz" \
    "$BASE_BACKUP_DIR/base_$(date +%Y%m%d).tar.gz"
  docker-compose exec -T postgres rm -rf /tmp/base_backup
fi

# Archive WAL segments whose ".ready" marker exists (the markers live
# under pg_wal/archive_status; find recurses into it).
# NOTE(review): segments are copied but the ".ready" marker is never
# renamed to ".done", so the same files may be re-copied on every run —
# confirm whether an archive_command is expected to handle that instead.
docker-compose exec -T postgres find /var/lib/postgresql/data/pg_wal \
  -name "*.ready" -exec basename {} .ready \; | \
  while IFS= read -r wal_file; do
    docker cp "$POSTGRES_CID:/var/lib/postgresql/data/pg_wal/$wal_file" \
      "$WAL_BACKUP_DIR/"
  done

2. Backup de Modelos ML

Backup de Modelos Entrenados:

#!/bin/bash
# scripts/backup/models-backup.sh
#
# Weekly backup of trained ML models plus a JSON metadata sidecar
# (file list, total size, per-model versions read from the .pth files).

# No pipefail: `tar -tzf | head` below truncates deliberately and would
# otherwise trip on tar's SIGPIPE exit status.
set -eu

MODELS_DIR="/app/data/models"
BACKUP_DIR="/backups/models"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)

echo "🤖 Iniciando backup de modelos ML..."

mkdir -p "$BACKUP_DIR"
BACKUP_FILE="$BACKUP_DIR/iris_models_$TIMESTAMP.tar.gz"

# Archive the whole models/ tree inside the container, then copy it out.
docker-compose exec -T ml-classifier tar -czf /tmp/models_backup.tar.gz \
  -C /app/data models/

ML_CID=$(docker-compose ps -q ml-classifier)
docker cp "$ML_CID:/tmp/models_backup.tar.gz" "$BACKUP_FILE"
# Remove the temp archive so it does not accumulate inside the container.
docker-compose exec -T ml-classifier rm -f /tmp/models_backup.tar.gz

echo "📦 Contenido del backup:"
tar -tzf "$BACKUP_FILE" | head -10

# Metadata sidecar. The embedded Python prints a `"file": "version", ...`
# fragment without the outer braces so it can be spliced into the JSON
# document below. (The original heredoc had its Python indentation
# stripped, which made the snippet a syntax error.)
cat > "$BACKUP_DIR/models_metadata_$TIMESTAMP.json" << EOF
{
  "backup_date": "$(date -Iseconds)",
  "backup_file": "$(basename "$BACKUP_FILE")",
  "models_included": [
    $(tar -tzf "$BACKUP_FILE" | grep "\.pth$\|\.pkl$" | sed 's/.*/"&"/' | paste -sd ',' -)
  ],
  "total_size": "$(du -h "$BACKUP_FILE" | cut -f1)",
  "model_versions": {
    $(docker-compose exec -T ml-classifier python -c "
import json
import os

import torch

versions = {}
models_dir = '/app/data/models'
for file in os.listdir(models_dir):
    if file.endswith('.pth'):
        try:
            model = torch.load(os.path.join(models_dir, file), map_location='cpu')
            if isinstance(model, dict) and 'version' in model:
                versions[file] = model['version']
        except Exception:
            versions[file] = 'unknown'

print(json.dumps(versions, indent=8)[8:-1])  # strip outer braces
")
  }
}
EOF

echo "✅ Backup de modelos completado: $BACKUP_FILE"

3. Backup de Configuración

Backup de Configuraciones del Sistema:

#!/bin/bash
# scripts/backup/config-backup.sh
#
# Monthly backup of system configuration: compose files, env files,
# nginx config, per-package Dockerfiles/requirements, scripts and CI
# workflows, plus PostgreSQL global objects (roles, tablespaces).

set -euo pipefail

CONFIG_BACKUP_DIR="/backups/config"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="$CONFIG_BACKUP_DIR/iris_config_$TIMESTAMP.tar.gz"

echo "⚙️ Iniciando backup de configuración..."

mkdir -p "$CONFIG_BACKUP_DIR"

# Patterns to back up. Globs must be expanded HERE, in the shell:
# the original quoted the whole array into tar, so entries like
# "packages/*/requirements.txt" were passed literally and tar aborted
# with "Cannot stat". nullglob drops patterns with no match.
shopt -s nullglob
CONFIG_FILES=(
  docker-compose.yml
  docker-compose.prod.yml
  .env
  .env.prod
  nginx.conf
  packages/*/requirements.txt
  packages/*/Dockerfile
  scripts/
  .github/
)
shopt -u nullglob

# Keep only entries that actually exist so a missing optional file
# (e.g. .env.prod) does not abort the archive.
EXISTING_FILES=()
for path in "${CONFIG_FILES[@]}"; do
  if [ -e "$path" ]; then
    EXISTING_FILES+=("$path")
  fi
done

# Create the archive, excluding caches, VCS data and logs.
tar -czf "$BACKUP_FILE" \
  --exclude="*.pyc" \
  --exclude="__pycache__" \
  --exclude=".git" \
  --exclude="node_modules" \
  --exclude="*.log" \
  "${EXISTING_FILES[@]}"

# PostgreSQL globals (roles, tablespaces) — not covered by pg_dump.
docker-compose exec -T postgres pg_dumpall --globals-only > \
  "$CONFIG_BACKUP_DIR/postgres_globals_$TIMESTAMP.sql"

# Metadata sidecar describing what was backed up and from which commit.
cat > "$CONFIG_BACKUP_DIR/config_metadata_$TIMESTAMP.json" << EOF
{
  "backup_date": "$(date -Iseconds)",
  "backup_file": "$(basename "$BACKUP_FILE")",
  "git_commit": "$(git rev-parse HEAD 2>/dev/null || echo 'unknown')",
  "git_branch": "$(git branch --show-current 2>/dev/null || echo 'unknown')",
  "docker_images": [
    $(docker images --format '{{.Repository}}:{{.Tag}}' | grep iris | sed 's/.*/"&"/' | paste -sd ',' -)
  ],
  "total_size": "$(du -h "$BACKUP_FILE" | cut -f1)"
}
EOF

echo "✅ Backup de configuración completado: $BACKUP_FILE"

Automatización de Backups

Configuración de Cron Jobs

# /etc/cron.d/iris-backups
#
# Backup schedule for IRIS. System crontab format: the sixth field is
# the user the job runs as (here: root). All output is appended to a
# shared log.
# NOTE(review): cron runs with a minimal PATH; the scripts invoke
# docker-compose — confirm they use absolute paths or set PATH/SHELL
# at the top of this file.

# Daily database backup (2:00 AM)
0 2 * * * root /app/scripts/backup/database-backup.sh >> /var/log/iris-backup.log 2>&1

# Incremental (WAL) backup every 6 hours
0 */6 * * * root /app/scripts/backup/incremental-backup.sh >> /var/log/iris-backup.log 2>&1

# Weekly ML models backup (Sundays 3:00 AM)
0 3 * * 0 root /app/scripts/backup/models-backup.sh >> /var/log/iris-backup.log 2>&1

# Monthly configuration backup (first day of the month, 4:00 AM)
0 4 1 * * root /app/scripts/backup/config-backup.sh >> /var/log/iris-backup.log 2>&1

# Daily backup verification (6:00 AM)
0 6 * * * root /app/scripts/backup/verify-backups.sh >> /var/log/iris-backup.log 2>&1

Script de Verificación

#!/bin/bash
# scripts/backup/verify-backups.sh
#
# Daily backup verification: for each backup family, check that a recent
# backup exists, is not older than its schedule allows, and is not
# corrupt. Sends alerts on problems and — unlike the original, which
# always exited 0 — exits non-zero so cron/monitoring can see failures.

set -u

echo "🔍 Verificación diaria de backups - $(date)"
echo "============================================"

BACKUP_BASE="/backups"
ALERTS_SCRIPT="/app/scripts/alerts/send-alert.sh"

# verify_backup <type> <max_age_hours>
# Checks the newest *.gz / *.dump under $BACKUP_BASE/<type>.
# Returns non-zero if the backup is missing, stale or corrupt.
verify_backup() {
  local backup_type=$1
  local max_age_hours=$2
  local backup_dir="$BACKUP_BASE/$backup_type"
  local latest_backup backup_age_hours

  echo "Verificando $backup_type..."

  # Newest backup. The -o alternation needs grouping parens, otherwise
  # the implicit -print binds unevenly. Timestamped names sort
  # chronologically, so a reverse sort puts the newest first.
  latest_backup=$(find "$backup_dir" \( -name "*.gz" -o -name "*.dump" \) | sort -r | head -n1)

  if [ -z "$latest_backup" ]; then
    echo "❌ No se encontraron backups en $backup_dir"
    "$ALERTS_SCRIPT" "No backups found for $backup_type"
    return 1
  fi

  # Age in whole hours (GNU stat -c %Y = mtime epoch).
  backup_age_hours=$(( ($(date +%s) - $(stat -c %Y "$latest_backup")) / 3600 ))

  if [ "$backup_age_hours" -gt "$max_age_hours" ]; then
    echo "⚠️ Backup de $backup_type muy antiguo: $backup_age_hours horas"
    "$ALERTS_SCRIPT" "Backup for $backup_type is $backup_age_hours hours old"
    return 1
  fi

  # File-level integrity check, chosen by extension.
  case "$latest_backup" in
    *.gz)
      if ! gzip -t "$latest_backup" 2>/dev/null; then
        echo "❌ Backup de $backup_type corrupto"
        "$ALERTS_SCRIPT" "Backup corruption detected for $backup_type"
        return 1
      fi
      ;;
    *.dump)
      if ! file "$latest_backup" | grep -q "PostgreSQL"; then
        echo "❌ Backup de $backup_type no es válido"
        "$ALERTS_SCRIPT" "Invalid backup format for $backup_type"
        return 1
      fi
      ;;
  esac

  echo "✅ Backup de $backup_type: OK ($backup_age_hours horas)"
  return 0
}

# Check every backup family; accumulate failures for the exit status.
failures=0
verify_backup "database" 25 || failures=$((failures + 1))  # daily + 1h margin
verify_backup "models" 168 || failures=$((failures + 1))   # weekly
verify_backup "config" 744 || failures=$((failures + 1))   # monthly

# Disk usage report.
backup_space_used=$(du -sh "$BACKUP_BASE" | cut -f1)
backup_space_available=$(df -h "$BACKUP_BASE" | tail -1 | awk '{print $4}')

echo "💾 Espacio usado por backups: $backup_space_used"
echo "💾 Espacio disponible: $backup_space_available"

# Alert when less than 10 GB remain (df --output=avail reports KiB).
available_gb=$(df --output=avail "$BACKUP_BASE" | tail -1)
available_gb=$((available_gb / 1024 / 1024))

if [ "$available_gb" -lt 10 ]; then
  echo "⚠️ Poco espacio disponible para backups: ${available_gb}GB"
  "$ALERTS_SCRIPT" "Low disk space for backups: ${available_gb}GB remaining"
fi

echo "Verificación completada"
exit "$failures"

Procedimientos de Recuperación

Recuperación de Base de Datos

Restauración Completa

#!/bin/bash
# scripts/recovery/restore-database.sh
#
# Full restore of the IRIS PostgreSQL database from a backup file
# (.sql, .sql.gz or custom-format .dump). Takes a safety dump of the
# current database first and rolls back to it if the restore fails.
#
# Usage: restore-database.sh <backup_file>

BACKUP_FILE=${1:-}
DB_NAME="iris_prod"
DB_USER="iris"

if [ -z "$BACKUP_FILE" ]; then
  echo "❌ Uso: $0 <backup_file>"
  echo "Ejemplo: $0 /backups/database/iris_db_20240115_020000.sql.gz"
  exit 1
fi

if [ ! -f "$BACKUP_FILE" ]; then
  echo "❌ Archivo de backup no encontrado: $BACKUP_FILE"
  exit 1
fi

echo "🚨 ADVERTENCIA: Esta operación sobrescribirá la base de datos actual"
echo "Archivo de backup: $BACKUP_FILE"
echo "Base de datos objetivo: $DB_NAME"
read -r -p "¿Continuar? (yes/no): " confirm

if [ "$confirm" != "yes" ]; then
  echo "Operación cancelada"
  exit 0
fi

echo "🗄️ Iniciando restauración de base de datos..."

# Stop every service that holds connections to the database.
echo "⏹️ Deteniendo servicios..."
docker-compose stop api-gateway ml-classifier ml-embeddings ocr-extractor

# Safety dump of the current state, for rollback if the restore fails.
echo "📦 Creando backup de seguridad..."
SAFETY_BACKUP="/tmp/iris_safety_backup_$(date +%s).sql"
docker-compose exec -T postgres pg_dump -U "$DB_USER" -d "$DB_NAME" > "$SAFETY_BACKUP"
echo "Backup de seguridad guardado en: $SAFETY_BACKUP"

# Normalise the backup into a plain SQL file at $TEMP_BACKUP.
TEMP_BACKUP="/tmp/restore_backup.sql"
POSTGRES_CID=$(docker-compose ps -q postgres)

case "$BACKUP_FILE" in
  *.gz)
    echo "📦 Descomprimiendo backup..."
    gunzip -c "$BACKUP_FILE" > "$TEMP_BACKUP"
    ;;
  *.dump)
    echo "📦 Convirtiendo backup custom format..."
    # pg_restore runs inside the container, which cannot see host paths
    # (the original passed the host path straight through): copy the
    # dump in, convert it to SQL, copy the SQL back out, clean up.
    docker cp "$BACKUP_FILE" "$POSTGRES_CID:/tmp/restore_source.dump"
    docker-compose exec -T postgres pg_restore --clean --if-exists \
      -f /tmp/restore_backup.sql /tmp/restore_source.dump
    docker cp "$POSTGRES_CID:/tmp/restore_backup.sql" "$TEMP_BACKUP"
    docker-compose exec -T postgres rm -f /tmp/restore_source.dump /tmp/restore_backup.sql
    ;;
  *.sql)
    cp "$BACKUP_FILE" "$TEMP_BACKUP"
    ;;
  *)
    echo "❌ Formato de backup no soportado"
    exit 1
    ;;
esac

echo "📥 Restaurando base de datos..."

# Terminate remaining connections so DROP DATABASE cannot block.
docker-compose exec -T postgres psql -U "$DB_USER" -d postgres -c "
SELECT pg_terminate_backend(pg_stat_activity.pid)
FROM pg_stat_activity
WHERE pg_stat_activity.datname = '$DB_NAME'
  AND pid <> pg_backend_pid();
"

# Recreate an empty database owned by the application user.
docker-compose exec -T postgres psql -U "$DB_USER" -d postgres -c "
DROP DATABASE IF EXISTS $DB_NAME;
CREATE DATABASE $DB_NAME OWNER $DB_USER;
"

# Load the SQL; on failure, roll back to the safety dump.
if docker-compose exec -T postgres psql -U "$DB_USER" -d "$DB_NAME" < "$TEMP_BACKUP"; then
  echo "✅ Base de datos restaurada exitosamente"

  # Basic sanity check: count restored tables.
  echo "🔍 Verificando integridad..."
  row_count=$(docker-compose exec -T postgres psql -U "$DB_USER" -d "$DB_NAME" -t -c "
  SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'public';
  " | xargs)

  echo "📊 Tablas restauradas: $row_count"

  rm -f "$TEMP_BACKUP"
else
  echo "❌ Error en la restauración"
  echo "💡 Restaurando backup de seguridad..."

  docker-compose exec -T postgres psql -U "$DB_USER" -d postgres -c "
  DROP DATABASE IF EXISTS $DB_NAME;
  CREATE DATABASE $DB_NAME OWNER $DB_USER;
  "

  docker-compose exec -T postgres psql -U "$DB_USER" -d "$DB_NAME" < "$SAFETY_BACKUP"
  echo "🔄 Backup de seguridad restaurado"

  rm -f "$TEMP_BACKUP"
  exit 1
fi

# Bring the application services back up and smoke-test the gateway.
echo "▶️ Reiniciando servicios..."
docker-compose start api-gateway ml-classifier ml-embeddings ocr-extractor

echo "🏥 Verificando funcionamiento..."
sleep 10

if curl -f -s http://localhost:8000/health > /dev/null; then
  echo "✅ Servicios funcionando correctamente"
else
  echo "⚠️ Servicios no responden - verificar logs"
  docker-compose logs --tail=20 api-gateway
fi

echo "🎉 Restauración completada"

Point-in-Time Recovery (PITR)

#!/bin/bash
# scripts/recovery/pitr-restore.sh
#
# Point-in-Time Recovery: restores the most recent weekly base backup
# taken before TARGET_TIME, then replays archived WAL up to that time.
#
# Usage: pitr-restore.sh 'YYYY-MM-DD HH:MM:SS'

set -eu

TARGET_TIME=${1:-}
BASE_BACKUP_DIR="/backups/incremental/base"
WAL_BACKUP_DIR="/backups/incremental/wal"

if [ -z "$TARGET_TIME" ]; then
  echo "❌ Uso: $0 'YYYY-MM-DD HH:MM:SS'"
  echo "Ejemplo: $0 '2024-01-15 14:30:00'"
  exit 1
fi

echo "⏰ Restauración Point-in-Time a: $TARGET_TIME"

# Pick the newest base backup dated at or before the target time
# (backup names embed the date: base_YYYYMMDD.tar.gz).
echo "🔍 Buscando backup base apropiado..."
target_timestamp=$(date -d "$TARGET_TIME" +%s)
BASE_BACKUP=$(find "$BASE_BACKUP_DIR" -name "base_*.tar.gz" | sort | \
  while IFS= read -r backup; do
    backup_date=$(basename "$backup" | sed 's/base_\([0-9]*\).*/\1/')
    backup_timestamp=$(date -d "${backup_date:0:4}-${backup_date:4:2}-${backup_date:6:2}" +%s)
    if [ "$backup_timestamp" -le "$target_timestamp" ]; then
      echo "$backup"
    fi
  done | tail -1)

if [ -z "$BASE_BACKUP" ]; then
  echo "❌ No se encontró backup base apropiado"
  exit 1
fi

echo "📦 Usando backup base: $BASE_BACKUP"

docker-compose stop postgres

# Wipe the data directory (:? aborts if the variable is empty, guarding
# against an accidental `rm -rf /*`).
POSTGRES_DATA_DIR="/var/lib/docker/volumes/iris_postgres_data/_data"
sudo rm -rf "${POSTGRES_DATA_DIR:?}"/*

echo "📥 Restaurando backup base..."
sudo tar -xzf "$BASE_BACKUP" -C "$POSTGRES_DATA_DIR"

# Configure recovery. Since PostgreSQL 12 recovery.conf is no longer
# read (the server refuses to start if it exists); recovery parameters
# go into postgresql.auto.conf and an empty recovery.signal file puts
# the server into targeted-recovery mode.
# NOTE(review): restore_command runs INSIDE the container — confirm
# $WAL_BACKUP_DIR is mounted into it at the same path via compose.
sudo tee -a "$POSTGRES_DATA_DIR/postgresql.auto.conf" > /dev/null << EOF
restore_command = 'cp $WAL_BACKUP_DIR/%f %p'
recovery_target_time = '$TARGET_TIME'
recovery_target_action = 'promote'
EOF
sudo touch "$POSTGRES_DATA_DIR/recovery.signal"

echo "🔄 Iniciando proceso de recovery..."
docker-compose start postgres

# Recovery is finished when the server reports it has left recovery
# mode (pg_is_in_recovery() = false after promotion). The original
# polled for a recovery.done file, which modern PostgreSQL never
# creates — an infinite loop.
echo "📊 Monitoreando recovery..."
while true; do
  if docker-compose exec -T postgres psql -U iris -d postgres -t -c \
    "SELECT pg_is_in_recovery();" 2>/dev/null | grep -q f; then
    echo "✅ Recovery completado"
    break
  fi

  echo "⏳ Recovery en progreso..."
  sleep 5
done

echo "🏥 Verificando estado de la base de datos..."
if docker-compose exec -T postgres psql -U iris -d iris_prod -c "SELECT NOW();"; then
  echo "✅ Base de datos operativa"
else
  echo "❌ Problemas con la base de datos"
  exit 1
fi

echo "🎉 Point-in-Time Recovery completado"

Recuperación de Modelos ML

#!/bin/bash
# scripts/recovery/restore-models.sh
#
# Restores the ML model directory from a backup archive. Takes a safety
# copy of the current models first and rolls back to it if the restored
# models fail integrity verification.
#
# Usage: restore-models.sh <models_backup_file>

BACKUP_FILE=${1:-}
MODELS_DIR="/app/data/models"

if [ -z "$BACKUP_FILE" ]; then
  echo "❌ Uso: $0 <models_backup_file>"
  echo "Ejemplo: $0 /backups/models/iris_models_20240115_030000.tar.gz"
  exit 1
fi

if [ ! -f "$BACKUP_FILE" ]; then
  echo "❌ Archivo de backup no encontrado: $BACKUP_FILE"
  exit 1
fi

echo "🤖 Restaurando modelos ML desde: $BACKUP_FILE"

# Safety copy of the current models BEFORE stopping the services, by
# streaming tar through stdout. The original created the archive in an
# ephemeral `run --rm` container and then `docker cp`'d it from the
# (different) service container, where it never existed.
echo "📦 Respaldando modelos actuales..."
CURRENT_BACKUP="/tmp/current_models_$(date +%s).tar.gz"
docker-compose exec -T ml-classifier tar -czf - -C /app/data models/ > "$CURRENT_BACKUP"
echo "Modelos actuales respaldados en: $CURRENT_BACKUP"

echo "⏹️ Deteniendo servicios ML..."
docker-compose stop ml-classifier ml-embeddings

# Clear the models volume with a one-off container. The glob must be
# expanded by a shell INSIDE the container (an unquoted host-side glob
# is passed literally, and rm does not expand globs).
echo "🧹 Limpiando directorio de modelos..."
docker-compose run --rm ml-classifier sh -c 'rm -rf /app/data/models/*'

# Restore by streaming the archive into a one-off container that mounts
# the models volume (the services themselves are stopped, so exec is
# unavailable). The original extracted /app/backup_file.tar.gz BEFORE
# copying it into the container.
echo "📥 Restaurando modelos..."
docker-compose run --rm -T ml-classifier tar -xzf - -C /app/data/ < "$BACKUP_FILE"

echo "🔍 Verificando modelos restaurados..."
model_count=$(docker-compose run --rm -T ml-classifier find /app/data/models -name "*.pth" | wc -l)
echo "📊 Modelos encontrados: $model_count"

# Try to load every .pth to catch truncated or corrupt files.
echo "🧪 Verificando integridad de modelos..."
docker-compose run --rm -T ml-classifier python -c "
import os
import sys

import torch

models_dir = '/app/data/models'
errors = []

for file in os.listdir(models_dir):
    if file.endswith('.pth'):
        try:
            torch.load(os.path.join(models_dir, file), map_location='cpu')
            print(f'✅ {file}: OK')
        except Exception as e:
            print(f'❌ {file}: Error - {e}')
            errors.append(file)

if errors:
    print(f'Errores en {len(errors)} modelos')
    sys.exit(1)
else:
    print('Todos los modelos verificados correctamente')
"

if [ $? -eq 0 ]; then
  echo "✅ Modelos restaurados e integrados correctamente"

  echo "▶️ Reiniciando servicios ML..."
  docker-compose start ml-classifier ml-embeddings

  echo "🏥 Verificando funcionamiento..."
  sleep 15

  if curl -f -s http://localhost:8003/health > /dev/null; then
    echo "✅ Servicios ML funcionando correctamente"
  else
    echo "⚠️ Servicios ML no responden"
    docker-compose logs --tail=10 ml-classifier
  fi
else
  echo "❌ Error en la verificación de modelos"
  echo "🔄 Restaurando modelos anteriores..."

  # Roll back to the safety copy taken at the start.
  docker-compose run --rm ml-classifier sh -c 'rm -rf /app/data/models/*'
  docker-compose run --rm -T ml-classifier tar -xzf - -C /app/data/ < "$CURRENT_BACKUP"

  echo "💡 Modelos anteriores restaurados"
  exit 1
fi

echo "🎉 Restauración de modelos completada"

Recuperación Completa del Sistema

#!/bin/bash
# scripts/recovery/full-system-restore.sh
#
# Orchestrates a complete IRIS restore in eight phases: stop services,
# safety snapshot of the Docker volumes, configuration, volume
# recreation, database, ML models, service start-up, and verification.
# All output is mirrored to a timestamped log under /tmp.
#
# Usage: full-system-restore.sh <db_backup> <models_backup> <config_backup>

DB_BACKUP=$1
MODELS_BACKUP=$2
CONFIG_BACKUP=$3

if [ $# -ne 3 ]; then
echo "❌ Uso: $0 <db_backup> <models_backup> <config_backup>"
echo "Ejemplo:"
echo " $0 /backups/database/iris_db_20240115.sql.gz \\"
echo " /backups/models/iris_models_20240115.tar.gz \\"
echo " /backups/config/iris_config_20240115.tar.gz"
exit 1
fi

echo "🚨 RECUPERACIÓN COMPLETA DEL SISTEMA IRIS"
echo "========================================"
echo "Base de datos: $DB_BACKUP"
echo "Modelos ML: $MODELS_BACKUP"
echo "Configuración: $CONFIG_BACKUP"
echo ""

# Explicit operator confirmation — this run overwrites the whole system.
read -p "⚠️ Esto sobrescribirá todo el sistema. ¿Continuar? (yes/no): " confirm
if [ "$confirm" != "yes" ]; then
echo "Operación cancelada"
exit 0
fi

# Mirror stdout (and, via the second exec, stderr) of the entire run
# into a recovery log while keeping it on the console.
RECOVERY_LOG="/tmp/iris_recovery_$(date +%s).log"
exec > >(tee -a "$RECOVERY_LOG")
exec 2>&1

echo "📋 Log de recuperación: $RECOVERY_LOG"
echo "⏰ Inicio: $(date)"

# Abort the whole restore if the immediately preceding command failed.
# Relies on $?, so it MUST be called directly after the command it
# checks, with nothing in between.
check_success() {
if [ $? -eq 0 ]; then
echo "✅ $1: Exitoso"
else
echo "❌ $1: Falló"
echo "🚨 Recuperación abortada. Ver log: $RECOVERY_LOG"
exit 1
fi
}

# 1. Stop every service (containers and network are torn down).
echo ""
echo "🟡 FASE 1: Deteniendo servicios"
docker-compose down
check_success "Detener servicios"

# 2. Snapshot the current Docker volumes before anything is destroyed,
#    so the pre-restore state can be recovered manually if needed.
echo ""
echo "🟡 FASE 2: Backup de seguridad"
SAFETY_DIR="/tmp/iris_safety_$(date +%s)"
mkdir -p "$SAFETY_DIR"

# Volume contents are archived via a throwaway ubuntu container that
# mounts the volume read side and the safety dir as /backup.
docker run --rm -v iris_postgres_data:/data -v "$SAFETY_DIR:/backup" \
ubuntu tar -czf /backup/postgres_data.tar.gz -C /data .
check_success "Backup de datos PostgreSQL"

docker run --rm -v iris_redis_data:/data -v "$SAFETY_DIR:/backup" \
ubuntu tar -czf /backup/redis_data.tar.gz -C /data .
check_success "Backup de datos Redis"

echo "🔒 Backup de seguridad en: $SAFETY_DIR"

# 3. Restore configuration files from the config archive.
# NOTE(review): assumes the archive extracts docker-compose.yml, .env
# and nginx.conf at the top level of /tmp — verify the archive layout
# matches what config-backup.sh produces.
echo ""
echo "🟡 FASE 3: Restaurando configuración"
tar -xzf "$CONFIG_BACKUP" -C /tmp/
check_success "Extraer configuración"

# Copy the restored configuration into the working directory.
cp /tmp/docker-compose.yml ./
cp /tmp/.env ./
cp /tmp/nginx.conf ./
check_success "Copiar configuración"

# 4. Recreate the data volumes from scratch (removal may fail if the
#    volumes do not exist yet — hence the `|| true`).
echo ""
echo "🟡 FASE 4: Recreando volúmenes"
docker volume rm iris_postgres_data iris_redis_data 2>/dev/null || true
docker volume create iris_postgres_data
docker volume create iris_redis_data
check_success "Recrear volúmenes"

# 5. Restore the database via the dedicated script.
# NOTE(review): restore-database.sh prompts interactively for "yes" and
# takes its safety dump via `docker-compose exec postgres`, but all
# services were brought down in FASE 1 — confirm this step is run
# attended and that postgres is up before it executes.
echo ""
echo "🟡 FASE 5: Restaurando base de datos"
./scripts/recovery/restore-database.sh "$DB_BACKUP"
check_success "Restaurar base de datos"

# 6. Restore the ML models via the dedicated script.
echo ""
echo "🟡 FASE 6: Restaurando modelos ML"
./scripts/recovery/restore-models.sh "$MODELS_BACKUP"
check_success "Restaurar modelos ML"

# 7. Bring the full stack back up in the background.
echo ""
echo "🟡 FASE 7: Iniciando servicios"
docker-compose up -d
check_success "Iniciar servicios"

# 8. Verification: per-service health endpoints, table count, model
#    count, and a basic functional probe.
echo ""
echo "🟡 FASE 8: Verificación integral"
echo "⏳ Esperando inicialización de servicios..."
sleep 30

# Per-service health check through the gateway; failures only print
# logs, they do not abort at this point.
services=("api-gateway" "image-processor" "ml-embeddings" "ml-classifier" "ocr-extractor")
for service in "${services[@]}"; do
if curl -f -s "http://localhost:8000/services/$service/health" > /dev/null; then
echo "✅ $service: Operativo"
else
echo "❌ $service: No responde"
echo "📋 Logs de $service:"
docker-compose logs --tail=5 "$service"
fi
done

# Table count in the restored database (public schema).
db_tables=$(docker-compose exec -T postgres psql -U iris -d iris_prod -t -c "
SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'public';
" | xargs)
echo "📊 Tablas en base de datos: $db_tables"

# Count of restored model files.
model_count=$(docker-compose exec -T ml-classifier find /app/data/models -name "*.pth" | wc -l)
echo "🤖 Modelos ML disponibles: $model_count"

# Functional smoke test: gateway health, then (if a sample image is
# available) an end-to-end processing request.
echo ""
echo "🧪 Test funcional básico..."
if curl -f -s "http://localhost:8000/health" > /dev/null; then
echo "✅ API Gateway responde"

if [ -f "/app/data/test/sample.jpg" ]; then
echo "📄 Probando pipeline completo..."
response=$(curl -s -X POST -F "file=@/app/data/test/sample.jpg" \
"http://localhost:8000/process")
if echo "$response" | grep -q "success"; then
echo "✅ Pipeline completo funcional"
else
echo "⚠️ Pipeline con problemas - verificar manualmente"
fi
fi
else
echo "❌ API Gateway no responde"
fi

echo ""
echo "🎉 RECUPERACIÓN COMPLETA FINALIZADA"
echo "⏰ Fin: $(date)"
echo "📋 Log completo: $RECOVERY_LOG"
echo "🔒 Backup de seguridad: $SAFETY_DIR"
echo ""
echo "📝 PRÓXIMOS PASOS:"
echo "1. Verificar funcionamiento completo del sistema"
echo "2. Ejecutar tests de integración si están disponibles"
echo "3. Notificar a usuarios que el sistema está operativo"
echo "4. Revisar logs para asegurar funcionamiento normal"
echo "5. Programar backup inmediato del sistema restaurado"

Disaster Recovery

Plan de Contingencia

RTO/RPO Objectives

Recovery Time Objective (RTO):

  • Crítico (P1): 4 horas
  • Alto (P2): 24 horas
  • Medio (P3): 72 horas

Recovery Point Objective (RPO):

  • Base de datos: 1 hora (backup incremental)
  • Modelos ML: 7 días (backup semanal)
  • Configuración: 30 días (backup mensual)

Escenarios de Disaster Recovery

Escenario 1: Fallo de Hardware

#!/bin/bash
# scripts/dr/hardware-failure-response.sh
#
# DR runbook automation for a hardware failure: assess damage, bring up
# the cloud standby infrastructure, restore from the newest backups,
# repoint DNS and verify the system.

set -eu

echo "🖥️ Respondiendo a fallo de hardware..."

# 1. Assess the extent of the damage.
echo "🔍 Evaluando daño..."
./scripts/dr/assess-damage.sh

# 2. Bring up the standby cloud infrastructure.
echo "🔄 Activando infraestructura de backup..."
terraform apply -var="environment=disaster-recovery" infrastructure/dr/

# 3. Restore from the newest backup of each family. Abort early if any
#    family has no backup, instead of invoking the restore script with
#    empty arguments as the original could.
echo "📥 Restaurando desde backups..."
latest_db_backup=$(find /backups/database -name "*.sql.gz" | sort -r | head -n1)
latest_models_backup=$(find /backups/models -name "*.tar.gz" | sort -r | head -n1)
latest_config_backup=$(find /backups/config -name "*.tar.gz" | sort -r | head -n1)

for backup in "$latest_db_backup" "$latest_models_backup" "$latest_config_backup"; do
  if [ -z "$backup" ]; then
    echo "❌ Falta al menos un backup requerido (database/models/config)"
    exit 1
  fi
done

./scripts/recovery/full-system-restore.sh \
  "$latest_db_backup" \
  "$latest_models_backup" \
  "$latest_config_backup"

# 4. Point DNS at the replacement server.
# NOTE(review): "new-server-ip" is a literal placeholder — wire in the
# real address (e.g. a terraform output) before relying on this script.
echo "🌐 Actualizando DNS..."
./scripts/dr/update-dns.sh "new-server-ip"

# 5. End-to-end verification of the recovered system.
echo "✅ Verificando sistema..."
./scripts/dr/full-system-verification.sh

Escenario 2: Corrupción de Datos

#!/bin/bash
# scripts/dr/data-corruption-response.sh
#
# DR runbook automation for data corruption. The affected area is given
# as the first argument: database | models | config.

CORRUPTION_TYPE=${1:-}

# Validate the argument up front (the original silently fell through
# the case statement on a missing or unknown value).
if [ -z "$CORRUPTION_TYPE" ]; then
  echo "❌ Uso: $0 <database|models|config>"
  exit 1
fi

echo "💥 Respondiendo a corrupción de datos: $CORRUPTION_TYPE"

case "$CORRUPTION_TYPE" in
  "database")
    echo "🗄️ Manejando corrupción de base de datos..."

    # Scope the corruption first.
    ./scripts/dr/assess-db-corruption.sh

    # Try automated verification before falling back to a restore.
    # NOTE(review): pg_checksums requires the server to be STOPPED and
    # --enable rewrites every data block — confirm running this against
    # the live container is intended.
    if docker-compose exec postgres pg_checksums --enable --pgdata=/var/lib/postgresql/data; then
      echo "✅ Checksums habilitados, verificando integridad..."
      if docker-compose exec postgres pg_checksums --check --pgdata=/var/lib/postgresql/data; then
        echo "✅ Base de datos íntegra"
        exit 0
      fi
    fi

    # Verification failed: restore the newest database backup.
    echo "📥 Restaurando desde backup..."
    latest_backup=$(find /backups/database -name "*.sql.gz" | sort -r | head -n1)
    ./scripts/recovery/restore-database.sh "$latest_backup"
    ;;

  "models")
    echo "🤖 Manejando corrupción de modelos ML..."

    # Identify which models are corrupt.
    ./scripts/dr/verify-models-integrity.sh

    # Restore the newest models backup.
    latest_backup=$(find /backups/models -name "*.tar.gz" | sort -r | head -n1)
    ./scripts/recovery/restore-models.sh "$latest_backup"
    ;;

  "config")
    echo "⚙️ Manejando corrupción de configuración..."

    # First try to recover tracked files from git. Test the checkout's
    # status directly in the `if` — the original read $? on a later
    # line, which is fragile if any statement is ever inserted between.
    git stash
    if ! git checkout HEAD -- .; then
      # git was not enough: fall back to the newest config backup.
      latest_backup=$(find /backups/config -name "*.tar.gz" | sort -r | head -n1)
      tar -xzf "$latest_backup"
    fi
    ;;

  *)
    echo "❌ Tipo de corrupción no reconocido: $CORRUPTION_TYPE"
    echo "Valores válidos: database, models, config"
    exit 1
    ;;
esac

echo "✅ Recuperación de corrupción completada"

Infraestructura de DR en la Nube

Terraform Configuration:

# infrastructure/dr/main.tf

# Provider pinned to the dedicated disaster-recovery region.
provider "aws" {
region = var.dr_region
}

# Isolated VPC for the DR environment (10.1.0.0/16 — distinct from
# production address space so both can coexist during failover).
resource "aws_vpc" "dr_vpc" {
cidr_block = "10.1.0.0/16"
enable_dns_hostnames = true
enable_dns_support = true

tags = {
Name = "iris-dr-vpc"
Environment = "disaster-recovery"
}
}

# Two subnets across distinct AZs (an RDS subnet group requires at
# least two availability zones).
# NOTE(review): references data.aws_availability_zones.available, which
# is not declared in this file — confirm the data source exists
# elsewhere in the module.
resource "aws_subnet" "dr_subnet" {
count = 2
vpc_id = aws_vpc.dr_vpc.id
cidr_block = "10.1.${count.index + 1}.0/24"
availability_zone = data.aws_availability_zones.available.names[count.index]

tags = {
Name = "iris-dr-subnet-${count.index + 1}"
}
}

# Security group: HTTP/HTTPS open to the internet; the IRIS service
# port range (8000-8010) restricted to the VPC; all egress allowed.
resource "aws_security_group" "dr_sg" {
name = "iris-dr-sg"
description = "Security group for IRIS DR"
vpc_id = aws_vpc.dr_vpc.id

# Public HTTP
ingress {
from_port = 80
to_port = 80
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
}

# Public HTTPS
ingress {
from_port = 443
to_port = 443
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
}

# Internal service ports, reachable only from inside the DR VPC.
ingress {
from_port = 8000
to_port = 8010
protocol = "tcp"
cidr_blocks = ["10.1.0.0/16"]
}

# Unrestricted egress.
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
}

# Application host; bootstrapped via a templated user-data script that
# receives the docker-compose download URL.
resource "aws_instance" "dr_instance" {
ami = var.ami_id
instance_type = var.instance_type
subnet_id = aws_subnet.dr_subnet[0].id
vpc_security_group_ids = [aws_security_group.dr_sg.id]

user_data = base64encode(templatefile("${path.module}/userdata.sh", {
docker_compose_url = var.docker_compose_url
}))

tags = {
Name = "iris-dr-instance"
Environment = "disaster-recovery"
}
}

# Managed PostgreSQL for the DR environment. Storage autoscaling up to
# 1 TB; encrypted at rest; daily backups retained 7 days.
# skip_final_snapshot = true: no snapshot when the DR stack is torn
# down — acceptable for a transient DR environment.
resource "aws_db_instance" "dr_db" {
identifier = "iris-dr-db"

engine = "postgres"
engine_version = "15.4"
instance_class = "db.t3.medium"

allocated_storage = 100
max_allocated_storage = 1000
storage_type = "gp2"
storage_encrypted = true

db_name = "iris_prod"
username = "iris"
password = var.db_password

vpc_security_group_ids = [aws_security_group.dr_sg.id]
db_subnet_group_name = aws_db_subnet_group.dr_db_subnet_group.name

backup_retention_period = 7
backup_window = "03:00-04:00"
maintenance_window = "sun:04:00-sun:05:00"

skip_final_snapshot = true

tags = {
Name = "iris-dr-database"
Environment = "disaster-recovery"
}
}

# Subnet group spanning both DR subnets, required by the RDS instance.
resource "aws_db_subnet_group" "dr_db_subnet_group" {
name = "iris-dr-db-subnet-group"
subnet_ids = aws_subnet.dr_subnet[*].id

tags = {
Name = "iris-dr-db-subnet-group"
}
}

Este sistema de backup y recuperación proporciona una protección completa para IRIS, asegurando que los datos críticos puedan ser restaurados rápidamente en caso de cualquier tipo de fallo o desastre.