Procedimientos de Backup y Recuperación
Estrategia de Backup
Tipos de Backup
IRIS implementa una estrategia de backup multi-nivel que garantiza la recuperación de datos en diferentes escenarios de fallo.
1. Backup de Base de Datos
Configuración PostgreSQL:
#!/bin/bash
# scripts/backup/database-backup.sh
#
# Full PostgreSQL backup for IRIS: produces a custom-format dump (for
# selective restores with pg_restore) plus a gzip-compressed plain SQL
# dump (for manual inspection), logs the result, and prunes backups
# older than RETENTION_DAYS.
set -euo pipefail

# Configuration
DB_HOST="localhost"
DB_PORT="5432"
DB_NAME="iris_prod"
DB_USER="iris"
BACKUP_DIR="/backups/database"
RETENTION_DAYS=30

mkdir -p "$BACKUP_DIR"

# Timestamped file name shared by both dump formats
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="$BACKUP_DIR/iris_db_$TIMESTAMP.sql"

echo "🗄️ Iniciando backup de base de datos..."
echo "Archivo: $BACKUP_FILE"

# Under set -e any failing step aborts the script; the ERR trap makes
# sure the failure is still reported and logged.
on_error() {
  echo "❌ Error en el backup"
  echo "$(date): Backup falló - $BACKUP_FILE.gz" >> "$BACKUP_DIR/backup.log"
}
trap on_error ERR

# Custom-format dump (compressed, restorable selectively with pg_restore)
docker-compose exec -T postgres pg_dump \
  -h "$DB_HOST" \
  -p "$DB_PORT" \
  -U "$DB_USER" \
  -d "$DB_NAME" \
  --verbose \
  --no-password \
  --format=custom \
  --compress=9 \
  --file="/tmp/backup.dump"

# Copy the dump out of the container (container id quoted)
POSTGRES_CID=$(docker-compose ps -q postgres)
docker cp "$POSTGRES_CID:/tmp/backup.dump" "$BACKUP_FILE.dump"

# Plain SQL dump for manual inspection
docker-compose exec -T postgres pg_dump \
  -h "$DB_HOST" \
  -p "$DB_PORT" \
  -U "$DB_USER" \
  -d "$DB_NAME" \
  --verbose \
  --no-password \
  --clean \
  --if-exists > "$BACKUP_FILE"

# Compress the SQL dump
gzip "$BACKUP_FILE"

# Verify that the compressed dump exists AND is a valid gzip archive
# (mere existence does not prove the dump completed cleanly)
if [ -f "$BACKUP_FILE.gz" ] && gzip -t "$BACKUP_FILE.gz"; then
  echo "✅ Backup completado exitosamente"
  size=$(du -h "$BACKUP_FILE.gz" | cut -f1)
  echo "📦 Tamaño del backup: $size"
  echo "$(date): Backup exitoso - $BACKUP_FILE.gz ($size)" >> "$BACKUP_DIR/backup.log"
else
  echo "❌ Error en el backup"
  echo "$(date): Backup falló - $BACKUP_FILE.gz" >> "$BACKUP_DIR/backup.log"
  exit 1
fi

# Prune old backups (both formats)
echo "🧹 Limpiando backups antiguos (>$RETENTION_DAYS días)..."
find "$BACKUP_DIR" -name "iris_db_*.sql.gz" -mtime "+$RETENTION_DAYS" -delete
find "$BACKUP_DIR" -name "iris_db_*.dump" -mtime "+$RETENTION_DAYS" -delete

echo "✅ Proceso de backup completado"
Backup Incremental:
#!/bin/bash
# scripts/backup/incremental-backup.sh
#
# Incremental backup based on PostgreSQL WAL (Write-Ahead Logging):
# a weekly base backup (Mondays) plus archiving of every WAL segment
# flagged ready for archiving.
set -euo pipefail

BACKUP_DIR="/backups/incremental"
BASE_BACKUP_DIR="$BACKUP_DIR/base"
WAL_BACKUP_DIR="$BACKUP_DIR/wal"

mkdir -p "$BASE_BACKUP_DIR" "$WAL_BACKUP_DIR"

POSTGRES_CID=$(docker-compose ps -q postgres)

# Weekly base backup (date +%u: Monday = 1)
if [ "$(date +%u)" -eq 1 ]; then
  echo "📅 Ejecutando backup base semanal..."
  docker-compose exec -T postgres pg_basebackup \
    -h localhost \
    -U iris \
    -D /tmp/base_backup \
    -Ft \
    -z \
    -P \
    -v
  # Copy the tarred base backup to the host, stamped with today's date
  docker cp "$POSTGRES_CID:/tmp/base_backup/base.tar.gz" \
    "$BASE_BACKUP_DIR/base_$(date +%Y%m%d).tar.gz"
fi

# Archive WAL segments flagged for archiving. The *.ready marker files
# live under pg_wal/archive_status/; stripping the suffix yields the
# corresponding WAL segment name inside pg_wal.
docker-compose exec -T postgres find /var/lib/postgresql/data/pg_wal \
  -name "*.ready" -exec basename {} .ready \; | \
  while IFS= read -r wal_file; do
    docker cp "$POSTGRES_CID:/var/lib/postgresql/data/pg_wal/$wal_file" \
      "$WAL_BACKUP_DIR/"
  done
2. Backup de Modelos ML
Backup de Modelos Entrenados:
#!/bin/bash
# scripts/backup/models-backup.sh
#
# Archives every trained ML model from the ml-classifier container into
# a timestamped tar.gz on the host, and writes a JSON metadata file
# describing the backup contents, size and per-model versions.
set -euo pipefail

# Models path inside the container (base of the tar -C below and of the
# version-probe script)
MODELS_DIR="/app/data/models"
BACKUP_DIR="/backups/models"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)

echo "🤖 Iniciando backup de modelos ML..."

mkdir -p "$BACKUP_DIR"
BACKUP_FILE="$BACKUP_DIR/iris_models_$TIMESTAMP.tar.gz"

# Create the archive inside the container...
docker-compose exec -T ml-classifier tar -czf /tmp/models_backup.tar.gz \
  -C /app/data models/

# ...then copy it to the host (container id quoted)
ML_CID=$(docker-compose ps -q ml-classifier)
docker cp "$ML_CID:/tmp/models_backup.tar.gz" "$BACKUP_FILE"

# Show a sample of the archive contents
echo "📦 Contenido del backup:"
tar -tzf "$BACKUP_FILE" | head -10

# Backup metadata: date, file name, model list, size, model versions.
# The version probe loads each .pth and reads a 'version' key if the
# checkpoint is a dict that carries one.
cat > "$BACKUP_DIR/models_metadata_$TIMESTAMP.json" << EOF
{
  "backup_date": "$(date -Iseconds)",
  "backup_file": "$(basename "$BACKUP_FILE")",
  "models_included": [
    $(tar -tzf "$BACKUP_FILE" | grep "\.pth$\|\.pkl$" | sed 's/.*/"&"/' | paste -sd ',' -)
  ],
  "total_size": "$(du -h "$BACKUP_FILE" | cut -f1)",
  "model_versions": {
    $(docker-compose exec -T ml-classifier python -c "
import json
import torch
import os
versions = {}
models_dir = '$MODELS_DIR'
for file in os.listdir(models_dir):
    if file.endswith('.pth'):
        try:
            model = torch.load(os.path.join(models_dir, file), map_location='cpu')
            if isinstance(model, dict) and 'version' in model:
                versions[file] = model['version']
        except Exception:
            versions[file] = 'unknown'
print(json.dumps(versions, indent=8)[8:-1])  # strip outer braces
")
  }
}
EOF

echo "✅ Backup de modelos completado: $BACKUP_FILE"
3. Backup de Configuración
Backup de Configuraciones del Sistema:
#!/bin/bash
# scripts/backup/config-backup.sh
#
# Archives the system configuration (compose files, env files, nginx,
# per-package requirements/Dockerfiles, scripts, CI config) together
# with the PostgreSQL global objects and a JSON metadata file.
set -euo pipefail

CONFIG_BACKUP_DIR="/backups/config"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="$CONFIG_BACKUP_DIR/iris_config_$TIMESTAMP.tar.gz"

echo "⚙️ Iniciando backup de configuración..."

mkdir -p "$CONFIG_BACKUP_DIR"

# Build the file list with globs expanded NOW. Quoted patterns such as
# "packages/*/requirements.txt" would reach tar as literal strings and
# make it abort; nullglob drops patterns that match nothing.
shopt -s nullglob
CONFIG_FILES=(
  docker-compose.yml
  docker-compose.prod.yml
  .env
  .env.prod
  nginx.conf
  packages/*/requirements.txt
  packages/*/Dockerfile
  scripts/
  .github/
)
shopt -u nullglob

# Keep only entries that actually exist so tar does not fail on one
# missing optional file
EXISTING_FILES=()
for entry in "${CONFIG_FILES[@]}"; do
  [ -e "$entry" ] && EXISTING_FILES+=("$entry")
done

if [ "${#EXISTING_FILES[@]}" -eq 0 ]; then
  echo "❌ No se encontraron archivos de configuración para respaldar"
  exit 1
fi

tar -czf "$BACKUP_FILE" \
  --exclude="*.pyc" \
  --exclude="__pycache__" \
  --exclude=".git" \
  --exclude="node_modules" \
  --exclude="*.log" \
  "${EXISTING_FILES[@]}"

# Also capture database roles/tablespaces (global objects)
docker-compose exec -T postgres pg_dumpall --globals-only > \
  "$CONFIG_BACKUP_DIR/postgres_globals_$TIMESTAMP.sql"

# Backup metadata
cat > "$CONFIG_BACKUP_DIR/config_metadata_$TIMESTAMP.json" << EOF
{
  "backup_date": "$(date -Iseconds)",
  "backup_file": "$(basename "$BACKUP_FILE")",
  "git_commit": "$(git rev-parse HEAD 2>/dev/null || echo 'unknown')",
  "git_branch": "$(git branch --show-current 2>/dev/null || echo 'unknown')",
  "docker_images": [
    $(docker images --format '{{.Repository}}:{{.Tag}}' | grep iris | sed 's/.*/"&"/' | paste -sd ',' -)
  ],
  "total_size": "$(du -h "$BACKUP_FILE" | cut -f1)"
}
EOF

echo "✅ Backup de configuración completado: $BACKUP_FILE"
Automatización de Backups
Configuración de Cron Jobs
# /etc/cron.d/iris-backups
# Scheduled IRIS backup jobs; all output appends to /var/log/iris-backup.log.
# Daily database backup (2:00 AM)
0 2 * * * root /app/scripts/backup/database-backup.sh >> /var/log/iris-backup.log 2>&1
# Incremental (WAL) backup every 6 hours
0 */6 * * * root /app/scripts/backup/incremental-backup.sh >> /var/log/iris-backup.log 2>&1
# Weekly ML models backup (Sundays 3:00 AM)
0 3 * * 0 root /app/scripts/backup/models-backup.sh >> /var/log/iris-backup.log 2>&1
# Monthly configuration backup (1st of the month, 4:00 AM)
0 4 1 * * root /app/scripts/backup/config-backup.sh >> /var/log/iris-backup.log 2>&1
# Daily backup verification (6:00 AM)
0 6 * * * root /app/scripts/backup/verify-backups.sh >> /var/log/iris-backup.log 2>&1
Script de Verificación
#!/bin/bash
# scripts/backup/verify-backups.sh
#
# Daily backup verification: checks that each backup category has a
# recent, non-corrupt backup and that the backup volume has free space.
# Problems are reported through the alerting script. No 'set -e' on
# purpose: a failed category must not stop the remaining checks.

echo "🔍 Verificación diaria de backups - $(date)"
echo "============================================"

BACKUP_BASE="/backups"
ALERTS_SCRIPT="/app/scripts/alerts/send-alert.sh"

#######################################
# Verify the newest backup of one category.
# Arguments: $1 - category subdirectory under $BACKUP_BASE
#            $2 - maximum allowed age in hours
# Returns:   0 when a fresh, valid backup exists; 1 otherwise
#######################################
verify_backup() {
  local backup_type=$1
  local backup_dir="$BACKUP_BASE/$backup_type"
  local max_age_hours=$2
  local latest_backup backup_age_hours

  echo "Verificando $backup_type..."

  # Newest backup by modification time. Lexicographic name sorting
  # (the old 'sort -r | head') mixes .gz and .dump names and is not
  # guaranteed to be mtime order. Parentheses keep -o from binding
  # only the second -name to -printf.
  latest_backup=$(find "$backup_dir" \( -name "*.gz" -o -name "*.dump" \) \
    -printf '%T@ %p\n' 2>/dev/null | sort -n | tail -n1 | cut -d' ' -f2-)

  if [ -z "$latest_backup" ]; then
    echo "❌ No se encontraron backups en $backup_dir"
    "$ALERTS_SCRIPT" "No backups found for $backup_type"
    return 1
  fi

  # Age check (stat -c %Y: mtime as epoch seconds, GNU coreutils)
  backup_age_hours=$(( ($(date +%s) - $(stat -c %Y "$latest_backup")) / 3600 ))
  if [ "$backup_age_hours" -gt "$max_age_hours" ]; then
    echo "⚠️ Backup de $backup_type muy antiguo: $backup_age_hours horas"
    "$ALERTS_SCRIPT" "Backup for $backup_type is $backup_age_hours hours old"
    return 1
  fi

  # File-format integrity check, chosen by extension
  case "$latest_backup" in
    *.gz)
      if ! gzip -t "$latest_backup" 2>/dev/null; then
        echo "❌ Backup de $backup_type corrupto"
        "$ALERTS_SCRIPT" "Backup corruption detected for $backup_type"
        return 1
      fi
      ;;
    *.dump)
      if ! file "$latest_backup" | grep -q "PostgreSQL"; then
        echo "❌ Backup de $backup_type no es válido"
        "$ALERTS_SCRIPT" "Invalid backup format for $backup_type"
        return 1
      fi
      ;;
  esac

  echo "✅ Backup de $backup_type: OK ($backup_age_hours horas)"
  return 0
}

# One check per category; thresholds mirror the cron schedule plus slack.
verify_backup "database" 25   # daily backup + 1h slack
verify_backup "models" 168    # weekly backup
verify_backup "config" 744    # monthly backup

# Disk usage report
backup_space_used=$(du -sh "$BACKUP_BASE" | cut -f1)
backup_space_available=$(df -h "$BACKUP_BASE" | tail -1 | awk '{print $4}')
echo "💾 Espacio usado por backups: $backup_space_used"
echo "💾 Espacio disponible: $backup_space_available"

# Alert when under 10 GB remain (df reports 1K blocks; /1024/1024 -> GB)
available_gb=$(df --output=avail "$BACKUP_BASE" | tail -1)
available_gb=$((available_gb / 1024 / 1024))
if [ "$available_gb" -lt 10 ]; then
  echo "⚠️ Poco espacio disponible para backups: ${available_gb}GB"
  "$ALERTS_SCRIPT" "Low disk space for backups: ${available_gb}GB remaining"
fi

echo "Verificación completada"
Procedimientos de Recuperación
Recuperación de Base de Datos
Restauración Completa
#!/bin/bash
# scripts/recovery/restore-database.sh
#
# Restore the IRIS database from a backup file (.sql.gz, .dump or .sql).
# A safety dump of the current database is taken first and restored
# automatically if loading the backup fails.

BACKUP_FILE=${1:-}
DB_NAME="iris_prod"
DB_USER="iris"

if [ -z "$BACKUP_FILE" ]; then
  echo "❌ Uso: $0 <backup_file>"
  echo "Ejemplo: $0 /backups/database/iris_db_20240115_020000.sql.gz"
  exit 1
fi

if [ ! -f "$BACKUP_FILE" ]; then
  echo "❌ Archivo de backup no encontrado: $BACKUP_FILE"
  exit 1
fi

echo "🚨 ADVERTENCIA: Esta operación sobrescribirá la base de datos actual"
echo "Archivo de backup: $BACKUP_FILE"
echo "Base de datos objetivo: $DB_NAME"
read -p "¿Continuar? (yes/no): " confirm

if [ "$confirm" != "yes" ]; then
  echo "Operación cancelada"
  exit 0
fi

echo "🗄️ Iniciando restauración de base de datos..."

# Stop the services that hold database connections
echo "⏹️ Deteniendo servicios..."
docker-compose stop api-gateway ml-classifier ml-embeddings ocr-extractor

POSTGRES_CID=$(docker-compose ps -q postgres)

# Safety dump of the current database, used for rollback below
echo "📦 Creando backup de seguridad..."
SAFETY_BACKUP="/tmp/iris_safety_backup_$(date +%s).sql"
docker-compose exec -T postgres pg_dump -U "$DB_USER" -d "$DB_NAME" > "$SAFETY_BACKUP"
echo "Backup de seguridad guardado en: $SAFETY_BACKUP"

# Normalize whichever format we received into a plain SQL file
TEMP_BACKUP="/tmp/restore_backup.sql"
case "$BACKUP_FILE" in
  *.gz)
    echo "📦 Descomprimiendo backup..."
    gunzip -c "$BACKUP_FILE" > "$TEMP_BACKUP"
    ;;
  *.dump)
    echo "📦 Convirtiendo backup custom format..."
    # pg_restore runs INSIDE the container, so the host-side dump must
    # be copied in first — passing the host path directly would fail.
    docker cp "$BACKUP_FILE" "$POSTGRES_CID:/tmp/restore_source.dump"
    docker-compose exec -T postgres pg_restore --clean --if-exists \
      -f /tmp/restore_backup.sql /tmp/restore_source.dump
    docker cp "$POSTGRES_CID:/tmp/restore_backup.sql" "$TEMP_BACKUP"
    ;;
  *.sql)
    cp "$BACKUP_FILE" "$TEMP_BACKUP"
    ;;
  *)
    echo "❌ Formato de backup no soportado"
    exit 1
    ;;
esac

echo "📥 Restaurando base de datos..."

# Kill any remaining connections so the database can be dropped
docker-compose exec -T postgres psql -U "$DB_USER" -d postgres -c "
SELECT pg_terminate_backend(pg_stat_activity.pid)
FROM pg_stat_activity
WHERE pg_stat_activity.datname = '$DB_NAME'
  AND pid <> pg_backend_pid();
"

# Recreate the database from scratch
docker-compose exec -T postgres psql -U "$DB_USER" -d postgres -c "
DROP DATABASE IF EXISTS $DB_NAME;
CREATE DATABASE $DB_NAME OWNER $DB_USER;
"

# Load the SQL dump; on failure, roll back to the safety dump
if docker-compose exec -T postgres psql -U "$DB_USER" -d "$DB_NAME" < "$TEMP_BACKUP"; then
  echo "✅ Base de datos restaurada exitosamente"

  echo "🔍 Verificando integridad..."
  row_count=$(docker-compose exec -T postgres psql -U "$DB_USER" -d "$DB_NAME" -t -c "
  SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'public';
  " | xargs)
  echo "📊 Tablas restauradas: $row_count"

  rm -f "$TEMP_BACKUP"
else
  echo "❌ Error en la restauración"
  echo "💡 Restaurando backup de seguridad..."
  docker-compose exec -T postgres psql -U "$DB_USER" -d postgres -c "
  DROP DATABASE IF EXISTS $DB_NAME;
  CREATE DATABASE $DB_NAME OWNER $DB_USER;
  "
  docker-compose exec -T postgres psql -U "$DB_USER" -d "$DB_NAME" < "$SAFETY_BACKUP"
  echo "🔄 Backup de seguridad restaurado"
  rm -f "$TEMP_BACKUP"
  exit 1
fi

# Bring the dependent services back up
echo "▶️ Reiniciando servicios..."
docker-compose start api-gateway ml-classifier ml-embeddings ocr-extractor

# Smoke test through the API gateway
echo "🏥 Verificando funcionamiento..."
sleep 10
if curl -f -s http://localhost:8000/health > /dev/null; then
  echo "✅ Servicios funcionando correctamente"
else
  echo "⚠️ Servicios no responden - verificar logs"
  docker-compose logs --tail=20 api-gateway
fi

echo "🎉 Restauración completada"
Point-in-Time Recovery (PITR)
#!/bin/bash
# scripts/recovery/pitr-restore.sh
#
# Point-in-Time Recovery: restores the most recent base backup taken at
# or before TARGET_TIME and replays archived WAL up to that point.
# PostgreSQL 12+ removed recovery.conf: recovery is configured through
# postgresql.auto.conf plus a recovery.signal file, and completion is
# detected via pg_is_in_recovery() (recovery.done no longer exists).

TARGET_TIME=${1:-}
BASE_BACKUP_DIR="/backups/incremental/base"
WAL_BACKUP_DIR="/backups/incremental/wal"

if [ -z "$TARGET_TIME" ]; then
  echo "❌ Uso: $0 'YYYY-MM-DD HH:MM:SS'"
  echo "Ejemplo: $0 '2024-01-15 14:30:00'"
  exit 1
fi

echo "⏰ Restauración Point-in-Time a: $TARGET_TIME"

# Pick the newest base backup dated at or before the target time.
# Base backups are named base_YYYYMMDD.tar.gz.
echo "🔍 Buscando backup base apropiado..."
target_timestamp=$(date -d "$TARGET_TIME" +%s) || exit 1
BASE_BACKUP=$(find "$BASE_BACKUP_DIR" -name "base_*.tar.gz" | \
  sort | \
  while IFS= read -r backup; do
    backup_date=$(basename "$backup" | sed 's/base_\([0-9]*\).*/\1/')
    backup_timestamp=$(date -d "${backup_date:0:4}-${backup_date:4:2}-${backup_date:6:2}" +%s)
    if [ "$backup_timestamp" -le "$target_timestamp" ]; then
      echo "$backup"
    fi
  done | tail -1)

if [ -z "$BASE_BACKUP" ]; then
  echo "❌ No se encontró backup base apropiado"
  exit 1
fi

echo "📦 Usando backup base: $BASE_BACKUP"

docker-compose stop postgres

# Wipe the data directory. ${VAR:?} aborts instead of expanding to '/'
# if the variable were ever empty.
POSTGRES_DATA_DIR="/var/lib/docker/volumes/iris_postgres_data/_data"
sudo rm -rf "${POSTGRES_DATA_DIR:?}"/*

echo "📥 Restaurando backup base..."
sudo tar -xzf "$BASE_BACKUP" -C "$POSTGRES_DATA_DIR"

# Configure recovery (PostgreSQL 12+): target settings go into
# postgresql.auto.conf; recovery mode is enabled by recovery.signal.
# The data directory is root-owned on the host, hence sudo tee.
# NOTE(review): restore_command runs INSIDE the container, so
# $WAL_BACKUP_DIR must be mounted there at the same path — verify.
sudo tee -a "$POSTGRES_DATA_DIR/postgresql.auto.conf" > /dev/null << EOF
restore_command = 'cp $WAL_BACKUP_DIR/%f %p'
recovery_target_time = '$TARGET_TIME'
recovery_target_action = 'promote'
EOF
sudo touch "$POSTGRES_DATA_DIR/recovery.signal"

echo "🔄 Iniciando proceso de recovery..."
docker-compose start postgres

# Recovery is done once the server accepts connections and
# pg_is_in_recovery() reports 'f'. Bounded wait (~10 min) so a stuck
# recovery does not loop forever.
echo "📊 Monitoreando recovery..."
attempts=0
while :; do
  in_recovery=$(docker-compose exec -T postgres \
    psql -U iris -d postgres -tAc "SELECT pg_is_in_recovery();" 2>/dev/null || echo "")
  if [ "$in_recovery" = "f" ]; then
    echo "✅ Recovery completado"
    break
  fi
  attempts=$((attempts + 1))
  if [ "$attempts" -ge 120 ]; then
    echo "❌ Timeout esperando el recovery"
    exit 1
  fi
  echo "⏳ Recovery en progreso..."
  sleep 5
done

# Final sanity check against the restored database
echo "🏥 Verificando estado de la base de datos..."
if docker-compose exec -T postgres psql -U iris -d iris_prod -c "SELECT NOW();"; then
  echo "✅ Base de datos operativa"
else
  echo "❌ Problemas con la base de datos"
  exit 1
fi

echo "🎉 Point-in-Time Recovery completado"
Recuperación de Modelos ML
#!/bin/bash
# scripts/recovery/restore-models.sh
#
# Restore ML models from a tar.gz backup. The current models are saved
# first and restored automatically if the new ones fail verification.
#
# Staging strategy: files are copied into /app/data while the service
# container is still running, because /app/data is volume-backed and
# therefore survives the stop/run cycles below, whereas /tmp inside a
# 'docker-compose run --rm' container does not.
# NOTE(review): assumes /app/data is a named volume — verify in compose.

BACKUP_FILE=${1:-}
MODELS_DIR="/app/data/models"

if [ -z "$BACKUP_FILE" ]; then
  echo "❌ Uso: $0 <models_backup_file>"
  echo "Ejemplo: $0 /backups/models/iris_models_20240115_030000.tar.gz"
  exit 1
fi

if [ ! -f "$BACKUP_FILE" ]; then
  echo "❌ Archivo de backup no encontrado: $BACKUP_FILE"
  exit 1
fi

echo "🤖 Restaurando modelos ML desde: $BACKUP_FILE"

ML_CID=$(docker-compose ps -q ml-classifier)

# Safety copy of the current models, taken while the container runs
echo "📦 Respaldando modelos actuales..."
CURRENT_BACKUP="/tmp/current_models_$(date +%s).tar.gz"
docker-compose exec -T ml-classifier tar -czf /app/data/current_models_safety.tar.gz -C /app/data models/
docker cp "$ML_CID:/app/data/current_models_safety.tar.gz" "$CURRENT_BACKUP"
echo "Modelos actuales respaldados en: $CURRENT_BACKUP"

# Stage the backup to restore. This must happen BEFORE stopping the
# services and BEFORE any extraction attempt (extracting a file that
# has not been copied in yet cannot work).
docker cp "$BACKUP_FILE" "$ML_CID:/app/data/restore_models.tar.gz"

echo "⏹️ Deteniendo servicios ML..."
docker-compose stop ml-classifier ml-embeddings

echo "🧹 Limpiando directorio de modelos..."
docker-compose run --rm ml-classifier sh -c "rm -rf $MODELS_DIR/*"

echo "📥 Restaurando modelos..."
docker-compose run --rm ml-classifier tar -xzf /app/data/restore_models.tar.gz -C /app/data/

echo "🔍 Verificando modelos restaurados..."
model_count=$(docker-compose run --rm ml-classifier find "$MODELS_DIR" -name "*.pth" | wc -l)
echo "📊 Modelos encontrados: $model_count"

# Try loading every model to detect corrupt files
echo "🧪 Verificando integridad de modelos..."
docker-compose run --rm ml-classifier python -c "
import torch
import os
import sys
models_dir = '$MODELS_DIR'
errors = []
for file in os.listdir(models_dir):
    if file.endswith('.pth'):
        try:
            model = torch.load(os.path.join(models_dir, file), map_location='cpu')
            print(f'✅ {file}: OK')
        except Exception as e:
            print(f'❌ {file}: Error - {e}')
            errors.append(file)
if errors:
    print(f'Errores en {len(errors)} modelos')
    sys.exit(1)
else:
    print('Todos los modelos verificados correctamente')
"

if [ $? -eq 0 ]; then
  echo "✅ Modelos restaurados e integrados correctamente"

  # Remove the staged archives from the volume
  docker-compose run --rm ml-classifier rm -f /app/data/restore_models.tar.gz /app/data/current_models_safety.tar.gz

  echo "▶️ Reiniciando servicios ML..."
  docker-compose start ml-classifier ml-embeddings

  echo "🏥 Verificando funcionamiento..."
  sleep 15
  if curl -f -s http://localhost:8003/health > /dev/null; then
    echo "✅ Servicios ML funcionando correctamente"
  else
    echo "⚠️ Servicios ML no responden"
    docker-compose logs --tail=10 ml-classifier
  fi
else
  echo "❌ Error en la verificación de modelos"
  echo "🔄 Restaurando modelos anteriores..."
  # Roll back to the safety copy staged in the volume earlier
  docker-compose run --rm ml-classifier sh -c "rm -rf $MODELS_DIR/* && tar -xzf /app/data/current_models_safety.tar.gz -C /app/data/"
  echo "💡 Modelos anteriores restaurados"
  exit 1
fi

echo "🎉 Restauración de modelos completada"
Recuperación Completa del Sistema
#!/bin/bash
# scripts/recovery/full-system-restore.sh
#
# Full IRIS system recovery: restores configuration, database and ML
# models from the three given backup files, phase by phase, after
# taking a safety snapshot of the Docker data volumes.

DB_BACKUP=${1:-}
MODELS_BACKUP=${2:-}
CONFIG_BACKUP=${3:-}

if [ $# -ne 3 ]; then
  echo "❌ Uso: $0 <db_backup> <models_backup> <config_backup>"
  echo "Ejemplo:"
  echo "  $0 /backups/database/iris_db_20240115.sql.gz \\"
  echo "     /backups/models/iris_models_20240115.tar.gz \\"
  echo "     /backups/config/iris_config_20240115.tar.gz"
  exit 1
fi

echo "🚨 RECUPERACIÓN COMPLETA DEL SISTEMA IRIS"
echo "========================================"
echo "Base de datos: $DB_BACKUP"
echo "Modelos ML: $MODELS_BACKUP"
echo "Configuración: $CONFIG_BACKUP"
echo ""
read -p "⚠️ Esto sobrescribirá todo el sistema. ¿Continuar? (yes/no): " confirm

if [ "$confirm" != "yes" ]; then
  echo "Operación cancelada"
  exit 0
fi

# Mirror all output (stdout and stderr) into a recovery log
RECOVERY_LOG="/tmp/iris_recovery_$(date +%s).log"
exec > >(tee -a "$RECOVERY_LOG")
exec 2>&1

echo "📋 Log de recuperación: $RECOVERY_LOG"
echo "⏰ Inicio: $(date)"

#######################################
# Report a step result and abort on failure.
# Arguments: $1 - step description
#            $2 - exit status of the step (pass $? explicitly right
#                 after the command so nothing can mask the status)
#######################################
check_success() {
  local step=$1
  local status=$2
  if [ "$status" -eq 0 ]; then
    echo "✅ $step: Exitoso"
  else
    echo "❌ $step: Falló"
    echo "🚨 Recuperación abortada. Ver log: $RECOVERY_LOG"
    exit 1
  fi
}

# 1. Stop the whole stack
echo ""
echo "🟡 FASE 1: Deteniendo servicios"
docker-compose down
check_success "Detener servicios" $?

# 2. Safety snapshot of the Docker data volumes
echo ""
echo "🟡 FASE 2: Backup de seguridad"
SAFETY_DIR="/tmp/iris_safety_$(date +%s)"
mkdir -p "$SAFETY_DIR"

docker run --rm -v iris_postgres_data:/data -v "$SAFETY_DIR:/backup" \
  ubuntu tar -czf /backup/postgres_data.tar.gz -C /data .
check_success "Backup de datos PostgreSQL" $?

docker run --rm -v iris_redis_data:/data -v "$SAFETY_DIR:/backup" \
  ubuntu tar -czf /backup/redis_data.tar.gz -C /data .
check_success "Backup de datos Redis" $?

echo "🔒 Backup de seguridad en: $SAFETY_DIR"

# 3. Restore configuration
echo ""
echo "🟡 FASE 3: Restaurando configuración"
tar -xzf "$CONFIG_BACKUP" -C /tmp/
check_success "Extraer configuración" $?

# Check every copy individually — a single trailing check would only
# see the status of the last cp.
# NOTE(review): assumes the archive stores these files at its top level.
cp /tmp/docker-compose.yml ./
check_success "Copiar docker-compose.yml" $?
cp /tmp/.env ./
check_success "Copiar .env" $?
cp /tmp/nginx.conf ./
check_success "Copiar nginx.conf" $?

# 4. Recreate the data volumes from scratch
echo ""
echo "🟡 FASE 4: Recreando volúmenes"
docker volume rm iris_postgres_data iris_redis_data 2>/dev/null || true
docker volume create iris_postgres_data && docker volume create iris_redis_data
check_success "Recrear volúmenes" $?

# 5. Restore the database (interactive: the script asks to confirm)
echo ""
echo "🟡 FASE 5: Restaurando base de datos"
./scripts/recovery/restore-database.sh "$DB_BACKUP"
check_success "Restaurar base de datos" $?

# 6. Restore the ML models
echo ""
echo "🟡 FASE 6: Restaurando modelos ML"
./scripts/recovery/restore-models.sh "$MODELS_BACKUP"
check_success "Restaurar modelos ML" $?

# 7. Bring the full stack up
echo ""
echo "🟡 FASE 7: Iniciando servicios"
docker-compose up -d
check_success "Iniciar servicios" $?

# 8. End-to-end verification
echo ""
echo "🟡 FASE 8: Verificación integral"
echo "⏳ Esperando inicialización de servicios..."
sleep 30

# Per-service health endpoints through the gateway
services=("api-gateway" "image-processor" "ml-embeddings" "ml-classifier" "ocr-extractor")
for service in "${services[@]}"; do
  if curl -f -s "http://localhost:8000/services/$service/health" > /dev/null; then
    echo "✅ $service: Operativo"
  else
    echo "❌ $service: No responde"
    echo "📋 Logs de $service:"
    docker-compose logs --tail=5 "$service"
  fi
done

# Database table count as a coarse restore check
db_tables=$(docker-compose exec -T postgres psql -U iris -d iris_prod -t -c "
SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'public';
" | xargs)
echo "📊 Tablas en base de datos: $db_tables"

# Restored model count
model_count=$(docker-compose exec -T ml-classifier find /app/data/models -name "*.pth" | wc -l)
echo "🤖 Modelos ML disponibles: $model_count"

# Basic functional test through the gateway
echo ""
echo "🧪 Test funcional básico..."
if curl -f -s "http://localhost:8000/health" > /dev/null; then
  echo "✅ API Gateway responde"
  # Exercise the full pipeline if a sample image is available
  if [ -f "/app/data/test/sample.jpg" ]; then
    echo "📄 Probando pipeline completo..."
    response=$(curl -s -X POST -F "file=@/app/data/test/sample.jpg" \
      "http://localhost:8000/process")
    if echo "$response" | grep -q "success"; then
      echo "✅ Pipeline completo funcional"
    else
      echo "⚠️ Pipeline con problemas - verificar manualmente"
    fi
  fi
else
  echo "❌ API Gateway no responde"
fi

echo ""
echo "🎉 RECUPERACIÓN COMPLETA FINALIZADA"
echo "⏰ Fin: $(date)"
echo "📋 Log completo: $RECOVERY_LOG"
echo "🔒 Backup de seguridad: $SAFETY_DIR"
echo ""
echo "📝 PRÓXIMOS PASOS:"
echo "1. Verificar funcionamiento completo del sistema"
echo "2. Ejecutar tests de integración si están disponibles"
echo "3. Notificar a usuarios que el sistema está operativo"
echo "4. Revisar logs para asegurar funcionamiento normal"
echo "5. Programar backup inmediato del sistema restaurado"
Disaster Recovery
Plan de Contingencia
RTO/RPO Objectives
Recovery Time Objective (RTO):
- Crítico (P1): 4 horas
- Alto (P2): 24 horas
- Medio (P3): 72 horas
Recovery Point Objective (RPO):
- Base de datos: 1 hora (backup incremental)
- Modelos ML: 7 días (backup semanal)
- Configuración: 30 días (backup mensual)
Escenarios de Disaster Recovery
Escenario 1: Fallo de Hardware
#!/bin/bash
# scripts/dr/hardware-failure-response.sh
#
# Disaster-recovery playbook for a hardware failure: assess damage,
# provision the cloud DR infrastructure, restore the newest backups,
# repoint DNS and verify the full system.
set -euo pipefail

echo "🖥️ Respondiendo a fallo de hardware..."

# 1. Assess the extent of the damage
echo "🔍 Evaluando daño..."
./scripts/dr/assess-damage.sh

# 2. Bring up the pre-configured cloud DR infrastructure
echo "🔄 Activando infraestructura de backup..."
terraform apply -var="environment=disaster-recovery" infrastructure/dr/

# 3. Restore from the newest backups, selected by modification time
#    (lexicographic name sorting is not guaranteed to be newest-first).
echo "📥 Restaurando desde backups..."
latest_db_backup=$(find /backups/database -name "*.sql.gz" -printf '%T@ %p\n' | sort -n | tail -n1 | cut -d' ' -f2-)
latest_models_backup=$(find /backups/models -name "*.tar.gz" -printf '%T@ %p\n' | sort -n | tail -n1 | cut -d' ' -f2-)
latest_config_backup=$(find /backups/config -name "*.tar.gz" -printf '%T@ %p\n' | sort -n | tail -n1 | cut -d' ' -f2-)

# Abort early if any backup category is missing
for backup in "$latest_db_backup" "$latest_models_backup" "$latest_config_backup"; do
  if [ -z "$backup" ]; then
    echo "❌ Falta algún backup necesario para la recuperación"
    exit 1
  fi
done

./scripts/recovery/full-system-restore.sh \
  "$latest_db_backup" \
  "$latest_models_backup" \
  "$latest_config_backup"

# 4. Point DNS at the new infrastructure
echo "🌐 Actualizando DNS..."
./scripts/dr/update-dns.sh "new-server-ip"

# 5. Full system verification
echo "✅ Verificando sistema..."
./scripts/dr/full-system-verification.sh
Escenario 2: Corrupción de Datos
#!/bin/bash
# scripts/dr/data-corruption-response.sh
#
# Disaster-recovery playbook for data corruption. The corruption type
# ($1: database | models | config) selects the recovery path.

CORRUPTION_TYPE=${1:-}

echo "💥 Respondiendo a corrupción de datos: $CORRUPTION_TYPE"

case "$CORRUPTION_TYPE" in
  "database")
    echo "🗄️ Manejando corrupción de base de datos..."

    # Scope the corruption first
    ./scripts/dr/assess-db-corruption.sh

    # Try an integrity check before falling back to a restore.
    # pg_checksums requires the server to be OFFLINE, so stop postgres
    # around the check and start it again afterwards.
    # NOTE(review): --check only works if data checksums were enabled
    # when the cluster was initialized — verify.
    docker-compose stop postgres
    if docker-compose run --rm postgres pg_checksums --check --pgdata=/var/lib/postgresql/data; then
      echo "✅ Base de datos íntegra"
      docker-compose start postgres
      exit 0
    fi
    docker-compose start postgres

    # Checks failed: restore the newest backup (selected by mtime, not
    # by lexicographic file-name order)
    echo "📥 Restaurando desde backup..."
    latest_backup=$(find /backups/database -name "*.sql.gz" -printf '%T@ %p\n' | sort -n | tail -n1 | cut -d' ' -f2-)
    ./scripts/recovery/restore-database.sh "$latest_backup"
    ;;
  "models")
    echo "🤖 Manejando corrupción de modelos ML..."

    # Identify which model files are corrupt
    ./scripts/dr/verify-models-integrity.sh

    # Restore the newest models backup
    latest_backup=$(find /backups/models -name "*.tar.gz" -printf '%T@ %p\n' | sort -n | tail -n1 | cut -d' ' -f2-)
    ./scripts/recovery/restore-models.sh "$latest_backup"
    ;;
  "config")
    echo "⚙️ Manejando corrupción de configuración..."

    # Prefer git for configuration recovery; fall back to the backup
    git stash
    if ! git checkout HEAD -- .; then
      latest_backup=$(find /backups/config -name "*.tar.gz" -printf '%T@ %p\n' | sort -n | tail -n1 | cut -d' ' -f2-)
      tar -xzf "$latest_backup"
    fi
    ;;
  *)
    echo "❌ Tipo de corrupción desconocido: '$CORRUPTION_TYPE' (use database|models|config)"
    exit 1
    ;;
esac

echo "✅ Recuperación de corrupción completada"
Infraestructura de DR en la Nube
Terraform Configuration:
# infrastructure/dr/main.tf
#
# Disaster-recovery infrastructure for IRIS on AWS: a dedicated VPC
# with two subnets, a security group, one EC2 instance for the
# application stack, and an RDS PostgreSQL instance.
# NOTE(review): this file references var.dr_region, var.ami_id,
# var.instance_type, var.docker_compose_url, var.db_password and
# data.aws_availability_zones.available, which must be declared in
# sibling files — verify they exist.
provider "aws" {
region = var.dr_region
}
# DR VPC — 10.1.0.0/16, separate address space from production
resource "aws_vpc" "dr_vpc" {
cidr_block = "10.1.0.0/16"
enable_dns_hostnames = true
enable_dns_support = true
tags = {
Name = "iris-dr-vpc"
Environment = "disaster-recovery"
}
}
# DR Subnets — two /24s (10.1.1.0/24, 10.1.2.0/24), one per AZ,
# satisfying the 2-AZ minimum required by the DB subnet group below
resource "aws_subnet" "dr_subnet" {
count = 2
vpc_id = aws_vpc.dr_vpc.id
cidr_block = "10.1.${count.index + 1}.0/24"
availability_zone = data.aws_availability_zones.available.names[count.index]
tags = {
Name = "iris-dr-subnet-${count.index + 1}"
}
}
# DR Security Group — HTTP/HTTPS open to the world; the service port
# range 8000-8010 is only reachable from inside the DR VPC
resource "aws_security_group" "dr_sg" {
name = "iris-dr-sg"
description = "Security group for IRIS DR"
vpc_id = aws_vpc.dr_vpc.id
ingress {
from_port = 80
to_port = 80
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
}
ingress {
from_port = 443
to_port = 443
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
}
# Internal service ports (API gateway + microservices)
ingress {
from_port = 8000
to_port = 8010
protocol = "tcp"
cidr_blocks = ["10.1.0.0/16"]
}
# Unrestricted egress
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
}
# DR EC2 Instance — bootstrapped via userdata.sh, which receives the
# compose file URL as a template variable
resource "aws_instance" "dr_instance" {
ami = var.ami_id
instance_type = var.instance_type
subnet_id = aws_subnet.dr_subnet[0].id
vpc_security_group_ids = [aws_security_group.dr_sg.id]
user_data = base64encode(templatefile("${path.module}/userdata.sh", {
docker_compose_url = var.docker_compose_url
}))
tags = {
Name = "iris-dr-instance"
Environment = "disaster-recovery"
}
}
# DR RDS Instance — PostgreSQL 15.4, encrypted gp2 storage with
# autoscaling from 100 GB up to 1 TB, 7-day automated backups.
# skip_final_snapshot = true: tearing down DR leaves no snapshot.
resource "aws_db_instance" "dr_db" {
identifier = "iris-dr-db"
engine = "postgres"
engine_version = "15.4"
instance_class = "db.t3.medium"
allocated_storage = 100
max_allocated_storage = 1000
storage_type = "gp2"
storage_encrypted = true
db_name = "iris_prod"
username = "iris"
password = var.db_password
vpc_security_group_ids = [aws_security_group.dr_sg.id]
db_subnet_group_name = aws_db_subnet_group.dr_db_subnet_group.name
backup_retention_period = 7
backup_window = "03:00-04:00"
maintenance_window = "sun:04:00-sun:05:00"
skip_final_snapshot = true
tags = {
Name = "iris-dr-database"
Environment = "disaster-recovery"
}
}
# DR Database Subnet Group — spans both DR subnets so RDS can place
# the instance across availability zones
resource "aws_db_subnet_group" "dr_db_subnet_group" {
name = "iris-dr-db-subnet-group"
subnet_ids = aws_subnet.dr_subnet[*].id
tags = {
Name = "iris-dr-db-subnet-group"
}
}
Este sistema de backup y recuperación proporciona una protección completa para IRIS, asegurando que los datos críticos puedan ser restaurados rápidamente en caso de cualquier tipo de fallo o desastre.