Fases 3-4: Clasificación de Documentos
Propósito
Las Fases 3 y 4 del pipeline IRIS se encargan de la clasificación supervisada de documentos utilizando los clusters descubiertos en la Fase 2. La Fase 3 entrena modelos de clasificación específicos, mientras que la Fase 4 aplica estos modelos para clasificar nuevos documentos.
Arquitectura del Servicio
Componentes Principales
Tecnologías Utilizadas
- PyTorch: Framework principal de deep learning
- EfficientNet: Arquitectura de CNN optimizada para clasificación
- Albumentations: Augmentación avanzada de imágenes
- TensorBoard: Monitoreo y visualización del entrenamiento
- FastAPI: API REST para servicio de clasificación
- MLflow: Tracking de experimentos y versionado de modelos
Funcionalidades
Fase 3: Entrenamiento de Clasificadores
1. Preparación de Datos
El sistema utiliza los resultados del clustering de la Fase 2 como etiquetas para entrenar clasificadores supervisados.
Data Pipeline:
import torch
from torch.utils.data import Dataset, DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2
import cv2
import numpy as np
class DocumentDataset(Dataset):
    """Dataset of document images labeled with cluster/class ids."""

    def __init__(self, image_paths, labels, transform=None, mode='train'):
        """
        Args:
            image_paths: list of image file paths
            labels: list of cluster/class labels, one per image
            transform: optional albumentations transform applied to each image
            mode: 'train', 'val' or 'test' (kept for caller bookkeeping)
        """
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform
        self.mode = mode
        # Map arbitrary label values to consecutive indices 0..K-1
        unique_labels = sorted(set(labels))
        self.label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
        self.idx_to_label = {idx: label for label, idx in self.label_to_idx.items()}
        self.num_classes = len(unique_labels)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        path = self.image_paths[idx]
        image = cv2.imread(path)
        if image is None:
            # cv2.imread silently returns None for missing/corrupt files;
            # fail loudly here instead of crashing opaquely in cvtColor.
            raise FileNotFoundError(f"Could not read image: {path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # Translate the raw label into its consecutive class index
        label_idx = self.label_to_idx[self.labels[idx]]
        if self.transform:
            image = self.transform(image=image)['image']
        return {
            'image': image,
            'label': torch.tensor(label_idx, dtype=torch.long),
            'image_path': path
        }
def get_training_transforms():
    """Augmentation pipeline for training.

    Resize now runs BEFORE Normalize so interpolation happens on raw pixel
    values, matching the order used by get_validation_transforms().
    """
    return A.Compose([
        # Geometric augmentations
        A.Rotate(limit=5, p=0.3),
        A.HorizontalFlip(p=0.2),
        A.ShiftScaleRotate(
            shift_limit=0.1, scale_limit=0.1, rotate_limit=5, p=0.3
        ),
        # Photometric augmentations
        A.RandomBrightnessContrast(
            brightness_limit=0.2, contrast_limit=0.2, p=0.3
        ),
        A.GaussNoise(var_limit=(10.0, 50.0), p=0.2),
        A.GaussianBlur(blur_limit=3, p=0.1),
        # Resize, then normalize with ImageNet statistics, then to tensor
        A.Resize(224, 224),
        A.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        ),
        ToTensorV2()
    ])
def get_validation_transforms():
    """Deterministic preprocessing for validation/inference (no augmentation)."""
    resize = A.Resize(224, 224)
    # ImageNet channel statistics, matching the pretrained backbone
    normalize = A.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
    return A.Compose([resize, normalize, ToTensorV2()])
2. Arquitectura del Modelo
Utiliza EfficientNet como backbone con fine-tuning específico para documentos.
Modelo de Clasificación:
import torch.nn as nn
import timm
class DocumentClassifier(nn.Module):
    """EfficientNet-backed document classifier.

    Args:
        num_classes: number of document types to predict
        model_name: timm model identifier used as the feature extractor
        pretrained: whether to load pretrained backbone weights
    """

    def __init__(self, num_classes, model_name='efficientnet_b3', pretrained=True):
        super().__init__()
        # Pretrained feature extractor with its classification head removed
        self.backbone = timm.create_model(
            model_name,
            pretrained=pretrained,
            num_classes=0,
            global_pool='avg'
        )
        self.feature_dim = self.backbone.num_features
        # Dropout applied to pooled backbone features for regularization
        self.dropout = nn.Dropout(0.3)
        # MLP head: feature_dim -> 512 -> 256 -> num_classes
        head = [
            nn.Linear(self.feature_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head)
        self._initialize_weights()

    def _initialize_weights(self):
        """Xavier-initialize the linear layers of the classification head."""
        linear_layers = (m for m in self.classifier.modules()
                         if isinstance(m, nn.Linear))
        for layer in linear_layers:
            nn.init.xavier_normal_(layer.weight)
            nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        """Return class logits for a batch of images."""
        return self.classifier(self.dropout(self.backbone(x)))

    def get_features(self, x):
        """Backbone features only (no grad) — for analysis/embeddings."""
        with torch.no_grad():
            return self.backbone(x)
3. Entrenamiento
Sistema completo de entrenamiento con monitoreo y validación.
Training Loop:
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import mlflow
import mlflow.pytorch
from sklearn.metrics import classification_report, confusion_matrix
class DocumentTrainer:
    """Training/validation loop for DocumentClassifier with MLflow tracking.

    Args:
        model: DocumentClassifier (or compatible nn.Module)
        train_loader: DataLoader yielding {'image', 'label', ...} batches
        val_loader: DataLoader for validation batches
        device: torch device string
    """

    def __init__(self, model, train_loader, val_loader, device='cuda'):
        self.model = model.to(device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.device = device
        self.optimizer = optim.AdamW(
            model.parameters(),
            lr=1e-4,
            weight_decay=1e-4
        )
        # mode='max' because we step the scheduler on validation accuracy.
        # NOTE: the 'verbose' kwarg was dropped — it is deprecated in
        # torch 2.2 and removed in later releases.
        self.scheduler = ReduceLROnPlateau(
            self.optimizer,
            mode='max',
            factor=0.5,
            patience=5
        )
        # Plain (unweighted) cross-entropy. Pass a `weight=` tensor here if
        # the classes are heavily imbalanced — the previous comment claimed
        # weighting that was never actually applied.
        self.criterion = nn.CrossEntropyLoss()
        # Per-epoch tracking
        self.train_losses = []
        self.val_accuracies = []
        self.best_val_acc = 0.0

    def train_epoch(self):
        """Run one training epoch; returns (avg_loss, accuracy)."""
        self.model.train()
        total_loss = 0.0
        correct_predictions = 0
        total_samples = 0
        for batch in self.train_loader:
            images = batch['image'].to(self.device)
            labels = batch['label'].to(self.device)
            # Forward pass
            outputs = self.model(images)
            loss = self.criterion(outputs, labels)
            # Backward pass
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            # Running metrics
            total_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            correct_predictions += (predicted == labels).sum().item()
            total_samples += labels.size(0)
        avg_loss = total_loss / len(self.train_loader)
        accuracy = correct_predictions / total_samples
        return avg_loss, accuracy

    def validate(self):
        """Evaluate on val_loader; returns (accuracy, predictions, labels)."""
        self.model.eval()
        correct_predictions = 0
        total_samples = 0
        all_predictions = []
        all_labels = []
        with torch.no_grad():
            for batch in self.val_loader:
                images = batch['image'].to(self.device)
                labels = batch['label'].to(self.device)
                outputs = self.model(images)
                _, predicted = torch.max(outputs.data, 1)
                correct_predictions += (predicted == labels).sum().item()
                total_samples += labels.size(0)
                all_predictions.extend(predicted.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        accuracy = correct_predictions / total_samples
        return accuracy, all_predictions, all_labels

    def train(self, num_epochs=50, save_path='best_model.pth'):
        """Full training run; returns the best validation accuracy.

        The MLflow run is opened as a context manager so it is always
        closed, even when an epoch raises (the original left a dangling
        run on failure).
        """
        print(f"Iniciando entrenamiento por {num_epochs} épocas...")
        # Defined up front so the final report is safe even if num_epochs == 0
        val_predictions, val_labels = [], []
        with mlflow.start_run():
            mlflow.log_params({
                'model_architecture': 'EfficientNet-B3',
                'num_epochs': num_epochs,
                'batch_size': self.train_loader.batch_size,
                'learning_rate': self.optimizer.param_groups[0]['lr'],
                'num_classes': self.model.classifier[-1].out_features
            })
            for epoch in range(num_epochs):
                train_loss, train_acc = self.train_epoch()
                val_acc, val_predictions, val_labels = self.validate()
                # Scheduler watches validation accuracy (mode='max')
                self.scheduler.step(val_acc)
                self.train_losses.append(train_loss)
                self.val_accuracies.append(val_acc)
                mlflow.log_metrics({
                    'train_loss': train_loss,
                    'train_accuracy': train_acc,
                    'val_accuracy': val_acc,
                    'learning_rate': self.optimizer.param_groups[0]['lr']
                }, step=epoch)
                print(f"Época {epoch+1}/{num_epochs}:")
                print(f"  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
                print(f"  Val Acc: {val_acc:.4f}")
                # Checkpoint on validation improvement
                if val_acc > self.best_val_acc:
                    self.best_val_acc = val_acc
                    torch.save({
                        'epoch': epoch,
                        'model_state_dict': self.model.state_dict(),
                        'optimizer_state_dict': self.optimizer.state_dict(),
                        'val_accuracy': val_acc,
                        'num_classes': self.model.classifier[-1].out_features
                    }, save_path)
                    print(f"  ✅ Nuevo mejor modelo guardado (Val Acc: {val_acc:.4f})")
            print("\n" + "="*50)
            print("ENTRENAMIENTO COMPLETADO")
            print(f"Mejor Accuracy de Validación: {self.best_val_acc:.4f}")
            if val_labels:
                # Pass explicit `labels=` so the report is correct even when
                # class ids are non-contiguous or the validation split is
                # missing some classes (the old range(len(set(...))) broke
                # in both cases).
                labels_seen = sorted(set(val_labels) | set(val_predictions))
                final_report = classification_report(
                    val_labels, val_predictions,
                    labels=labels_seen,
                    target_names=[f"Clase_{i}" for i in labels_seen]
                )
                print("\nReporte de Clasificación:")
                print(final_report)
            mlflow.pytorch.log_model(self.model, "model")
            mlflow.log_metric("best_val_accuracy", self.best_val_acc)
        return self.best_val_acc
Fase 4: Clasificación en Producción
1. Servicio de Clasificación
API optimizada para clasificación en tiempo real.
Classification Service:
import torch
import torch.nn.functional as F
from PIL import Image
import io
import base64
class ClassificationService:
    """Production inference service for the document classifier."""

    def __init__(self, model_path, device='cuda'):
        """
        Args:
            model_path: path to a training checkpoint (.pth)
            device: torch device string
        """
        self.device = device
        self.model = None
        self.transforms = get_validation_transforms()
        self.class_names = []
        self.load_model(model_path)

    def load_model(self, model_path):
        """Load model weights and class metadata from a checkpoint."""
        # SECURITY NOTE(review): torch.load unpickles arbitrary objects —
        # only load checkpoints from trusted sources.
        checkpoint = torch.load(model_path, map_location=self.device)
        num_classes = checkpoint['num_classes']
        self.model = DocumentClassifier(num_classes)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.to(self.device)
        self.model.eval()
        # Fall back to generic names when the checkpoint predates class_names
        self.class_names = checkpoint.get(
            'class_names', [f"Type_{i}" for i in range(num_classes)]
        )
        print(f"Modelo cargado: {num_classes} clases")

    @staticmethod
    def _to_array(image):
        """Normalize PIL/ndarray input to a numpy array without the
        ndarray -> PIL -> ndarray round-trip the original performed."""
        if isinstance(image, np.ndarray):
            return image
        return np.array(image)

    def predict(self, image, return_probabilities=False):
        """
        Classify a single image.

        Args:
            image: PIL Image or numpy array (RGB)
            return_probabilities: include per-class probabilities in result

        Returns:
            dict with predicted_class, predicted_class_id, confidence and,
            optionally, all_probabilities
        """
        array = self._to_array(image)
        input_tensor = self.transforms(image=array)['image'] \
            .unsqueeze(0).to(self.device)
        with torch.no_grad():
            outputs = self.model(input_tensor)
            probabilities = F.softmax(outputs, dim=1)
            predicted_class = torch.argmax(probabilities, dim=1).item()
            confidence = probabilities[0][predicted_class].item()
        result = {
            'predicted_class': self.class_names[predicted_class],
            'predicted_class_id': predicted_class,
            'confidence': confidence
        }
        if return_probabilities:
            result['all_probabilities'] = {
                self.class_names[i]: probabilities[0][i].item()
                for i in range(len(self.class_names))
            }
        return result

    def predict_batch(self, images, batch_size=16):
        """Classify multiple images, running inference in mini-batches."""
        results = []
        for i in range(0, len(images), batch_size):
            batch = images[i:i + batch_size]
            batch_tensors = [
                self.transforms(image=self._to_array(img))['image']
                for img in batch
            ]
            batch_tensor = torch.stack(batch_tensors).to(self.device)
            with torch.no_grad():
                outputs = self.model(batch_tensor)
                probabilities = F.softmax(outputs, dim=1)
            for j in range(len(batch)):
                predicted_class = torch.argmax(probabilities[j]).item()
                confidence = probabilities[j][predicted_class].item()
                results.append({
                    'predicted_class': self.class_names[predicted_class],
                    'predicted_class_id': predicted_class,
                    'confidence': confidence
                })
        return results
API Endpoints
POST /train
Entrena un nuevo modelo de clasificación.
Request:
curl -X POST "http://localhost:8003/train" \
-H "Content-Type: application/json" \
-d '{
"training_data": {
"image_paths": ["img1.jpg", "img2.jpg", ...],
"labels": [0, 1, 0, 2, 1, ...],
"class_names": ["Type_0", "Type_1", "Type_2"]
},
"training_config": {
"epochs": 30,
"batch_size": 16,
"learning_rate": 1e-4,
"validation_split": 0.2
}
}'
Response:
{
"success": true,
"training_id": "train_20240115_143022",
"status": "completed",
"metrics": {
"final_accuracy": 0.94,
"best_epoch": 23,
"training_time": "45 minutes",
"num_classes": 3,
"num_samples": 150
},
"model_path": "/app/data/models/classifier_20240115_143022.pth",
"classification_report": {
"Type_0": {"precision": 0.95, "recall": 0.92, "f1-score": 0.93},
"Type_1": {"precision": 0.91, "recall": 0.96, "f1-score": 0.93},
"Type_2": {"precision": 0.96, "recall": 0.94, "f1-score": 0.95}
}
}
POST /classify
Clasifica una o múltiples imágenes.
Single Image Request:
curl -X POST "http://localhost:8003/classify" \
-H "Content-Type: multipart/form-data" \
-F "file=@document.jpg" \
-F "return_probabilities=true"
Batch Request:
curl -X POST "http://localhost:8003/classify" \
-H "Content-Type: multipart/form-data" \
-F "files=@doc1.jpg" \
-F "files=@doc2.jpg" \
-F "files=@doc3.jpg"
Response:
{
"success": true,
"processing_time": 0.23,
"results": [
{
"filename": "document.jpg",
"predicted_class": "Ficha_Residencia",
"predicted_class_id": 1,
"confidence": 0.96,
"all_probabilities": {
"Ficha_Residencia": 0.96,
"Cedula_Identidad": 0.03,
"Pasaporte": 0.01
}
}
]
}
GET /models
Lista modelos disponibles y cargados.
Response:
{
"available_models": [
{
"model_id": "classifier_20240115_143022",
"created_date": "2024-01-15T14:30:22Z",
"num_classes": 3,
"accuracy": 0.94,
"status": "active"
},
{
"model_id": "classifier_20240110_091505",
"created_date": "2024-01-10T09:15:05Z",
"num_classes": 2,
"accuracy": 0.89,
"status": "archived"
}
],
"current_model": "classifier_20240115_143022"
}
POST /switch_model
Cambia el modelo activo para clasificación.
Request:
{
"model_id": "classifier_20240115_143022"
}
GET /model_performance
Obtiene métricas detalladas del modelo actual.
Response:
{
"model_id": "classifier_20240115_143022",
"performance_metrics": {
"overall_accuracy": 0.94,
"precision_macro": 0.94,
"recall_macro": 0.94,
"f1_score_macro": 0.94
},
"per_class_metrics": {
"Ficha_Residencia": {
"precision": 0.95,
"recall": 0.92,
"f1_score": 0.93,
"support": 50
},
"Cedula_Identidad": {
"precision": 0.91,
"recall": 0.96,
"f1_score": 0.93,
"support": 45
},
"Pasaporte": {
"precision": 0.96,
"recall": 0.94,
"f1_score": 0.95,
"support": 55
}
},
"confusion_matrix": [
[46, 2, 2],
[1, 43, 1],
[2, 1, 52]
]
}
Configuración y Optimización
Variables de Entorno
# Configuración del servicio
ML_CLASSIFIER_PORT=8003
ML_CLASSIFIER_HOST=0.0.0.0
ML_CLASSIFIER_WORKERS=2
# Configuración de modelos
DEFAULT_MODEL_ARCHITECTURE=efficientnet_b3
MODEL_SAVE_DIR=/app/data/models/classifiers
CHECKPOINT_INTERVAL=5
# Configuración de entrenamiento
DEFAULT_BATCH_SIZE=16
DEFAULT_LEARNING_RATE=1e-4
DEFAULT_EPOCHS=30
VALIDATION_SPLIT=0.2
# Configuración de PyTorch
PYTORCH_DEVICE=cuda
TORCH_NUM_THREADS=4
MIXED_PRECISION_ENABLED=true
# MLflow tracking
MLFLOW_TRACKING_URI=http://localhost:5000
MLFLOW_EXPERIMENT_NAME=iris_classification
Data Augmentation Avanzada
def get_advanced_transforms():
    """Document-specific advanced augmentations.

    Resize now runs BEFORE Normalize so interpolation happens on raw pixel
    values, consistent with get_validation_transforms().
    """
    return A.Compose([
        # Simulate real camera conditions
        A.Perspective(scale=(0.02, 0.05), p=0.3),
        A.OpticalDistortion(distort_limit=0.1, p=0.2),
        # Simulate lighting conditions
        A.RandomShadow(p=0.2),
        A.RandomBrightnessContrast(
            brightness_limit=0.3, contrast_limit=0.3, p=0.4
        ),
        # Simulate paper quality
        A.GaussNoise(var_limit=(10.0, 30.0), p=0.3),
        A.ISONoise(color_shift=(0.01, 0.05), intensity=(0.1, 0.5), p=0.2),
        # Simulate scanning artifacts
        A.MotionBlur(blur_limit=3, p=0.1),
        A.MedianBlur(blur_limit=3, p=0.1),
        # Resize, then standard ImageNet normalization, then to tensor
        A.Resize(224, 224),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2()
    ])
Monitoreo de Performance
class PerformanceMonitor:
    """Aggregates inference-latency and confidence statistics across predictions."""

    def __init__(self):
        self.predictions_count = 0
        self.total_inference_time = 0
        self.confidence_scores = []

    def log_prediction(self, inference_time, confidence):
        """Record one prediction's latency and confidence score."""
        self.predictions_count += 1
        self.total_inference_time += inference_time
        self.confidence_scores.append(confidence)

    def get_performance_summary(self):
        """Return a metrics summary dict, or a status marker when empty."""
        if not self.predictions_count:
            return {"status": "no_predictions"}
        mean_latency = self.total_inference_time / self.predictions_count
        low_confidence = [c for c in self.confidence_scores if c < 0.8]
        return {
            "total_predictions": self.predictions_count,
            "average_inference_time": mean_latency,
            "average_confidence": np.mean(self.confidence_scores),
            "low_confidence_rate": len(low_confidence) / len(self.confidence_scores),
            "throughput_per_second": 1.0 / mean_latency if mean_latency > 0 else 0,
        }
Interpretabilidad y Explicaciones
Grad-CAM para Visualización
import cv2
from pytorch_grad_cam import GradCAM
from pytorch_grad_cam.utils.image import show_cam_on_image
class ModelExplainer:
    """Grad-CAM based visual explanations for the classifier."""

    def __init__(self, model, target_layer):
        """
        Args:
            model: trained classification model
            target_layer: layer whose activations Grad-CAM inspects
        """
        self.model = model
        # BUG FIX: transforms were never initialized, so explain_prediction
        # raised AttributeError on self.transforms.
        self.transforms = get_validation_transforms()
        self.cam = GradCAM(model=model, target_layers=[target_layer])

    def explain_prediction(self, image, predicted_class_id):
        """
        Overlay a Grad-CAM heatmap for the predicted class on the input.

        Args:
            image: input image as a BGR numpy array (as read by cv2)
            predicted_class_id: integer id of the predicted class

        Returns:
            RGB image with the activation heatmap superimposed
        """
        # BUG FIX: GradCAM expects target wrapper objects, not raw class
        # ids, per the pytorch_grad_cam API.
        from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget

        input_tensor = self.transforms(image=image)['image'].unsqueeze(0)
        cam_mask = self.cam(
            input_tensor=input_tensor,
            targets=[ClassifierOutputTarget(predicted_class_id)]
        )[0]  # first (and only) image of the batch
        # Overlay on the original image, normalized to [0, 1] RGB
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0
        return show_cam_on_image(image_rgb, cam_mask, use_rgb=True)
Las Fases 3-4 completan el ciclo de aprendizaje automático en IRIS, transformando los clusters no supervisados en un clasificador robusto capaz de categorizar automáticamente nuevos documentos con alta precisión y confianza.