Skip to main content

Fases 3-4: Clasificación de Documentos

Propósito

Las Fases 3 y 4 del pipeline IRIS se encargan de la clasificación supervisada de documentos utilizando los clusters descubiertos en la Fase 2. La Fase 3 entrena modelos de clasificación específicos, mientras que la Fase 4 aplica estos modelos para clasificar nuevos documentos.

Arquitectura del Servicio

Componentes Principales

Tecnologías Utilizadas

  • PyTorch: Framework principal de deep learning
  • EfficientNet: Arquitectura de CNN optimizada para clasificación
  • Albumentations: Augmentación avanzada de imágenes
  • TensorBoard: Monitoreo y visualización del entrenamiento
  • FastAPI: API REST para servicio de clasificación
  • MLflow: Tracking de experimentos y versionado de modelos

Funcionalidades

Fase 3: Entrenamiento de Clasificadores

1. Preparación de Datos

El sistema utiliza los resultados del clustering de la Fase 2 como etiquetas para entrenar clasificadores supervisados.

Data Pipeline:

import torch
from torch.utils.data import Dataset, DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2
import cv2
import numpy as np

class DocumentDataset(Dataset):
    """PyTorch dataset over document images labelled with cluster/class ids.

    Labels may be arbitrary hashable values (e.g. cluster ids from Phase 2);
    they are remapped to consecutive integer indices so they can be fed to
    CrossEntropyLoss.
    """

    def __init__(self, image_paths, labels, transform=None, mode='train'):
        """
        Args:
            image_paths: list of image file paths.
            labels: list of cluster/class labels, one per image.
            transform: optional albumentations pipeline applied per sample.
            mode: 'train', 'val' or 'test' (informational only; no branching
                on it happens inside this class).
        """
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform
        self.mode = mode

        # Map arbitrary labels to consecutive indices; sorting makes the
        # mapping deterministic across runs.
        unique_labels = sorted(set(labels))
        self.label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
        self.idx_to_label = {idx: label for label, idx in self.label_to_idx.items()}
        self.num_classes = len(unique_labels)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        path = self.image_paths[idx]

        # cv2.imread returns None (instead of raising) for a missing or
        # corrupt file, which previously caused an opaque crash inside
        # cvtColor; fail loudly so the bad sample is identifiable.
        image = cv2.imread(path)
        if image is None:
            raise FileNotFoundError(f"Could not read image: {path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        label_idx = self.label_to_idx[self.labels[idx]]

        if self.transform:
            image = self.transform(image=image)['image']

        return {
            'image': image,
            'label': torch.tensor(label_idx, dtype=torch.long),
            'image_path': path,
        }

def get_training_transforms():
    """Return the augmentation pipeline applied to training images.

    Geometric and photometric augmentations run on the raw uint8 image,
    then the image is resized and normalized with ImageNet statistics and
    converted to a CHW tensor.

    Returns:
        albumentations.Compose pipeline producing a normalized 224x224 tensor.
    """
    return A.Compose([
        # Geometric
        A.Rotate(limit=5, p=0.3),
        A.HorizontalFlip(p=0.2),
        A.ShiftScaleRotate(
            shift_limit=0.1, scale_limit=0.1, rotate_limit=5, p=0.3
        ),

        # Photometric
        A.RandomBrightnessContrast(
            brightness_limit=0.2, contrast_limit=0.2, p=0.3
        ),
        A.GaussNoise(var_limit=(10.0, 50.0), p=0.2),
        A.GaussianBlur(blur_limit=3, p=0.1),

        # Resize BEFORE Normalize so this pipeline matches the validation
        # transforms (which resize first) and interpolation happens on the
        # uint8 image rather than on already-normalized floats.
        A.Resize(224, 224),
        A.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        ),
        ToTensorV2()
    ])

def get_validation_transforms():
    """Deterministic preprocessing for validation/inference (no augmentation).

    Returns:
        albumentations.Compose pipeline: resize to 224x224, normalize with
        ImageNet statistics, convert to a CHW tensor.
    """
    resize = A.Resize(224, 224)
    normalize = A.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )
    return A.Compose([resize, normalize, ToTensorV2()])

2. Arquitectura del Modelo

Utiliza EfficientNet como backbone con fine-tuning específico para documentos.

Modelo de Clasificación:

import torch.nn as nn
import timm

class DocumentClassifier(nn.Module):
    """EfficientNet-backed document classifier.

    A headless timm backbone produces a pooled feature vector, which a
    small MLP head maps to `num_classes` logits.
    """

    def __init__(self, num_classes, model_name='efficientnet_b3', pretrained=True):
        """
        Args:
            num_classes: number of document types to discriminate.
            model_name: timm identifier of the EfficientNet backbone.
            pretrained: whether to load pretrained backbone weights.
        """
        super().__init__()

        # Headless backbone: num_classes=0 strips the classifier and the
        # global average pool yields one flat feature vector per image.
        self.backbone = timm.create_model(
            model_name,
            pretrained=pretrained,
            num_classes=0,
            global_pool='avg',
        )
        self.feature_dim = self.backbone.num_features

        # Dropout on the pooled features, before the head.
        self.dropout = nn.Dropout(0.3)

        # MLP head: feature_dim -> 512 -> 256 -> num_classes.
        head = [
            nn.Linear(self.feature_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

        self._initialize_weights()

    def _initialize_weights(self):
        """Xavier-initialize the linear layers of the head; zero the biases."""
        for layer in self.classifier.modules():
            if not isinstance(layer, nn.Linear):
                continue
            nn.init.xavier_normal_(layer.weight)
            nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        """Compute class logits for a batch of images."""
        pooled = self.backbone(x)
        return self.classifier(self.dropout(pooled))

    def get_features(self, x):
        """Return backbone features only (no grad), for offline analysis."""
        with torch.no_grad():
            return self.backbone(x)

3. Entrenamiento

Sistema completo de entrenamiento con monitoreo y validación.

Training Loop:

import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import mlflow
import mlflow.pytorch
from sklearn.metrics import classification_report, confusion_matrix

class DocumentTrainer:
    """Training/validation loop for a DocumentClassifier with MLflow tracking.

    Tracks the best validation accuracy across epochs, checkpoints the best
    model to disk and logs per-epoch metrics to MLflow.
    """

    def __init__(self, model, train_loader, val_loader, device='cuda'):
        """
        Args:
            model: classifier producing logits; expected to expose a
                `classifier` Sequential whose last layer has `out_features`
                (as DocumentClassifier does).
            train_loader: DataLoader yielding {'image', 'label'} batches.
            val_loader: DataLoader with the same batch format.
            device: computation device.
        """
        self.model = model.to(device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.device = device

        self.optimizer = optim.AdamW(
            model.parameters(),
            lr=1e-4,
            weight_decay=1e-4,
        )
        # `verbose` was deprecated and then removed from ReduceLROnPlateau;
        # recent PyTorch raises TypeError if it is passed, so it is dropped.
        self.scheduler = ReduceLROnPlateau(
            self.optimizer,
            mode='max',
            factor=0.5,
            patience=5,
        )

        # NOTE(review): no class weights are applied here (the original
        # comment claimed otherwise); pass `weight=` to CrossEntropyLoss if
        # the cluster sizes are imbalanced.
        self.criterion = nn.CrossEntropyLoss()

        # Tracking metrics.
        self.train_losses = []
        self.val_accuracies = []
        self.best_val_acc = 0.0

    def train_epoch(self):
        """Run one training epoch; return (average batch loss, accuracy)."""
        self.model.train()
        total_loss = 0.0
        correct_predictions = 0
        total_samples = 0

        for batch in self.train_loader:
            images = batch['image'].to(self.device)
            labels = batch['label'].to(self.device)

            # Forward pass
            outputs = self.model(images)
            loss = self.criterion(outputs, labels)

            # Backward pass
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # Metrics; argmax replaces the deprecated `.data` access.
            total_loss += loss.item()
            predicted = outputs.argmax(dim=1)
            correct_predictions += (predicted == labels).sum().item()
            total_samples += labels.size(0)

        avg_loss = total_loss / len(self.train_loader)
        accuracy = correct_predictions / total_samples
        return avg_loss, accuracy

    def validate(self):
        """Evaluate on the validation set.

        Returns:
            (accuracy, all_predictions, all_labels) where the lists contain
            per-sample integer class ids.
        """
        self.model.eval()
        correct_predictions = 0
        total_samples = 0
        all_predictions = []
        all_labels = []

        with torch.no_grad():
            for batch in self.val_loader:
                images = batch['image'].to(self.device)
                labels = batch['label'].to(self.device)

                predicted = self.model(images).argmax(dim=1)

                correct_predictions += (predicted == labels).sum().item()
                total_samples += labels.size(0)
                all_predictions.extend(predicted.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        accuracy = correct_predictions / total_samples
        return accuracy, all_predictions, all_labels

    def train(self, num_epochs=50, save_path='best_model.pth'):
        """Full training run with checkpointing and MLflow logging.

        Args:
            num_epochs: number of epochs to train.
            save_path: file path for the best-model checkpoint.

        Returns:
            Best validation accuracy reached.
        """
        print(f"Iniciando entrenamiento por {num_epochs} épocas...")

        val_predictions, val_labels = [], []

        # Context manager guarantees the MLflow run is closed even when
        # training raises; the previous start_run()/end_run() pair leaked
        # the run on any exception.
        with mlflow.start_run():
            mlflow.log_params({
                'model_architecture': 'EfficientNet-B3',
                'num_epochs': num_epochs,
                'batch_size': self.train_loader.batch_size,
                'learning_rate': self.optimizer.param_groups[0]['lr'],
                'num_classes': self.model.classifier[-1].out_features
            })

            for epoch in range(num_epochs):
                train_loss, train_acc = self.train_epoch()
                val_acc, val_predictions, val_labels = self.validate()

                # Plateau scheduler keyed on validation accuracy (mode='max').
                self.scheduler.step(val_acc)

                self.train_losses.append(train_loss)
                self.val_accuracies.append(val_acc)

                mlflow.log_metrics({
                    'train_loss': train_loss,
                    'train_accuracy': train_acc,
                    'val_accuracy': val_acc,
                    'learning_rate': self.optimizer.param_groups[0]['lr']
                }, step=epoch)

                print(f"Época {epoch+1}/{num_epochs}:")
                print(f" Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
                print(f" Val Acc: {val_acc:.4f}")

                # Checkpoint whenever validation accuracy improves.
                if val_acc > self.best_val_acc:
                    self.best_val_acc = val_acc
                    torch.save({
                        'epoch': epoch,
                        'model_state_dict': self.model.state_dict(),
                        'optimizer_state_dict': self.optimizer.state_dict(),
                        'val_accuracy': val_acc,
                        'num_classes': self.model.classifier[-1].out_features
                    }, save_path)

                    print(f" ✅ Nuevo mejor modelo guardado (Val Acc: {val_acc:.4f})")

            # Final report
            print("\n" + "="*50)
            print("ENTRENAMIENTO COMPLETADO")
            print(f"Mejor Accuracy de Validación: {self.best_val_acc:.4f}")

            # Report on the last epoch's predictions. Guard against an empty
            # run (num_epochs=0) and pass `labels` explicitly so that classes
            # appearing only in the predictions cannot make `target_names`
            # mismatch and raise inside classification_report.
            if val_labels:
                class_ids = sorted(set(val_labels) | set(val_predictions))
                final_report = classification_report(
                    val_labels, val_predictions,
                    labels=class_ids,
                    target_names=[f"Clase_{i}" for i in class_ids]
                )
                print("\nReporte de Clasificación:")
                print(final_report)

            # Log the final model and the headline metric.
            mlflow.pytorch.log_model(self.model, "model")
            mlflow.log_metric("best_val_accuracy", self.best_val_acc)

        return self.best_val_acc

Fase 4: Clasificación en Producción

1. Servicio de Clasificación

API optimizada para clasificación en tiempo real.

Classification Service:

import torch
import torch.nn.functional as F
from PIL import Image
import io
import base64

class ClassificationService:
    """Production inference wrapper around a trained DocumentClassifier.

    Loads a checkpoint once at construction time and serves single-image
    and batched predictions with softmax confidences.
    """

    def __init__(self, model_path, device='cuda'):
        """
        Args:
            model_path: path to the trained checkpoint (.pth).
            device: computation device ('cuda' or 'cpu').
        """
        self.device = device
        self.model = None
        self.transforms = get_validation_transforms()
        self.class_names = []

        self.load_model(model_path)

    def load_model(self, model_path):
        """Restore model weights and class names from a checkpoint file."""
        checkpoint = torch.load(model_path, map_location=self.device)

        # Rebuild the architecture, then load the trained weights into it.
        num_classes = checkpoint['num_classes']
        self.model = DocumentClassifier(num_classes)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.to(self.device)
        self.model.eval()

        # Use the stored class names when present, generic ones otherwise.
        if 'class_names' in checkpoint:
            self.class_names = checkpoint['class_names']
        else:
            self.class_names = [f"Type_{i}" for i in range(num_classes)]

        print(f"Modelo cargado: {num_classes} clases")

    def _preprocess(self, image):
        """Convert a PIL image or numpy array into a model-ready tensor (CHW)."""
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        return self.transforms(image=np.array(image))['image']

    def predict(self, image, return_probabilities=False):
        """Classify a single image.

        Args:
            image: PIL image or numpy array.
            return_probabilities: also include per-class probabilities.

        Returns:
            dict with the predicted class name, its id and the confidence.
        """
        input_tensor = self._preprocess(image).unsqueeze(0).to(self.device)

        with torch.no_grad():
            probabilities = F.softmax(self.model(input_tensor), dim=1)

        class_id = torch.argmax(probabilities, dim=1).item()
        confidence = probabilities[0][class_id].item()

        result = {
            'predicted_class': self.class_names[class_id],
            'predicted_class_id': class_id,
            'confidence': confidence,
        }

        if return_probabilities:
            row = probabilities[0]
            result['all_probabilities'] = {
                name: row[i].item() for i, name in enumerate(self.class_names)
            }

        return result

    def predict_batch(self, images, batch_size=16):
        """Classify a list of images, processing them in mini-batches."""
        results = []

        for start in range(0, len(images), batch_size):
            chunk = images[start:start + batch_size]
            tensors = [self._preprocess(img) for img in chunk]

            batch_tensor = torch.stack(tensors).to(self.device)

            with torch.no_grad():
                probabilities = F.softmax(self.model(batch_tensor), dim=1)

            for row in probabilities:
                class_id = torch.argmax(row).item()
                results.append({
                    'predicted_class': self.class_names[class_id],
                    'predicted_class_id': class_id,
                    'confidence': row[class_id].item(),
                })

        return results

API Endpoints

POST /train

Entrena un nuevo modelo de clasificación.

Request:

curl -X POST "http://localhost:8003/train" \
-H "Content-Type: application/json" \
-d '{
"training_data": {
"image_paths": ["img1.jpg", "img2.jpg", ...],
"labels": [0, 1, 0, 2, 1, ...],
"class_names": ["Type_0", "Type_1", "Type_2"]
},
"training_config": {
"epochs": 30,
"batch_size": 16,
"learning_rate": 1e-4,
"validation_split": 0.2
}
}'

Response:

{
"success": true,
"training_id": "train_20240115_143022",
"status": "completed",
"metrics": {
"final_accuracy": 0.94,
"best_epoch": 23,
"training_time": "45 minutes",
"num_classes": 3,
"num_samples": 150
},
"model_path": "/app/data/models/classifier_20240115_143022.pth",
"classification_report": {
"Type_0": {"precision": 0.95, "recall": 0.92, "f1-score": 0.93},
"Type_1": {"precision": 0.91, "recall": 0.96, "f1-score": 0.93},
"Type_2": {"precision": 0.96, "recall": 0.94, "f1-score": 0.95}
}
}

POST /classify

Clasifica una o múltiples imágenes.

Single Image Request:

curl -X POST "http://localhost:8003/classify" \
-H "Content-Type: multipart/form-data" \
-F "file=@document.jpg" \
-F "return_probabilities=true"

Batch Request:

curl -X POST "http://localhost:8003/classify" \
-H "Content-Type: multipart/form-data" \
-F "files=@doc1.jpg" \
-F "files=@doc2.jpg" \
-F "files=@doc3.jpg"

Response:

{
"success": true,
"processing_time": 0.23,
"results": [
{
"filename": "document.jpg",
"predicted_class": "Ficha_Residencia",
"predicted_class_id": 1,
"confidence": 0.96,
"all_probabilities": {
"Ficha_Residencia": 0.96,
"Cedula_Identidad": 0.03,
"Pasaporte": 0.01
}
}
]
}

GET /models

Lista modelos disponibles y cargados.

Response:

{
"available_models": [
{
"model_id": "classifier_20240115_143022",
"created_date": "2024-01-15T14:30:22Z",
"num_classes": 3,
"accuracy": 0.94,
"status": "active"
},
{
"model_id": "classifier_20240110_091505",
"created_date": "2024-01-10T09:15:05Z",
"num_classes": 2,
"accuracy": 0.89,
"status": "archived"
}
],
"current_model": "classifier_20240115_143022"
}

POST /switch_model

Cambia el modelo activo para clasificación.

Request:

{
"model_id": "classifier_20240115_143022"
}

GET /model_performance

Obtiene métricas detalladas del modelo actual.

Response:

{
"model_id": "classifier_20240115_143022",
"performance_metrics": {
"overall_accuracy": 0.94,
"precision_macro": 0.94,
"recall_macro": 0.94,
"f1_score_macro": 0.94
},
"per_class_metrics": {
"Ficha_Residencia": {
"precision": 0.95,
"recall": 0.92,
"f1_score": 0.93,
"support": 50
},
"Cedula_Identidad": {
"precision": 0.91,
"recall": 0.96,
"f1_score": 0.93,
"support": 45
},
"Pasaporte": {
"precision": 0.96,
"recall": 0.94,
"f1_score": 0.95,
"support": 55
}
},
"confusion_matrix": [
[46, 2, 2],
[1, 43, 1],
[2, 1, 52]
]
}

Configuración y Optimización

Variables de Entorno

# Service configuration
ML_CLASSIFIER_PORT=8003
ML_CLASSIFIER_HOST=0.0.0.0
ML_CLASSIFIER_WORKERS=2

# Model configuration
DEFAULT_MODEL_ARCHITECTURE=efficientnet_b3
MODEL_SAVE_DIR=/app/data/models/classifiers
CHECKPOINT_INTERVAL=5

# Training configuration
DEFAULT_BATCH_SIZE=16
DEFAULT_LEARNING_RATE=1e-4
DEFAULT_EPOCHS=30
VALIDATION_SPLIT=0.2

# PyTorch configuration
PYTORCH_DEVICE=cuda
TORCH_NUM_THREADS=4
MIXED_PRECISION_ENABLED=true

# MLflow tracking
MLFLOW_TRACKING_URI=http://localhost:5000
MLFLOW_EXPERIMENT_NAME=iris_classification

Data Augmentation Avanzada

def get_advanced_transforms():
    """Return an augmentation pipeline simulating real document capture defects.

    Camera perspective, lighting, paper noise and scan artifacts are all
    applied on the raw uint8 image, then the image is resized, normalized
    with ImageNet statistics and converted to a CHW tensor.

    Returns:
        albumentations.Compose pipeline producing a normalized 224x224 tensor.
    """
    return A.Compose([
        # Simulate real camera capture conditions
        A.Perspective(scale=(0.02, 0.05), p=0.3),
        A.OpticalDistortion(distort_limit=0.1, p=0.2),

        # Simulate lighting conditions
        A.RandomShadow(p=0.2),
        A.RandomBrightnessContrast(
            brightness_limit=0.3, contrast_limit=0.3, p=0.4
        ),

        # Simulate paper quality
        A.GaussNoise(var_limit=(10.0, 30.0), p=0.3),
        A.ISONoise(color_shift=(0.01, 0.05), intensity=(0.1, 0.5), p=0.2),

        # Simulate scanning artifacts
        A.MotionBlur(blur_limit=3, p=0.1),
        A.MedianBlur(blur_limit=3, p=0.1),

        # Resize BEFORE Normalize for consistency with the validation
        # pipeline; interpolation then operates on uint8 pixels instead of
        # already-normalized floats.
        A.Resize(224, 224),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2()
    ])

Monitoreo de Performance

class PerformanceMonitor:
    """Accumulates latency and confidence statistics for served predictions."""

    def __init__(self, *, low_confidence_threshold=0.8):
        """
        Args:
            low_confidence_threshold: predictions with confidence strictly
                below this value count towards `low_confidence_rate`.
                Defaults to 0.8, the previously hard-coded cutoff.
        """
        self.low_confidence_threshold = low_confidence_threshold
        self.predictions_count = 0
        self.total_inference_time = 0.0
        self.confidence_scores = []

    def log_prediction(self, inference_time, confidence):
        """Record one prediction's latency (seconds) and confidence score."""
        self.predictions_count += 1
        self.total_inference_time += inference_time
        self.confidence_scores.append(confidence)

    def get_performance_summary(self):
        """Return aggregate metrics, or a status marker when nothing was logged.

        Returns:
            dict with totals, averages, low-confidence rate and throughput;
            {"status": "no_predictions"} if no prediction was logged yet.
        """
        if self.predictions_count == 0:
            return {"status": "no_predictions"}

        avg_inference_time = self.total_inference_time / self.predictions_count
        threshold = self.low_confidence_threshold
        low_confidence = sum(1 for c in self.confidence_scores if c < threshold)

        return {
            "total_predictions": self.predictions_count,
            "average_inference_time": avg_inference_time,
            # float() so the summary stays JSON-serializable (np.mean
            # returns np.float64).
            "average_confidence": float(np.mean(self.confidence_scores)),
            "low_confidence_rate": low_confidence / len(self.confidence_scores),
            "throughput_per_second": 1.0 / avg_inference_time if avg_inference_time > 0 else 0,
        }

Interpretabilidad y Explicaciones

Grad-CAM para Visualización

import cv2
from pytorch_grad_cam import GradCAM
from pytorch_grad_cam.utils.image import show_cam_on_image

class ModelExplainer:
    """Grad-CAM visual explanations for document classifier predictions."""

    def __init__(self, model, target_layer, transform=None):
        """
        Args:
            model: trained classification model.
            target_layer: layer whose activations Grad-CAM visualizes.
            transform: preprocessing pipeline applied before inference;
                defaults to the validation transforms. (Bug fix: the
                original read `self.transforms` in explain_prediction but
                never assigned it, raising AttributeError on first use.)
        """
        self.model = model
        self.transforms = transform if transform is not None else get_validation_transforms()
        self.cam = GradCAM(model=model, target_layers=[target_layer])

    def explain_prediction(self, image, predicted_class_id):
        """Return the input image with the Grad-CAM heatmap overlaid.

        Args:
            image: input image as a numpy array (assumed BGR uint8, the
                cv2.imread convention — TODO confirm with callers).
            predicted_class_id: integer id of the class to explain.

        Returns:
            RGB image with the activation heatmap superimposed.
        """
        # pytorch-grad-cam expects target objects, not raw class ids.
        from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget

        input_tensor = self.transforms(image=image)['image'].unsqueeze(0)

        cam_mask = self.cam(
            input_tensor=input_tensor,
            targets=[ClassifierOutputTarget(predicted_class_id)],
        )[0]  # first (only) image in the batch

        # NOTE(review): the CAM mask has the model's input resolution
        # (224x224 after the transforms); `image` must match that size for
        # show_cam_on_image, or be resized first — verify with callers.
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) / 255.0
        return show_cam_on_image(image_rgb, cam_mask, use_rgb=True)

Las Fases 3-4 completan el ciclo de aprendizaje automático en IRIS, transformando los clusters no supervisados en un clasificador robusto capaz de categorizar automáticamente nuevos documentos con alta precisión y confianza.