"""Source code for main_pipeline."""

import os
import logging
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import CLIPProcessor, CLIPModel
from utils import setup_logging
from data import load_real_images, generate_fake_images, prepare_image_paths_labels
from features import prepare_combined_features
from models import train_and_save_models


# -------------------------
# Main Function
# -------------------------
def main():
    """Run the complete image-authenticity pipeline end to end.

    Orchestrates the following steps:
        1. Configure logging via ``setup_logging``.
        2. Load real images for the selected ImageNet classes.
        3. Generate fake images (pre-trained diffusion model, per ``generate_fake_images``).
        4. Build the combined (real + fake) image path/label list.
        5. Extract features with the CLIP vision model.
        6. Persist features and labels to ``output_model_dir`` for reuse.
        7. Split the data into stratified train/test sets.
        8. Train and save models plus evaluation metrics
           (SVM, XGBoost, Neural Network per ``train_and_save_models``).

    Returns:
        None

    Example:
        main()  # Runs the entire image processing and model training pipeline.
    """
    setup_logging()

    # Configuration
    data_dir = 'extracted_imagenet10'
    fake_images_dir = 'generated_fake_images'
    class_index_json = 'in100_class_index.json'
    output_model_dir = 'ModelsForUse'
    selected_classes = [
        'n02342885', 'n01882714', 'n02129604', 'n03627232', 'n02980441',
        'n02007558', 'n03384352', 'n02279972', 'n03388043', 'n02391049'
    ]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logging.info(f"Using device: {device}")

    # Load Real Images
    real_dataset = load_real_images(data_dir, selected_classes, max_images_per_class=1000)
    logging.info(f"Real dataset loaded with {len(real_dataset)} images.")

    # Generate Fake Images
    generate_fake_images(selected_classes, class_index_json, fake_images_dir,
                         max_images_per_class=1000, device=device)
    logging.info("Fake images generated.")

    # Check Dataset: count generated PNGs per class. Warn (rather than
    # silently skip) when a class directory is missing, so a failed
    # generation step is visible in the logs.
    for cls in selected_classes:
        class_dir = os.path.join(fake_images_dir, cls)
        if os.path.exists(class_dir):
            fake_count = len([img for img in os.listdir(class_dir) if img.endswith(".png")])
            logging.info(f"Class {cls}: {fake_count} fake images.")
        else:
            logging.warning(f"Class {cls}: fake image directory not found: {class_dir}")

    # Prepare Dataset (real + fake paths with labels)
    paths_labels = prepare_image_paths_labels(real_dataset, fake_images_dir, selected_classes)
    logging.info(f"Total images prepared (real + fake): {len(paths_labels)}")

    # Feature Extraction with CLIP (ViT-B/32)
    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
    features, labels = prepare_combined_features(paths_labels, clip_processor, clip_model, device)
    logging.info(f"Features extracted: {features.shape[0]} samples, {features.shape[1]} features.")

    # Save Features for Reuse so the expensive extraction step can be skipped later
    os.makedirs(output_model_dir, exist_ok=True)
    np.save(os.path.join(output_model_dir, "features.npy"), features)
    np.save(os.path.join(output_model_dir, "labels.npy"), labels)
    logging.info("Features and labels saved for reuse.")

    # Train/Test Split — stratified so the real/fake class balance is preserved
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, stratify=labels, random_state=42)
    logging.info(f"Data split into training ({len(X_train)}) and testing ({len(X_test)}) samples.")

    # Train and Save Models (hyperparameter search handled inside)
    train_and_save_models(X_train, X_test, y_train, y_test, output_model_dir)

    # Final Pipeline Summary
    logging.info("Pipeline completed successfully.")
    logging.info(f"Total images processed: {len(paths_labels)}")
    logging.info(f"Features generated: {features.shape}, Labels generated: {labels.shape}")
    logging.info(f"Models and metrics saved in directory: {output_model_dir}")
# Script entry point: run the full pipeline only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()