"""Train a feed-forward malware detector with early stopping.

The script builds n-gram datasets from the training/validation files given
on the command line, trains an ``FFNN`` with ``EarlyStoppingPyTorchTrainer``
and stores the best weights plus the per-epoch metrics under the
``model_path`` directory named in the configuration file.
"""
import argparse
import copy
import json
import multiprocessing
import os
import sys

import joblib
import torch
import torch.nn
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import DataLoader
from tqdm import tqdm

# The project package is resolved relative to the parent directory, so the
# path must be extended before the ``boolean_classifier`` imports below.
sys.path.append("../")
from boolean_classifier.datasets.boolean_ngram_dataset import BooleanNGramDataset
from boolean_classifier.datasets.ngram_dataset import NGramDataset
from boolean_classifier.architectures.ffnn import FFNN


class EarlyStoppingPyTorchTrainer:
    """Trainer for PyTorch models with early stopping."""

    def __init__(self,
                 optimizer: torch.optim.Optimizer,
                 epochs: int = 5,
                 loss: torch.nn.Module = None,
                 scheduler: _LRScheduler = None,
                 feature_selector=None) -> None:
        """
        Create PyTorch trainer.

        Parameters
        ----------
        optimizer : torch.optim.Optimizer
            Optimizer to use for training the model.
        epochs : int, optional
            Maximum number of epochs, by default 5.
        loss : torch.nn.Module, optional
            Loss to minimize, by default None
            (``torch.nn.CrossEntropyLoss`` is used in that case).
        scheduler : _LRScheduler, optional
            Scheduler for the optimizer, by default None.
            It is stepped once per training epoch.
        feature_selector : object, optional
            Object exposing ``transform(x)``; when given, every input batch
            is passed through it before reaching the model. By default None.
        """
        self._epochs = epochs
        self._optimizer = optimizer
        self._loss = loss if loss is not None else torch.nn.CrossEntropyLoss()
        self._scheduler = scheduler
        self.feature_selector = feature_selector
        # Per-epoch history, one entry appended by each fit/validate call.
        self.training_losses = []
        self.training_accuracies = []
        self.validation_losses = []
        self.validation_accuracies = []

    def train(self,
              model: torch.nn.Module,
              train_loader: DataLoader,
              val_loader: DataLoader,
              patience: int) -> torch.nn.Module:
        """
        Train model with given loaders and early stopping.

        Parameters
        ----------
        model : torch.nn.Module
            Pytorch model to be trained.
        train_loader : DataLoader
            Train data loader.
        val_loader : DataLoader
            Validation data loader.
        patience : int
            Number of consecutive epochs without improvement of the
            validation loss to wait before early stopping.

        Returns
        -------
        torch.nn.Module
            Copy of the model taken at the epoch with the lowest validation
            loss. If no epoch ever improved on the initial (infinite) loss,
            e.g. zero epochs or a NaN loss, the last model is returned
            instead of ``None``.
        """
        best_loss = float("inf")
        best_model = None
        patience_counter = 0
        for _ in range(self._epochs):
            model = self.fit(model, train_loader)
            val_loss = self.validate(model, val_loader)
            if val_loss <= best_loss:
                best_loss = val_loss
                # Snapshot the weights: later epochs keep mutating `model`.
                best_model = copy.deepcopy(model)
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    break
        return best_model if best_model is not None else model

    def _prepare_batch(self, x, y, device):
        """Apply the optional feature selector to x and move x, y to device."""
        if self.feature_selector is not None:
            x = torch.Tensor(self.feature_selector.transform(x))
        return x.to(device), y.to(device)

    def fit(self,
            model: torch.nn.Module,
            dataloader: DataLoader) -> torch.nn.Module:
        """
        Train model for one epoch with given loader.

        Appends the epoch loss and accuracy to ``training_losses`` and
        ``training_accuracies`` and steps the scheduler, if any.

        Parameters
        ----------
        model : torch.nn.Module
            Pytorch model to be trained.
        dataloader : DataLoader
            Train data loader.

        Returns
        -------
        torch.nn.Module
            Trained model.
        """
        device = next(model.parameters()).device
        model = model.train()
        model = model.to(device)
        running_loss = 0.0
        train_total = 0
        train_correct = 0
        for x, y in tqdm(dataloader):
            x, y = self._prepare_batch(x, y, device)
            self._optimizer.zero_grad()
            outputs = model(x)
            loss = self._loss(outputs, y)
            loss.backward()
            self._optimizer.step()
            running_loss += loss.item()
            # argmax over the logits equals argmax over their softmax.
            y_preds = outputs.argmax(dim=1)
            train_total += y.size(0)
            train_correct += (y_preds == y).sum().item()
        # NOTE(review): batch losses are summed and divided by the number of
        # samples; with a mean-reduced loss this is the average loss scaled
        # by roughly 1 / batch_size. Kept as-is so histories stay comparable.
        self.training_losses.append(running_loss / train_total)
        self.training_accuracies.append(train_correct / train_total)
        if self._scheduler is not None:
            self._scheduler.step()
        return model

    def validate(self,
                 model: torch.nn.Module,
                 dataloader: DataLoader) -> float:
        """
        Validate model with given loader.

        Appends the epoch loss and accuracy to ``validation_losses`` and
        ``validation_accuracies``.

        Parameters
        ----------
        model : torch.nn.Module
            Pytorch model to be validated.
        dataloader : DataLoader
            Validation data loader.

        Returns
        -------
        float
            Validation loss of the model over the whole loader (the same
            value that is appended to ``validation_losses``).
        """
        running_loss = 0
        val_total = 0
        val_correct = 0
        device = next(model.parameters()).device
        model = model.eval()
        model = model.to(device)
        with torch.no_grad():
            for x, y in tqdm(dataloader):
                x, y = self._prepare_batch(x, y, device)
                outputs = model(x)
                loss = self._loss(outputs, y)
                running_loss += loss.item()
                # argmax over the logits equals argmax over their softmax.
                y_preds = outputs.argmax(dim=1)
                val_total += y.size(0)
                val_correct += (y_preds == y).sum().item()
        epoch_loss = running_loss / val_total
        self.validation_losses.append(epoch_loss)
        self.validation_accuracies.append(val_correct / val_total)
        # Return the epoch-level loss, not the loss tensor of the last batch,
        # so early stopping reacts to the whole validation set.
        return epoch_loss


def save_results(trainer: EarlyStoppingPyTorchTrainer, configuration: dict):
    """
    Write the trainer's per-epoch metrics to ``results.json``.

    Parameters
    ----------
    trainer : EarlyStoppingPyTorchTrainer
        Trainer whose loss/accuracy histories are stored.
    configuration : dict
        Configuration; ``configuration["model_path"]`` is the directory
        (which must already exist) receiving ``results.json``.
    """
    results = {
        "training_losses": trainer.training_losses,
        "training_accuracies": trainer.training_accuracies,
        "validation_losses": trainer.validation_losses,
        "validation_accuracies": trainer.validation_accuracies
    }
    results_path = os.path.join(configuration["model_path"], "results.json")
    with open(results_path, "w") as output_file:
        json.dump(results, output_file)


def load_configuration(configuration_filepath: str) -> dict:
    """
    Load the JSON configuration file.

    Parameters
    ----------
    configuration_filepath : str
        Path of the JSON file to read.

    Returns
    -------
    dict
        Parsed configuration.
    """
    with open(configuration_filepath, "r") as configuration_file:
        return json.load(configuration_file)


def main() -> None:
    """Parse the command line, train the model and store model and metrics."""
    parser = argparse.ArgumentParser(description='Train malware detector')
    parser.add_argument(
        "training_file",
        type=str,
        help="Training file containing the hashes and labels of the benign and malicious samples"
    )
    parser.add_argument(
        "validation_file",
        type=str,
        help="Validation file containing the hashes and labels of the benign and malicious samples"
    )
    parser.add_argument(
        "dataset_type",
        type=str,
        help="Type of dataset: {BooleanBigrams, Bigrams, EMBER}"
    )
    parser.add_argument(
        "configuration_file",
        type=str,
        help="Configuration file containing the hyperparameters of the model"
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        help="Batch size for training",
        default=32
    )
    parser.add_argument(
        "--num_epochs",
        type=int,
        help="Max epochs",
        default=50
    )
    parser.add_argument(
        "--patience",
        type=int,
        help="Patience for early stopping",
        default=5
    )
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device: ", device)

    cpu_count = multiprocessing.cpu_count()
    # Leave a few cores free on large machines, but always use more than
    # half of the cores (and therefore at least one worker).
    num_workers = max(cpu_count - 4, cpu_count // 2 + 1)

    dataset_classes = {
        "BooleanBigrams": BooleanNGramDataset,
        "Bigrams": NGramDataset,
    }
    dataset_class = dataset_classes.get(args.dataset_type)
    if dataset_class is None:
        raise NotImplementedError(
            f"Unsupported dataset type {args.dataset_type!r}: only "
            "BooleanBigrams and Bigrams are currently supported"
        )
    training_dataset = dataset_class(args.training_file)
    validation_dataset = dataset_class(args.validation_file)

    # NOTE(review): the training loader is not shuffled - confirm that the
    # dataset already yields samples in a suitable order.
    training_dataloader = DataLoader(
        training_dataset,
        batch_size=args.batch_size,
        num_workers=num_workers,
    )
    validation_dataloader = DataLoader(
        validation_dataset,
        batch_size=args.batch_size,
        num_workers=num_workers,
    )

    configuration = load_configuration(args.configuration_file)
    model = FFNN(configuration)
    model = model.to(device)

    # A missing "feature_selector" key is treated like an explicit null.
    feature_selector_path = configuration.get("feature_selector")
    if feature_selector_path is not None:
        # NOTE: joblib.load unpickles the file; only load trusted artifacts.
        feature_selector = joblib.load(feature_selector_path)
    else:
        feature_selector = None

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())
    trainer = EarlyStoppingPyTorchTrainer(
        optimizer,
        epochs=args.num_epochs,
        loss=criterion,
        feature_selector=feature_selector
    )
    model = trainer.train(
        model,
        training_dataloader,
        validation_dataloader,
        args.patience
    )

    # exist_ok avoids the check-then-create race of os.path.exists.
    os.makedirs(configuration["model_path"], exist_ok=True)
    torch.save(
        model.state_dict(),
        os.path.join(configuration["model_path"], "model.pth")
    )
    save_results(trainer, configuration)


if __name__ == "__main__":
    main()