| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271 |
- import argparse
- import copy
-
- import torch
- import sys
- sys.path.append("../")
- from boolean_classifier.datasets.boolean_ngram_dataset import BooleanNGramDataset
- from boolean_classifier.datasets.ngram_dataset import NGramDataset
- from boolean_classifier.architectures.ffnn import FFNN
- from torch.utils.data import DataLoader
- import multiprocessing
- import json
- import os
- import torch.nn
- from torch.optim.lr_scheduler import _LRScheduler
- from torch.utils.data import DataLoader
- from tqdm import tqdm
- import joblib
-
-
class EarlyStoppingPyTorchTrainer:
    """Trainer for PyTorch models with early stopping.

    After every call to ``fit`` / ``validate`` the epoch's loss and accuracy
    are appended to ``training_losses`` / ``training_accuracies`` and
    ``validation_losses`` / ``validation_accuracies`` respectively.
    """

    def __init__(self, optimizer: torch.optim.Optimizer, epochs: int = 5,
                 loss: torch.nn.Module = None, scheduler: _LRScheduler = None,
                 feature_selector=None) -> None:
        """
        Create PyTorch trainer.
        Parameters
        ----------
        optimizer : torch.optim.Optimizer
            Optimizer to use for training the model.
        epochs : int, optional
            Maximum number of epochs, by default 5.
        loss : torch.nn.Module, optional
            Loss to minimize, by default None (``torch.nn.CrossEntropyLoss``).
        scheduler : _LRScheduler, optional
            Scheduler for the optimizer, stepped once per training epoch,
            by default None.
        feature_selector : object, optional
            Object whose ``transform(x)`` is applied to every input batch
            before it is fed to the model, by default None.
        """
        self._epochs = epochs
        self._optimizer = optimizer
        self._loss = loss if loss is not None else torch.nn.CrossEntropyLoss()
        self._scheduler = scheduler
        self.feature_selector = feature_selector

        # Per-epoch history, one entry per fit()/validate() call.
        self.training_losses = []
        self.training_accuracies = []
        self.validation_losses = []
        self.validation_accuracies = []

    def _prepare_batch(self, x, y, device):
        """Apply the optional feature selector to ``x`` and move the batch to ``device``."""
        if self.feature_selector is not None:
            x = torch.Tensor(self.feature_selector.transform(x))
        return x.to(device), y.to(device)

    def train(self, model: torch.nn.Module,
              train_loader: DataLoader,
              val_loader: DataLoader,
              patience: int) -> torch.nn.Module:
        """
        Train model with given loaders and early stopping.
        Parameters
        ----------
        model : torch.nn.Module
            Pytorch model to be trained.
        train_loader : DataLoader
            Train data loader.
        val_loader : DataLoader
            Validation data loader.
        patience : int
            Number of consecutive epochs without improvement of the
            validation loss to wait before early stopping.
        Returns
        -------
        torch.nn.Module
            A copy of the model taken at the epoch with the lowest
            validation loss. If no epoch produced a usable validation loss
            (zero epochs, or the loss was NaN every time) the model in its
            final state is returned instead of ``None``.
        """
        best_loss = float("inf")
        best_model = None
        patience_counter = 0
        for _ in range(self._epochs):
            model = self.fit(model, train_loader)
            val_loss = self.validate(model, val_loader)
            if val_loss <= best_loss:
                best_loss = val_loss
                # Snapshot the weights: later epochs keep mutating `model`.
                best_model = copy.deepcopy(model)
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    break
        # Never hand `None` back to the caller, which goes on to save the model.
        return best_model if best_model is not None else model

    def fit(self,
            model: torch.nn.Module,
            dataloader: DataLoader) -> torch.nn.Module:
        """
        Train model for one epoch with given loader.
        Parameters
        ----------
        model : torch.nn.Module
            Pytorch model to be trained.
        dataloader : DataLoader
            Train data loader.
        Returns
        -------
        torch.nn.Module
            Trained model.
        """
        device = next(model.parameters()).device
        model = model.train()
        model = model.to(device)
        running_loss = 0.0
        train_total = 0
        train_correct = 0
        for x, y in tqdm(dataloader):
            x, y = self._prepare_batch(x, y, device)
            self._optimizer.zero_grad()
            outputs = model(x)
            loss = self._loss(outputs, y)
            loss.backward()
            self._optimizer.step()
            running_loss += loss.item()
            # softmax is monotonic, so argmax over the raw outputs gives the
            # same predicted class without the extra pass.
            y_preds = outputs.argmax(dim=1)
            train_total += y.size(0)
            train_correct += (y_preds == y).sum().item()

        # NOTE(review): this divides the sum of per-batch loss values by the
        # number of samples; with a mean-reduced loss that is not a true
        # per-sample mean - confirm the intended normalisation.
        self.training_losses.append(running_loss / train_total)
        self.training_accuracies.append(train_correct / train_total)

        if self._scheduler is not None:
            self._scheduler.step()
        return model

    def validate(self,
                 model: torch.nn.Module,
                 dataloader: DataLoader) -> float:
        """
        Validate model with given loader.
        Parameters
        ----------
        model : torch.nn.Module
            Pytorch model to be validated.
        dataloader : DataLoader
            Validation data loader.
        Returns
        -------
        float
            Validation loss of the model over the whole loader (the same
            value that is appended to ``validation_losses``).
        """
        running_loss = 0.0
        val_total = 0
        val_correct = 0
        device = next(model.parameters()).device
        model = model.eval()
        model = model.to(device)
        with torch.no_grad():
            for x, y in tqdm(dataloader):
                x, y = self._prepare_batch(x, y, device)
                outputs = model(x)
                running_loss += self._loss(outputs, y).item()
                y_preds = outputs.argmax(dim=1)

                val_total += y.size(0)
                val_correct += (y_preds == y).sum().item()

        epoch_loss = running_loss / val_total
        self.validation_losses.append(epoch_loss)
        self.validation_accuracies.append(val_correct / val_total)
        # Return the aggregated epoch loss as a plain float; returning the
        # last batch's loss tensor would make early stopping track one batch.
        return epoch_loss
-
def save_results(trainer: "EarlyStoppingPyTorchTrainer", configuration: dict):
    """Write the trainer's per-epoch metrics to ``results.json``.

    Parameters
    ----------
    trainer : EarlyStoppingPyTorchTrainer
        Trainer whose ``training_losses``, ``training_accuracies``,
        ``validation_losses`` and ``validation_accuracies`` are saved.
    configuration : dict
        Configuration; ``configuration["model_path"]`` is the directory the
        file is written to. The directory is created if it does not exist.
    """
    output_dir = configuration["model_path"]
    # An empty path means "current directory"; os.makedirs("") would raise.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    results = {
        "training_losses": trainer.training_losses,
        "training_accuracies": trainer.training_accuracies,
        "validation_losses": trainer.validation_losses,
        "validation_accuracies": trainer.validation_accuracies,
    }
    with open(os.path.join(output_dir, "results.json"), "w") as output_file:
        json.dump(results, output_file)
-
def load_configuration(configuration_filepath: str) -> dict:
    """Read the JSON file at ``configuration_filepath`` and return its parsed content."""
    with open(configuration_filepath, "r") as handle:
        return json.load(handle)
-
# Command-line entry point: build the datasets and the model from the given
# files, train with early stopping, then save the weights and metric history.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Train malware detector')
    parser.add_argument("training_file",
                        type=str,
                        help="Training file containing the hashes and labels of the benign and malicious samples"
                        )
    parser.add_argument("validation_file",
                        type=str,
                        help="Validation file containing the hashes and labels of the benign and malicious samples"
                        )
    parser.add_argument("dataset_type",
                        type=str,
                        help="Type of dataset: {BooleanBigrams, Bigrams, EMBER}"
                        )
    parser.add_argument("configuration_file",
                        type=str,
                        help="Configuration file containing the hyperparameters of the model"
                        )
    parser.add_argument("--batch_size",
                        type=int,
                        help="Batch size for training",
                        default=32
                        )
    parser.add_argument("--num_epochs",
                        type=int,
                        help="Max epochs",
                        default=50
                        )
    parser.add_argument("--patience",
                        type=int,
                        help="Patience for early stopping",
                        default=5
                        )
    args = parser.parse_args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device: ", device)
    # Leave a few cores free on large machines, but always use at least
    # half of them (plus one) on small ones.
    cpu_count = multiprocessing.cpu_count()
    num_workers = max(cpu_count - 4, cpu_count // 2 + 1)

    # Dataset classes implemented so far, keyed by the CLI dataset type.
    dataset_classes = {
        "BooleanBigrams": BooleanNGramDataset,
        "Bigrams": NGramDataset,
    }
    if args.dataset_type not in dataset_classes:
        raise NotImplementedError(
            f"Unsupported dataset type {args.dataset_type!r}; "
            "only 'BooleanBigrams' and 'Bigrams' are currently supported"
        )
    dataset_class = dataset_classes[args.dataset_type]
    training_dataset = dataset_class(args.training_file)
    validation_dataset = dataset_class(args.validation_file)

    training_dataloader = DataLoader(
        training_dataset,
        batch_size=args.batch_size,
        num_workers=num_workers,
    )
    validation_dataloader = DataLoader(
        validation_dataset,
        batch_size=args.batch_size,
        num_workers=num_workers,
    )

    configuration = load_configuration(args.configuration_file)
    model = FFNN(configuration)
    model = model.to(device)

    # A missing "feature_selector" key is treated like an explicit null.
    feature_selector_path = configuration.get("feature_selector")
    if feature_selector_path is not None:
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code - only point the configuration at trusted selector files.
        feature_selector = joblib.load(feature_selector_path)
    else:
        feature_selector = None

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())

    trainer = EarlyStoppingPyTorchTrainer(
        optimizer,
        epochs=args.num_epochs,
        loss=criterion,
        feature_selector=feature_selector
    )
    model = trainer.train(
        model,
        training_dataloader,
        validation_dataloader,
        args.patience
    )
    model_dir = configuration["model_path"]
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(model_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(model_dir, "model.pth"))
    save_results(trainer, configuration)
-
-
-
|