пре 2 дана · 9db244e6da
--- a/data/2026-IJCI/README.md
+++ b/data/2026-IJCI/README.md
 
															+
														
 
															+# A Malware Detection Model as a Benchmark for Neural Network Verification
														
 
															+
														
 
															+## Overview
														
 
															+This directory contains data and resources for the 2026-IJCI paper titled "A Malware Detection Model as a Benchmark for Neural Network Verification".
														
 
															+
														
 
															+## Contents
														
 
															+- `boolean_classifier/` - Neural Network model
														
 
															+- `verifier/` - Scripts for tool verification
														
--- a/data/2026-IJCI/boolean_classifier/__init__.py
+++ b/data/2026-IJCI/boolean_classifier/__init__.py
--- a/data/2026-IJCI/boolean_classifier/architectures/ffnn.py
+++ b/data/2026-IJCI/boolean_classifier/architectures/ffnn.py
 
															+import torch
														
 
															+
														
 
															+
														
 
															+class FFNN(torch.nn.Module):
														
 
															+    def __init__(self, configuration: dict):
														
 
															+        super().__init__()
														
 
															+        self.hidden_size = configuration["hidden_size"]
														
 
															+        self.input_size = configuration["input_size"]
														
 
															+        self.output_size = 2
														
 
															+        self.hidden_1 = torch.nn.Linear(self.input_size, self.hidden_size)
														
 
															+        self.output_layer = torch.nn.Linear(self.hidden_size, 2)
														
 
															+
														
 
															+
														
 
															+    def forward(self, x: torch.Tensor)-> torch.Tensor:
														
 
															+        x = self.hidden_1(x)
														
 
															+        x = torch.relu(x)
														
 
															+        y = self.output_layer(x)
														
 
															+        return y
														
 
															+
														
 
															+    def predict(self, x: torch.Tensor) -> torch.Tensor:
														
 
															+        outputs =  self.forward(x)
														
 
															+        outputs = outputs.softmax(dim=1)
														
 
															+        return outputs
														
--- a/data/2026-IJCI/boolean_classifier/classify_file.py
+++ b/data/2026-IJCI/boolean_classifier/classify_file.py
 
															+import argparse
														
 
															+import torch
														
 
															+import os
														
 
															+import sys
														
 
															+sys.path.append("../")
														
 
															+from boolean_classifier.architectures.ffnn import FFNN
														
 
															+import json
														
 
															+from boolean_classifier.feature_extractors.boolean_ngram_feature_extractor import BooleanNGramFeatureExtractor
														
 
															+from boolean_classifier.feature_extractors.ngram_feature_extractor import NGramFeatureExtractor
														
 
															+import joblib
														
 
															+import numpy as np
														
 
															+
														
 
															+
														
 
															+def load_configuration(configuration_filepath: str) -> dict:
														
 
															+    with open(configuration_filepath, "r") as configuration_file:
														
 
															+        configuration = json.load(configuration_file)
														
 
															+    return configuration
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    parser = argparse.ArgumentParser(description='Classify a single file with boolean malware detector')
														
 
															+    parser.add_argument("exe_filepath",
														
 
															+                        type=str,
														
 
															+                        help="Filepath of the executable"
														
 
															+                        )
														
 
															+    parser.add_argument("feature_type",
														
 
															+                        type=str,
														
 
															+                        help="Type of features to extract. Select one of the following: {BooleanBigrams, Bigrams}")
														
 
															+    parser.add_argument("configuration_file",
														
 
															+                        type=str,
														
 
															+                        help="Configuration file containing the hyperparameters of the model"
														
 
															+                        )
														
 
															+    args = parser.parse_args()
														
 
															+
														
 
															+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
														
 
															+    print("Device: ", device)
														
 
															+
														
 
															+    configuration = load_configuration(args.configuration_file)
														
 
															+    if "feature_selector" in configuration:
														
 
															+        feature_selector = joblib.load(configuration["feature_selector"])
														
 
															+    else:
														
 
															+        feature_selector = None
														
 
															+
														
 
															+    # Load model
														
 
															+    model = FFNN(configuration)
														
 
															+    model = model.to(device)
														
 
															+    model.load_state_dict(torch.load(os.path.join(configuration["model_path"], "model.pth"), weights_only=True))
														
 
															+    model.eval()
														
 
															+
														
 
															+    with open(args.exe_filepath, "rb") as f:
														
 
															+        bytez = f.read()
														
 
															+
														
 
															+    if args.feature_type == "BooleanBigrams":
														
 
															+        feature_extractor = BooleanNGramFeatureExtractor(N=2)
														
 
															+        sparse_features = feature_extractor.feature_vector(bytez)
														
 
															+        features = sparse_features.todense()
														
 
															+        # print("feature vector: ", features)
														
 
															+        # zero_indices = np.where(features[0] == 0)[1]
														
 
															+        # print("Number of zero features: ", len(zero_indices))
														
 
															+        # print("Zero indices: ", zero_indices)
														
 
															+        # # Remove some items from zero_indices
														
 
															+        # if len(zero_indices) > 4000:
														
 
															+        #     zero_indices = np.random.choice(zero_indices, size=4000, replace=False)
														
 
															+        # print("Zero indices after sampling: ", zero_indices)
														
 
															+        # for i in zero_indices:
														
 
															+        #     features[0, i] = 1
														
 
															+        # zero_indices = np.where(features[0] == 0)[1]
														
 
															+        # print("Number of zero features: ", len(zero_indices))
														
 
															+    elif args.feature_type == "Bigrams":
														
 
															+        feature_extractor = NGramFeatureExtractor(N=2)
														
 
															+        features = feature_extractor.feature_vector(bytez)
														
 
															+    else:
														
 
															+        raise NotImplementedError("Select one of the following: {BooleanBigrams, Bigrams}")
														
 
															+    if feature_selector is not None:
														
 
															+        features = feature_selector.transform(torch.Tensor(features))
														
 
															+    x = torch.tensor(features, dtype=torch.float).to(device)
														
 
															+    probs = model.predict(x)
														
 
															+    y_pred = probs.argmax(dim=1)
														
 
															+    print("Predicted label: ", y_pred, probs)
														
 
															+
														
 
															+
														
--- a/data/2026-IJCI/boolean_classifier/data/BODMAS/bodmas_boolean_2Gram_test_set.csv
+++ b/data/2026-IJCI/boolean_classifier/data/BODMAS/bodmas_boolean_2Gram_test_set.csv
--- a/data/2026-IJCI/boolean_classifier/data/BODMAS/bodmas_boolean_2Gram_training_set.csv
+++ b/data/2026-IJCI/boolean_classifier/data/BODMAS/bodmas_boolean_2Gram_training_set.csv
--- a/data/2026-IJCI/boolean_classifier/data/BODMAS/bodmas_boolean_2Gram_validation_set.csv
+++ b/data/2026-IJCI/boolean_classifier/data/BODMAS/bodmas_boolean_2Gram_validation_set.csv
--- a/data/2026-IJCI/boolean_classifier/data/BODMAS/extract_EMBER_features.py
+++ b/data/2026-IJCI/boolean_classifier/data/BODMAS/extract_EMBER_features.py
 
															+import sys
														
 
															+sys.path.append("../../../")
														
 
															+from boolean_classifier.feature_extractors.ember_feature_extractor import EMBERFeatureExtractor
														
 
															+
														
 
															+training_filepaths = ["bodmas_training_set.csv", "bodmas_validation_set.csv", "bodmas_test_set.csv"]
														
 
															+output_filepaths = ["bodmas_ember_training_set.csv", "bodmas_ember_validation_set.csv", "bodmas_ember_test_set.csv"]
														
 
															+for i, training_filepath in enumerate(training_filepaths):
														
 
															+    for output_filepath in output_filepaths:
														
 
															+        with open(output_filepath, "w") as output_file:
														
 
															+            with open(training_filepath, "r") as f:
														
 
															+                lines = f.readlines()
														
 
															+                for j, line in enumerate(lines):
														
 
															+                    exe_filepath, label = line.strip().split(",")
														
 
															+                    print(j, exe_filepath, label)
														
 
															+                    with open(exe_filepath, "rb") as exe_file:
														
 
															+                        bytez = exe_file.read()
														
 
															+                    try:
														
 
															+                        features = EMBERFeatureExtractor().feature_vector(bytez)
														
 
															+                        for feature in features:
														
 
															+                            output_file.write("{},".format(feature))
														
 
															+                        output_file.write(str(label)+"\n")
														
 
															+                    except ValueError as e:
														
 
															+                        print(e)
														
 
															+
														
--- a/data/2026-IJCI/boolean_classifier/data/BODMAS/extract_boolean_2Gram_features.py
+++ b/data/2026-IJCI/boolean_classifier/data/BODMAS/extract_boolean_2Gram_features.py
 
															+import os
														
 
															+import sys
														
 
															+sys.path.append("../../../")
														
 
															+from boolean_classifier.feature_extractors.boolean_ngram_feature_extractor import BooleanNGramFeatureExtractor
														
 
															+import numpy as np
														
 
															+from scipy.sparse import csr_matrix
														
 
															+import scipy.sparse
														
 
															+
														
 
															+
														
 
															+training_filepaths = [
														
 
															+    "bodmas_training_set.csv",
														
 
															+    "bodmas_validation_set.csv",
														
 
															+    "bodmas_test_set.csv"
														
 
															+]
														
 
															+features_training_filepaths = [
														
 
															+    "bodmas_boolean_2Gram_features_training_set.csv",
														
 
															+    "bodmas_boolean_2Gram_features_validation_set.csv",
														
 
															+    "bodmas_boolean_2Gram_features_test_set.csv"
														
 
															+]
														
 
															+
														
 
															+features_directories = [
														
 
															+    "/home/daniel/Datasets/BODMAS/Boolean_2Gram_features/training/",
														
 
															+    "/home/daniel/Datasets/BODMAS/Boolean_2Gram_features/validation/",
														
 
															+    "/home/daniel/Datasets/BODMAS/Boolean_2Gram_features/test/"
														
 
															+]
														
 
															+
														
 
															+training_lists = [[], [], []]
														
 
															+
														
 
															+for i, training_filepath in enumerate(training_filepaths):
														
 
															+    with open(training_filepath, "r") as f:
														
 
															+        lines = f.readlines()
														
 
															+        for j, line in enumerate(lines):
														
 
															+            exe_filepath, label = line.strip().split(",")
														
 
															+            sha = exe_filepath.split("/")[-1]
														
 
															+            print(j, exe_filepath, label)
														
 
															+            with open(exe_filepath, "rb") as exe_file:
														
 
															+                bytez = exe_file.read()
														
 
															+            try:
														
 
															+                sparse_features = BooleanNGramFeatureExtractor(N=2).feature_vector(bytez)
														
 
															+            except ValueError as e:
														
 
															+                print(e)
														
 
															+            scipy.sparse.save_npz(os.path.join(features_directories[i], sha+".npz"), sparse_features)
														
 
															+
														
 
															+            training_lists[i].append((os.path.join(features_directories[i], sha+".npz"), label))
														
 
															+
														
 
															+for i, features_filepath in enumerate(features_training_filepaths):
														
 
															+    with open(features_filepath, "w") as f:
														
 
															+        for filepath, label in training_lists[i]:
														
 
															+            f.write("{},{}\n".format(filepath, label))
														
 
															+
														
--- a/data/2026-IJCI/boolean_classifier/data/BODMAS/feature_selectors/boolean_bigrams/boolean_bigrams_feature_selector_k=1000.pkl
+++ b/data/2026-IJCI/boolean_classifier/data/BODMAS/feature_selectors/boolean_bigrams/boolean_bigrams_feature_selector_k=1000.pkl
--- a/data/2026-IJCI/boolean_classifier/data/examples/0a01c936a018e3fdbc804816128bb73d0c0196c0bf6f931d195529f8ed3f6d87
+++ b/data/2026-IJCI/boolean_classifier/data/examples/0a01c936a018e3fdbc804816128bb73d0c0196c0bf6f931d195529f8ed3f6d87
--- a/data/2026-IJCI/boolean_classifier/datasets/boolean_ngram_dataset.py
+++ b/data/2026-IJCI/boolean_classifier/datasets/boolean_ngram_dataset.py
 
															+from torch.utils.data import Dataset
														
 
															+import os
														
 
															+from random import shuffle
														
 
															+import numpy as np
														
 
															+import torch
														
 
															+import scipy.sparse
														
 
															+
														
 
															+
														
 
															+class BooleanNGramDataset(Dataset):
														
 
															+    def __init__(self, csv_filepath: str):
														
 
															+        self.all_files = []
														
 
															+        with open(csv_filepath, "r") as f:
														
 
															+            lines = f.readlines()
														
 
															+            for line in lines:
														
 
															+                filepath, label = line.strip().split(",")
														
 
															+                self.all_files.append((filepath, int(label)))
														
 
															+        shuffle(self.all_files)
														
 
															+
														
 
															+    def __len__(self):
														
 
															+        return len(self.all_files)
														
 
															+
														
 
															+    def __getitem__(self, index):
														
 
															+        to_load, y = self.all_files[index]
														
 
															+        # Step 1: Load the .npz file into a sparse matrix
														
 
															+        sparse_matrix = scipy.sparse.load_npz(to_load)
														
 
															+        # Step 2: Convert the sparse matrix to a dense matrix (e.g., using toarray())
														
 
															+        dense_matrix = sparse_matrix.toarray()  # You can also use .todense() if needed
														
 
															+
														
 
															+        # Step 3: Convert the dense matrix to a PyTorch tensor
														
 
															+        x = torch.tensor(dense_matrix, dtype=torch.float)
														
 
															+        x = x.squeeze()
														
 
															+        return x, torch.tensor(y)
														
--- a/data/2026-IJCI/boolean_classifier/datasets/ngram_dataset.py
+++ b/data/2026-IJCI/boolean_classifier/datasets/ngram_dataset.py
 
															+from torch.utils.data import Dataset
														
 
															+import os
														
 
															+from random import shuffle
														
 
															+import numpy as np
														
 
															+import torch
														
 
															+import scipy.sparse
														
 
															+
														
 
															+
														
 
															+class NGramDataset(Dataset):
														
 
															+    def __init__(self, csv_filepath: str):
														
 
															+        self.all_files = []
														
 
															+        with open(csv_filepath, "r") as f:
														
 
															+            lines = f.readlines()
														
 
															+            for line in lines:
														
 
															+                filepath, label = line.strip().split(",")
														
 
															+                self.all_files.append((filepath, int(label)))
														
 
															+        shuffle(self.all_files)
														
 
															+
														
 
															+
														
 
															+    def __len__(self):
														
 
															+        return len(self.all_files)
														
 
															+
														
 
															+    def __getitem__(self, index):
														
 
															+        to_load, y = self.all_files[index]
														
 
															+        # Step 1: Load the .npz file
														
 
															+        matrix = np.load(to_load)["arr_0"]
														
 
															+        # Step 2: Convert the dense matrix to a PyTorch tensor
														
 
															+        x = torch.tensor(matrix, dtype=torch.float)
														
 
															+        x = x.squeeze()
														
 
															+        return x, torch.tensor(y)
														
--- a/data/2026-IJCI/boolean_classifier/evaluate_malware_detector.py
+++ b/data/2026-IJCI/boolean_classifier/evaluate_malware_detector.py
 
															+import argparse
														
 
															+import torch
														
 
															+import sys
														
 
															+sys.path.append("../")
														
 
															+from boolean_classifier.datasets.boolean_ngram_dataset import BooleanNGramDataset
														
 
															+from boolean_classifier.datasets.ngram_dataset import NGramDataset
														
 
															+from boolean_classifier.architectures.ffnn import FFNN
														
 
															+from torch.utils.data import DataLoader
														
 
															+import multiprocessing
														
 
															+import json
														
 
															+import os
														
 
															+import torch.nn
														
 
															+from torch.optim.lr_scheduler import _LRScheduler
														
 
															+from torch.utils.data import DataLoader
														
 
															+from tqdm import tqdm
														
 
															+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
														
 
															+import joblib
														
 
															+
														
 
															+
														
 
															+def load_configuration(configuration_filepath: str) -> dict:
														
 
															+    with open(configuration_filepath, "r") as configuration_file:
														
 
															+        configuration = json.load(configuration_file)
														
 
															+    return configuration
														
 
															+
														
 
															+def evaluate(model: torch.nn.Module, dataloader: DataLoader) -> tuple[list, list]:
														
 
															+    y_trues = []
														
 
															+    y_preds = []
														
 
															+    device = next(model.parameters()).device
														
 
															+    model = model.eval()
														
 
															+    with torch.no_grad():
														
 
															+        for x, y in tqdm(dataloader):
														
 
															+            if feature_selector is not None:
														
 
															+                x = torch.Tensor(feature_selector.transform(x))
														
 
															+            x, y = x.to(device), y.to(device)
														
 
															+            outputs = model.predict(x)
														
 
															+            y_pred =  outputs.argmax(dim=1)
														
 
															+            y_trues.extend(y.cpu())
														
 
															+            y_preds.extend(y_pred.cpu())
														
 
															+    return y_trues, y_preds
														
 
															+
														
 
															+def save_results(y_trues: list, y_preds: list, output_filepath: str):
														
 
															+    acc = accuracy_score(y_trues, y_preds)
														
 
															+    precision = precision_score(y_trues, y_preds)
														
 
															+    recall = recall_score(y_trues, y_preds)
														
 
															+    f1 = f1_score(y_trues, y_preds)
														
 
															+    cm = confusion_matrix(y_trues, y_preds)
														
 
															+
														
 
															+    with open(output_filepath, "w") as output_file:
														
 
															+        output_file.write("Accuracy: {}\n".format(acc))
														
 
															+        output_file.write("Precision: {}\n".format(precision))
														
 
															+        output_file.write("Recall: {}\n".format(recall))
														
 
															+        output_file.write("F1: {}\n".format(f1))
														
 
															+        output_file.write("Confusion Matrix: {}\n".format(cm))
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    parser = argparse.ArgumentParser(description='Evaluate malware detector')
														
 
															+    parser.add_argument("evaluation_file",
														
 
															+                        type=str,
														
 
															+                        help="Evaluation file containing the hashes and labels of the benign and malicious samples"
														
 
															+                        )
														
 
															+    parser.add_argument("dataset_type",
														
 
															+                        type=str,
														
 
															+                        help="Type of dataset: {Boolean, EMBER}"
														
 
															+                        )
														
 
															+    parser.add_argument("configuration_file",
														
 
															+                        type=str,
														
 
															+                        help="Configuration file containing the hyperparameters of the model"
														
 
															+                        )
														
 
															+    parser.add_argument("output_file",
														
 
															+                        type=str,
														
 
															+                        help="File to where to store the results",
														
 
															+                        )
														
 
															+    parser.add_argument("--batch_size",
														
 
															+                        type=int,
														
 
															+                        help="Batch size for training",
														
 
															+                        default=32
														
 
															+                        )
														
 
															+    args = parser.parse_args()
														
 
															+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
														
 
															+    print("Device: ", device)
														
 
															+    num_workers = max(multiprocessing.cpu_count() - 4, multiprocessing.cpu_count() // 2 + 1)
														
 
															+
														
 
															+    configuration = load_configuration(args.configuration_file)
														
 
															+    if args.dataset_type == "BooleanBigrams":
														
 
															+        dataset = BooleanNGramDataset(args.evaluation_file)
														
 
															+    elif args.dataset_type == "Bigrams":
														
 
															+        dataset = NGramDataset(args.evaluation_file)
														
 
															+    else:
														
 
															+        raise NotImplementedError("Only Boolean dataset is currently supported")
														
 
															+    dataloader = DataLoader(
														
 
															+        dataset,
														
 
															+        batch_size=args.batch_size,
														
 
															+        num_workers=num_workers,
														
 
															+    )
														
 
															+    model = FFNN(configuration)
														
 
															+    model = model.to(device)
														
 
															+    model.load_state_dict(torch.load(os.path.join(configuration["model_path"], "model.pth"), weights_only=True))
														
 
															+    model.eval()
														
 
															+    if configuration["feature_selector"] is not None:
														
 
															+        feature_selector = joblib.load(configuration["feature_selector"])
														
 
															+    else:
														
 
															+        feature_selector = None
														
 
															+    y_trues, y_preds = evaluate(model, dataloader)
														
 
															+    save_results(y_trues, y_preds, args.output_file)
														
 
															+
														
 
															+
														
 
															+
														
--- a/data/2026-IJCI/boolean_classifier/feature_extractors/boolean_ngram_feature_extractor.py
+++ b/data/2026-IJCI/boolean_classifier/feature_extractors/boolean_ngram_feature_extractor.py
 
															+from collections import OrderedDict
														
 
															+from scipy.sparse import csr_matrix
														
 
															+
														
 
															+
														
 
															+class BooleanNGramFeatureExtractor(object):
														
 
															+    def __init__(self, N: int = 2):
														
 
															+        self.N = N
														
 
															+        self.dim = 256 ** N
														
 
															+        self.ngram_features = OrderedDict({"{},{}".format(i,j): 0 for i in range(256) for j in range(256)})
														
 
															+
														
 
															+    def feature_vector(self, bytez):
														
 
															+        raw_features = self.extract_ngram_features(bytez)
														
 
															+        return self.reduce(raw_features)
														
 
															+
														
 
															+    def extract_ngram_features(self, bytez)-> dict:
														
 
															+        words = list(bytez)
														
 
															+        bigrams = zip(words, words[1:])  # Create bi-grams
														
 
															+        bigrams = set(bigrams)
														
 
															+        for bigram in bigrams:
														
 
															+            self.ngram_features["{},{}".format(bigram[0], bigram[1])] += 1
														
 
															+        return self.ngram_features
														
 
															+
														
 
															+    def reduce(self, raw_features: dict, technique: str = None):
														
 
															+        if technique is None:
														
 
															+            return csr_matrix(list(raw_features.values()))
														
 
															+        else:
														
 
															+            raise NotImplementedError("Feature selection and dimensionality reduction technique not implemented")
														
 
															+
														
--- a/data/2026-IJCI/boolean_classifier/feature_extractors/ember_feature_extractor.py
+++ b/data/2026-IJCI/boolean_classifier/feature_extractors/ember_feature_extractor.py
 
															+#!/usr/bin/python
														
 
															+''' Extracts some basic features from PE files. Many of the features
														
 
															+implemented have been used in previously published works. For more information,
														
 
															+check out the following resources:
														
 
															+* Schultz, et al., 2001: http://128.59.14.66/sites/default/files/binaryeval-ieeesp01.pdf
														
 
															+* Kolter and Maloof, 2006: http://www.jmlr.org/papers/volume7/kolter06a/kolter06a.pdf
														
 
															+* Shafiq et al., 2009: https://www.researchgate.net/profile/Fauzan_Mirza/publication/242084613_A_Framework_for_Efficient_Mining_of_Structural_Information_to_Detect_Zero-Day_Malicious_Portable_Executables/links/0c96052e191668c3d5000000.pdf
														
 
															+* Raman, 2012: http://2012.infosecsouthwest.com/files/speaker_materials/ISSW2012_Selecting_Features_to_Classify_Malware.pdf
														
 
															+* Saxe and Berlin, 2015: https://arxiv.org/pdf/1508.03096.pdf
														
 
															+
														
 
															+It may be useful to do feature selection to reduce this set of features to a meaningful set
														
 
															+for your modeling problem.
														
 
															+'''
														
 
															+
														
 
															+import hashlib
														
 
															+import json
														
 
															+import os
														
 
															+import re
														
 
															+
														
 
															+import lief
														
 
															+import numpy as np
														
 
															+from sklearn.feature_extraction import FeatureHasher
														
 
															+
														
 
															+LIEF_MAJOR, LIEF_MINOR, _ = lief.__version__.split('.')
														
 
															+LIEF_EXPORT_OBJECT = int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 10)
														
 
															+LIEF_HAS_SIGNATURE = int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 11)
														
 
															+
														
 
															+
														
 
															+class FeatureType(object):
														
 
															+    ''' Base class from which each feature type may inherit '''
														
 
															+
														
 
															+    name = ''
														
 
															+    dim = 0
														
 
															+
														
 
															+    def __repr__(self):
														
 
															+        return '{}({})'.format(self.name, self.dim)
														
 
															+
														
 
															+    def raw_features(self, bytez, lief_binary):
														
 
															+        ''' Generate a JSON-able representation of the file '''
														
 
															+        raise (NotImplementedError)
														
 
															+
														
 
															+    def process_raw_features(self, raw_obj):
														
 
															+        ''' Generate a feature vector from the raw features '''
														
 
															+        raise (NotImplementedError)
														
 
															+
														
 
															+    def feature_vector(self, bytez, lief_binary):
														
 
															+        ''' Directly calculate the feature vector from the sample itself. This should only be implemented differently
														
 
															+        if there are significant speedups to be gained from combining the two functions. '''
														
 
															+        return self.process_raw_features(self.raw_features(bytez, lief_binary))
														
 
															+
														
 
															+
														
 
															+class ByteHistogram(FeatureType):
														
 
															+    ''' Byte histogram (count + non-normalized) over the entire binary file '''
														
 
															+
														
 
															+    name = 'histogram'
														
 
															+    dim = 256
														
 
															+
														
 
															+    def __init__(self):
														
 
															+        super(FeatureType, self).__init__()
														
 
															+
														
 
															+    def raw_features(self, bytez, lief_binary):
														
 
															+        counts = np.bincount(np.frombuffer(bytez, dtype=np.uint8), minlength=256)
														
 
															+        return counts.tolist()
														
 
															+
														
 
															+    def process_raw_features(self, raw_obj):
														
 
															+        counts = np.array(raw_obj, dtype=np.float32)
														
 
															+        sum = counts.sum()
														
 
															+        normalized = counts / sum
														
 
															+        return normalized
														
 
															+
														
 
															+
														
 
															+class ByteEntropyHistogram(FeatureType):
														
 
															+    ''' 2d byte/entropy histogram based loosely on (Saxe and Berlin, 2015).
														
 
															+    This roughly approximates the joint probability of byte value and local entropy.
														
 
															+    See Section 2.1.1 in https://arxiv.org/pdf/1508.03096.pdf for more info.
														
 
															+    '''
														
 
															+
														
 
															+    name = 'byteentropy'
														
 
															+    dim = 256
														
 
															+
														
 
															+    def __init__(self, step=1024, window=2048):
														
 
															+        super(FeatureType, self).__init__()
														
 
															+        self.window = window
														
 
															+        self.step = step
														
 
															+
														
 
															+    def _entropy_bin_counts(self, block):
														
 
															+        # coarse histogram, 16 bytes per bin
														
 
															+        c = np.bincount(block >> 4, minlength=16)  # 16-bin histogram
														
 
															+        p = c.astype(np.float32) / self.window
														
 
															+        wh = np.where(c)[0]
														
 
															+        H = np.sum(-p[wh] * np.log2(
														
 
															+            p[wh])) * 2  # * x2 b.c. we reduced information by half: 256 bins (8 bits) to 16 bins (4 bits)
														
 
															+
														
 
															+        Hbin = int(H * 2)  # up to 16 bins (max entropy is 8 bits)
														
 
															+        if Hbin == 16:  # handle entropy = 8.0 bits
														
 
															+            Hbin = 15
														
 
															+
														
 
															+        return Hbin, c
														
 
															+
														
 
															+    def raw_features(self, bytez, lief_binary):
														
 
															+        output = np.zeros((16, 16), dtype=int)
														
 
															+        a = np.frombuffer(bytez, dtype=np.uint8)
														
 
															+        if a.shape[0] < self.window:
														
 
															+            Hbin, c = self._entropy_bin_counts(a)
														
 
															+            output[Hbin, :] += c
														
 
															+        else:
														
 
															+            # strided trick from here: http://www.rigtorp.se/2011/01/01/rolling-statistics-numpy.html
														
 
															+            shape = a.shape[:-1] + (a.shape[-1] - self.window + 1, self.window)
														
 
															+            strides = a.strides + (a.strides[-1],)
														
 
															+            blocks = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::self.step, :]
														
 
															+
														
 
															+            # from the blocks, compute histogram
														
 
															+            for block in blocks:
														
 
															+                Hbin, c = self._entropy_bin_counts(block)
														
 
															+                output[Hbin, :] += c
														
 
															+
														
 
															+        return output.flatten().tolist()
														
 
															+
														
 
															+    def process_raw_features(self, raw_obj):
														
 
															+        counts = np.array(raw_obj, dtype=np.float32)
														
 
															+        sum = counts.sum()
														
 
															+        normalized = counts / sum
														
 
															+        return normalized
														
 
															+
														
 
															+
														
 
															+class SectionInfo(FeatureType):
														
 
															+    ''' Information about section names, sizes and entropy.  Uses hashing trick
														
 
															+    to summarize all this section info into a feature vector.
														
 
															+    '''
														
 
															+
														
 
															+    name = 'section'
														
 
															+    dim = 5 + 50 + 50 + 50 + 50 + 50
														
 
															+
														
 
															+    def __init__(self):
														
 
															+        super(FeatureType, self).__init__()
														
 
															+
														
 
															+    @staticmethod
														
 
															+    def _properties(s):
														
 
															+        return [str(c).split('.')[-1] for c in s.characteristics_lists]
														
 
															+
														
 
															+    def raw_features(self, bytez, lief_binary):
														
 
															+        if lief_binary is None:
														
 
															+            return {"entry": "", "sections": []}
														
 
															+
														
 
															+        # properties of entry point, or if invalid, the first executable section
														
 
															+        not_found_error_class = RuntimeError if not lief.__version__.startswith("0.9.0") else lief.not_found
														
 
															+        try:
														
 
															+            if int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 12):
														
 
															+                section = lief_binary.section_from_rva(lief_binary.entrypoint - lief_binary.imagebase)
														
 
															+
														
 
															+                if section is None:
														
 
															+                    raise not_found_error_class
														
 
															+                entry_section = section.name
														
 
															+            else:  # lief < 0.12
														
 
															+                entry_section = lief_binary.section_from_offset(lief_binary.entrypoint).name
														
 
															+        except not_found_error_class:
														
 
															+            # bad entry point, let's find the first executable section
														
 
															+            entry_section = ""
														
 
															+            mem_execute_characteristics = lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE if lief.__version__.startswith("0.9.0") else lief.PE.Section.CHARACTERISTICS.MEM_EXECUTE
														
 
															+            for s in lief_binary.sections:
														
 
															+                if mem_execute_characteristics in s.characteristics_lists:
														
 
															+                    entry_section = s.name
														
 
															+                    break
														
 
															+
														
 
															+        raw_obj = {"entry": entry_section}
														
 
															+        raw_obj["sections"] = [{
														
 
															+            'name': s.name,
														
 
															+            'size': s.size,
														
 
															+            'entropy': s.entropy,
														
 
															+            'vsize': s.virtual_size,
														
 
															+            'props': self._properties(s)
														
 
															+        } for s in lief_binary.sections]
														
 
															+        return raw_obj
														
 
															+
														
 
															+    def process_raw_features(self, raw_obj):
														
 
															+        sections = raw_obj['sections']
														
 
															+        general = [
														
 
															+            len(sections),  # total number of sections
														
 
															+            # number of sections with zero size
														
 
															+            sum(1 for s in sections if s['size'] == 0),
														
 
															+            # number of sections with an empty name
														
 
															+            sum(1 for s in sections if s['name'] == ""),
														
 
															+            # number of RX
														
 
															+            sum(1 for s in sections if 'MEM_READ' in s['props'] and 'MEM_EXECUTE' in s['props']),
														
 
															+            # number of W
														
 
															+            sum(1 for s in sections if 'MEM_WRITE' in s['props'])
														
 
															+        ]
														
 
															+        # gross characteristics of each section
														
 
															+        section_sizes = [(s['name'], s['size']) for s in sections]
														
 
															+        section_sizes_hashed = FeatureHasher(50, input_type="pair").transform([section_sizes]).toarray()[0]
														
 
															+        section_entropy = [(s['name'], s['entropy']) for s in sections]
														
 
															+        section_entropy_hashed = FeatureHasher(50, input_type="pair").transform([section_entropy]).toarray()[0]
														
 
															+        section_vsize = [(s['name'], s['vsize']) for s in sections]
														
 
															+        section_vsize_hashed = FeatureHasher(50, input_type="pair").transform([section_vsize]).toarray()[0]
														
 
															+        entry_name_hashed = FeatureHasher(50, input_type="string").transform([[raw_obj['entry']]]).toarray()[0]
														
 
															+        characteristics = [p for s in sections for p in s['props'] if s['name'] == raw_obj['entry']]
														
 
															+        characteristics_hashed = FeatureHasher(50, input_type="string").transform([characteristics]).toarray()[0]
														
 
															+
														
 
															+        return np.hstack([
														
 
															+            general, section_sizes_hashed, section_entropy_hashed, section_vsize_hashed, entry_name_hashed,
														
 
															+            characteristics_hashed
														
 
															+        ]).astype(np.float32)
														
 
															+
														
 
															+
														
 
															+class ImportsInfo(FeatureType):
														
 
															+    ''' Information about imported libraries and functions from the
														
 
															+    import address table.  Note that the total number of imported
														
 
															+    functions is contained in GeneralFileInfo.
														
 
															+    '''
														
 
															+
														
 
															+    name = 'imports'
														
 
															+    dim = 1280
														
 
															+
														
 
															+    def __init__(self):
														
 
															+        super(FeatureType, self).__init__()
														
 
															+
														
 
															+    def raw_features(self, bytez, lief_binary):
														
 
															+        imports = {}
														
 
															+        if lief_binary is None:
														
 
															+            return imports
														
 
															+
														
 
															+        for lib in lief_binary.imports:
														
 
															+            if lib.name not in imports:
														
 
															+                imports[lib.name] = []  # libraries can be duplicated in listing, extend instead of overwrite
														
 
															+
														
 
															+            # Clipping assumes there are diminishing returns on the discriminatory power of imported functions
														
 
															+            #  beyond the first 10000 characters, and this will help limit the dataset size
														
 
															+            for entry in lib.entries:
														
 
															+                if entry.is_ordinal:
														
 
															+                    imports[lib.name].append("ordinal" + str(entry.ordinal))
														
 
															+                else:
														
 
															+                    imports[lib.name].append(entry.name[:10000])
														
 
															+
														
 
															+        return imports
														
 
															+
														
 
															+    def process_raw_features(self, raw_obj):
														
 
															+        # unique libraries
														
 
															+        libraries = list(set([l.lower() for l in raw_obj.keys()]))
														
 
															+        libraries_hashed = FeatureHasher(256, input_type="string").transform([libraries]).toarray()[0]
														
 
															+
														
 
															+        # A string like "kernel32.dll:CreateFileMappingA" for each imported function
														
 
															+        imports = [lib.lower() + ':' + e for lib, elist in raw_obj.items() for e in elist]
														
 
															+        imports_hashed = FeatureHasher(1024, input_type="string").transform([imports]).toarray()[0]
														
 
															+
														
 
															+        # Two separate elements: libraries (alone) and fully-qualified names of imported functions
														
 
															+        return np.hstack([libraries_hashed, imports_hashed]).astype(np.float32)
														
 
															+
														
 
															+
														
 
															+class ExportsInfo(FeatureType):
														
 
															+    ''' Information about exported functions. Note that the total number of exported
														
 
															+    functions is contained in GeneralFileInfo.
														
 
															+    '''
														
 
															+
														
 
															+    name = 'exports'
														
 
															+    dim = 128
														
 
															+
														
 
															+    def __init__(self):
														
 
															+        super(FeatureType, self).__init__()
														
 
															+
														
 
															+    def raw_features(self, bytez, lief_binary):
														
 
															+        if lief_binary is None:
														
 
															+            return []
														
 
															+
														
 
															+        # Clipping assumes there are diminishing returns on the discriminatory power of exports beyond
														
 
															+        #  the first 10000 characters, and this will help limit the dataset size
														
 
															+        if LIEF_EXPORT_OBJECT:
														
 
															+            # export is an object with .name attribute (0.10.0 and later)
														
 
															+            clipped_exports = [export.name[:10000] for export in lief_binary.exported_functions]
														
 
															+        else:
														
 
															+            # export is a string (LIEF 0.9.0 and earlier)
														
 
															+            clipped_exports = [export[:10000] for export in lief_binary.exported_functions]
														
 
															+
														
 
															+        return clipped_exports
														
 
															+
														
 
															+    def process_raw_features(self, raw_obj):
														
 
															+        exports_hashed = FeatureHasher(128, input_type="string").transform([raw_obj]).toarray()[0]
														
 
															+        return exports_hashed.astype(np.float32)
														
 
															+
														
 
															+
														
 
															+class GeneralFileInfo(FeatureType):
														
 
															+    ''' General information about the file '''
														
 
															+
														
 
															+    name = 'general'
														
 
															+    dim = 10
														
 
															+
														
 
															+    def __init__(self):
														
 
															+        super(FeatureType, self).__init__()
														
 
															+
														
 
															+    def raw_features(self, bytez, lief_binary):
														
 
															+        if lief_binary is None:
														
 
															+            return {
														
 
															+                'size': len(bytez),
														
 
															+                'vsize': 0,
														
 
															+                'has_debug': 0,
														
 
															+                'exports': 0,
														
 
															+                'imports': 0,
														
 
															+                'has_relocations': 0,
														
 
															+                'has_resources': 0,
														
 
															+                'has_signature': 0,
														
 
															+                'has_tls': 0,
														
 
															+                'symbols': 0
														
 
															+            }
														
 
															+
														
 
															+        return {
														
 
															+            'size': len(bytez),
														
 
															+            'vsize': lief_binary.virtual_size,
														
 
															+            'has_debug': int(lief_binary.has_debug),
														
 
															+            'exports': len(lief_binary.exported_functions),
														
 
															+            'imports': len(lief_binary.imported_functions),
														
 
															+            'has_relocations': int(lief_binary.has_relocations),
														
 
															+            'has_resources': int(lief_binary.has_resources),
														
 
															+            'has_signature': int(lief_binary.has_signatures) if LIEF_HAS_SIGNATURE else int(lief_binary.has_signature),
														
 
															+            'has_tls': int(lief_binary.has_tls),
														
 
															+            'symbols': len(lief_binary.symbols),
														
 
															+        }
														
 
															+
														
 
															+    def process_raw_features(self, raw_obj):
														
 
															+        return np.asarray([
														
 
															+            raw_obj['size'], raw_obj['vsize'], raw_obj['has_debug'], raw_obj['exports'], raw_obj['imports'],
														
 
															+            raw_obj['has_relocations'], raw_obj['has_resources'], raw_obj['has_signature'], raw_obj['has_tls'],
														
 
															+            raw_obj['symbols']
														
 
															+        ],
														
 
															+            dtype=np.float32)
														
 
															+
														
 
															+
														
 
															+class HeaderFileInfo(FeatureType):
														
 
															+    ''' Machine, architecure, OS, linker and other information extracted from header '''
														
 
															+
														
 
															+    name = 'header'
														
 
															+    dim = 62
														
 
															+
														
 
															+    def __init__(self):
														
 
															+        super(FeatureType, self).__init__()
														
 
															+
														
 
															+    def raw_features(self, bytez, lief_binary):
														
 
															+        raw_obj = {}
														
 
															+        raw_obj['coff'] = {'timestamp': 0, 'machine': "", 'characteristics': []}
														
 
															+        raw_obj['optional'] = {
														
 
															+            'subsystem': "",
														
 
															+            'dll_characteristics': [],
														
 
															+            'magic': "",
														
 
															+            'major_image_version': 0,
														
 
															+            'minor_image_version': 0,
														
 
															+            'major_linker_version': 0,
														
 
															+            'minor_linker_version': 0,
														
 
															+            'major_operating_system_version': 0,
														
 
															+            'minor_operating_system_version': 0,
														
 
															+            'major_subsystem_version': 0,
														
 
															+            'minor_subsystem_version': 0,
														
 
															+            'sizeof_code': 0,
														
 
															+            'sizeof_headers': 0,
														
 
															+            'sizeof_heap_commit': 0
														
 
															+        }
														
 
															+        if lief_binary is None:
														
 
															+            return raw_obj
														
 
															+
														
 
															+        raw_obj['coff']['timestamp'] = lief_binary.header.time_date_stamps
														
 
															+        raw_obj['coff']['machine'] = str(lief_binary.header.machine).split('.')[-1]
														
 
															+        raw_obj['coff']['characteristics'] = [str(c).split('.')[-1] for c in lief_binary.header.characteristics_list]
														
 
															+        raw_obj['optional']['subsystem'] = str(lief_binary.optional_header.subsystem).split('.')[-1]
														
 
															+        raw_obj['optional']['dll_characteristics'] = [
														
 
															+            str(c).split('.')[-1] for c in lief_binary.optional_header.dll_characteristics_lists
														
 
															+        ]
														
 
															+        raw_obj['optional']['magic'] = str(lief_binary.optional_header.magic).split('.')[-1]
														
 
															+        raw_obj['optional']['major_image_version'] = lief_binary.optional_header.major_image_version
														
 
															+        raw_obj['optional']['minor_image_version'] = lief_binary.optional_header.minor_image_version
														
 
															+        raw_obj['optional']['major_linker_version'] = lief_binary.optional_header.major_linker_version
														
 
															+        raw_obj['optional']['minor_linker_version'] = lief_binary.optional_header.minor_linker_version
														
 
															+        raw_obj['optional'][
														
 
															+            'major_operating_system_version'] = lief_binary.optional_header.major_operating_system_version
														
 
															+        raw_obj['optional'][
														
 
															+            'minor_operating_system_version'] = lief_binary.optional_header.minor_operating_system_version
														
 
															+        raw_obj['optional']['major_subsystem_version'] = lief_binary.optional_header.major_subsystem_version
														
 
															+        raw_obj['optional']['minor_subsystem_version'] = lief_binary.optional_header.minor_subsystem_version
														
 
															+        raw_obj['optional']['sizeof_code'] = lief_binary.optional_header.sizeof_code
														
 
															+        raw_obj['optional']['sizeof_headers'] = lief_binary.optional_header.sizeof_headers
														
 
															+        raw_obj['optional']['sizeof_heap_commit'] = lief_binary.optional_header.sizeof_heap_commit
														
 
															+        return raw_obj
														
 
															+
														
 
															+    def process_raw_features(self, raw_obj):
														
 
															+        return np.hstack([
														
 
															+            raw_obj['coff']['timestamp'],
														
 
															+            FeatureHasher(10, input_type="string").transform([[raw_obj['coff']['machine']]]).toarray()[0],
														
 
															+            FeatureHasher(10, input_type="string").transform([raw_obj['coff']['characteristics']]).toarray()[0],
														
 
															+            FeatureHasher(10, input_type="string").transform([[raw_obj['optional']['subsystem']]]).toarray()[0],
														
 
															+            FeatureHasher(10, input_type="string").transform([raw_obj['optional']['dll_characteristics']]).toarray()[0],
														
 
															+            FeatureHasher(10, input_type="string").transform([[raw_obj['optional']['magic']]]).toarray()[0],
														
 
															+            raw_obj['optional']['major_image_version'],
														
 
															+            raw_obj['optional']['minor_image_version'],
														
 
															+            raw_obj['optional']['major_linker_version'],
														
 
															+            raw_obj['optional']['minor_linker_version'],
														
 
															+            raw_obj['optional']['major_operating_system_version'],
														
 
															+            raw_obj['optional']['minor_operating_system_version'],
														
 
															+            raw_obj['optional']['major_subsystem_version'],
														
 
															+            raw_obj['optional']['minor_subsystem_version'],
														
 
															+            raw_obj['optional']['sizeof_code'],
														
 
															+            raw_obj['optional']['sizeof_headers'],
														
 
															+            raw_obj['optional']['sizeof_heap_commit'],
														
 
															+        ]).astype(np.float32)
														
 
															+
														
 
															+
														
 
															+class StringExtractor(FeatureType):
														
 
															+    ''' Extracts strings from raw byte stream '''
														
 
															+
														
 
															+    name = 'strings'
														
 
															+    dim = 1 + 1 + 1 + 96 + 1 + 1 + 1 + 1 + 1
														
 
															+
														
 
															+    def __init__(self):
														
 
															+        super(FeatureType, self).__init__()
														
 
															+        # all consecutive runs of 0x20 - 0x7f that are 5+ characters
														
 
															+        self._allstrings = re.compile(b'[\x20-\x7f]{5,}')
														
 
															+        # occurances of the string 'C:\'.  Not actually extracting the path
														
 
															+        self._paths = re.compile(b'c:\\\\', re.IGNORECASE)
														
 
															+        # occurances of http:// or https://.  Not actually extracting the URLs
														
 
															+        self._urls = re.compile(b'https?://', re.IGNORECASE)
														
 
															+        # occurances of the string prefix HKEY_.  No actually extracting registry names
														
 
															+        self._registry = re.compile(b'HKEY_')
														
 
															+        # crude evidence of an MZ header (dropper?) somewhere in the byte stream
														
 
															+        self._mz = re.compile(b'MZ')
														
 
															+
														
 
															+    def raw_features(self, bytez, lief_binary):
														
 
															+        allstrings = self._allstrings.findall(bytez)
														
 
															+        if allstrings:
														
 
															+            # statistics about strings:
														
 
															+            string_lengths = [len(s) for s in allstrings]
														
 
															+            avlength = sum(string_lengths) / len(string_lengths)
														
 
															+            # map printable characters 0x20 - 0x7f to an int array consisting of 0-95, inclusive
														
 
															+            as_shifted_string = [b - ord(b'\x20') for b in b''.join(allstrings)]
														
 
															+            c = np.bincount(as_shifted_string, minlength=96)  # histogram count
														
 
															+            # distribution of characters in printable strings
														
 
															+            csum = c.sum()
														
 
															+            p = c.astype(np.float32) / csum
														
 
															+            wh = np.where(c)[0]
														
 
															+            H = np.sum(-p[wh] * np.log2(p[wh]))  # entropy
														
 
															+        else:
														
 
															+            avlength = 0
														
 
															+            c = np.zeros((96,), dtype=np.float32)
														
 
															+            H = 0
														
 
															+            csum = 0
														
 
															+
														
 
															+        return {
														
 
															+            'numstrings': len(allstrings),
														
 
															+            'avlength': avlength,
														
 
															+            'printabledist': c.tolist(),  # store non-normalized histogram
														
 
															+            'printables': int(csum),
														
 
															+            'entropy': float(H),
														
 
															+            'paths': len(self._paths.findall(bytez)),
														
 
															+            'urls': len(self._urls.findall(bytez)),
														
 
															+            'registry': len(self._registry.findall(bytez)),
														
 
															+            'MZ': len(self._mz.findall(bytez))
														
 
															+        }
														
 
															+
														
 
															+    def process_raw_features(self, raw_obj):
														
 
															+        hist_divisor = float(raw_obj['printables']) if raw_obj['printables'] > 0 else 1.0
														
 
															+        return np.hstack([
														
 
															+            raw_obj['numstrings'], raw_obj['avlength'], raw_obj['printables'],
														
 
															+            np.asarray(raw_obj['printabledist']) / hist_divisor, raw_obj['entropy'], raw_obj['paths'], raw_obj['urls'],
														
 
															+            raw_obj['registry'], raw_obj['MZ']
														
 
															+        ]).astype(np.float32)
														
 
															+
														
 
															+
														
 
															+class DataDirectories(FeatureType):
														
 
															+    ''' Extracts size and virtual address of the first 15 data directories '''
														
 
															+
														
 
															+    name = 'datadirectories'
														
 
															+    dim = 15 * 2
														
 
															+
														
 
															+    def __init__(self):
														
 
															+        super(FeatureType, self).__init__()
														
 
															+        self._name_order = [
														
 
															+            "EXPORT_TABLE", "IMPORT_TABLE", "RESOURCE_TABLE", "EXCEPTION_TABLE", "CERTIFICATE_TABLE",
														
 
															+            "BASE_RELOCATION_TABLE", "DEBUG", "ARCHITECTURE", "GLOBAL_PTR", "TLS_TABLE", "LOAD_CONFIG_TABLE",
														
 
															+            "BOUND_IMPORT", "IAT", "DELAY_IMPORT_DESCRIPTOR", "CLR_RUNTIME_HEADER"
														
 
															+        ]
														
 
															+
														
 
															+    def raw_features(self, bytez, lief_binary):
														
 
															+        output = []
														
 
															+        if lief_binary is None:
														
 
															+            return output
														
 
															+
														
 
															+        for data_directory in lief_binary.data_directories:
														
 
															+            output.append({
														
 
															+                "name": str(data_directory.type).replace("DATA_DIRECTORY.", ""),
														
 
															+                "size": data_directory.size,
														
 
															+                "virtual_address": data_directory.rva
														
 
															+            })
														
 
															+        return output
														
 
															+
														
 
															+    def process_raw_features(self, raw_obj):
														
 
															+        features = np.zeros(2 * len(self._name_order), dtype=np.float32)
														
 
															+        for i in range(len(self._name_order)):
														
 
															+            if i < len(raw_obj):
														
 
															+                features[2 * i] = raw_obj[i]["size"]
														
 
															+                features[2 * i + 1] = raw_obj[i]["virtual_address"]
														
 
															+        return features
														
 
															+
														
 
															+
														
 
															+class EMBERFeatureExtractor(object):
														
 
															+    ''' Extract useful features from a PE file, and return as a vector of fixed size. '''
														
 
															+
														
 
															+    def __init__(self, feature_version=2, print_feature_warning=True, features_file=''):
														
 
															+        self.features = []
														
 
															+        features = {
														
 
															+            'ByteHistogram': ByteHistogram(),
														
 
															+            'ByteEntropyHistogram': ByteEntropyHistogram(),
														
 
															+            'StringExtractor': StringExtractor(),
														
 
															+            'GeneralFileInfo': GeneralFileInfo(),
														
 
															+            'HeaderFileInfo': HeaderFileInfo(),
														
 
															+            'SectionInfo': SectionInfo(),
														
 
															+            'ImportsInfo': ImportsInfo(),
														
 
															+            'ExportsInfo': ExportsInfo()
														
 
															+        }
														
 
															+
														
 
															+        if os.path.exists(features_file):
														
 
															+            with open(features_file, encoding='utf8') as f:
														
 
															+                x = json.load(f)
														
 
															+                self.features = [features[feature] for feature in x['features'] if feature in features]
														
 
															+        else:
														
 
															+            self.features = list(features.values())
														
 
															+
														
 
															+        if feature_version == 1:
														
 
															+            if not lief.__version__.startswith("0.8.3"):
														
 
															+                if print_feature_warning:
														
 
															+                    print(f"WARNING: EMBER feature version 1 were computed using lief version 0.8.3-18d5b75")
														
 
															+                    print(
														
 
															+                        f"WARNING:   lief version {lief.__version__} found instead. There may be slight inconsistencies")
														
 
															+                    print(f"WARNING:   in the feature calculations.")
														
 
															+        elif feature_version == 2:
														
 
															+            self.features.append(DataDirectories())
														
 
															+            if not lief.__version__.startswith("0.9.0"):
														
 
															+                if print_feature_warning:
														
 
															+                    print(f"WARNING: EMBER feature version 2 were computed using lief version 0.9.0-")
														
 
															+                    print(
														
 
															+                        f"WARNING:   lief version {lief.__version__} found instead. There may be slight inconsistencies")
														
 
															+                    print(f"WARNING:   in the feature calculations.")
														
 
															+        else:
														
 
															+            raise Exception(f"EMBER feature version must be 1 or 2. Not {feature_version}")
														
 
															+        self.dim = sum([fe.dim for fe in self.features])
														
 
															+
														
 
															+    def raw_features(self, bytez):
														
 
															+        if lief.__version__.startswith("0.9.0"):
														
 
															+            lief_errors = (
														
 
															+                lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, lief.read_out_of_bound, RuntimeError)
														
 
															+        else:
														
 
															+            lief_errors = (
														
 
															+                lief.lief_errors.conversion_error, lief.lief_errors.file_error, lief.lief_errors.file_format_error,
														
 
															+                lief.lief_errors.corrupted, lief.lief_errors.parsing_error, lief.lief_errors.read_out_of_bound,
														
 
															+                RuntimeError)
														
 
															+
														
 
															+        try:
														
 
															+            lief_binary = lief.PE.parse(list(bytez))
														
 
															+        except lief_errors as e:
														
 
															+            print("lief error: ", str(e))
														
 
															+            lief_binary = None
														
 
															+        except Exception:  # everything else (KeyboardInterrupt, SystemExit, ValueError):
														
 
															+            raise
														
 
															+
														
 
															+        features = {"sha256": hashlib.sha256(bytez).hexdigest()}
														
 
															+        features.update({fe.name: fe.raw_features(bytez, lief_binary) for fe in self.features})
														
 
															+        return features
														
 
															+
														
 
															+    def process_raw_features(self, raw_obj):
														
 
															+        feature_vectors = [fe.process_raw_features(raw_obj[fe.name]) for fe in self.features]
														
 
															+        return np.hstack(feature_vectors).astype(np.float32)
														
 
															+
														
 
															+    def feature_vector(self, bytez):
														
 
															+        return self.process_raw_features(self.raw_features(bytez))
														
--- a/data/2026-IJCI/boolean_classifier/feature_extractors/ngram_feature_extractor.py
+++ b/data/2026-IJCI/boolean_classifier/feature_extractors/ngram_feature_extractor.py
 
															+from collections import OrderedDict
														
 
															+import numpy as np
														
 
															+
														
 
															+class NGramFeatureExtractor(object):
														
 
															+    def __init__(self, N: int = 2):
														
 
															+        self.N = N
														
 
															+        self.dim = 256 ** N
														
 
															+        self.ngram_features = OrderedDict({"{},{}".format(i,j): 0.0 for i in range(256) for j in range(256)})
														
 
															+
														
 
															+    def feature_vector(self, bytez):
														
 
															+        raw_features = self.extract_ngram_features(bytez)
														
 
															+        return self.reduce(raw_features)
														
 
															+
														
 
															+    def extract_ngram_features(self, bytez)-> dict:
														
 
															+        words = list(bytez)
														
 
															+        num_ngrams = len(words) - self.N
														
 
															+        bigrams = zip(words, words[1:])  # Create bi-grams
														
 
															+        for bigram in bigrams:
														
 
															+            self.ngram_features["{},{}".format(bigram[0], bigram[1])] += 1
														
 
															+        for key in self.ngram_features:
														
 
															+            self.ngram_features[key] = self.ngram_features[key] / num_ngrams
														
 
															+        return self.ngram_features
														
 
															+
														
 
															+    def reduce(self, raw_features: dict, technique: str = None):
														
 
															+        if technique is None:
														
 
															+            return np.expand_dims(np.array(list(raw_features.values())), axis=0)
														
 
															+        else:
														
 
															+            raise NotImplementedError("Feature selection and dimensionality reduction technique not implemented")
														
 
															+
														
--- a/data/2026-IJCI/boolean_classifier/ffnn_configurations/ffnn_2gram_k=1000_config.json
+++ b/data/2026-IJCI/boolean_classifier/ffnn_configurations/ffnn_2gram_k=1000_config.json
 
															+{
														
 
															+  "feature_selector": "data/BODMAS/feature_selectors/bigrams/bigrams_feature_selector_k=1000.pkl",
														
 
															+  "hidden_size": 512,
														
 
															+  "input_size": 1000,
														
 
															+  "model_path": "models/ffnn_2gram_k=1000_512_1"
														
 
															+}
														
--- a/data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/model.pth
+++ b/data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/model.pth
--- a/data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/results.json
+++ b/data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/results.json
 
															+{"training_losses": [0.007372988766147287, 0.005052168182717729, 0.004575773141697014, 0.004136406766564416, 0.003666894217103657, 0.0035721441213530944, 0.003411646314705646, 0.003301303897834049, 0.003245272185255189, 0.0031502678544421403, 0.003098506758948322], "training_accuracies": [0.9023583758813518, 0.9278223518923737, 0.9339492665532053, 0.9399303022935408, 0.9442418348326445, 0.9457006240376044, 0.9474511710835563, 0.9474835886214442, 0.949201718129508, 0.9492179268984521, 0.9511629791717319], "validation_losses": [0.009812075672269006, 0.010353462121517892, 0.005870703318895587, 0.005944547294264034, 0.005886296627045667, 0.005803646742737165, 0.00720510685860129, 0.005735838049743723, 0.005888903167399741, 0.005404814309630779, 0.006096020837560452], "validation_accuracies": [0.838542342108676, 0.7581377253274543, 0.9389184282194268, 0.9361950460381273, 0.9361950460381273, 0.9319154454675139, 0.9067565815069382, 0.9329529243937232, 0.9302295422124238, 0.9365841006354558, 0.9289326935546621]}
														
--- a/data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/test.out
+++ b/data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/test.out
 
															+Accuracy: 0.9607282184655397
														
 
															+Precision: 0.9691975141853553
														
 
															+Recall: 0.9501986754966888
														
 
															+F1: 0.9596040663456393
														
 
															+Confusion Matrix: [[3801  114]
														
 
															+ [ 188 3587]]
														
--- a/data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/validation.out
+++ b/data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/validation.out
 
															+Accuracy: 0.9319154454675139
														
 
															+Precision: 0.9192049561177078
														
 
															+Recall: 0.9438112907500663
														
 
															+F1: 0.9313456257355825
														
 
															+Confusion Matrix: [[3625  313]
														
 
															+ [ 212 3561]]
														
--- a/data/2026-IJCI/boolean_classifier/train_malware_detector.py
+++ b/data/2026-IJCI/boolean_classifier/train_malware_detector.py
 
															+import argparse
														
 
															+import copy
														
 
															+
														
 
															+import torch
														
 
															+import sys
														
 
															+sys.path.append("../")
														
 
															+from boolean_classifier.datasets.boolean_ngram_dataset import BooleanNGramDataset
														
 
															+from boolean_classifier.datasets.ngram_dataset import NGramDataset
														
 
															+from boolean_classifier.architectures.ffnn import FFNN
														
 
															+from torch.utils.data import DataLoader
														
 
															+import multiprocessing
														
 
															+import json
														
 
															+import os
														
 
															+import torch.nn
														
 
															+from torch.optim.lr_scheduler import _LRScheduler
														
 
															+from torch.utils.data import DataLoader
														
 
															+from tqdm import tqdm
														
 
															+import joblib
														
 
															+
														
 
															+
														
 
															+class EarlyStoppingPyTorchTrainer:
														
 
															+    """Trainer for PyTorch models with early stopping."""
														
 
															+
														
 
															+    def __init__(self, optimizer: torch.optim.Optimizer, epochs: int = 5,
														
 
															+                 loss: torch.nn.Module = None, scheduler: _LRScheduler = None, feature_selector = None) -> None:
														
 
															+        """
														
 
															+        Create PyTorch trainer.
														
 
															+        Parameters
														
 
															+        ----------
														
 
															+        optimizer : torch.optim.Optimizer
														
 
															+            Optimizer to use for training the model.
														
 
															+        epochs : int, optional
														
 
															+            Number of epochs, by default 5.
														
 
															+        loss : torch.nn.Module, optional
														
 
															+            Loss to minimize, by default None.
														
 
															+        scheduler : _LRScheduler, optional
														
 
															+            Scheduler for the optimizer, by default None.
														
 
															+        """
														
 
															+        self._epochs = epochs
														
 
															+        self._optimizer = optimizer
														
 
															+        self._loss = loss if loss is not None else torch.nn.CrossEntropyLoss()
														
 
															+        self._scheduler = scheduler
														
 
															+        self.feature_selector = feature_selector
														
 
															+
														
 
															+        self.training_losses = []
														
 
															+        self.training_accuracies = []
														
 
															+        self.validation_losses = []
														
 
															+        self.validation_accuracies = []
														
 
															+
														
 
															+    def train(self, model: torch.nn.Module,
														
 
															+            train_loader: DataLoader,
														
 
															+            val_loader: DataLoader,
														
 
															+            patience: int) -> torch.nn.Module:
														
 
															+        """
														
 
															+        Train model with given loaders and early stopping.
														
 
															+        Parameters
														
 
															+        ----------
														
 
															+        model : torch.nn.Module
														
 
															+            Pytorch model to be trained.
														
 
															+        train_loader : DataLoader
														
 
															+            Train data loader.
														
 
															+        val_loader : DataLoader
														
 
															+            Validation data loader.
														
 
															+        patience : int
														
 
															+            Number of epochs to wait before early stopping.
														
 
															+        Returns
														
 
															+        -------
														
 
															+        torch.nn.Module
														
 
															+            Trained model.
														
 
															+        """
														
 
															+        best_loss = float("inf")
														
 
															+        best_model = None
														
 
															+        patience_counter = 0
														
 
															+        for _ in range(self._epochs):
														
 
															+            model = self.fit(model, train_loader)
														
 
															+            val_loss = self.validate(model, val_loader)
														
 
															+            if val_loss <= best_loss:
														
 
															+                best_loss = val_loss
														
 
															+                best_model = copy.deepcopy(model)
														
 
															+                patience_counter = 0
														
 
															+            else:
														
 
															+                patience_counter += 1
														
 
															+            if patience_counter >= patience:
														
 
															+                break
														
 
															+        return best_model
														
 
															+
														
 
															+    def fit(self,
														
 
															+              model: torch.nn.Module,
														
 
															+              dataloader: DataLoader) -> torch.nn.Module:
														
 
															+        """
														
 
															+        Train model for one epoch with given loader.
														
 
															+        Parameters
														
 
															+        ----------
														
 
															+        model : torch.nn.Module
														
 
															+            Pytorch model to be trained.
														
 
															+        dataloader : DataLoader
														
 
															+            Train data loader.
														
 
															+        Returns
														
 
															+        -------
														
 
															+        torch.nn.Module
														
 
															+            Trained model.
														
 
															+        """
														
 
															+        device = next(model.parameters()).device
														
 
															+        model = model.train()
														
 
															+        model = model.to(device)
														
 
															+        running_loss = 0.0
														
 
															+        train_total = 0
														
 
															+        train_correct = 0
														
 
															+        for x, y in tqdm(dataloader):
														
 
															+            if self.feature_selector is not None:
														
 
															+                x = torch.Tensor(self.feature_selector.transform(x))
														
 
															+            x, y = x.to(device), y.to(device)
														
 
															+            self._optimizer.zero_grad()
														
 
															+            outputs = model(x)
														
 
															+            loss = self._loss(outputs, y)
														
 
															+            loss.backward()
														
 
															+            self._optimizer.step()
														
 
															+            running_loss += loss.item()
														
 
															+            y_preds = outputs.softmax(dim=1).argmax(dim=1)
														
 
															+            train_total += y.size(0)
														
 
															+            train_correct += (y_preds == y).sum().item()
														
 
															+
														
 
															+        self.training_losses.append(running_loss / train_total)
														
 
															+        self.training_accuracies.append(train_correct / train_total)
														
 
															+
														
 
															+        if self._scheduler is not None:
														
 
															+            self._scheduler.step()
														
 
															+        return model
														
 
															+
														
 
															+    def validate(self,
														
 
															+                 model: torch.nn.Module,
														
 
															+                 dataloader: DataLoader) -> float:
														
 
															+        """
														
 
															+        Validate model with given loader.
														
 
															+        Parameters
														
 
															+        ----------
														
 
															+        model : torch.nn.Module
														
 
															+            Pytorch model to be balidated.
														
 
															+        dataloader : DataLoader
														
 
															+            Validation data loader.
														
 
															+        Returns
														
 
															+        -------
														
 
															+        float
														
 
															+            Validation loss of the model.
														
 
															+        """
														
 
															+        running_loss = 0
														
 
															+        val_total = 0
														
 
															+        val_correct = 0
														
 
															+        device = next(model.parameters()).device
														
 
															+        model = model.eval()
														
 
															+        model = model.to(device)
														
 
															+        with torch.no_grad():
														
 
															+            for x, y in tqdm(dataloader):
														
 
															+                if self.feature_selector is not None:
														
 
															+                    x = torch.Tensor(self.feature_selector.transform(x))
														
 
															+                x, y = x.to(device), y.to(device)
														
 
															+                outputs = model(x)
														
 
															+                loss = self._loss(outputs, y)
														
 
															+                running_loss += loss.item()
														
 
															+                y_preds = outputs.softmax(dim=1).argmax(dim=1)
														
 
															+
														
 
															+                val_total += y.size(0)
														
 
															+                val_correct += (y_preds == y).sum().item()
														
 
															+
														
 
															+            self.validation_losses.append(running_loss / val_total)
														
 
															+            self.validation_accuracies.append(val_correct / val_total)
														
 
															+        return loss
														
 
															+
														
 
															+def save_results(trainer: EarlyStoppingPyTorchTrainer, configuration: dict):
														
 
															+    results = {
														
 
															+        "training_losses": trainer.training_losses,
														
 
															+        "training_accuracies": trainer.training_accuracies,
														
 
															+        "validation_losses": trainer.validation_losses,
														
 
															+        "validation_accuracies": trainer.validation_accuracies
														
 
															+    }
														
 
															+    with open(os.path.join(configuration["model_path"], "results.json"), "w") as output_file:
														
 
															+        json.dump(results, output_file)
														
 
															+
														
 
															+def load_configuration(configuration_filepath: str) -> dict:
														
 
															+    with open(configuration_filepath, "r") as configuration_file:
														
 
															+        configuration = json.load(configuration_file)
														
 
															+    return configuration
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    parser = argparse.ArgumentParser(description='Train malware detector')
														
 
															+    parser.add_argument("training_file",
														
 
															+                        type=str,
														
 
															+                        help="Training file containing the hashes and labels of the benign and malicious samples"
														
 
															+                        )
														
 
															+    parser.add_argument("validation_file",
														
 
															+                        type=str,
														
 
															+                        help="Validation file containing the hashes and labels of the benign and malicious samples"
														
 
															+                        )
														
 
															+    parser.add_argument("dataset_type",
														
 
															+                        type=str,
														
 
															+                        help="Type of dataset: {BooleanBigrams, Bigrams, EMBER}"
														
 
															+                        )
														
 
															+    parser.add_argument("configuration_file",
														
 
															+                        type=str,
														
 
															+                        help="Configuration file containing the hyperparameters of the model"
														
 
															+                        )
														
 
															+    parser.add_argument("--batch_size",
														
 
															+                        type=int,
														
 
															+                        help="Batch size for training",
														
 
															+                        default=32
														
 
															+                        )
														
 
															+    parser.add_argument("--num_epochs",
														
 
															+                        type=int,
														
 
															+                        help="Max epochs",
														
 
															+                        default=50
														
 
															+                        )
														
 
															+    parser.add_argument("--patience",
														
 
															+                        type=int,
														
 
															+                        help="Patience for early stopping",
														
 
															+                        default=5
														
 
															+                        )
														
 
															+    args = parser.parse_args()
														
 
															+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
														
 
															+    print("Device: ", device)
														
 
															+    num_workers = max(multiprocessing.cpu_count() - 4, multiprocessing.cpu_count() // 2 + 1)
														
 
															+
														
 
															+    if args.dataset_type == "BooleanBigrams":
														
 
															+        training_dataset = BooleanNGramDataset(args.training_file)
														
 
															+        validation_dataset = BooleanNGramDataset(args.validation_file)
														
 
															+    elif args.dataset_type == "Bigrams":
														
 
															+        training_dataset = NGramDataset(args.training_file)
														
 
															+        validation_dataset = NGramDataset(args.validation_file)
														
 
															+    else:
														
 
															+        raise NotImplementedError("Only Boolean dataset is currently supported")
														
 
															+    training_dataloader = DataLoader(
														
 
															+        training_dataset,
														
 
															+        batch_size=args.batch_size,
														
 
															+        num_workers=num_workers,
														
 
															+    )
														
 
															+    validation_dataloader = DataLoader(
														
 
															+        validation_dataset,
														
 
															+        batch_size=args.batch_size,
														
 
															+        num_workers=num_workers,
														
 
															+    )
														
 
															+
														
 
															+    configuration = load_configuration(args.configuration_file)
														
 
															+    model = FFNN(configuration)
														
 
															+    model = model.to(device)
														
 
															+
														
 
															+    if configuration["feature_selector"] is not None:
														
 
															+        feature_selector = joblib.load(configuration["feature_selector"])
														
 
															+    else:
														
 
															+        feature_selector = None
														
 
															+
														
 
															+    criterion = torch.nn.CrossEntropyLoss()
														
 
															+    optimizer = torch.optim.Adam(model.parameters())
														
 
															+
														
 
															+    trainer = EarlyStoppingPyTorchTrainer(
														
 
															+        optimizer,
														
 
															+        epochs=args.num_epochs,
														
 
															+        loss=criterion,
														
 
															+        feature_selector=feature_selector
														
 
															+    )
														
 
															+    model = trainer.train(
														
 
															+        model,
														
 
															+        training_dataloader,
														
 
															+        validation_dataloader,
														
 
															+        args.patience
														
 
															+    )
														
 
															+    if not os.path.exists(configuration["model_path"]):
														
 
															+        os.makedirs(configuration["model_path"])
														
 
															+    torch.save(model.state_dict(), os.path.join(configuration["model_path"],"model.pth"))
														
 
															+    save_results(trainer, configuration)
														
 
															+    
														
 
															+
														
 
															+    
														
--- a/data/2026-IJCI/verifier/create_vnnlib.py
+++ b/data/2026-IJCI/verifier/create_vnnlib.py
 
															+#!/usr/bin/python3
														
 
															+
														
 
															+# Libraries
														
 
															+
														
 
															+import argparse
														
 
															+import torch
														
 
															+import os
														
 
															+import sys
														
 
															+import json
														
 
															+import joblib
														
 
															+import numpy as np
														
 
															+
														
 
															+current = os.path.dirname(os.path.realpath(__file__))
														
 
															+parent = os.path.dirname(current)
														
 
															+sys.path.append(parent)
														
 
															+
														
 
															+from boolean_classifier.architectures.ffnn import FFNN
														
 
															+
														
 
															+from boolean_classifier.feature_extractors.boolean_ngram_feature_extractor import BooleanNGramFeatureExtractor
														
 
															+from boolean_classifier.feature_extractors.ngram_feature_extractor import NGramFeatureExtractor
														
 
															+
														
 
															+# Functions
														
 
															+
														
 
															+def get_header(args, input_name, output_name, free_features_indices):
														
 
															+    '''Get the header for the VNN file'''
														
 
															+    str = f'; Input file: {args.input_file}\n'
														
 
															+    str += f'; Free features: {args.free}\n'
														
 
															+    str += f'; Free features indices:'
														
 
															+    for i in range(len(free_features_indices)):
														
 
															+        str += f' {free_features_indices[i]}'
														
 
															+    str += f'\n'
														
 
															+    str += f'; Total features: {args.total_features}\n'
														
 
															+    str += f'; Feature type: {args.feature_type}\n'
														
 
															+    str += f'; Input name: {input_name}\n'
														
 
															+    str += f'; Output name: {output_name}\n'
														
 
															+    str += f'; Epsilon: {args.epsilon}\n'
														
 
															+    str += f'; Random seed: {args.seed}\n'
														
 
															+    return str
														
 
															+
														
 
															+def get_input_vars(args, input_name):
														
 
															+    '''Get the input variables for the VNN file'''
														
 
															+    str = f'\n; Input variables:\n\n'
														
 
															+    for i in range(args.total_features):
														
 
															+        str += f'(declare-const {input_name}_{i} Real)\n'
														
 
															+    return str
														
 
															+
														
 
															+def get_output_vars(output_name):
														
 
															+    '''Get the output variables for the VNN file'''
														
 
															+    str = f'\n; Output variables:\n\n'
														
 
															+    str += f'(declare-const {output_name}_0 Real)\n'
														
 
															+    str += f'(declare-const {output_name}_1 Real)\n'
														
 
															+    return str
														
 
															+
														
 
															+def select_free_features(args, features):
														
 
															+    '''Select features to be free but only from features that are zero'''
														
 
															+    if args.list_ff_indices is not None: # If list of free feature indices is provided, use it. Do not check if they are zero or if it matches the number of arts.free features.
														
 
															+        indices = args.list_ff_indices
														
 
															+        # assert len(indices) == args.free, "Number of free features does not match the length of the provided indices."
														
 
															+        assert all(i >= 0 and i < args.total_features for i in indices), "Some indices are out of bounds."
														
 
															+    else:
														
 
															+        zero_indices = np.where(features == 0)[1] # For numpy arrays
														
 
															+        # print(f'Selecting {args.free} out of {len(zero_indices)} features with zero value')
														
 
															+        assert len(zero_indices) >= args.free, "Not enough zero features to select from."
														
 
															+        indices = np.random.choice(zero_indices, size=args.free, replace=False)
														
 
															+        # print('Free features indices:', random_indices)
														
 
															+    free_features = [False] * args.total_features
														
 
															+    for i in indices:
														
 
															+        free_features[i] = True
														
 
															+    return free_features, indices
														
 
															+
														
 
															+def get_input_constraints(args, input_name, features, free_features):
														
 
															+    '''Get the input constraints for the VNN file'''
														
 
															+    str = f'\n; Input constraints:\n\n'
														
 
															+    # Set ranges for the free features
														
 
															+    for i, free in enumerate(free_features):
														
 
															+        if free:
														
 
															+            # Standard constraint X >= 0 and <= 1
														
 
															+            str += f'(assert (>= {input_name}_{i} {max(0, features[0, i] - args.epsilon)}))\n'
														
 
															+            str += f'(assert (<= {input_name}_{i} {min(1, features[0, i] + args.epsilon)}))\n' 
														
 
															+            # Additional constraint to standard to ensure 0 or 1
														
 
															+            #str += f'(assert (or (<= {input_name}_{i} {max(0, dense_features[0, i] - args.epsilon)})'
														
 
															+            #str += f' (>= {input_name}_{i} {min(1, dense_features[0, i] + args.epsilon)})))\n'
														
 
															+        else:
														
 
															+            str += f'(assert (>= {input_name}_{i} {features[0, i]}))\n'
														
 
															+            str += f'(assert (<= {input_name}_{i} {features[0, i]}))\n'
														
 
															+    return str
														
 
															+
														
 
															+def get_output_constraints(output_name, predicted_label):
														
 
															+    '''Get the output constraints for the VNN file'''
														
 
															+    str = f'\n; Output constraints:\n\n'
														
 
															+    if predicted_label == 1:
														
 
															+        str += f'(assert (>= {output_name}_0 0.55))\n'
														
 
															+        str += f'(assert (<= {output_name}_0 1.0))\n'
														
 
															+        str += f'(assert (>= {output_name}_1 0.0))\n'
														
 
															+        str += f'(assert (<= {output_name}_1 0.45))\n'
														
 
															+    else:
														
 
															+        str += f'(assert (>= {output_name}_0 0.0))\n'
														
 
															+        str += f'(assert (<= {output_name}_0 0.45))\n'
														
 
															+        str += f'(assert (>= {output_name}_1 0.55))\n'
														
 
															+        str += f'(assert (<= {output_name}_1 1.0))\n'   
														
 
															+    return str
														
 
															+
														
 
															+def load_configuration(configuration_filepath: str) -> dict:
														
 
															+    with open(configuration_filepath, "r") as configuration_file:
														
 
															+        configuration = json.load(configuration_file)
														
 
															+    return configuration
														
 
															+
														
 
															+class VNNLIBargs():
														
 
															+    def __init__(self, input_file, model_path, config_file, feature_type, free, total_features, list_ff_indices, epsilon=1, output_file='out.vnnlib', seed=None):
														
 
															+        self.input_file = input_file
														
 
															+        self.model_path = model_path
														
 
															+        self.config_file = config_file
														
 
															+        self.feature_type = feature_type
														
 
															+        self.free = free
														
 
															+        self.total_features = total_features
														
 
															+        self.list_ff_indices = list_ff_indices
														
 
															+        self.epsilon = epsilon
														
 
															+        self.output_file = output_file
														
 
															+        self.seed = seed
														
 
															+
														
 
															+def create_vnnlib(args, features, predicted_label):
														
 
															+    input_name, output_name = "X", "Y"
														
 
															+    np.random.seed(args.seed)
														
 
															+    free_features, free_features_indices = select_free_features(args, features)
														
 
															+    with open(args.output_file, 'w') as output_file:
														
 
															+        output_file.write(get_header(args, input_name, output_name, free_features_indices))
														
 
															+        output_file.write(get_input_vars(args, input_name))
														
 
															+        output_file.write(get_output_vars(output_name))
														
 
															+        output_file.write(get_input_constraints(args, input_name, features, free_features))
														
 
															+        output_file.write(get_output_constraints(output_name, predicted_label))            
														
 
															+
														
 
															+
														
 
															+# Main
														
 
															+
														
 
															+if __name__ == '__main__' :
														
 
															+    # Parse arguments
														
 
															+    parser = argparse.ArgumentParser(description = 'Generates data.')
														
 
															+    # Optional arguments
														
 
															+    parser.add_argument('input_file', type = str, help = 'Input binary file name')
														
 
															+    parser.add_argument('model_path', type = str, help = 'Path to the model .pth file')
														
 
															+    parser.add_argument('config_file', type = str, help = 'Configuration file containing the hyperparameters of the model')
														
 
															+    parser.add_argument('feature_type', type = str, help = 'Type of features to extract. Select one of the following: {BooleanBigrams, Bigrams}')
														
 
															+    parser.add_argument('free', type = int, help = 'Number of free features')
														
 
															+    parser.add_argument('total_features', type = int, help = 'Total number of features')
														
 
															+    parser.add_argument('-l', '--list_ff_indices', nargs = '+', default = None, type = int, help = 'List of free feature indices (default: None)', dest = 'list_ff_indices')
														
 
															+    parser.add_argument('-e', '--epsilon', default = 1, type = int, help = 'Input epsilon variation (default: 1)', dest = 'epsilon')
														
 
															+    parser.add_argument('-o', '--output_file', default = 'out.vnnlib', type = str, help = 'output file name (default: out.vnnlib)', dest = 'output_file')
														
 
															+    parser.add_argument('-s', '--seed', default = None, type = int, help = 'Random seed', dest = 'seed')
														
 
															+    args = parser.parse_args()
														
 
															+
														
 
															+    # Set device
														
 
															+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
														
 
															+    print("Device: ", device)
														
 
															+    
														
 
															+    configuration = load_configuration(args.config_file)
														
 
															+
														
 
															+    # Load feature extractor
														
 
															+    if  "feature_selector" in configuration:
														
 
															+        config = '../boolean_classifier/data/BODMAS/feature_selectors/boolean_bigrams/boolean_bigrams_feature_selector_k=1000.pkl'
														
 
															+        feature_selector = joblib.load(config)
														
 
															+    else:
														
 
															+        feature_selector = None
														
 
															+
														
 
															+
														
 
															+    # Load model
														
 
															+    model = FFNN(configuration)
														
 
															+    model = model.to(device)
														
 
															+    model.load_state_dict(torch.load(args.model_path, weights_only=True))
														
 
															+    model.eval()
														
 
															+
														
 
															+    with open(args.input_file, "rb") as f:
														
 
															+        bytez = f.read()
														
 
															+
														
 
															+    if args.feature_type == "BooleanBigrams":
														
 
															+        feature_extractor = BooleanNGramFeatureExtractor(N=2)
														
 
															+        sparse_features = feature_extractor.feature_vector(bytez)
														
 
															+        features = sparse_features.todense()
														
 
															+    elif args.feature_type == "Bigrams":
														
 
															+        feature_extractor = NGramFeatureExtractor(N=2)
														
 
															+        features = feature_extractor.feature_vector(bytez)
														
 
															+    else:
														
 
															+        raise NotImplementedError("Select one of the following: {BooleanBigrams, Bigrams}")
														
 
															+
														
 
															+    
														
 
															+    if feature_selector is not None:
														
 
															+        features = feature_selector.transform(torch.Tensor(features))
														
 
															+    x = torch.tensor(features, dtype=torch.float).to(device)
														
 
															+    probs = model.predict(x)
														
 
															+    y_pred = probs.argmax(dim=1)
														
 
															+    print("Predicted label: ", y_pred, probs)
														
 
															+
														
 
															+    create_vnnlib(args, features, y_pred[0].item())
														
 
															+