2 days ago · 9db244e6da
--- a/data/2026-IJCI/README.md
+++ b/data/2026-IJCI/README.md
@@ -0,0 +1,9 @@
 
				+
			
 
				+# A Malware Detection Model as a Benchmark for Neural Network Verification
			
 
				+
			
 
				+## Overview
			
 
				+This directory contains data and resources for the 2026-IJCI paper titled "A Malware Detection Model as a Benchmark for Neural Network Verification".
			
 
				+
			
 
				+## Contents
			
 
				+- `boolean_classifier/` - Neural Network model
			
 
				+- `verifier/` - Scripts for running the neural network verification tools
			
--- a/data/2026-IJCI/boolean_classifier/__init__.py
+++ b/data/2026-IJCI/boolean_classifier/__init__.py
--- a/data/2026-IJCI/boolean_classifier/architectures/ffnn.py
+++ b/data/2026-IJCI/boolean_classifier/architectures/ffnn.py
@@ -0,0 +1,23 @@
 
				+import torch
			
 
				+
			
 
				+
			
 
				+class FFNN(torch.nn.Module):
			
 
				+    def __init__(self, configuration: dict):
			
 
				+        super().__init__()
			
 
				+        self.hidden_size = configuration["hidden_size"]
			
 
				+        self.input_size = configuration["input_size"]
			
 
				+        self.output_size = 2
			
 
				+        self.hidden_1 = torch.nn.Linear(self.input_size, self.hidden_size)
			
 
				+        self.output_layer = torch.nn.Linear(self.hidden_size, 2)
			
 
				+
			
 
				+
			
 
				+    def forward(self, x: torch.Tensor)-> torch.Tensor:
			
 
				+        x = self.hidden_1(x)
			
 
				+        x = torch.relu(x)
			
 
				+        y = self.output_layer(x)
			
 
				+        return y
			
 
				+
			
 
				+    def predict(self, x: torch.Tensor) -> torch.Tensor:
			
 
				+        outputs =  self.forward(x)
			
 
				+        outputs = outputs.softmax(dim=1)
			
 
				+        return outputs
			
--- a/data/2026-IJCI/boolean_classifier/classify_file.py
+++ b/data/2026-IJCI/boolean_classifier/classify_file.py
@@ -0,0 +1,80 @@
 
				+import argparse
			
 
				+import torch
			
 
				+import os
			
 
				+import sys
			
 
				+sys.path.append("../")
			
 
				+from boolean_classifier.architectures.ffnn import FFNN
			
 
				+import json
			
 
				+from boolean_classifier.feature_extractors.boolean_ngram_feature_extractor import BooleanNGramFeatureExtractor
			
 
				+from boolean_classifier.feature_extractors.ngram_feature_extractor import NGramFeatureExtractor
			
 
				+import joblib
			
 
				+import numpy as np
			
 
				+
			
 
				+
			
 
				+def load_configuration(configuration_filepath: str) -> dict:
			
 
				+    with open(configuration_filepath, "r") as configuration_file:
			
 
				+        configuration = json.load(configuration_file)
			
 
				+    return configuration
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    parser = argparse.ArgumentParser(description='Classify a single file with boolean malware detector')
			
 
				+    parser.add_argument("exe_filepath",
			
 
				+                        type=str,
			
 
				+                        help="Filepath of the executable"
			
 
				+                        )
			
 
				+    parser.add_argument("feature_type",
			
 
				+                        type=str,
			
 
				+                        help="Type of features to extract. Select one of the following: {BooleanBigrams, Bigrams}")
			
 
				+    parser.add_argument("configuration_file",
			
 
				+                        type=str,
			
 
				+                        help="Configuration file containing the hyperparameters of the model"
			
 
				+                        )
			
 
				+    args = parser.parse_args()
			
 
				+
			
 
				+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
			
 
				+    print("Device: ", device)
			
 
				+
			
 
				+    configuration = load_configuration(args.configuration_file)
			
 
				+    if "feature_selector" in configuration:
			
 
				+        feature_selector = joblib.load(configuration["feature_selector"])
			
 
				+    else:
			
 
				+        feature_selector = None
			
 
				+
			
 
				+    # Load model
			
 
				+    model = FFNN(configuration)
			
 
				+    model = model.to(device)
			
 
				+    model.load_state_dict(torch.load(os.path.join(configuration["model_path"], "model.pth"), weights_only=True))
			
 
				+    model.eval()
			
 
				+
			
 
				+    with open(args.exe_filepath, "rb") as f:
			
 
				+        bytez = f.read()
			
 
				+
			
 
				+    if args.feature_type == "BooleanBigrams":
			
 
				+        feature_extractor = BooleanNGramFeatureExtractor(N=2)
			
 
				+        sparse_features = feature_extractor.feature_vector(bytez)
			
 
				+        features = sparse_features.todense()
			
 
				+        # print("feature vector: ", features)
			
 
				+        # zero_indices = np.where(features[0] == 0)[1]
			
 
				+        # print("Number of zero features: ", len(zero_indices))
			
 
				+        # print("Zero indices: ", zero_indices)
			
 
				+        # # Remove some items from zero_indices
			
 
				+        # if len(zero_indices) > 4000:
			
 
				+        #     zero_indices = np.random.choice(zero_indices, size=4000, replace=False)
			
 
				+        # print("Zero indices after sampling: ", zero_indices)
			
 
				+        # for i in zero_indices:
			
 
				+        #     features[0, i] = 1
			
 
				+        # zero_indices = np.where(features[0] == 0)[1]
			
 
				+        # print("Number of zero features: ", len(zero_indices))
			
 
				+    elif args.feature_type == "Bigrams":
			
 
				+        feature_extractor = NGramFeatureExtractor(N=2)
			
 
				+        features = feature_extractor.feature_vector(bytez)
			
 
				+    else:
			
 
				+        raise NotImplementedError("Select one of the following: {BooleanBigrams, Bigrams}")
			
 
				+    if feature_selector is not None:
			
 
				+        features = feature_selector.transform(torch.Tensor(features))
			
 
				+    x = torch.tensor(features, dtype=torch.float).to(device)
			
 
				+    probs = model.predict(x)
			
 
				+    y_pred = probs.argmax(dim=1)
			
 
				+    print("Predicted label: ", y_pred, probs)
			
 
				+
			
 
				+
			
--- a/data/2026-IJCI/boolean_classifier/data/BODMAS/bodmas_boolean_2Gram_test_set.csv
+++ b/data/2026-IJCI/boolean_classifier/data/BODMAS/bodmas_boolean_2Gram_test_set.csv
--- a/data/2026-IJCI/boolean_classifier/data/BODMAS/bodmas_boolean_2Gram_training_set.csv
+++ b/data/2026-IJCI/boolean_classifier/data/BODMAS/bodmas_boolean_2Gram_training_set.csv
--- a/data/2026-IJCI/boolean_classifier/data/BODMAS/bodmas_boolean_2Gram_validation_set.csv
+++ b/data/2026-IJCI/boolean_classifier/data/BODMAS/bodmas_boolean_2Gram_validation_set.csv
--- a/data/2026-IJCI/boolean_classifier/data/BODMAS/extract_EMBER_features.py
+++ b/data/2026-IJCI/boolean_classifier/data/BODMAS/extract_EMBER_features.py
@@ -0,0 +1,24 @@
 
				+import sys
			
 
				+sys.path.append("../../../")
			
 
				+from boolean_classifier.feature_extractors.ember_feature_extractor import EMBERFeatureExtractor
			
 
				+
			
 
				+training_filepaths = ["bodmas_training_set.csv", "bodmas_validation_set.csv", "bodmas_test_set.csv"]
			
 
				+output_filepaths = ["bodmas_ember_training_set.csv", "bodmas_ember_validation_set.csv", "bodmas_ember_test_set.csv"]
			
 
				+for i, training_filepath in enumerate(training_filepaths):
			
 
				+    for output_filepath in output_filepaths:
			
 
				+        with open(output_filepath, "w") as output_file:
			
 
				+            with open(training_filepath, "r") as f:
			
 
				+                lines = f.readlines()
			
 
				+                for j, line in enumerate(lines):
			
 
				+                    exe_filepath, label = line.strip().split(",")
			
 
				+                    print(j, exe_filepath, label)
			
 
				+                    with open(exe_filepath, "rb") as exe_file:
			
 
				+                        bytez = exe_file.read()
			
 
				+                    try:
			
 
				+                        features = EMBERFeatureExtractor().feature_vector(bytez)
			
 
				+                        for feature in features:
			
 
				+                            output_file.write("{},".format(feature))
			
 
				+                        output_file.write(str(label)+"\n")
			
 
				+                    except ValueError as e:
			
 
				+                        print(e)
			
 
				+
			
--- a/data/2026-IJCI/boolean_classifier/data/BODMAS/extract_boolean_2Gram_features.py
+++ b/data/2026-IJCI/boolean_classifier/data/BODMAS/extract_boolean_2Gram_features.py
@@ -0,0 +1,50 @@
 
				+import os
			
 
				+import sys
			
 
				+sys.path.append("../../../")
			
 
				+from boolean_classifier.feature_extractors.boolean_ngram_feature_extractor import BooleanNGramFeatureExtractor
			
 
				+import numpy as np
			
 
				+from scipy.sparse import csr_matrix
			
 
				+import scipy.sparse
			
 
				+
			
 
				+
			
 
				+training_filepaths = [
			
 
				+    "bodmas_training_set.csv",
			
 
				+    "bodmas_validation_set.csv",
			
 
				+    "bodmas_test_set.csv"
			
 
				+]
			
 
				+features_training_filepaths = [
			
 
				+    "bodmas_boolean_2Gram_features_training_set.csv",
			
 
				+    "bodmas_boolean_2Gram_features_validation_set.csv",
			
 
				+    "bodmas_boolean_2Gram_features_test_set.csv"
			
 
				+]
			
 
				+
			
 
				+features_directories = [
			
 
				+    "/home/daniel/Datasets/BODMAS/Boolean_2Gram_features/training/",
			
 
				+    "/home/daniel/Datasets/BODMAS/Boolean_2Gram_features/validation/",
			
 
				+    "/home/daniel/Datasets/BODMAS/Boolean_2Gram_features/test/"
			
 
				+]
			
 
				+
			
 
				+training_lists = [[], [], []]
			
 
				+
			
 
				+for i, training_filepath in enumerate(training_filepaths):
			
 
				+    with open(training_filepath, "r") as f:
			
 
				+        lines = f.readlines()
			
 
				+        for j, line in enumerate(lines):
			
 
				+            exe_filepath, label = line.strip().split(",")
			
 
				+            sha = exe_filepath.split("/")[-1]
			
 
				+            print(j, exe_filepath, label)
			
 
				+            with open(exe_filepath, "rb") as exe_file:
			
 
				+                bytez = exe_file.read()
			
 
				+            try:
			
 
				+                sparse_features = BooleanNGramFeatureExtractor(N=2).feature_vector(bytez)
			
 
				+            except ValueError as e:
			
 
				+                print(e)
			
 
				+            scipy.sparse.save_npz(os.path.join(features_directories[i], sha+".npz"), sparse_features)
			
 
				+
			
 
				+            training_lists[i].append((os.path.join(features_directories[i], sha+".npz"), label))
			
 
				+
			
 
				+for i, features_filepath in enumerate(features_training_filepaths):
			
 
				+    with open(features_filepath, "w") as f:
			
 
				+        for filepath, label in training_lists[i]:
			
 
				+            f.write("{},{}\n".format(filepath, label))
			
 
				+
			
--- a/data/2026-IJCI/boolean_classifier/data/BODMAS/feature_selectors/boolean_bigrams/boolean_bigrams_feature_selector_k=1000.pkl
+++ b/data/2026-IJCI/boolean_classifier/data/BODMAS/feature_selectors/boolean_bigrams/boolean_bigrams_feature_selector_k=1000.pkl
--- a/data/2026-IJCI/boolean_classifier/data/examples/0a01c936a018e3fdbc804816128bb73d0c0196c0bf6f931d195529f8ed3f6d87
+++ b/data/2026-IJCI/boolean_classifier/data/examples/0a01c936a018e3fdbc804816128bb73d0c0196c0bf6f931d195529f8ed3f6d87
--- a/data/2026-IJCI/boolean_classifier/datasets/boolean_ngram_dataset.py
+++ b/data/2026-IJCI/boolean_classifier/datasets/boolean_ngram_dataset.py
@@ -0,0 +1,32 @@
 
				+from torch.utils.data import Dataset
			
 
				+import os
			
 
				+from random import shuffle
			
 
				+import numpy as np
			
 
				+import torch
			
 
				+import scipy.sparse
			
 
				+
			
 
				+
			
 
				+class BooleanNGramDataset(Dataset):
			
 
				+    def __init__(self, csv_filepath: str):
			
 
				+        self.all_files = []
			
 
				+        with open(csv_filepath, "r") as f:
			
 
				+            lines = f.readlines()
			
 
				+            for line in lines:
			
 
				+                filepath, label = line.strip().split(",")
			
 
				+                self.all_files.append((filepath, int(label)))
			
 
				+        shuffle(self.all_files)
			
 
				+
			
 
				+    def __len__(self):
			
 
				+        return len(self.all_files)
			
 
				+
			
 
				+    def __getitem__(self, index):
			
 
				+        to_load, y = self.all_files[index]
			
 
				+        # Step 1: Load the .npz file into a sparse matrix
			
 
				+        sparse_matrix = scipy.sparse.load_npz(to_load)
			
 
				+        # Step 2: Convert the sparse matrix to a dense matrix (e.g., using toarray())
			
 
				+        dense_matrix = sparse_matrix.toarray()  # You can also use .todense() if needed
			
 
				+
			
 
				+        # Step 3: Convert the dense matrix to a PyTorch tensor
			
 
				+        x = torch.tensor(dense_matrix, dtype=torch.float)
			
 
				+        x = x.squeeze()
			
 
				+        return x, torch.tensor(y)
			
--- a/data/2026-IJCI/boolean_classifier/datasets/ngram_dataset.py
+++ b/data/2026-IJCI/boolean_classifier/datasets/ngram_dataset.py
@@ -0,0 +1,30 @@
 
				+from torch.utils.data import Dataset
			
 
				+import os
			
 
				+from random import shuffle
			
 
				+import numpy as np
			
 
				+import torch
			
 
				+import scipy.sparse
			
 
				+
			
 
				+
			
 
				+class NGramDataset(Dataset):
			
 
				+    def __init__(self, csv_filepath: str):
			
 
				+        self.all_files = []
			
 
				+        with open(csv_filepath, "r") as f:
			
 
				+            lines = f.readlines()
			
 
				+            for line in lines:
			
 
				+                filepath, label = line.strip().split(",")
			
 
				+                self.all_files.append((filepath, int(label)))
			
 
				+        shuffle(self.all_files)
			
 
				+
			
 
				+
			
 
				+    def __len__(self):
			
 
				+        return len(self.all_files)
			
 
				+
			
 
				+    def __getitem__(self, index):
			
 
				+        to_load, y = self.all_files[index]
			
 
				+        # Step 1: Load the .npz file
			
 
				+        matrix = np.load(to_load)["arr_0"]
			
 
				+        # Step 2: Convert the dense matrix to a PyTorch tensor
			
 
				+        x = torch.tensor(matrix, dtype=torch.float)
			
 
				+        x = x.squeeze()
			
 
				+        return x, torch.tensor(y)
			
--- a/data/2026-IJCI/boolean_classifier/evaluate_malware_detector.py
+++ b/data/2026-IJCI/boolean_classifier/evaluate_malware_detector.py
@@ -0,0 +1,110 @@
 
				+import argparse
			
 
				+import torch
			
 
				+import sys
			
 
				+sys.path.append("../")
			
 
				+from boolean_classifier.datasets.boolean_ngram_dataset import BooleanNGramDataset
			
 
				+from boolean_classifier.datasets.ngram_dataset import NGramDataset
			
 
				+from boolean_classifier.architectures.ffnn import FFNN
			
 
				+from torch.utils.data import DataLoader
			
 
				+import multiprocessing
			
 
				+import json
			
 
				+import os
			
 
				+import torch.nn
			
 
				+from torch.optim.lr_scheduler import _LRScheduler
			
 
				+from torch.utils.data import DataLoader
			
 
				+from tqdm import tqdm
			
 
				+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
			
 
				+import joblib
			
 
				+
			
 
				+
			
 
				+def load_configuration(configuration_filepath: str) -> dict:
			
 
				+    with open(configuration_filepath, "r") as configuration_file:
			
 
				+        configuration = json.load(configuration_file)
			
 
				+    return configuration
			
 
				+
			
 
				+def evaluate(model: torch.nn.Module, dataloader: DataLoader) -> tuple[list, list]:
			
 
				+    y_trues = []
			
 
				+    y_preds = []
			
 
				+    device = next(model.parameters()).device
			
 
				+    model = model.eval()
			
 
				+    with torch.no_grad():
			
 
				+        for x, y in tqdm(dataloader):
			
 
				+            if feature_selector is not None:
			
 
				+                x = torch.Tensor(feature_selector.transform(x))
			
 
				+            x, y = x.to(device), y.to(device)
			
 
				+            outputs = model.predict(x)
			
 
				+            y_pred =  outputs.argmax(dim=1)
			
 
				+            y_trues.extend(y.cpu())
			
 
				+            y_preds.extend(y_pred.cpu())
			
 
				+    return y_trues, y_preds
			
 
				+
			
 
				+def save_results(y_trues: list, y_preds: list, output_filepath: str):
			
 
				+    acc = accuracy_score(y_trues, y_preds)
			
 
				+    precision = precision_score(y_trues, y_preds)
			
 
				+    recall = recall_score(y_trues, y_preds)
			
 
				+    f1 = f1_score(y_trues, y_preds)
			
 
				+    cm = confusion_matrix(y_trues, y_preds)
			
 
				+
			
 
				+    with open(output_filepath, "w") as output_file:
			
 
				+        output_file.write("Accuracy: {}\n".format(acc))
			
 
				+        output_file.write("Precision: {}\n".format(precision))
			
 
				+        output_file.write("Recall: {}\n".format(recall))
			
 
				+        output_file.write("F1: {}\n".format(f1))
			
 
				+        output_file.write("Confusion Matrix: {}\n".format(cm))
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    parser = argparse.ArgumentParser(description='Evaluate malware detector')
			
 
				+    parser.add_argument("evaluation_file",
			
 
				+                        type=str,
			
 
				+                        help="Evaluation file containing the hashes and labels of the benign and malicious samples"
			
 
				+                        )
			
 
				+    parser.add_argument("dataset_type",
			
 
				+                        type=str,
			
 
				+                        help="Type of dataset: {Boolean, EMBER}"
			
 
				+                        )
			
 
				+    parser.add_argument("configuration_file",
			
 
				+                        type=str,
			
 
				+                        help="Configuration file containing the hyperparameters of the model"
			
 
				+                        )
			
 
				+    parser.add_argument("output_file",
			
 
				+                        type=str,
			
 
				+                        help="File to where to store the results",
			
 
				+                        )
			
 
				+    parser.add_argument("--batch_size",
			
 
				+                        type=int,
			
 
				+                        help="Batch size for training",
			
 
				+                        default=32
			
 
				+                        )
			
 
				+    args = parser.parse_args()
			
 
				+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
			
 
				+    print("Device: ", device)
			
 
				+    num_workers = max(multiprocessing.cpu_count() - 4, multiprocessing.cpu_count() // 2 + 1)
			
 
				+
			
 
				+    configuration = load_configuration(args.configuration_file)
			
 
				+    if args.dataset_type == "BooleanBigrams":
			
 
				+        dataset = BooleanNGramDataset(args.evaluation_file)
			
 
				+    elif args.dataset_type == "Bigrams":
			
 
				+        dataset = NGramDataset(args.evaluation_file)
			
 
				+    else:
			
 
				+        raise NotImplementedError("Only Boolean dataset is currently supported")
			
 
				+    dataloader = DataLoader(
			
 
				+        dataset,
			
 
				+        batch_size=args.batch_size,
			
 
				+        num_workers=num_workers,
			
 
				+    )
			
 
				+    model = FFNN(configuration)
			
 
				+    model = model.to(device)
			
 
				+    model.load_state_dict(torch.load(os.path.join(configuration["model_path"], "model.pth"), weights_only=True))
			
 
				+    model.eval()
			
 
				+    if configuration["feature_selector"] is not None:
			
 
				+        feature_selector = joblib.load(configuration["feature_selector"])
			
 
				+    else:
			
 
				+        feature_selector = None
			
 
				+    y_trues, y_preds = evaluate(model, dataloader)
			
 
				+    save_results(y_trues, y_preds, args.output_file)
			
 
				+
			
 
				+
			
 
				+
			
--- a/data/2026-IJCI/boolean_classifier/feature_extractors/boolean_ngram_feature_extractor.py
+++ b/data/2026-IJCI/boolean_classifier/feature_extractors/boolean_ngram_feature_extractor.py
@@ -0,0 +1,28 @@
 
				+from collections import OrderedDict
			
 
				+from scipy.sparse import csr_matrix
			
 
				+
			
 
				+
			
 
				+class BooleanNGramFeatureExtractor(object):
			
 
				+    def __init__(self, N: int = 2):
			
 
				+        self.N = N
			
 
				+        self.dim = 256 ** N
			
 
				+        self.ngram_features = OrderedDict({"{},{}".format(i,j): 0 for i in range(256) for j in range(256)})
			
 
				+
			
 
				+    def feature_vector(self, bytez):
			
 
				+        raw_features = self.extract_ngram_features(bytez)
			
 
				+        return self.reduce(raw_features)
			
 
				+
			
 
				+    def extract_ngram_features(self, bytez)-> dict:
			
 
				+        words = list(bytez)
			
 
				+        bigrams = zip(words, words[1:])  # Create bi-grams
			
 
				+        bigrams = set(bigrams)
			
 
				+        for bigram in bigrams:
			
 
				+            self.ngram_features["{},{}".format(bigram[0], bigram[1])] += 1
			
 
				+        return self.ngram_features
			
 
				+
			
 
				+    def reduce(self, raw_features: dict, technique: str = None):
			
 
				+        if technique is None:
			
 
				+            return csr_matrix(list(raw_features.values()))
			
 
				+        else:
			
 
				+            raise NotImplementedError("Feature selection and dimensionality reduction technique not implemented")
			
 
				+
			
--- a/data/2026-IJCI/boolean_classifier/feature_extractors/ember_feature_extractor.py
+++ b/data/2026-IJCI/boolean_classifier/feature_extractors/ember_feature_extractor.py
@@ -0,0 +1,567 @@
 
				+#!/usr/bin/python
			
 
				+''' Extracts some basic features from PE files. Many of the features
			
 
				+implemented have been used in previously published works. For more information,
			
 
				+check out the following resources:
			
 
				+* Schultz, et al., 2001: http://128.59.14.66/sites/default/files/binaryeval-ieeesp01.pdf
			
 
				+* Kolter and Maloof, 2006: http://www.jmlr.org/papers/volume7/kolter06a/kolter06a.pdf
			
 
				+* Shafiq et al., 2009: https://www.researchgate.net/profile/Fauzan_Mirza/publication/242084613_A_Framework_for_Efficient_Mining_of_Structural_Information_to_Detect_Zero-Day_Malicious_Portable_Executables/links/0c96052e191668c3d5000000.pdf
			
 
				+* Raman, 2012: http://2012.infosecsouthwest.com/files/speaker_materials/ISSW2012_Selecting_Features_to_Classify_Malware.pdf
			
 
				+* Saxe and Berlin, 2015: https://arxiv.org/pdf/1508.03096.pdf
			
 
				+
			
 
				+It may be useful to do feature selection to reduce this set of features to a meaningful set
			
 
				+for your modeling problem.
			
 
				+'''
			
 
				+
			
 
				+import hashlib
			
 
				+import json
			
 
				+import os
			
 
				+import re
			
 
				+
			
 
				+import lief
			
 
				+import numpy as np
			
 
				+from sklearn.feature_extraction import FeatureHasher
			
 
				+
			
 
				+LIEF_MAJOR, LIEF_MINOR, _ = lief.__version__.split('.')
			
 
				+LIEF_EXPORT_OBJECT = int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 10)
			
 
				+LIEF_HAS_SIGNATURE = int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 11)
			
 
				+
			
 
				+
			
 
				+class FeatureType(object):
			
 
				+    ''' Base class from which each feature type may inherit '''
			
 
				+
			
 
				+    name = ''
			
 
				+    dim = 0
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        return '{}({})'.format(self.name, self.dim)
			
 
				+
			
 
				+    def raw_features(self, bytez, lief_binary):
			
 
				+        ''' Generate a JSON-able representation of the file '''
			
 
				+        raise (NotImplementedError)
			
 
				+
			
 
				+    def process_raw_features(self, raw_obj):
			
 
				+        ''' Generate a feature vector from the raw features '''
			
 
				+        raise (NotImplementedError)
			
 
				+
			
 
				+    def feature_vector(self, bytez, lief_binary):
			
 
				+        ''' Directly calculate the feature vector from the sample itself. This should only be implemented differently
			
 
				+        if there are significant speedups to be gained from combining the two functions. '''
			
 
				+        return self.process_raw_features(self.raw_features(bytez, lief_binary))
			
 
				+
			
 
				+
			
 
				+class ByteHistogram(FeatureType):
			
 
				+    ''' Byte histogram (count + non-normalized) over the entire binary file '''
			
 
				+
			
 
				+    name = 'histogram'
			
 
				+    dim = 256
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        super(FeatureType, self).__init__()
			
 
				+
			
 
				+    def raw_features(self, bytez, lief_binary):
			
 
				+        counts = np.bincount(np.frombuffer(bytez, dtype=np.uint8), minlength=256)
			
 
				+        return counts.tolist()
			
 
				+
			
 
				+    def process_raw_features(self, raw_obj):
			
 
				+        counts = np.array(raw_obj, dtype=np.float32)
			
 
				+        sum = counts.sum()
			
 
				+        normalized = counts / sum
			
 
				+        return normalized
			
 
				+
			
 
				+
			
 
				+class ByteEntropyHistogram(FeatureType):
			
 
				+    ''' 2d byte/entropy histogram based loosely on (Saxe and Berlin, 2015).
			
 
				+    This roughly approximates the joint probability of byte value and local entropy.
			
 
				+    See Section 2.1.1 in https://arxiv.org/pdf/1508.03096.pdf for more info.
			
 
				+    '''
			
 
				+
			
 
				+    name = 'byteentropy'
			
 
				+    dim = 256
			
 
				+
			
 
				+    def __init__(self, step=1024, window=2048):
			
 
				+        super(FeatureType, self).__init__()
			
 
				+        self.window = window
			
 
				+        self.step = step
			
 
				+
			
 
				+    def _entropy_bin_counts(self, block):
			
 
				+        # coarse histogram, 16 bytes per bin
			
 
				+        c = np.bincount(block >> 4, minlength=16)  # 16-bin histogram
			
 
				+        p = c.astype(np.float32) / self.window
			
 
				+        wh = np.where(c)[0]
			
 
				+        H = np.sum(-p[wh] * np.log2(
			
 
				+            p[wh])) * 2  # * x2 b.c. we reduced information by half: 256 bins (8 bits) to 16 bins (4 bits)
			
 
				+
			
 
				+        Hbin = int(H * 2)  # up to 16 bins (max entropy is 8 bits)
			
 
				+        if Hbin == 16:  # handle entropy = 8.0 bits
			
 
				+            Hbin = 15
			
 
				+
			
 
				+        return Hbin, c
			
 
				+
			
 
				+    def raw_features(self, bytez, lief_binary):
			
 
				+        output = np.zeros((16, 16), dtype=int)
			
 
				+        a = np.frombuffer(bytez, dtype=np.uint8)
			
 
				+        if a.shape[0] < self.window:
			
 
				+            Hbin, c = self._entropy_bin_counts(a)
			
 
				+            output[Hbin, :] += c
			
 
				+        else:
			
 
				+            # strided trick from here: http://www.rigtorp.se/2011/01/01/rolling-statistics-numpy.html
			
 
				+            shape = a.shape[:-1] + (a.shape[-1] - self.window + 1, self.window)
			
 
				+            strides = a.strides + (a.strides[-1],)
			
 
				+            blocks = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::self.step, :]
			
 
				+
			
 
				+            # from the blocks, compute histogram
			
 
				+            for block in blocks:
			
 
				+                Hbin, c = self._entropy_bin_counts(block)
			
 
				+                output[Hbin, :] += c
			
 
				+
			
 
				+        return output.flatten().tolist()
			
 
				+
			
 
				+    def process_raw_features(self, raw_obj):
			
 
				+        counts = np.array(raw_obj, dtype=np.float32)
			
 
				+        sum = counts.sum()
			
 
				+        normalized = counts / sum
			
 
				+        return normalized
			
 
				+
			
 
				+
			
 
				+class SectionInfo(FeatureType):
			
 
				+    ''' Information about section names, sizes and entropy.  Uses hashing trick
			
 
				+    to summarize all this section info into a feature vector.
			
 
				+    '''
			
 
				+
			
 
				+    name = 'section'
			
 
				+    dim = 5 + 50 + 50 + 50 + 50 + 50
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        super(FeatureType, self).__init__()
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def _properties(s):
			
 
				+        return [str(c).split('.')[-1] for c in s.characteristics_lists]
			
 
				+
			
 
				+    def raw_features(self, bytez, lief_binary):
			
 
				+        if lief_binary is None:
			
 
				+            return {"entry": "", "sections": []}
			
 
				+
			
 
				+        # properties of entry point, or if invalid, the first executable section
			
 
				+        not_found_error_class = RuntimeError if not lief.__version__.startswith("0.9.0") else lief.not_found
			
 
				+        try:
			
 
				+            if int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 12):
			
 
				+                section = lief_binary.section_from_rva(lief_binary.entrypoint - lief_binary.imagebase)
			
 
				+
			
 
				+                if section is None:
			
 
				+                    raise not_found_error_class
			
 
				+                entry_section = section.name
			
 
				+            else:  # lief < 0.12
			
 
				+                entry_section = lief_binary.section_from_offset(lief_binary.entrypoint).name
			
 
				+        except not_found_error_class:
			
 
				+            # bad entry point, let's find the first executable section
			
 
				+            entry_section = ""
			
 
				+            mem_execute_characteristics = lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE if lief.__version__.startswith("0.9.0") else lief.PE.Section.CHARACTERISTICS.MEM_EXECUTE
			
 
				+            for s in lief_binary.sections:
			
 
				+                if mem_execute_characteristics in s.characteristics_lists:
			
 
				+                    entry_section = s.name
			
 
				+                    break
			
 
				+
			
 
				+        raw_obj = {"entry": entry_section}
			
 
				+        raw_obj["sections"] = [{
			
 
				+            'name': s.name,
			
 
				+            'size': s.size,
			
 
				+            'entropy': s.entropy,
			
 
				+            'vsize': s.virtual_size,
			
 
				+            'props': self._properties(s)
			
 
				+        } for s in lief_binary.sections]
			
 
				+        return raw_obj
			
 
				+
			
 
    def process_raw_features(self, raw_obj):
        """Convert the raw section description into a float32 feature vector.

        Layout of the result: five summary counts, then five 50-bin hashed
        groups (section sizes, section entropies, section virtual sizes, the
        entry section name, and the property flags of the entry section).
        """
        sections = raw_obj['sections']
        entry_name = raw_obj['entry']

        def hash50(items, input_type):
            # Hash a single sample into 50 bins and return its dense row.
            return FeatureHasher(50, input_type=input_type).transform([items]).toarray()[0]

        def count(predicate):
            # Number of sections satisfying the predicate.
            return sum(1 for s in sections if predicate(s))

        general = [
            len(sections),                                    # total number of sections
            count(lambda s: s['size'] == 0),                  # sections with zero size
            count(lambda s: s['name'] == ""),                 # sections with an empty name
            count(lambda s: 'MEM_READ' in s['props'] and 'MEM_EXECUTE' in s['props']),  # readable + executable
            count(lambda s: 'MEM_WRITE' in s['props']),       # writable
        ]

        # One (name, value) pair list per per-section quantity, hashed independently.
        per_section_hashed = [
            hash50([(s['name'], s[key]) for s in sections], "pair")
            for key in ('size', 'entropy', 'vsize')
        ]

        entry_name_hashed = hash50([entry_name], "string")

        # Property flags of every section whose name matches the entry section.
        entry_props = []
        for s in sections:
            if s['name'] == entry_name:
                entry_props.extend(s['props'])
        characteristics_hashed = hash50(entry_props, "string")

        return np.hstack([
            general, *per_section_hashed, entry_name_hashed, characteristics_hashed
        ]).astype(np.float32)
			
 
				+
			
 
				+
			
 
				+class ImportsInfo(FeatureType):
			
 
				+    ''' Information about imported libraries and functions from the
			
 
				+    import address table.  Note that the total number of imported
			
 
				+    functions is contained in GeneralFileInfo.
			
 
				+    '''
			
 
				+
			
 
				+    name = 'imports'
			
 
				+    dim = 1280
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        super(FeatureType, self).__init__()
			
 
				+
			
 
				+    def raw_features(self, bytez, lief_binary):
			
 
				+        imports = {}
			
 
				+        if lief_binary is None:
			
 
				+            return imports
			
 
				+
			
 
				+        for lib in lief_binary.imports:
			
 
				+            if lib.name not in imports:
			
 
				+                imports[lib.name] = []  # libraries can be duplicated in listing, extend instead of overwrite
			
 
				+
			
 
				+            # Clipping assumes there are diminishing returns on the discriminatory power of imported functions
			
 
				+            #  beyond the first 10000 characters, and this will help limit the dataset size
			
 
				+            for entry in lib.entries:
			
 
				+                if entry.is_ordinal:
			
 
				+                    imports[lib.name].append("ordinal" + str(entry.ordinal))
			
 
				+                else:
			
 
				+                    imports[lib.name].append(entry.name[:10000])
			
 
				+
			
 
				+        return imports
			
 
				+
			
 
				+    def process_raw_features(self, raw_obj):
			
 
				+        # unique libraries
			
 
				+        libraries = list(set([l.lower() for l in raw_obj.keys()]))
			
 
				+        libraries_hashed = FeatureHasher(256, input_type="string").transform([libraries]).toarray()[0]
			
 
				+
			
 
				+        # A string like "kernel32.dll:CreateFileMappingA" for each imported function
			
 
				+        imports = [lib.lower() + ':' + e for lib, elist in raw_obj.items() for e in elist]
			
 
				+        imports_hashed = FeatureHasher(1024, input_type="string").transform([imports]).toarray()[0]
			
 
				+
			
 
				+        # Two separate elements: libraries (alone) and fully-qualified names of imported functions
			
 
				+        return np.hstack([libraries_hashed, imports_hashed]).astype(np.float32)
			
 
				+
			
 
				+
			
 
				+class ExportsInfo(FeatureType):
			
 
				+    ''' Information about exported functions. Note that the total number of exported
			
 
				+    functions is contained in GeneralFileInfo.
			
 
				+    '''
			
 
				+
			
 
				+    name = 'exports'
			
 
				+    dim = 128
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        super(FeatureType, self).__init__()
			
 
				+
			
 
				+    def raw_features(self, bytez, lief_binary):
			
 
				+        if lief_binary is None:
			
 
				+            return []
			
 
				+
			
 
				+        # Clipping assumes there are diminishing returns on the discriminatory power of exports beyond
			
 
				+        #  the first 10000 characters, and this will help limit the dataset size
			
 
				+        if LIEF_EXPORT_OBJECT:
			
 
				+            # export is an object with .name attribute (0.10.0 and later)
			
 
				+            clipped_exports = [export.name[:10000] for export in lief_binary.exported_functions]
			
 
				+        else:
			
 
				+            # export is a string (LIEF 0.9.0 and earlier)
			
 
				+            clipped_exports = [export[:10000] for export in lief_binary.exported_functions]
			
 
				+
			
 
				+        return clipped_exports
			
 
				+
			
 
				+    def process_raw_features(self, raw_obj):
			
 
				+        exports_hashed = FeatureHasher(128, input_type="string").transform([raw_obj]).toarray()[0]
			
 
				+        return exports_hashed.astype(np.float32)
			
 
				+
			
 
				+
			
 
				+class GeneralFileInfo(FeatureType):
			
 
				+    ''' General information about the file '''
			
 
				+
			
 
				+    name = 'general'
			
 
				+    dim = 10
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        super(FeatureType, self).__init__()
			
 
				+
			
 
				+    def raw_features(self, bytez, lief_binary):
			
 
				+        if lief_binary is None:
			
 
				+            return {
			
 
				+                'size': len(bytez),
			
 
				+                'vsize': 0,
			
 
				+                'has_debug': 0,
			
 
				+                'exports': 0,
			
 
				+                'imports': 0,
			
 
				+                'has_relocations': 0,
			
 
				+                'has_resources': 0,
			
 
				+                'has_signature': 0,
			
 
				+                'has_tls': 0,
			
 
				+                'symbols': 0
			
 
				+            }
			
 
				+
			
 
				+        return {
			
 
				+            'size': len(bytez),
			
 
				+            'vsize': lief_binary.virtual_size,
			
 
				+            'has_debug': int(lief_binary.has_debug),
			
 
				+            'exports': len(lief_binary.exported_functions),
			
 
				+            'imports': len(lief_binary.imported_functions),
			
 
				+            'has_relocations': int(lief_binary.has_relocations),
			
 
				+            'has_resources': int(lief_binary.has_resources),
			
 
				+            'has_signature': int(lief_binary.has_signatures) if LIEF_HAS_SIGNATURE else int(lief_binary.has_signature),
			
 
				+            'has_tls': int(lief_binary.has_tls),
			
 
				+            'symbols': len(lief_binary.symbols),
			
 
				+        }
			
 
				+
			
 
				+    def process_raw_features(self, raw_obj):
			
 
				+        return np.asarray([
			
 
				+            raw_obj['size'], raw_obj['vsize'], raw_obj['has_debug'], raw_obj['exports'], raw_obj['imports'],
			
 
				+            raw_obj['has_relocations'], raw_obj['has_resources'], raw_obj['has_signature'], raw_obj['has_tls'],
			
 
				+            raw_obj['symbols']
			
 
				+        ],
			
 
				+            dtype=np.float32)
			
 
				+
			
 
				+
			
 
				+class HeaderFileInfo(FeatureType):
			
 
				+    ''' Machine, architecure, OS, linker and other information extracted from header '''
			
 
				+
			
 
				+    name = 'header'
			
 
				+    dim = 62
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        super(FeatureType, self).__init__()
			
 
				+
			
 
				+    def raw_features(self, bytez, lief_binary):
			
 
				+        raw_obj = {}
			
 
				+        raw_obj['coff'] = {'timestamp': 0, 'machine': "", 'characteristics': []}
			
 
				+        raw_obj['optional'] = {
			
 
				+            'subsystem': "",
			
 
				+            'dll_characteristics': [],
			
 
				+            'magic': "",
			
 
				+            'major_image_version': 0,
			
 
				+            'minor_image_version': 0,
			
 
				+            'major_linker_version': 0,
			
 
				+            'minor_linker_version': 0,
			
 
				+            'major_operating_system_version': 0,
			
 
				+            'minor_operating_system_version': 0,
			
 
				+            'major_subsystem_version': 0,
			
 
				+            'minor_subsystem_version': 0,
			
 
				+            'sizeof_code': 0,
			
 
				+            'sizeof_headers': 0,
			
 
				+            'sizeof_heap_commit': 0
			
 
				+        }
			
 
				+        if lief_binary is None:
			
 
				+            return raw_obj
			
 
				+
			
 
				+        raw_obj['coff']['timestamp'] = lief_binary.header.time_date_stamps
			
 
				+        raw_obj['coff']['machine'] = str(lief_binary.header.machine).split('.')[-1]
			
 
				+        raw_obj['coff']['characteristics'] = [str(c).split('.')[-1] for c in lief_binary.header.characteristics_list]
			
 
				+        raw_obj['optional']['subsystem'] = str(lief_binary.optional_header.subsystem).split('.')[-1]
			
 
				+        raw_obj['optional']['dll_characteristics'] = [
			
 
				+            str(c).split('.')[-1] for c in lief_binary.optional_header.dll_characteristics_lists
			
 
				+        ]
			
 
				+        raw_obj['optional']['magic'] = str(lief_binary.optional_header.magic).split('.')[-1]
			
 
				+        raw_obj['optional']['major_image_version'] = lief_binary.optional_header.major_image_version
			
 
				+        raw_obj['optional']['minor_image_version'] = lief_binary.optional_header.minor_image_version
			
 
				+        raw_obj['optional']['major_linker_version'] = lief_binary.optional_header.major_linker_version
			
 
				+        raw_obj['optional']['minor_linker_version'] = lief_binary.optional_header.minor_linker_version
			
 
				+        raw_obj['optional'][
			
 
				+            'major_operating_system_version'] = lief_binary.optional_header.major_operating_system_version
			
 
				+        raw_obj['optional'][
			
 
				+            'minor_operating_system_version'] = lief_binary.optional_header.minor_operating_system_version
			
 
				+        raw_obj['optional']['major_subsystem_version'] = lief_binary.optional_header.major_subsystem_version
			
 
				+        raw_obj['optional']['minor_subsystem_version'] = lief_binary.optional_header.minor_subsystem_version
			
 
				+        raw_obj['optional']['sizeof_code'] = lief_binary.optional_header.sizeof_code
			
 
				+        raw_obj['optional']['sizeof_headers'] = lief_binary.optional_header.sizeof_headers
			
 
				+        raw_obj['optional']['sizeof_heap_commit'] = lief_binary.optional_header.sizeof_heap_commit
			
 
				+        return raw_obj
			
 
				+
			
 
				+    def process_raw_features(self, raw_obj):
			
 
				+        return np.hstack([
			
 
				+            raw_obj['coff']['timestamp'],
			
 
				+            FeatureHasher(10, input_type="string").transform([[raw_obj['coff']['machine']]]).toarray()[0],
			
 
				+            FeatureHasher(10, input_type="string").transform([raw_obj['coff']['characteristics']]).toarray()[0],
			
 
				+            FeatureHasher(10, input_type="string").transform([[raw_obj['optional']['subsystem']]]).toarray()[0],
			
 
				+            FeatureHasher(10, input_type="string").transform([raw_obj['optional']['dll_characteristics']]).toarray()[0],
			
 
				+            FeatureHasher(10, input_type="string").transform([[raw_obj['optional']['magic']]]).toarray()[0],
			
 
				+            raw_obj['optional']['major_image_version'],
			
 
				+            raw_obj['optional']['minor_image_version'],
			
 
				+            raw_obj['optional']['major_linker_version'],
			
 
				+            raw_obj['optional']['minor_linker_version'],
			
 
				+            raw_obj['optional']['major_operating_system_version'],
			
 
				+            raw_obj['optional']['minor_operating_system_version'],
			
 
				+            raw_obj['optional']['major_subsystem_version'],
			
 
				+            raw_obj['optional']['minor_subsystem_version'],
			
 
				+            raw_obj['optional']['sizeof_code'],
			
 
				+            raw_obj['optional']['sizeof_headers'],
			
 
				+            raw_obj['optional']['sizeof_heap_commit'],
			
 
				+        ]).astype(np.float32)
			
 
				+
			
 
				+
			
 
				+class StringExtractor(FeatureType):
			
 
				+    ''' Extracts strings from raw byte stream '''
			
 
				+
			
 
				+    name = 'strings'
			
 
				+    dim = 1 + 1 + 1 + 96 + 1 + 1 + 1 + 1 + 1
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        super(FeatureType, self).__init__()
			
 
				+        # all consecutive runs of 0x20 - 0x7f that are 5+ characters
			
 
				+        self._allstrings = re.compile(b'[\x20-\x7f]{5,}')
			
 
				+        # occurances of the string 'C:\'.  Not actually extracting the path
			
 
				+        self._paths = re.compile(b'c:\\\\', re.IGNORECASE)
			
 
				+        # occurances of http:// or https://.  Not actually extracting the URLs
			
 
				+        self._urls = re.compile(b'https?://', re.IGNORECASE)
			
 
				+        # occurances of the string prefix HKEY_.  No actually extracting registry names
			
 
				+        self._registry = re.compile(b'HKEY_')
			
 
				+        # crude evidence of an MZ header (dropper?) somewhere in the byte stream
			
 
				+        self._mz = re.compile(b'MZ')
			
 
				+
			
 
				+    def raw_features(self, bytez, lief_binary):
			
 
				+        allstrings = self._allstrings.findall(bytez)
			
 
				+        if allstrings:
			
 
				+            # statistics about strings:
			
 
				+            string_lengths = [len(s) for s in allstrings]
			
 
				+            avlength = sum(string_lengths) / len(string_lengths)
			
 
				+            # map printable characters 0x20 - 0x7f to an int array consisting of 0-95, inclusive
			
 
				+            as_shifted_string = [b - ord(b'\x20') for b in b''.join(allstrings)]
			
 
				+            c = np.bincount(as_shifted_string, minlength=96)  # histogram count
			
 
				+            # distribution of characters in printable strings
			
 
				+            csum = c.sum()
			
 
				+            p = c.astype(np.float32) / csum
			
 
				+            wh = np.where(c)[0]
			
 
				+            H = np.sum(-p[wh] * np.log2(p[wh]))  # entropy
			
 
				+        else:
			
 
				+            avlength = 0
			
 
				+            c = np.zeros((96,), dtype=np.float32)
			
 
				+            H = 0
			
 
				+            csum = 0
			
 
				+
			
 
				+        return {
			
 
				+            'numstrings': len(allstrings),
			
 
				+            'avlength': avlength,
			
 
				+            'printabledist': c.tolist(),  # store non-normalized histogram
			
 
				+            'printables': int(csum),
			
 
				+            'entropy': float(H),
			
 
				+            'paths': len(self._paths.findall(bytez)),
			
 
				+            'urls': len(self._urls.findall(bytez)),
			
 
				+            'registry': len(self._registry.findall(bytez)),
			
 
				+            'MZ': len(self._mz.findall(bytez))
			
 
				+        }
			
 
				+
			
 
				+    def process_raw_features(self, raw_obj):
			
 
				+        hist_divisor = float(raw_obj['printables']) if raw_obj['printables'] > 0 else 1.0
			
 
				+        return np.hstack([
			
 
				+            raw_obj['numstrings'], raw_obj['avlength'], raw_obj['printables'],
			
 
				+            np.asarray(raw_obj['printabledist']) / hist_divisor, raw_obj['entropy'], raw_obj['paths'], raw_obj['urls'],
			
 
				+            raw_obj['registry'], raw_obj['MZ']
			
 
				+        ]).astype(np.float32)
			
 
				+
			
 
				+
			
 
				+class DataDirectories(FeatureType):
			
 
				+    ''' Extracts size and virtual address of the first 15 data directories '''
			
 
				+
			
 
				+    name = 'datadirectories'
			
 
				+    dim = 15 * 2
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        super(FeatureType, self).__init__()
			
 
				+        self._name_order = [
			
 
				+            "EXPORT_TABLE", "IMPORT_TABLE", "RESOURCE_TABLE", "EXCEPTION_TABLE", "CERTIFICATE_TABLE",
			
 
				+            "BASE_RELOCATION_TABLE", "DEBUG", "ARCHITECTURE", "GLOBAL_PTR", "TLS_TABLE", "LOAD_CONFIG_TABLE",
			
 
				+            "BOUND_IMPORT", "IAT", "DELAY_IMPORT_DESCRIPTOR", "CLR_RUNTIME_HEADER"
			
 
				+        ]
			
 
				+
			
 
				+    def raw_features(self, bytez, lief_binary):
			
 
				+        output = []
			
 
				+        if lief_binary is None:
			
 
				+            return output
			
 
				+
			
 
				+        for data_directory in lief_binary.data_directories:
			
 
				+            output.append({
			
 
				+                "name": str(data_directory.type).replace("DATA_DIRECTORY.", ""),
			
 
				+                "size": data_directory.size,
			
 
				+                "virtual_address": data_directory.rva
			
 
				+            })
			
 
				+        return output
			
 
				+
			
 
				+    def process_raw_features(self, raw_obj):
			
 
				+        features = np.zeros(2 * len(self._name_order), dtype=np.float32)
			
 
				+        for i in range(len(self._name_order)):
			
 
				+            if i < len(raw_obj):
			
 
				+                features[2 * i] = raw_obj[i]["size"]
			
 
				+                features[2 * i + 1] = raw_obj[i]["virtual_address"]
			
 
				+        return features
			
 
				+
			
 
				+
			
 
				+class EMBERFeatureExtractor(object):
			
 
				+    ''' Extract useful features from a PE file, and return as a vector of fixed size. '''
			
 
				+
			
 
				+    def __init__(self, feature_version=2, print_feature_warning=True, features_file=''):
			
 
				+        self.features = []
			
 
				+        features = {
			
 
				+            'ByteHistogram': ByteHistogram(),
			
 
				+            'ByteEntropyHistogram': ByteEntropyHistogram(),
			
 
				+            'StringExtractor': StringExtractor(),
			
 
				+            'GeneralFileInfo': GeneralFileInfo(),
			
 
				+            'HeaderFileInfo': HeaderFileInfo(),
			
 
				+            'SectionInfo': SectionInfo(),
			
 
				+            'ImportsInfo': ImportsInfo(),
			
 
				+            'ExportsInfo': ExportsInfo()
			
 
				+        }
			
 
				+
			
 
				+        if os.path.exists(features_file):
			
 
				+            with open(features_file, encoding='utf8') as f:
			
 
				+                x = json.load(f)
			
 
				+                self.features = [features[feature] for feature in x['features'] if feature in features]
			
 
				+        else:
			
 
				+            self.features = list(features.values())
			
 
				+
			
 
				+        if feature_version == 1:
			
 
				+            if not lief.__version__.startswith("0.8.3"):
			
 
				+                if print_feature_warning:
			
 
				+                    print(f"WARNING: EMBER feature version 1 were computed using lief version 0.8.3-18d5b75")
			
 
				+                    print(
			
 
				+                        f"WARNING:   lief version {lief.__version__} found instead. There may be slight inconsistencies")
			
 
				+                    print(f"WARNING:   in the feature calculations.")
			
 
				+        elif feature_version == 2:
			
 
				+            self.features.append(DataDirectories())
			
 
				+            if not lief.__version__.startswith("0.9.0"):
			
 
				+                if print_feature_warning:
			
 
				+                    print(f"WARNING: EMBER feature version 2 were computed using lief version 0.9.0-")
			
 
				+                    print(
			
 
				+                        f"WARNING:   lief version {lief.__version__} found instead. There may be slight inconsistencies")
			
 
				+                    print(f"WARNING:   in the feature calculations.")
			
 
				+        else:
			
 
				+            raise Exception(f"EMBER feature version must be 1 or 2. Not {feature_version}")
			
 
				+        self.dim = sum([fe.dim for fe in self.features])
			
 
				+
			
 
				+    def raw_features(self, bytez):
			
 
				+        if lief.__version__.startswith("0.9.0"):
			
 
				+            lief_errors = (
			
 
				+                lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, lief.read_out_of_bound, RuntimeError)
			
 
				+        else:
			
 
				+            lief_errors = (
			
 
				+                lief.lief_errors.conversion_error, lief.lief_errors.file_error, lief.lief_errors.file_format_error,
			
 
				+                lief.lief_errors.corrupted, lief.lief_errors.parsing_error, lief.lief_errors.read_out_of_bound,
			
 
				+                RuntimeError)
			
 
				+
			
 
				+        try:
			
 
				+            lief_binary = lief.PE.parse(list(bytez))
			
 
				+        except lief_errors as e:
			
 
				+            print("lief error: ", str(e))
			
 
				+            lief_binary = None
			
 
				+        except Exception:  # everything else (KeyboardInterrupt, SystemExit, ValueError):
			
 
				+            raise
			
 
				+
			
 
				+        features = {"sha256": hashlib.sha256(bytez).hexdigest()}
			
 
				+        features.update({fe.name: fe.raw_features(bytez, lief_binary) for fe in self.features})
			
 
				+        return features
			
 
				+
			
 
				+    def process_raw_features(self, raw_obj):
			
 
				+        feature_vectors = [fe.process_raw_features(raw_obj[fe.name]) for fe in self.features]
			
 
				+        return np.hstack(feature_vectors).astype(np.float32)
			
 
				+
			
 
				+    def feature_vector(self, bytez):
			
 
				+        return self.process_raw_features(self.raw_features(bytez))
			
--- a/data/2026-IJCI/boolean_classifier/feature_extractors/ngram_feature_extractor.py
+++ b/data/2026-IJCI/boolean_classifier/feature_extractors/ngram_feature_extractor.py
@@ -0,0 +1,29 @@
 
				+from collections import OrderedDict
			
 
				+import numpy as np
			
 
				+
			
 
				+class NGramFeatureExtractor(object):
			
 
				+    def __init__(self, N: int = 2):
			
 
				+        self.N = N
			
 
				+        self.dim = 256 ** N
			
 
				+        self.ngram_features = OrderedDict({"{},{}".format(i,j): 0.0 for i in range(256) for j in range(256)})
			
 
				+
			
 
				+    def feature_vector(self, bytez):
			
 
				+        raw_features = self.extract_ngram_features(bytez)
			
 
				+        return self.reduce(raw_features)
			
 
				+
			
 
				+    def extract_ngram_features(self, bytez)-> dict:
			
 
				+        words = list(bytez)
			
 
				+        num_ngrams = len(words) - self.N
			
 
				+        bigrams = zip(words, words[1:])  # Create bi-grams
			
 
				+        for bigram in bigrams:
			
 
				+            self.ngram_features["{},{}".format(bigram[0], bigram[1])] += 1
			
 
				+        for key in self.ngram_features:
			
 
				+            self.ngram_features[key] = self.ngram_features[key] / num_ngrams
			
 
				+        return self.ngram_features
			
 
				+
			
 
				+    def reduce(self, raw_features: dict, technique: str = None):
			
 
				+        if technique is None:
			
 
				+            return np.expand_dims(np.array(list(raw_features.values())), axis=0)
			
 
				+        else:
			
 
				+            raise NotImplementedError("Feature selection and dimensionality reduction technique not implemented")
			
 
				+
			
--- a/data/2026-IJCI/boolean_classifier/ffnn_configurations/ffnn_2gram_k=1000_config.json
+++ b/data/2026-IJCI/boolean_classifier/ffnn_configurations/ffnn_2gram_k=1000_config.json
@@ -0,0 +1,6 @@
 
				+{
			
 
				+  "feature_selector": "data/BODMAS/feature_selectors/bigrams/bigrams_feature_selector_k=1000.pkl",
			
 
				+  "hidden_size": 512,
			
 
				+  "input_size": 1000,
			
 
				+  "model_path": "models/ffnn_2gram_k=1000_512_1"
			
 
				+}
			
--- a/data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/model.pth
+++ b/data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/model.pth
--- a/data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/results.json
+++ b/data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/results.json
@@ -0,0 +1 @@
 
				+{"training_losses": [0.007372988766147287, 0.005052168182717729, 0.004575773141697014, 0.004136406766564416, 0.003666894217103657, 0.0035721441213530944, 0.003411646314705646, 0.003301303897834049, 0.003245272185255189, 0.0031502678544421403, 0.003098506758948322], "training_accuracies": [0.9023583758813518, 0.9278223518923737, 0.9339492665532053, 0.9399303022935408, 0.9442418348326445, 0.9457006240376044, 0.9474511710835563, 0.9474835886214442, 0.949201718129508, 0.9492179268984521, 0.9511629791717319], "validation_losses": [0.009812075672269006, 0.010353462121517892, 0.005870703318895587, 0.005944547294264034, 0.005886296627045667, 0.005803646742737165, 0.00720510685860129, 0.005735838049743723, 0.005888903167399741, 0.005404814309630779, 0.006096020837560452], "validation_accuracies": [0.838542342108676, 0.7581377253274543, 0.9389184282194268, 0.9361950460381273, 0.9361950460381273, 0.9319154454675139, 0.9067565815069382, 0.9329529243937232, 0.9302295422124238, 0.9365841006354558, 0.9289326935546621]}
			
--- a/data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/test.out
+++ b/data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/test.out
@@ -0,0 +1,6 @@
 
				+Accuracy: 0.9607282184655397
			
 
				+Precision: 0.9691975141853553
			
 
				+Recall: 0.9501986754966888
			
 
				+F1: 0.9596040663456393
			
 
				+Confusion Matrix: [[3801  114]
			
 
				+ [ 188 3587]]
			
--- a/data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/validation.out
+++ b/data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/validation.out
@@ -0,0 +1,6 @@
 
				+Accuracy: 0.9319154454675139
			
 
				+Precision: 0.9192049561177078
			
 
				+Recall: 0.9438112907500663
			
 
				+F1: 0.9313456257355825
			
 
				+Confusion Matrix: [[3625  313]
			
 
				+ [ 212 3561]]
			
--- a/data/2026-IJCI/boolean_classifier/train_malware_detector.py
+++ b/data/2026-IJCI/boolean_classifier/train_malware_detector.py
@@ -0,0 +1,271 @@
 
				+import argparse
			
 
				+import copy
			
 
				+
			
 
				+import torch
			
 
				+import sys
			
 
				+sys.path.append("../")
			
 
				+from boolean_classifier.datasets.boolean_ngram_dataset import BooleanNGramDataset
			
 
				+from boolean_classifier.datasets.ngram_dataset import NGramDataset
			
 
				+from boolean_classifier.architectures.ffnn import FFNN
			
 
				+from torch.utils.data import DataLoader
			
 
				+import multiprocessing
			
 
				+import json
			
 
				+import os
			
 
				+import torch.nn
			
 
				+from torch.optim.lr_scheduler import _LRScheduler
			
 
				+from torch.utils.data import DataLoader
			
 
				+from tqdm import tqdm
			
 
				+import joblib
			
 
				+
			
 
				+
			
 
				+class EarlyStoppingPyTorchTrainer:
			
 
				+    """Trainer for PyTorch models with early stopping."""
			
 
				+
			
 
				+    def __init__(self, optimizer: torch.optim.Optimizer, epochs: int = 5,
			
 
				+                 loss: torch.nn.Module = None, scheduler: _LRScheduler = None, feature_selector = None) -> None:
			
 
				+        """
			
 
				+        Create PyTorch trainer.
			
 
				+        Parameters
			
 
				+        ----------
			
 
				+        optimizer : torch.optim.Optimizer
			
 
				+            Optimizer to use for training the model.
			
 
				+        epochs : int, optional
			
 
				+            Number of epochs, by default 5.
			
 
				+        loss : torch.nn.Module, optional
			
 
				+            Loss to minimize, by default None.
			
 
				+        scheduler : _LRScheduler, optional
			
 
				+            Scheduler for the optimizer, by default None.
			
 
				+        """
			
 
				+        self._epochs = epochs
			
 
				+        self._optimizer = optimizer
			
 
				+        self._loss = loss if loss is not None else torch.nn.CrossEntropyLoss()
			
 
				+        self._scheduler = scheduler
			
 
				+        self.feature_selector = feature_selector
			
 
				+
			
 
				+        self.training_losses = []
			
 
				+        self.training_accuracies = []
			
 
				+        self.validation_losses = []
			
 
				+        self.validation_accuracies = []
			
 
				+
			
 
				+    def train(self, model: torch.nn.Module,
			
 
				+            train_loader: DataLoader,
			
 
				+            val_loader: DataLoader,
			
 
				+            patience: int) -> torch.nn.Module:
			
 
				+        """
			
 
				+        Train model with given loaders and early stopping.
			
 
				+        Parameters
			
 
				+        ----------
			
 
				+        model : torch.nn.Module
			
 
				+            Pytorch model to be trained.
			
 
				+        train_loader : DataLoader
			
 
				+            Train data loader.
			
 
				+        val_loader : DataLoader
			
 
				+            Validation data loader.
			
 
				+        patience : int
			
 
				+            Number of epochs to wait before early stopping.
			
 
				+        Returns
			
 
				+        -------
			
 
				+        torch.nn.Module
			
 
				+            Trained model.
			
 
				+        """
			
 
				+        best_loss = float("inf")
			
 
				+        best_model = None
			
 
				+        patience_counter = 0
			
 
				+        for _ in range(self._epochs):
			
 
				+            model = self.fit(model, train_loader)
			
 
				+            val_loss = self.validate(model, val_loader)
			
 
				+            if val_loss <= best_loss:
			
 
				+                best_loss = val_loss
			
 
				+                best_model = copy.deepcopy(model)
			
 
				+                patience_counter = 0
			
 
				+            else:
			
 
				+                patience_counter += 1
			
 
				+            if patience_counter >= patience:
			
 
				+                break
			
 
				+        return best_model
			
 
				+
			
 
				+    def fit(self,
			
 
				+              model: torch.nn.Module,
			
 
				+              dataloader: DataLoader) -> torch.nn.Module:
			
 
				+        """
			
 
				+        Train model for one epoch with given loader.
			
 
				+        Parameters
			
 
				+        ----------
			
 
				+        model : torch.nn.Module
			
 
				+            Pytorch model to be trained.
			
 
				+        dataloader : DataLoader
			
 
				+            Train data loader.
			
 
				+        Returns
			
 
				+        -------
			
 
				+        torch.nn.Module
			
 
				+            Trained model.
			
 
				+        """
			
 
				+        device = next(model.parameters()).device
			
 
				+        model = model.train()
			
 
				+        model = model.to(device)
			
 
				+        running_loss = 0.0
			
 
				+        train_total = 0
			
 
				+        train_correct = 0
			
 
				+        for x, y in tqdm(dataloader):
			
 
				+            if self.feature_selector is not None:
			
 
				+                x = torch.Tensor(self.feature_selector.transform(x))
			
 
				+            x, y = x.to(device), y.to(device)
			
 
				+            self._optimizer.zero_grad()
			
 
				+            outputs = model(x)
			
 
				+            loss = self._loss(outputs, y)
			
 
				+            loss.backward()
			
 
				+            self._optimizer.step()
			
 
				+            running_loss += loss.item()
			
 
				+            y_preds = outputs.softmax(dim=1).argmax(dim=1)
			
 
				+            train_total += y.size(0)
			
 
				+            train_correct += (y_preds == y).sum().item()
			
 
				+
			
 
				+        self.training_losses.append(running_loss / train_total)
			
 
				+        self.training_accuracies.append(train_correct / train_total)
			
 
				+
			
 
				+        if self._scheduler is not None:
			
 
				+            self._scheduler.step()
			
 
				+        return model
			
 
				+
			
 
    def validate(self,
                 model: torch.nn.Module,
                 dataloader: DataLoader) -> float:
        """
        Validate model with given loader.

        Runs one pass over ``dataloader`` in evaluation mode with gradients
        disabled, and appends the resulting loss and accuracy to
        ``self.validation_losses`` and ``self.validation_accuracies``.

        Parameters
        ----------
        model : torch.nn.Module
            Pytorch model to be validated.
        dataloader : DataLoader
            Validation data loader yielding ``(x, y)`` batches.

        Returns
        -------
        float
            Validation loss of the model for this pass: the sum of the
            per-batch losses divided by the number of samples seen (the same
            value that is appended to ``self.validation_losses``).

        Raises
        ------
        ValueError
            If the data loader yields no samples.
        """
        running_loss = 0
        val_total = 0
        val_correct = 0
        # The model is evaluated on whatever device its parameters already
        # live on; only the batches need to be moved.
        device = next(model.parameters()).device
        model = model.eval()
        with torch.no_grad():
            for x, y in tqdm(dataloader):
                if self.feature_selector is not None:
                    x = torch.Tensor(self.feature_selector.transform(x))
                x, y = x.to(device), y.to(device)
                outputs = model(x)
                running_loss += self._loss(outputs, y).item()
                y_preds = outputs.softmax(dim=1).argmax(dim=1)

                val_total += y.size(0)
                val_correct += (y_preds == y).sum().item()

        if val_total == 0:
            raise ValueError("Validation data loader yielded no samples.")

        # Return the aggregated loss of the whole pass rather than the loss
        # tensor of the last batch, so callers compare a stable float.
        validation_loss = running_loss / val_total
        self.validation_losses.append(validation_loss)
        self.validation_accuracies.append(val_correct / val_total)
        return validation_loss
			
 
				+
			
 
				+def save_results(trainer: EarlyStoppingPyTorchTrainer, configuration: dict):
			
 
				+    results = {
			
 
				+        "training_losses": trainer.training_losses,
			
 
				+        "training_accuracies": trainer.training_accuracies,
			
 
				+        "validation_losses": trainer.validation_losses,
			
 
				+        "validation_accuracies": trainer.validation_accuracies
			
 
				+    }
			
 
				+    with open(os.path.join(configuration["model_path"], "results.json"), "w") as output_file:
			
 
				+        json.dump(results, output_file)
			
 
				+
			
 
				+def load_configuration(configuration_filepath: str) -> dict:
			
 
				+    with open(configuration_filepath, "r") as configuration_file:
			
 
				+        configuration = json.load(configuration_file)
			
 
				+    return configuration
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    parser = argparse.ArgumentParser(description='Train malware detector')
			
 
				+    parser.add_argument("training_file",
			
 
				+                        type=str,
			
 
				+                        help="Training file containing the hashes and labels of the benign and malicious samples"
			
 
				+                        )
			
 
				+    parser.add_argument("validation_file",
			
 
				+                        type=str,
			
 
				+                        help="Validation file containing the hashes and labels of the benign and malicious samples"
			
 
				+                        )
			
 
				+    parser.add_argument("dataset_type",
			
 
				+                        type=str,
			
 
				+                        help="Type of dataset: {BooleanBigrams, Bigrams, EMBER}"
			
 
				+                        )
			
 
				+    parser.add_argument("configuration_file",
			
 
				+                        type=str,
			
 
				+                        help="Configuration file containing the hyperparameters of the model"
			
 
				+                        )
			
 
				+    parser.add_argument("--batch_size",
			
 
				+                        type=int,
			
 
				+                        help="Batch size for training",
			
 
				+                        default=32
			
 
				+                        )
			
 
				+    parser.add_argument("--num_epochs",
			
 
				+                        type=int,
			
 
				+                        help="Max epochs",
			
 
				+                        default=50
			
 
				+                        )
			
 
				+    parser.add_argument("--patience",
			
 
				+                        type=int,
			
 
				+                        help="Patience for early stopping",
			
 
				+                        default=5
			
 
				+                        )
			
 
				+    args = parser.parse_args()
			
 
				+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
			
 
				+    print("Device: ", device)
			
 
				+    num_workers = max(multiprocessing.cpu_count() - 4, multiprocessing.cpu_count() // 2 + 1)
			
 
				+
			
 
				+    if args.dataset_type == "BooleanBigrams":
			
 
				+        training_dataset = BooleanNGramDataset(args.training_file)
			
 
				+        validation_dataset = BooleanNGramDataset(args.validation_file)
			
 
				+    elif args.dataset_type == "Bigrams":
			
 
				+        training_dataset = NGramDataset(args.training_file)
			
 
				+        validation_dataset = NGramDataset(args.validation_file)
			
 
				+    else:
			
 
				+        raise NotImplementedError("Only Boolean dataset is currently supported")
			
 
				+    training_dataloader = DataLoader(
			
 
				+        training_dataset,
			
 
				+        batch_size=args.batch_size,
			
 
				+        num_workers=num_workers,
			
 
				+    )
			
 
				+    validation_dataloader = DataLoader(
			
 
				+        validation_dataset,
			
 
				+        batch_size=args.batch_size,
			
 
				+        num_workers=num_workers,
			
 
				+    )
			
 
				+
			
 
				+    configuration = load_configuration(args.configuration_file)
			
 
				+    model = FFNN(configuration)
			
 
				+    model = model.to(device)
			
 
				+
			
 
				+    if configuration["feature_selector"] is not None:
			
 
				+        feature_selector = joblib.load(configuration["feature_selector"])
			
 
				+    else:
			
 
				+        feature_selector = None
			
 
				+
			
 
				+    criterion = torch.nn.CrossEntropyLoss()
			
 
				+    optimizer = torch.optim.Adam(model.parameters())
			
 
				+
			
 
				+    trainer = EarlyStoppingPyTorchTrainer(
			
 
				+        optimizer,
			
 
				+        epochs=args.num_epochs,
			
 
				+        loss=criterion,
			
 
				+        feature_selector=feature_selector
			
 
				+    )
			
 
				+    model = trainer.train(
			
 
				+        model,
			
 
				+        training_dataloader,
			
 
				+        validation_dataloader,
			
 
				+        args.patience
			
 
				+    )
			
 
				+    if not os.path.exists(configuration["model_path"]):
			
 
				+        os.makedirs(configuration["model_path"])
			
 
				+    torch.save(model.state_dict(), os.path.join(configuration["model_path"],"model.pth"))
			
 
				+    save_results(trainer, configuration)
			
 
				+    
			
 
				+
			
 
				+    
			
--- a/data/2026-IJCI/verifier/create_vnnlib.py
+++ b/data/2026-IJCI/verifier/create_vnnlib.py
@@ -0,0 +1,193 @@
 
				+#!/usr/bin/python3
			
 
				+
			
 
				+# Libraries
			
 
				+
			
 
				+import argparse
			
 
				+import torch
			
 
				+import os
			
 
				+import sys
			
 
				+import json
			
 
				+import joblib
			
 
				+import numpy as np
			
 
				+
			
 
				+current = os.path.dirname(os.path.realpath(__file__))
			
 
				+parent = os.path.dirname(current)
			
 
				+sys.path.append(parent)
			
 
				+
			
 
				+from boolean_classifier.architectures.ffnn import FFNN
			
 
				+
			
 
				+from boolean_classifier.feature_extractors.boolean_ngram_feature_extractor import BooleanNGramFeatureExtractor
			
 
				+from boolean_classifier.feature_extractors.ngram_feature_extractor import NGramFeatureExtractor
			
 
				+
			
 
				+# Functions
			
 
				+
			
 
				+def get_header(args, input_name, output_name, free_features_indices):
			
 
				+    '''Get the header for the VNN file'''
			
 
				+    str = f'; Input file: {args.input_file}\n'
			
 
				+    str += f'; Free features: {args.free}\n'
			
 
				+    str += f'; Free features indices:'
			
 
				+    for i in range(len(free_features_indices)):
			
 
				+        str += f' {free_features_indices[i]}'
			
 
				+    str += f'\n'
			
 
				+    str += f'; Total features: {args.total_features}\n'
			
 
				+    str += f'; Feature type: {args.feature_type}\n'
			
 
				+    str += f'; Input name: {input_name}\n'
			
 
				+    str += f'; Output name: {output_name}\n'
			
 
				+    str += f'; Epsilon: {args.epsilon}\n'
			
 
				+    str += f'; Random seed: {args.seed}\n'
			
 
				+    return str
			
 
				+
			
 
				+def get_input_vars(args, input_name):
			
 
				+    '''Get the input variables for the VNN file'''
			
 
				+    str = f'\n; Input variables:\n\n'
			
 
				+    for i in range(args.total_features):
			
 
				+        str += f'(declare-const {input_name}_{i} Real)\n'
			
 
				+    return str
			
 
				+
			
 
				+def get_output_vars(output_name):
			
 
				+    '''Get the output variables for the VNN file'''
			
 
				+    str = f'\n; Output variables:\n\n'
			
 
				+    str += f'(declare-const {output_name}_0 Real)\n'
			
 
				+    str += f'(declare-const {output_name}_1 Real)\n'
			
 
				+    return str
			
 
				+
			
 
				+def select_free_features(args, features):
			
 
				+    '''Select features to be free but only from features that are zero'''
			
 
				+    if args.list_ff_indices is not None: # If list of free feature indices is provided, use it. Do not check if they are zero or if it matches the number of arts.free features.
			
 
				+        indices = args.list_ff_indices
			
 
				+        # assert len(indices) == args.free, "Number of free features does not match the length of the provided indices."
			
 
				+        assert all(i >= 0 and i < args.total_features for i in indices), "Some indices are out of bounds."
			
 
				+    else:
			
 
				+        zero_indices = np.where(features == 0)[1] # For numpy arrays
			
 
				+        # print(f'Selecting {args.free} out of {len(zero_indices)} features with zero value')
			
 
				+        assert len(zero_indices) >= args.free, "Not enough zero features to select from."
			
 
				+        indices = np.random.choice(zero_indices, size=args.free, replace=False)
			
 
				+        # print('Free features indices:', random_indices)
			
 
				+    free_features = [False] * args.total_features
			
 
				+    for i in indices:
			
 
				+        free_features[i] = True
			
 
				+    return free_features, indices
			
 
				+
			
 
				+def get_input_constraints(args, input_name, features, free_features):
			
 
				+    '''Get the input constraints for the VNN file'''
			
 
				+    str = f'\n; Input constraints:\n\n'
			
 
				+    # Set ranges for the free features
			
 
				+    for i, free in enumerate(free_features):
			
 
				+        if free:
			
 
				+            # Standard constraint X >= 0 and <= 1
			
 
				+            str += f'(assert (>= {input_name}_{i} {max(0, features[0, i] - args.epsilon)}))\n'
			
 
				+            str += f'(assert (<= {input_name}_{i} {min(1, features[0, i] + args.epsilon)}))\n' 
			
 
				+            # Additional constraint to standard to ensure 0 or 1
			
 
				+            #str += f'(assert (or (<= {input_name}_{i} {max(0, dense_features[0, i] - args.epsilon)})'
			
 
				+            #str += f' (>= {input_name}_{i} {min(1, dense_features[0, i] + args.epsilon)})))\n'
			
 
				+        else:
			
 
				+            str += f'(assert (>= {input_name}_{i} {features[0, i]}))\n'
			
 
				+            str += f'(assert (<= {input_name}_{i} {features[0, i]}))\n'
			
 
				+    return str
			
 
				+
			
 
				+def get_output_constraints(output_name, predicted_label):
			
 
				+    '''Get the output constraints for the VNN file'''
			
 
				+    str = f'\n; Output constraints:\n\n'
			
 
				+    if predicted_label == 1:
			
 
				+        str += f'(assert (>= {output_name}_0 0.55))\n'
			
 
				+        str += f'(assert (<= {output_name}_0 1.0))\n'
			
 
				+        str += f'(assert (>= {output_name}_1 0.0))\n'
			
 
				+        str += f'(assert (<= {output_name}_1 0.45))\n'
			
 
				+    else:
			
 
				+        str += f'(assert (>= {output_name}_0 0.0))\n'
			
 
				+        str += f'(assert (<= {output_name}_0 0.45))\n'
			
 
				+        str += f'(assert (>= {output_name}_1 0.55))\n'
			
 
				+        str += f'(assert (<= {output_name}_1 1.0))\n'   
			
 
				+    return str
			
 
				+
			
 
				+def load_configuration(configuration_filepath: str) -> dict:
			
 
				+    with open(configuration_filepath, "r") as configuration_file:
			
 
				+        configuration = json.load(configuration_file)
			
 
				+    return configuration
			
 
				+
			
 
				+class VNNLIBargs():
			
 
				+    def __init__(self, input_file, model_path, config_file, feature_type, free, total_features, list_ff_indices, epsilon=1, output_file='out.vnnlib', seed=None):
			
 
				+        self.input_file = input_file
			
 
				+        self.model_path = model_path
			
 
				+        self.config_file = config_file
			
 
				+        self.feature_type = feature_type
			
 
				+        self.free = free
			
 
				+        self.total_features = total_features
			
 
				+        self.list_ff_indices = list_ff_indices
			
 
				+        self.epsilon = epsilon
			
 
				+        self.output_file = output_file
			
 
				+        self.seed = seed
			
 
				+
			
 
				+def create_vnnlib(args, features, predicted_label):
			
 
				+    input_name, output_name = "X", "Y"
			
 
				+    np.random.seed(args.seed)
			
 
				+    free_features, free_features_indices = select_free_features(args, features)
			
 
				+    with open(args.output_file, 'w') as output_file:
			
 
				+        output_file.write(get_header(args, input_name, output_name, free_features_indices))
			
 
				+        output_file.write(get_input_vars(args, input_name))
			
 
				+        output_file.write(get_output_vars(output_name))
			
 
				+        output_file.write(get_input_constraints(args, input_name, features, free_features))
			
 
				+        output_file.write(get_output_constraints(output_name, predicted_label))            
			
 
				+
			
 
				+
			
 
				+# Main
			
 
				+
			
 
				+if __name__ == '__main__' :
			
 
				+    # Parse arguments
			
 
				+    parser = argparse.ArgumentParser(description = 'Generates data.')
			
 
				+    # Optional arguments
			
 
				+    parser.add_argument('input_file', type = str, help = 'Input binary file name')
			
 
				+    parser.add_argument('model_path', type = str, help = 'Path to the model .pth file')
			
 
				+    parser.add_argument('config_file', type = str, help = 'Configuration file containing the hyperparameters of the model')
			
 
				+    parser.add_argument('feature_type', type = str, help = 'Type of features to extract. Select one of the following: {BooleanBigrams, Bigrams}')
			
 
				+    parser.add_argument('free', type = int, help = 'Number of free features')
			
 
				+    parser.add_argument('total_features', type = int, help = 'Total number of features')
			
 
				+    parser.add_argument('-l', '--list_ff_indices', nargs = '+', default = None, type = int, help = 'List of free feature indices (default: None)', dest = 'list_ff_indices')
			
 
				+    parser.add_argument('-e', '--epsilon', default = 1, type = int, help = 'Input epsilon variation (default: 1)', dest = 'epsilon')
			
 
				+    parser.add_argument('-o', '--output_file', default = 'out.vnnlib', type = str, help = 'output file name (default: out.vnnlib)', dest = 'output_file')
			
 
				+    parser.add_argument('-s', '--seed', default = None, type = int, help = 'Random seed', dest = 'seed')
			
 
				+    args = parser.parse_args()
			
 
				+
			
 
				+    # Set device
			
 
				+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
			
 
				+    print("Device: ", device)
			
 
				+    
			
 
				+    configuration = load_configuration(args.config_file)
			
 
				+
			
 
				+    # Load feature extractor
			
 
				+    if  "feature_selector" in configuration:
			
 
				+        config = '../boolean_classifier/data/BODMAS/feature_selectors/boolean_bigrams/boolean_bigrams_feature_selector_k=1000.pkl'
			
 
				+        feature_selector = joblib.load(config)
			
 
				+    else:
			
 
				+        feature_selector = None
			
 
				+
			
 
				+
			
 
				+    # Load model
			
 
				+    model = FFNN(configuration)
			
 
				+    model = model.to(device)
			
 
				+    model.load_state_dict(torch.load(args.model_path, weights_only=True))
			
 
				+    model.eval()
			
 
				+
			
 
				+    with open(args.input_file, "rb") as f:
			
 
				+        bytez = f.read()
			
 
				+
			
 
				+    if args.feature_type == "BooleanBigrams":
			
 
				+        feature_extractor = BooleanNGramFeatureExtractor(N=2)
			
 
				+        sparse_features = feature_extractor.feature_vector(bytez)
			
 
				+        features = sparse_features.todense()
			
 
				+    elif args.feature_type == "Bigrams":
			
 
				+        feature_extractor = NGramFeatureExtractor(N=2)
			
 
				+        features = feature_extractor.feature_vector(bytez)
			
 
				+    else:
			
 
				+        raise NotImplementedError("Select one of the following: {BooleanBigrams, Bigrams}")
			
 
				+
			
 
				+    
			
 
				+    if feature_selector is not None:
			
 
				+        features = feature_selector.transform(torch.Tensor(features))
			
 
				+    x = torch.tensor(features, dtype=torch.float).to(device)
			
 
				+    probs = model.predict(x)
			
 
				+    y_pred = probs.argmax(dim=1)
			
 
				+    print("Predicted label: ", y_pred, probs)
			
 
				+
			
 
				+    create_vnnlib(args, features, y_pred[0].item())
			
 
				+