Josep Argelich пре 2 дана
родитељ
комит
9db244e6da
24 измењених фајлова са 78561 додато и 0 уклоњено
  1. 9
    0
      data/2026-IJCI/README.md
  2. 0
    0
      data/2026-IJCI/boolean_classifier/__init__.py
  3. 23
    0
      data/2026-IJCI/boolean_classifier/architectures/ffnn.py
  4. 80
    0
      data/2026-IJCI/boolean_classifier/classify_file.py
  5. 7690
    0
      data/2026-IJCI/boolean_classifier/data/BODMAS/bodmas_boolean_2Gram_test_set.csv
  6. 61695
    0
      data/2026-IJCI/boolean_classifier/data/BODMAS/bodmas_boolean_2Gram_training_set.csv
  7. 7711
    0
      data/2026-IJCI/boolean_classifier/data/BODMAS/bodmas_boolean_2Gram_validation_set.csv
  8. 24
    0
      data/2026-IJCI/boolean_classifier/data/BODMAS/extract_EMBER_features.py
  9. 50
    0
      data/2026-IJCI/boolean_classifier/data/BODMAS/extract_boolean_2Gram_features.py
  10. BIN
      data/2026-IJCI/boolean_classifier/data/BODMAS/feature_selectors/boolean_bigrams/boolean_bigrams_feature_selector_k=1000.pkl
  11. BIN
      data/2026-IJCI/boolean_classifier/data/examples/0a01c936a018e3fdbc804816128bb73d0c0196c0bf6f931d195529f8ed3f6d87
  12. 32
    0
      data/2026-IJCI/boolean_classifier/datasets/boolean_ngram_dataset.py
  13. 30
    0
      data/2026-IJCI/boolean_classifier/datasets/ngram_dataset.py
  14. 110
    0
      data/2026-IJCI/boolean_classifier/evaluate_malware_detector.py
  15. 28
    0
      data/2026-IJCI/boolean_classifier/feature_extractors/boolean_ngram_feature_extractor.py
  16. 567
    0
      data/2026-IJCI/boolean_classifier/feature_extractors/ember_feature_extractor.py
  17. 29
    0
      data/2026-IJCI/boolean_classifier/feature_extractors/ngram_feature_extractor.py
  18. 6
    0
      data/2026-IJCI/boolean_classifier/ffnn_configurations/ffnn_2gram_k=1000_config.json
  19. BIN
      data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/model.pth
  20. 1
    0
      data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/results.json
  21. 6
    0
      data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/test.out
  22. 6
    0
      data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/validation.out
  23. 271
    0
      data/2026-IJCI/boolean_classifier/train_malware_detector.py
  24. 193
    0
      data/2026-IJCI/verifier/create_vnnlib.py

+ 9
- 0
data/2026-IJCI/README.md Прегледај датотеку

1
+
2
+# A Malware Detection Model as a Benchmark for Neural Network Verification
3
+
4
+## Overview
5
+This directory contains data and resources for the 2026-IJCI paper titled "A Malware Detection Model as a Benchmark for Neural Network Verification".
6
+
7
+## Contents
8
+- `boolean_classifier/` - Neural Network model
9
+- `verifier/` - Scripts for tool verification

+ 0
- 0
data/2026-IJCI/boolean_classifier/__init__.py Прегледај датотеку


+ 23
- 0
data/2026-IJCI/boolean_classifier/architectures/ffnn.py Прегледај датотеку

1
+import torch
2
+
3
+
4
class FFNN(torch.nn.Module):
    """Feed-forward classifier with a single hidden ReLU layer.

    The network is ``Linear(input_size, hidden_size) -> ReLU ->
    Linear(hidden_size, output_size)``.

    Args:
        configuration: Hyperparameters of the model. ``"hidden_size"`` and
            ``"input_size"`` are required. ``"output_size"`` is optional and
            defaults to 2, the value that was previously hard-coded.
    """

    def __init__(self, configuration: dict):
        super().__init__()
        self.hidden_size = configuration["hidden_size"]
        self.input_size = configuration["input_size"]
        self.output_size = configuration.get("output_size", 2)
        self.hidden_1 = torch.nn.Linear(self.input_size, self.hidden_size)
        # Use the stored size so the attribute and the layer cannot disagree.
        self.output_layer = torch.nn.Linear(self.hidden_size, self.output_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the raw (pre-softmax) scores for ``x``."""
        x = self.hidden_1(x)
        x = torch.relu(x)
        return self.output_layer(x)

    def predict(self, x: torch.Tensor) -> torch.Tensor:
        """Return the softmax of ``forward(x)`` over the output axis."""
        outputs = self.forward(x)
        # The outputs are always on the last axis; using -1 instead of 1 gives
        # the same result for batched 2-D input and also accepts a single
        # unbatched feature vector.
        return outputs.softmax(dim=-1)

+ 80
- 0
data/2026-IJCI/boolean_classifier/classify_file.py Прегледај датотеку

1
+import argparse
2
+import torch
3
+import os
4
+import sys
5
+sys.path.append("../")
6
+from boolean_classifier.architectures.ffnn import FFNN
7
+import json
8
+from boolean_classifier.feature_extractors.boolean_ngram_feature_extractor import BooleanNGramFeatureExtractor
9
+from boolean_classifier.feature_extractors.ngram_feature_extractor import NGramFeatureExtractor
10
+import joblib
11
+import numpy as np
12
+
13
+
14
def load_configuration(configuration_filepath: str) -> dict:
    """Load the JSON configuration stored at ``configuration_filepath``.

    The file is decoded as UTF-8 explicitly so that the result does not depend
    on the locale default encoding of the host.

    Args:
        configuration_filepath: Path of the JSON configuration file.

    Returns:
        The decoded JSON document.
    """
    with open(configuration_filepath, "r", encoding="utf-8") as configuration_file:
        return json.load(configuration_file)
18
+
19
if __name__ == "__main__":
    # Command-line entry point: classify one executable with a trained FFNN.
    parser = argparse.ArgumentParser(description='Classify a single file with boolean malware detector')
    parser.add_argument("exe_filepath",
                        type=str,
                        help="Filepath of the executable"
                        )
    parser.add_argument("feature_type",
                        type=str,
                        help="Type of features to extract. Select one of the following: {BooleanBigrams, Bigrams}")
    parser.add_argument("configuration_file",
                        type=str,
                        help="Configuration file containing the hyperparameters of the model"
                        )
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device: ", device)

    configuration = load_configuration(args.configuration_file)
    if "feature_selector" in configuration:
        # NOTE(review): joblib.load unpickles the file; only point the
        # configuration at feature selectors from a trusted source.
        feature_selector = joblib.load(configuration["feature_selector"])
    else:
        feature_selector = None

    # Load model
    model = FFNN(configuration)
    model = model.to(device)
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only host.
    state_dict = torch.load(
        os.path.join(configuration["model_path"], "model.pth"),
        map_location=device,
        weights_only=True,
    )
    model.load_state_dict(state_dict)
    model.eval()

    with open(args.exe_filepath, "rb") as f:
        bytez = f.read()

    if args.feature_type == "BooleanBigrams":
        feature_extractor = BooleanNGramFeatureExtractor(N=2)
        sparse_features = feature_extractor.feature_vector(bytez)
        features = sparse_features.todense()
    elif args.feature_type == "Bigrams":
        feature_extractor = NGramFeatureExtractor(N=2)
        features = feature_extractor.feature_vector(bytez)
    else:
        raise NotImplementedError("Select one of the following: {BooleanBigrams, Bigrams}")
    if feature_selector is not None:
        features = feature_selector.transform(torch.Tensor(features))
    x = torch.tensor(features, dtype=torch.float).to(device)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        probs = model.predict(x)
        y_pred = probs.argmax(dim=1)
    print("Predicted label: ", y_pred, probs)
79
+
80
+

+ 7690
- 0
data/2026-IJCI/boolean_classifier/data/BODMAS/bodmas_boolean_2Gram_test_set.csv
Разлика између датотеке није приказан због своје велике величине
Прегледај датотеку


+ 61695
- 0
data/2026-IJCI/boolean_classifier/data/BODMAS/bodmas_boolean_2Gram_training_set.csv
Разлика између датотеке није приказан због своје велике величине
Прегледај датотеку


+ 7711
- 0
data/2026-IJCI/boolean_classifier/data/BODMAS/bodmas_boolean_2Gram_validation_set.csv
Разлика између датотеке није приказан због своје велике величине
Прегледај датотеку


+ 24
- 0
data/2026-IJCI/boolean_classifier/data/BODMAS/extract_EMBER_features.py Прегледај датотеку

1
+import sys
2
+sys.path.append("../../../")
3
+from boolean_classifier.feature_extractors.ember_feature_extractor import EMBERFeatureExtractor
4
+
5
# Input split files ("<exe_filepath>,<label>" per line) and, at the same index,
# the CSV file that receives the EMBER features of that split.
training_filepaths = ["bodmas_training_set.csv", "bodmas_validation_set.csv", "bodmas_test_set.csv"]
output_filepaths = ["bodmas_ember_training_set.csv", "bodmas_ember_validation_set.csv", "bodmas_ember_test_set.csv"]
# Pair each split with its own output. Nesting the two loops would rewrite
# every output file for every split, leaving all three outputs with the
# features of the last split only.
for training_filepath, output_filepath in zip(training_filepaths, output_filepaths):
    with open(training_filepath, "r") as f:
        lines = f.readlines()
    with open(output_filepath, "w") as output_file:
        for j, line in enumerate(lines):
            line = line.strip()
            if not line:
                # Tolerate blank lines such as a trailing newline.
                continue
            exe_filepath, label = line.split(",")
            print(j, exe_filepath, label)
            with open(exe_filepath, "rb") as exe_file:
                bytez = exe_file.read()
            try:
                features = EMBERFeatureExtractor().feature_vector(bytez)
            except ValueError as e:
                # Skip samples whose features cannot be extracted.
                print(e)
                continue
            # One row per sample: every feature followed by a comma, then the label.
            output_file.write("".join("{},".format(feature) for feature in features))
            output_file.write(str(label) + "\n")
24
+

+ 50
- 0
data/2026-IJCI/boolean_classifier/data/BODMAS/extract_boolean_2Gram_features.py Прегледај датотеку

1
+import os
2
+import sys
3
+sys.path.append("../../../")
4
+from boolean_classifier.feature_extractors.boolean_ngram_feature_extractor import BooleanNGramFeatureExtractor
5
+import numpy as np
6
+from scipy.sparse import csr_matrix
7
+import scipy.sparse
8
+
9
+
10
+training_filepaths = [
11
+    "bodmas_training_set.csv",
12
+    "bodmas_validation_set.csv",
13
+    "bodmas_test_set.csv"
14
+]
15
+features_training_filepaths = [
16
+    "bodmas_boolean_2Gram_features_training_set.csv",
17
+    "bodmas_boolean_2Gram_features_validation_set.csv",
18
+    "bodmas_boolean_2Gram_features_test_set.csv"
19
+]
20
+
21
+features_directories = [
22
+    "/home/daniel/Datasets/BODMAS/Boolean_2Gram_features/training/",
23
+    "/home/daniel/Datasets/BODMAS/Boolean_2Gram_features/validation/",
24
+    "/home/daniel/Datasets/BODMAS/Boolean_2Gram_features/test/"
25
+]
26
+
27
+training_lists = [[], [], []]
28
+
29
# For every split, extract the boolean bigram features of each listed
# executable, store them as "<sha>.npz" in the directory of that split and
# remember the (features file, label) pair for the index files written below.
for i, training_filepath in enumerate(training_filepaths):
    with open(training_filepath, "r") as f:
        lines = f.readlines()
    for j, line in enumerate(lines):
        line = line.strip()
        if not line:
            # Tolerate blank lines such as a trailing newline.
            continue
        exe_filepath, label = line.split(",")
        sha = exe_filepath.split("/")[-1]
        print(j, exe_filepath, label)
        with open(exe_filepath, "rb") as exe_file:
            bytez = exe_file.read()
        try:
            # A fresh extractor per file keeps the features of different
            # samples independent of each other.
            sparse_features = BooleanNGramFeatureExtractor(N=2).feature_vector(bytez)
        except ValueError as e:
            # Skip the sample: without this, the features of the previous
            # sample (or an undefined name) would be saved under this hash.
            print(e)
            continue
        npz_filepath = os.path.join(features_directories[i], sha + ".npz")
        scipy.sparse.save_npz(npz_filepath, sparse_features)

        training_lists[i].append((npz_filepath, label))
45
+
46
+for i, features_filepath in enumerate(features_training_filepaths):
47
+    with open(features_filepath, "w") as f:
48
+        for filepath, label in training_lists[i]:
49
+            f.write("{},{}\n".format(filepath, label))
50
+

BIN
data/2026-IJCI/boolean_classifier/data/BODMAS/feature_selectors/boolean_bigrams/boolean_bigrams_feature_selector_k=1000.pkl Прегледај датотеку


BIN
data/2026-IJCI/boolean_classifier/data/examples/0a01c936a018e3fdbc804816128bb73d0c0196c0bf6f931d195529f8ed3f6d87 Прегледај датотеку


+ 32
- 0
data/2026-IJCI/boolean_classifier/datasets/boolean_ngram_dataset.py Прегледај датотеку

1
+from torch.utils.data import Dataset
2
+import os
3
+from random import shuffle
4
+import numpy as np
5
+import torch
6
+import scipy.sparse
7
+
8
+
9
class BooleanNGramDataset(Dataset):
    """Dataset of sparse feature matrices stored as SciPy ``.npz`` files.

    Args:
        csv_filepath: Text file with one ``<npz filepath>,<integer label>``
            entry per line. Blank lines are ignored and only the last comma of
            a line separates the path from the label, so paths may contain
            commas themselves.
    """

    def __init__(self, csv_filepath: str):
        self.all_files = []
        with open(csv_filepath, "r") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                filepath, label = line.rsplit(",", 1)
                self.all_files.append((filepath, int(label)))
        # The sample order is randomised once, at construction time.
        shuffle(self.all_files)

    def __len__(self):
        return len(self.all_files)

    def __getitem__(self, index):
        """Return ``(features, label)`` tensors for the sample at ``index``."""
        to_load, y = self.all_files[index]
        # Load the sparse matrix and expand it to a dense array.
        dense_matrix = scipy.sparse.load_npz(to_load).toarray()
        # Convert to a float tensor and drop every axis of length one.
        x = torch.tensor(dense_matrix, dtype=torch.float).squeeze()
        return x, torch.tensor(y)

+ 30
- 0
data/2026-IJCI/boolean_classifier/datasets/ngram_dataset.py Прегледај датотеку

1
+from torch.utils.data import Dataset
2
+import os
3
+from random import shuffle
4
+import numpy as np
5
+import torch
6
+import scipy.sparse
7
+
8
+
9
class NGramDataset(Dataset):
    """Dataset of dense feature arrays stored as NumPy ``.npz`` archives.

    Args:
        csv_filepath: Text file with one ``<npz filepath>,<integer label>``
            entry per line. Blank lines are ignored and only the last comma of
            a line separates the path from the label, so paths may contain
            commas themselves.
    """

    def __init__(self, csv_filepath: str):
        self.all_files = []
        with open(csv_filepath, "r") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                filepath, label = line.rsplit(",", 1)
                self.all_files.append((filepath, int(label)))
        # The sample order is randomised once, at construction time.
        shuffle(self.all_files)

    def __len__(self):
        return len(self.all_files)

    def __getitem__(self, index):
        """Return ``(features, label)`` tensors for the sample at ``index``."""
        to_load, y = self.all_files[index]
        # Read the array stored under "arr_0"; the context manager closes the
        # archive's file handle instead of leaving one open per access.
        with np.load(to_load) as archive:
            matrix = archive["arr_0"]
        # Convert to a float tensor and drop every axis of length one.
        x = torch.tensor(matrix, dtype=torch.float).squeeze()
        return x, torch.tensor(y)

+ 110
- 0
data/2026-IJCI/boolean_classifier/evaluate_malware_detector.py Прегледај датотеку

1
+import argparse
2
+import torch
3
+import sys
4
+sys.path.append("../")
5
+from boolean_classifier.datasets.boolean_ngram_dataset import BooleanNGramDataset
6
+from boolean_classifier.datasets.ngram_dataset import NGramDataset
7
+from boolean_classifier.architectures.ffnn import FFNN
8
+from torch.utils.data import DataLoader
9
+import multiprocessing
10
+import json
11
+import os
12
+import torch.nn
13
+from torch.optim.lr_scheduler import _LRScheduler
14
+from torch.utils.data import DataLoader
15
+from tqdm import tqdm
16
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
17
+import joblib
18
+
19
+
20
def load_configuration(configuration_filepath: str) -> dict:
    """Load the JSON configuration stored at ``configuration_filepath``.

    The file is decoded as UTF-8 explicitly so that the result does not depend
    on the locale default encoding of the host.

    Args:
        configuration_filepath: Path of the JSON configuration file.

    Returns:
        The decoded JSON document.
    """
    with open(configuration_filepath, "r", encoding="utf-8") as configuration_file:
        return json.load(configuration_file)
24
+
25
def evaluate(model: torch.nn.Module, dataloader: DataLoader, feature_selector=None) -> tuple[list, list]:
    """Run ``model`` over ``dataloader`` and collect true and predicted labels.

    Args:
        model: Model exposing ``predict``; it is switched to evaluation mode.
        dataloader: Iterable yielding ``(x, y)`` batches.
        feature_selector: Optional object whose ``transform`` is applied to
            every batch before prediction. When omitted, a module-level
            ``feature_selector`` is used if one exists (the script entry point
            defines it), so the function can also be imported and called
            without raising ``NameError``.

    Returns:
        ``(y_trues, y_preds)`` as two flat lists of integer labels.
    """
    if feature_selector is None:
        feature_selector = globals().get("feature_selector")
    y_trues = []
    y_preds = []
    device = next(model.parameters()).device
    model = model.eval()
    with torch.no_grad():
        for x, y in tqdm(dataloader):
            if feature_selector is not None:
                x = torch.Tensor(feature_selector.transform(x))
            x, y = x.to(device), y.to(device)
            outputs = model.predict(x)
            y_pred = outputs.argmax(dim=1)
            # Store plain Python ints rather than zero-dimensional tensors.
            y_trues.extend(y.cpu().tolist())
            y_preds.extend(y_pred.cpu().tolist())
    return y_trues, y_preds
40
+
41
+def save_results(y_trues: list, y_preds: list, output_filepath: str):
42
+    acc = accuracy_score(y_trues, y_preds)
43
+    precision = precision_score(y_trues, y_preds)
44
+    recall = recall_score(y_trues, y_preds)
45
+    f1 = f1_score(y_trues, y_preds)
46
+    cm = confusion_matrix(y_trues, y_preds)
47
+
48
+    with open(output_filepath, "w") as output_file:
49
+        output_file.write("Accuracy: {}\n".format(acc))
50
+        output_file.write("Precision: {}\n".format(precision))
51
+        output_file.write("Recall: {}\n".format(recall))
52
+        output_file.write("F1: {}\n".format(f1))
53
+        output_file.write("Confusion Matrix: {}\n".format(cm))
54
+
55
+
56
+
57
+
58
if __name__ == "__main__":
    # Command-line entry point: evaluate a trained FFNN on a labelled set and
    # write the metrics to a text file.
    parser = argparse.ArgumentParser(description='Evaluate malware detector')
    parser.add_argument("evaluation_file",
                        type=str,
                        help="Evaluation file containing the hashes and labels of the benign and malicious samples"
                        )
    parser.add_argument("dataset_type",
                        type=str,
                        help="Type of dataset: {BooleanBigrams, Bigrams}"
                        )
    parser.add_argument("configuration_file",
                        type=str,
                        help="Configuration file containing the hyperparameters of the model"
                        )
    parser.add_argument("output_file",
                        type=str,
                        help="File where the results are stored",
                        )
    parser.add_argument("--batch_size",
                        type=int,
                        help="Batch size for evaluation",
                        default=32
                        )
    args = parser.parse_args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device: ", device)
    num_workers = max(multiprocessing.cpu_count() - 4, multiprocessing.cpu_count() // 2 + 1)

    configuration = load_configuration(args.configuration_file)
    if args.dataset_type == "BooleanBigrams":
        dataset = BooleanNGramDataset(args.evaluation_file)
    elif args.dataset_type == "Bigrams":
        dataset = NGramDataset(args.evaluation_file)
    else:
        raise NotImplementedError("Select one of the following dataset types: {BooleanBigrams, Bigrams}")
    dataloader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        num_workers=num_workers,
    )
    model = FFNN(configuration)
    model = model.to(device)
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only host.
    state_dict = torch.load(
        os.path.join(configuration["model_path"], "model.pth"),
        map_location=device,
        weights_only=True,
    )
    model.load_state_dict(state_dict)
    model.eval()
    # The key is optional: a missing entry and an explicit null both mean
    # "no feature selection". The name must stay at module level because
    # evaluate() looks it up there.
    if configuration.get("feature_selector") is not None:
        # NOTE(review): joblib.load unpickles the file; only point the
        # configuration at feature selectors from a trusted source.
        feature_selector = joblib.load(configuration["feature_selector"])
    else:
        feature_selector = None
    y_trues, y_preds = evaluate(model, dataloader)
    save_results(y_trues, y_preds, args.output_file)
108
+
109
+
110
+

+ 28
- 0
data/2026-IJCI/boolean_classifier/feature_extractors/boolean_ngram_feature_extractor.py Прегледај датотеку

1
+from collections import OrderedDict
2
+from scipy.sparse import csr_matrix
3
+
4
+
5
class BooleanNGramFeatureExtractor(object):
    """Extract presence/absence features for every possible byte bigram.

    The feature vector has one entry per ordered byte pair ``(i, j)``, placed
    at position ``i * 256 + j``; an entry is 1 when the pair occurs in the
    input and 0 otherwise.

    NOTE(review): ``N`` is stored and used for ``dim``, but the extraction
    itself always works on bigrams, so values other than 2 are not honoured.

    Args:
        N: Size of the n-grams.
    """

    def __init__(self, N: int = 2):
        self.N = N
        self.dim = 256 ** N
        # All-zero template, keyed "i,j" in feature order. It is never
        # modified after construction; every extraction works on a copy.
        self.ngram_features = OrderedDict(
            ("{},{}".format(i, j), 0) for i in range(256) for j in range(256)
        )

    def feature_vector(self, bytez):
        """Return the boolean bigram features of ``bytez`` as a sparse row."""
        raw_features = self.extract_ngram_features(bytez)
        return self.reduce(raw_features)

    def extract_ngram_features(self, bytez) -> dict:
        """Return a fresh mapping ``"i,j" -> 0/1`` for the bigrams of ``bytez``.

        A new mapping is built on every call, so reusing one extractor for
        several inputs neither carries features over from a previous input nor
        pushes values above 1.
        """
        words = bytez if isinstance(bytez, (bytes, bytearray)) else list(bytez)
        features = OrderedDict.fromkeys(self.ngram_features, 0)
        for first, second in set(zip(words, words[1:])):
            features["{},{}".format(first, second)] = 1
        return features

    def reduce(self, raw_features: dict, technique: str = None):
        """Convert ``raw_features`` into a single-row ``csr_matrix``.

        Raises:
            NotImplementedError: If ``technique`` is not ``None``.
        """
        if technique is None:
            return csr_matrix(list(raw_features.values()))
        else:
            raise NotImplementedError("Feature selection and dimensionality reduction technique not implemented")
28
+

+ 567
- 0
data/2026-IJCI/boolean_classifier/feature_extractors/ember_feature_extractor.py Прегледај датотеку

1
+#!/usr/bin/python
2
+''' Extracts some basic features from PE files. Many of the features
3
+implemented have been used in previously published works. For more information,
4
+check out the following resources:
5
+* Schultz, et al., 2001: http://128.59.14.66/sites/default/files/binaryeval-ieeesp01.pdf
6
+* Kolter and Maloof, 2006: http://www.jmlr.org/papers/volume7/kolter06a/kolter06a.pdf
7
+* Shafiq et al., 2009: https://www.researchgate.net/profile/Fauzan_Mirza/publication/242084613_A_Framework_for_Efficient_Mining_of_Structural_Information_to_Detect_Zero-Day_Malicious_Portable_Executables/links/0c96052e191668c3d5000000.pdf
8
+* Raman, 2012: http://2012.infosecsouthwest.com/files/speaker_materials/ISSW2012_Selecting_Features_to_Classify_Malware.pdf
9
+* Saxe and Berlin, 2015: https://arxiv.org/pdf/1508.03096.pdf
10
+
11
+It may be useful to do feature selection to reduce this set of features to a meaningful set
12
+for your modeling problem.
13
+'''
14
+
15
+import hashlib
16
+import json
17
+import os
18
+import re
19
+
20
+import lief
21
+import numpy as np
22
+from sklearn.feature_extraction import FeatureHasher
23
+
24
+LIEF_MAJOR, LIEF_MINOR, _ = lief.__version__.split('.')
25
+LIEF_EXPORT_OBJECT = int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 10)
26
+LIEF_HAS_SIGNATURE = int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 11)
27
+
28
+
29
class FeatureType(object):
    ''' Base class from which each feature type may inherit '''

    name = ''
    dim = 0

    def __repr__(self):
        return '{}({})'.format(self.name, self.dim)

    def raw_features(self, bytez, lief_binary):
        ''' Generate a JSON-able representation of the file '''
        raise NotImplementedError

    def process_raw_features(self, raw_obj):
        ''' Generate a feature vector from the raw features '''
        raise NotImplementedError

    def feature_vector(self, bytez, lief_binary):
        ''' Directly calculate the feature vector from the sample itself. This should only be implemented differently
        if there are significant speedups to be gained from combining the two functions. '''
        return self.process_raw_features(self.raw_features(bytez, lief_binary))


class ByteHistogram(FeatureType):
    ''' Byte histogram over the entire binary file: raw_features returns the
    256 byte counts, process_raw_features normalizes them by their total '''

    name = 'histogram'
    dim = 256

    def __init__(self):
        super().__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Count the occurrences of each byte value; lief_binary is not used '''
        counts = np.bincount(np.frombuffer(bytez, dtype=np.uint8), minlength=256)
        return counts.tolist()

    def process_raw_features(self, raw_obj):
        ''' Normalize the counts so that they add up to 1 '''
        counts = np.array(raw_obj, dtype=np.float32)
        total = counts.sum()
        if total == 0:
            # Empty input: return the all-zero vector instead of dividing by
            # zero, which would produce a vector of NaN.
            return counts
        return counts / total
70
+
71
+
72
class ByteEntropyHistogram(FeatureType):
    ''' 2d byte/entropy histogram based loosely on (Saxe and Berlin, 2015).
    This roughly approximates the joint probability of byte value and local entropy.
    See Section 2.1.1 in https://arxiv.org/pdf/1508.03096.pdf for more info.
    '''

    name = 'byteentropy'
    dim = 256

    def __init__(self, step=1024, window=2048):
        super().__init__()
        self.window = window
        self.step = step

    def _entropy_bin_counts(self, block):
        ''' Return (entropy bin index, 16-bin coarse byte histogram) of block '''
        # coarse histogram, 16 bytes per bin
        c = np.bincount(block >> 4, minlength=16)  # 16-bin histogram
        # NOTE(review): the divisor is the window size even when the block is
        # shorter than the window; left unchanged so that extracted features
        # stay comparable with existing data.
        p = c.astype(np.float32) / self.window
        wh = np.where(c)[0]
        H = np.sum(-p[wh] * np.log2(
            p[wh])) * 2  # * x2 b.c. we reduced information by half: 256 bins (8 bits) to 16 bins (4 bits)

        Hbin = int(H * 2)  # up to 16 bins (max entropy is 8 bits)
        if Hbin == 16:  # handle entropy = 8.0 bits
            Hbin = 15

        return Hbin, c

    def raw_features(self, bytez, lief_binary):
        ''' Return the flattened 16x16 entropy/byte histogram as a list; lief_binary is not used '''
        output = np.zeros((16, 16), dtype=int)
        a = np.frombuffer(bytez, dtype=np.uint8)
        if a.shape[0] < self.window:
            Hbin, c = self._entropy_bin_counts(a)
            output[Hbin, :] += c
        else:
            # strided trick from here: http://www.rigtorp.se/2011/01/01/rolling-statistics-numpy.html
            shape = a.shape[:-1] + (a.shape[-1] - self.window + 1, self.window)
            strides = a.strides + (a.strides[-1],)
            blocks = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::self.step, :]

            # from the blocks, compute histogram
            for block in blocks:
                Hbin, c = self._entropy_bin_counts(block)
                output[Hbin, :] += c

        return output.flatten().tolist()

    def process_raw_features(self, raw_obj):
        ''' Normalize the counts so that they add up to 1 '''
        counts = np.array(raw_obj, dtype=np.float32)
        total = counts.sum()
        if total == 0:
            # Empty input: return the all-zero vector instead of dividing by
            # zero, which would produce a vector of NaN.
            return counts
        return counts / total
124
+
125
+
126
+class SectionInfo(FeatureType):
127
+    ''' Information about section names, sizes and entropy.  Uses hashing trick
128
+    to summarize all this section info into a feature vector.
129
+    '''
130
+
131
+    name = 'section'
132
+    dim = 5 + 50 + 50 + 50 + 50 + 50
133
+
134
+    def __init__(self):
135
+        super(FeatureType, self).__init__()
136
+
137
+    @staticmethod
138
+    def _properties(s):
139
+        return [str(c).split('.')[-1] for c in s.characteristics_lists]
140
+
141
+    def raw_features(self, bytez, lief_binary):
142
+        if lief_binary is None:
143
+            return {"entry": "", "sections": []}
144
+
145
+        # properties of entry point, or if invalid, the first executable section
146
+        not_found_error_class = RuntimeError if not lief.__version__.startswith("0.9.0") else lief.not_found
147
+        try:
148
+            if int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 12):
149
+                section = lief_binary.section_from_rva(lief_binary.entrypoint - lief_binary.imagebase)
150
+
151
+                if section is None:
152
+                    raise not_found_error_class
153
+                entry_section = section.name
154
+            else:  # lief < 0.12
155
+                entry_section = lief_binary.section_from_offset(lief_binary.entrypoint).name
156
+        except not_found_error_class:
157
+            # bad entry point, let's find the first executable section
158
+            entry_section = ""
159
+            mem_execute_characteristics = lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE if lief.__version__.startswith("0.9.0") else lief.PE.Section.CHARACTERISTICS.MEM_EXECUTE
160
+            for s in lief_binary.sections:
161
+                if mem_execute_characteristics in s.characteristics_lists:
162
+                    entry_section = s.name
163
+                    break
164
+
165
+        raw_obj = {"entry": entry_section}
166
+        raw_obj["sections"] = [{
167
+            'name': s.name,
168
+            'size': s.size,
169
+            'entropy': s.entropy,
170
+            'vsize': s.virtual_size,
171
+            'props': self._properties(s)
172
+        } for s in lief_binary.sections]
173
+        return raw_obj
174
+
175
+    def process_raw_features(self, raw_obj):
176
+        sections = raw_obj['sections']
177
+        general = [
178
+            len(sections),  # total number of sections
179
+            # number of sections with zero size
180
+            sum(1 for s in sections if s['size'] == 0),
181
+            # number of sections with an empty name
182
+            sum(1 for s in sections if s['name'] == ""),
183
+            # number of RX
184
+            sum(1 for s in sections if 'MEM_READ' in s['props'] and 'MEM_EXECUTE' in s['props']),
185
+            # number of W
186
+            sum(1 for s in sections if 'MEM_WRITE' in s['props'])
187
+        ]
188
+        # gross characteristics of each section
189
+        section_sizes = [(s['name'], s['size']) for s in sections]
190
+        section_sizes_hashed = FeatureHasher(50, input_type="pair").transform([section_sizes]).toarray()[0]
191
+        section_entropy = [(s['name'], s['entropy']) for s in sections]
192
+        section_entropy_hashed = FeatureHasher(50, input_type="pair").transform([section_entropy]).toarray()[0]
193
+        section_vsize = [(s['name'], s['vsize']) for s in sections]
194
+        section_vsize_hashed = FeatureHasher(50, input_type="pair").transform([section_vsize]).toarray()[0]
195
+        entry_name_hashed = FeatureHasher(50, input_type="string").transform([[raw_obj['entry']]]).toarray()[0]
196
+        characteristics = [p for s in sections for p in s['props'] if s['name'] == raw_obj['entry']]
197
+        characteristics_hashed = FeatureHasher(50, input_type="string").transform([characteristics]).toarray()[0]
198
+
199
+        return np.hstack([
200
+            general, section_sizes_hashed, section_entropy_hashed, section_vsize_hashed, entry_name_hashed,
201
+            characteristics_hashed
202
+        ]).astype(np.float32)
203
+
204
+
205
+class ImportsInfo(FeatureType):
206
+    ''' Information about imported libraries and functions from the
207
+    import address table.  Note that the total number of imported
208
+    functions is contained in GeneralFileInfo.
209
+    '''
210
+
211
+    name = 'imports'
212
+    dim = 1280
213
+
214
+    def __init__(self):
215
+        super(FeatureType, self).__init__()
216
+
217
+    def raw_features(self, bytez, lief_binary):
218
+        imports = {}
219
+        if lief_binary is None:
220
+            return imports
221
+
222
+        for lib in lief_binary.imports:
223
+            if lib.name not in imports:
224
+                imports[lib.name] = []  # libraries can be duplicated in listing, extend instead of overwrite
225
+
226
+            # Clipping assumes there are diminishing returns on the discriminatory power of imported functions
227
+            #  beyond the first 10000 characters, and this will help limit the dataset size
228
+            for entry in lib.entries:
229
+                if entry.is_ordinal:
230
+                    imports[lib.name].append("ordinal" + str(entry.ordinal))
231
+                else:
232
+                    imports[lib.name].append(entry.name[:10000])
233
+
234
+        return imports
235
+
236
+    def process_raw_features(self, raw_obj):
237
+        # unique libraries
238
+        libraries = list(set([l.lower() for l in raw_obj.keys()]))
239
+        libraries_hashed = FeatureHasher(256, input_type="string").transform([libraries]).toarray()[0]
240
+
241
+        # A string like "kernel32.dll:CreateFileMappingA" for each imported function
242
+        imports = [lib.lower() + ':' + e for lib, elist in raw_obj.items() for e in elist]
243
+        imports_hashed = FeatureHasher(1024, input_type="string").transform([imports]).toarray()[0]
244
+
245
+        # Two separate elements: libraries (alone) and fully-qualified names of imported functions
246
+        return np.hstack([libraries_hashed, imports_hashed]).astype(np.float32)
247
+
248
+
249
class ExportsInfo(FeatureType):
    ''' Names of the exported functions, hashed into a fixed-size vector.

    The total number of exported functions is reported by GeneralFileInfo.
    '''

    name = 'exports'
    dim = 128

    def __init__(self):
        super(FeatureType, self).__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Return the list of export names, each clipped to 10000 characters.

        Returns an empty list when ``lief_binary`` is None.
        '''
        if lief_binary is None:
            return []

        # Clipping assumes diminishing returns on the discriminatory power of
        # exports beyond the first 10000 characters, and limits dataset size.
        exported = lief_binary.exported_functions
        if LIEF_EXPORT_OBJECT:
            # Each export is an object with a .name attribute (LIEF 0.10.0 and later).
            return [function.name[:10000] for function in exported]
        # Each export is a plain string (LIEF 0.9.0 and earlier).
        return [function[:10000] for function in exported]

    def process_raw_features(self, raw_obj):
        ''' Hash the export names into a 128-dimensional float32 vector. '''
        hasher = FeatureHasher(128, input_type="string")
        hashed = hasher.transform([raw_obj]).toarray()[0]
        return hashed.astype(np.float32)
278
+
279
+
280
class GeneralFileInfo(FeatureType):
    ''' General information about the file '''

    name = 'general'
    dim = 10

    # Order in which the raw fields are laid out in the processed vector.
    _FIELD_ORDER = (
        'size', 'vsize', 'has_debug', 'exports', 'imports',
        'has_relocations', 'has_resources', 'has_signature', 'has_tls',
        'symbols',
    )

    def __init__(self):
        super(FeatureType, self).__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Collect file size plus counters/flags read from the parsed binary.

        When ``lief_binary`` is None only 'size' is meaningful; every other
        field is reported as 0.
        '''
        info = {
            'size': len(bytez),
            'vsize': 0,
            'has_debug': 0,
            'exports': 0,
            'imports': 0,
            'has_relocations': 0,
            'has_resources': 0,
            'has_signature': 0,
            'has_tls': 0,
            'symbols': 0
        }
        if lief_binary is None:
            return info

        info['vsize'] = lief_binary.virtual_size
        info['has_debug'] = int(lief_binary.has_debug)
        info['exports'] = len(lief_binary.exported_functions)
        info['imports'] = len(lief_binary.imported_functions)
        info['has_relocations'] = int(lief_binary.has_relocations)
        info['has_resources'] = int(lief_binary.has_resources)
        # The attribute name depends on the installed LIEF version.
        if LIEF_HAS_SIGNATURE:
            info['has_signature'] = int(lief_binary.has_signatures)
        else:
            info['has_signature'] = int(lief_binary.has_signature)
        info['has_tls'] = int(lief_binary.has_tls)
        info['symbols'] = len(lief_binary.symbols)
        return info

    def process_raw_features(self, raw_obj):
        ''' Lay the ten raw fields out as a float32 vector in a fixed order. '''
        values = [raw_obj[field] for field in self._FIELD_ORDER]
        return np.asarray(values, dtype=np.float32)
324
+
325
+
326
class HeaderFileInfo(FeatureType):
    ''' Machine, architecture, OS, linker and other information extracted from header '''

    name = 'header'
    dim = 62

    # Integer optional-header fields, in the order used by the processed vector.
    # Each name is both the raw-dict key and the attribute read from the binary.
    _OPTIONAL_NUMERIC_FIELDS = (
        'major_image_version',
        'minor_image_version',
        'major_linker_version',
        'minor_linker_version',
        'major_operating_system_version',
        'minor_operating_system_version',
        'major_subsystem_version',
        'minor_subsystem_version',
        'sizeof_code',
        'sizeof_headers',
        'sizeof_heap_commit',
    )

    def __init__(self):
        super(FeatureType, self).__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Read COFF and optional header fields from the parsed binary.

        When ``lief_binary`` is None the returned structure holds only
        defaults (0, "" or an empty list).
        '''
        coff = {'timestamp': 0, 'machine': "", 'characteristics': []}
        optional = {
            'subsystem': "",
            'dll_characteristics': [],
            'magic': "",
            'major_image_version': 0,
            'minor_image_version': 0,
            'major_linker_version': 0,
            'minor_linker_version': 0,
            'major_operating_system_version': 0,
            'minor_operating_system_version': 0,
            'major_subsystem_version': 0,
            'minor_subsystem_version': 0,
            'sizeof_code': 0,
            'sizeof_headers': 0,
            'sizeof_heap_commit': 0
        }
        raw_obj = {'coff': coff, 'optional': optional}
        if lief_binary is None:
            return raw_obj

        def enum_name(value):
            # Keep only the part after the last dot of the value's string form.
            return str(value).split('.')[-1]

        header = lief_binary.header
        coff['timestamp'] = header.time_date_stamps
        coff['machine'] = enum_name(header.machine)
        coff['characteristics'] = [enum_name(c) for c in header.characteristics_list]

        optional_header = lief_binary.optional_header
        optional['subsystem'] = enum_name(optional_header.subsystem)
        optional['dll_characteristics'] = [
            enum_name(c) for c in optional_header.dll_characteristics_lists
        ]
        optional['magic'] = enum_name(optional_header.magic)
        for field in self._OPTIONAL_NUMERIC_FIELDS:
            optional[field] = getattr(optional_header, field)
        return raw_obj

    def process_raw_features(self, raw_obj):
        ''' Build the float32 vector: timestamp, five 10-bin hashes, eleven integers. '''

        def hashed(tokens):
            return FeatureHasher(10, input_type="string").transform([tokens]).toarray()[0]

        coff = raw_obj['coff']
        optional = raw_obj['optional']
        parts = [
            coff['timestamp'],
            hashed([coff['machine']]),
            hashed(coff['characteristics']),
            hashed([optional['subsystem']]),
            hashed(optional['dll_characteristics']),
            hashed([optional['magic']]),
        ]
        parts.extend(optional[field] for field in self._OPTIONAL_NUMERIC_FIELDS)
        return np.hstack(parts).astype(np.float32)
400
+
401
+
402
class StringExtractor(FeatureType):
    ''' Extracts strings from raw byte stream '''

    name = 'strings'
    # numstrings, avlength, printables + 96-bin histogram + entropy, paths, urls, registry, MZ
    dim = 3 + 96 + 5

    def __init__(self):
        super(FeatureType, self).__init__()
        # All consecutive runs of bytes 0x20 - 0x7f that are 5+ characters long.
        self._allstrings = re.compile(b'[\x20-\x7f]{5,}')
        # Occurrences of the string 'C:\' (case-insensitive); the path itself is not extracted.
        self._paths = re.compile(b'c:\\\\', re.IGNORECASE)
        # Occurrences of http:// or https://; the URL itself is not extracted.
        self._urls = re.compile(b'https?://', re.IGNORECASE)
        # Occurrences of the prefix HKEY_; registry names are not extracted.
        self._registry = re.compile(b'HKEY_')
        # Crude evidence of an MZ header (dropper?) somewhere in the byte stream.
        self._mz = re.compile(b'MZ')

    def raw_features(self, bytez, lief_binary):
        ''' Compute string statistics directly from the bytes (``lief_binary`` is unused). '''
        found = self._allstrings.findall(bytez)
        if not found:
            avlength = 0
            histogram = np.zeros((96,), dtype=np.float32)
            entropy = 0
            printable_total = 0
        else:
            lengths = [len(s) for s in found]
            avlength = sum(lengths) / len(lengths)
            # Shift bytes 0x20 - 0x7f down to the integer range 0-95, inclusive.
            shifted = [byte - ord(b'\x20') for byte in b''.join(found)]
            histogram = np.bincount(shifted, minlength=96)  # per-character counts
            printable_total = histogram.sum()
            # Distribution of characters over all extracted strings.
            probabilities = histogram.astype(np.float32) / printable_total
            nonzero = np.where(histogram)[0]
            entropy = np.sum(-probabilities[nonzero] * np.log2(probabilities[nonzero]))

        def occurrences(pattern):
            return len(pattern.findall(bytez))

        return {
            'numstrings': len(found),
            'avlength': avlength,
            'printabledist': histogram.tolist(),  # stored non-normalized
            'printables': int(printable_total),
            'entropy': float(entropy),
            'paths': occurrences(self._paths),
            'urls': occurrences(self._urls),
            'registry': occurrences(self._registry),
            'MZ': occurrences(self._mz)
        }

    def process_raw_features(self, raw_obj):
        ''' Flatten the raw statistics into a float32 vector, normalizing the histogram. '''
        printables = raw_obj['printables']
        # Avoid dividing by zero when no printable string was found.
        hist_divisor = float(printables) if printables > 0 else 1.0
        normalized_hist = np.asarray(raw_obj['printabledist']) / hist_divisor
        return np.hstack([
            raw_obj['numstrings'],
            raw_obj['avlength'],
            raw_obj['printables'],
            normalized_hist,
            raw_obj['entropy'],
            raw_obj['paths'],
            raw_obj['urls'],
            raw_obj['registry'],
            raw_obj['MZ'],
        ]).astype(np.float32)
460
+
461
+
462
class DataDirectories(FeatureType):
    ''' Extracts size and virtual address of the first 15 data directories '''

    name = 'datadirectories'
    dim = 15 * 2

    def __init__(self):
        super(FeatureType, self).__init__()
        self._name_order = [
            "EXPORT_TABLE", "IMPORT_TABLE", "RESOURCE_TABLE", "EXCEPTION_TABLE", "CERTIFICATE_TABLE",
            "BASE_RELOCATION_TABLE", "DEBUG", "ARCHITECTURE", "GLOBAL_PTR", "TLS_TABLE", "LOAD_CONFIG_TABLE",
            "BOUND_IMPORT", "IAT", "DELAY_IMPORT_DESCRIPTOR", "CLR_RUNTIME_HEADER"
        ]

    def raw_features(self, bytez, lief_binary):
        ''' Return one {name, size, virtual_address} dict per data directory.

        Returns an empty list when ``lief_binary`` is None.
        '''
        if lief_binary is None:
            return []
        return [
            {
                "name": str(directory.type).replace("DATA_DIRECTORY.", ""),
                "size": directory.size,
                "virtual_address": directory.rva
            }
            for directory in lief_binary.data_directories
        ]

    def process_raw_features(self, raw_obj):
        ''' Interleave (size, virtual_address) of the leading entries into a float32 vector.

        Entries are taken by position in ``raw_obj``; slots without a matching
        entry stay at 0.
        '''
        slots = len(self._name_order)
        features = np.zeros(2 * slots, dtype=np.float32)
        for position in range(min(slots, len(raw_obj))):
            entry = raw_obj[position]
            features[2 * position] = entry["size"]
            features[2 * position + 1] = entry["virtual_address"]
        return features
496
+
497
+
498
class EMBERFeatureExtractor(object):
    ''' Extract useful features from a PE file, and return as a vector of fixed size. '''

    def __init__(self, feature_version=2, print_feature_warning=True, features_file=''):
        '''
        feature_version: 1 or 2; version 2 additionally appends DataDirectories.
        print_feature_warning: print a notice when the installed lief version is
            not the one named in the warning text for that feature version.
        features_file: optional JSON file whose "features" list selects a subset
            of the feature groups; when the path does not exist all groups are used.
        '''
        self.features = []
        available = {
            'ByteHistogram': ByteHistogram(),
            'ByteEntropyHistogram': ByteEntropyHistogram(),
            'StringExtractor': StringExtractor(),
            'GeneralFileInfo': GeneralFileInfo(),
            'HeaderFileInfo': HeaderFileInfo(),
            'SectionInfo': SectionInfo(),
            'ImportsInfo': ImportsInfo(),
            'ExportsInfo': ExportsInfo()
        }

        if not os.path.exists(features_file):
            self.features = list(available.values())
        else:
            with open(features_file, encoding='utf8') as handle:
                selection = json.load(handle)
            # Unknown names in the selection are silently ignored.
            self.features = [available[key] for key in selection['features'] if key in available]

        if feature_version == 1:
            if not lief.__version__.startswith("0.8.3") and print_feature_warning:
                print(f"WARNING: EMBER feature version 1 were computed using lief version 0.8.3-18d5b75")
                print(
                    f"WARNING:   lief version {lief.__version__} found instead. There may be slight inconsistencies")
                print(f"WARNING:   in the feature calculations.")
        elif feature_version == 2:
            self.features.append(DataDirectories())
            if not lief.__version__.startswith("0.9.0") and print_feature_warning:
                print(f"WARNING: EMBER feature version 2 were computed using lief version 0.9.0-")
                print(
                    f"WARNING:   lief version {lief.__version__} found instead. There may be slight inconsistencies")
                print(f"WARNING:   in the feature calculations.")
        else:
            raise Exception(f"EMBER feature version must be 1 or 2. Not {feature_version}")

        # Total vector length is the sum of the selected groups' dimensions.
        self.dim = sum(extractor.dim for extractor in self.features)

    def raw_features(self, bytez):
        ''' Parse ``bytez`` with lief and collect every group's raw features plus the sha256.

        Parsing errors of the listed lief/RuntimeError kinds are printed and the
        groups then receive ``lief_binary=None``; any other exception propagates.
        '''
        # The exception classes exposed by lief differ between versions.
        if lief.__version__.startswith("0.9.0"):
            lief_errors = (
                lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, lief.read_out_of_bound, RuntimeError)
        else:
            lief_errors = (
                lief.lief_errors.conversion_error, lief.lief_errors.file_error, lief.lief_errors.file_format_error,
                lief.lief_errors.corrupted, lief.lief_errors.parsing_error, lief.lief_errors.read_out_of_bound,
                RuntimeError)

        try:
            lief_binary = lief.PE.parse(list(bytez))
        except lief_errors as e:
            print("lief error: ", str(e))
            lief_binary = None

        features = {"sha256": hashlib.sha256(bytez).hexdigest()}
        for extractor in self.features:
            features[extractor.name] = extractor.raw_features(bytez, lief_binary)
        return features

    def process_raw_features(self, raw_obj):
        ''' Concatenate every group's processed vector into one float32 vector. '''
        vectors = []
        for extractor in self.features:
            vectors.append(extractor.process_raw_features(raw_obj[extractor.name]))
        return np.hstack(vectors).astype(np.float32)

    def feature_vector(self, bytez):
        ''' Convenience wrapper: raw extraction followed by vectorization. '''
        raw_obj = self.raw_features(bytez)
        return self.process_raw_features(raw_obj)

+ 29
- 0
data/2026-IJCI/boolean_classifier/feature_extractors/ngram_feature_extractor.py Прегледај датотеку

1
+from collections import OrderedDict
2
+import numpy as np
3
+
4
class NGramFeatureExtractor(object):
    """Relative-frequency features over byte n-grams.

    The feature vector has one entry per possible n-gram (``256 ** N`` in
    total), ordered lexicographically by byte value; for bigrams the entry of
    ``(i, j)`` sits at index ``i * 256 + j``. Dictionary keys are the byte
    values joined by commas, e.g. ``"0,255"``.
    """

    def __init__(self, N: int = 2):
        """
        Parameters
        ----------
        N : int, optional
            Length of the n-grams, by default 2 (bigrams). The table holds
            ``256 ** N`` entries, so values above 2 are very memory hungry.
        """
        self.N = N
        self.dim = 256 ** N
        shape = (256,) * N
        # np.ndindex walks the index space in row-major order, which for N == 2
        # is the same (i outer, j inner) order the bigram table always had.
        self.ngram_features = OrderedDict(
            (",".join(map(str, gram)), 0.0) for gram in np.ndindex(*shape)
        )

    def feature_vector(self, bytez):
        """Return the n-gram frequencies of ``bytez`` as a ``(1, dim)`` array."""
        raw_features = self.extract_ngram_features(bytez)
        return self.reduce(raw_features)

    def extract_ngram_features(self, bytez) -> dict:
        """Compute the relative frequency of every n-gram in ``bytez``.

        Each call starts from a zeroed table, so results never depend on
        previously processed inputs. Inputs shorter than ``N`` bytes contain no
        n-gram and yield an all-zero table.

        Returns
        -------
        dict
            Ordered mapping from n-gram key to frequency; the frequencies of a
            non-trivial input sum to 1. The same mapping is also stored in
            ``self.ngram_features``.
        """
        # Function-scope import: the module itself only imports OrderedDict.
        from collections import Counter

        words = list(bytez)
        # A sequence of L items contains L - N + 1 n-grams.
        num_ngrams = len(words) - self.N + 1
        # Reuse the existing key order but reset every count.
        features = OrderedDict.fromkeys(self.ngram_features, 0.0)
        if num_ngrams > 0:
            # Zipping N progressively shifted views yields every n-gram once.
            counts = Counter(zip(*(words[offset:] for offset in range(self.N))))
            for gram, count in counts.items():
                # "+=" (not "=") so an unexpected key raises KeyError instead of
                # silently growing the table.
                features[",".join(map(str, gram))] += count / num_ngrams
        self.ngram_features = features
        return features

    def reduce(self, raw_features: dict, technique: str = None):
        """Turn the feature mapping into a ``(1, n_features)`` numpy array.

        Raises
        ------
        NotImplementedError
            If any ``technique`` other than None is requested.
        """
        if technique is None:
            return np.expand_dims(np.array(list(raw_features.values())), axis=0)
        else:
            raise NotImplementedError("Feature selection and dimensionality reduction technique not implemented")
29
+

+ 6
- 0
data/2026-IJCI/boolean_classifier/ffnn_configurations/ffnn_2gram_k=1000_config.json Прегледај датотеку

1
+{
2
+  "feature_selector": "data/BODMAS/feature_selectors/bigrams/bigrams_feature_selector_k=1000.pkl",
3
+  "hidden_size": 512,
4
+  "input_size": 1000,
5
+  "model_path": "models/ffnn_2gram_k=1000_512_1"
6
+}

BIN
data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/model.pth Прегледај датотеку


+ 1
- 0
data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/results.json Прегледај датотеку

1
+{"training_losses": [0.007372988766147287, 0.005052168182717729, 0.004575773141697014, 0.004136406766564416, 0.003666894217103657, 0.0035721441213530944, 0.003411646314705646, 0.003301303897834049, 0.003245272185255189, 0.0031502678544421403, 0.003098506758948322], "training_accuracies": [0.9023583758813518, 0.9278223518923737, 0.9339492665532053, 0.9399303022935408, 0.9442418348326445, 0.9457006240376044, 0.9474511710835563, 0.9474835886214442, 0.949201718129508, 0.9492179268984521, 0.9511629791717319], "validation_losses": [0.009812075672269006, 0.010353462121517892, 0.005870703318895587, 0.005944547294264034, 0.005886296627045667, 0.005803646742737165, 0.00720510685860129, 0.005735838049743723, 0.005888903167399741, 0.005404814309630779, 0.006096020837560452], "validation_accuracies": [0.838542342108676, 0.7581377253274543, 0.9389184282194268, 0.9361950460381273, 0.9361950460381273, 0.9319154454675139, 0.9067565815069382, 0.9329529243937232, 0.9302295422124238, 0.9365841006354558, 0.9289326935546621]}

+ 6
- 0
data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/test.out Прегледај датотеку

1
+Accuracy: 0.9607282184655397
2
+Precision: 0.9691975141853553
3
+Recall: 0.9501986754966888
4
+F1: 0.9596040663456393
5
+Confusion Matrix: [[3801  114]
6
+ [ 188 3587]]

+ 6
- 0
data/2026-IJCI/boolean_classifier/models/ffnn_boolean_2gram_k=1000_512_1/validation.out Прегледај датотеку

1
+Accuracy: 0.9319154454675139
2
+Precision: 0.9192049561177078
3
+Recall: 0.9438112907500663
4
+F1: 0.9313456257355825
5
+Confusion Matrix: [[3625  313]
6
+ [ 212 3561]]

+ 271
- 0
data/2026-IJCI/boolean_classifier/train_malware_detector.py Прегледај датотеку

1
+import argparse
2
+import copy
3
+
4
+import torch
5
+import sys
6
+sys.path.append("../")
7
+from boolean_classifier.datasets.boolean_ngram_dataset import BooleanNGramDataset
8
+from boolean_classifier.datasets.ngram_dataset import NGramDataset
9
+from boolean_classifier.architectures.ffnn import FFNN
10
+from torch.utils.data import DataLoader
11
+import multiprocessing
12
+import json
13
+import os
14
+import torch.nn
15
+from torch.optim.lr_scheduler import _LRScheduler
16
+from torch.utils.data import DataLoader
17
+from tqdm import tqdm
18
+import joblib
19
+
20
+
21
class EarlyStoppingPyTorchTrainer:
    """Trainer for PyTorch models with early stopping."""

    def __init__(self, optimizer: torch.optim.Optimizer, epochs: int = 5,
                 loss: torch.nn.Module = None, scheduler: _LRScheduler = None, feature_selector = None) -> None:
        """
        Create PyTorch trainer.
        Parameters
        ----------
        optimizer : torch.optim.Optimizer
            Optimizer to use for training the model.
        epochs : int, optional
            Number of epochs, by default 5.
        loss : torch.nn.Module, optional
            Loss to minimize, by default None (cross-entropy is used).
        scheduler : _LRScheduler, optional
            Scheduler for the optimizer, by default None.
        feature_selector : object, optional
            Object whose ``transform`` is applied to every batch before it is
            fed to the model, by default None (batches are used as-is).
        """
        self._epochs = epochs
        self._optimizer = optimizer
        self._loss = loss if loss is not None else torch.nn.CrossEntropyLoss()
        self._scheduler = scheduler
        self.feature_selector = feature_selector

        # Per-epoch history, appended to by fit() and validate().
        self.training_losses = []
        self.training_accuracies = []
        self.validation_losses = []
        self.validation_accuracies = []

    def train(self, model: torch.nn.Module,
            train_loader: DataLoader,
            val_loader: DataLoader,
            patience: int) -> torch.nn.Module:
        """
        Train model with given loaders and early stopping.
        Parameters
        ----------
        model : torch.nn.Module
            Pytorch model to be trained.
        train_loader : DataLoader
            Train data loader.
        val_loader : DataLoader
            Validation data loader.
        patience : int
            Number of epochs to wait before early stopping.
        Returns
        -------
        torch.nn.Module
            Copy of the model taken at the epoch with the lowest validation
            loss, or the model itself when no epoch produced a usable loss
            (e.g. zero epochs).
        """
        best_loss = float("inf")
        best_model = None
        patience_counter = 0
        for _ in range(self._epochs):
            model = self.fit(model, train_loader)
            val_loss = self.validate(model, val_loader)
            if val_loss <= best_loss:
                best_loss = val_loss
                best_model = copy.deepcopy(model)
                patience_counter = 0
            else:
                patience_counter += 1
            if patience_counter >= patience:
                break
        # Never hand back None: callers immediately use the returned model.
        return best_model if best_model is not None else model

    def fit(self,
              model: torch.nn.Module,
              dataloader: DataLoader) -> torch.nn.Module:
        """
        Train model for one epoch with given loader.
        Parameters
        ----------
        model : torch.nn.Module
            Pytorch model to be trained.
        dataloader : DataLoader
            Train data loader.
        Returns
        -------
        torch.nn.Module
            Trained model.
        """
        device = next(model.parameters()).device
        model = model.train()
        model = model.to(device)
        running_loss = 0.0
        train_total = 0
        train_correct = 0
        for x, y in tqdm(dataloader):
            if self.feature_selector is not None:
                x = torch.Tensor(self.feature_selector.transform(x))
            x, y = x.to(device), y.to(device)
            self._optimizer.zero_grad()
            outputs = model(x)
            loss = self._loss(outputs, y)
            loss.backward()
            self._optimizer.step()
            running_loss += loss.item()
            y_preds = outputs.softmax(dim=1).argmax(dim=1)
            train_total += y.size(0)
            train_correct += (y_preds == y).sum().item()

        # NOTE(review): this divides the sum of per-batch loss values by the
        # number of samples; kept as-is so histories stay comparable.
        self.training_losses.append(running_loss / train_total)
        self.training_accuracies.append(train_correct / train_total)

        if self._scheduler is not None:
            self._scheduler.step()
        return model

    def validate(self,
                 model: torch.nn.Module,
                 dataloader: DataLoader) -> float:
        """
        Validate model with given loader.
        Parameters
        ----------
        model : torch.nn.Module
            Pytorch model to be validated.
        dataloader : DataLoader
            Validation data loader.
        Returns
        -------
        float
            Validation loss of the model over the whole loader (the value
            appended to ``validation_losses``).
        """
        running_loss = 0
        val_total = 0
        val_correct = 0
        device = next(model.parameters()).device
        model = model.eval()
        model = model.to(device)
        with torch.no_grad():
            for x, y in tqdm(dataloader):
                if self.feature_selector is not None:
                    x = torch.Tensor(self.feature_selector.transform(x))
                x, y = x.to(device), y.to(device)
                outputs = model(x)
                loss = self._loss(outputs, y)
                running_loss += loss.item()
                y_preds = outputs.softmax(dim=1).argmax(dim=1)

                val_total += y.size(0)
                val_correct += (y_preds == y).sum().item()

        epoch_loss = running_loss / val_total
        self.validation_losses.append(epoch_loss)
        self.validation_accuracies.append(val_correct / val_total)
        # Early stopping must compare whole-epoch losses, not the loss of
        # whichever batch happened to come last.
        return epoch_loss
168
+
169
def save_results(trainer: EarlyStoppingPyTorchTrainer, configuration: dict):
    """Write the trainer's per-epoch history to ``<model_path>/results.json``.

    The target directory (``configuration["model_path"]``) must already exist.
    """
    metric_names = (
        "training_losses",
        "training_accuracies",
        "validation_losses",
        "validation_accuracies",
    )
    results = {metric: getattr(trainer, metric) for metric in metric_names}
    results_path = os.path.join(configuration["model_path"], "results.json")
    with open(results_path, "w") as output_file:
        json.dump(results, output_file)
178
+
179
def load_configuration(configuration_filepath: str) -> dict:
    """Parse the JSON file at ``configuration_filepath`` and return its content."""
    with open(configuration_filepath, "r") as handle:
        return json.load(handle)
183
+
184
if __name__ == "__main__":
    # Command-line entry point: train an FFNN malware detector and store the
    # model weights plus the training history under configuration["model_path"].
    parser = argparse.ArgumentParser(description='Train malware detector')
    parser.add_argument("training_file",
                        type=str,
                        help="Training file containing the hashes and labels of the benign and malicious samples"
                        )
    parser.add_argument("validation_file",
                        type=str,
                        help="Validation file containing the hashes and labels of the benign and malicious samples"
                        )
    parser.add_argument("dataset_type",
                        type=str,
                        help="Type of dataset: {BooleanBigrams, Bigrams, EMBER}"
                        )
    parser.add_argument("configuration_file",
                        type=str,
                        help="Configuration file containing the hyperparameters of the model"
                        )
    parser.add_argument("--batch_size",
                        type=int,
                        help="Batch size for training",
                        default=32
                        )
    parser.add_argument("--num_epochs",
                        type=int,
                        help="Max epochs",
                        default=50
                        )
    parser.add_argument("--patience",
                        type=int,
                        help="Patience for early stopping",
                        default=5
                        )
    args = parser.parse_args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device: ", device)
    num_workers = max(multiprocessing.cpu_count() - 4, multiprocessing.cpu_count() // 2 + 1)

    # Dataset classes currently wired in; "EMBER" is listed in the help text
    # but has no dataset implementation here.
    dataset_classes = {
        "BooleanBigrams": BooleanNGramDataset,
        "Bigrams": NGramDataset,
    }
    if args.dataset_type not in dataset_classes:
        raise NotImplementedError("Only the BooleanBigrams and Bigrams datasets are currently supported")
    dataset_class = dataset_classes[args.dataset_type]
    training_dataset = dataset_class(args.training_file)
    validation_dataset = dataset_class(args.validation_file)

    # NOTE(review): the training loader is not shuffled; confirm the training
    # file is already in random order.
    training_dataloader = DataLoader(
        training_dataset,
        batch_size=args.batch_size,
        num_workers=num_workers,
    )
    validation_dataloader = DataLoader(
        validation_dataset,
        batch_size=args.batch_size,
        num_workers=num_workers,
    )

    configuration = load_configuration(args.configuration_file)
    model = FFNN(configuration)
    model = model.to(device)

    # The feature selector is optional: a missing key is treated like null.
    feature_selector_path = configuration.get("feature_selector")
    if feature_selector_path is not None:
        feature_selector = joblib.load(feature_selector_path)
    else:
        feature_selector = None

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())

    trainer = EarlyStoppingPyTorchTrainer(
        optimizer,
        epochs=args.num_epochs,
        loss=criterion,
        feature_selector=feature_selector
    )
    model = trainer.train(
        model,
        training_dataloader,
        validation_dataloader,
        args.patience
    )
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(configuration["model_path"], exist_ok=True)
    torch.save(model.state_dict(), os.path.join(configuration["model_path"],"model.pth"))
    save_results(trainer, configuration)
269
+    
270
+
271
+    

+ 193
- 0
data/2026-IJCI/verifier/create_vnnlib.py Прегледај датотеку

1
+#!/usr/bin/python3
2
+
3
+# Libraries
4
+
5
+import argparse
6
+import torch
7
+import os
8
+import sys
9
+import json
10
+import joblib
11
+import numpy as np
12
+
13
+current = os.path.dirname(os.path.realpath(__file__))
14
+parent = os.path.dirname(current)
15
+sys.path.append(parent)
16
+
17
+from boolean_classifier.architectures.ffnn import FFNN
18
+
19
+from boolean_classifier.feature_extractors.boolean_ngram_feature_extractor import BooleanNGramFeatureExtractor
20
+from boolean_classifier.feature_extractors.ngram_feature_extractor import NGramFeatureExtractor
21
+
22
+# Functions
23
+
24
def get_header(args, input_name, output_name, free_features_indices):
    '''Build the comment header of the VNN file, one "; key: value" line per setting'''
    indices = ''.join(f' {index}' for index in free_features_indices)
    lines = [
        f'; Input file: {args.input_file}',
        f'; Free features: {args.free}',
        f'; Free features indices:{indices}',
        f'; Total features: {args.total_features}',
        f'; Feature type: {args.feature_type}',
        f'; Input name: {input_name}',
        f'; Output name: {output_name}',
        f'; Epsilon: {args.epsilon}',
        f'; Random seed: {args.seed}',
    ]
    # Every line, including the last one, is newline-terminated.
    return '\n'.join(lines) + '\n'
39
+
40
def get_input_vars(args, input_name):
    '''Declare one Real constant per input feature for the VNN file'''
    declarations = ''.join(
        f'(declare-const {input_name}_{index} Real)\n'
        for index in range(args.total_features)
    )
    return '\n; Input variables:\n\n' + declarations
46
+
47
def get_output_vars(output_name):
    '''Declare the two Real output constants for the VNN file'''
    pieces = ['\n; Output variables:\n\n']
    for index in (0, 1):
        pieces.append(f'(declare-const {output_name}_{index} Real)\n')
    return ''.join(pieces)
53
+
54
def select_free_features(args, features):
    '''Select the features that are left free in the property.

    When args.list_ff_indices is provided it is used as is: the indices are
    only checked against the bounds [0, args.total_features); they are not
    required to point at zero-valued features, nor to be args.free in number.
    Otherwise args.free indices are drawn without replacement, using NumPy's
    global random state, among the columns of `features` whose value is zero.

    Args:
        args: Object exposing list_ff_indices, free and total_features.
        features: Two-dimensional NumPy array or matrix of feature values;
            only consulted when args.list_ff_indices is None.

    Returns:
        A tuple (free_features, indices): free_features is a list of
        args.total_features booleans that is True at every selected index,
        and indices holds the selected indices (the provided list, or the
        array returned by np.random.choice).

    Raises:
        AssertionError: If a provided index is out of bounds, or if there are
            fewer zero-valued features than args.free.
    '''
    # Explicit raises instead of assert statements so that the validation is
    # still performed when Python runs with -O; the exception type is kept.
    if args.list_ff_indices is not None:
        indices = args.list_ff_indices
        if not all(0 <= i < args.total_features for i in indices):
            raise AssertionError("Some indices are out of bounds.")
    else:
        # np.where on a 2-D input returns (rows, columns); keep the columns.
        zero_indices = np.where(features == 0)[1]
        if len(zero_indices) < args.free:
            raise AssertionError("Not enough zero features to select from.")
        indices = np.random.choice(zero_indices, size=args.free, replace=False)
    free_features = [False] * args.total_features
    for i in indices:
        free_features[i] = True
    return free_features, indices
70
+
71
def get_input_constraints(args, input_name, features, free_features):
    '''Get the input constraints for the VNN file.

    A free feature may vary by args.epsilon around its current value, with
    the resulting interval clipped to [0, 1]. Every other feature is pinned
    to its current value by equal lower and upper bounds.

    Args:
        args: Object exposing the attribute epsilon.
        input_name: Prefix used for the input variables.
        features: Feature values, indexed as features[0, i].
        free_features: Iterable of booleans, one per feature, that is True
            for the free features.

    Returns:
        A section title followed by a ">=" and a "<=" assertion per feature.
    '''
    parts = ['\n; Input constraints:\n\n']
    for i, free in enumerate(free_features):
        value = features[0, i]
        if free:
            # Only interval bounds are emitted; nothing forces the variable
            # to take exactly one of the two end points.
            lower = max(0, value - args.epsilon)
            upper = min(1, value + args.epsilon)
        else:
            lower = upper = value
        parts.append(f'(assert (>= {input_name}_{i} {lower}))\n')
        parts.append(f'(assert (<= {input_name}_{i} {upper}))\n')
    return ''.join(parts)
87
+
88
def get_output_constraints(output_name, predicted_label):
    '''Get the output constraints for the VNN file.

    The output at the predicted label is bounded to [0.0, 0.45] and the other
    output to [0.55, 1.0], i.e. the constraints describe outputs for which
    the prediction is flipped. Any label other than 1 is handled as label 0.
    NOTE(review): presumably a satisfying assignment is then a counterexample
    to robustness - confirm against the verifier being used.

    Args:
        output_name: Prefix used for the output variables.
        predicted_label: Label predicted by the model for the original input.

    Returns:
        A section title followed by a ">=" and a "<=" assertion for each of
        the two output variables.
    '''
    # Bounds are kept as text so they are emitted exactly as written here.
    high, low = ('0.55', '1.0'), ('0.0', '0.45')
    bounds = (high, low) if predicted_label == 1 else (low, high)
    lines = ['\n; Output constraints:\n']
    for index, (lower, upper) in enumerate(bounds):
        lines.append(f'(assert (>= {output_name}_{index} {lower}))')
        lines.append(f'(assert (<= {output_name}_{index} {upper}))')
    return '\n'.join(lines) + '\n'
102
+
103
def load_configuration(configuration_filepath: str) -> dict:
    '''Load a JSON configuration file.

    Args:
        configuration_filepath: Path of the JSON file to read.

    Returns:
        The decoded content of the file.
    '''
    # JSON is exchanged as UTF-8; do not depend on the locale's default.
    with open(configuration_filepath, "r", encoding="utf-8") as configuration_file:
        return json.load(configuration_file)
107
+
108
class VNNLIBargs:
    '''Plain container holding the settings consumed by create_vnnlib.

    The attributes carry the same names as the command line arguments of
    this script, so an instance can be passed wherever the parsed arguments
    are expected.
    '''

    def __init__(self, input_file, model_path, config_file, feature_type, free, total_features, list_ff_indices, epsilon=1, output_file='out.vnnlib', seed=None):
        self.input_file = input_file            # Input binary file name
        self.model_path = model_path            # Path to the model .pth file
        self.config_file = config_file          # Model configuration file
        self.feature_type = feature_type        # Type of features to extract
        self.free = free                        # Number of free features
        self.total_features = total_features    # Total number of features
        self.list_ff_indices = list_ff_indices  # Free feature indices, or None
        self.epsilon = epsilon                  # Input epsilon variation
        self.output_file = output_file          # Output file name
        self.seed = seed                        # Random seed, or None

    def __repr__(self):
        fields = ', '.join(f'{name}={value!r}' for name, value in vars(self).items())
        return f'{type(self).__name__}({fields})'
120
+
121
def create_vnnlib(args, features, predicted_label):
    '''Write the VNN file describing the property for one input.

    Args:
        args: Object exposing the settings (parsed command line arguments or
            a VNNLIBargs instance); args.output_file is the file written and
            args.seed seeds NumPy's global random generator.
        features: Feature values of the input, indexed as features[0, i].
        predicted_label: Label predicted by the model for the input.
    '''
    input_name, output_name = "X", "Y"
    # Seed the global generator used when the free features are drawn.
    np.random.seed(args.seed)
    free_features, free_features_indices = select_free_features(args, features)
    # Build every section before opening the file, so that a failure while
    # generating the text does not leave a truncated output file behind.
    sections = [
        get_header(args, input_name, output_name, free_features_indices),
        get_input_vars(args, input_name),
        get_output_vars(output_name),
        get_input_constraints(args, input_name, features, free_features),
        get_output_constraints(output_name, predicted_label),
    ]
    with open(args.output_file, 'w', encoding='utf-8') as output_file:
        output_file.writelines(sections)
131
+
132
+
133
+# Main
134
+
135
if __name__ == '__main__':
    # Parse arguments
    parser = argparse.ArgumentParser(description='Generates data.')
    # Positional arguments
    parser.add_argument('input_file', type=str, help='Input binary file name')
    parser.add_argument('model_path', type=str, help='Path to the model .pth file')
    parser.add_argument('config_file', type=str, help='Configuration file containing the hyperparameters of the model')
    parser.add_argument('feature_type', type=str, help='Type of features to extract. Select one of the following: {BooleanBigrams, Bigrams}')
    parser.add_argument('free', type=int, help='Number of free features')
    parser.add_argument('total_features', type=int, help='Total number of features')
    # Optional arguments
    parser.add_argument('-l', '--list_ff_indices', nargs='+', default=None, type=int, help='List of free feature indices (default: None)', dest='list_ff_indices')
    parser.add_argument('-e', '--epsilon', default=1, type=int, help='Input epsilon variation (default: 1)', dest='epsilon')
    parser.add_argument('-o', '--output_file', default='out.vnnlib', type=str, help='output file name (default: out.vnnlib)', dest='output_file')
    parser.add_argument('-s', '--seed', default=None, type=int, help='Random seed', dest='seed')
    args = parser.parse_args()

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device: ", device)

    configuration = load_configuration(args.config_file)

    # Load feature selector
    if "feature_selector" in configuration:
        # Resolve the selector from the location of this script rather than
        # from the current working directory, so the script can be launched
        # from anywhere.
        project_root = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
        selector_path = os.path.join(
            project_root, 'boolean_classifier', 'data', 'BODMAS',
            'feature_selectors', 'boolean_bigrams',
            'boolean_bigrams_feature_selector_k=1000.pkl')
        # NOTE(review): joblib.load unpickles the file; only load selectors
        # that come from a trusted source.
        feature_selector = joblib.load(selector_path)
    else:
        feature_selector = None

    # Load model
    model = FFNN(configuration)
    model = model.to(device)
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only
    # host (and the other way round).
    model.load_state_dict(torch.load(args.model_path, map_location=device, weights_only=True))
    model.eval()

    with open(args.input_file, "rb") as f:
        bytez = f.read()

    # Extract features
    if args.feature_type == "BooleanBigrams":
        feature_extractor = BooleanNGramFeatureExtractor(N=2)
        sparse_features = feature_extractor.feature_vector(bytez)
        features = sparse_features.todense()
    elif args.feature_type == "Bigrams":
        feature_extractor = NGramFeatureExtractor(N=2)
        features = feature_extractor.feature_vector(bytez)
    else:
        raise NotImplementedError("Select one of the following: {BooleanBigrams, Bigrams}")

    if feature_selector is not None:
        features = feature_selector.transform(torch.Tensor(features))

    # Classify the input; the predicted label determines the output
    # constraints written to the VNN file.
    x = torch.tensor(features, dtype=torch.float).to(device)
    probs = model.predict(x)
    y_pred = probs.argmax(dim=1)
    print("Predicted label: ", y_pred, probs)

    create_vnnlib(args, features, y_pred[0].item())
193
+

Powered by TurnKey Linux.