| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980 |
- import argparse
- import torch
- import os
- import sys
- sys.path.append("../")
- from boolean_classifier.architectures.ffnn import FFNN
- import json
- from boolean_classifier.feature_extractors.boolean_ngram_feature_extractor import BooleanNGramFeatureExtractor
- from boolean_classifier.feature_extractors.ngram_feature_extractor import NGramFeatureExtractor
- import joblib
- import numpy as np
-
-
def load_configuration(configuration_filepath: str) -> dict:
    """Load the model configuration from a JSON file.

    Args:
        configuration_filepath: Path of the JSON configuration file.

    Returns:
        The object produced by parsing the file's JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8 by specification; pin the encoding so parsing does
    # not depend on the platform's locale default.
    with open(configuration_filepath, "r", encoding="utf-8") as configuration_file:
        return json.load(configuration_file)
-
def main(argv=None):
    """Classify a single executable with the boolean malware detector.

    Parses the command line, loads the configuration and the FFNN weights
    stored as ``model.pth`` under ``configuration["model_path"]``, extracts
    bigram features from the raw bytes of the executable, optionally applies
    the feature selector referenced by the configuration, and prints the
    predicted label together with the model output.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) makes argparse read the process arguments.

    Raises:
        NotImplementedError: If ``feature_type`` is neither
            ``BooleanBigrams`` nor ``Bigrams``.
    """
    parser = argparse.ArgumentParser(description='Classify a single file with boolean malware detector')
    parser.add_argument(
        "exe_filepath",
        type=str,
        help="Filepath of the executable",
    )
    parser.add_argument(
        "feature_type",
        type=str,
        help="Type of features to extract. Select one of the following: {BooleanBigrams, Bigrams}",
    )
    parser.add_argument(
        "configuration_file",
        type=str,
        help="Configuration file containing the hyperparameters of the model",
    )
    args = parser.parse_args(argv)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device: ", device)

    configuration = load_configuration(args.configuration_file)
    feature_selector = None
    if "feature_selector" in configuration:
        # NOTE(review): joblib.load unpickles the file, which can execute
        # arbitrary code; only point the configuration at trusted artifacts.
        feature_selector = joblib.load(configuration["feature_selector"])

    # Load model
    model = FFNN(configuration)
    model = model.to(device)
    weights_path = os.path.join(configuration["model_path"], "model.pth")
    # map_location lets a checkpoint saved from a CUDA device be restored
    # when this script has fallen back to the CPU.
    model.load_state_dict(torch.load(weights_path, map_location=device, weights_only=True))
    model.eval()

    with open(args.exe_filepath, "rb") as f:
        bytez = f.read()

    if args.feature_type == "BooleanBigrams":
        feature_extractor = BooleanNGramFeatureExtractor(N=2)
        sparse_features = feature_extractor.feature_vector(bytez)
        features = sparse_features.todense()
    elif args.feature_type == "Bigrams":
        feature_extractor = NGramFeatureExtractor(N=2)
        features = feature_extractor.feature_vector(bytez)
    else:
        raise NotImplementedError("Select one of the following: {BooleanBigrams, Bigrams}")

    if feature_selector is not None:
        # NOTE(review): the selector receives a torch.Tensor here; presumably
        # it converts array-likes itself -- confirm against the saved selector.
        features = feature_selector.transform(torch.Tensor(features))

    x = torch.tensor(features, dtype=torch.float).to(device)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        probs = model.predict(x)
    y_pred = probs.argmax(dim=1)
    print("Predicted label: ", y_pred, probs)


if __name__ == "__main__":
    main()
-
|