"""Classify a single executable file with the boolean malware detector.

Usage:
    python <this_script> EXE_FILEPATH FEATURE_TYPE CONFIGURATION_FILE

FEATURE_TYPE is one of ``BooleanBigrams`` or ``Bigrams``. CONFIGURATION_FILE
is a JSON file; this script reads the keys ``model_path`` (directory holding
``model.pth``) and, optionally, ``feature_selector`` (path of a joblib dump),
and passes the whole dictionary to ``FFNN``.
"""
import argparse
import json
import os
import sys

import joblib
import numpy as np
import torch

# Must run before the project imports below: it adds the parent of the
# current working directory to the module search path.
sys.path.append("../")

from boolean_classifier.architectures.ffnn import FFNN
from boolean_classifier.feature_extractors.boolean_ngram_feature_extractor import BooleanNGramFeatureExtractor
from boolean_classifier.feature_extractors.ngram_feature_extractor import NGramFeatureExtractor


def load_configuration(configuration_filepath: str) -> dict:
    """Read the JSON configuration file and return its parsed content.

    Args:
        configuration_filepath: Path of the JSON configuration file.

    Returns:
        The object decoded from the file (expected to be a dictionary).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the locale's default
    # encoding, which differs between platforms.
    with open(configuration_filepath, "r", encoding="utf-8") as configuration_file:
        return json.load(configuration_file)


def _extract_features(bytez: bytes, feature_type: str):
    """Build the bigram feature vector for the raw bytes of an executable.

    Args:
        bytez: Raw content of the executable.
        feature_type: ``"BooleanBigrams"`` or ``"Bigrams"``.

    Returns:
        For ``"BooleanBigrams"``, the densified result of the extractor's
        ``feature_vector``; for ``"Bigrams"``, whatever the extractor's
        ``feature_vector`` returns.

    Raises:
        NotImplementedError: If ``feature_type`` is not a supported value.
    """
    if feature_type == "BooleanBigrams":
        sparse_features = BooleanNGramFeatureExtractor(N=2).feature_vector(bytez)
        # The boolean extractor returns a sparse structure; densify it so it
        # can be turned into a tensor later.
        return sparse_features.todense()
    if feature_type == "Bigrams":
        return NGramFeatureExtractor(N=2).feature_vector(bytez)
    raise NotImplementedError("Select one of the following: {BooleanBigrams, Bigrams}")


def main() -> None:
    """Parse the command line, load the model and print the prediction."""
    parser = argparse.ArgumentParser(description='Classify a single file with boolean malware detector')
    parser.add_argument(
        "exe_filepath",
        type=str,
        help="Filepath of the executable",
    )
    parser.add_argument(
        "feature_type",
        type=str,
        help="Type of features to extract. Select one of the following: {BooleanBigrams, Bigrams}",
    )
    parser.add_argument(
        "configuration_file",
        type=str,
        help="Configuration file containing the hyperparameters of the model",
    )
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device: ", device)

    configuration = load_configuration(args.configuration_file)

    # The feature selector is optional.
    # NOTE(review): joblib.load unpickles the file and can execute arbitrary
    # code; only point "feature_selector" at files from a trusted source.
    if "feature_selector" in configuration:
        feature_selector = joblib.load(configuration["feature_selector"])
    else:
        feature_selector = None

    # Load model
    model = FFNN(configuration)
    model = model.to(device)
    # map_location remaps the stored tensors onto the device selected above,
    # so a checkpoint saved from a GPU can still be loaded on a CPU-only host.
    state_dict = torch.load(
        os.path.join(configuration["model_path"], "model.pth"),
        map_location=device,
        weights_only=True,
    )
    model.load_state_dict(state_dict)
    model.eval()

    with open(args.exe_filepath, "rb") as f:
        bytez = f.read()

    features = _extract_features(bytez, args.feature_type)

    if feature_selector is not None:
        features = feature_selector.transform(torch.Tensor(features))

    x = torch.tensor(features, dtype=torch.float).to(device)
    probs = model.predict(x)
    # argmax over dim 1 picks one label per row of the model output.
    y_pred = probs.argmax(dim=1)
    print("Predicted label: ", y_pred, probs)


if __name__ == "__main__":
    main()