josep
/
OpenData


			
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
							import argparse
import torch
import os
import sys
sys.path.append("../")
from boolean_classifier.architectures.ffnn import FFNN
import json
from boolean_classifier.feature_extractors.boolean_ngram_feature_extractor import BooleanNGramFeatureExtractor
from boolean_classifier.feature_extractors.ngram_feature_extractor import NGramFeatureExtractor
import joblib
import numpy as np


def load_configuration(configuration_filepath: str) -> dict:
    """Read a JSON configuration file and return its parsed content.

    Args:
        configuration_filepath: Path to the JSON file to load.

    Returns:
        The object produced by ``json.load`` for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default
    # locale encoding, which would mis-decode non-ASCII values on some systems.
    with open(configuration_filepath, "r", encoding="utf-8") as configuration_file:
        return json.load(configuration_file)

if __name__ == "__main__":
    # Command-line entry point: load the configuration and the trained FFNN,
    # extract byte n-gram features from a single file, and print the model's
    # predicted label together with the output of `model.predict`.
    parser = argparse.ArgumentParser(description='Classify a single file with boolean malware detector')
    parser.add_argument("exe_filepath",
                        type=str,
                        help="Filepath of the executable"
                        )
    # `choices` rejects unsupported feature types at parse time, before the
    # model is loaded and the executable is read.
    parser.add_argument("feature_type",
                        type=str,
                        choices=("BooleanBigrams", "Bigrams"),
                        help="Type of features to extract. Select one of the following: {BooleanBigrams, Bigrams}")
    parser.add_argument("configuration_file",
                        type=str,
                        help="Configuration file containing the hyperparameters of the model"
                        )
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device: ", device)

    configuration = load_configuration(args.configuration_file)
    # The feature selector is optional and only used when the configuration
    # provides a path to it.
    # NOTE(security): joblib.load unpickles the file and can execute arbitrary
    # code; only use configuration files that point to trusted artifacts.
    if "feature_selector" in configuration:
        feature_selector = joblib.load(configuration["feature_selector"])
    else:
        feature_selector = None

    # Load model
    model = FFNN(configuration)
    model = model.to(device)
    weights_path = os.path.join(configuration["model_path"], "model.pth")
    # map_location remaps the stored tensors onto the selected device, so a
    # checkpoint saved on a GPU can still be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(weights_path, map_location=device, weights_only=True))
    model.eval()

    with open(args.exe_filepath, "rb") as f:
        bytez = f.read()

    if args.feature_type == "BooleanBigrams":
        feature_extractor = BooleanNGramFeatureExtractor(N=2)
        sparse_features = feature_extractor.feature_vector(bytez)
        # The boolean extractor's result is densified before further use.
        features = sparse_features.todense()
    else:
        # "Bigrams": the only other value accepted by the argument parser.
        feature_extractor = NGramFeatureExtractor(N=2)
        features = feature_extractor.feature_vector(bytez)

    if feature_selector is not None:
        # NOTE(review): the selector is a project artifact; presumably it
        # accepts array-like input -- confirm against the training pipeline.
        features = feature_selector.transform(torch.Tensor(features))
    x = torch.tensor(features, dtype=torch.float).to(device)
    # Inference only: gradients are not needed, so skip autograd tracking.
    with torch.no_grad():
        probs = model.predict(x)
    y_pred = probs.argmax(dim=1)
    print("Predicted label: ", y_pred, probs)