Research data available for everyone.

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. import argparse
  2. import torch
  3. import os
  4. import sys
  5. sys.path.append("../")
  6. from boolean_classifier.architectures.ffnn import FFNN
  7. import json
  8. from boolean_classifier.feature_extractors.boolean_ngram_feature_extractor import BooleanNGramFeatureExtractor
  9. from boolean_classifier.feature_extractors.ngram_feature_extractor import NGramFeatureExtractor
  10. import joblib
  11. import numpy as np
  12. def load_configuration(configuration_filepath: str) -> dict:
  13. with open(configuration_filepath, "r") as configuration_file:
  14. configuration = json.load(configuration_file)
  15. return configuration
  16. if __name__ == "__main__":
  17. parser = argparse.ArgumentParser(description='Classify a single file with boolean malware detector')
  18. parser.add_argument("exe_filepath",
  19. type=str,
  20. help="Filepath of the executable"
  21. )
  22. parser.add_argument("feature_type",
  23. type=str,
  24. help="Type of features to extract. Select one of the following: {BooleanBigrams, Bigrams}")
  25. parser.add_argument("configuration_file",
  26. type=str,
  27. help="Configuration file containing the hyperparameters of the model"
  28. )
  29. args = parser.parse_args()
  30. device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  31. print("Device: ", device)
  32. configuration = load_configuration(args.configuration_file)
  33. if "feature_selector" in configuration:
  34. feature_selector = joblib.load(configuration["feature_selector"])
  35. else:
  36. feature_selector = None
  37. # Load model
  38. model = FFNN(configuration)
  39. model = model.to(device)
  40. model.load_state_dict(torch.load(os.path.join(configuration["model_path"], "model.pth"), weights_only=True))
  41. model.eval()
  42. with open(args.exe_filepath, "rb") as f:
  43. bytez = f.read()
  44. if args.feature_type == "BooleanBigrams":
  45. feature_extractor = BooleanNGramFeatureExtractor(N=2)
  46. sparse_features = feature_extractor.feature_vector(bytez)
  47. features = sparse_features.todense()
  48. # print("feature vector: ", features)
  49. # zero_indices = np.where(features[0] == 0)[1]
  50. # print("Number of zero features: ", len(zero_indices))
  51. # print("Zero indices: ", zero_indices)
  52. # # Remove some items from zero_indices
  53. # if len(zero_indices) > 4000:
  54. # zero_indices = np.random.choice(zero_indices, size=4000, replace=False)
  55. # print("Zero indices after sampling: ", zero_indices)
  56. # for i in zero_indices:
  57. # features[0, i] = 1
  58. # zero_indices = np.where(features[0] == 0)[1]
  59. # print("Number of zero features: ", len(zero_indices))
  60. elif args.feature_type == "Bigrams":
  61. feature_extractor = NGramFeatureExtractor(N=2)
  62. features = feature_extractor.feature_vector(bytez)
  63. else:
  64. raise NotImplementedError("Select one of the following: {BooleanBigrams, Bigrams}")
  65. if feature_selector is not None:
  66. features = feature_selector.transform(torch.Tensor(features))
  67. x = torch.tensor(features, dtype=torch.float).to(device)
  68. probs = model.predict(x)
  69. y_pred = probs.argmax(dim=1)
  70. print("Predicted label: ", y_pred, probs)

Powered by TurnKey Linux.