|
|
@@ -0,0 +1,567 @@
|
|
|
1
|
+#!/usr/bin/python
|
|
|
2
|
+''' Extracts some basic features from PE files. Many of the features
|
|
|
3
|
+implemented have been used in previously published works. For more information,
|
|
|
4
|
+check out the following resources:
|
|
|
5
|
+* Schultz, et al., 2001: http://128.59.14.66/sites/default/files/binaryeval-ieeesp01.pdf
|
|
|
6
|
+* Kolter and Maloof, 2006: http://www.jmlr.org/papers/volume7/kolter06a/kolter06a.pdf
|
|
|
7
|
+* Shafiq et al., 2009: https://www.researchgate.net/profile/Fauzan_Mirza/publication/242084613_A_Framework_for_Efficient_Mining_of_Structural_Information_to_Detect_Zero-Day_Malicious_Portable_Executables/links/0c96052e191668c3d5000000.pdf
|
|
|
8
|
+* Raman, 2012: http://2012.infosecsouthwest.com/files/speaker_materials/ISSW2012_Selecting_Features_to_Classify_Malware.pdf
|
|
|
9
|
+* Saxe and Berlin, 2015: https://arxiv.org/pdf/1508.03096.pdf
|
|
|
10
|
+
|
|
|
11
|
+It may be useful to do feature selection to reduce this set of features to a meaningful set
|
|
|
12
|
+for your modeling problem.
|
|
|
13
|
+'''
|
|
|
14
|
+
|
|
|
15
|
+import hashlib
|
|
|
16
|
+import json
|
|
|
17
|
+import os
|
|
|
18
|
+import re
|
|
|
19
|
+
|
|
|
20
|
+import lief
|
|
|
21
|
+import numpy as np
|
|
|
22
|
+from sklearn.feature_extraction import FeatureHasher
|
|
|
23
|
+
|
|
|
24
|
# Major/minor components of the installed lief version, kept as strings because
# other code in this module converts them with int(). Star-unpacking tolerates any
# number of trailing components (patch level, commit suffix, extra dotted tags),
# where a fixed three-name unpack raises ValueError.
LIEF_MAJOR, LIEF_MINOR, *_ = lief.__version__.split('.')
_LIEF_VERSION = (int(LIEF_MAJOR), int(LIEF_MINOR))
# True for lief >= 0.10: exported functions are objects read through `.name`
LIEF_EXPORT_OBJECT = _LIEF_VERSION >= (0, 10)
# True for lief >= 0.11: the signature flag is read from `has_signatures`
LIEF_HAS_SIGNATURE = _LIEF_VERSION >= (0, 11)
|
|
|
27
|
+
|
|
|
28
|
+
|
|
|
29
|
class FeatureType(object):
    ''' Base class from which each feature type may inherit.

    Subclasses set `name` and `dim` and override `raw_features` and
    `process_raw_features`; `feature_vector` chains the two.
    '''

    name = ''  # label shown by __repr__
    dim = 0  # declared length of the vector built by process_raw_features

    def __repr__(self):
        return '{}({})'.format(self.name, self.dim)

    def raw_features(self, bytez, lief_binary):
        ''' Generate a JSON-able representation of the file '''
        raise NotImplementedError('{}.raw_features is not implemented'.format(type(self).__name__))

    def process_raw_features(self, raw_obj):
        ''' Generate a feature vector from the raw features '''
        raise NotImplementedError('{}.process_raw_features is not implemented'.format(type(self).__name__))

    def feature_vector(self, bytez, lief_binary):
        ''' Directly calculate the feature vector from the sample itself. This should only be implemented differently
        if there are significant speedups to be gained from combining the two functions. '''
        return self.process_raw_features(self.raw_features(bytez, lief_binary))
|
|
|
50
|
+
|
|
|
51
|
+
|
|
|
52
|
class ByteHistogram(FeatureType):
    ''' Byte histogram over the entire binary file. The raw features are the
    plain counts; the feature vector is the counts normalized to sum to one.
    '''

    name = 'histogram'
    dim = 256

    def __init__(self):
        super().__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Return a 256-entry list holding the number of occurrences of each byte value.
        `lief_binary` is not used. '''
        counts = np.bincount(np.frombuffer(bytez, dtype=np.uint8), minlength=256)
        return counts.tolist()

    def process_raw_features(self, raw_obj):
        ''' Return the counts as a float32 array divided by their total '''
        counts = np.array(raw_obj, dtype=np.float32)
        total = counts.sum()
        if total == 0:
            # empty input: return the all-zero histogram instead of 0/0 = NaN
            return counts
        return counts / total
|
|
|
70
|
+
|
|
|
71
|
+
|
|
|
72
|
class ByteEntropyHistogram(FeatureType):
    ''' 2d byte/entropy histogram based loosely on (Saxe and Berlin, 2015).
    This roughly approximates the joint probability of byte value and local entropy.
    See Section 2.1.1 in https://arxiv.org/pdf/1508.03096.pdf for more info.
    '''

    name = 'byteentropy'
    dim = 256

    def __init__(self, step=1024, window=2048):
        ''' `window` is the block length in bytes; `step` is the distance between block starts '''
        super().__init__()
        self.window = window
        self.step = step

    def _entropy_bin_counts(self, block):
        ''' Return (entropy bin in 0..15, 16-bin coarse byte histogram) for one block '''
        # coarse histogram, 16 bytes per bin
        c = np.bincount(block >> 4, minlength=16)  # 16-bin histogram
        # the divisor is the configured window length, also when the block is shorter than that
        p = c.astype(np.float32) / self.window
        wh = np.where(c)[0]  # only non-empty bins, so log2 never sees zero
        H = np.sum(-p[wh] * np.log2(
            p[wh])) * 2  # * x2 b.c. we reduced information by half: 256 bins (8 bits) to 16 bins (4 bits)

        Hbin = int(H * 2)  # up to 16 bins (max entropy is 8 bits)
        if Hbin == 16:  # handle entropy = 8.0 bits
            Hbin = 15

        return Hbin, c

    def raw_features(self, bytez, lief_binary):
        ''' Return the flattened 16x16 table (row = entropy bin, column = coarse byte bin)
        of counts accumulated over all blocks. `lief_binary` is not used. '''
        output = np.zeros((16, 16), dtype=int)
        a = np.frombuffer(bytez, dtype=np.uint8)
        if a.shape[0] < self.window:
            # input shorter than one window: treat the whole input as a single block
            Hbin, c = self._entropy_bin_counts(a)
            output[Hbin, :] += c
        else:
            # strided trick from here: http://www.rigtorp.se/2011/01/01/rolling-statistics-numpy.html
            shape = a.shape[:-1] + (a.shape[-1] - self.window + 1, self.window)
            strides = a.strides + (a.strides[-1],)
            blocks = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::self.step, :]

            # from the blocks, compute histogram
            for block in blocks:
                Hbin, c = self._entropy_bin_counts(block)
                output[Hbin, :] += c

        return output.flatten().tolist()

    def process_raw_features(self, raw_obj):
        ''' Return the counts as a float32 array divided by their total '''
        counts = np.array(raw_obj, dtype=np.float32)
        total = counts.sum()
        if total == 0:
            # empty input: return the all-zero table instead of 0/0 = NaN
            return counts
        return counts / total
|
|
|
124
|
+
|
|
|
125
|
+
|
|
|
126
|
class SectionInfo(FeatureType):
    ''' Information about section names, sizes and entropy. Uses hashing trick
    to summarize all this section info into a feature vector.
    '''

    name = 'section'
    # 5 general counts, then five hashed groups of 50: sizes, entropies, virtual
    # sizes, entry section name, entry section characteristics
    dim = 5 + 50 + 50 + 50 + 50 + 50

    def __init__(self):
        super().__init__()

    @staticmethod
    def _properties(s):
        ''' Return the characteristic names of section `s` with any dotted enum prefix removed '''
        return [str(c).split('.')[-1] for c in s.characteristics_lists]

    @staticmethod
    def _hashed(sample, input_type):
        ''' Hash one sample (a list of strings or of (name, value) pairs) into 50 dense values '''
        return FeatureHasher(50, input_type=input_type).transform([sample]).toarray()[0]

    def raw_features(self, bytez, lief_binary):
        ''' Return {"entry": name of the entry section, "sections": per-section
        name/size/entropy/vsize/props}. `bytez` is not used. '''
        if lief_binary is None:
            return {"entry": "", "sections": []}

        # properties of entry point, or if invalid, the first executable section
        not_found_error_class = RuntimeError if not lief.__version__.startswith("0.9.0") else lief.not_found
        try:
            if int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 12):
                section = lief_binary.section_from_rva(lief_binary.entrypoint - lief_binary.imagebase)

                if section is None:
                    # route a missing section through the same fallback as a lookup error
                    raise not_found_error_class
                entry_section = section.name
            else:  # lief < 0.12
                entry_section = lief_binary.section_from_offset(lief_binary.entrypoint).name
        except not_found_error_class:
            # bad entry point, let's find the first executable section
            entry_section = ""
            mem_execute_characteristics = lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE if lief.__version__.startswith("0.9.0") else lief.PE.Section.CHARACTERISTICS.MEM_EXECUTE
            for s in lief_binary.sections:
                if mem_execute_characteristics in s.characteristics_lists:
                    entry_section = s.name
                    break

        raw_obj = {"entry": entry_section}
        raw_obj["sections"] = [{
            'name': s.name,
            'size': s.size,
            'entropy': s.entropy,
            'vsize': s.virtual_size,
            'props': self._properties(s)
        } for s in lief_binary.sections]
        return raw_obj

    def process_raw_features(self, raw_obj):
        ''' Return the float32 vector: five general counts followed by the five hashed groups '''
        sections = raw_obj['sections']
        general = [
            len(sections),  # total number of sections
            # number of sections with zero size
            sum(1 for s in sections if s['size'] == 0),
            # number of sections with an empty name
            sum(1 for s in sections if s['name'] == ""),
            # number of RX
            sum(1 for s in sections if 'MEM_READ' in s['props'] and 'MEM_EXECUTE' in s['props']),
            # number of W
            sum(1 for s in sections if 'MEM_WRITE' in s['props'])
        ]
        # gross characteristics of each section
        section_sizes_hashed = self._hashed([(s['name'], s['size']) for s in sections], "pair")
        section_entropy_hashed = self._hashed([(s['name'], s['entropy']) for s in sections], "pair")
        section_vsize_hashed = self._hashed([(s['name'], s['vsize']) for s in sections], "pair")
        entry_name_hashed = self._hashed([raw_obj['entry']], "string")
        # properties of every section whose name equals the entry section name
        characteristics = [p for s in sections for p in s['props'] if s['name'] == raw_obj['entry']]
        characteristics_hashed = self._hashed(characteristics, "string")

        return np.hstack([
            general, section_sizes_hashed, section_entropy_hashed, section_vsize_hashed, entry_name_hashed,
            characteristics_hashed
        ]).astype(np.float32)
|
|
|
203
|
+
|
|
|
204
|
+
|
|
|
205
|
class ImportsInfo(FeatureType):
    ''' Information about imported libraries and functions from the
    import address table. Note that the total number of imported
    functions is contained in GeneralFileInfo.
    '''

    name = 'imports'
    dim = 1280  # 256 hashed library names + 1024 hashed "library:function" names

    def __init__(self):
        super().__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Return a dict mapping each imported library name to the list of its
        imported entries. `bytez` is not used. '''
        imports = {}
        if lief_binary is None:
            return imports

        for lib in lief_binary.imports:
            # libraries can be duplicated in listing, extend instead of overwrite
            entries = imports.setdefault(lib.name, [])

            # Clipping assumes there are diminishing returns on the discriminatory power of imported functions
            # beyond the first 10000 characters, and this will help limit the dataset size
            for entry in lib.entries:
                if entry.is_ordinal:
                    entries.append("ordinal" + str(entry.ordinal))
                else:
                    entries.append(entry.name[:10000])

        return imports

    def process_raw_features(self, raw_obj):
        ''' Return the float32 vector: hashed library names followed by hashed imported functions '''
        # unique libraries
        libraries = list({lib.lower() for lib in raw_obj})
        libraries_hashed = FeatureHasher(256, input_type="string").transform([libraries]).toarray()[0]

        # A string like "kernel32.dll:CreateFileMappingA" for each imported function
        imports = [lib.lower() + ':' + e for lib, elist in raw_obj.items() for e in elist]
        imports_hashed = FeatureHasher(1024, input_type="string").transform([imports]).toarray()[0]

        # Two separate elements: libraries (alone) and fully-qualified names of imported functions
        return np.hstack([libraries_hashed, imports_hashed]).astype(np.float32)
|
|
|
247
|
+
|
|
|
248
|
+
|
|
|
249
|
class ExportsInfo(FeatureType):
    ''' Information about exported functions. Note that the total number of exported
    functions is contained in GeneralFileInfo.
    '''

    name = 'exports'
    dim = 128

    def __init__(self):
        super().__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Return the list of exported function names, each clipped to 10000
        characters. `bytez` is not used. '''
        if lief_binary is None:
            return []

        # Clipping assumes there are diminishing returns on the discriminatory power of exports beyond
        # the first 10000 characters, and this will help limit the dataset size
        if LIEF_EXPORT_OBJECT:
            # export is an object with .name attribute (0.10.0 and later)
            clipped_exports = [export.name[:10000] for export in lief_binary.exported_functions]
        else:
            # export is a string (LIEF 0.9.0 and earlier)
            clipped_exports = [export[:10000] for export in lief_binary.exported_functions]

        return clipped_exports

    def process_raw_features(self, raw_obj):
        ''' Return the export names hashed into a 128-entry float32 vector '''
        exports_hashed = FeatureHasher(128, input_type="string").transform([raw_obj]).toarray()[0]
        return exports_hashed.astype(np.float32)
|
|
|
278
|
+
|
|
|
279
|
+
|
|
|
280
|
class GeneralFileInfo(FeatureType):
    ''' General information about the file '''

    name = 'general'
    dim = 10

    def __init__(self):
        super().__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Return a dict of ten size, count and flag values. When `lief_binary`
        is None, only 'size' is filled in and every other entry is 0. '''
        if lief_binary is None:
            return {
                'size': len(bytez),
                'vsize': 0,
                'has_debug': 0,
                'exports': 0,
                'imports': 0,
                'has_relocations': 0,
                'has_resources': 0,
                'has_signature': 0,
                'has_tls': 0,
                'symbols': 0
            }

        return {
            'size': len(bytez),
            'vsize': lief_binary.virtual_size,
            'has_debug': int(lief_binary.has_debug),
            'exports': len(lief_binary.exported_functions),
            'imports': len(lief_binary.imported_functions),
            'has_relocations': int(lief_binary.has_relocations),
            'has_resources': int(lief_binary.has_resources),
            # the attribute is named `has_signatures` from lief 0.11 on
            'has_signature': int(lief_binary.has_signatures) if LIEF_HAS_SIGNATURE else int(lief_binary.has_signature),
            'has_tls': int(lief_binary.has_tls),
            'symbols': len(lief_binary.symbols),
        }

    def process_raw_features(self, raw_obj):
        ''' Return the ten raw values, in a fixed order, as a float32 array '''
        return np.asarray([
            raw_obj['size'], raw_obj['vsize'], raw_obj['has_debug'], raw_obj['exports'], raw_obj['imports'],
            raw_obj['has_relocations'], raw_obj['has_resources'], raw_obj['has_signature'], raw_obj['has_tls'],
            raw_obj['symbols']
        ],
                          dtype=np.float32)
|
|
|
324
|
+
|
|
|
325
|
+
|
|
|
326
|
class HeaderFileInfo(FeatureType):
    ''' Machine, architecture, OS, linker and other information extracted from header '''

    name = 'header'
    dim = 62  # timestamp + five hashed groups of 10 + eleven numeric optional-header fields

    # Numeric optional-header fields, in the order they appear in both the raw
    # object and the feature vector.
    _numeric_optional_fields = (
        'major_image_version',
        'minor_image_version',
        'major_linker_version',
        'minor_linker_version',
        'major_operating_system_version',
        'minor_operating_system_version',
        'major_subsystem_version',
        'minor_subsystem_version',
        'sizeof_code',
        'sizeof_headers',
        'sizeof_heap_commit',
    )

    def __init__(self):
        super().__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Return {'coff': {...}, 'optional': {...}} read from the parsed headers.
        When `lief_binary` is None every entry keeps its empty default (0, "" or []).
        `bytez` is not used. '''
        raw_obj = {}
        raw_obj['coff'] = {'timestamp': 0, 'machine': "", 'characteristics': []}
        raw_obj['optional'] = {
            'subsystem': "",
            'dll_characteristics': [],
            'magic': "",
        }
        for field in self._numeric_optional_fields:
            raw_obj['optional'][field] = 0
        if lief_binary is None:
            return raw_obj

        header = lief_binary.header
        optional_header = lief_binary.optional_header

        # enum values are stored as their bare member name (text after the last '.')
        raw_obj['coff']['timestamp'] = header.time_date_stamps
        raw_obj['coff']['machine'] = str(header.machine).split('.')[-1]
        raw_obj['coff']['characteristics'] = [str(c).split('.')[-1] for c in header.characteristics_list]
        raw_obj['optional']['subsystem'] = str(optional_header.subsystem).split('.')[-1]
        raw_obj['optional']['dll_characteristics'] = [
            str(c).split('.')[-1] for c in optional_header.dll_characteristics_lists
        ]
        raw_obj['optional']['magic'] = str(optional_header.magic).split('.')[-1]
        for field in self._numeric_optional_fields:
            raw_obj['optional'][field] = getattr(optional_header, field)
        return raw_obj

    def process_raw_features(self, raw_obj):
        ''' Return the float32 vector: timestamp, five 10-wide hashed groups
        (machine, COFF characteristics, subsystem, DLL characteristics, magic),
        then the numeric optional-header fields '''
        coff = raw_obj['coff']
        optional = raw_obj['optional']
        return np.hstack([
            coff['timestamp'],
            FeatureHasher(10, input_type="string").transform([[coff['machine']]]).toarray()[0],
            FeatureHasher(10, input_type="string").transform([coff['characteristics']]).toarray()[0],
            FeatureHasher(10, input_type="string").transform([[optional['subsystem']]]).toarray()[0],
            FeatureHasher(10, input_type="string").transform([optional['dll_characteristics']]).toarray()[0],
            FeatureHasher(10, input_type="string").transform([[optional['magic']]]).toarray()[0],
            *(optional[field] for field in self._numeric_optional_fields),
        ]).astype(np.float32)
|
|
|
400
|
+
|
|
|
401
|
+
|
|
|
402
|
class StringExtractor(FeatureType):
    ''' Extracts strings from raw byte stream '''

    name = 'strings'
    # numstrings, avlength, printables, 96-bin character distribution, entropy, paths, urls, registry, MZ
    dim = 1 + 1 + 1 + 96 + 1 + 1 + 1 + 1 + 1

    def __init__(self):
        super().__init__()
        # all consecutive runs of 0x20 - 0x7f that are 5+ characters
        self._allstrings = re.compile(b'[\x20-\x7f]{5,}')
        # occurrences of the string 'C:\'. Not actually extracting the path
        self._paths = re.compile(b'c:\\\\', re.IGNORECASE)
        # occurrences of http:// or https://. Not actually extracting the URLs
        self._urls = re.compile(b'https?://', re.IGNORECASE)
        # occurrences of the string prefix HKEY_. Not actually extracting registry names
        self._registry = re.compile(b'HKEY_')
        # crude evidence of an MZ header (dropper?) somewhere in the byte stream
        self._mz = re.compile(b'MZ')

    def raw_features(self, bytez, lief_binary):
        ''' Return a dict of string statistics and pattern counts computed on
        `bytez`. `lief_binary` is not used. '''
        allstrings = self._allstrings.findall(bytez)
        if allstrings:
            # statistics about strings:
            string_lengths = [len(s) for s in allstrings]
            avlength = sum(string_lengths) / len(string_lengths)
            # map printable characters 0x20 - 0x7f to an int array consisting of 0-95, inclusive
            # (every matched byte is >= 0x20, so the unsigned subtraction cannot wrap)
            as_shifted_string = np.frombuffer(b''.join(allstrings), dtype=np.uint8) - 0x20
            c = np.bincount(as_shifted_string, minlength=96)  # histogram count
            # distribution of characters in printable strings
            csum = c.sum()
            p = c.astype(np.float32) / csum
            wh = np.where(c)[0]
            H = np.sum(-p[wh] * np.log2(p[wh]))  # entropy
        else:
            avlength = 0
            c = np.zeros((96,), dtype=np.float32)
            H = 0
            csum = 0

        return {
            'numstrings': len(allstrings),
            'avlength': avlength,
            'printabledist': c.tolist(),  # store non-normalized histogram
            'printables': int(csum),
            'entropy': float(H),
            'paths': len(self._paths.findall(bytez)),
            'urls': len(self._urls.findall(bytez)),
            'registry': len(self._registry.findall(bytez)),
            'MZ': len(self._mz.findall(bytez))
        }

    def process_raw_features(self, raw_obj):
        ''' Return the float32 vector; the character histogram is divided by the
        number of printable characters (or by 1 when there are none) '''
        hist_divisor = float(raw_obj['printables']) if raw_obj['printables'] > 0 else 1.0
        return np.hstack([
            raw_obj['numstrings'], raw_obj['avlength'], raw_obj['printables'],
            np.asarray(raw_obj['printabledist']) / hist_divisor, raw_obj['entropy'], raw_obj['paths'], raw_obj['urls'],
            raw_obj['registry'], raw_obj['MZ']
        ]).astype(np.float32)
|
|
|
460
|
+
|
|
|
461
|
+
|
|
|
462
|
class DataDirectories(FeatureType):
    ''' Extracts size and virtual address of the first 15 data directories '''

    name = 'datadirectories'
    dim = 15 * 2

    def __init__(self):
        super().__init__()
        # only the length of this list is used here: it fixes how many
        # directories contribute to the feature vector
        self._name_order = [
            "EXPORT_TABLE", "IMPORT_TABLE", "RESOURCE_TABLE", "EXCEPTION_TABLE", "CERTIFICATE_TABLE",
            "BASE_RELOCATION_TABLE", "DEBUG", "ARCHITECTURE", "GLOBAL_PTR", "TLS_TABLE", "LOAD_CONFIG_TABLE",
            "BOUND_IMPORT", "IAT", "DELAY_IMPORT_DESCRIPTOR", "CLR_RUNTIME_HEADER"
        ]

    def raw_features(self, bytez, lief_binary):
        ''' Return one {"name", "size", "virtual_address"} dict per data directory,
        in the order lief lists them. `bytez` is not used. '''
        output = []
        if lief_binary is None:
            return output

        for data_directory in lief_binary.data_directories:
            output.append({
                "name": str(data_directory.type).replace("DATA_DIRECTORY.", ""),
                "size": data_directory.size,
                "virtual_address": data_directory.rva
            })
        return output

    def process_raw_features(self, raw_obj):
        ''' Return a float32 vector of interleaved (size, virtual address) pairs.
        Entries are taken by position, not by name; missing entries stay 0. '''
        slots = len(self._name_order)
        features = np.zeros(2 * slots, dtype=np.float32)
        for i, entry in enumerate(raw_obj[:slots]):
            features[2 * i] = entry["size"]
            features[2 * i + 1] = entry["virtual_address"]
        return features
|
|
|
496
|
+
|
|
|
497
|
+
|
|
|
498
|
class EMBERFeatureExtractor(object):
    ''' Extract useful features from a PE file, and return as a vector of fixed size. '''

    def __init__(self, feature_version=2, print_feature_warning=True, features_file=''):
        ''' Select the feature types to compute.

        feature_version: 1 or 2; version 2 additionally appends DataDirectories.
        print_feature_warning: print a notice when the installed lief version is
            not the one the feature version was computed with.
        features_file: optional path to a JSON file whose 'features' list names the
            feature classes to use; every feature type is used when the path does not exist.

        Raises ValueError when feature_version is neither 1 nor 2.
        '''
        self.features = []
        features = {
            'ByteHistogram': ByteHistogram(),
            'ByteEntropyHistogram': ByteEntropyHistogram(),
            'StringExtractor': StringExtractor(),
            'GeneralFileInfo': GeneralFileInfo(),
            'HeaderFileInfo': HeaderFileInfo(),
            'SectionInfo': SectionInfo(),
            'ImportsInfo': ImportsInfo(),
            'ExportsInfo': ExportsInfo()
        }

        if os.path.exists(features_file):
            with open(features_file, encoding='utf8') as f:
                x = json.load(f)
                # names not present in the table above are skipped silently
                self.features = [features[feature] for feature in x['features'] if feature in features]
        else:
            self.features = list(features.values())

        if feature_version == 1:
            if not lief.__version__.startswith("0.8.3"):
                if print_feature_warning:
                    self._print_lief_warning(
                        "WARNING: EMBER feature version 1 were computed using lief version 0.8.3-18d5b75")
        elif feature_version == 2:
            self.features.append(DataDirectories())
            if not lief.__version__.startswith("0.9.0"):
                if print_feature_warning:
                    self._print_lief_warning(
                        "WARNING: EMBER feature version 2 were computed using lief version 0.9.0-")
        else:
            # ValueError is still an Exception, so existing handlers keep working
            raise ValueError(f"EMBER feature version must be 1 or 2. Not {feature_version}")
        self.dim = sum([fe.dim for fe in self.features])

    @staticmethod
    def _print_lief_warning(first_line):
        ''' Print the three-line lief version mismatch notice, starting with `first_line` '''
        print(first_line)
        print(
            f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies")
        print("WARNING: in the feature calculations.")

    def raw_features(self, bytez):
        ''' Parse `bytez` with lief and return a dict holding the sha256 of the
        bytes plus each feature type's raw features keyed by its name. When
        parsing raises one of the listed lief errors, the error is printed and
        the feature types receive None in place of the parsed binary. '''
        if lief.__version__.startswith("0.9.0"):
            lief_errors = (
                lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, lief.read_out_of_bound, RuntimeError)
        else:
            # NOTE(review): confirm that these lief.lief_errors members are exception
            # classes on the lief versions in use; an `except` clause only accepts
            # classes derived from BaseException.
            lief_errors = (
                lief.lief_errors.conversion_error, lief.lief_errors.file_error, lief.lief_errors.file_format_error,
                lief.lief_errors.corrupted, lief.lief_errors.parsing_error, lief.lief_errors.read_out_of_bound,
                RuntimeError)

        try:
            lief_binary = lief.PE.parse(list(bytez))
        except lief_errors as e:
            print("lief error: ", str(e))
            lief_binary = None
        # any other exception propagates to the caller unchanged

        features = {"sha256": hashlib.sha256(bytez).hexdigest()}
        features.update({fe.name: fe.raw_features(bytez, lief_binary) for fe in self.features})
        return features

    def process_raw_features(self, raw_obj):
        ''' Concatenate every feature type's vector into one float32 array '''
        feature_vectors = [fe.process_raw_features(raw_obj[fe.name]) for fe in self.features]
        return np.hstack(feature_vectors).astype(np.float32)

    def feature_vector(self, bytez):
        ''' Return the feature vector computed directly from the file bytes '''
        return self.process_raw_features(self.raw_features(bytez))
|