# big_data_projet/06_extract_features_and_predict.py
#
# Listing metadata from the original file viewer: 221 lines, 8.4 KiB, Python.

import pefile
import os
import array
import math
import pickle
import joblib
import sys
import argparse
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
def get_entropy(data):
    """Return the Shannon entropy of *data*, in bits per symbol.

    Each element of ``data`` is counted in a 256-slot histogram: integers
    (as yielded by ``bytes``) are used directly as the slot index, anything
    else is converted with ``ord``. An empty input yields ``0.0``.
    """
    total = len(data)
    if total == 0:
        return 0.0

    # One counter per possible byte value.
    histogram = array.array('L', [0] * 256)
    for symbol in data:
        slot = symbol if isinstance(symbol, int) else ord(symbol)
        histogram[slot] += 1

    entropy = 0
    for count in histogram:
        if not count:
            # Symbols that never occur contribute nothing to the sum.
            continue
        probability = float(count) / total
        entropy -= probability * math.log(probability, 2)
    return entropy
def get_resources(pe):
    """Collect an ``[entropy, size]`` pair for every resource found in *pe*.

    The entries under ``pe.DIRECTORY_ENTRY_RESOURCE`` are walked three
    levels deep; entries without a ``directory`` attribute are skipped.
    For each innermost entry the bytes are read with ``pe.get_data`` and
    their entropy is computed with ``get_entropy``.

    Returns an empty list when *pe* has no ``DIRECTORY_ENTRY_RESOURCE``.
    The walk is best-effort: any exception stops it and the pairs gathered
    up to that point are returned.
    """
    collected = []
    if not hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
        return collected

    try:
        for type_entry in pe.DIRECTORY_ENTRY_RESOURCE.entries:
            if not hasattr(type_entry, 'directory'):
                continue
            for id_entry in type_entry.directory.entries:
                if not hasattr(id_entry, 'directory'):
                    continue
                for lang_entry in id_entry.directory.entries:
                    location = lang_entry.data.struct
                    blob = pe.get_data(location.OffsetToData, location.Size)
                    collected.append([get_entropy(blob), location.Size])
    except Exception:
        # Deliberate: a failure while walking must not abort feature
        # extraction, so whatever was collected so far is returned.
        return collected
    return collected
def get_version_info(pe):
    """Return the version information of *pe* as a flat dictionary.

    The result merges:

    * every entry of every string table of ``StringFileInfo`` blocks,
    * the first entry of every ``Var`` of ``VarFileInfo`` blocks,
    * seven fixed fields of ``pe.VS_FIXEDFILEINFO`` (when present) under
      the keys ``flags``, ``os``, ``type``, ``file_version``,
      ``product_version``, ``signature`` and ``struct_version``.

    Both layouts of the parsed version resource are accepted: ``FileInfo``
    as a flat list of blocks or as a list of lists of blocks,
    ``VS_FIXEDFILEINFO`` as a single structure or as a list of structures
    (the first one is used), and block keys as ``str`` or ``bytes``.

    Raises:
        AttributeError: if *pe* has no ``FileInfo`` attribute. The caller
            in this file relies on this to detect a missing version
            resource.
    """
    res = {}

    # Flatten FileInfo: depending on the pefile release it is either a list
    # of blocks or a list of lists of blocks.
    blocks = []
    for item in pe.FileInfo:
        if isinstance(item, (list, tuple)):
            blocks.extend(item)
        else:
            blocks.append(item)

    for fileinfo in blocks:
        key = fileinfo.Key
        if isinstance(key, bytes):
            # Under Python 3 the key may be bytes; comparing it to a str
            # literal would silently never match.
            key = key.decode('ascii', 'replace')

        if key == 'StringFileInfo':
            for st in fileinfo.StringTable:
                for name, value in st.entries.items():
                    res[name] = value
        elif key == 'VarFileInfo':
            for var in fileinfo.Var:
                # dict views cannot be indexed on Python 3, so take the
                # first item through an iterator; empty entries are skipped.
                first = next(iter(var.entry.items()), None)
                if first is not None:
                    res[first[0]] = first[1]

    fixed = getattr(pe, 'VS_FIXEDFILEINFO', None)
    if isinstance(fixed, (list, tuple)):
        fixed = fixed[0] if fixed else None
    if fixed is not None:
        res['flags'] = fixed.FileFlags
        res['os'] = fixed.FileOS
        res['type'] = fixed.FileType
        res['file_version'] = fixed.FileVersionLS
        res['product_version'] = fixed.ProductVersionLS
        res['signature'] = fixed.Signature
        res['struct_version'] = fixed.StrucVersion
    return res
# Fields copied verbatim from pe.FILE_HEADER, in output order.
_FILE_HEADER_FIELDS = (
    'Machine',
    'SizeOfOptionalHeader',
    'Characteristics',
)

# Fields copied verbatim from pe.OPTIONAL_HEADER, in output order.
# 'BaseOfData' falls back to 0 when the header does not carry it.
_OPTIONAL_HEADER_FIELDS = (
    'MajorLinkerVersion',
    'MinorLinkerVersion',
    'SizeOfCode',
    'SizeOfInitializedData',
    'SizeOfUninitializedData',
    'AddressOfEntryPoint',
    'BaseOfCode',
    'BaseOfData',
    'ImageBase',
    'SectionAlignment',
    'FileAlignment',
    'MajorOperatingSystemVersion',
    'MinorOperatingSystemVersion',
    'MajorImageVersion',
    'MinorImageVersion',
    'MajorSubsystemVersion',
    'MinorSubsystemVersion',
    'SizeOfImage',
    'SizeOfHeaders',
    'CheckSum',
    'Subsystem',
    'DllCharacteristics',
    'SizeOfStackReserve',
    'SizeOfStackCommit',
    'SizeOfHeapReserve',
    'SizeOfHeapCommit',
    'LoaderFlags',
    'NumberOfRvaAndSizes',
)


def _mean_min_max(values):
    """Return ``(mean, min, max)`` of *values*, or ``(0, 0, 0)`` if empty."""
    if not values:
        return 0, 0, 0
    return sum(values) / float(len(values)), min(values), max(values)


def _features_from_pe(pe):
    """Build the feature dictionary from an already-parsed PE object."""
    res = {}

    for name in _FILE_HEADER_FIELDS:
        res[name] = getattr(pe.FILE_HEADER, name)

    optional = pe.OPTIONAL_HEADER
    for name in _OPTIONAL_HEADER_FIELDS:
        if name == 'BaseOfData':
            res[name] = getattr(optional, name, 0)
        else:
            res[name] = getattr(optional, name)

    # Sections. A file without any section gets 0 for every statistic
    # instead of failing on an empty mean/min/max.
    sections = pe.sections
    res['SectionsNb'] = len(sections)
    mean, lowest, highest = _mean_min_max([s.get_entropy() for s in sections])
    res['SectionsMeanEntropy'] = mean
    res['SectionsMinEntropy'] = lowest
    res['SectionsMaxEntropy'] = highest
    mean, lowest, highest = _mean_min_max([s.SizeOfRawData for s in sections])
    res['SectionsMeanRawsize'] = mean
    res['SectionsMinRawsize'] = lowest
    res['SectionsMaxRawsize'] = highest
    mean, lowest, highest = _mean_min_max(
        [s.Misc_VirtualSize for s in sections])
    res['SectionsMeanVirtualsize'] = mean
    res['SectionsMinVirtualsize'] = lowest
    # Key spelled 'Section...' (no 's') on purpose: it is kept unchanged so
    # any consumer of this dictionary keeps working.
    res['SectionMaxVirtualsize'] = highest

    # Imports (all counts are 0 when there is no import directory).
    import_entries = getattr(pe, 'DIRECTORY_ENTRY_IMPORT', [])
    imports = [imp for entry in import_entries for imp in entry.imports]
    res['ImportsNbDLL'] = len(import_entries)
    res['ImportsNb'] = len(imports)
    res['ImportsNbOrdinal'] = sum(1 for imp in imports if imp.name is None)

    # Exports
    try:
        res['ExportNb'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
    except AttributeError:
        # No export directory.
        res['ExportNb'] = 0

    # Resources: each item is an [entropy, size] pair.
    resources = get_resources(pe)
    res['ResourcesNb'] = len(resources)
    mean, lowest, highest = _mean_min_max([r[0] for r in resources])
    res['ResourcesMeanEntropy'] = mean
    res['ResourcesMinEntropy'] = lowest
    res['ResourcesMaxEntropy'] = highest
    mean, lowest, highest = _mean_min_max([r[1] for r in resources])
    res['ResourcesMeanSize'] = mean
    res['ResourcesMinSize'] = lowest
    res['ResourcesMaxSize'] = highest

    # Load configuration size
    try:
        res['LoadConfigurationSize'] = pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size
    except AttributeError:
        res['LoadConfigurationSize'] = 0

    # Number of version-information entries (0 when there are none).
    try:
        res['VersionInformationSize'] = len(get_version_info(pe))
    except AttributeError:
        res['VersionInformationSize'] = 0

    return res


def extract_infos(fpath):
    """Parse the PE file at *fpath* and return its features as a dict.

    The dictionary contains, in this order: the FILE_HEADER and
    OPTIONAL_HEADER fields listed in ``_FILE_HEADER_FIELDS`` and
    ``_OPTIONAL_HEADER_FIELDS``; the section count with mean/min/max of
    section entropy, raw size and virtual size; import and export counts;
    resource count with mean/min/max of resource entropy and size; the
    load-configuration size; and the number of version-information
    entries. Directories that are absent yield 0 for their features.

    Args:
        fpath: path of the file to analyse, passed to ``pefile.PE``.

    Returns:
        dict mapping feature name to numeric value.

    Raises:
        Whatever ``pefile.PE`` raises when the file cannot be parsed.
    """
    pe = pefile.PE(fpath)
    try:
        return _features_from_pe(pe)
    finally:
        # Release the file data held by the parser even if extraction fails.
        pe.close()
# Names of the entries of the extract_infos() result that are fed to the
# classifier. The order of this list is the order of the feature vector
# built by the script.
selected_features = [
    'Characteristics',
    'DllCharacteristics',
    'SectionsMaxEntropy',
    'MajorSubsystemVersion',
    'Machine',
    'Subsystem',
    'ResourcesMaxEntropy',
    'ResourcesMinEntropy',
    'VersionInformationSize',
    'MajorOperatingSystemVersion',
    'ResourcesMeanEntropy',
    'SectionsMeanEntropy',
]
# Single-step preprocessing pipeline (standard scaling only).
# NOTE(review): in the visible code this object is referenced only by a
# commented-out line in predict_from_features — confirm whether it is
# still needed.
pipeline = Pipeline(steps=[('std_scaler', StandardScaler())])
def predict_from_features(features, model):
    """Return the prediction of *model* for a single sample.

    Args:
        features: sequence of feature values describing one sample. The
            script passes the values in ``selected_features`` order.
        model: any object with a ``predict(rows)`` method that accepts a
            list of samples and returns an indexable sequence of results.

    Returns:
        The first (and only) element of ``model.predict([features])``.
    """
    # predict() works on a batch, so the single sample is wrapped in a list.
    # The sample is passed as-is: no scaling or DataFrame conversion is
    # applied before prediction.
    return model.predict([features])[0]
if __name__ == "__main__":
    # Command-line entry point: extract the selected features from one file
    # and print the classifier's prediction on stdout.
    parser = argparse.ArgumentParser()
    parser.add_argument("filepath", help="Filepath of the file to test")
    parser.add_argument(
        "--model",
        default="models/malware_classifier_4.pkl",
        help="Path of the serialized classifier (default: %(default)s)",
    )
    args = parser.parse_args()

    try:
        data = extract_infos(args.filepath)
    except pefile.PEFormatError as err:
        # Exit with a readable message (status 1) instead of a traceback
        # when the input is not a parsable PE file.
        sys.exit("{}: not a valid PE file ({})".format(args.filepath, err))

    # Feature vector in the order defined by selected_features.
    pe_features = [data[name] for name in selected_features]

    # NOTE(security): joblib.load unpickles the file and can therefore run
    # arbitrary code; only load model files from a trusted source.
    saved_model = joblib.load(args.model)
    prediction = predict_from_features(pe_features, saved_model)
    print(prediction)