# Extract static features from a PE file and print a saved model's prediction for it.
import argparse
import array
import math
import os
import pickle
import sys
from collections import Counter

import joblib
import numpy as np
import pandas as pd
import pefile
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


def get_entropy(data):
    """Return the Shannon entropy of ``data`` in bits per symbol.

    Args:
        data: A sized iterable of hashable symbols. In this file it is the
            raw bytes of a PE resource; ``str`` input is measured per
            character.

    Returns:
        A float. ``0.0`` for empty input or input made of a single repeated
        symbol; ``8.0`` for bytes in which all 256 values are equally
        frequent.
    """
    total = len(data)
    if total == 0:
        return 0.0

    # Tally symbol frequencies with the standard library instead of a
    # hand-rolled 256-slot histogram; this also accepts symbols outside
    # the 0..255 range.
    counts = Counter(data)

    # H = sum(p * log2(1 / p)). Written this way every addend is
    # non-negative, so a single-symbol input yields 0.0 rather than -0.0.
    # log2 is exact for powers of two and fsum avoids accumulated rounding.
    return math.fsum(
        (count / total) * math.log2(total / count)
        for count in counts.values()
    )
def get_resources(pe):
    """Collect ``[entropy, size]`` for every resource of a parsed PE file.

    The resource directory is walked three levels deep (type -> id ->
    language). Entries at the first two levels that have no ``directory``
    attribute are skipped.

    Args:
        pe: A parsed PE object exposing ``get_data(offset, size)`` and,
            optionally, ``DIRECTORY_ENTRY_RESOURCE``.

    Returns:
        A list of ``[entropy, size]`` pairs, one per language-level entry.
        The list is empty when the file has no resource directory. If
        anything fails while walking the tree, the pairs collected up to
        that point are returned.
    """
    resources = []
    root = getattr(pe, 'DIRECTORY_ENTRY_RESOURCE', None)
    if root is None:
        return resources

    try:
        for resource_type in root.entries:
            if not hasattr(resource_type, 'directory'):
                continue
            for resource_id in resource_type.directory.entries:
                if not hasattr(resource_id, 'directory'):
                    continue
                for resource_lang in resource_id.directory.entries:
                    entry = resource_lang.data.struct
                    data = pe.get_data(entry.OffsetToData, entry.Size)
                    resources.append([get_entropy(data), entry.Size])
    except Exception:
        # Deliberate best effort: a malformed resource tree must not abort
        # feature extraction, so keep whatever was gathered so far.
        return resources
    return resources
def get_version_info(pe):
    """Return the version-information entries of a parsed PE file.

    Args:
        pe: A parsed PE object exposing ``FileInfo`` and, optionally,
            ``VS_FIXEDFILEINFO``.

    Returns:
        A dict holding every ``StringFileInfo`` string-table entry, the
        first entry of each ``VarFileInfo`` variable and, when fixed file
        information is present, the keys ``flags``, ``os``, ``type``,
        ``file_version``, ``product_version``, ``signature`` and
        ``struct_version``.

    Raises:
        AttributeError: If ``pe`` has no ``FileInfo`` attribute (callers use
            this to detect files without version information).
    """
    res = {}

    # NOTE(review): newer pefile releases appear to expose FileInfo as a
    # list of lists (one inner list per version resource) while older ones
    # expose a flat list - confirm against the installed version. Both
    # shapes are accepted here.
    entries = []
    for item in pe.FileInfo:
        if isinstance(item, (list, tuple)):
            entries.extend(item)
        else:
            entries.append(item)

    for fileinfo in entries:
        key = fileinfo.Key
        # The key may arrive as bytes on Python 3; normalise it so the
        # comparisons below can match.
        if isinstance(key, bytes):
            key = key.decode('ascii', 'replace')

        if key == 'StringFileInfo':
            for st in fileinfo.StringTable:
                res.update(st.entries)
        elif key == 'VarFileInfo':
            for var in fileinfo.Var:
                # Keep only the first item of each variable. Dict views
                # cannot be indexed on Python 3, hence the loop-and-break.
                for name, value in var.entry.items():
                    res[name] = value
                    break

    fixed = getattr(pe, 'VS_FIXEDFILEINFO', None)
    # NOTE(review): VS_FIXEDFILEINFO may be a list of structures rather
    # than a single structure depending on the pefile version - confirm.
    if isinstance(fixed, (list, tuple)):
        fixed = fixed[0] if fixed else None
    if fixed is not None:
        res['flags'] = fixed.FileFlags
        res['os'] = fixed.FileOS
        res['type'] = fixed.FileType
        res['file_version'] = fixed.FileVersionLS
        res['product_version'] = fixed.ProductVersionLS
        res['signature'] = fixed.Signature
        res['struct_version'] = fixed.StrucVersion
    return res
def extract_infos(fpath):
    """Parse the PE file at ``fpath`` and return its static features.

    Args:
        fpath: Path of the file to analyse, passed to ``pefile.PE``.

    Returns:
        A dict mapping feature names to numbers: the file-header and
        optional-header fields, section statistics, import/export counts,
        resource statistics, the load-configuration size and the number of
        version-information entries. Directories the file does not have
        contribute zeros.

    Raises:
        Whatever ``pefile.PE`` raises for an unreadable or malformed file
        propagates to the caller.
    """
    pe = pefile.PE(fpath)
    try:
        res = _header_features(pe)
        res.update(_section_features(pe))
        res.update(_import_export_features(pe))
        res.update(_resource_features(pe))
        res.update(_config_features(pe))
    finally:
        # Release the handle held by pefile, also when extraction fails.
        pe.close()
    return res


_FILE_HEADER_FIELDS = ('Machine', 'SizeOfOptionalHeader', 'Characteristics')

_OPTIONAL_HEADER_FIELDS = (
    'MajorLinkerVersion',
    'MinorLinkerVersion',
    'SizeOfCode',
    'SizeOfInitializedData',
    'SizeOfUninitializedData',
    'AddressOfEntryPoint',
    'BaseOfCode',
    'BaseOfData',
    'ImageBase',
    'SectionAlignment',
    'FileAlignment',
    'MajorOperatingSystemVersion',
    'MinorOperatingSystemVersion',
    'MajorImageVersion',
    'MinorImageVersion',
    'MajorSubsystemVersion',
    'MinorSubsystemVersion',
    'SizeOfImage',
    'SizeOfHeaders',
    'CheckSum',
    'Subsystem',
    'DllCharacteristics',
    'SizeOfStackReserve',
    'SizeOfStackCommit',
    'SizeOfHeapReserve',
    'SizeOfHeapCommit',
    'LoaderFlags',
    'NumberOfRvaAndSizes',
)


def _mean_min_max(values):
    """Return ``(mean, min, max)`` of ``values``, or zeros when it is empty."""
    if not values:
        return 0, 0, 0
    return sum(values) / float(len(values)), min(values), max(values)


def _header_features(pe):
    """Copy the FILE_HEADER and OPTIONAL_HEADER fields used as features."""
    res = {name: getattr(pe.FILE_HEADER, name) for name in _FILE_HEADER_FIELDS}
    optional = pe.OPTIONAL_HEADER
    for name in _OPTIONAL_HEADER_FIELDS:
        if name == 'BaseOfData':
            # Not every optional header carries this field; default to 0.
            res[name] = getattr(optional, name, 0)
        else:
            res[name] = getattr(optional, name)
    return res


def _section_features(pe):
    """Summarise entropy, raw size and virtual size over all sections."""
    sections = pe.sections
    res = {'SectionsNb': len(sections)}

    # A file without sections yields zeros instead of a ZeroDivisionError.
    mean, low, high = _mean_min_max([s.get_entropy() for s in sections])
    res['SectionsMeanEntropy'] = mean
    res['SectionsMinEntropy'] = low
    res['SectionsMaxEntropy'] = high

    mean, low, high = _mean_min_max([s.SizeOfRawData for s in sections])
    res['SectionsMeanRawsize'] = mean
    res['SectionsMinRawsize'] = low
    res['SectionsMaxRawsize'] = high

    mean, low, high = _mean_min_max([s.Misc_VirtualSize for s in sections])
    res['SectionsMeanVirtualsize'] = mean
    res['SectionsMinVirtualsize'] = low
    # The key really is 'SectionMaxVirtualsize' (no "s" after "Section");
    # it is kept unchanged because consumers look features up by name.
    res['SectionMaxVirtualsize'] = high
    return res


def _import_export_features(pe):
    """Count imported DLLs, imported symbols, ordinal imports and exports."""
    res = {}
    try:
        dlls = pe.DIRECTORY_ENTRY_IMPORT
        imports = [symbol for dll in dlls for symbol in dll.imports]
        counts = (
            len(dlls),
            len(imports),
            sum(1 for symbol in imports if symbol.name is None),
        )
    except AttributeError:
        # No import directory.
        counts = (0, 0, 0)
    res['ImportsNbDLL'], res['ImportsNb'], res['ImportsNbOrdinal'] = counts

    try:
        res['ExportNb'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
    except AttributeError:
        # No export directory.
        res['ExportNb'] = 0
    return res


def _resource_features(pe):
    """Summarise the entropy and size of the resources."""
    resources = get_resources(pe)
    entropy = _mean_min_max([entry[0] for entry in resources])
    size = _mean_min_max([entry[1] for entry in resources])
    return {
        'ResourcesNb': len(resources),
        'ResourcesMeanEntropy': entropy[0],
        'ResourcesMinEntropy': entropy[1],
        'ResourcesMaxEntropy': entropy[2],
        'ResourcesMeanSize': size[0],
        'ResourcesMinSize': size[1],
        'ResourcesMaxSize': size[2],
    }


def _config_features(pe):
    """Return the load-configuration size and version-information count."""
    res = {}
    try:
        res['LoadConfigurationSize'] = pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size
    except AttributeError:
        res['LoadConfigurationSize'] = 0

    try:
        res['VersionInformationSize'] = len(get_version_info(pe))
    except AttributeError:
        res['VersionInformationSize'] = 0
    return res
# Names of the features handed to the model, in the column order used when
# the prediction frame is built. Each name is a key of the dict returned by
# extract_infos.
selected_features = [
    'Characteristics',
    'DllCharacteristics',
    'SectionsMaxEntropy',
    'MajorSubsystemVersion',
    'Machine',
    'Subsystem',
    'ResourcesMaxEntropy',
    'ResourcesMinEntropy',
    'VersionInformationSize',
    'MajorOperatingSystemVersion',
    'ResourcesMeanEntropy',
    'SectionsMeanEntropy',
]
# Preprocessing pipeline consisting of a single standard-scaling step. It is
# built at import time; none of the visible code fits or applies it, and
# predict_from_features hands the raw feature values to the model.
pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])
def predict_from_features(features, model):
    """Return ``model``'s prediction for a single feature vector.

    Args:
        features: Sequence of feature values, one per name in
            ``selected_features`` and in the same order.
        model: Object exposing ``predict``; it is called with a one-row
            ``pandas.DataFrame`` whose columns are ``selected_features``.

    Returns:
        The first (and only) element of ``model.predict(...)``.
    """
    # Build the row through numpy first so the values share one dtype
    # before they are labelled with the feature names.
    row = np.array([features])
    frame = pd.DataFrame(row, columns=selected_features)
    return model.predict(frame)[0]
if __name__ == "__main__":
    # Command-line entry point: extract the features of one file, load the
    # saved model and print its prediction.
    parser = argparse.ArgumentParser()
    parser.add_argument("filepath", help="Filepath of the file to test")
    # The model location used to be hard-coded; it is now overridable while
    # keeping the previous path as the default.
    parser.add_argument(
        "--model",
        default="/home/ubuntu/bigData/projet_big_data/models/malware_classifier_4.pkl",
        help="Path of the saved model (default: %(default)s)",
    )
    args = parser.parse_args()

    data = extract_infos(args.filepath)
    pe_features = [data[name] for name in selected_features]

    # NOTE(security): joblib.load is pickle-based and can execute arbitrary
    # code while loading; only point --model at files from a trusted source.
    saved_model = joblib.load(args.model)
    prediction = predict_from_features(pe_features, saved_model)
    print(prediction)