"""Extract static features from a PE executable and classify it with a
pre-trained scikit-learn model.

Usage:
    python this_script.py FILEPATH [--model MODEL_PATH]
"""
import pefile
import os
import array
import math
import pickle
import joblib
import sys
import argparse
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Default location of the trained classifier; overridable with --model.
DEFAULT_MODEL_PATH = "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_4.pkl"


def get_entropy(data):
    """Return the Shannon entropy (bits per symbol) of *data*.

    *data* may be bytes (Python 3) or str; an empty input yields 0.0.
    """
    if len(data) == 0:
        return 0.0
    occurences = array.array('L', [0] * 256)
    for x in data:
        # bytes iterate as ints in Python 3; fall back to ord() for str.
        occurences[x if isinstance(x, int) else ord(x)] += 1
    entropy = 0
    for x in occurences:
        if x:
            p_x = float(x) / len(data)
            entropy -= p_x * math.log(p_x, 2)
    return entropy


def get_resources(pe):
    """Extract resources: a list of [entropy, size] per resource entry.

    Best-effort: malformed resource directories are common in malware, so on
    any error the entries collected so far are returned.
    """
    resources = []
    if hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
        try:
            for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries:
                if hasattr(resource_type, 'directory'):
                    for resource_id in resource_type.directory.entries:
                        if hasattr(resource_id, 'directory'):
                            for resource_lang in resource_id.directory.entries:
                                data = pe.get_data(
                                    resource_lang.data.struct.OffsetToData,
                                    resource_lang.data.struct.Size)
                                size = resource_lang.data.struct.Size
                                entropy = get_entropy(data)
                                resources.append([entropy, size])
        except Exception:
            return resources
    return resources


def get_version_info(pe):
    """Return the PE version information as a flat dict.

    Raises AttributeError when the file carries no version resource (the
    caller relies on this to fall back to a zero-sized feature).
    NOTE(review): newer pefile versions expose FileInfo/VS_FIXEDFILEINFO as
    lists of lists — confirm against the installed pefile release.
    """
    res = {}
    for fileinfo in pe.FileInfo:
        if fileinfo.Key == 'StringFileInfo':
            for st in fileinfo.StringTable:
                for key, value in st.entries.items():
                    res[key] = value
        if fileinfo.Key == 'VarFileInfo':
            for var in fileinfo.Var:
                # BUG FIX: dict.items() is not subscriptable in Python 3
                # (`items()[0]` raised TypeError); take the first pair
                # via an iterator instead.
                key, value = next(iter(var.entry.items()))
                res[key] = value
    if hasattr(pe, 'VS_FIXEDFILEINFO'):
        res['flags'] = pe.VS_FIXEDFILEINFO.FileFlags
        res['os'] = pe.VS_FIXEDFILEINFO.FileOS
        res['type'] = pe.VS_FIXEDFILEINFO.FileType
        res['file_version'] = pe.VS_FIXEDFILEINFO.FileVersionLS
        res['product_version'] = pe.VS_FIXEDFILEINFO.ProductVersionLS
        res['signature'] = pe.VS_FIXEDFILEINFO.Signature
        res['struct_version'] = pe.VS_FIXEDFILEINFO.StrucVersion
    return res


def extract_infos(fpath):
    """Extract the static feature dict used by the classifier from a PE file.

    Returns a dict mapping feature name -> numeric value. Raises
    pefile.PEFormatError when *fpath* is not a valid PE file.
    """
    res = {}
    pe = pefile.PE(fpath)
    try:
        # --- File / optional headers ---
        res['Machine'] = pe.FILE_HEADER.Machine
        res['SizeOfOptionalHeader'] = pe.FILE_HEADER.SizeOfOptionalHeader
        res['Characteristics'] = pe.FILE_HEADER.Characteristics
        res['MajorLinkerVersion'] = pe.OPTIONAL_HEADER.MajorLinkerVersion
        res['MinorLinkerVersion'] = pe.OPTIONAL_HEADER.MinorLinkerVersion
        res['SizeOfCode'] = pe.OPTIONAL_HEADER.SizeOfCode
        res['SizeOfInitializedData'] = pe.OPTIONAL_HEADER.SizeOfInitializedData
        res['SizeOfUninitializedData'] = pe.OPTIONAL_HEADER.SizeOfUninitializedData
        res['AddressOfEntryPoint'] = pe.OPTIONAL_HEADER.AddressOfEntryPoint
        res['BaseOfCode'] = pe.OPTIONAL_HEADER.BaseOfCode
        try:
            res['BaseOfData'] = pe.OPTIONAL_HEADER.BaseOfData
        except AttributeError:
            # PE32+ (64-bit) images have no BaseOfData field.
            res['BaseOfData'] = 0
        res['ImageBase'] = pe.OPTIONAL_HEADER.ImageBase
        res['SectionAlignment'] = pe.OPTIONAL_HEADER.SectionAlignment
        res['FileAlignment'] = pe.OPTIONAL_HEADER.FileAlignment
        res['MajorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MajorOperatingSystemVersion
        res['MinorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MinorOperatingSystemVersion
        res['MajorImageVersion'] = pe.OPTIONAL_HEADER.MajorImageVersion
        res['MinorImageVersion'] = pe.OPTIONAL_HEADER.MinorImageVersion
        res['MajorSubsystemVersion'] = pe.OPTIONAL_HEADER.MajorSubsystemVersion
        res['MinorSubsystemVersion'] = pe.OPTIONAL_HEADER.MinorSubsystemVersion
        res['SizeOfImage'] = pe.OPTIONAL_HEADER.SizeOfImage
        res['SizeOfHeaders'] = pe.OPTIONAL_HEADER.SizeOfHeaders
        res['CheckSum'] = pe.OPTIONAL_HEADER.CheckSum
        res['Subsystem'] = pe.OPTIONAL_HEADER.Subsystem
        res['DllCharacteristics'] = pe.OPTIONAL_HEADER.DllCharacteristics
        res['SizeOfStackReserve'] = pe.OPTIONAL_HEADER.SizeOfStackReserve
        res['SizeOfStackCommit'] = pe.OPTIONAL_HEADER.SizeOfStackCommit
        res['SizeOfHeapReserve'] = pe.OPTIONAL_HEADER.SizeOfHeapReserve
        res['SizeOfHeapCommit'] = pe.OPTIONAL_HEADER.SizeOfHeapCommit
        res['LoaderFlags'] = pe.OPTIONAL_HEADER.LoaderFlags
        res['NumberOfRvaAndSizes'] = pe.OPTIONAL_HEADER.NumberOfRvaAndSizes

        # --- Sections (assumes at least one section, as any loadable PE has) ---
        res['SectionsNb'] = len(pe.sections)
        entropy = [section.get_entropy() for section in pe.sections]
        res['SectionsMeanEntropy'] = sum(entropy) / float(len(entropy))
        res['SectionsMinEntropy'] = min(entropy)
        res['SectionsMaxEntropy'] = max(entropy)
        raw_sizes = [section.SizeOfRawData for section in pe.sections]
        res['SectionsMeanRawsize'] = sum(raw_sizes) / float(len(raw_sizes))
        res['SectionsMinRawsize'] = min(raw_sizes)
        res['SectionsMaxRawsize'] = max(raw_sizes)
        virtual_sizes = [section.Misc_VirtualSize for section in pe.sections]
        res['SectionsMeanVirtualsize'] = sum(virtual_sizes) / float(len(virtual_sizes))
        res['SectionsMinVirtualsize'] = min(virtual_sizes)
        # NOTE: key intentionally kept as 'SectionMaxVirtualsize' (no 's') --
        # the trained model was fitted with this historical spelling.
        res['SectionMaxVirtualsize'] = max(virtual_sizes)

        # --- Imports ---
        try:
            res['ImportsNbDLL'] = len(pe.DIRECTORY_ENTRY_IMPORT)
            imports = list(sum([x.imports for x in pe.DIRECTORY_ENTRY_IMPORT], []))
            res['ImportsNb'] = len(imports)
            res['ImportsNbOrdinal'] = len([imp for imp in imports if imp.name is None])
        except AttributeError:
            # No import table.
            res['ImportsNbDLL'] = 0
            res['ImportsNb'] = 0
            res['ImportsNbOrdinal'] = 0

        # --- Exports ---
        try:
            res['ExportNb'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
        except AttributeError:
            # No export table.
            res['ExportNb'] = 0

        # --- Resources ---
        resources = get_resources(pe)
        res['ResourcesNb'] = len(resources)
        if len(resources) > 0:
            entropy = [r[0] for r in resources]
            res['ResourcesMeanEntropy'] = sum(entropy) / float(len(entropy))
            res['ResourcesMinEntropy'] = min(entropy)
            res['ResourcesMaxEntropy'] = max(entropy)
            sizes = [r[1] for r in resources]
            res['ResourcesMeanSize'] = sum(sizes) / float(len(sizes))
            res['ResourcesMinSize'] = min(sizes)
            res['ResourcesMaxSize'] = max(sizes)
        else:
            res['ResourcesNb'] = 0
            res['ResourcesMeanEntropy'] = 0
            res['ResourcesMinEntropy'] = 0
            res['ResourcesMaxEntropy'] = 0
            res['ResourcesMeanSize'] = 0
            res['ResourcesMinSize'] = 0
            res['ResourcesMaxSize'] = 0

        # --- Load configuration size ---
        try:
            res['LoadConfigurationSize'] = pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size
        except AttributeError:
            res['LoadConfigurationSize'] = 0

        # --- Version information size ---
        try:
            version_infos = get_version_info(pe)
            res['VersionInformationSize'] = len(version_infos.keys())
        except AttributeError:
            # No version resource.
            res['VersionInformationSize'] = 0
        return res
    finally:
        # BUG FIX: release the underlying file mapping (the handle was
        # previously never closed).
        pe.close()


# Feature names, in the exact order expected by the trained classifier.
selected_features = ['Characteristics',
                     'DllCharacteristics',
                     'SectionsMaxEntropy',
                     'MajorSubsystemVersion',
                     'Machine',
                     'Subsystem',
                     'ResourcesMaxEntropy',
                     'ResourcesMinEntropy',
                     'VersionInformationSize',
                     'MajorOperatingSystemVersion',
                     'ResourcesMeanEntropy',
                     'SectionsMeanEntropy']

# Kept for backward compatibility with earlier versions that scaled the
# features before prediction; the current flow feeds raw values to the model.
pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])


def predict_from_features(features, model):
    """Predict the class of a single sample.

    Parameters:
        features: flat list of numeric values, ordered like ``selected_features``.
        model: fitted scikit-learn estimator exposing ``predict``.

    Returns the scalar prediction for the sample. No scaling is applied --
    the saved model is expected to consume raw feature values.
    """
    X_unknown = pd.DataFrame(np.array([features]), columns=selected_features)
    ans = model.predict(X_unknown)
    return ans[0]


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Classify a PE file with a pre-trained malware model.")
    parser.add_argument("filepath", help="Filepath of the file to test")
    parser.add_argument("--model", default=DEFAULT_MODEL_PATH,
                        help="Path to the trained classifier (joblib .pkl)")
    args = parser.parse_args()

    data = extract_infos(args.filepath)
    pe_features = [data[name] for name in selected_features]

    # SECURITY NOTE: joblib.load unpickles arbitrary code -- only load
    # model files from trusted locations.
    saved_model = joblib.load(args.model)
    prediction = predict_from_features(pe_features, saved_model)
    print(prediction)