Re-adapted the data analysis and added a test script that runs against real malware samples

valentin 2020-04-05 22:44:26 +02:00
parent 36f9d7e098
commit e126fb2600
11 changed files with 590 additions and 13 deletions

03_2_the_ml.py (new file, 126 lines)

@@ -0,0 +1,126 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import joblib
# - For feature selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
print("Loading dataset in memory [ ... ]")
files = pd.read_csv('dataset/dataset_clean.txt',delimiter=',', low_memory=False)
print("Loading dataset in memory [ DONE ]")
# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=
files = files.drop("ID", axis=1)
files = files.drop("md5", axis=1)
# ==== Split the dataset again, stratified on the "legitimate" label ====
# - A temporary category column would only be needed to stratify on a binned continuous feature:
#files["legitimate_cat"] = pd.cut(files["legitimate"], bins=[0, 1], labels=[1,2])
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
for train_index, test_index in split.split(files, files["legitimate"]):
strat_train_set = files.loc[train_index]
strat_test_set = files.loc[test_index]
# - The temporary category would be dropped here once the data is split; it is no longer needed:
#for set_ in (strat_train_set, strat_test_set):
# set_.drop("legitimate_cat", axis=1, inplace=True)
files = strat_train_set.copy()
# ==== Split Features and Labels ====
print("Splitting dataset Features and Labels [ ... ]")
#files_without_labels = files_without_useless_features.drop("legitimate", axis=1)
files_without_labels = files.drop("legitimate", axis=1)
files_labels = files["legitimate"].copy()
print("Splitting dataset Features and Labels [ DONE ]")
imputer = SimpleImputer(strategy="median")
# - Manual feature-selection transformer -
class ManualFeatureSelector(BaseEstimator, TransformerMixin):
def __init__(self):
pass
def fit(self, X, y=None):
return self
def transform(self, X):
# -- corresponding values to these indices are -> ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy'] --
return X.values[:,[1,24,34,18,0,23,48,47,53,14,46,32]]
        # - If ID and md5 were not dropped just after the dataset import, use the other return -
#return X.values[:,[3,26,36,20,2,25,50,49,55,16,48,34]]
# - Create the pipeline -
pipeline = Pipeline([
('features_remap', ManualFeatureSelector()),
('imputer', SimpleImputer(strategy="median")),
])
# - Prepare dataset, pass it through the pipeline -
print("Dataset passing through the pipeline [ ... ]")
features_columns_name = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy']
files_prepared = pipeline.fit_transform(files_without_labels)
files_prepared = pd.DataFrame(files_prepared, columns=features_columns_name)
print("Dataset passing through the pipeline [ DONE ]")
print("Describe of 'files_prepared': ")
print(files_prepared.describe())
# =-=-=-=-=-=-= Select the algo Model =-=-=-=-=-=-=
algos = {
"DecisionTree": DecisionTreeClassifier(max_depth=10),
"RandomForest": RandomForestClassifier(n_estimators=50, n_jobs=-1),
"GradientBoosting": GradientBoostingClassifier(n_estimators=50),
"AdaBoost": AdaBoostClassifier(n_estimators=100),
"GNB": GaussianNB()
}
X_train = files_prepared
y_train = files_labels
X_test = pipeline.transform(strat_test_set.drop("legitimate", axis=1))  # transform only, so the imputer fitted on the training set is reused
y_test = strat_test_set['legitimate'].copy()
results = {}
print("Testing 5 algo [ ... ]")
for algo in algos:
cur_algo = algos[algo]
cur_algo.fit(X_train, y_train)
    score = cur_algo.score(X_test, y_test)
print("%s : %f %%" % (algo, score*100))
results[algo] = score
winner = max(results, key=results.get)
print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]*100))
# =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-=
print("Saving the model [ ... ]")
joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_5.pkl")
print("Saving the model [ DONE ]")

Modified file:

@@ -72,7 +72,6 @@ class ManualFeatureSelector(TransformerMixin):
 pipeline = Pipeline([
     ('features_remap', ManualFeatureSelector()),
     ('imputer', SimpleImputer(strategy="median")),
-    ('std_scaler', StandardScaler()),
 ])
 # - Prepare dataset, pass it through the pipeline -

@@ -115,6 +114,6 @@ print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]
 # =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-=
 print("Saving the model [ ... ]")
-#joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_1.pkl")
+#joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_2.pkl")
 print("Saving the model [ DONE ]")

Modified file:

@@ -30,6 +30,9 @@ print("Loading dataset in memory [ ... ]")
 file_to_test = pd.read_csv('dataset/dataset_clean.txt', delimiter=',', low_memory=False)
 print("Loading dataset in memory [ DONE ]")
+file_to_test = file_to_test.drop("ID", axis=1)
+file_to_test = file_to_test.drop("md5", axis=1)
 # --- remove labels ---
 #file_to_test = file_to_test.drop("legitimate", axis=1)
 # =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=

@@ -59,7 +62,10 @@ class ManualFeatureSelector(TransformerMixin):
         #'SectionsMeanEntropy']
         # ? X.transpose()
-        Y = X[:,[4,27,37,21,3,26,51,50,56,17,49,35]]
+        #Y = X[:,[4,27,37,21,3,26,51,50,56,17,49,35]]
+        # - If ID and md5 are not dropped after the dataset import, use the other return -
+        Y = X[:,[1,24,34,18,0,23,48,47,53,14,46,32]]
+        #Y = X[:,[4,27,37,21,3,26,51,50,56,17,49,35]]
         return Y

@@ -77,7 +83,6 @@ class ManualFeatureSelector(TransformerMixin):
 pipeline = Pipeline([
     ('features_remap', ManualFeatureSelector()),
     ('imputer', SimpleImputer(strategy="median")),
-    ('std_scaler', StandardScaler()),
 ])

@@ -90,7 +95,7 @@ def full_pipeline(data):
     return prepared
 # =-=-=-=-=-=-= Load previously saved model =-=-=-=-=-=-=
-saved_model = joblib.load("models/malware_classifier_1.pkl")
+saved_model = joblib.load("models/malware_classifier_5.pkl")
 # =-=-=-=-=-=-= Prediction core =-=-=-=-=-=-=-=
 # - This function uses the model to predict whether the sample is malware or not -

@@ -116,18 +121,21 @@ def predict_one_line(model,line):
 # - At the end, print the prediction accuracy result
 res = []
-nb_malware_to_test = 10
+#nb_malware_to_test = 50
+nb_malware_to_test = 34199
 good_ans = 0
-for i in range(nb_malware_to_test):
+for i in range(34179,nb_malware_to_test):
+#for i in range(nb_malware_to_test):
     print(" =-=-=-= Prediction {} out of {} ({}%) [ ERT ~ {} min ] =-=-=-=".format(i, nb_malware_to_test, round((i/nb_malware_to_test)*100,1), round(((nb_malware_to_test-i)*1.2)/60,1)))
     features = file_to_test.values[i,]
     features_list = features.tolist()
     features_array = [features_list]
     features = np.array(features_array)
     res.append(predict_one_line(saved_model, features))
-    if res[i] == file_to_test.values[i,][56]:
+    if res[i-34179] == file_to_test.values[i,][54]:
         good_ans +=1
+print(features)
+print(res)
 print(" ===> Got {} / {} good answers".format(good_ans, nb_malware_to_test))

New file (195 lines):

@@ -0,0 +1,195 @@
import pefile
import os
import array
import math
import pickle
import joblib
import sys
import argparse
def get_entropy(data):
if len(data) == 0:
return 0.0
occurences = array.array('L', [0]*256)
for x in data:
occurences[x if isinstance(x, int) else ord(x)] += 1
entropy = 0
for x in occurences:
if x:
p_x = float(x) / len(data)
entropy -= p_x*math.log(p_x, 2)
return entropy
def get_resources(pe):
"""Extract resources :
[entropy, size]"""
resources = []
if hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
try:
for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries:
if hasattr(resource_type, 'directory'):
for resource_id in resource_type.directory.entries:
if hasattr(resource_id, 'directory'):
for resource_lang in resource_id.directory.entries:
data = pe.get_data(resource_lang.data.struct.OffsetToData, resource_lang.data.struct.Size)
size = resource_lang.data.struct.Size
entropy = get_entropy(data)
resources.append([entropy, size])
except Exception as e:
return resources
return resources
def get_version_info(pe):
"""Return version infos"""
res = {}
for fileinfo in pe.FileInfo:
if fileinfo.Key == 'StringFileInfo':
for st in fileinfo.StringTable:
for entry in st.entries.items():
res[entry[0]] = entry[1]
if fileinfo.Key == 'VarFileInfo':
for var in fileinfo.Var:
                # dict.items() is not subscriptable on Python 3, so materialise it first
                var_items = list(var.entry.items())
                res[var_items[0][0]] = var_items[0][1]
if hasattr(pe, 'VS_FIXEDFILEINFO'):
res['flags'] = pe.VS_FIXEDFILEINFO.FileFlags
res['os'] = pe.VS_FIXEDFILEINFO.FileOS
res['type'] = pe.VS_FIXEDFILEINFO.FileType
res['file_version'] = pe.VS_FIXEDFILEINFO.FileVersionLS
res['product_version'] = pe.VS_FIXEDFILEINFO.ProductVersionLS
res['signature'] = pe.VS_FIXEDFILEINFO.Signature
res['struct_version'] = pe.VS_FIXEDFILEINFO.StrucVersion
return res
def extract_infos(fpath):
res = {}
pe = pefile.PE(fpath)
res['Machine'] = pe.FILE_HEADER.Machine
res['SizeOfOptionalHeader'] = pe.FILE_HEADER.SizeOfOptionalHeader
res['Characteristics'] = pe.FILE_HEADER.Characteristics
res['MajorLinkerVersion'] = pe.OPTIONAL_HEADER.MajorLinkerVersion
res['MinorLinkerVersion'] = pe.OPTIONAL_HEADER.MinorLinkerVersion
res['SizeOfCode'] = pe.OPTIONAL_HEADER.SizeOfCode
res['SizeOfInitializedData'] = pe.OPTIONAL_HEADER.SizeOfInitializedData
res['SizeOfUninitializedData'] = pe.OPTIONAL_HEADER.SizeOfUninitializedData
res['AddressOfEntryPoint'] = pe.OPTIONAL_HEADER.AddressOfEntryPoint
res['BaseOfCode'] = pe.OPTIONAL_HEADER.BaseOfCode
try:
res['BaseOfData'] = pe.OPTIONAL_HEADER.BaseOfData
except AttributeError:
res['BaseOfData'] = 0
res['ImageBase'] = pe.OPTIONAL_HEADER.ImageBase
res['SectionAlignment'] = pe.OPTIONAL_HEADER.SectionAlignment
res['FileAlignment'] = pe.OPTIONAL_HEADER.FileAlignment
res['MajorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MajorOperatingSystemVersion
res['MinorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MinorOperatingSystemVersion
res['MajorImageVersion'] = pe.OPTIONAL_HEADER.MajorImageVersion
res['MinorImageVersion'] = pe.OPTIONAL_HEADER.MinorImageVersion
res['MajorSubsystemVersion'] = pe.OPTIONAL_HEADER.MajorSubsystemVersion
res['MinorSubsystemVersion'] = pe.OPTIONAL_HEADER.MinorSubsystemVersion
res['SizeOfImage'] = pe.OPTIONAL_HEADER.SizeOfImage
res['SizeOfHeaders'] = pe.OPTIONAL_HEADER.SizeOfHeaders
res['CheckSum'] = pe.OPTIONAL_HEADER.CheckSum
res['Subsystem'] = pe.OPTIONAL_HEADER.Subsystem
res['DllCharacteristics'] = pe.OPTIONAL_HEADER.DllCharacteristics
res['SizeOfStackReserve'] = pe.OPTIONAL_HEADER.SizeOfStackReserve
res['SizeOfStackCommit'] = pe.OPTIONAL_HEADER.SizeOfStackCommit
res['SizeOfHeapReserve'] = pe.OPTIONAL_HEADER.SizeOfHeapReserve
res['SizeOfHeapCommit'] = pe.OPTIONAL_HEADER.SizeOfHeapCommit
res['LoaderFlags'] = pe.OPTIONAL_HEADER.LoaderFlags
res['NumberOfRvaAndSizes'] = pe.OPTIONAL_HEADER.NumberOfRvaAndSizes
# Sections
res['SectionsNb'] = len(pe.sections)
entropy = list(map(lambda x:x.get_entropy(), pe.sections))
res['SectionsMeanEntropy'] = sum(entropy)/float(len(entropy))
res['SectionsMinEntropy'] = min(entropy)
res['SectionsMaxEntropy'] = max(entropy)
raw_sizes = list(map(lambda x:x.SizeOfRawData, pe.sections))
res['SectionsMeanRawsize'] = sum(raw_sizes)/float(len(raw_sizes))
res['SectionsMinRawsize'] = min(raw_sizes)
res['SectionsMaxRawsize'] = max(raw_sizes)
virtual_sizes = list(map(lambda x:x.Misc_VirtualSize, pe.sections))
res['SectionsMeanVirtualsize'] = sum(virtual_sizes)/float(len(virtual_sizes))
res['SectionsMinVirtualsize'] = min(virtual_sizes)
res['SectionMaxVirtualsize'] = max(virtual_sizes)
#Imports
try:
res['ImportsNbDLL'] = len(pe.DIRECTORY_ENTRY_IMPORT)
imports = list(sum([x.imports for x in pe.DIRECTORY_ENTRY_IMPORT], []))
res['ImportsNb'] = len(imports)
res['ImportsNbOrdinal'] = len(list(filter(lambda x:x.name is None, imports)))
except AttributeError:
res['ImportsNbDLL'] = 0
res['ImportsNb'] = 0
res['ImportsNbOrdinal'] = 0
#Exports
try:
res['ExportNb'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
except AttributeError:
# No export
res['ExportNb'] = 0
#Resources
resources= get_resources(pe)
res['ResourcesNb'] = len(resources)
if len(resources)> 0:
entropy = list(map(lambda x:x[0], resources))
res['ResourcesMeanEntropy'] = sum(entropy)/float(len(entropy))
res['ResourcesMinEntropy'] = min(entropy)
res['ResourcesMaxEntropy'] = max(entropy)
sizes = list(map(lambda x:x[1], resources))
res['ResourcesMeanSize'] = sum(sizes)/float(len(sizes))
res['ResourcesMinSize'] = min(sizes)
res['ResourcesMaxSize'] = max(sizes)
else:
res['ResourcesNb'] = 0
res['ResourcesMeanEntropy'] = 0
res['ResourcesMinEntropy'] = 0
res['ResourcesMaxEntropy'] = 0
res['ResourcesMeanSize'] = 0
res['ResourcesMinSize'] = 0
res['ResourcesMaxSize'] = 0
# Load configuration size
try:
res['LoadConfigurationSize'] = pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size
except AttributeError:
res['LoadConfigurationSize'] = 0
# Version configuration size
try:
version_infos = get_version_info(pe)
res['VersionInformationSize'] = len(version_infos.keys())
except AttributeError:
res['VersionInformationSize'] = 0
return res
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("filepath", help="Filepath of the file to test")
args = parser.parse_args()
print(args.filepath)
features = ['Characteristics',
'DllCharacteristics',
'SectionsMaxEntropy',
'MajorSubsystemVersion',
'Machine',
'Subsystem',
'ResourcesMaxEntropy',
'ResourcesMinEntropy',
'VersionInformationSize',
'MajorOperatingSystemVersion',
'ResourcesMeanEntropy',
'SectionsMeanEntropy']
data = extract_infos(args.filepath)
pe_features = list(map(lambda x:data[x], features))
print("===========================================")
print("Features extracted from the file {}".format(args.filepath))
print(pe_features)
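
Four of the twelve selected features are entropy statistics, which makes get_entropy worth a quick sanity check: packed or encrypted PE sections approach the 8 bits/byte ceiling, while constant padding sits at 0. A minimal check, assuming the function receives raw bytes as it does from pe.get_data:

print(get_entropy(b"\x00" * 1024))         # constant data -> 0.0
print(get_entropy(bytes(range(256)) * 4))  # uniform bytes -> 8.0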

New file (220 lines):

@@ -0,0 +1,220 @@
import pefile
import os
import array
import math
import pickle
import joblib
import sys
import argparse
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
def get_entropy(data):
if len(data) == 0:
return 0.0
occurences = array.array('L', [0]*256)
for x in data:
occurences[x if isinstance(x, int) else ord(x)] += 1
entropy = 0
for x in occurences:
if x:
p_x = float(x) / len(data)
entropy -= p_x*math.log(p_x, 2)
return entropy
def get_resources(pe):
"""Extract resources :
[entropy, size]"""
resources = []
if hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
try:
for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries:
if hasattr(resource_type, 'directory'):
for resource_id in resource_type.directory.entries:
if hasattr(resource_id, 'directory'):
for resource_lang in resource_id.directory.entries:
data = pe.get_data(resource_lang.data.struct.OffsetToData, resource_lang.data.struct.Size)
size = resource_lang.data.struct.Size
entropy = get_entropy(data)
resources.append([entropy, size])
except Exception as e:
return resources
return resources
def get_version_info(pe):
"""Return version infos"""
res = {}
for fileinfo in pe.FileInfo:
if fileinfo.Key == 'StringFileInfo':
for st in fileinfo.StringTable:
for entry in st.entries.items():
res[entry[0]] = entry[1]
if fileinfo.Key == 'VarFileInfo':
for var in fileinfo.Var:
                # dict.items() is not subscriptable on Python 3, so materialise it first
                var_items = list(var.entry.items())
                res[var_items[0][0]] = var_items[0][1]
if hasattr(pe, 'VS_FIXEDFILEINFO'):
res['flags'] = pe.VS_FIXEDFILEINFO.FileFlags
res['os'] = pe.VS_FIXEDFILEINFO.FileOS
res['type'] = pe.VS_FIXEDFILEINFO.FileType
res['file_version'] = pe.VS_FIXEDFILEINFO.FileVersionLS
res['product_version'] = pe.VS_FIXEDFILEINFO.ProductVersionLS
res['signature'] = pe.VS_FIXEDFILEINFO.Signature
res['struct_version'] = pe.VS_FIXEDFILEINFO.StrucVersion
return res
def extract_infos(fpath):
res = {}
pe = pefile.PE(fpath)
res['Machine'] = pe.FILE_HEADER.Machine
res['SizeOfOptionalHeader'] = pe.FILE_HEADER.SizeOfOptionalHeader
res['Characteristics'] = pe.FILE_HEADER.Characteristics
res['MajorLinkerVersion'] = pe.OPTIONAL_HEADER.MajorLinkerVersion
res['MinorLinkerVersion'] = pe.OPTIONAL_HEADER.MinorLinkerVersion
res['SizeOfCode'] = pe.OPTIONAL_HEADER.SizeOfCode
res['SizeOfInitializedData'] = pe.OPTIONAL_HEADER.SizeOfInitializedData
res['SizeOfUninitializedData'] = pe.OPTIONAL_HEADER.SizeOfUninitializedData
res['AddressOfEntryPoint'] = pe.OPTIONAL_HEADER.AddressOfEntryPoint
res['BaseOfCode'] = pe.OPTIONAL_HEADER.BaseOfCode
try:
res['BaseOfData'] = pe.OPTIONAL_HEADER.BaseOfData
except AttributeError:
res['BaseOfData'] = 0
res['ImageBase'] = pe.OPTIONAL_HEADER.ImageBase
res['SectionAlignment'] = pe.OPTIONAL_HEADER.SectionAlignment
res['FileAlignment'] = pe.OPTIONAL_HEADER.FileAlignment
res['MajorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MajorOperatingSystemVersion
res['MinorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MinorOperatingSystemVersion
res['MajorImageVersion'] = pe.OPTIONAL_HEADER.MajorImageVersion
res['MinorImageVersion'] = pe.OPTIONAL_HEADER.MinorImageVersion
res['MajorSubsystemVersion'] = pe.OPTIONAL_HEADER.MajorSubsystemVersion
res['MinorSubsystemVersion'] = pe.OPTIONAL_HEADER.MinorSubsystemVersion
res['SizeOfImage'] = pe.OPTIONAL_HEADER.SizeOfImage
res['SizeOfHeaders'] = pe.OPTIONAL_HEADER.SizeOfHeaders
res['CheckSum'] = pe.OPTIONAL_HEADER.CheckSum
res['Subsystem'] = pe.OPTIONAL_HEADER.Subsystem
res['DllCharacteristics'] = pe.OPTIONAL_HEADER.DllCharacteristics
res['SizeOfStackReserve'] = pe.OPTIONAL_HEADER.SizeOfStackReserve
res['SizeOfStackCommit'] = pe.OPTIONAL_HEADER.SizeOfStackCommit
res['SizeOfHeapReserve'] = pe.OPTIONAL_HEADER.SizeOfHeapReserve
res['SizeOfHeapCommit'] = pe.OPTIONAL_HEADER.SizeOfHeapCommit
res['LoaderFlags'] = pe.OPTIONAL_HEADER.LoaderFlags
res['NumberOfRvaAndSizes'] = pe.OPTIONAL_HEADER.NumberOfRvaAndSizes
# Sections
res['SectionsNb'] = len(pe.sections)
entropy = list(map(lambda x:x.get_entropy(), pe.sections))
res['SectionsMeanEntropy'] = sum(entropy)/float(len(entropy))
res['SectionsMinEntropy'] = min(entropy)
res['SectionsMaxEntropy'] = max(entropy)
raw_sizes = list(map(lambda x:x.SizeOfRawData, pe.sections))
res['SectionsMeanRawsize'] = sum(raw_sizes)/float(len(raw_sizes))
res['SectionsMinRawsize'] = min(raw_sizes)
res['SectionsMaxRawsize'] = max(raw_sizes)
virtual_sizes = list(map(lambda x:x.Misc_VirtualSize, pe.sections))
res['SectionsMeanVirtualsize'] = sum(virtual_sizes)/float(len(virtual_sizes))
res['SectionsMinVirtualsize'] = min(virtual_sizes)
res['SectionMaxVirtualsize'] = max(virtual_sizes)
#Imports
try:
res['ImportsNbDLL'] = len(pe.DIRECTORY_ENTRY_IMPORT)
imports = list(sum([x.imports for x in pe.DIRECTORY_ENTRY_IMPORT], []))
res['ImportsNb'] = len(imports)
res['ImportsNbOrdinal'] = len(list(filter(lambda x:x.name is None, imports)))
except AttributeError:
res['ImportsNbDLL'] = 0
res['ImportsNb'] = 0
res['ImportsNbOrdinal'] = 0
#Exports
try:
res['ExportNb'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
except AttributeError:
# No export
res['ExportNb'] = 0
#Resources
resources= get_resources(pe)
res['ResourcesNb'] = len(resources)
if len(resources)> 0:
entropy = list(map(lambda x:x[0], resources))
res['ResourcesMeanEntropy'] = sum(entropy)/float(len(entropy))
res['ResourcesMinEntropy'] = min(entropy)
res['ResourcesMaxEntropy'] = max(entropy)
sizes = list(map(lambda x:x[1], resources))
res['ResourcesMeanSize'] = sum(sizes)/float(len(sizes))
res['ResourcesMinSize'] = min(sizes)
res['ResourcesMaxSize'] = max(sizes)
else:
res['ResourcesNb'] = 0
res['ResourcesMeanEntropy'] = 0
res['ResourcesMinEntropy'] = 0
res['ResourcesMaxEntropy'] = 0
res['ResourcesMeanSize'] = 0
res['ResourcesMinSize'] = 0
res['ResourcesMaxSize'] = 0
# Load configuration size
try:
res['LoadConfigurationSize'] = pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size
except AttributeError:
res['LoadConfigurationSize'] = 0
# Version configuration size
try:
version_infos = get_version_info(pe)
res['VersionInformationSize'] = len(version_infos.keys())
except AttributeError:
res['VersionInformationSize'] = 0
return res
selected_features = ['Characteristics',
'DllCharacteristics',
'SectionsMaxEntropy',
'MajorSubsystemVersion',
'Machine',
'Subsystem',
'ResourcesMaxEntropy',
'ResourcesMinEntropy',
'VersionInformationSize',
'MajorOperatingSystemVersion',
'ResourcesMeanEntropy',
'SectionsMeanEntropy']
pipeline = Pipeline([
('std_scaler', StandardScaler()),
])
def predict_from_features(features, model):
    # The model was trained on unscaled feature arrays, so the scaler pipeline
    # above is bypassed on purpose and the raw feature row is used directly.
    #X_unknown = pd.DataFrame(np.array([features]), columns=selected_features)
    ans = model.predict([features])
    return ans[0]
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("filepath", help="Filepath of the file to test")
args = parser.parse_args()
#print(args.filepath)
features = selected_features
data = extract_infos(args.filepath)
pe_features = list(map(lambda x:data[x], features))
#print("===========================================")
#print("Features extracted from the file {}".format(args.filepath))
#print(pe_features)
saved_model = joblib.load("models/malware_classifier_4.pkl")
prediction = predict_from_features(pe_features, saved_model)
print(prediction)
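
Beyond the CLI entry point, the same module can be used as a library. A minimal sketch, assuming the file is importable and using a hypothetical sample path; per the dataset's "legitimate" label convention (confirmed by the check scripts below), 1 means clean and 0 means malware:

import joblib

model = joblib.load("models/malware_classifier_4.pkl")
data = extract_infos("/dev/shm/VirusShare_sample")   # hypothetical sample path
row = [data[name] for name in selected_features]
print("legitimate" if predict_from_features(row, model) == 1 else "malware")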

check_files.sh (new executable file, 14 lines)

@@ -0,0 +1,14 @@
#!/bin/bash
i=0
j=0
for filename in /dev/shm/VirusShare_*; do
    # One prediction per sample; "1" (legitimate) on known malware is a bad answer
    result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename")"
    echo "$result"
    if [ "$result" == "1" ]; then
        ((j++))
    fi
    ((i++))
    echo "${j}/${i} bad answers"
done

check_files2.sh (new executable file, 15 lines)

@@ -0,0 +1,15 @@
#!/bin/bash
i=0
j=0
for filename in /home/ubuntu/removeme_exefiles/*.exe; do
    echo "$filename"
    # One prediction per file; "0" (malware) on a known-clean .exe is a bad answer
    result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename")"
    echo "$result"
    if [ "$result" == "0" ]; then
        ((j++))
    fi
    ((i++))
    echo "${j}/${i} bad answers"
done
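
Both wrappers pay the Python start-up and model-loading cost once per file. A sketch of a single-process equivalent, assuming 06_extract_features_and_predict.py were refactored into an importable module (the name "predictor" below is hypothetical) exposing extract_infos, predict_from_features and selected_features:

import glob
import joblib
from predictor import extract_infos, predict_from_features, selected_features  # hypothetical module

model = joblib.load("models/malware_classifier_4.pkl")
bad = total = 0
for path in glob.glob("/dev/shm/VirusShare_*"):
    data = extract_infos(path)
    row = [data[name] for name in selected_features]
    bad += int(predict_from_features(row, model) == 1)   # "legitimate" on known malware is a bad answer
    total += 1
    print("{}/{} bad answers".format(bad, total))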

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.