Re-adapted the data analysis and added a test script over real malware
This commit is contained in:
parent
36f9d7e098
commit
e126fb2600
126
03_2_the_ml.py
Normal file
@@ -0,0 +1,126 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import joblib


# - For feature selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

print("Loading dataset in memory [ ... ]")
files = pd.read_csv('dataset/dataset_clean.txt', delimiter=',', low_memory=False)
print("Loading dataset in memory [ DONE ]")
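
A quick structural check right after loading can catch a bad export early; a minimal sketch using the frame above and the "legitimate" label column referenced later in the script:

print(files.shape)
print(files["legitimate"].value_counts())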

# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=

files = files.drop("ID", axis=1)
files = files.drop("md5", axis=1)

# ==== Split the dataset again, stratifying on the "legitimate" feature ====
# - A temporary category would only be needed to stratify a continuous target;
#   "legitimate" is already binary, so it can be used directly
#files["legitimate_cat"] = pd.cut(files["legitimate"], bins=[0, 1], labels=[1,2])

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(files, files["legitimate"]):
    strat_train_set = files.loc[train_index]
    strat_test_set = files.loc[test_index]

# - Remove the temporary category: now that the data is split, it is no longer needed
#for set_ in (strat_train_set, strat_test_set):
#    set_.drop("legitimate_cat", axis=1, inplace=True)

files = strat_train_set.copy()
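
A sketch of why StratifiedShuffleSplit is used here: the class ratio of "legitimate" should come out (near-)identical in both splits, which a plain random split does not guarantee. Assuming the variables defined above:

print(strat_train_set["legitimate"].value_counts(normalize=True))
print(strat_test_set["legitimate"].value_counts(normalize=True))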

# ==== Split Features and Labels ====

print("Splitting dataset Features and Labels [ ... ]")
#files_without_labels = files_without_useless_features.drop("legitimate", axis=1)
files_without_labels = files.drop("legitimate", axis=1)
files_labels = files["legitimate"].copy()
print("Splitting dataset Features and Labels [ DONE ]")

imputer = SimpleImputer(strategy="median")

# - Manual feature-selection transformer -
class ManualFeatureSelector(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # -- The corresponding values for these indices are -> ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy'] --
        return X.values[:,[1,24,34,18,0,23,48,47,53,14,46,32]]
        # - If ID and md5 were not dropped just after the dataset import, use the other return -
        #return X.values[:,[3,26,36,20,2,25,50,49,55,16,48,34]]

# - Create the pipeline -
pipeline = Pipeline([
    ('features_remap', ManualFeatureSelector()),
    ('imputer', SimpleImputer(strategy="median")),
])

# - Prepare the dataset: pass it through the pipeline -
print("Dataset passing through the pipeline [ ... ]")
features_columns_name = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy']
files_prepared = pipeline.fit_transform(files_without_labels)
files_prepared = pd.DataFrame(files_prepared, columns=features_columns_name)
print("Dataset passing through the pipeline [ DONE ]")
print("Describe of 'files_prepared': ")
print(files_prepared.describe())
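
The index-based selector above breaks silently if the column layout changes; a name-based sketch of the same idea (NamedFeatureSelector is hypothetical, the column list is features_columns_name from above):

class NamedFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # select by name instead of positional index
        return X[self.columns].values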

# =-=-=-=-=-=-= Select the algo Model =-=-=-=-=-=-=

algos = {
    "DecisionTree": DecisionTreeClassifier(max_depth=10),
    "RandomForest": RandomForestClassifier(n_estimators=50, n_jobs=-1),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=50),
    "AdaBoost": AdaBoostClassifier(n_estimators=100),
    "GNB": GaussianNB()
}

X_train = files_prepared
y_train = files_labels

# - Use transform(), not fit_transform(): the imputer must reuse the medians learned on the training set -
X_test = pd.DataFrame(pipeline.transform(strat_test_set.drop("legitimate", axis=1)), columns=features_columns_name)
y_test = strat_test_set['legitimate'].copy()

results = {}
print("Testing 5 algos [ ... ]")
for algo in algos:
    cur_algo = algos[algo]
    cur_algo.fit(X_train, y_train)
    score = cur_algo.score(X_test, y_test)
    print("%s : %f %%" % (algo, score*100))
    results[algo] = score

winner = max(results, key=results.get)
print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]*100))
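
cross_val_score is imported but never used; a sketch of how it could back the single hold-out comparison above with 5-fold cross-validation on the training set:

for name, clf in algos.items():
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring="accuracy")
    print("%s : %.2f %% (+/- %.2f)" % (name, scores.mean()*100, scores.std()*100))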

# =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-=
print("Saving the model [ ... ]")
joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_5.pkl")
print("Saving the model [ DONE ]")

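A reload round-trip (same path as the dump above) verifies the pickle before the separate test scripts rely on it; the printed score should match the winner's score above:

clf = joblib.load("/home/ubuntu/bigData/projet_big_data/models/malware_classifier_5.pkl")
print(clf.score(X_test, y_test))
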
@@ -72,7 +72,6 @@ class ManualFeatureSelector(TransformerMixin):
 pipeline = Pipeline([
     ('features_remap', ManualFeatureSelector()),
     ('imputer', SimpleImputer(strategy="median")),
-    ('std_scaler', StandardScaler()),
 ])

 # - Prepare dataset, pass it through the pipeline -
@@ -115,6 +114,6 @@ print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]

 # =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-=
 print("Saving the model [ ... ]")
-#joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_1.pkl")
+#joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_2.pkl")
 print("Saving the model [ DONE ]")

@@ -30,6 +30,9 @@ print("Loading dataset in memory [ ... ]")
 file_to_test = pd.read_csv('dataset/dataset_clean.txt', delimiter=',', low_memory=False)
 print("Loading dataset in memory [ DONE ]")

+file_to_test = file_to_test.drop("ID", axis=1)
+file_to_test = file_to_test.drop("md5", axis=1)
+
 # --- remove labels ---
 #file_to_test = file_to_test.drop("legitimate", axis=1)
 # =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=
@@ -59,7 +62,10 @@ class ManualFeatureSelector(TransformerMixin):
         #'SectionsMeanEntropy']

-        Y = X[:,[4,27,37,21,3,26,51,50,56,17,49,35]]
+        # ? X.transpose()
+        #Y = X[:,[4,27,37,21,3,26,51,50,56,17,49,35]]
+        # - If ID and md5 are not dropped after the dataset import, use the other return -
+        Y = X[:,[1,24,34,18,0,23,48,47,53,14,46,32]]
         #Y = X[:,[4,27,37,21,3,26,51,50,56,17,49,35]]
         return Y

@@ -74,10 +80,9 @@ class ManualFeatureSelector(TransformerMixin):
 # - This pipeline uses the imputer and scales the values -
 # -- Tried to regroup both pipelines, seems to work for now ... --
 # -- TODO If no issues, don't forget to remove the commented-out pipeline above
-pipeline = Pipeline([
-    ('features_remap', ManualFeatureSelector()),
-    ('imputer', SimpleImputer(strategy="median")),
-    ('std_scaler', StandardScaler()),
+pipeline = Pipeline([
+    ('features_remap', ManualFeatureSelector()),
+    ('imputer', SimpleImputer(strategy="median")),
 ])

@@ -90,7 +95,7 @@ def full_pipeline(data):
     return prepared

 # =-=-=-=-=-=-= Load previously saved model =-=-=-=-=-=-=
-saved_model = joblib.load("models/malware_classifier_1.pkl")
+saved_model = joblib.load("models/malware_classifier_5.pkl")

 # =-=-=-=-=-=-= Prediction core =-=-=-=-=-=-=-=
 # - This function uses the model and predicts whether the file is malware or not -
@@ -116,18 +121,21 @@ def predict_one_line(model,line):
 # - At the end, print the prediction accuracy result

 res = []
-nb_malware_to_test = 10
+#nb_malware_to_test = 50
+nb_malware_to_test = 34199
 good_ans = 0
-for i in range(nb_malware_to_test):
+for i in range(34179,nb_malware_to_test):
+#for i in range(nb_malware_to_test):
     print(" =-=-=-= Prediction {} out of {} ({}%) [ ERT ~ {} min ] =-=-=-=".format(i, nb_malware_to_test, round((i/nb_malware_to_test)*100,1), round(((nb_malware_to_test-i)*1.2)/60,1)))
     features = file_to_test.values[i,]
     features_list = features.tolist()
     features_array = [features_list]
     features = np.array(features_array)

     features = np.array(features_array)
     res.append(predict_one_line(saved_model, features))
-    if res[i] == file_to_test.values[i,][56]:
+    if res[i-34179] == file_to_test.values[i,][54]:
         good_ans += 1
     print(features)
     print(res)

 print(" ===> Got {} / {} good answers".format(good_ans, nb_malware_to_test))

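One note on the bookkeeping in this hunk: res.append is called on every iteration, so res[-1] always names the prediction just made, and the i-34179 offset arithmetic can be avoided entirely; a sketch, with the label column index 54 taken from the new code:

    res.append(predict_one_line(saved_model, features))
    if res[-1] == file_to_test.values[i,][54]:
        good_ans += 1
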
195
05_generate_dataset_from_file.py
Normal file
@@ -0,0 +1,195 @@
import pefile
import os
import array
import math
import pickle
import joblib
import sys
import argparse

def get_entropy(data):
    if len(data) == 0:
        return 0.0
    occurrences = array.array('L', [0]*256)
    for x in data:
        occurrences[x if isinstance(x, int) else ord(x)] += 1

    entropy = 0
    for x in occurrences:
        if x:
            p_x = float(x) / len(data)
            entropy -= p_x*math.log(p_x, 2)

    return entropy

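A quick sanity check of get_entropy (the values follow from the Shannon formula it implements): uniformly distributed bytes hit the 8-bit maximum, while a constant buffer scores zero:

print(get_entropy(bytes(range(256))))   # 8.0: all 256 byte values equally likely
print(get_entropy(b"\x00" * 1024))      # 0.0: a single repeated value
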
def get_resources(pe):
    """Extract resources :
    [entropy, size]"""
    resources = []
    if hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
        try:
            for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries:
                if hasattr(resource_type, 'directory'):
                    for resource_id in resource_type.directory.entries:
                        if hasattr(resource_id, 'directory'):
                            for resource_lang in resource_id.directory.entries:
                                data = pe.get_data(resource_lang.data.struct.OffsetToData, resource_lang.data.struct.Size)
                                size = resource_lang.data.struct.Size
                                entropy = get_entropy(data)

                                resources.append([entropy, size])
        except Exception:
            # On a malformed resource directory, return whatever was collected so far
            return resources
    return resources

def get_version_info(pe):
    """Return version infos"""
    res = {}
    for fileinfo in pe.FileInfo:
        if fileinfo.Key == 'StringFileInfo':
            for st in fileinfo.StringTable:
                for entry in st.entries.items():
                    res[entry[0]] = entry[1]
        if fileinfo.Key == 'VarFileInfo':
            for var in fileinfo.Var:
                # dict.items() is not subscriptable in Python 3, hence the list()
                res[list(var.entry.items())[0][0]] = list(var.entry.items())[0][1]
    if hasattr(pe, 'VS_FIXEDFILEINFO'):
        res['flags'] = pe.VS_FIXEDFILEINFO.FileFlags
        res['os'] = pe.VS_FIXEDFILEINFO.FileOS
        res['type'] = pe.VS_FIXEDFILEINFO.FileType
        res['file_version'] = pe.VS_FIXEDFILEINFO.FileVersionLS
        res['product_version'] = pe.VS_FIXEDFILEINFO.ProductVersionLS
        res['signature'] = pe.VS_FIXEDFILEINFO.Signature
        res['struct_version'] = pe.VS_FIXEDFILEINFO.StrucVersion
    return res

def extract_infos(fpath):
    res = {}
    pe = pefile.PE(fpath)
    res['Machine'] = pe.FILE_HEADER.Machine
    res['SizeOfOptionalHeader'] = pe.FILE_HEADER.SizeOfOptionalHeader
    res['Characteristics'] = pe.FILE_HEADER.Characteristics
    res['MajorLinkerVersion'] = pe.OPTIONAL_HEADER.MajorLinkerVersion
    res['MinorLinkerVersion'] = pe.OPTIONAL_HEADER.MinorLinkerVersion
    res['SizeOfCode'] = pe.OPTIONAL_HEADER.SizeOfCode
    res['SizeOfInitializedData'] = pe.OPTIONAL_HEADER.SizeOfInitializedData
    res['SizeOfUninitializedData'] = pe.OPTIONAL_HEADER.SizeOfUninitializedData
    res['AddressOfEntryPoint'] = pe.OPTIONAL_HEADER.AddressOfEntryPoint
    res['BaseOfCode'] = pe.OPTIONAL_HEADER.BaseOfCode
    try:
        res['BaseOfData'] = pe.OPTIONAL_HEADER.BaseOfData
    except AttributeError:
        # PE32+ binaries have no BaseOfData field
        res['BaseOfData'] = 0
    res['ImageBase'] = pe.OPTIONAL_HEADER.ImageBase
    res['SectionAlignment'] = pe.OPTIONAL_HEADER.SectionAlignment
    res['FileAlignment'] = pe.OPTIONAL_HEADER.FileAlignment
    res['MajorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MajorOperatingSystemVersion
    res['MinorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MinorOperatingSystemVersion
    res['MajorImageVersion'] = pe.OPTIONAL_HEADER.MajorImageVersion
    res['MinorImageVersion'] = pe.OPTIONAL_HEADER.MinorImageVersion
    res['MajorSubsystemVersion'] = pe.OPTIONAL_HEADER.MajorSubsystemVersion
    res['MinorSubsystemVersion'] = pe.OPTIONAL_HEADER.MinorSubsystemVersion
    res['SizeOfImage'] = pe.OPTIONAL_HEADER.SizeOfImage
    res['SizeOfHeaders'] = pe.OPTIONAL_HEADER.SizeOfHeaders
    res['CheckSum'] = pe.OPTIONAL_HEADER.CheckSum
    res['Subsystem'] = pe.OPTIONAL_HEADER.Subsystem
    res['DllCharacteristics'] = pe.OPTIONAL_HEADER.DllCharacteristics
    res['SizeOfStackReserve'] = pe.OPTIONAL_HEADER.SizeOfStackReserve
    res['SizeOfStackCommit'] = pe.OPTIONAL_HEADER.SizeOfStackCommit
    res['SizeOfHeapReserve'] = pe.OPTIONAL_HEADER.SizeOfHeapReserve
    res['SizeOfHeapCommit'] = pe.OPTIONAL_HEADER.SizeOfHeapCommit
    res['LoaderFlags'] = pe.OPTIONAL_HEADER.LoaderFlags
    res['NumberOfRvaAndSizes'] = pe.OPTIONAL_HEADER.NumberOfRvaAndSizes

    # Sections
    res['SectionsNb'] = len(pe.sections)
    entropy = list(map(lambda x: x.get_entropy(), pe.sections))
    res['SectionsMeanEntropy'] = sum(entropy)/float(len(entropy))
    res['SectionsMinEntropy'] = min(entropy)
    res['SectionsMaxEntropy'] = max(entropy)

    raw_sizes = list(map(lambda x: x.SizeOfRawData, pe.sections))
    res['SectionsMeanRawsize'] = sum(raw_sizes)/float(len(raw_sizes))
    res['SectionsMinRawsize'] = min(raw_sizes)
    res['SectionsMaxRawsize'] = max(raw_sizes)
    virtual_sizes = list(map(lambda x: x.Misc_VirtualSize, pe.sections))
    res['SectionsMeanVirtualsize'] = sum(virtual_sizes)/float(len(virtual_sizes))
    res['SectionsMinVirtualsize'] = min(virtual_sizes)
    # Note: key kept as 'SectionMaxVirtualsize' (no 's') to match the dataset column name
    res['SectionMaxVirtualsize'] = max(virtual_sizes)

    # Imports
    try:
        res['ImportsNbDLL'] = len(pe.DIRECTORY_ENTRY_IMPORT)
        imports = list(sum([x.imports for x in pe.DIRECTORY_ENTRY_IMPORT], []))
        res['ImportsNb'] = len(imports)
        res['ImportsNbOrdinal'] = len(list(filter(lambda x: x.name is None, imports)))
    except AttributeError:
        res['ImportsNbDLL'] = 0
        res['ImportsNb'] = 0
        res['ImportsNbOrdinal'] = 0

    # Exports
    try:
        res['ExportNb'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
    except AttributeError:
        # No export
        res['ExportNb'] = 0
    # Resources
    resources = get_resources(pe)
    res['ResourcesNb'] = len(resources)
    if len(resources) > 0:
        entropy = list(map(lambda x: x[0], resources))
        res['ResourcesMeanEntropy'] = sum(entropy)/float(len(entropy))
        res['ResourcesMinEntropy'] = min(entropy)
        res['ResourcesMaxEntropy'] = max(entropy)
        sizes = list(map(lambda x: x[1], resources))
        res['ResourcesMeanSize'] = sum(sizes)/float(len(sizes))
        res['ResourcesMinSize'] = min(sizes)
        res['ResourcesMaxSize'] = max(sizes)
    else:
        res['ResourcesNb'] = 0
        res['ResourcesMeanEntropy'] = 0
        res['ResourcesMinEntropy'] = 0
        res['ResourcesMaxEntropy'] = 0
        res['ResourcesMeanSize'] = 0
        res['ResourcesMinSize'] = 0
        res['ResourcesMaxSize'] = 0

    # Load configuration size
    try:
        res['LoadConfigurationSize'] = pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size
    except AttributeError:
        res['LoadConfigurationSize'] = 0

    # Version information size
    try:
        version_infos = get_version_info(pe)
        res['VersionInformationSize'] = len(version_infos.keys())
    except AttributeError:
        res['VersionInformationSize'] = 0
    return res

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("filepath", help="Filepath of the file to test")
    args = parser.parse_args()
    print(args.filepath)
    features = ['Characteristics',
                'DllCharacteristics',
                'SectionsMaxEntropy',
                'MajorSubsystemVersion',
                'Machine',
                'Subsystem',
                'ResourcesMaxEntropy',
                'ResourcesMinEntropy',
                'VersionInformationSize',
                'MajorOperatingSystemVersion',
                'ResourcesMeanEntropy',
                'SectionsMeanEntropy']
    data = extract_infos(args.filepath)
    pe_features = list(map(lambda x: data[x], features))
    print("===========================================")
    print("Features extracted from the file {}".format(args.filepath))
    print(pe_features)
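
Typical invocation, mirroring how the shell wrappers below call these scripts (the sample path is hypothetical):

python3.6 05_generate_dataset_from_file.py /dev/shm/VirusShare_sample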
220
06_extract_features_and_predict.py
Normal file
@@ -0,0 +1,220 @@
|
||||
import pefile
import os
import array
import math
import pickle
import joblib
import sys
import argparse
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

def get_entropy(data):
    if len(data) == 0:
        return 0.0
    occurrences = array.array('L', [0]*256)
    for x in data:
        occurrences[x if isinstance(x, int) else ord(x)] += 1

    entropy = 0
    for x in occurrences:
        if x:
            p_x = float(x) / len(data)
            entropy -= p_x*math.log(p_x, 2)

    return entropy

def get_resources(pe):
    """Extract resources :
    [entropy, size]"""
    resources = []
    if hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
        try:
            for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries:
                if hasattr(resource_type, 'directory'):
                    for resource_id in resource_type.directory.entries:
                        if hasattr(resource_id, 'directory'):
                            for resource_lang in resource_id.directory.entries:
                                data = pe.get_data(resource_lang.data.struct.OffsetToData, resource_lang.data.struct.Size)
                                size = resource_lang.data.struct.Size
                                entropy = get_entropy(data)

                                resources.append([entropy, size])
        except Exception:
            # On a malformed resource directory, return whatever was collected so far
            return resources
    return resources

def get_version_info(pe):
    """Return version infos"""
    res = {}
    for fileinfo in pe.FileInfo:
        if fileinfo.Key == 'StringFileInfo':
            for st in fileinfo.StringTable:
                for entry in st.entries.items():
                    res[entry[0]] = entry[1]
        if fileinfo.Key == 'VarFileInfo':
            for var in fileinfo.Var:
                # dict.items() is not subscriptable in Python 3, hence the list()
                res[list(var.entry.items())[0][0]] = list(var.entry.items())[0][1]
    if hasattr(pe, 'VS_FIXEDFILEINFO'):
        res['flags'] = pe.VS_FIXEDFILEINFO.FileFlags
        res['os'] = pe.VS_FIXEDFILEINFO.FileOS
        res['type'] = pe.VS_FIXEDFILEINFO.FileType
        res['file_version'] = pe.VS_FIXEDFILEINFO.FileVersionLS
        res['product_version'] = pe.VS_FIXEDFILEINFO.ProductVersionLS
        res['signature'] = pe.VS_FIXEDFILEINFO.Signature
        res['struct_version'] = pe.VS_FIXEDFILEINFO.StrucVersion
    return res

def extract_infos(fpath):
    res = {}
    pe = pefile.PE(fpath)
    res['Machine'] = pe.FILE_HEADER.Machine
    res['SizeOfOptionalHeader'] = pe.FILE_HEADER.SizeOfOptionalHeader
    res['Characteristics'] = pe.FILE_HEADER.Characteristics
    res['MajorLinkerVersion'] = pe.OPTIONAL_HEADER.MajorLinkerVersion
    res['MinorLinkerVersion'] = pe.OPTIONAL_HEADER.MinorLinkerVersion
    res['SizeOfCode'] = pe.OPTIONAL_HEADER.SizeOfCode
    res['SizeOfInitializedData'] = pe.OPTIONAL_HEADER.SizeOfInitializedData
    res['SizeOfUninitializedData'] = pe.OPTIONAL_HEADER.SizeOfUninitializedData
    res['AddressOfEntryPoint'] = pe.OPTIONAL_HEADER.AddressOfEntryPoint
    res['BaseOfCode'] = pe.OPTIONAL_HEADER.BaseOfCode
    try:
        res['BaseOfData'] = pe.OPTIONAL_HEADER.BaseOfData
    except AttributeError:
        # PE32+ binaries have no BaseOfData field
        res['BaseOfData'] = 0
    res['ImageBase'] = pe.OPTIONAL_HEADER.ImageBase
    res['SectionAlignment'] = pe.OPTIONAL_HEADER.SectionAlignment
    res['FileAlignment'] = pe.OPTIONAL_HEADER.FileAlignment
    res['MajorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MajorOperatingSystemVersion
    res['MinorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MinorOperatingSystemVersion
    res['MajorImageVersion'] = pe.OPTIONAL_HEADER.MajorImageVersion
    res['MinorImageVersion'] = pe.OPTIONAL_HEADER.MinorImageVersion
    res['MajorSubsystemVersion'] = pe.OPTIONAL_HEADER.MajorSubsystemVersion
    res['MinorSubsystemVersion'] = pe.OPTIONAL_HEADER.MinorSubsystemVersion
    res['SizeOfImage'] = pe.OPTIONAL_HEADER.SizeOfImage
    res['SizeOfHeaders'] = pe.OPTIONAL_HEADER.SizeOfHeaders
    res['CheckSum'] = pe.OPTIONAL_HEADER.CheckSum
    res['Subsystem'] = pe.OPTIONAL_HEADER.Subsystem
    res['DllCharacteristics'] = pe.OPTIONAL_HEADER.DllCharacteristics
    res['SizeOfStackReserve'] = pe.OPTIONAL_HEADER.SizeOfStackReserve
    res['SizeOfStackCommit'] = pe.OPTIONAL_HEADER.SizeOfStackCommit
    res['SizeOfHeapReserve'] = pe.OPTIONAL_HEADER.SizeOfHeapReserve
    res['SizeOfHeapCommit'] = pe.OPTIONAL_HEADER.SizeOfHeapCommit
    res['LoaderFlags'] = pe.OPTIONAL_HEADER.LoaderFlags
    res['NumberOfRvaAndSizes'] = pe.OPTIONAL_HEADER.NumberOfRvaAndSizes

    # Sections
    res['SectionsNb'] = len(pe.sections)
    entropy = list(map(lambda x: x.get_entropy(), pe.sections))
    res['SectionsMeanEntropy'] = sum(entropy)/float(len(entropy))
    res['SectionsMinEntropy'] = min(entropy)
    res['SectionsMaxEntropy'] = max(entropy)

    raw_sizes = list(map(lambda x: x.SizeOfRawData, pe.sections))
    res['SectionsMeanRawsize'] = sum(raw_sizes)/float(len(raw_sizes))
    res['SectionsMinRawsize'] = min(raw_sizes)
    res['SectionsMaxRawsize'] = max(raw_sizes)
    virtual_sizes = list(map(lambda x: x.Misc_VirtualSize, pe.sections))
    res['SectionsMeanVirtualsize'] = sum(virtual_sizes)/float(len(virtual_sizes))
    res['SectionsMinVirtualsize'] = min(virtual_sizes)
    # Note: key kept as 'SectionMaxVirtualsize' (no 's') to match the dataset column name
    res['SectionMaxVirtualsize'] = max(virtual_sizes)

    # Imports
    try:
        res['ImportsNbDLL'] = len(pe.DIRECTORY_ENTRY_IMPORT)
        imports = list(sum([x.imports for x in pe.DIRECTORY_ENTRY_IMPORT], []))
        res['ImportsNb'] = len(imports)
        res['ImportsNbOrdinal'] = len(list(filter(lambda x: x.name is None, imports)))
    except AttributeError:
        res['ImportsNbDLL'] = 0
        res['ImportsNb'] = 0
        res['ImportsNbOrdinal'] = 0

    # Exports
    try:
        res['ExportNb'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
    except AttributeError:
        # No export
        res['ExportNb'] = 0
    # Resources
    resources = get_resources(pe)
    res['ResourcesNb'] = len(resources)
    if len(resources) > 0:
        entropy = list(map(lambda x: x[0], resources))
        res['ResourcesMeanEntropy'] = sum(entropy)/float(len(entropy))
        res['ResourcesMinEntropy'] = min(entropy)
        res['ResourcesMaxEntropy'] = max(entropy)
        sizes = list(map(lambda x: x[1], resources))
        res['ResourcesMeanSize'] = sum(sizes)/float(len(sizes))
        res['ResourcesMinSize'] = min(sizes)
        res['ResourcesMaxSize'] = max(sizes)
    else:
        res['ResourcesNb'] = 0
        res['ResourcesMeanEntropy'] = 0
        res['ResourcesMinEntropy'] = 0
        res['ResourcesMaxEntropy'] = 0
        res['ResourcesMeanSize'] = 0
        res['ResourcesMinSize'] = 0
        res['ResourcesMaxSize'] = 0

    # Load configuration size
    try:
        res['LoadConfigurationSize'] = pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size
    except AttributeError:
        res['LoadConfigurationSize'] = 0

    # Version information size
    try:
        version_infos = get_version_info(pe)
        res['VersionInformationSize'] = len(version_infos.keys())
    except AttributeError:
        res['VersionInformationSize'] = 0
    return res

selected_features = ['Characteristics',
                     'DllCharacteristics',
                     'SectionsMaxEntropy',
                     'MajorSubsystemVersion',
                     'Machine',
                     'Subsystem',
                     'ResourcesMaxEntropy',
                     'ResourcesMinEntropy',
                     'VersionInformationSize',
                     'MajorOperatingSystemVersion',
                     'ResourcesMeanEntropy',
                     'SectionsMeanEntropy']

# Kept for experimentation; the current prediction path does not scale its input
pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])

def predict_from_features(features, model):
    features_as_nested_lists = [features]
    features_numpy = np.array(features_as_nested_lists)
    #X_unknown = pipeline.fit_transform(features_numpy)
    X_unknown = features_numpy
    # Wrap the single sample in a DataFrame so the column names match training
    X_unknown = pd.DataFrame(X_unknown, columns=selected_features)
    ans = model.predict(X_unknown)
    return ans[0]

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("filepath", help="Filepath of the file to test")
    args = parser.parse_args()
    #print(args.filepath)
    features = selected_features
    data = extract_infos(args.filepath)
    pe_features = list(map(lambda x: data[x], features))
    #print("===========================================")
    #print("Features extracted from the file {}".format(args.filepath))
    #print(pe_features)

    saved_model = joblib.load("models/malware_classifier_4.pkl")
    prediction = predict_from_features(pe_features, saved_model)
    print(prediction)
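
get_entropy, get_resources, get_version_info, and extract_infos are copied verbatim from 05_generate_dataset_from_file.py; a sketch of sharing them instead (the module name pe_features is hypothetical):

# pe_features.py (hypothetical) would hold the four helpers; both scripts then only need:
from pe_features import extract_infos

data = extract_infos("sample.exe")   # path for illustration only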
14
check_files.sh
Executable file
@@ -0,0 +1,14 @@
#!/bin/bash
result=""
i=0
j=0
for filename in /dev/shm/VirusShare_*; do
    result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename")"
    echo "$result"
    if [ "$result" == "1" ]
    then
        ((j++))
    fi
    ((i++))
    echo "${j}/${i} bad answers"
done
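
The wrapper assumes 06_extract_features_and_predict.py prints exactly one token (the 0/1 prediction) on stdout, which is why that script's other prints are commented out. A pure-Python equivalent of the loop, using only paths that appear in the script:

import glob
import subprocess

bad = 0
for n, path in enumerate(glob.glob("/dev/shm/VirusShare_*"), start=1):
    out = subprocess.run(
        ["python3.6", "/home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py", path],
        stdout=subprocess.PIPE, universal_newlines=True).stdout.strip()
    bad += (out == "1")
    print("{}/{} bad answers".format(bad, n))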
15
check_files2.sh
Executable file
@@ -0,0 +1,15 @@
#!/bin/bash
result=""
i=0
j=0
for filename in /home/ubuntu/removeme_exefiles/*.exe; do
    echo "$filename"
    result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename")"
    echo "$result"
    if [ "$result" == "0" ]
    then
        ((j++))
    fi
    ((i++))
    echo "${j}/${i} bad answers"
done
BIN
models/malware_classifier_2.pkl
Normal file
Binary file not shown.
BIN
models/malware_classifier_3.pkl
Normal file
Binary file not shown.
BIN
models/malware_classifier_4.pkl
Normal file
Binary file not shown.
BIN
models/malware_classifier_5.pkl
Normal file
Binary file not shown.