diff --git a/03_2_the_ml.py b/03_2_the_ml.py
new file mode 100644
index 0000000..4754967
--- /dev/null
+++ b/03_2_the_ml.py
@@ -0,0 +1,126 @@
+import pandas as pd
+from sklearn.model_selection import StratifiedShuffleSplit
+from sklearn.impute import SimpleImputer
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.pipeline import Pipeline
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.naive_bayes import GaussianNB
+import joblib
+
+print("Loading dataset in memory [ ... ]")
+files = pd.read_csv('dataset/dataset_clean.txt', delimiter=',', low_memory=False)
+print("Loading dataset in memory [ DONE ]")
+
+# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=
+
+files = files.drop("ID", axis=1)
+files = files.drop("md5", axis=1)
+
+# ==== Split the dataset, stratifying on the label ====
+# - "legitimate" is already binary, so it can be used directly as the
+#   stratification key; no temporary category column is needed.
+#files["legitimate_cat"] = pd.cut(files["legitimate"], bins=[0, 1], labels=[1,2])
+
+split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
+for train_index, test_index in split.split(files, files["legitimate"]):
+    strat_train_set = files.loc[train_index]
+    strat_test_set = files.loc[test_index]
+
+# - Only needed if the commented-out "legitimate_cat" column above is used -
+#for set_ in (strat_train_set, strat_test_set):
+#    set_.drop("legitimate_cat", axis=1, inplace=True)
+
+files = strat_train_set.copy()
+
+# ==== Split Features and Labels ====
+
+print("Splitting dataset Features and Labels [ ... ]")
+files_without_labels = files.drop("legitimate", axis=1)
+files_labels = files["legitimate"].copy()
+print("Splitting dataset Features and Labels [ DONE ]")
+
+# - Manual feature-selection transformer -
+class ManualFeatureSelector(BaseEstimator, TransformerMixin):
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        # -- The indices below correspond to ['Characteristics', 'DllCharacteristics',
+        # 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem',
+        # 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize',
+        # 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy'] --
+        return X.values[:, [1,24,34,18,0,23,48,47,53,14,46,32]]
+        # - If ID and md5 were not dropped right after the dataset import, use this instead: -
+        #return X.values[:, [3,26,36,20,2,25,50,49,55,16,48,34]]
+
+# - Create the pipeline -
+pipeline = Pipeline([
+    ('features_remap', ManualFeatureSelector()),
+    ('imputer', SimpleImputer(strategy="median")),
+])
+
+# - Prepare the dataset by passing it through the pipeline -
+print("Dataset passing through the pipeline [ ... ]")
+features_columns_name = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy']
+files_prepared = pipeline.fit_transform(files_without_labels)
+files_prepared = pd.DataFrame(files_prepared, columns=features_columns_name)
+print("Dataset passing through the pipeline [ DONE ]")
+print("Describe of 'files_prepared': ")
+print(files_prepared.describe())
+
+# =-=-=-=-=-=-= Select the algo Model =-=-=-=-=-=-=
+
+algos = {
+    "DecisionTree": DecisionTreeClassifier(max_depth=10),
+    "RandomForest": RandomForestClassifier(n_estimators=50, n_jobs=-1),
+    "GradientBoosting": GradientBoostingClassifier(n_estimators=50),
+    "AdaBoost": AdaBoostClassifier(n_estimators=100),
+    "GNB": GaussianNB()
+}
+
+X_train = files_prepared
+y_train = files_labels
+
+# - Use transform() here, not fit_transform(): the imputer statistics must
+#   come from the training set, never from the test set.
+X_test = pipeline.transform(strat_test_set.drop("legitimate", axis=1))
+y_test = strat_test_set['legitimate'].copy()
+
+results = {}
+print("Testing 5 algorithms [ ... ]")
+for name, clf in algos.items():
+    clf.fit(X_train, y_train)
+    score = clf.score(X_test, y_test)
+    print("%s : %f %%" % (name, score*100))
+    results[name] = score
+
+winner = max(results, key=results.get)
+print('\nWinner algorithm is %s with %f %% accuracy' % (winner, results[winner]*100))
+
+# =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-=
+print("Saving the model [ ... ]")
+joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_5.pkl")
+print("Saving the model [ DONE ]")
diff --git a/03_the_ml.py b/03_the_ml.py
index 85e4734..7ccf11c 100644
--- a/03_the_ml.py
+++ b/03_the_ml.py
@@ -72,7 +72,6 @@ class ManualFeatureSelector(TransformerMixin):
 pipeline = Pipeline([
     ('features_remap', ManualFeatureSelector()),
     ('imputer', SimpleImputer(strategy="median")),
-    ('std_scaler', StandardScaler()),
 ])
 
 # - Prepare dataset, pass it through the pipeline -
@@ -115,6 +114,6 @@ print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]
 
 # =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-=
 print("Saving the model [ ... ]")
-#joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_1.pkl")
+#joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_2.pkl")
 print("Saving the model [ DONE ]")
 
]") -#joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_1.pkl") +#joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_2.pkl") print("Saving the model [ DONE ]") diff --git a/04_detect_from_oneline_csv.py b/04_detect_from_oneline_csv.py index e05af31..d9ea5d1 100644 --- a/04_detect_from_oneline_csv.py +++ b/04_detect_from_oneline_csv.py @@ -30,6 +30,9 @@ print("Loading dataset in memory [ ... ]") file_to_test = pd.read_csv('dataset/dataset_clean.txt',delimiter=',', low_memory=False) print("Loading dataset in memory [ DONE ]") +file_to_test = file_to_test.drop("ID", axis=1) +file_to_test = file_to_test.drop("md5", axis=1) + # --- remove labels --- #file_to_test = file_to_test.drop("legitimate", axis=1) # =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-= @@ -59,7 +62,10 @@ class ManualFeatureSelector(TransformerMixin): #'SectionsMeanEntropy'] # ? X.transpose() - Y = X[:,[4,27,37,21,3,26,51,50,56,17,49,35]] + #Y = X[:,[4,27,37,21,3,26,51,50,56,17,49,35]] + # - If ID and md5 are not droped after dataset import, use the other return - + Y = X[:,[1,24,34,18,0,23,48,47,53,14,46,32]] + #Y = X[:,[4,27,37,21,3,26,51,50,56,17,49,35]] return Y @@ -74,10 +80,9 @@ class ManualFeatureSelector(TransformerMixin): # - This pipeline use the imputer and scales the values - # -- Tried to regroup both pipelines, seems to work for now ... -- # -- TODO If no issues, don't forget to remove the upper commented pipeline -pipeline = Pipeline([ - ('features_remap', ManualFeatureSelector()), - ('imputer', SimpleImputer(strategy="median")), - ('std_scaler', StandardScaler()), +pipeline = Pipeline([ + ('features_remap', ManualFeatureSelector()), + ('imputer', SimpleImputer(strategy="median")), ]) @@ -90,7 +95,7 @@ def full_pipeline(data): return prepared # =-=-=-=-=-=-= Load previously saved model =-=-=-=-=-=-= -saved_model = joblib.load("models/malware_classifier_1.pkl") +saved_model = joblib.load("models/malware_classifier_5.pkl") # =-=-=-=-=-=-= Prediction core =-=-=-=-=-=-=-= # - This function use the model and predict if it's a malware or not - @@ -116,18 +121,21 @@ def predict_one_line(model,line): # - At the end, print the prediction accuracy result res = [] -nb_malware_to_test = 10 +#nb_malware_to_test = 50 +nb_malware_to_test = 34199 good_ans = 0 -for i in range(nb_malware_to_test): +for i in range(34179,nb_malware_to_test): +#for i in range(nb_malware_to_test): print(" =-=-=-= Prediction {} out of {} ({}%) [ ERT ~ {} min ] =-=-=-=".format(i, nb_malware_to_test, round((i/nb_malware_to_test)*100,1), round(((nb_malware_to_test-i)*1.2)/60,1))) features = file_to_test.values[i,] features_list = features.tolist() features_array = [features_list] - features = np.array(features_array) - + features = np.array(features_array) res.append(predict_one_line(saved_model, features)) - if res[i] == file_to_test.values[i,][56]: + if res[i-34179] == file_to_test.values[i,][54]: good_ans +=1 + print(features) + print(res) print(" ===> Got {} / {} good answers".format(good_ans, nb_malware_to_test)) diff --git a/05_generate_dataset_from_file.py b/05_generate_dataset_from_file.py new file mode 100644 index 0000000..275956a --- /dev/null +++ b/05_generate_dataset_from_file.py @@ -0,0 +1,195 @@ +import pefile +import os +import array +import math +import pickle +import joblib +import sys +import argparse + +def get_entropy(data): + if len(data) == 0: + return 0.0 + occurences = array.array('L', [0]*256) + for x in data: + occurences[x if isinstance(x, int) 
diff --git a/05_generate_dataset_from_file.py b/05_generate_dataset_from_file.py
new file mode 100644
index 0000000..275956a
--- /dev/null
+++ b/05_generate_dataset_from_file.py
@@ -0,0 +1,195 @@
+import pefile
+import array
+import math
+import argparse
+
+def get_entropy(data):
+    """Shannon entropy (in bits per byte) of a byte string."""
+    if len(data) == 0:
+        return 0.0
+    occurrences = array.array('L', [0]*256)
+    for x in data:
+        occurrences[x if isinstance(x, int) else ord(x)] += 1
+
+    entropy = 0
+    for x in occurrences:
+        if x:
+            p_x = float(x) / len(data)
+            entropy -= p_x*math.log(p_x, 2)
+
+    return entropy
+
+def get_resources(pe):
+    """Extract resources as [entropy, size] pairs."""
+    resources = []
+    if hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
+        try:
+            for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries:
+                if hasattr(resource_type, 'directory'):
+                    for resource_id in resource_type.directory.entries:
+                        if hasattr(resource_id, 'directory'):
+                            for resource_lang in resource_id.directory.entries:
+                                data = pe.get_data(resource_lang.data.struct.OffsetToData, resource_lang.data.struct.Size)
+                                size = resource_lang.data.struct.Size
+                                entropy = get_entropy(data)
+                                resources.append([entropy, size])
+        except Exception:
+            return resources
+    return resources
+
+def get_version_info(pe):
+    """Return the version-information entries of the PE."""
+    res = {}
+    for fileinfo in pe.FileInfo:
+        if fileinfo.Key == 'StringFileInfo':
+            for st in fileinfo.StringTable:
+                for entry in st.entries.items():
+                    res[entry[0]] = entry[1]
+        if fileinfo.Key == 'VarFileInfo':
+            for var in fileinfo.Var:
+                res[var.entry.items()[0][0]] = var.entry.items()[0][1]
+    if hasattr(pe, 'VS_FIXEDFILEINFO'):
+        res['flags'] = pe.VS_FIXEDFILEINFO.FileFlags
+        res['os'] = pe.VS_FIXEDFILEINFO.FileOS
+        res['type'] = pe.VS_FIXEDFILEINFO.FileType
+        res['file_version'] = pe.VS_FIXEDFILEINFO.FileVersionLS
+        res['product_version'] = pe.VS_FIXEDFILEINFO.ProductVersionLS
+        res['signature'] = pe.VS_FIXEDFILEINFO.Signature
+        res['struct_version'] = pe.VS_FIXEDFILEINFO.StrucVersion
+    return res
+
+def extract_infos(fpath):
+    """Extract the PE-header features used by the dataset."""
+    res = {}
+    pe = pefile.PE(fpath)
+    res['Machine'] = pe.FILE_HEADER.Machine
+    res['SizeOfOptionalHeader'] = pe.FILE_HEADER.SizeOfOptionalHeader
+    res['Characteristics'] = pe.FILE_HEADER.Characteristics
+    res['MajorLinkerVersion'] = pe.OPTIONAL_HEADER.MajorLinkerVersion
+    res['MinorLinkerVersion'] = pe.OPTIONAL_HEADER.MinorLinkerVersion
+    res['SizeOfCode'] = pe.OPTIONAL_HEADER.SizeOfCode
+    res['SizeOfInitializedData'] = pe.OPTIONAL_HEADER.SizeOfInitializedData
+    res['SizeOfUninitializedData'] = pe.OPTIONAL_HEADER.SizeOfUninitializedData
+    res['AddressOfEntryPoint'] = pe.OPTIONAL_HEADER.AddressOfEntryPoint
+    res['BaseOfCode'] = pe.OPTIONAL_HEADER.BaseOfCode
+    try:
+        res['BaseOfData'] = pe.OPTIONAL_HEADER.BaseOfData
+    except AttributeError:
+        # PE32+ binaries have no BaseOfData field
+        res['BaseOfData'] = 0
+    res['ImageBase'] = pe.OPTIONAL_HEADER.ImageBase
+    res['SectionAlignment'] = pe.OPTIONAL_HEADER.SectionAlignment
+    res['FileAlignment'] = pe.OPTIONAL_HEADER.FileAlignment
+    res['MajorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MajorOperatingSystemVersion
+    res['MinorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MinorOperatingSystemVersion
+    res['MajorImageVersion'] = pe.OPTIONAL_HEADER.MajorImageVersion
+    res['MinorImageVersion'] = pe.OPTIONAL_HEADER.MinorImageVersion
+    res['MajorSubsystemVersion'] = pe.OPTIONAL_HEADER.MajorSubsystemVersion
+    res['MinorSubsystemVersion'] = pe.OPTIONAL_HEADER.MinorSubsystemVersion
+    res['SizeOfImage'] = pe.OPTIONAL_HEADER.SizeOfImage
+    res['SizeOfHeaders'] = pe.OPTIONAL_HEADER.SizeOfHeaders
+    res['CheckSum'] = pe.OPTIONAL_HEADER.CheckSum
+    res['Subsystem'] = pe.OPTIONAL_HEADER.Subsystem
+    res['DllCharacteristics'] = pe.OPTIONAL_HEADER.DllCharacteristics
+    res['SizeOfStackReserve'] = pe.OPTIONAL_HEADER.SizeOfStackReserve
+    res['SizeOfStackCommit'] = pe.OPTIONAL_HEADER.SizeOfStackCommit
+    res['SizeOfHeapReserve'] = pe.OPTIONAL_HEADER.SizeOfHeapReserve
+    res['SizeOfHeapCommit'] = pe.OPTIONAL_HEADER.SizeOfHeapCommit
+    res['LoaderFlags'] = pe.OPTIONAL_HEADER.LoaderFlags
+    res['NumberOfRvaAndSizes'] = pe.OPTIONAL_HEADER.NumberOfRvaAndSizes
+
+    # Sections
+    res['SectionsNb'] = len(pe.sections)
+    entropy = list(map(lambda x: x.get_entropy(), pe.sections))
+    res['SectionsMeanEntropy'] = sum(entropy)/float(len(entropy))
+    res['SectionsMinEntropy'] = min(entropy)
+    res['SectionsMaxEntropy'] = max(entropy)
+    raw_sizes = list(map(lambda x: x.SizeOfRawData, pe.sections))
+    res['SectionsMeanRawsize'] = sum(raw_sizes)/float(len(raw_sizes))
+    res['SectionsMinRawsize'] = min(raw_sizes)
+    res['SectionsMaxRawsize'] = max(raw_sizes)
+    virtual_sizes = list(map(lambda x: x.Misc_VirtualSize, pe.sections))
+    res['SectionsMeanVirtualsize'] = sum(virtual_sizes)/float(len(virtual_sizes))
+    res['SectionsMinVirtualsize'] = min(virtual_sizes)
+    # NB: kept as 'SectionMaxVirtualsize' (no 's') to match the dataset column name
+    res['SectionMaxVirtualsize'] = max(virtual_sizes)
+
+    # Imports
+    try:
+        res['ImportsNbDLL'] = len(pe.DIRECTORY_ENTRY_IMPORT)
+        imports = list(sum([x.imports for x in pe.DIRECTORY_ENTRY_IMPORT], []))
+        res['ImportsNb'] = len(imports)
+        res['ImportsNbOrdinal'] = len(list(filter(lambda x: x.name is None, imports)))
+    except AttributeError:
+        res['ImportsNbDLL'] = 0
+        res['ImportsNb'] = 0
+        res['ImportsNbOrdinal'] = 0
+
+    # Exports
+    try:
+        res['ExportNb'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
+    except AttributeError:
+        # No export table
+        res['ExportNb'] = 0
+
+    # Resources
+    resources = get_resources(pe)
+    res['ResourcesNb'] = len(resources)
+    if len(resources) > 0:
+        entropy = list(map(lambda x: x[0], resources))
+        res['ResourcesMeanEntropy'] = sum(entropy)/float(len(entropy))
+        res['ResourcesMinEntropy'] = min(entropy)
+        res['ResourcesMaxEntropy'] = max(entropy)
+        sizes = list(map(lambda x: x[1], resources))
+        res['ResourcesMeanSize'] = sum(sizes)/float(len(sizes))
+        res['ResourcesMinSize'] = min(sizes)
+        res['ResourcesMaxSize'] = max(sizes)
+    else:
+        res['ResourcesMeanEntropy'] = 0
+        res['ResourcesMinEntropy'] = 0
+        res['ResourcesMaxEntropy'] = 0
+        res['ResourcesMeanSize'] = 0
+        res['ResourcesMinSize'] = 0
+        res['ResourcesMaxSize'] = 0
+
+    # Load configuration size
+    try:
+        res['LoadConfigurationSize'] = pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size
+    except AttributeError:
+        res['LoadConfigurationSize'] = 0
+
+    # Version information size
+    try:
+        version_infos = get_version_info(pe)
+        res['VersionInformationSize'] = len(version_infos.keys())
+    except AttributeError:
+        res['VersionInformationSize'] = 0
+    return res
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("filepath", help="Filepath of the file to test")
+    args = parser.parse_args()
+    print(args.filepath)
+    features = ['Characteristics',
+                'DllCharacteristics',
+                'SectionsMaxEntropy',
+                'MajorSubsystemVersion',
+                'Machine',
+                'Subsystem',
+                'ResourcesMaxEntropy',
+                'ResourcesMinEntropy',
+                'VersionInformationSize',
+                'MajorOperatingSystemVersion',
+                'ResourcesMeanEntropy',
+                'SectionsMeanEntropy']
+    data = extract_infos(args.filepath)
+    pe_features = [data[f] for f in features]
+    print("===========================================")
+    print("Features extracted from the file {}".format(args.filepath))
+    print(pe_features)
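The get_entropy() helper above implements the Shannon entropy H = -sum(p_x * log2(p_x)) over the byte-frequency distribution, in pure Python. A vectorized sketch that can be used to cross-check it, assuming numpy (which this script does not otherwise import):

    import numpy as np

    def get_entropy_np(data):
        # Byte-frequency Shannon entropy in bits per byte; 0.0 for empty input.
        if len(data) == 0:
            return 0.0
        counts = np.bincount(np.frombuffer(bytes(data), dtype=np.uint8), minlength=256)
        p = counts[counts > 0] / len(data)
        return float(-(p * np.log2(p)).sum())

    # e.g. get_entropy_np(b"aabb") == 1.0 == get_entropy(b"aabb")
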
diff --git a/06_extract_features_and_predict.py b/06_extract_features_and_predict.py
new file mode 100644
index 0000000..421048a
--- /dev/null
+++ b/06_extract_features_and_predict.py
@@ -0,0 +1,220 @@
+import pefile
+import array
+import math
+import argparse
+import joblib
+import numpy as np
+
+def get_entropy(data):
+    """Shannon entropy (in bits per byte) of a byte string."""
+    if len(data) == 0:
+        return 0.0
+    occurrences = array.array('L', [0]*256)
+    for x in data:
+        occurrences[x if isinstance(x, int) else ord(x)] += 1
+
+    entropy = 0
+    for x in occurrences:
+        if x:
+            p_x = float(x) / len(data)
+            entropy -= p_x*math.log(p_x, 2)
+
+    return entropy
+
+def get_resources(pe):
+    """Extract resources as [entropy, size] pairs."""
+    resources = []
+    if hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
+        try:
+            for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries:
+                if hasattr(resource_type, 'directory'):
+                    for resource_id in resource_type.directory.entries:
+                        if hasattr(resource_id, 'directory'):
+                            for resource_lang in resource_id.directory.entries:
+                                data = pe.get_data(resource_lang.data.struct.OffsetToData, resource_lang.data.struct.Size)
+                                size = resource_lang.data.struct.Size
+                                entropy = get_entropy(data)
+                                resources.append([entropy, size])
+        except Exception:
+            return resources
+    return resources
+
+def get_version_info(pe):
+    """Return the version-information entries of the PE."""
+    res = {}
+    for fileinfo in pe.FileInfo:
+        if fileinfo.Key == 'StringFileInfo':
+            for st in fileinfo.StringTable:
+                for entry in st.entries.items():
+                    res[entry[0]] = entry[1]
+        if fileinfo.Key == 'VarFileInfo':
+            for var in fileinfo.Var:
+                res[var.entry.items()[0][0]] = var.entry.items()[0][1]
+    if hasattr(pe, 'VS_FIXEDFILEINFO'):
+        res['flags'] = pe.VS_FIXEDFILEINFO.FileFlags
+        res['os'] = pe.VS_FIXEDFILEINFO.FileOS
+        res['type'] = pe.VS_FIXEDFILEINFO.FileType
+        res['file_version'] = pe.VS_FIXEDFILEINFO.FileVersionLS
+        res['product_version'] = pe.VS_FIXEDFILEINFO.ProductVersionLS
+        res['signature'] = pe.VS_FIXEDFILEINFO.Signature
+        res['struct_version'] = pe.VS_FIXEDFILEINFO.StrucVersion
+    return res
+
+def extract_infos(fpath):
+    """Extract the PE-header features used by the dataset."""
+    res = {}
+    pe = pefile.PE(fpath)
+    res['Machine'] = pe.FILE_HEADER.Machine
+    res['SizeOfOptionalHeader'] = pe.FILE_HEADER.SizeOfOptionalHeader
+    res['Characteristics'] = pe.FILE_HEADER.Characteristics
+    res['MajorLinkerVersion'] = pe.OPTIONAL_HEADER.MajorLinkerVersion
+    res['MinorLinkerVersion'] = pe.OPTIONAL_HEADER.MinorLinkerVersion
+    res['SizeOfCode'] = pe.OPTIONAL_HEADER.SizeOfCode
+    res['SizeOfInitializedData'] = pe.OPTIONAL_HEADER.SizeOfInitializedData
+    res['SizeOfUninitializedData'] = pe.OPTIONAL_HEADER.SizeOfUninitializedData
+    res['AddressOfEntryPoint'] = pe.OPTIONAL_HEADER.AddressOfEntryPoint
+    res['BaseOfCode'] = pe.OPTIONAL_HEADER.BaseOfCode
+    try:
+        res['BaseOfData'] = pe.OPTIONAL_HEADER.BaseOfData
+    except AttributeError:
+        # PE32+ binaries have no BaseOfData field
+        res['BaseOfData'] = 0
+    res['ImageBase'] = pe.OPTIONAL_HEADER.ImageBase
+    res['SectionAlignment'] = pe.OPTIONAL_HEADER.SectionAlignment
+    res['FileAlignment'] = pe.OPTIONAL_HEADER.FileAlignment
+    res['MajorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MajorOperatingSystemVersion
+    res['MinorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MinorOperatingSystemVersion
+    res['MajorImageVersion'] = pe.OPTIONAL_HEADER.MajorImageVersion
+    res['MinorImageVersion'] = pe.OPTIONAL_HEADER.MinorImageVersion
+    res['MajorSubsystemVersion'] = pe.OPTIONAL_HEADER.MajorSubsystemVersion
+    res['MinorSubsystemVersion'] = pe.OPTIONAL_HEADER.MinorSubsystemVersion
+    res['SizeOfImage'] = pe.OPTIONAL_HEADER.SizeOfImage
+    res['SizeOfHeaders'] = pe.OPTIONAL_HEADER.SizeOfHeaders
+    res['CheckSum'] = pe.OPTIONAL_HEADER.CheckSum
+    res['Subsystem'] = pe.OPTIONAL_HEADER.Subsystem
+    res['DllCharacteristics'] = pe.OPTIONAL_HEADER.DllCharacteristics
+    res['SizeOfStackReserve'] = pe.OPTIONAL_HEADER.SizeOfStackReserve
+    res['SizeOfStackCommit'] = pe.OPTIONAL_HEADER.SizeOfStackCommit
+    res['SizeOfHeapReserve'] = pe.OPTIONAL_HEADER.SizeOfHeapReserve
+    res['SizeOfHeapCommit'] = pe.OPTIONAL_HEADER.SizeOfHeapCommit
+    res['LoaderFlags'] = pe.OPTIONAL_HEADER.LoaderFlags
+    res['NumberOfRvaAndSizes'] = pe.OPTIONAL_HEADER.NumberOfRvaAndSizes
+
+    # Sections
+    res['SectionsNb'] = len(pe.sections)
+    entropy = list(map(lambda x: x.get_entropy(), pe.sections))
+    res['SectionsMeanEntropy'] = sum(entropy)/float(len(entropy))
+    res['SectionsMinEntropy'] = min(entropy)
+    res['SectionsMaxEntropy'] = max(entropy)
+    raw_sizes = list(map(lambda x: x.SizeOfRawData, pe.sections))
+    res['SectionsMeanRawsize'] = sum(raw_sizes)/float(len(raw_sizes))
+    res['SectionsMinRawsize'] = min(raw_sizes)
+    res['SectionsMaxRawsize'] = max(raw_sizes)
+    virtual_sizes = list(map(lambda x: x.Misc_VirtualSize, pe.sections))
+    res['SectionsMeanVirtualsize'] = sum(virtual_sizes)/float(len(virtual_sizes))
+    res['SectionsMinVirtualsize'] = min(virtual_sizes)
+    # NB: kept as 'SectionMaxVirtualsize' (no 's') to match the dataset column name
+    res['SectionMaxVirtualsize'] = max(virtual_sizes)
+
+    # Imports
+    try:
+        res['ImportsNbDLL'] = len(pe.DIRECTORY_ENTRY_IMPORT)
+        imports = list(sum([x.imports for x in pe.DIRECTORY_ENTRY_IMPORT], []))
+        res['ImportsNb'] = len(imports)
+        res['ImportsNbOrdinal'] = len(list(filter(lambda x: x.name is None, imports)))
+    except AttributeError:
+        res['ImportsNbDLL'] = 0
+        res['ImportsNb'] = 0
+        res['ImportsNbOrdinal'] = 0
+
+    # Exports
+    try:
+        res['ExportNb'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
+    except AttributeError:
+        # No export table
+        res['ExportNb'] = 0
+
+    # Resources
+    resources = get_resources(pe)
+    res['ResourcesNb'] = len(resources)
+    if len(resources) > 0:
+        entropy = list(map(lambda x: x[0], resources))
+        res['ResourcesMeanEntropy'] = sum(entropy)/float(len(entropy))
+        res['ResourcesMinEntropy'] = min(entropy)
+        res['ResourcesMaxEntropy'] = max(entropy)
+        sizes = list(map(lambda x: x[1], resources))
+        res['ResourcesMeanSize'] = sum(sizes)/float(len(sizes))
+        res['ResourcesMinSize'] = min(sizes)
+        res['ResourcesMaxSize'] = max(sizes)
+    else:
+        res['ResourcesMeanEntropy'] = 0
+        res['ResourcesMinEntropy'] = 0
+        res['ResourcesMaxEntropy'] = 0
+        res['ResourcesMeanSize'] = 0
+        res['ResourcesMinSize'] = 0
+        res['ResourcesMaxSize'] = 0
+
+    # Load configuration size
+    try:
+        res['LoadConfigurationSize'] = pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size
+    except AttributeError:
+        res['LoadConfigurationSize'] = 0
+
+    # Version information size
+    try:
+        version_infos = get_version_info(pe)
+        res['VersionInformationSize'] = len(version_infos.keys())
+    except AttributeError:
+        res['VersionInformationSize'] = 0
+    return res
+
+selected_features = ['Characteristics',
+                     'DllCharacteristics',
+                     'SectionsMaxEntropy',
+                     'MajorSubsystemVersion',
+                     'Machine',
+                     'Subsystem',
+                     'ResourcesMaxEntropy',
+                     'ResourcesMinEntropy',
+                     'VersionInformationSize',
+                     'MajorOperatingSystemVersion',
+                     'ResourcesMeanEntropy',
+                     'SectionsMeanEntropy']
+
+def predict_from_features(features, model):
+    # The winning model was trained on raw (unscaled) feature values, so the
+    # features are passed to it as-is, with no scaling pipeline.
+    X_unknown = np.array([features])
+    ans = model.predict(X_unknown)
+    return ans[0]
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("filepath", help="Filepath of the file to test")
+    args = parser.parse_args()
+    data = extract_infos(args.filepath)
+    pe_features = [data[f] for f in selected_features]
+    saved_model = joblib.load("models/malware_classifier_4.pkl")
+    prediction = predict_from_features(pe_features, saved_model)
+    # Print only the bare prediction (0 = malware, 1 = legitimate) so that
+    # shell wrappers can capture and compare it
+    print(prediction)
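One failure mode worth guarding against in 06_extract_features_and_predict.py: pefile.PE() raises pefile.PEFormatError on files that are not valid PE binaries, which would crash the script and leave the shell wrappers below comparing an empty $result. A minimal sketch of a guard around the extraction step, assuming the args and extract_infos names from the script (sys would also need to be imported):

    import sys
    import pefile

    try:
        data = extract_infos(args.filepath)
    except pefile.PEFormatError:
        # Keep stdout clean (it carries the 0/1 prediction) and signal the
        # failure through stderr and the exit code instead.
        print("error: not a valid PE file", file=sys.stderr)
        sys.exit(2)
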
diff --git a/check_files.sh b/check_files.sh
new file mode 100755
index 0000000..b0de082
--- /dev/null
+++ b/check_files.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Every file here is known malware, so a prediction of "1" (legitimate)
+# is a wrong answer.
+i=0
+j=0
+for filename in /dev/shm/VirusShare_*; do
+    result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename")"
+    echo "$result"
+    if [ "$result" = "1" ]; then
+        ((j++))
+    fi
+    ((i++))
+    echo "${j}/${i} bad answers"
+done
diff --git a/check_files2.sh b/check_files2.sh
new file mode 100755
index 0000000..85b2129
--- /dev/null
+++ b/check_files2.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Every file here is a known-good executable, so a prediction of "0"
+# (malware) is a wrong answer.
+i=0
+j=0
+for filename in /home/ubuntu/removeme_exefiles/*.exe; do
+    echo "$filename"
+    result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename")"
+    echo "$result"
+    if [ "$result" = "0" ]; then
+        ((j++))
+    fi
+    ((i++))
+    echo "${j}/${i} bad answers"
+done
diff --git a/models/malware_classifier_2.pkl b/models/malware_classifier_2.pkl
new file mode 100644
index 0000000..7961bb1
Binary files /dev/null and b/models/malware_classifier_2.pkl differ
diff --git a/models/malware_classifier_3.pkl b/models/malware_classifier_3.pkl
new file mode 100644
index 0000000..8bf9aea
Binary files /dev/null and b/models/malware_classifier_3.pkl differ
diff --git a/models/malware_classifier_4.pkl b/models/malware_classifier_4.pkl
new file mode 100644
index 0000000..e260689
Binary files /dev/null and b/models/malware_classifier_4.pkl differ
diff --git a/models/malware_classifier_5.pkl b/models/malware_classifier_5.pkl
new file mode 100644
index 0000000..2806e40
Binary files /dev/null and b/models/malware_classifier_5.pkl differ
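The two shell drivers launch a fresh Python interpreter, and therefore reload the pickled model, once per file. A sketch of an equivalent batch checker that loads the model a single time, assuming the extract_infos helper and selected_features list from 06_extract_features_and_predict.py:

    import glob
    import joblib
    import numpy as np

    # Same model that 06_extract_features_and_predict.py loads.
    model = joblib.load("models/malware_classifier_4.pkl")
    bad = 0
    for n, path in enumerate(glob.glob("/dev/shm/VirusShare_*"), start=1):
        data = extract_infos(path)
        row = np.array([[data[f] for f in selected_features]])
        if model.predict(row)[0] == 1:  # known malware predicted as legitimate
            bad += 1
        print("{}/{} bad answers".format(bad, n))
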