From a4236e68d048e3bc767cdeeffe48b252b866cfad Mon Sep 17 00:00:00 2001 From: valentin Date: Sat, 4 Apr 2020 18:22:01 +0200 Subject: [PATCH] canap --- 03_the_ml.py | 4 +- 04_detect_from_oneline_csv.py | 116 +++++++++++++++------------------- dataset/malware_header.csv | 1 + dataset/one_malware.csv | 1 - dataset/one_malware_2.csv | 2 + dataset/one_malware_3.csv | 2 + dataset/one_malware_4.csv | 2 + dataset/one_malware_5.csv | 2 + dataset/one_malware_6.csv | 2 + dataset/one_malware_7.csv | 2 + 10 files changed, 66 insertions(+), 68 deletions(-) create mode 100644 dataset/malware_header.csv delete mode 100644 dataset/one_malware.csv create mode 100644 dataset/one_malware_2.csv create mode 100644 dataset/one_malware_3.csv create mode 100644 dataset/one_malware_4.csv create mode 100644 dataset/one_malware_5.csv create mode 100644 dataset/one_malware_6.csv create mode 100644 dataset/one_malware_7.csv diff --git a/03_the_ml.py b/03_the_ml.py index 658cee8..85e4734 100644 --- a/03_the_ml.py +++ b/03_the_ml.py @@ -27,7 +27,7 @@ from sklearn.ensemble import ExtraTreesClassifier from sklearn.feature_selection import SelectFromModel print("Loading dataset in memory [ ... ]") -files = pd.read_csv('dataset_clean.txt',delimiter=',', low_memory=False) +files = pd.read_csv('dataset/dataset_clean.txt',delimiter=',', low_memory=False) print("Loading dataset in memory [ DONE ]") # =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-= # ==== Split DataSet again but lets stratify with Machine feature ==== @@ -115,6 +115,6 @@ print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner] # =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-= print("Saving the model [ ... ]") -joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_1.pkl") +#joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_1.pkl") print("Saving the model [ DONE ]") diff --git a/04_detect_from_oneline_csv.py b/04_detect_from_oneline_csv.py index 658cee8..9a3eeaf 100644 --- a/04_detect_from_oneline_csv.py +++ b/04_detect_from_oneline_csv.py @@ -27,34 +27,12 @@ from sklearn.ensemble import ExtraTreesClassifier from sklearn.feature_selection import SelectFromModel print("Loading dataset in memory [ ... ]") -files = pd.read_csv('dataset_clean.txt',delimiter=',', low_memory=False) +file_to_test = pd.read_csv('dataset/dataset_clean.txt',delimiter=',', low_memory=False) print("Loading dataset in memory [ DONE ]") + +# --- remove labels --- +#file_to_test = file_to_test.drop("legitimate", axis=1) # =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-= -# ==== Split DataSet again but lets stratify with Machine feature ==== -# - Add tmp cat in order to be able to stratify data while splitting it -files["Machine_cat"] = pd.cut(files["Machine"], bins=[0., 30000., np.inf], labels=[1,2]) - -split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42) -for train_index, test_index in split.split(files, files["Machine_cat"]): - strat_train_set = files.loc[train_index] - strat_test_set = files.loc[test_index] - -# - Remove tmp created cat, now the data is splitted, we dont need it anymore -for set_ in (strat_train_set, strat_test_set): - set_.drop("Machine_cat", axis=1, inplace=True) - -files = strat_train_set.copy() - -# ==== Split Features and Labels ==== - -print("Splitting dataset Features and Labels [ ... ]") -#files_without_labels = files_without_useless_features.drop("legitimate", axis=1) -files_without_labels = files.drop("legitimate", axis=1) -files_labels = files["legitimate"].copy() -print("Splitting dataset Features and Labels [ DONE ]") - -imputer = SimpleImputer(strategy="median") - # - Features manual transformer - class ManualFeatureSelector(TransformerMixin): @@ -75,46 +53,54 @@ pipeline = Pipeline([ ('std_scaler', StandardScaler()), ]) -# - Prepare dataset, pass it through the pipeline - -print("Dataset passing through the pipeline [ ... ]") -features_columns_name = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMax Entropy', 'ResourcesMinEntropy', 'VersionInformationSize', 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy'] -files_prepared = pipeline.fit_transform(files_without_labels) -files_prepared = pd.DataFrame(files_prepared, columns=features_columns_name) -print("Dataset passing through the pipeline [ DONE ]") -print("Describe of 'files_prepared': ") -print(files_prepared.describe()) +saved_model = joblib.load("models/malware_classifier_1.pkl") + +def list_transformer(l): + l_t = [] + + l_t.append(l[3]) + l_t.append(l[26]) + l_t.append(l[36]) + l_t.append(l[20]) + l_t.append(l[2]) + l_t.append(l[25]) + l_t.append(l[50]) + l_t.append(l[49]) + l_t.append(l[55]) + l_t.append(l[16]) + l_t.append(l[48]) + l_t.append(l[34]) + + return l_t -# =-=-=-=-=-=-= Select the algo Model =-=-=-=-=-=-= +def predict_one_line(model,line): -algos = { - "DecisionTree": DecisionTreeClassifier(max_depth=10), - "RandomForest": RandomForestClassifier(n_estimators=50, n_jobs=-1), - "GradientBoosting": GradientBoostingClassifier(n_estimators=50), - "AdaBoost": AdaBoostClassifier(n_estimators=100), - "GNB": GaussianNB() - } + #X_unknown = pipeline.fit_transform(line) + X_unknown = list_transformer(line) + X_unknown = pd.DataFrame([X_unknown]) + X_unknown.columns = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', ' MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy'] + ans = model.predict(X_unknown) + ans_type = ['malicious', 'legitimate'] + #print(ans[0]) + #print("This file is: ", ans_type[ans[0]]) + return ans[0] -X_train = files_prepared -y_train = files_labels +res = [] +labels = [] +nb_malware_to_test = 2000 +good_ans = 0 +for i in range(nb_malware_to_test): + print(" =-=-=-= Prediction {} out of {} ({}%) [ ERT ~ {} min ] =-=-=-=".format(i, nb_malware_to_test, round((i/nb_malware_to_test)*100,1), round(((nb_malware_to_test-i)*1.2)/60,1))) + features = file_to_test.values[i,] + res.append(predict_one_line(saved_model, features)) + labels.append(file_to_test.values[i,][56]) + if res[i] == file_to_test.values[i,][56]: + good_ans +=1 + +print(" ===> Got {} / {} good answers".format(good_ans, nb_malware_to_test)) -X_test = pipeline.fit_transform(strat_test_set.drop("legitimate", axis=1)) -y_test = strat_test_set['legitimate'].copy() - -results = {} -print("Testing 5 algo [ ... ]") -for algo in algos: - cur_algo = algos[algo] - cur_algo.fit(X_train, y_train) - #score = cur_algo.score(X_test, y_test) - score = cur_algo.score(X_test, y_test) - print("%s : %f %%" % (algo, score*100)) - results[algo] = score - -winner = max(results, key=results.get) -print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]*100)) - -# =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-= -print("Saving the model [ ... ]") -joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_1.pkl") -print("Saving the model [ DONE ]") - +l1 = file_to_test.values[2,] +#print(file_to_test.values[1,]) +print(l1) +#predict_one_line(saved_model, l1) +#predict_one_line(saved_model, file_to_test) diff --git a/dataset/malware_header.csv b/dataset/malware_header.csv new file mode 100644 index 0000000..844bc91 --- /dev/null +++ b/dataset/malware_header.csv @@ -0,0 +1 @@ +ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize,legitimate diff --git a/dataset/one_malware.csv b/dataset/one_malware.csv deleted file mode 100644 index f8a3aef..0000000 --- a/dataset/one_malware.csv +++ /dev/null @@ -1 +0,0 @@ -4,156a0bb069f94d1e7c2508318805f2a4,332,224,8450,10,0,108544,15872,0,105021,4096,114688,268435456,4096,512,6,1,8,0,6,1,143360,1024,165754,3,320,1048576,4096,1048576,4096,0,16,5,3.40483134511,0.160328725899,6.66271801901,24883.2,512,108544,25645.4,85,108180,12,66,0,105,2,3.27055919863,3.03418784123,3.50693055603,1032,972,1092,72,0 diff --git a/dataset/one_malware_2.csv b/dataset/one_malware_2.csv new file mode 100644 index 0000000..81cad4d --- /dev/null +++ b/dataset/one_malware_2.csv @@ -0,0 +1,2 @@ +ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize +4,156a0bb069f94d1e7c2508318805f2a4,332,224,8450,10,0,108544,15872,0,105021,4096,114688,268435456,4096,512,6,1,8,0,6,1,143360,1024,165754,3,320,1048576,4096,1048576,4096,0,16,5,3.40483134511,0.160328725899,6.66271801901,24883.2,512,108544,25645.4,85,108180,12,66,0,105,2,3.27055919863,3.03418784123,3.50693055603,1032,972,1092,72,0 diff --git a/dataset/one_malware_3.csv b/dataset/one_malware_3.csv new file mode 100644 index 0000000..3b6238c --- /dev/null +++ b/dataset/one_malware_3.csv @@ -0,0 +1,2 @@ +ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize +55599,26e634af4a03b524e293fec7f70ef740,332,224,33166,2,25,29696,10752,0,32996,4096,36864,4194304,4096,512,4,0,0,0,4,0,110592,1024,0,2,0,1048576,16384,1048576,4096,0,16,8,3.31141157945,0,6.51167217489,5056,0,29696,10213.625,8,43161,10,85,0,0,4,4.63428968486,1.78149632953,6.90278161507,1118,16,4264,0,0 diff --git a/dataset/one_malware_4.csv b/dataset/one_malware_4.csv new file mode 100644 index 0000000..6a53dcc --- /dev/null +++ b/dataset/one_malware_4.csv @@ -0,0 +1,2 @@ +ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize +55571,15aacc9fb50d342921fc27828d818dc0,332,224,271,6,0,98304,131072,0,98902,4096,102400,4194304,4096,4096,4,0,0,0,4,0,233472,4096,0,2,0,1048576,4096,1048576,4096,0,16,4,3.54693957609,1.04392074096,5.65629913697,32768,4096,98304,55010.5,968,115944,18,233,1,0,1,3.5634550578,3.5634550578,3.5634550578,868,868,868,0,0 diff --git a/dataset/one_malware_5.csv b/dataset/one_malware_5.csv new file mode 100644 index 0000000..00ca925 --- /dev/null +++ b/dataset/one_malware_5.csv @@ -0,0 +1,2 @@ +ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize +55699,57f64bb19554df5905e284e1c6d36e90,332,224,271,6,0,23040,119808,1024,12491,4096,28672,4194304,4096,512,4,0,6,0,4,0,225280,1024,0,2,32768,1048576,4096,1048576,4096,0,16,5,4.47217657988,0,6.43310034828,11878.4,0,30720,42597.2,4496,110456,8,155,1,0,18,4.28749013773,2.47653802279,6.71583396325,1622.72222222,132,9640,0,0 diff --git a/dataset/one_malware_6.csv b/dataset/one_malware_6.csv new file mode 100644 index 0000000..88f0739 --- /dev/null +++ b/dataset/one_malware_6.csv @@ -0,0 +1,2 @@ +ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize +199999,c2780cf8eea998e4f3672159926b8757,332,224,783,2,56,29184,14848,110592,14764,4096,36864,4194304,4096,512,4,0,6,0,4,0,221184,1024,0,2,32768,2097152,4096,1048576,4096,0,16,7,3.34528001443,0,6.30323831549,8118.85714286,0,29184,28755.4285714,140,110088,8,155,0,0,8,4.0912689525,2.45849222582,5.86996923095,2088.75,48,9640,0,0 diff --git a/dataset/one_malware_7.csv b/dataset/one_malware_7.csv new file mode 100644 index 0000000..1d5e515 --- /dev/null +++ b/dataset/one_malware_7.csv @@ -0,0 +1,2 @@ +ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize +34181,e4647c31a5fcfde4a964e820d4ecae30,332,224,271,6,0,25600,98816,0,14588,4096,32768,4194304,4096,512,4,0,0,0,4,0,135168,1024,0,2,0,1048576,4096,1048576,4096,0,16,3,6.07048588959,5.03156401487,6.77656630478,19285.3333333,2560,29696,41197.3333333,2284,96056,1,49,0,0,0,0,0,0,0,0,0,0,0