From 433b2a0957d266538a0e39cb2bf1e696f3a7b303 Mon Sep 17 00:00:00 2001 From: valentin Date: Sat, 4 Apr 2020 22:20:02 +0200 Subject: [PATCH] Corrected semi-auto accuracy tester --- .04_detect_from_oneline_csv.py.swp | Bin 0 -> 20480 bytes 04_detect_from_oneline_csv.py | 98 ++++++++++++++++++----------- 2 files changed, 61 insertions(+), 37 deletions(-) create mode 100644 .04_detect_from_oneline_csv.py.swp diff --git a/.04_detect_from_oneline_csv.py.swp b/.04_detect_from_oneline_csv.py.swp new file mode 100644 index 0000000000000000000000000000000000000000..a22b046f15306196f2cc3821f3087c9a66ba3881 GIT binary patch literal 20480 zcmeHOTZ|k>6)o(5**H#65Qq;Hid~zTfu7l&^^=8h6tCBgrL1jvY{%Ak($vh>6HbOY~yrqwAZ7t~W)`cSP5NL-|{y=k3w8Gn5}a=9ip-oPnHyoPnHy zoPnHyoPnHyoPnHyoPnHyoPqxV0|E>6&tma@>zmOy&i~`}{~vE->_y;vz{i1O00;hj zD`QUpj{+Y7W&s8~zk{(a0~YWh;AY?#w=niaUgZEp2?Z6h`uh@in33v{;4m=F}92+s$@xyy5DzX zBNT0x2Et)Vi6bSJqJ|2f&uANV;1x7pFy)fr8{k$J&|Q#&NYCy_v*LtZRXnE$){13c zvZ(6hp)z>=Mqz$gxT5V#*YJF!t?E5&nj=!{2&qBedy8Ui!SY?(=|czl6&A%s+pPyJ zukZP?cUHO{p(Clv-Df zpKzore8XC*_Y?lXfdfnH;LW2RNm7W`&89htXmaDYsoSU#RGB1B#kL;}oO^81;b?r7 zI?n8KPpm>M)l*r?K@}NQlZP623`d1CRl0_+`!I2jNGZk=DGQp6hHa?B=)bT*6Wu;2 z4ub3rG9O)2M;H#Ns^DX!I(N3kzQ;lmDD&+vI7sg zLBFVS4C)S7dUz^1bf~eWq(wrDD|^C$GK<0rgn1Z)mkV!KdV)ToK%XvP*nM%N)^U&d3V@_-A(PsB+yZ39eRc*I6 zyHDF&)%MP4d-rM8{o0-nYP1Q0BlDrF~8|C++YkC+*AVdEn;4r_tU}l^o?mDPyF|MaC%q%qEf^ zzz8Yxw(a8U7rA3aFBeawmupE$n2o0FKBq~=*@-RV?ZDLNFF zS}a5br=zRhHeu%#f6Xsw_?#O(18%EU(BLdg`;s2kT{|L7PnI^LL#0wtDlm$O<=zTv zcdbg~5(e&>e3}*NiVFymQK4;dh^uFM%O?OgQ$sH8|Di+r9LqoE|y4=+fhiu#P)0$-7 zdAPt)7do#ys22c88iq ztnb6NYkQs!XMm3)l+2bHr`Nb9r0-&C7{5smwyAsf2Q<0TrRxrBMB&!L)x-Hs)Pwh^ zw5WctkkzMc!|=#gh4weEX6Grtkd>ynhHJy8c{JvtGTATwP=VFIa7iprIav#+Zi0O=YGc*^0+>Z5`vyr(xEq zF9+SODeI1D_?s$o`a$Qj5P$Qj5P$Qj5P$Qj5P_^&ZAHHD2#(dNY@3mZ&MjddW_5T=FFMPd z)v$*g?~?Iat#*(f{kMd%8pfgKH2tB2K{~d(gfp(>U^6}~9Cf}~Qb%|=KaPd?2WnO9 zTN_>MC*qtH$D~6Wk&)Iofo<&F*Tjx3PO7|$z{v-WNjLNt$zB^8#U63`jMs81!gWQz z7;nev>aoOAVqp?UY$QT-IE0hM%rmhZ9UfPFTOA*j)cXG^oROc!xjL=?|L*{v!Mgrw;7hvH28ORyP8ORyP8ORxU z-7qkbOwPv4eqXPFmQC;3Wiqp(E6ls>GV~dI|?2hH`0nF%swqv@RAbYga}x( zBn6(5Qc<`BYRCjGAsD1Cdq!CV&?Kb=MYtk1AtTE%)ZV2I&7QXfqqM zl?uwrgnJF%CY9Wzh{c2@u#=4?By$0e6!Nrg@vP3*H6=ljQOXd{zzmbsa-&f&5OrEb z+>N@Kjc?WpO@tl7x>|u@c9bm^sGb6bgKG$ijw!3^PQywm@CPA5T{{fIT0(e>MIl0I zE2l8vFs!M}!#Lr1#KiH1VK@av7e?>Fv4>#z1yqU^lHUfF(*OqzW^hqPuc(g literal 0 HcmV?d00001 diff --git a/04_detect_from_oneline_csv.py b/04_detect_from_oneline_csv.py index 9a3eeaf..1a0c448 100644 --- a/04_detect_from_oneline_csv.py +++ b/04_detect_from_oneline_csv.py @@ -43,64 +43,88 @@ class ManualFeatureSelector(TransformerMixin): return self def transform(self, X): - # -- corresponding values to these indices are -> ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy'] -- - return X.values[:,[3,26,36,20,2,25,50,49,55,16,48,34]] -# - Create the pipeline - + # -- corresponding values to these indices are : + + #['Characteristics', + #'DllCharacteristics', + #'SectionsMaxEntropy', + #'MajorSubsystemVersion', + #'Machine', + #'Subsystem', + #'ResourcesMaxEntropy', + #'ResourcesMinEntropy', + #'VersionInformationSize', + #'MajorOperatingSystemVersion', + #'ResourcesMeanEntropy', + #'SectionsMeanEntropy'] + + # ? X.transpose() + Y = X[:,[4,27,37,21,3,26,51,50,56,17,49,35]] + return Y -pipeline = Pipeline([ + +# =-=-=-=-=-=-= Pipelines =-=-=-=-=-=-= + +# - This pipeline select usefull features - +pipeline_features_selection = Pipeline([ ('features_remap', ManualFeatureSelector()), +]) + + +# - This pipeline use the imputer and scales the values - +pipeline = Pipeline([ ('imputer', SimpleImputer(strategy="median")), ('std_scaler', StandardScaler()), ]) + +# - Call the two upper pipelines - +def full_pipeline(data): + prepared = pipeline_features_selection.fit_transform(data) + #print("Full pipeline -> shape:", prepared.shape) + #print("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=", list(prepared)) + prepared = pipeline.fit_transform(prepared) + return prepared + +# =-=-=-=-=-=-= Load previously saved model =-=-=-=-=-=-= saved_model = joblib.load("models/malware_classifier_1.pkl") -def list_transformer(l): - l_t = [] - - l_t.append(l[3]) - l_t.append(l[26]) - l_t.append(l[36]) - l_t.append(l[20]) - l_t.append(l[2]) - l_t.append(l[25]) - l_t.append(l[50]) - l_t.append(l[49]) - l_t.append(l[55]) - l_t.append(l[16]) - l_t.append(l[48]) - l_t.append(l[34]) - - return l_t +# =-=-=-=-=-=-= Prediction core =-=-=-=-=-=-=-= +# - This function use the model and predict if it's a malware or not - +# - The file infos are given in numpy array type - +# - Use np.array([['info1', 'info2', 'infox']]) to build the 'line' variable - def predict_one_line(model,line): - - #X_unknown = pipeline.fit_transform(line) - X_unknown = list_transformer(line) - X_unknown = pd.DataFrame([X_unknown]) - X_unknown.columns = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', ' MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy'] + X_unknown = full_pipeline(line) + X_unknown_columns = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', ' MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy'] + X_unknown = pd.DataFrame(X_unknown, columns=X_unknown_columns) ans = model.predict(X_unknown) - ans_type = ['malicious', 'legitimate'] - #print(ans[0]) - #print("This file is: ", ans_type[ans[0]]) + #ans_type = ['malicious', 'legitimate'] return ans[0] + +# =-=-=-=-=-=-=-= Semi-auto prediction tester =-=-=-=-=-=-= +# - Takes each line of the dataset +# - Parse it as a numpy array +# - Send it to the prediction function (predict_one_line) +# - Compare the result with the expected value +# - Save the result +# - At the end, print the prediction accuracy result + res = [] -labels = [] -nb_malware_to_test = 2000 +nb_malware_to_test = 10 good_ans = 0 for i in range(nb_malware_to_test): - print(" =-=-=-= Prediction {} out of {} ({}%) [ ERT ~ {} min ] =-=-=-=".format(i, nb_malware_to_test, round((i/nb_malware_to_test)*100,1), round(((nb_malware_to_test-i)*1.2)/60,1))) + print(" =-=-=-= Prediction {} out of {} ({}%) [ ERT ~ {} min ] =-=-=-=".format(i, nb_malware_to_test, round((i/nb_malware_to_test)*100,1), round(((nb_malware_to_test-i)*1.2)/60,1))) features = file_to_test.values[i,] + features_list = features.tolist() + features_array = [features_list] + features = np.array(features_array) + res.append(predict_one_line(saved_model, features)) - labels.append(file_to_test.values[i,][56]) if res[i] == file_to_test.values[i,][56]: good_ans +=1 print(" ===> Got {} / {} good answers".format(good_ans, nb_malware_to_test)) l1 = file_to_test.values[2,] -#print(file_to_test.values[1,]) -print(l1) -#predict_one_line(saved_model, l1) -#predict_one_line(saved_model, file_to_test)