diff --git a/.04_detect_from_oneline_csv.py.swp b/.04_detect_from_oneline_csv.py.swp new file mode 100644 index 0000000..a22b046 Binary files /dev/null and b/.04_detect_from_oneline_csv.py.swp differ diff --git a/04_detect_from_oneline_csv.py b/04_detect_from_oneline_csv.py index 9a3eeaf..1a0c448 100644 --- a/04_detect_from_oneline_csv.py +++ b/04_detect_from_oneline_csv.py @@ -43,64 +43,88 @@ class ManualFeatureSelector(TransformerMixin): return self def transform(self, X): - # -- corresponding values to these indices are -> ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy'] -- - return X.values[:,[3,26,36,20,2,25,50,49,55,16,48,34]] -# - Create the pipeline - + # -- corresponding values to these indices are : + + #['Characteristics', + #'DllCharacteristics', + #'SectionsMaxEntropy', + #'MajorSubsystemVersion', + #'Machine', + #'Subsystem', + #'ResourcesMaxEntropy', + #'ResourcesMinEntropy', + #'VersionInformationSize', + #'MajorOperatingSystemVersion', + #'ResourcesMeanEntropy', + #'SectionsMeanEntropy'] + + # ? X.transpose() + Y = X[:,[4,27,37,21,3,26,51,50,56,17,49,35]] + return Y -pipeline = Pipeline([ + +# =-=-=-=-=-=-= Pipelines =-=-=-=-=-=-= + +# - This pipeline select usefull features - +pipeline_features_selection = Pipeline([ ('features_remap', ManualFeatureSelector()), +]) + + +# - This pipeline use the imputer and scales the values - +pipeline = Pipeline([ ('imputer', SimpleImputer(strategy="median")), ('std_scaler', StandardScaler()), ]) + +# - Call the two upper pipelines - +def full_pipeline(data): + prepared = pipeline_features_selection.fit_transform(data) + #print("Full pipeline -> shape:", prepared.shape) + #print("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=", list(prepared)) + prepared = pipeline.fit_transform(prepared) + return prepared + +# =-=-=-=-=-=-= Load previously saved model =-=-=-=-=-=-= saved_model = joblib.load("models/malware_classifier_1.pkl") -def list_transformer(l): - l_t = [] - - l_t.append(l[3]) - l_t.append(l[26]) - l_t.append(l[36]) - l_t.append(l[20]) - l_t.append(l[2]) - l_t.append(l[25]) - l_t.append(l[50]) - l_t.append(l[49]) - l_t.append(l[55]) - l_t.append(l[16]) - l_t.append(l[48]) - l_t.append(l[34]) - - return l_t +# =-=-=-=-=-=-= Prediction core =-=-=-=-=-=-=-= +# - This function use the model and predict if it's a malware or not - +# - The file infos are given in numpy array type - +# - Use np.array([['info1', 'info2', 'infox']]) to build the 'line' variable - def predict_one_line(model,line): - - #X_unknown = pipeline.fit_transform(line) - X_unknown = list_transformer(line) - X_unknown = pd.DataFrame([X_unknown]) - X_unknown.columns = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', ' MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy'] + X_unknown = full_pipeline(line) + X_unknown_columns = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', ' MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy'] + X_unknown = pd.DataFrame(X_unknown, columns=X_unknown_columns) ans = model.predict(X_unknown) - ans_type = ['malicious', 'legitimate'] - #print(ans[0]) - #print("This file is: ", ans_type[ans[0]]) + #ans_type = ['malicious', 'legitimate'] return ans[0] + +# =-=-=-=-=-=-=-= Semi-auto prediction tester =-=-=-=-=-=-= +# - Takes each line of the dataset +# - Parse it as a numpy array +# - Send it to the prediction function (predict_one_line) +# - Compare the result with the expected value +# - Save the result +# - At the end, print the prediction accuracy result + res = [] -labels = [] -nb_malware_to_test = 2000 +nb_malware_to_test = 10 good_ans = 0 for i in range(nb_malware_to_test): - print(" =-=-=-= Prediction {} out of {} ({}%) [ ERT ~ {} min ] =-=-=-=".format(i, nb_malware_to_test, round((i/nb_malware_to_test)*100,1), round(((nb_malware_to_test-i)*1.2)/60,1))) + print(" =-=-=-= Prediction {} out of {} ({}%) [ ERT ~ {} min ] =-=-=-=".format(i, nb_malware_to_test, round((i/nb_malware_to_test)*100,1), round(((nb_malware_to_test-i)*1.2)/60,1))) features = file_to_test.values[i,] + features_list = features.tolist() + features_array = [features_list] + features = np.array(features_array) + res.append(predict_one_line(saved_model, features)) - labels.append(file_to_test.values[i,][56]) if res[i] == file_to_test.values[i,][56]: good_ans +=1 print(" ===> Got {} / {} good answers".format(good_ans, nb_malware_to_test)) l1 = file_to_test.values[2,] -#print(file_to_test.values[1,]) -print(l1) -#predict_one_line(saved_model, l1) -#predict_one_line(saved_model, file_to_test)