Corrected semi-auto accuracy tester

2020-04-04 22:20:02 +02:00 · 2020-04-04 22:20:02 +02:00 · 433b2a0957
commit 433b2a0957
parent a4236e68d0
2 changed files with 61 additions and 37 deletions
--- a/.04_detect_from_oneline_csv.py.swp
+++ b/.04_detect_from_oneline_csv.py.swp
--- a/04_detect_from_oneline_csv.py
+++ b/04_detect_from_oneline_csv.py
@@ -43,64 +43,88 @@ class ManualFeatureSelector(TransformerMixin):
        return self

    def transform(self, X):
-        # -- corresponding values to these indices are -> ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy'] -- 
-        return X.values[:,[3,26,36,20,2,25,50,49,55,16,48,34]]
-# - Create the pipeline - 
+        # -- corresponding values to these indices are : 
 	
-pipeline = Pipeline([
+	#['Characteristics', 
+	#'DllCharacteristics', 
+	#'SectionsMaxEntropy', 
+	#'MajorSubsystemVersion', 
+	#'Machine', 
+	#'Subsystem', 
+	#'ResourcesMaxEntropy', 
+	#'ResourcesMinEntropy', 
+	#'VersionInformationSize', 
+	#'MajorOperatingSystemVersion', 
+	#'ResourcesMeanEntropy', 
+	#'SectionsMeanEntropy'] 
+        
+	# ? X.transpose()   
+        Y = X[:,[4,27,37,21,3,26,51,50,56,17,49,35]] 
+        return Y 	 
+
+
+# =-=-=-=-=-=-= Pipelines =-=-=-=-=-=-=  
+
+# - This pipeline selects the useful features -
+pipeline_features_selection = Pipeline([
 	('features_remap', ManualFeatureSelector()), 
+])
+
+
+# - This pipeline uses the imputer and scales the values -
+pipeline = Pipeline([
 	('imputer', SimpleImputer(strategy="median")),
 	('std_scaler', StandardScaler()),
 ]) 

+
+# - Calls the two pipelines defined above, one after the other -
+def full_pipeline(data): 
+	prepared = pipeline_features_selection.fit_transform(data)
+	#print("Full pipeline -> shape:", prepared.shape)
+	#print("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=", list(prepared))   
+	prepared = pipeline.fit_transform(prepared) 
+	return prepared   
+
+# =-=-=-=-=-=-= Load previously saved model =-=-=-=-=-=-=
 saved_model = joblib.load("models/malware_classifier_1.pkl") 

-def list_transformer(l): 
-	l_t = [] 
-
-	l_t.append(l[3])	
-	l_t.append(l[26])	
-	l_t.append(l[36])	
-	l_t.append(l[20])	
-	l_t.append(l[2])	
-	l_t.append(l[25])	
-	l_t.append(l[50])	
-	l_t.append(l[49])	
-	l_t.append(l[55])	
-	l_t.append(l[16])	
-	l_t.append(l[48])	
-	l_t.append(l[34])
-	
-	return l_t 	
+# =-=-=-=-=-=-= Prediction core =-=-=-=-=-=-=-= 
+# - This function uses the model to predict whether a file is malware or not -
+# - The file's features are passed as a numpy array -
+# - Use np.array([['info1', 'info2', 'infox']]) to build the 'line' variable -
 
 def predict_one_line(model,line): 
-
-	#X_unknown = pipeline.fit_transform(line) 
-	X_unknown = list_transformer(line) 
-	X_unknown = pd.DataFrame([X_unknown]) 
-	X_unknown.columns = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy',     'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', '    MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy']  
+	X_unknown = full_pipeline(line) 
+	X_unknown_columns = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy',     'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', '    MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy']  
+	X_unknown = pd.DataFrame(X_unknown, columns=X_unknown_columns) 
 	ans = model.predict(X_unknown) 
-	ans_type = ['malicious', 'legitimate']
-	#print(ans[0])  
-	#print("This file is: ", ans_type[ans[0]]) 
+	#ans_type = ['malicious', 'legitimate']
 	return ans[0] 

+
+# =-=-=-=-=-=-=-= Semi-auto prediction tester =-=-=-=-=-=-= 
+# - Take each line of the dataset
+# - Parse it as a numpy array
+# - Send it to the prediction function (predict_one_line)
+# - Compare the result with the expected value
+# - Save the result
+# - At the end, print the prediction accuracy
+
 res = []
-labels = [] 
-nb_malware_to_test = 2000 
+nb_malware_to_test = 10 
 good_ans = 0 
 for i in range(nb_malware_to_test): 
 	print(" =-=-=-= Prediction {} out of {} ({}%) [ ERT ~ {} min ] =-=-=-=".format(i, nb_malware_to_test, round((i/nb_malware_to_test)*100,1), round(((nb_malware_to_test-i)*1.2)/60,1)))
 	features = file_to_test.values[i,]
+	features_list = features.tolist()
+	features_array = [features_list]
+	features = np.array(features_array) 	
+	
 	res.append(predict_one_line(saved_model, features)) 
-	labels.append(file_to_test.values[i,][56]) 
 	if res[i] == file_to_test.values[i,][56]: 
 		good_ans +=1 
 	
 print(" ===> Got {} / {} good answers".format(good_ans, nb_malware_to_test))  

 l1 = file_to_test.values[2,] 
-#print(file_to_test.values[1,]) 
-print(l1)  
-#predict_one_line(saved_model, l1) 
-#predict_one_line(saved_model, file_to_test)