Corrected semi-auto accuracy tester

2020-04-04 22:20:02 +02:00 · 2020-04-04 22:20:02 +02:00 · 433b2a0957
commit 433b2a0957
parent a4236e68d0
2 changed files with 61 additions and 37 deletions
--- a/.04_detect_from_oneline_csv.py.swp
+++ b/.04_detect_from_oneline_csv.py.swp
--- a/04_detect_from_oneline_csv.py
+++ b/04_detect_from_oneline_csv.py
@ -43,64 +43,88 @@ class ManualFeatureSelector(TransformerMixin):
        return self
    def transform(self, X):
-        # -- corresponding values to these indices are -> ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy'] -- 
+        # -- corresponding values to these indices are : 
        return X.values[:,[3,26,36,20,2,25,50,49,55,16,48,34]]
 # - Create the pipeline - 
-pipeline = Pipeline([
+	#['Characteristics', 
 	#'DllCharacteristics', 
 	#'SectionsMaxEntropy', 
 	#'MajorSubsystemVersion', 
 	#'Machine', 
 	#'Subsystem', 
 	#'ResourcesMaxEntropy', 
 	#'ResourcesMinEntropy', 
 	#'VersionInformationSize', 
 	#'MajorOperatingSystemVersion', 
 	#'ResourcesMeanEntropy', 
 	#'SectionsMeanEntropy'] 
 	# ? X.transpose()   
        Y = X[:,[4,27,37,21,3,26,51,50,56,17,49,35]] 
        return Y 	 
 # =-=-=-=-=-=-= Pipelines =-=-=-=-=-=-=  
 # - This pipeline selects useful features -
 pipeline_features_selection = Pipeline([
 	('features_remap', ManualFeatureSelector()), 
 ])
 # - This pipeline uses the imputer and scales the values - 
 pipeline = Pipeline([
 	('imputer', SimpleImputer(strategy="median")),
 	('std_scaler', StandardScaler()),
 ]) 
 # - Calls the two pipelines above - 
 def full_pipeline(data): 
 	prepared = pipeline_features_selection.fit_transform(data)
 	#print("Full pipeline -> shape:", prepared.shape)
 	#print("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=", list(prepared))   
 	prepared = pipeline.fit_transform(prepared) 
 	return prepared   
 # =-=-=-=-=-=-= Load previously saved model =-=-=-=-=-=-=
 saved_model = joblib.load("models/malware_classifier_1.pkl") 
-def list_transformer(l): 
+# =-=-=-=-=-=-= Prediction core =-=-=-=-=-=-=-= 
-	l_t = [] 
+# - This function uses the model and predicts whether the file is malware or not -
-
+# - The file information is given as a numpy array - 
-	l_t.append(l[3])	
+# - Use np.array([['info1', 'info2', 'infox']]) to build the 'line' variable -
 	l_t.append(l[26])	
 	l_t.append(l[36])	
 	l_t.append(l[20])	
 	l_t.append(l[2])	
 	l_t.append(l[25])	
 	l_t.append(l[50])	
 	l_t.append(l[49])	
 	l_t.append(l[55])	
 	l_t.append(l[16])	
 	l_t.append(l[48])	
 	l_t.append(l[34])
 	return l_t 	
 def predict_one_line(model,line): 
-
+	X_unknown = full_pipeline(line) 
-	#X_unknown = pipeline.fit_transform(line) 
+	X_unknown_columns = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy',     'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', '    MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy']  
-	X_unknown = list_transformer(line) 
+	X_unknown = pd.DataFrame(X_unknown, columns=X_unknown_columns) 
 	X_unknown = pd.DataFrame([X_unknown]) 
 	X_unknown.columns = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy',     'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', '    MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy']  
 	ans = model.predict(X_unknown) 
-	ans_type = ['malicious', 'legitimate']
+	#ans_type = ['malicious', 'legitimate']
 	#print(ans[0])  
 	#print("This file is: ", ans_type[ans[0]]) 
 	return ans[0] 
 # =-=-=-=-=-=-=-= Semi-auto prediction tester =-=-=-=-=-=-= 
 # - Takes each line of the dataset
 # - Parse it as a numpy array 
 # - Send it to the prediction function (predict_one_line) 
 # - Compare the result with the expected value 
 # - Save the result 
 # - At the end, print the prediction accuracy result 
 res = []
-labels = [] 
+nb_malware_to_test = 10 
 nb_malware_to_test = 2000 
 good_ans = 0 
 for i in range(nb_malware_to_test): 
 	print(" =-=-=-= Prediction {} out of {} ({}%) [ ERT ~ {} min ] =-=-=-=".format(i, nb_malware_to_test, round((i/nb_malware_to_test)*100,1), round(((nb_malware_to_test-i)*1.2)/60,1)))
 	features = file_to_test.values[i,]
 	features_list = features.tolist()
 	features_array = [features_list]
 	features = np.array(features_array) 	
 	res.append(predict_one_line(saved_model, features)) 
 	labels.append(file_to_test.values[i,][56]) 
 	if res[i] == file_to_test.values[i,][56]: 
 		good_ans +=1 
 print(" ===> Got {} / {} good answers".format(good_ans, nb_malware_to_test))  
 l1 = file_to_test.values[2,] 
 #print(file_to_test.values[1,]) 
 print(l1)  
 #predict_one_line(saved_model, l1) 
 #predict_one_line(saved_model, file_to_test)