Corrected semi-auto accuracy tester

This commit is contained in:
valentin 2020-04-04 22:20:02 +02:00
parent a4236e68d0
commit 433b2a0957
2 changed files with 61 additions and 37 deletions

Binary file not shown.

View File

@ -43,64 +43,88 @@ class ManualFeatureSelector(TransformerMixin):
return self return self
def transform(self, X): def transform(self, X):
# -- corresponding values to these indices are -> ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy'] -- # -- corresponding values to these indices are :
return X.values[:,[3,26,36,20,2,25,50,49,55,16,48,34]]
# - Create the pipeline - #['Characteristics',
#'DllCharacteristics',
#'SectionsMaxEntropy',
#'MajorSubsystemVersion',
#'Machine',
#'Subsystem',
#'ResourcesMaxEntropy',
#'ResourcesMinEntropy',
#'VersionInformationSize',
#'MajorOperatingSystemVersion',
#'ResourcesMeanEntropy',
#'SectionsMeanEntropy']
# ? X.transpose()
Y = X[:,[4,27,37,21,3,26,51,50,56,17,49,35]]
return Y
pipeline = Pipeline([
# =-=-=-=-=-=-= Pipelines =-=-=-=-=-=-=
# - This pipeline selects the useful features -
pipeline_features_selection = Pipeline([
('features_remap', ManualFeatureSelector()), ('features_remap', ManualFeatureSelector()),
])
# - This pipeline imputes missing values (median) and standardizes the features -
pipeline = Pipeline([
('imputer', SimpleImputer(strategy="median")), ('imputer', SimpleImputer(strategy="median")),
('std_scaler', StandardScaler()), ('std_scaler', StandardScaler()),
]) ])
# - Run the two pipelines above, one after the other -
def full_pipeline(data):
    """Run feature selection, then imputation and scaling, on ``data``.

    ``data`` is first passed through ``pipeline_features_selection``; the
    result of that stage is passed through ``pipeline`` and the output of
    the second stage is returned.

    NOTE(review): both stages are invoked with ``fit_transform``, so they
    are re-fit on whatever ``data`` is passed in on every call -- confirm
    this is intended when ``data`` is a single row.
    """
    selected_features = pipeline_features_selection.fit_transform(data)
    return pipeline.fit_transform(selected_features)
# =-=-=-=-=-=-= Load previously saved model =-=-=-=-=-=-=
saved_model = joblib.load("models/malware_classifier_1.pkl") saved_model = joblib.load("models/malware_classifier_1.pkl")
def list_transformer(l):
    """Extract the twelve feature values the classifier uses from one row.

    Args:
        l: An indexable row of raw values; positions up to 55 are read.

    Returns:
        A new list holding the values found at positions 3, 26, 36, 20, 2,
        25, 50, 49, 55, 16, 48 and 34 of ``l``, in that order.

    Raises:
        IndexError: If ``l`` is a sequence with fewer than 56 elements.
    """
    # Order matters: it has to line up with the column labels that
    # predict_one_line assigns ('Characteristics', 'DllCharacteristics', ...).
    feature_positions = (3, 26, 36, 20, 2, 25, 50, 49, 55, 16, 48, 34)
    return [l[position] for position in feature_positions]


# =-=-=-=-=-=-= Prediction core =-=-=-=-=-=-=-=
# - predict_one_line uses the model to predict whether a file is malware or not -
# - The file info is given as a numpy array -
# - Use np.array([['info1', 'info2', 'infox']]) to build the 'line' variable -
def predict_one_line(model,line): def predict_one_line(model,line):
X_unknown = full_pipeline(line)
#X_unknown = pipeline.fit_transform(line) X_unknown_columns = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', ' MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy']
X_unknown = list_transformer(line) X_unknown = pd.DataFrame(X_unknown, columns=X_unknown_columns)
X_unknown = pd.DataFrame([X_unknown])
X_unknown.columns = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', ' MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy']
ans = model.predict(X_unknown) ans = model.predict(X_unknown)
ans_type = ['malicious', 'legitimate'] #ans_type = ['malicious', 'legitimate']
#print(ans[0])
#print("This file is: ", ans_type[ans[0]])
return ans[0] return ans[0]
# =-=-=-=-=-=-=-= Semi-auto prediction tester =-=-=-=-=-=-=
# - Takes each line of the dataset
# - Parse it as a numpy array
# - Send it to the prediction function (predict_one_line)
# - Compare the result with the expected value
# - Save the result
# - At the end, print the prediction accuracy result
res = [] res = []
labels = [] nb_malware_to_test = 10
nb_malware_to_test = 2000
good_ans = 0 good_ans = 0
for i in range(nb_malware_to_test): for i in range(nb_malware_to_test):
print(" =-=-=-= Prediction {} out of {} ({}%) [ ERT ~ {} min ] =-=-=-=".format(i, nb_malware_to_test, round((i/nb_malware_to_test)*100,1), round(((nb_malware_to_test-i)*1.2)/60,1))) print(" =-=-=-= Prediction {} out of {} ({}%) [ ERT ~ {} min ] =-=-=-=".format(i, nb_malware_to_test, round((i/nb_malware_to_test)*100,1), round(((nb_malware_to_test-i)*1.2)/60,1)))
features = file_to_test.values[i,] features = file_to_test.values[i,]
features_list = features.tolist()
features_array = [features_list]
features = np.array(features_array)
res.append(predict_one_line(saved_model, features)) res.append(predict_one_line(saved_model, features))
labels.append(file_to_test.values[i,][56])
if res[i] == file_to_test.values[i,][56]: if res[i] == file_to_test.values[i,][56]:
good_ans +=1 good_ans +=1
print(" ===> Got {} / {} good answers".format(good_ans, nb_malware_to_test)) print(" ===> Got {} / {} good answers".format(good_ans, nb_malware_to_test))
l1 = file_to_test.values[2,] l1 = file_to_test.values[2,]
#print(file_to_test.values[1,])
print(l1)
#predict_one_line(saved_model, l1)
#predict_one_line(saved_model, file_to_test)