Corrected semi-auto accuracy tester
This commit is contained in:
parent
a4236e68d0
commit
433b2a0957
BIN
.04_detect_from_oneline_csv.py.swp
Normal file
BIN
.04_detect_from_oneline_csv.py.swp
Normal file
Binary file not shown.
@ -43,64 +43,88 @@ class ManualFeatureSelector(TransformerMixin):
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
def transform(self, X):
|
def transform(self, X):
|
||||||
# -- corresponding values to these indices are -> ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy'] --
|
# -- the column names corresponding to these indices are:
|
||||||
return X.values[:,[3,26,36,20,2,25,50,49,55,16,48,34]]
|
|
||||||
# - Create the pipeline -
|
|
||||||
|
|
||||||
pipeline = Pipeline([
|
#['Characteristics',
|
||||||
|
#'DllCharacteristics',
|
||||||
|
#'SectionsMaxEntropy',
|
||||||
|
#'MajorSubsystemVersion',
|
||||||
|
#'Machine',
|
||||||
|
#'Subsystem',
|
||||||
|
#'ResourcesMaxEntropy',
|
||||||
|
#'ResourcesMinEntropy',
|
||||||
|
#'VersionInformationSize',
|
||||||
|
#'MajorOperatingSystemVersion',
|
||||||
|
#'ResourcesMeanEntropy',
|
||||||
|
#'SectionsMeanEntropy']
|
||||||
|
|
||||||
|
# ? X.transpose()
|
||||||
|
Y = X[:,[4,27,37,21,3,26,51,50,56,17,49,35]]
|
||||||
|
return Y
|
||||||
|
|
||||||
|
|
||||||
|
# =-=-=-=-=-=-= Pipelines =-=-=-=-=-=-=
|
||||||
|
|
||||||
|
# - This pipeline selects useful features -
|
||||||
|
pipeline_features_selection = Pipeline([
|
||||||
('features_remap', ManualFeatureSelector()),
|
('features_remap', ManualFeatureSelector()),
|
||||||
|
])
|
||||||
|
|
||||||
|
|
||||||
|
# - This pipeline uses the imputer and scales the values -
|
||||||
|
pipeline = Pipeline([
|
||||||
('imputer', SimpleImputer(strategy="median")),
|
('imputer', SimpleImputer(strategy="median")),
|
||||||
('std_scaler', StandardScaler()),
|
('std_scaler', StandardScaler()),
|
||||||
])
|
])
|
||||||
|
|
||||||
|
|
||||||
|
# - Call the two pipelines above -
|
||||||
|
def full_pipeline(data):
|
||||||
|
prepared = pipeline_features_selection.fit_transform(data)
|
||||||
|
#print("Full pipeline -> shape:", prepared.shape)
|
||||||
|
#print("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=", list(prepared))
|
||||||
|
prepared = pipeline.fit_transform(prepared)
|
||||||
|
return prepared
|
||||||
|
|
||||||
|
# =-=-=-=-=-=-= Load previously saved model =-=-=-=-=-=-=
|
||||||
saved_model = joblib.load("models/malware_classifier_1.pkl")
|
saved_model = joblib.load("models/malware_classifier_1.pkl")
|
||||||
|
|
||||||
def list_transformer(l):
|
# =-=-=-=-=-=-= Prediction core =-=-=-=-=-=-=-=
|
||||||
l_t = []
|
# - This function uses the model to predict whether it's malware or not -
|
||||||
|
# - The file info is given as a numpy array -
|
||||||
l_t.append(l[3])
|
# - Use np.array([['info1', 'info2', 'infox']]) to build the 'line' variable -
|
||||||
l_t.append(l[26])
|
|
||||||
l_t.append(l[36])
|
|
||||||
l_t.append(l[20])
|
|
||||||
l_t.append(l[2])
|
|
||||||
l_t.append(l[25])
|
|
||||||
l_t.append(l[50])
|
|
||||||
l_t.append(l[49])
|
|
||||||
l_t.append(l[55])
|
|
||||||
l_t.append(l[16])
|
|
||||||
l_t.append(l[48])
|
|
||||||
l_t.append(l[34])
|
|
||||||
|
|
||||||
return l_t
|
|
||||||
|
|
||||||
def predict_one_line(model,line):
|
def predict_one_line(model,line):
|
||||||
|
X_unknown = full_pipeline(line)
|
||||||
#X_unknown = pipeline.fit_transform(line)
|
X_unknown_columns = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', ' MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy']
|
||||||
X_unknown = list_transformer(line)
|
X_unknown = pd.DataFrame(X_unknown, columns=X_unknown_columns)
|
||||||
X_unknown = pd.DataFrame([X_unknown])
|
|
||||||
X_unknown.columns = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', ' MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy']
|
|
||||||
ans = model.predict(X_unknown)
|
ans = model.predict(X_unknown)
|
||||||
ans_type = ['malicious', 'legitimate']
|
#ans_type = ['malicious', 'legitimate']
|
||||||
#print(ans[0])
|
|
||||||
#print("This file is: ", ans_type[ans[0]])
|
|
||||||
return ans[0]
|
return ans[0]
|
||||||
|
|
||||||
|
|
||||||
|
# =-=-=-=-=-=-=-= Semi-auto prediction tester =-=-=-=-=-=-=
|
||||||
|
# - Takes each line of the dataset
|
||||||
|
# - Parse it as a numpy array
|
||||||
|
# - Send it to the prediction function (predict_one_line)
|
||||||
|
# - Compare the result with the expected value
|
||||||
|
# - Save the result
|
||||||
|
# - At the end, print the prediction accuracy result
|
||||||
|
|
||||||
res = []
|
res = []
|
||||||
labels = []
|
nb_malware_to_test = 10
|
||||||
nb_malware_to_test = 2000
|
|
||||||
good_ans = 0
|
good_ans = 0
|
||||||
for i in range(nb_malware_to_test):
|
for i in range(nb_malware_to_test):
|
||||||
print(" =-=-=-= Prediction {} out of {} ({}%) [ ERT ~ {} min ] =-=-=-=".format(i, nb_malware_to_test, round((i/nb_malware_to_test)*100,1), round(((nb_malware_to_test-i)*1.2)/60,1)))
|
print(" =-=-=-= Prediction {} out of {} ({}%) [ ERT ~ {} min ] =-=-=-=".format(i, nb_malware_to_test, round((i/nb_malware_to_test)*100,1), round(((nb_malware_to_test-i)*1.2)/60,1)))
|
||||||
features = file_to_test.values[i,]
|
features = file_to_test.values[i,]
|
||||||
|
features_list = features.tolist()
|
||||||
|
features_array = [features_list]
|
||||||
|
features = np.array(features_array)
|
||||||
|
|
||||||
res.append(predict_one_line(saved_model, features))
|
res.append(predict_one_line(saved_model, features))
|
||||||
labels.append(file_to_test.values[i,][56])
|
|
||||||
if res[i] == file_to_test.values[i,][56]:
|
if res[i] == file_to_test.values[i,][56]:
|
||||||
good_ans +=1
|
good_ans +=1
|
||||||
|
|
||||||
print(" ===> Got {} / {} good answers".format(good_ans, nb_malware_to_test))
|
print(" ===> Got {} / {} good answers".format(good_ans, nb_malware_to_test))
|
||||||
|
|
||||||
l1 = file_to_test.values[2,]
|
l1 = file_to_test.values[2,]
|
||||||
#print(file_to_test.values[1,])
|
|
||||||
print(l1)
|
|
||||||
#predict_one_line(saved_model, l1)
|
|
||||||
#predict_one_line(saved_model, file_to_test)
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user