diff --git a/01_data_analysis.py b/01_data_analysis.py index 9008b8c..5be4daf 100644 --- a/01_data_analysis.py +++ b/01_data_analysis.py @@ -20,7 +20,7 @@ from sklearn.ensemble import ExtraTreesClassifier from sklearn.feature_selection import SelectFromModel print("Loading dataset in memory [ ... ]") -files = pd.read_csv('dataset_clean.txt',delimiter=',', low_memory=False) +files = pd.read_csv('dataset/dataset_clean.txt',delimiter=',', low_memory=False) print("Loading dataset in memory [ DONE ]") print("Dataset basic infos:") diff --git a/02_draft_preparation_work.py b/02_draft_preparation_work.py index 9dce853..3782d80 100644 --- a/02_draft_preparation_work.py +++ b/02_draft_preparation_work.py @@ -20,7 +20,7 @@ from sklearn.ensemble import ExtraTreesClassifier from sklearn.feature_selection import SelectFromModel print("Loading dataset in memory [ ... ]") -files = pd.read_csv('dataset_clean.txt',delimiter=',', low_memory=False) +files = pd.read_csv('dataset/dataset_clean.txt',delimiter=',', low_memory=False) print("Loading dataset in memory [ DONE ]") # =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-= diff --git a/04_detect_from_oneline_csv.py b/04_detect_from_oneline_csv.py index d9ea5d1..7443f5c 100644 --- a/04_detect_from_oneline_csv.py +++ b/04_detect_from_oneline_csv.py @@ -121,18 +121,18 @@ def predict_one_line(model,line): # - At the end, print the prediction accuracy result res = [] -#nb_malware_to_test = 50 -nb_malware_to_test = 34199 +nb_malware_to_test = 50 +#nb_malware_to_test = 34199 good_ans = 0 -for i in range(34179,nb_malware_to_test): -#for i in range(nb_malware_to_test): +#for i in range(34179,nb_malware_to_test): +for i in range(nb_malware_to_test): print(" =-=-=-= Prediction {} out of {} ({}%) [ ERT ~ {} min ] =-=-=-=".format(i, nb_malware_to_test, round((i/nb_malware_to_test)*100,1), round(((nb_malware_to_test-i)*1.2)/60,1))) features = file_to_test.values[i,] features_list = features.tolist() features_array = [features_list] features = np.array(features_array) res.append(predict_one_line(saved_model, features)) - if res[i-34179] == file_to_test.values[i,][54]: + if res[i] == file_to_test.values[i,][54]: good_ans +=1 print(features) print(res) diff --git a/06_extract_features_and_predict.py b/06_extract_features_and_predict.py index 4a5689e..51d1fbd 100644 --- a/06_extract_features_and_predict.py +++ b/06_extract_features_and_predict.py @@ -199,8 +199,8 @@ def predict_from_features(features, model): X_unknown = features_numpy X_unknown_columns = selected_features X_unknown = pd.DataFrame(X_unknown, columns=X_unknown_columns) - #ans = model.predict(X_unknown) - ans = model.predict([features]) + ans = model.predict(X_unknown) + #ans = model.predict([features]) return ans[0] if __name__ == "__main__": diff --git a/tenamortech_malware_scanner_website_ihm.mp4 b/tenamortech_malware_scanner_website_ihm.mp4 new file mode 100644 index 0000000..cc62282 Binary files /dev/null and b/tenamortech_malware_scanner_website_ihm.mp4 differ