From d1f3068c0443f2be209b3ff8a07bfc230d4ffdfa Mon Sep 17 00:00:00 2001
From: valentin <poubelle@romanet.fr>
Date: Mon, 6 Apr 2020 18:57:10 +0200
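Subject: [PATCH] After Cleaning

Clean-up pass over the analysis, training and checking scripts:

- 01_data_analysis.py: remove the trailing train/test split.
- 02_draft_preparation_work.py: stratify the split on "legitimate"
  instead of a temporary "Machine_cat" column, and drop the code that
  created and removed that column.
- 03_2_the_ml.py: comment out saving the winning model and drop the
  leading newline from the winner message.
- 07_check_files.sh, 08_check_files2.sh: scan files under
  /home/ubuntu/storage/malware instead of the previous directories.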

---
 01_data_analysis.py          | 10 ----------
 02_draft_preparation_work.py |  9 +--------
 03_2_the_ml.py               |  8 ++++----
 07_check_files.sh            |  2 +-
 08_check_files2.sh           |  2 +-
 5 files changed, 7 insertions(+), 24 deletions(-)

diff --git a/01_data_analysis.py b/01_data_analysis.py
index fdc83fd..9008b8c 100644
--- a/01_data_analysis.py
+++ b/01_data_analysis.py
@@ -33,13 +33,3 @@ print("Generating the correlation matrix [ DONE ]")
 print("Correlation matrix:") 
 print(corr_matrix['legitimate'].sort_values(ascending=False))
 
-# =-=-=-=-=-=-=-= Data Analysis =-=-=-=-=-=-= 
-# ==== Split dataset (train/test) ==== 
-
-print("Splitting dataset (train/test) [idistribution -> 20%] [ ... ]")
-train_set, test_set = train_test_split(files, test_size=0.2, random_state=42) 
-print("Splitting dataset (train/test) [idistribution -> 20%] [ DONE ]")
-
-
-
- 
diff --git a/02_draft_preparation_work.py b/02_draft_preparation_work.py
index 3601ad6..9dce853 100644
--- a/02_draft_preparation_work.py
+++ b/02_draft_preparation_work.py
@@ -26,17 +26,12 @@ print("Loading dataset in memory [ DONE ]")
 # =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-= 
 # ==== Split DataSet again but lets stratify with Machine feature ==== 
 # - Add tmp cat in order to be able to stratify data while splitting it 
-files["Machine_cat"] = pd.cut(files["Machine"], bins=[0., 30000., np.inf], labels=[1,2]) 
 
 split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42) 
-for train_index, test_index in split.split(files, files["Machine_cat"]): 
+for train_index, test_index in split.split(files, files["legitimate"]): 
 	strat_train_set = files.loc[train_index]  
 	strat_test_set = files.loc[test_index] 
 
-# - Remove tmp created cat, now the data is splitted, we dont need it anymore 
-for set_ in (strat_train_set, strat_test_set): 
-	set_.drop("Machine_cat", axis=1, inplace=True)
- 
 files = strat_train_set.copy() 
 # ==== Drop useless features ====
 
@@ -112,5 +107,3 @@ for f in range(nb_features):
 files_shorted_features = files_prepared[features_to_keep]
 #print(files_shorted_features.describe()) 
 
-
- 
diff --git a/03_2_the_ml.py b/03_2_the_ml.py
index 4754967..d863be0 100644
--- a/03_2_the_ml.py
+++ b/03_2_the_ml.py
@@ -117,10 +117,10 @@ for algo in algos:
     results[algo] = score
 
 winner = max(results, key=results.get)
-print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]*100)) 
+print('Winner algorithm is %s with a %f %% success' % (winner, results[winner]*100)) 
 
 # =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-= 
-print("Saving the model [ ... ]")
-joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_5.pkl") 
-print("Saving the model [ DONE ]")
+#print("Saving the model [ ... ]")
+#joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_5.pkl") 
+#print("Saving the model [ DONE ]")
  
diff --git a/07_check_files.sh b/07_check_files.sh
index b0de082..a277b6a 100755
--- a/07_check_files.sh
+++ b/07_check_files.sh
@@ -2,7 +2,7 @@
 result=[] 
 i=0
 j=0
-for filename in /dev/shm/VirusShare_*; do
+for filename in /home/ubuntu/storage/malware/VirusShare_*; do
 	result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py $filename)" 
 	python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename" 
 	if [ $result == "1" ]
diff --git a/08_check_files2.sh b/08_check_files2.sh
index 85b2129..46a9474 100755
--- a/08_check_files2.sh
+++ b/08_check_files2.sh
@@ -2,7 +2,7 @@
 result=[] 
 i=0
 j=0
-for filename in /home/ubuntu/removeme_exefiles/*.exe; do 
+for filename in /home/ubuntu/storage/malware/*.exe; do 
 	echo "$filename" 
 	result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py $filename)" 
 	python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename"