From d1f3068c0443f2be209b3ff8a07bfc230d4ffdfa Mon Sep 17 00:00:00 2001 From: valentin Date: Mon, 6 Apr 2020 18:57:10 +0200 Subject: [PATCH] After Cleaning --- 01_data_analysis.py | 10 ---------- 02_draft_preparation_work.py | 9 +-------- 03_2_the_ml.py | 8 ++++---- 07_check_files.sh | 2 +- 08_check_files2.sh | 2 +- 5 files changed, 7 insertions(+), 24 deletions(-) diff --git a/01_data_analysis.py b/01_data_analysis.py index fdc83fd..9008b8c 100644 --- a/01_data_analysis.py +++ b/01_data_analysis.py @@ -33,13 +33,3 @@ print("Generating the correlation matrix [ DONE ]") print("Correlation matrix:") print(corr_matrix['legitimate'].sort_values(ascending=False)) -# =-=-=-=-=-=-=-= Data Analysis =-=-=-=-=-=-= -# ==== Split dataset (train/test) ==== - -print("Splitting dataset (train/test) [idistribution -> 20%] [ ... ]") -train_set, test_set = train_test_split(files, test_size=0.2, random_state=42) -print("Splitting dataset (train/test) [idistribution -> 20%] [ DONE ]") - - - - diff --git a/02_draft_preparation_work.py b/02_draft_preparation_work.py index 3601ad6..9dce853 100644 --- a/02_draft_preparation_work.py +++ b/02_draft_preparation_work.py @@ -26,17 +26,12 @@ print("Loading dataset in memory [ DONE ]") # =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-= # ==== Split DataSet again but lets stratify with Machine feature ==== # - Add tmp cat in order to be able to stratify data while splitting it -files["Machine_cat"] = pd.cut(files["Machine"], bins=[0., 30000., np.inf], labels=[1,2]) split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42) -for train_index, test_index in split.split(files, files["Machine_cat"]): +for train_index, test_index in split.split(files, files["legitimate"]): strat_train_set = files.loc[train_index] strat_test_set = files.loc[test_index] -# - Remove tmp created cat, now the data is splitted, we dont need it anymore -for set_ in (strat_train_set, strat_test_set): - set_.drop("Machine_cat", axis=1, inplace=True) - files = strat_train_set.copy() # ==== Drop useless features ==== @@ -112,5 +107,3 @@ for f in range(nb_features): files_shorted_features = files_prepared[features_to_keep] #print(files_shorted_features.describe()) - - diff --git a/03_2_the_ml.py b/03_2_the_ml.py index 4754967..d863be0 100644 --- a/03_2_the_ml.py +++ b/03_2_the_ml.py @@ -117,10 +117,10 @@ for algo in algos: results[algo] = score winner = max(results, key=results.get) -print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]*100)) +print('Winner algorithm is %s with a %f %% success' % (winner, results[winner]*100)) # =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-= -print("Saving the model [ ... ]") -joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_5.pkl") -print("Saving the model [ DONE ]") +#print("Saving the model [ ... ]") +#joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_5.pkl") +#print("Saving the model [ DONE ]") diff --git a/07_check_files.sh b/07_check_files.sh index b0de082..a277b6a 100755 --- a/07_check_files.sh +++ b/07_check_files.sh @@ -2,7 +2,7 @@ result=[] i=0 j=0 -for filename in /dev/shm/VirusShare_*; do +for filename in /home/ubuntu/storage/malware/VirusShare_*; do result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py $filename)" python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename" if [ $result == "1" ] diff --git a/08_check_files2.sh b/08_check_files2.sh index 85b2129..46a9474 100755 --- a/08_check_files2.sh +++ b/08_check_files2.sh @@ -2,7 +2,7 @@ result=[] i=0 j=0 -for filename in /home/ubuntu/removeme_exefiles/*.exe; do +for filename in /home/ubuntu/storage/malware/*.exe; do echo "$filename" result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py $filename)" python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename"