After Cleaning

This commit is contained in:
valentin 2020-04-06 18:57:10 +02:00
parent ae3eaab3cd
commit d1f3068c04
5 changed files with 7 additions and 24 deletions

View File

@@ -33,13 +33,3 @@ print("Generating the correlation matrix [ DONE ]")
print("Correlation matrix:") print("Correlation matrix:")
print(corr_matrix['legitimate'].sort_values(ascending=False)) print(corr_matrix['legitimate'].sort_values(ascending=False))
# =-=-=-=-=-=-=-= Data Analysis =-=-=-=-=-=-=
# ==== Split dataset (train/test) ====
print("Splitting dataset (train/test) [idistribution -> 20%] [ ... ]")
train_set, test_set = train_test_split(files, test_size=0.2, random_state=42)
print("Splitting dataset (train/test) [idistribution -> 20%] [ DONE ]")

View File

@@ -26,17 +26,12 @@ print("Loading dataset in memory [ DONE ]")
# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-= # =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=
# ==== Split DataSet again but lets stratify with Machine feature ==== # ==== Split DataSet again but lets stratify with Machine feature ====
# - Add tmp cat in order to be able to stratify data while splitting it # - Add tmp cat in order to be able to stratify data while splitting it
files["Machine_cat"] = pd.cut(files["Machine"], bins=[0., 30000., np.inf], labels=[1,2])
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42) split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
for train_index, test_index in split.split(files, files["Machine_cat"]): for train_index, test_index in split.split(files, files["legitimate"]):
strat_train_set = files.loc[train_index] strat_train_set = files.loc[train_index]
strat_test_set = files.loc[test_index] strat_test_set = files.loc[test_index]
# - Remove tmp created cat, now the data is splitted, we dont need it anymore
for set_ in (strat_train_set, strat_test_set):
set_.drop("Machine_cat", axis=1, inplace=True)
files = strat_train_set.copy() files = strat_train_set.copy()
# ==== Drop useless features ==== # ==== Drop useless features ====
@@ -112,5 +107,3 @@ for f in range(nb_features):
files_shorted_features = files_prepared[features_to_keep] files_shorted_features = files_prepared[features_to_keep]
#print(files_shorted_features.describe()) #print(files_shorted_features.describe())

View File

@@ -117,10 +117,10 @@ for algo in algos:
results[algo] = score results[algo] = score
winner = max(results, key=results.get) winner = max(results, key=results.get)
print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]*100)) print('Winner algorithm is %s with a %f %% success' % (winner, results[winner]*100))
# =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-= # =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-=
print("Saving the model [ ... ]") #print("Saving the model [ ... ]")
joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_5.pkl") #joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_5.pkl")
print("Saving the model [ DONE ]") #print("Saving the model [ DONE ]")

View File

@@ -2,7 +2,7 @@
result=[] result=[]
i=0 i=0
j=0 j=0
for filename in /dev/shm/VirusShare_*; do for filename in /home/ubuntu/storage/malware/VirusShare_*; do
result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py $filename)" result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py $filename)"
python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename" python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename"
if [ $result == "1" ] if [ $result == "1" ]

View File

@@ -2,7 +2,7 @@
result=[] result=[]
i=0 i=0
j=0 j=0
for filename in /home/ubuntu/removeme_exefiles/*.exe; do for filename in /home/ubuntu/storage/malware/*.exe; do
echo "$filename" echo "$filename"
result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py $filename)" result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py $filename)"
python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename" python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename"