After Cleaning

This commit is contained in:
valentin 2020-04-06 18:57:10 +02:00
parent ae3eaab3cd
commit d1f3068c04
5 changed files with 7 additions and 24 deletions

View File

@ -33,13 +33,3 @@ print("Generating the correlation matrix [ DONE ]")
print("Correlation matrix:")
print(corr_matrix['legitimate'].sort_values(ascending=False))
# =-=-=-=-=-=-=-= Data Analysis =-=-=-=-=-=-=
# ==== Split dataset (train/test) ====
print("Splitting dataset (train/test) [idistribution -> 20%] [ ... ]")
train_set, test_set = train_test_split(files, test_size=0.2, random_state=42)
print("Splitting dataset (train/test) [idistribution -> 20%] [ DONE ]")

View File

@ -26,17 +26,12 @@ print("Loading dataset in memory [ DONE ]")
# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=
# ==== Split the dataset again, but let's stratify with the Machine feature ====
# - Add a temporary category column so the data can be stratified while splitting it
files["Machine_cat"] = pd.cut(files["Machine"], bins=[0., 30000., np.inf], labels=[1,2])
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
for train_index, test_index in split.split(files, files["Machine_cat"]):
for train_index, test_index in split.split(files, files["legitimate"]):
strat_train_set = files.loc[train_index]
strat_test_set = files.loc[test_index]
# - Remove the temporary category column: now that the data is split, we don't need it anymore
for set_ in (strat_train_set, strat_test_set):
set_.drop("Machine_cat", axis=1, inplace=True)
files = strat_train_set.copy()
# ==== Drop useless features ====
@ -112,5 +107,3 @@ for f in range(nb_features):
files_shorted_features = files_prepared[features_to_keep]
#print(files_shorted_features.describe())

View File

@ -117,10 +117,10 @@ for algo in algos:
results[algo] = score
winner = max(results, key=results.get)
print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]*100))
print('Winner algorithm is %s with a %f %% success' % (winner, results[winner]*100))
# =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-=
print("Saving the model [ ... ]")
joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_5.pkl")
print("Saving the model [ DONE ]")
#print("Saving the model [ ... ]")
#joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_5.pkl")
#print("Saving the model [ DONE ]")

View File

@ -2,7 +2,7 @@
result=[]
i=0
j=0
for filename in /dev/shm/VirusShare_*; do
for filename in /home/ubuntu/storage/malware/VirusShare_*; do
result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py $filename)"
python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename"
if [ $result == "1" ]

View File

@ -2,7 +2,7 @@
result=[]
i=0
j=0
for filename in /home/ubuntu/removeme_exefiles/*.exe; do
for filename in /home/ubuntu/storage/malware/*.exe; do
echo "$filename"
result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py $filename)"
python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename"