After Cleaning
This commit is contained in:
parent
ae3eaab3cd
commit
d1f3068c04
@ -33,13 +33,3 @@ print("Generating the correlation matrix [ DONE ]")
|
||||
print("Correlation matrix:")
|
||||
print(corr_matrix['legitimate'].sort_values(ascending=False))
|
||||
|
||||
# =-=-=-=-=-=-=-= Data Analysis =-=-=-=-=-=-=
|
||||
# ==== Split dataset (train/test) ====
|
||||
|
||||
print("Splitting dataset (train/test) [idistribution -> 20%] [ ... ]")
|
||||
train_set, test_set = train_test_split(files, test_size=0.2, random_state=42)
|
||||
print("Splitting dataset (train/test) [idistribution -> 20%] [ DONE ]")
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@ -26,17 +26,12 @@ print("Loading dataset in memory [ DONE ]")
|
||||
# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=
|
||||
# ==== Split the dataset again, but let's stratify with the Machine feature ====
|
||||
# - Add a temporary category so the data can be stratified while splitting it
|
||||
files["Machine_cat"] = pd.cut(files["Machine"], bins=[0., 30000., np.inf], labels=[1,2])
|
||||
|
||||
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
|
||||
for train_index, test_index in split.split(files, files["Machine_cat"]):
|
||||
for train_index, test_index in split.split(files, files["legitimate"]):
|
||||
strat_train_set = files.loc[train_index]
|
||||
strat_test_set = files.loc[test_index]
|
||||
|
||||
# - Remove the temporary category; now that the data is split, we don't need it anymore
|
||||
for set_ in (strat_train_set, strat_test_set):
|
||||
set_.drop("Machine_cat", axis=1, inplace=True)
|
||||
|
||||
files = strat_train_set.copy()
|
||||
# ==== Drop useless features ====
|
||||
|
||||
@ -112,5 +107,3 @@ for f in range(nb_features):
|
||||
files_shorted_features = files_prepared[features_to_keep]
|
||||
#print(files_shorted_features.describe())
|
||||
|
||||
|
||||
|
||||
|
||||
@ -117,10 +117,10 @@ for algo in algos:
|
||||
results[algo] = score
|
||||
|
||||
winner = max(results, key=results.get)
|
||||
print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]*100))
|
||||
print('Winner algorithm is %s with a %f %% success' % (winner, results[winner]*100))
|
||||
|
||||
# =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-=
|
||||
print("Saving the model [ ... ]")
|
||||
joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_5.pkl")
|
||||
print("Saving the model [ DONE ]")
|
||||
#print("Saving the model [ ... ]")
|
||||
#joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_5.pkl")
|
||||
#print("Saving the model [ DONE ]")
|
||||
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
result=[]
|
||||
i=0
|
||||
j=0
|
||||
for filename in /dev/shm/VirusShare_*; do
|
||||
for filename in /home/ubuntu/storage/malware/VirusShare_*; do
|
||||
result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py $filename)"
|
||||
python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename"
|
||||
if [ $result == "1" ]
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
result=[]
|
||||
i=0
|
||||
j=0
|
||||
for filename in /home/ubuntu/removeme_exefiles/*.exe; do
|
||||
for filename in /home/ubuntu/storage/malware/*.exe; do
|
||||
echo "$filename"
|
||||
result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py $filename)"
|
||||
python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename"
|
||||
|
||||
Loading…
Reference in New Issue
Block a user