After Cleaning
This commit is contained in:
parent
ae3eaab3cd
commit
d1f3068c04
@ -33,13 +33,3 @@ print("Generating the correlation matrix [ DONE ]")
|
|||||||
print("Correlation matrix:")
|
print("Correlation matrix:")
|
||||||
print(corr_matrix['legitimate'].sort_values(ascending=False))
|
print(corr_matrix['legitimate'].sort_values(ascending=False))
|
||||||
|
|
||||||
# =-=-=-=-=-=-=-= Data Analysis =-=-=-=-=-=-=
|
|
||||||
# ==== Split dataset (train/test) ====
|
|
||||||
|
|
||||||
print("Splitting dataset (train/test) [idistribution -> 20%] [ ... ]")
|
|
||||||
train_set, test_set = train_test_split(files, test_size=0.2, random_state=42)
|
|
||||||
print("Splitting dataset (train/test) [idistribution -> 20%] [ DONE ]")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -26,17 +26,12 @@ print("Loading dataset in memory [ DONE ]")
|
|||||||
# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=
|
# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=
|
||||||
# ==== Split DataSet again but lets stratify with Machine feature ====
|
# ==== Split DataSet again but lets stratify with Machine feature ====
|
||||||
# - Add tmp cat in order to be able to stratify data while splitting it
|
# - Add tmp cat in order to be able to stratify data while splitting it
|
||||||
files["Machine_cat"] = pd.cut(files["Machine"], bins=[0., 30000., np.inf], labels=[1,2])
|
|
||||||
|
|
||||||
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
|
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
|
||||||
for train_index, test_index in split.split(files, files["Machine_cat"]):
|
for train_index, test_index in split.split(files, files["legitimate"]):
|
||||||
strat_train_set = files.loc[train_index]
|
strat_train_set = files.loc[train_index]
|
||||||
strat_test_set = files.loc[test_index]
|
strat_test_set = files.loc[test_index]
|
||||||
|
|
||||||
# - Remove tmp created cat, now the data is splitted, we dont need it anymore
|
|
||||||
for set_ in (strat_train_set, strat_test_set):
|
|
||||||
set_.drop("Machine_cat", axis=1, inplace=True)
|
|
||||||
|
|
||||||
files = strat_train_set.copy()
|
files = strat_train_set.copy()
|
||||||
# ==== Drop useless features ====
|
# ==== Drop useless features ====
|
||||||
|
|
||||||
@ -112,5 +107,3 @@ for f in range(nb_features):
|
|||||||
files_shorted_features = files_prepared[features_to_keep]
|
files_shorted_features = files_prepared[features_to_keep]
|
||||||
#print(files_shorted_features.describe())
|
#print(files_shorted_features.describe())
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -117,10 +117,10 @@ for algo in algos:
|
|||||||
results[algo] = score
|
results[algo] = score
|
||||||
|
|
||||||
winner = max(results, key=results.get)
|
winner = max(results, key=results.get)
|
||||||
print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]*100))
|
print('Winner algorithm is %s with a %f %% success' % (winner, results[winner]*100))
|
||||||
|
|
||||||
# =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-=
|
# =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-=
|
||||||
print("Saving the model [ ... ]")
|
#print("Saving the model [ ... ]")
|
||||||
joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_5.pkl")
|
#joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_5.pkl")
|
||||||
print("Saving the model [ DONE ]")
|
#print("Saving the model [ DONE ]")
|
||||||
|
|
||||||
|
|||||||
@ -2,7 +2,7 @@
|
|||||||
result=[]
|
result=[]
|
||||||
i=0
|
i=0
|
||||||
j=0
|
j=0
|
||||||
for filename in /dev/shm/VirusShare_*; do
|
for filename in /home/ubuntu/storage/malware/VirusShare_*; do
|
||||||
result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py $filename)"
|
result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py $filename)"
|
||||||
python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename"
|
python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename"
|
||||||
if [ $result == "1" ]
|
if [ $result == "1" ]
|
||||||
|
|||||||
@ -2,7 +2,7 @@
|
|||||||
result=[]
|
result=[]
|
||||||
i=0
|
i=0
|
||||||
j=0
|
j=0
|
||||||
for filename in /home/ubuntu/removeme_exefiles/*.exe; do
|
for filename in /home/ubuntu/storage/malware/*.exe; do
|
||||||
echo "$filename"
|
echo "$filename"
|
||||||
result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py $filename)"
|
result="$(python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py $filename)"
|
||||||
python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename"
|
python3.6 /home/ubuntu/bigData/projet_big_data/06_extract_features_and_predict.py "$filename"
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user