Initial commit Bro ;)
This commit is contained in:
commit 72478eccb9
45
01_data_analysis.py
Normal file
@@ -0,0 +1,45 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

# - For feature selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

print("Loading dataset in memory [ ... ]")
files = pd.read_csv('dataset_clean.txt', delimiter=',', low_memory=False)
print("Loading dataset in memory [ DONE ]")

print("Dataset basic info:")
#print(files.describe())
#print(files.info())

print("Generating the correlation matrix [ ... ]")
# - numeric_only skips non-numeric columns such as 'md5'
corr_matrix = files.corr(numeric_only=True)
print("Generating the correlation matrix [ DONE ]")
print("Correlation matrix:")
print(corr_matrix['legitimate'].sort_values(ascending=False))

# =-=-=-=-=-=-=-= Data Analysis =-=-=-=-=-=-=
# ==== Split dataset (train/test) ====

print("Splitting dataset (train/test) [distribution -> 20%] [ ... ]")
train_set, test_set = train_test_split(files, test_size=0.2, random_state=42)
print("Splitting dataset (train/test) [distribution -> 20%] [ DONE ]")
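Side note: matplotlib is imported in 01_data_analysis.py but never used; a horizontal bar chart of the correlations with 'legitimate' would make the printed ranking easier to read. A minimal sketch, assuming corr_matrix from the script above is in scope (the output filename is only an example):

# Plot each feature's correlation with the 'legitimate' label, sorted as in
# the print() call above; self-correlation is dropped first.
legit_corr = corr_matrix['legitimate'].drop('legitimate').sort_values()
ax = legit_corr.plot.barh(figsize=(8, 12))
ax.set_xlabel("Correlation with 'legitimate'")
ax.set_title("Feature correlation with the target")
plt.tight_layout()
plt.savefig("correlation_with_legitimate.png")  # illustrative filename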
116
02_draft_preparation_work.py
Normal file
@@ -0,0 +1,116 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

# - For feature selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

print("Loading dataset in memory [ ... ]")
files = pd.read_csv('dataset_clean.txt', delimiter=',', low_memory=False)
print("Loading dataset in memory [ DONE ]")

# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=
# ==== Split the dataset again, but stratify on the Machine feature ====
# - Add a temporary category so the data can be stratified while splitting
files["Machine_cat"] = pd.cut(files["Machine"], bins=[0., 30000., np.inf], labels=[1, 2])

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(files, files["Machine_cat"]):
    strat_train_set = files.loc[train_index]
    strat_test_set = files.loc[test_index]

# - Remove the temporary category; now that the data is split, we don't need it anymore
for set_ in (strat_train_set, strat_test_set):
    set_.drop("Machine_cat", axis=1, inplace=True)

files = strat_train_set.copy()

# ==== Drop useless features ====
files_without_useless_features = files.drop(['ID', 'md5'], axis=1)

# ==== Split features and labels ====
print("Splitting dataset Features and Labels [ ... ]")
files_without_labels = files_without_useless_features.drop("legitimate", axis=1)
files_labels = files["legitimate"].copy()
print("Splitting dataset Features and Labels [ DONE ]")

# ==== Replace null values with the median ====
print("Replacing null values with Median [ ... ]")
imputer = SimpleImputer(strategy="median")
# - No need to drop categorical features: this dataset has none
# - Fit the imputer on the data
print("Imputer is learning medians [ ... ]")
imputer.fit(files_without_labels)
print("Imputer is learning medians [ DONE ]")
# - Apply the learned medians to the null values
print("Imputer is transforming DataSet [ ... ]")
files_without_labels_X_tr = imputer.transform(files_without_labels)
print("Imputer is transforming DataSet [ DONE ]")
# - Turn the NumPy array back into a pandas DataFrame
files_without_labels_tr = pd.DataFrame(files_without_labels_X_tr, columns=files_without_labels.columns)
print("Replacing null values with Median [ DONE ]")

# - Create the pipeline -
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# - Prepare the dataset by passing it through the pipeline -
print("Dataset passing through the pipeline [ ... ]")
files_prepared = pipeline.fit_transform(files_without_labels)
files_prepared = pd.DataFrame(files_prepared, columns=files_without_labels.columns)
print("Dataset passing through the pipeline [ DONE ]")
print("Describe of 'files_prepared': ")
print(files_prepared.describe())

# =-=-=-=-=-=-= Features selection =-=-=-=-=-=-=
print("Extracting most correlated features [ ... ]")
print(" -> Line 1 [ ... ]")
# - n_jobs=-1 uses all CPU threads -
f_select = ExtraTreesClassifier(n_jobs=-1).fit(files_prepared, files_labels)

print(" -> Line 2 [ ... ]")
model = SelectFromModel(f_select, prefit=True)

print(" -> Line 3 [ ... ]")
files_features_short = model.transform(files_prepared)
print("Extracting most correlated features [ DONE ]")

print("Features nb before filter: ", files_prepared.shape[1])
print("Features nb after filter: ", files_features_short.shape[1])

print(" --- Features list after ExtraTreesClassifier job --- ")
nb_features = files_features_short.shape[1]
indices = np.argsort(f_select.feature_importances_)[::-1][:nb_features]

features_to_keep = []
for f in range(nb_features):
    print("%d. feature %s (%f)" % (f + 1, files_prepared.columns[indices[f]], f_select.feature_importances_[indices[f]]))
    # - keep a list of the features to keep
    features_to_keep.append(files_prepared.columns[indices[f]])
# - build a DataFrame based on the original dataset with the selected features only -
files_shorted_features = files_prepared[features_to_keep]
#print(files_shorted_features.describe())
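The index list [3, 26, 36, ...] hard-coded in 03_the_ml.py and 04_detect_from_oneline_csv.py presumably comes from this selection step. A small sketch of how those positions could be derived from features_to_keep instead of being typed by hand, assuming it runs at the end of 02_draft_preparation_work.py while files and features_to_keep are still in scope:

# Map the selected feature names back to integer column positions in the
# frame that the later scripts slice (labels dropped, 'ID' and 'md5' kept).
full_columns = list(files.drop("legitimate", axis=1).columns)
selected_indices = [full_columns.index(name) for name in features_to_keep]
print("Indices to hard-code in ManualFeatureSelector:", selected_indices)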
120
03_the_ml.py
Normal file
@@ -0,0 +1,120 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import joblib

# - For feature selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

print("Loading dataset in memory [ ... ]")
files = pd.read_csv('dataset_clean.txt', delimiter=',', low_memory=False)
print("Loading dataset in memory [ DONE ]")

# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=
# ==== Split the dataset again, but stratify on the Machine feature ====
# - Add a temporary category so the data can be stratified while splitting
files["Machine_cat"] = pd.cut(files["Machine"], bins=[0., 30000., np.inf], labels=[1, 2])

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(files, files["Machine_cat"]):
    strat_train_set = files.loc[train_index]
    strat_test_set = files.loc[test_index]

# - Remove the temporary category; now that the data is split, we don't need it anymore
for set_ in (strat_train_set, strat_test_set):
    set_.drop("Machine_cat", axis=1, inplace=True)

files = strat_train_set.copy()

# ==== Split features and labels ====
print("Splitting dataset Features and Labels [ ... ]")
#files_without_labels = files_without_useless_features.drop("legitimate", axis=1)
files_without_labels = files.drop("legitimate", axis=1)
files_labels = files["legitimate"].copy()
print("Splitting dataset Features and Labels [ DONE ]")

imputer = SimpleImputer(strategy="median")

# - Manual feature selector transformer -
class ManualFeatureSelector(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # -- the columns behind these indices are -> ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy'] --
        return X.values[:, [3, 26, 36, 20, 2, 25, 50, 49, 55, 16, 48, 34]]

# - Create the pipeline -
pipeline = Pipeline([
    ('features_remap', ManualFeatureSelector()),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# - Prepare the dataset by passing it through the pipeline -
print("Dataset passing through the pipeline [ ... ]")
features_columns_name = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy']
files_prepared = pipeline.fit_transform(files_without_labels)
files_prepared = pd.DataFrame(files_prepared, columns=features_columns_name)
print("Dataset passing through the pipeline [ DONE ]")
print("Describe of 'files_prepared': ")
print(files_prepared.describe())

# =-=-=-=-=-=-= Select the algo Model =-=-=-=-=-=-=
algos = {
    "DecisionTree": DecisionTreeClassifier(max_depth=10),
    "RandomForest": RandomForestClassifier(n_estimators=50, n_jobs=-1),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=50),
    "AdaBoost": AdaBoostClassifier(n_estimators=100),
    "GNB": GaussianNB()
}

X_train = files_prepared
y_train = files_labels

# - transform (not fit_transform) the test set, so the imputer/scaler keep the statistics learned on the training set
X_test = pipeline.transform(strat_test_set.drop("legitimate", axis=1))
y_test = strat_test_set['legitimate'].copy()

results = {}
print("Testing 5 algorithms [ ... ]")
for algo in algos:
    cur_algo = algos[algo]
    cur_algo.fit(X_train, y_train)
    score = cur_algo.score(X_test, y_test)
    print("%s : %f %%" % (algo, score * 100))
    results[algo] = score

winner = max(results, key=results.get)
print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner] * 100))

# =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-=
print("Saving the model [ ... ]")
joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_1.pkl")
print("Saving the model [ DONE ]")
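Only the winning classifier is dumped here; the fitted pipeline (feature remap, imputer medians, scaler statistics) has to be rebuilt by any script that wants to predict. A hedged sketch of persisting the pipeline next to the model; the pipeline filename is illustrative, and loading it elsewhere requires ManualFeatureSelector to be importable there:

# Persist the fitted preprocessing pipeline alongside the classifier so an
# inference script can reuse the exact medians / scaling learned on the
# training set (the filename below is an example, not part of the project).
pipeline_path = "/home/ubuntu/bigData/projet_big_data/models/preprocessing_pipeline_1.pkl"
joblib.dump(pipeline, pipeline_path)

# Later, in another script (ManualFeatureSelector must be importable there):
# prep = joblib.load(pipeline_path)
# clf = joblib.load("/home/ubuntu/bigData/projet_big_data/models/malware_classifier_1.pkl")
# y_pred = clf.predict(prep.transform(new_samples))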
120
04_detect_from_oneline_csv.py
Normal file
@@ -0,0 +1,120 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import joblib

# - For feature selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

print("Loading dataset in memory [ ... ]")
files = pd.read_csv('dataset_clean.txt', delimiter=',', low_memory=False)
print("Loading dataset in memory [ DONE ]")

# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=
# ==== Split the dataset again, but stratify on the Machine feature ====
# - Add a temporary category so the data can be stratified while splitting
files["Machine_cat"] = pd.cut(files["Machine"], bins=[0., 30000., np.inf], labels=[1, 2])

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(files, files["Machine_cat"]):
    strat_train_set = files.loc[train_index]
    strat_test_set = files.loc[test_index]

# - Remove the temporary category; now that the data is split, we don't need it anymore
for set_ in (strat_train_set, strat_test_set):
    set_.drop("Machine_cat", axis=1, inplace=True)

files = strat_train_set.copy()

# ==== Split features and labels ====
print("Splitting dataset Features and Labels [ ... ]")
#files_without_labels = files_without_useless_features.drop("legitimate", axis=1)
files_without_labels = files.drop("legitimate", axis=1)
files_labels = files["legitimate"].copy()
print("Splitting dataset Features and Labels [ DONE ]")

imputer = SimpleImputer(strategy="median")

# - Manual feature selector transformer -
class ManualFeatureSelector(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # -- the columns behind these indices are -> ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy'] --
        return X.values[:, [3, 26, 36, 20, 2, 25, 50, 49, 55, 16, 48, 34]]

# - Create the pipeline -
pipeline = Pipeline([
    ('features_remap', ManualFeatureSelector()),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# - Prepare the dataset by passing it through the pipeline -
print("Dataset passing through the pipeline [ ... ]")
features_columns_name = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy']
files_prepared = pipeline.fit_transform(files_without_labels)
files_prepared = pd.DataFrame(files_prepared, columns=features_columns_name)
print("Dataset passing through the pipeline [ DONE ]")
print("Describe of 'files_prepared': ")
print(files_prepared.describe())

# =-=-=-=-=-=-= Select the algo Model =-=-=-=-=-=-=
algos = {
    "DecisionTree": DecisionTreeClassifier(max_depth=10),
    "RandomForest": RandomForestClassifier(n_estimators=50, n_jobs=-1),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=50),
    "AdaBoost": AdaBoostClassifier(n_estimators=100),
    "GNB": GaussianNB()
}

X_train = files_prepared
y_train = files_labels

# - transform (not fit_transform) the test set, so the imputer/scaler keep the statistics learned on the training set
X_test = pipeline.transform(strat_test_set.drop("legitimate", axis=1))
y_test = strat_test_set['legitimate'].copy()

results = {}
print("Testing 5 algorithms [ ... ]")
for algo in algos:
    cur_algo = algos[algo]
    cur_algo.fit(X_train, y_train)
    score = cur_algo.score(X_test, y_test)
    print("%s : %f %%" % (algo, score * 100))
    results[algo] = score

winner = max(results, key=results.get)
print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner] * 100))

# =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-=
print("Saving the model [ ... ]")
joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_1.pkl")
print("Saving the model [ DONE ]")
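As committed, 04_detect_from_oneline_csv.py repeats the training and evaluation flow of 03_the_ml.py and never reads dataset/one_malware.csv. A hedged sketch of what the single-sample check might look like at the end of the script, assuming one_malware.csv follows the same column order as dataset_clean.txt but without a header row, and that legitimate == 1 marks a benign file:

# Hypothetical single-sample detection step (paths relative to the repo root).
sample = pd.read_csv('dataset/one_malware.csv', header=None, names=files.columns)
sample_features = sample.drop("legitimate", axis=1)

clf = joblib.load("/home/ubuntu/bigData/projet_big_data/models/malware_classifier_1.pkl")
prediction = clf.predict(pipeline.transform(sample_features))
print("legitimate" if prediction[0] == 1 else "malware")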
216353
dataset/bkp_dataset.txt
Normal file
File diff suppressed because it is too large
216353
dataset/dataset.txt
Normal file
File diff suppressed because it is too large
216353
dataset/dataset_clean.txt
Normal file
File diff suppressed because it is too large
1
dataset/one_malware.csv
Normal file
@@ -0,0 +1 @@
4,156a0bb069f94d1e7c2508318805f2a4,332,224,8450,10,0,108544,15872,0,105021,4096,114688,268435456,4096,512,6,1,8,0,6,1,143360,1024,165754,3,320,1048576,4096,1048576,4096,0,16,5,3.40483134511,0.160328725899,6.66271801901,24883.2,512,108544,25645.4,85,108180,12,66,0,105,2,3.27055919863,3.03418784123,3.50693055603,1032,972,1092,72,0
BIN
models/malware_classifier_1.pkl
Normal file
Binary file not shown.