Initial commit Bro ;)

This commit is contained in:
valentin 2020-04-04 12:57:05 +02:00
commit 72478eccb9
9 changed files with 649461 additions and 0 deletions

45
01_data_analysis.py Normal file

@@ -0,0 +1,45 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
# - For feature selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
print("Loading dataset in memory [ ... ]")
files = pd.read_csv('dataset_clean.txt', delimiter=',', low_memory=False)
print("Loading dataset in memory [ DONE ]")
print("Dataset basic info:")
#print(files.describe())
#print(files.info())
print("Generating the correlation matrix [ ... ]")
corr_matrix = files.corr()
print("Generating the correlation matrix [ DONE ]")
print("Correlation matrix:")
print(corr_matrix['legitimate'].sort_values(ascending=False))
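# - Optional sketch: plot the correlations with 'legitimate' using the matplotlib
# - import above (assumes an interactive backend is available to show the figure)
corr_with_label = corr_matrix['legitimate'].drop('legitimate').sort_values()
corr_with_label.plot(kind='barh', figsize=(8, 14))
plt.title("Feature correlation with 'legitimate'")
plt.tight_layout()
plt.show()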
# =-=-=-=-=-=-=-= Data Analysis =-=-=-=-=-=-=
# ==== Split dataset (train/test) ====
print("Splitting dataset (train/test) [idistribution -> 20%] [ ... ]")
train_set, test_set = train_test_split(files, test_size=0.2, random_state=42)
print("Splitting dataset (train/test) [idistribution -> 20%] [ DONE ]")


@@ -0,0 +1,116 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
# - For feature selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
print("Loading dataset in memory [ ... ]")
files = pd.read_csv('dataset_clean.txt',delimiter=',', low_memory=False)
print("Loading dataset in memory [ DONE ]")
# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=
# ==== Split DataSet again, but let's stratify on the Machine feature ====
# - Add a temporary category so the data can be stratified while splitting it
files["Machine_cat"] = pd.cut(files["Machine"], bins=[0., 30000., np.inf], labels=[1, 2])
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(files, files["Machine_cat"]):
    strat_train_set = files.loc[train_index]
    strat_test_set = files.loc[test_index]
# - Remove the temporary category: now that the data is split, we don't need it anymore
for set_ in (strat_train_set, strat_test_set):
    set_.drop("Machine_cat", axis=1, inplace=True)
files = strat_train_set.copy()
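# - Optional sketch: sanity-check that the split preserved the Machine distribution
# - (re-derives the category with the same bins, since Machine_cat was dropped above)
for name, subset in (("train", strat_train_set), ("test", strat_test_set)):
    subset_cat = pd.cut(subset["Machine"], bins=[0., 30000., np.inf], labels=[1, 2])
    print("Machine_cat proportions (%s):" % name)
    print(subset_cat.value_counts(normalize=True))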
# ==== Drop useless features ====
files_without_useless_features = files.drop(['ID', 'md5'], axis=1)
# ==== Split Features and Labels ====
print("Splitting dataset Features and Labels [ ... ]")
files_without_labels = files_without_useless_features.drop("legitimate", axis=1)
files_labels = files["legitimate"].copy()
print("Splitting dataset Features and Labels [ DONE ]")
# ==== Replace Null values with median ====
print("Replacing null values with Median [ ... ]")
imputer = SimpleImputer(strategy="median")
# - No need to drop categorical features, we don't have any in this dataset
# - Fit the imputer with the data
print("Imputer is learning medians [ ... ]")
imputer.fit(files_without_labels)
print("Imputer is learning medians [ DONE ]")
# - Apply median on null values
print("Imputer is transforming DataSet [ ... ]")
files_without_labels_X_tr = imputer.transform(files_without_labels)
print("Imputer is transforming DataSet [ DONE ]")
# - Transform NumPy array into PD DataFrame
files_without_labels_tr = pd.DataFrame(files_without_labels_X_tr, columns=files_without_labels.columns)
print("Replacing null values with Median [ DONE ]")
# - Create the pipeline -
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])
# - Prepare dataset, pass it through the pipeline -
print("Dataset passing through the pipeline [ ... ]")
files_prepared = pipeline.fit_transform(files_without_labels)
files_prepared = pd.DataFrame(files_prepared, columns=files_without_labels.columns)
print("Dataset passing through the pipeline [ DONE ]")
print("Describe of 'files_prepared': ")
print(files_prepared.describe())
# =-=-=-=-=-=-= Feature selection =-=-=-=-=-=-=
print("Extracting the most important features [ ... ]")
print(" -> Line 1 [ ... ]")
# - n_jobs=-1 lets the classifier use every CPU thread -
f_select = ExtraTreesClassifier(n_jobs=-1).fit(files_prepared, files_labels)
print(" -> Line 2 [ ... ]")
model = SelectFromModel(f_select, prefit=True)
print(" -> Line 3 [ ... ]")
files_features_short = model.transform(files_prepared)
print("Extracting most correlated features [ DONE ]")
print("Features nb before filter: ", files_prepared.shape[1])
print("Features nb after filter: ", files_features_short.shape[1])
print(" --- Features list after ExtraTreesClassifier job --- ")
nb_features = files_features_short.shape[1]
indices = np.argsort(f_select.feature_importances_)[::-1][:nb_features]
features_to_keep = []
for f in range(nb_features):
    print("%d. feature %s (%f)" % (f + 1, files_prepared.columns[indices[f]], f_select.feature_importances_[indices[f]]))
    # - keep a list of the features to retain
    features_to_keep.append(files_prepared.columns[indices[f]])
# - build DataFrame based on original dataset with selected features only -
files_shorted_features = files_prepared[features_to_keep]
#print(files_shorted_features.describe())
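# - Optional sketch: persist the selected feature names so a later script could reuse
# - them instead of hardcoding column indices (the file name here is arbitrary)
with open("selected_features.txt", "w") as out_file:
    out_file.write(",".join(features_to_keep))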

120
03_the_ml.py Normal file

@@ -0,0 +1,120 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import joblib
# - For feature selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
print("Loading dataset in memory [ ... ]")
files = pd.read_csv('dataset_clean.txt',delimiter=',', low_memory=False)
print("Loading dataset in memory [ DONE ]")
# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=
# ==== Split DataSet again, but let's stratify on the Machine feature ====
# - Add a temporary category so the data can be stratified while splitting it
files["Machine_cat"] = pd.cut(files["Machine"], bins=[0., 30000., np.inf], labels=[1, 2])
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(files, files["Machine_cat"]):
    strat_train_set = files.loc[train_index]
    strat_test_set = files.loc[test_index]
# - Remove the temporary category: now that the data is split, we don't need it anymore
for set_ in (strat_train_set, strat_test_set):
    set_.drop("Machine_cat", axis=1, inplace=True)
files = strat_train_set.copy()
# ==== Split Features and Labels ====
print("Splitting dataset Features and Labels [ ... ]")
#files_without_labels = files_without_useless_features.drop("legitimate", axis=1)
files_without_labels = files.drop("legitimate", axis=1)
files_labels = files["legitimate"].copy()
print("Splitting dataset Features and Labels [ DONE ]")
imputer = SimpleImputer(strategy="median")
# - Manual feature transformer -
class ManualFeatureSelector(TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        # -- the columns behind these indices are: 'Characteristics', 'DllCharacteristics',
        # -- 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem',
        # -- 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize',
        # -- 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy' --
        return X.values[:, [3, 26, 36, 20, 2, 25, 50, 49, 55, 16, 48, 34]]
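# - Optional sketch: the same indices could be looked up by name rather than hardcoded,
# - assuming the columns listed in the comment above exist in dataset_clean.txt
selected_names = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy',
                  'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy',
                  'ResourcesMinEntropy', 'VersionInformationSize',
                  'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy']
selected_indices = [files_without_labels.columns.get_loc(name) for name in selected_names]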
# - Create the pipeline -
pipeline = Pipeline([
    ('features_remap', ManualFeatureSelector()),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])
# - Prepare dataset, pass it through the pipeline -
print("Dataset passing through the pipeline [ ... ]")
features_columns_name = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy',
                         'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy',
                         'ResourcesMinEntropy', 'VersionInformationSize',
                         'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy']
files_prepared = pipeline.fit_transform(files_without_labels)
files_prepared = pd.DataFrame(files_prepared, columns=features_columns_name)
print("Dataset passing through the pipeline [ DONE ]")
print("Describe of 'files_prepared': ")
print(files_prepared.describe())
# =-=-=-=-=-=-= Select the algo Model =-=-=-=-=-=-=
algos = {
    "DecisionTree": DecisionTreeClassifier(max_depth=10),
    "RandomForest": RandomForestClassifier(n_estimators=50, n_jobs=-1),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=50),
    "AdaBoost": AdaBoostClassifier(n_estimators=100),
    "GNB": GaussianNB()
}
X_train = files_prepared
y_train = files_labels
# - Only transform the test set: the pipeline was already fitted on the training data above
X_test = pipeline.transform(strat_test_set.drop("legitimate", axis=1))
y_test = strat_test_set['legitimate'].copy()
results = {}
print("Testing 5 algo [ ... ]")
for algo in algos:
cur_algo = algos[algo]
cur_algo.fit(X_train, y_train)
#score = cur_algo.score(X_test, y_test)
score = cur_algo.score(X_test, y_test)
print("%s : %f %%" % (algo, score*100))
results[algo] = score
winner = max(results, key=results.get)
print('\nWinning algorithm is %s with %f %% accuracy on the test set' % (winner, results[winner] * 100))
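# - Optional sketch: cross-validate the winner on the training data with the
# - cross_val_score import above (cv=5 is an arbitrary choice)
cv_scores = cross_val_score(algos[winner], X_train, y_train, cv=5, scoring="accuracy")
print("Cross-validated accuracy of %s: %f %% (+/- %f)" % (winner, cv_scores.mean() * 100, cv_scores.std() * 100))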
# =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-=
print("Saving the model [ ... ]")
joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_1.pkl")
print("Saving the model [ DONE ]")

216353
dataset/bkp_dataset.txt Normal file

File diff suppressed because it is too large

216353
dataset/dataset.txt Normal file

File diff suppressed because it is too large

216353
dataset/dataset_clean.txt Normal file

File diff suppressed because it is too large

1
dataset/one_malware.csv Normal file

@@ -0,0 +1 @@
4,156a0bb069f94d1e7c2508318805f2a4,332,224,8450,10,0,108544,15872,0,105021,4096,114688,268435456,4096,512,6,1,8,0,6,1,143360,1024,165754,3,320,1048576,4096,1048576,4096,0,16,5,3.40483134511,0.160328725899,6.66271801901,24883.2,512,108544,25645.4,85,108180,12,66,0,105,2,3.27055919863,3.03418784123,3.50693055603,1032,972,1092,72,0

Binary file not shown.