import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import (RandomForestRegressor, RandomForestClassifier,
                              GradientBoostingClassifier, AdaBoostClassifier)
from sklearn.naive_bayes import GaussianNB

# - For feature selection -
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

print("Loading dataset in memory [ ... ]")
files = pd.read_csv('dataset/dataset_clean.txt', delimiter=',', low_memory=False)
print("Loading dataset in memory [ DONE ]")

# =-=-=-=-=-=-=-= Data Preparation Work =-=-=-=-=-=-=
# - Drop the identifier columns, they carry no predictive information -
files = files.drop("ID", axis=1)
files = files.drop("md5", axis=1)

# ==== Split the dataset again, stratified on the 'legitimate' label ====
# - Temporary category column for stratification (kept commented out: 'legitimate' is used directly) -
#files["legitimate_cat"] = pd.cut(files["legitimate"], bins=[0, 1], labels=[1, 2])

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(files, files["legitimate"]):
    strat_train_set = files.loc[train_index]
    strat_test_set = files.loc[test_index]

# - Remove the temporary category column: the data is split, it is no longer needed -
#for set_ in (strat_train_set, strat_test_set):
#    set_.drop("legitimate_cat", axis=1, inplace=True)

files = strat_train_set.copy()

# ==== Split Features and Labels ====
print("Splitting dataset Features and Labels [ ... ]")
#files_without_labels = files_without_useless_features.drop("legitimate", axis=1)
files_without_labels = files.drop("legitimate", axis=1)
files_labels = files["legitimate"].copy()
print("Splitting dataset Features and Labels [ DONE ]")

# - (Unused) standalone imputer: the pipeline below defines its own -
#imputer = SimpleImputer(strategy="median")

# - Manual feature selection transformer: keeps only the 12 selected columns -
class ManualFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # -- The columns at these indices are: ['Characteristics', 'DllCharacteristics',
        #    'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem',
        #    'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize',
        #    'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy'] --
        return X.values[:, [1, 24, 34, 18, 0, 23, 48, 47, 53, 14, 46, 32]]
        # - If ID and md5 were NOT dropped right after the dataset import, use this return instead -
        #return X.values[:, [3, 26, 36, 20, 2, 25, 50, 49, 55, 16, 48, 34]]

# - Create the pipeline: manual feature selection, then median imputation of missing values -
pipeline = Pipeline([
    ('features_remap', ManualFeatureSelector()),
    ('imputer', SimpleImputer(strategy="median")),
])
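# - Sanity check (added sketch, not in the original script): confirm that the hard-coded
#   column indices used by ManualFeatureSelector map to the intended feature names in
#   this dataset layout. The variable name 'selected_indices' is new. -
selected_indices = [1, 24, 34, 18, 0, 23, 48, 47, 53, 14, 46, 32]
print("Selected feature columns:", [files_without_labels.columns[i] for i in selected_indices])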
]") features_columns_name = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMax Entropy', 'ResourcesMinEntropy', 'VersionInformationSize', 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy'] files_prepared = pipeline.fit_transform(files_without_labels) files_prepared = pd.DataFrame(files_prepared, columns=features_columns_name) print("Dataset passing through the pipeline [ DONE ]") print("Describe of 'files_prepared': ") print(files_prepared.describe()) # =-=-=-=-=-=-= Select the algo Model =-=-=-=-=-=-= algos = { "DecisionTree": DecisionTreeClassifier(max_depth=10), "RandomForest": RandomForestClassifier(n_estimators=50, n_jobs=-1), "GradientBoosting": GradientBoostingClassifier(n_estimators=50), "AdaBoost": AdaBoostClassifier(n_estimators=100), "GNB": GaussianNB() } X_train = files_prepared y_train = files_labels X_test = pipeline.fit_transform(strat_test_set.drop("legitimate", axis=1)) y_test = strat_test_set['legitimate'].copy() results = {} print("Testing 5 algo [ ... ]") for algo in algos: cur_algo = algos[algo] cur_algo.fit(X_train, y_train) #score = cur_algo.score(X_test, y_test) score = cur_algo.score(X_test, y_test) print("%s : %f %%" % (algo, score*100)) results[algo] = score winner = max(results, key=results.get) print('Winner algorithm is %s with a %f %% success' % (winner, results[winner]*100)) # =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-= #print("Saving the model [ ... ]") #joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_5.pkl") #print("Saving the model [ DONE ]")