import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

# - For feature selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

print("Loading dataset in memory [ ... ]")
files = pd.read_csv('dataset/dataset_clean.txt', delimiter=',', low_memory=False)
print("Loading dataset in memory [ DONE ]")

# =-=-=-=-=-=-=-= Data Preparation Work =-=-=-=-=-=-=

# ==== Split the dataset, stratifying on the "legitimate" label ====
# - StratifiedShuffleSplit keeps the legitimate/malicious ratio identical
#   in the train and test sets
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(files, files["legitimate"]):
    strat_train_set = files.loc[train_index]
    strat_test_set = files.loc[test_index]

files = strat_train_set.copy()

# ==== Drop useless features ====
# - 'ID' and 'md5' only identify a sample; they carry no predictive signal
files_without_useless_features = files.drop(['ID', 'md5'], axis=1)

# ==== Split features and labels ====
print("Splitting dataset Features and Labels [ ... ]")
files_without_labels = files_without_useless_features.drop("legitimate", axis=1)
files_labels = files["legitimate"].copy()
print("Splitting dataset Features and Labels [ DONE ]")

# ==== Replace null values with the median ====
print("Replacing null values with Median [ ... ]")
imputer = SimpleImputer(strategy="median")
# - No need to drop categorical features: this dataset has none
# - Fit the imputer so it learns the median of each column
print("Imputer is learning medians [ ... ]")
imputer.fit(files_without_labels)
print("Imputer is learning medians [ DONE ]")

# - Apply the learned medians to the null values
print("Imputer is transforming DataSet [ ... ]")
files_without_labels_X_tr = imputer.transform(files_without_labels)
print("Imputer is transforming DataSet [ DONE ]")

# - Turn the NumPy array back into a pandas DataFrame
# - (kept for illustration; the pipeline below redoes the imputation and adds scaling)
files_without_labels_tr = pd.DataFrame(files_without_labels_X_tr,
                                       columns=files_without_labels.columns)
print("Replacing null values with Median [ DONE ]")

# - Create the pipeline: median imputation followed by standard scaling -
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# - Prepare the dataset by passing it through the pipeline -
print("Dataset passing through the pipeline [ ... ]")
files_prepared = pipeline.fit_transform(files_without_labels)
files_prepared = pd.DataFrame(files_prepared, columns=files_without_labels.columns)
print("Dataset passing through the pipeline [ DONE ]")
print("Describe of 'files_prepared': ")
print(files_prepared.describe())

# =-=-=-=-=-=-= Feature selection =-=-=-=-=-=-=
print("Extracting most correlated features [ ... ]")
# - n_jobs=-1 uses every available CPU thread -
f_select = ExtraTreesClassifier(n_jobs=-1).fit(files_prepared, files_labels)
model = SelectFromModel(f_select, prefit=True)
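# - Note (illustrative aside, not part of the original script): with no explicit
#   threshold, SelectFromModel keeps the features whose importance is >= the mean
#   of f_select.feature_importances_. The print below just makes that cutoff
#   visible; drop it if the output is too noisy.
print("Selection threshold (mean importance): %f"
      % f_select.feature_importances_.mean())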
]") files_features_short = model.transform(files_prepared) print("Extracting most correlated features [ DONE ]") print("Features nb before filter: ", files_prepared.shape[1]) print("Features nb after filter: ", files_features_short.shape[1]) print(" --- Features list after ExtraTreesClassifier job --- ") nb_features = files_features_short.shape[1] indices = np.argsort(f_select.feature_importances_)[::-1][:nb_features] features_to_keep = [] for f in range(nb_features): print("%d. feature %s (%f)" % (f + 1, files_prepared.columns[indices[f]], f_select.feature_importances_[indices[f]])) # - keep a list of the features to keep features_to_keep.append(files_prepared.columns[indices[f]]) # - build DataFrame based on original dataset with selected features only - files_shorted_features = files_prepared[features_to_keep] #print(files_shorted_features.describe())