big_data_projet/02_draft_preparation_work.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

# - For features selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

print("Loading dataset in memory [ ... ]")
files = pd.read_csv('dataset/dataset_clean.txt',delimiter=',', low_memory=False)
print("Loading dataset in memory [ DONE ]")

# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=
# ==== Split DataSet again but lets stratify with Machine feature ====
# - Add tmp cat in order to be able to stratify data while splitting it

split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
for train_index, test_index in split.split(files, files["legitimate"]):
	strat_train_set = files.loc[train_index]
	strat_test_set = files.loc[test_index]

files = strat_train_set.copy()
# ==== Drop useless features ====

files_without_useless_features = files.drop(['ID', 'md5'], axis=1)

# ==== Split Features and Labels ====

print("Splitting dataset Features and Labels [ ... ]")
files_without_labels = files_without_useless_features.drop("legitimate", axis=1)
files_labels = files["legitimate"].copy()
print("Splitting dataset Features and Labels [ DONE ]")

# ==== Replace Null values with median ====

print("Replacing null values with Median [ ... ]")
imputer = SimpleImputer(strategy="median")
# - No need to drop categorical features, we dont have those in this dataset
# - Fit the imputer with the data
print("Imputer is learning medians [ ... ]")
imputer.fit(files_without_labels)
print("Imputer is learning medians [ DONE ]")
# - Apply median on null values
print("Imputer is transforming DataSet [ ... ]")
files_without_labels_X_tr = imputer.transform(files_without_labels)
print("Imputer is transforming DataSet [ DONE ]")
# - Transform NumPy array into PD DataFrame
files_without_labels_tr = pd.DataFrame(files_without_labels_X_tr, columns=files_without_labels.columns)
print("Replacing null values with Median [ DONE ]")

# - Create the pipeline -

pipeline = Pipeline([
	('imputer', SimpleImputer(strategy="median")),
	('std_scaler', StandardScaler()),
])

# - Prepare dataset, pass it through the pipeline - i
print("Dataset passing through the pipeline [ ... ]")
files_prepared = pipeline.fit_transform(files_without_labels)
files_prepared = pd.DataFrame(files_prepared, columns=files_without_labels.columns)
print("Dataset passing through the pipeline [ DONE ]")
print("Describe of 'files_prepared': ")
print(files_prepared.describe())


# =-=-=-=-=-=-= Features selection =-=-=-=-=-=-=

print("Extracting most correlated features [ ... ]")
print(" -> Line 1 [ ... ]")
# - Here we use n_jobs with "-1" in order to use all threads of the CPU -
f_select = ExtraTreesClassifier(n_jobs = -1).fit(files_prepared, files_labels)

print(" -> Line 2 [ ... ]")
model = SelectFromModel(f_select, prefit=True)

print(" -> Line 3 [ ... ]")
files_features_short = model.transform(files_prepared)
print("Extracting most correlated features [ DONE ]")

print("Features nb before filter: ", files_prepared.shape[1])
print("Features nb after filter:  ", files_features_short.shape[1])

print(" --- Features list after ExtraTreesClassifier job --- ")
nb_features = files_features_short.shape[1]
indices = np.argsort(f_select.feature_importances_)[::-1][:nb_features]

features_to_keep = []
for f in range(nb_features):
	print("%d. feature %s (%f)" % (f + 1, files_prepared.columns[indices[f]], f_select.feature_importances_[indices[f]]))
	# - keep a list of the features to keep
	features_to_keep.append(files_prepared.columns[indices[f]])
# - build DataFrame based on original dataset with selected features only -
files_shorted_features = files_prepared[features_to_keep]
#print(files_shorted_features.describe())