import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

# - For feature selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

print("Loading dataset in memory [ ... ]")
|
|
files = pd.read_csv('dataset/dataset_clean.txt',delimiter=',', low_memory=False)
|
|
print("Loading dataset in memory [ DONE ]")
|
|
|
|
# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=

# ==== Split DataSet, stratified on the "legitimate" feature ====
# - Stratify on the label so the train and test sets keep the same class proportions

split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
for train_index, test_index in split.split(files, files["legitimate"]):
    strat_train_set = files.loc[train_index]
    strat_test_set = files.loc[test_index]

files = strat_train_set.copy()

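# - Optional sanity check (a minimal sketch): the stratified split should keep the
#   "legitimate" class proportions roughly identical in both subsets
print("Class ratio (train):", strat_train_set["legitimate"].value_counts(normalize=True).to_dict())
print("Class ratio (test) :", strat_test_set["legitimate"].value_counts(normalize=True).to_dict())
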
# ==== Drop useless features ====
# - 'ID' and 'md5' are identifiers, not predictive features
files_without_useless_features = files.drop(['ID', 'md5'], axis=1)

# ==== Split Features and Labels ====

print("Splitting dataset Features and Labels [ ... ]")
files_without_labels = files_without_useless_features.drop("legitimate", axis=1)
files_labels = files["legitimate"].copy()
print("Splitting dataset Features and Labels [ DONE ]")

# ==== Replace Null values with median ====

print("Replacing null values with Median [ ... ]")
imputer = SimpleImputer(strategy="median")
# - No need to drop categorical features, we don't have any in this dataset
# - Fit the imputer on the data
print("Imputer is learning medians [ ... ]")
imputer.fit(files_without_labels)
print("Imputer is learning medians [ DONE ]")
# - Apply the learned medians to the null values
print("Imputer is transforming DataSet [ ... ]")
files_without_labels_X_tr = imputer.transform(files_without_labels)
print("Imputer is transforming DataSet [ DONE ]")
# - Transform the NumPy array back into a pandas DataFrame
files_without_labels_tr = pd.DataFrame(files_without_labels_X_tr, columns=files_without_labels.columns)
print("Replacing null values with Median [ DONE ]")

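# - Quick check (illustrative): the imputed DataFrame should contain no null values
print("Null values remaining after imputation:", int(files_without_labels_tr.isnull().sum().sum()))
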
# - Create the pipeline -
# - The pipeline re-applies the median imputation and then standardises every feature

pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# - Prepare the dataset, pass it through the pipeline -
print("Dataset passing through the pipeline [ ... ]")
files_prepared = pipeline.fit_transform(files_without_labels)
files_prepared = pd.DataFrame(files_prepared, columns=files_without_labels.columns)
print("Dataset passing through the pipeline [ DONE ]")
print("Describe of 'files_prepared': ")
print(files_prepared.describe())

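# - Sketch (assumed next step, illustrative variable names): the held-out test set
#   must later go through the same *fitted* pipeline with transform() only, never
#   fit_transform(), so it is imputed and scaled with the training-set statistics
files_test_without_labels = strat_test_set.drop(['ID', 'md5', 'legitimate'], axis=1)
files_test_labels = strat_test_set["legitimate"].copy()
files_test_prepared = pd.DataFrame(pipeline.transform(files_test_without_labels),
                                   columns=files_test_without_labels.columns)
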
# =-=-=-=-=-=-= Feature selection =-=-=-=-=-=-=

print("Extracting most relevant features [ ... ]")
print(" -> Fitting ExtraTreesClassifier [ ... ]")
# - n_jobs = -1 uses all threads of the CPU -
f_select = ExtraTreesClassifier(n_jobs = -1).fit(files_prepared, files_labels)

print(" -> Line 2 [ ... ]")
|
|
model = SelectFromModel(f_select, prefit=True)
|
|
|
|
print(" -> Line 3 [ ... ]")
|
|
files_features_short = model.transform(files_prepared)
|
|
print("Extracting most correlated features [ DONE ]")
|
|
|
|
print("Features nb before filter: ", files_prepared.shape[1])
|
|
print("Features nb after filter: ", files_features_short.shape[1])
|
|
|
|
print(" --- Features list after ExtraTreesClassifier job --- ")
|
|
nb_features = files_features_short.shape[1]
|
|
indices = np.argsort(f_select.feature_importances_)[::-1][:nb_features]
|
|
|
|
features_to_keep = []
|
|
for f in range(nb_features):
|
|
print("%d. feature %s (%f)" % (f + 1, files_prepared.columns[indices[f]], f_select.feature_importances_[indices[f]]))
|
|
# - keep a list of the features to keep
|
|
features_to_keep.append(files_prepared.columns[indices[f]])
|
|
# - build DataFrame based on original dataset with selected features only -
|
|
files_shorted_features = files_prepared[features_to_keep]
|
|
#print(files_shorted_features.describe())
|
|
|
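
# - Alternative sketch (assuming scikit-learn's SelectFromModel.get_support()): the
#   same column subset can be recovered directly from the selector, in original
#   column order rather than sorted by importance:
#
#   features_to_keep = files_prepared.columns[model.get_support()]
#   files_shorted_features = files_prepared[features_to_keep]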