"""Exploratory script for the cleaned malware dataset.

Loads ``dataset_clean.txt`` (CSV), prints the correlation of every numeric
feature against the ``legitimate`` label, then splits the rows into an
80/20 train/test partition with a fixed seed for reproducibility.
"""

# --- Standard third-party stack ---------------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# --- scikit-learn: model selection, preprocessing, models, metrics ----------
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# - For features selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

print("Loading dataset in memory [ ... ]")
files = pd.read_csv('dataset_clean.txt', delimiter=',', low_memory=False)
print("Loading dataset in memory [ DONE ]")

print("Dataset basic infos:")
#print(files.describe())
#print(files.info())

print("Generating the correlation matrix [ ... ]")
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns otherwise;
# this keeps the intended "numeric features only" behavior on all versions.
corr_matrix = files.corr(numeric_only=True)
print("Generating the correlation matrix [ DONE ]")

print("Correlation matrix:")
# Features most positively correlated with the 'legitimate' label come first.
print(corr_matrix['legitimate'].sort_values(ascending=False))

# =-=-=-=-=-=-=-= Data Analysis =-=-=-=-=-=-=
# ==== Split dataset (train/test) ====
print("Splitting dataset (train/test) [distribution -> 20%] [ ... ]")
# random_state pinned so the split is reproducible across runs.
train_set, test_set = train_test_split(files, test_size=0.2, random_state=42)
print("Splitting dataset (train/test) [distribution -> 20%] [ DONE ]")