"""Load the cleaned dataset and report how each feature correlates with the label.

The script reads ``dataset/dataset_clean.txt`` as a comma-separated file,
computes the pairwise correlation matrix of its numeric columns, and prints
the correlations against the ``legitimate`` column in descending order.
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

# - For features selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# --- Load the dataset -------------------------------------------------------
print("Loading dataset in memory [ ... ]")
# low_memory=False makes pandas parse the file in a single pass so that each
# column gets one consistent dtype instead of per-chunk guesses.
files = pd.read_csv('dataset/dataset_clean.txt', delimiter=',', low_memory=False)
print("Loading dataset in memory [ DONE ]")

print("Dataset basic infos:")
# Optional quick look at the data; left disabled because the output is verbose.
#print(files.describe())
#print(files.info())

# --- Correlation of every feature with the label ----------------------------
print("Generating the correlation matrix [ ... ]")
# Restrict to numeric/bool columns before correlating: pandas >= 2.0 raises a
# ValueError when DataFrame.corr() meets a non-numeric column, whereas older
# versions silently dropped such columns. Selecting explicitly gives the same
# result on every pandas version.
numeric_columns = files.select_dtypes(include=["number", "bool"])
corr_matrix = numeric_columns.corr()
print("Generating the correlation matrix [ DONE ]")

print("Correlation matrix:")
# Only the label column of the matrix is shown, strongest positive first.
print(corr_matrix['legitimate'].sort_values(ascending=False))