import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import (StratifiedShuffleSplit, cross_val_score,
                                     train_test_split)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

# - load the California housing dataset (local copy of the course CSV) -
#housing = pd.read_csv('https://raw.githubusercontent.com/securitylab-repository/TPS-IA/master/datasets/housing.csv', delimiter=',')
housing = pd.read_csv('housing.csv', delimiter=',')

# - quick exploration (uncomment as needed) -
#housing.head()
#housing.info()
#housing['ocean_proximity'].sample(100)
#housing["ocean_proximity"].value_counts()
#housing.describe()
#housing.hist(bins=50, figsize=(20, 15))
#plt.show()

print("======================== Training [...]")
|
|
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
|
|
print("======================== Training [OK]")
|
|
|
|
print("======================== Type -> train_set")
|
|
#type(train_set)
|
|
print("======================== Type -> test_set")
|
|
#type(test_set)
|
|
print("======================== Type -> train_set.info()")
|
|
#type(train_set.info())
|
|
print("======================== Type -> test_set.info()")
|
|
#type(test_set.info())
|
|
|
|
#print(train_set)
|
|
|
|
### === Data analysis === ###

# - bucket median_income into five categories for stratified sampling -
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])
#housing["income_cat"].hist()
#plt.show()

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

# - check the income-category proportions in the stratified test set -
print(strat_test_set["income_cat"].value_counts() / len(strat_test_set))
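
# A quick comparison sketch: the same proportions over the full dataset; the
# stratified test set above should track these closely, whereas a purely
# random split would drift further from them.
print(housing["income_cat"].value_counts() / len(housing))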

# - drop income_cat so the data is back to its original state -
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

housing = strat_train_set.copy()

# --- Data visualisation ---
#housing.plot(kind="scatter", x="longitude", y="latitude")
#housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
#plt.show()

# --- Correlation matrix ---
# - numeric_only=True keeps the text column ocean_proximity out of the
# - computation (required on recent pandas versions) -
corr_matrix = housing.corr(numeric_only=True)
print(corr_matrix['median_house_value'].sort_values(ascending=False))

#housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)
#plt.show()

# - experiment with derived attribute combinations -
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]

corr_matrix = housing.corr(numeric_only=True)
print(corr_matrix["median_house_value"].sort_values(ascending=False))

# --- End of data analysis ---


### === Prepare Data for ML === ###

# --- Split features and labels ---
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

# --- Replace null values with the median ---

# - tell the imputer to use the median as the replacement value -
imputer = SimpleImputer(strategy="median")

# - drop the non-numeric feature first: there is no median of a word -
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)
print(imputer.statistics_)

# - transform housing_num with the medians the imputer just learned -
X = imputer.transform(housing_num)

# - X is now a NumPy array; convert it back to a DataFrame, keeping the
# - original index so rows stay aligned with housing_labels -
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)
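
# A sanity-check sketch: the statistics the imputer learned should equal the
# column medians computed directly with pandas.
print(np.allclose(imputer.statistics_, housing_num.median().values))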

# --- Handling text and categorical attributes ---
housing_cat = housing[["ocean_proximity"]]
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

# - check the encoding (fit_transform returns a SciPy sparse matrix) -
print(housing_cat_1hot.toarray())
print(cat_encoder.categories_)
print(housing.head(3))
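
# An inspection sketch (assumes the one-hot matrix fits in memory once
# densified): label the encoded columns with the learned categories.
housing_cat_df = pd.DataFrame(housing_cat_1hot.toarray(),
                              columns=cat_encoder.categories_[0],
                              index=housing_cat.index)
print(housing_cat_df.head(3))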

# --- Custom Transformers ---

# - column indices in the NumPy array built from housing:
# - total_rooms, total_bedrooms, population, households -
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        return self  # nothing else to do

    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
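
# A robustness sketch (illustrative only): the hard-coded indices above can be
# looked up by column name instead, so a reordering of the columns cannot
# silently break the transformer.
lookup = [housing.columns.get_loc(c)
          for c in ("total_rooms", "total_bedrooms", "population", "households")]
assert lookup == [rooms_ix, bedrooms_ix, population_ix, households_ix]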

# --- Transformation pipelines ---

# -- Numeric pipeline --
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

housing_num_tr = num_pipeline.fit_transform(housing_num)
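
# A quick verification sketch: after the pipeline, every numeric column should
# be roughly standardized (mean ~0, standard deviation ~1).
print(housing_num_tr.mean(axis=0).round(2))
print(housing_num_tr.std(axis=0).round(2))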

# -- Full pipeline: the numeric pipeline (with the custom transformer) plus
# -- the categorical attribute --
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)
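
# A bookkeeping sketch (assumes add_bedrooms_per_room=True, the pipeline
# default): reconstruct the column order of housing_prepared by hand, since
# the custom transformer exposes no feature names of its own.
extra_attribs = ["rooms_per_household", "population_per_household",
                 "bedrooms_per_room"]
cat_one_hot_attribs = list(full_pipeline.named_transformers_["cat"].categories_[0])
prepared_columns = num_attribs + extra_attribs + cat_one_hot_attribs
print(len(prepared_columns), "columns vs. prepared shape", housing_prepared.shape)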

### === Select and Train Model === ###

# --- Train and evaluate on the training set ---
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

# - a quick smoke test on a few training instances -
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions: ", lin_reg.predict(some_data_prepared))
print("Labels: ", list(some_labels))

# - Not very convincing; let's quantify the error with RMSE -
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print("RMSE [lin_reg] (lower is better): ", lin_rmse)

# - Linear regression underfits this dataset -
# - Let's try another model: a decision tree regressor -

# - Train the model... -
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

# - ...and compute the error on its own training predictions -
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print("RMSE [tree_reg] (lower is better): ", tree_rmse)

# - An RMSE of 0 is suspicious: the tree has badly overfit the training set -

# --- Cross-validation: split the training set into folds and evaluate on each held-out fold ---

# ---- CV score for tree_reg ----
# - scikit-learn scorers follow "greater is better", hence neg_mean_squared_error;
# - negate the scores back before taking the square root -
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
print("CV RMSE mean [tree_reg]: ", tree_rmse_scores.mean())

# ---- CV score for lin_reg ----
scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-scores)
print("CV RMSE mean [lin_reg]: ", lin_rmse_scores.mean())

# ---- CV score for forest_reg ----
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)

scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-scores)
print("CV RMSE mean [forest_reg]: ", forest_rmse_scores.mean())
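
# A closing sketch (assumes the random forest is the model being kept): score
# it once on the held-out test set, transforming the test data with the
# pipeline fitted on the training set (transform, never fit_transform here).
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = forest_reg.predict(X_test_prepared)
final_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
print("Test-set RMSE [forest_reg]: ", final_rmse)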