import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

#housing = pd.read_csv('https://raw.githubusercontent.com/securitylab-repository/TPS-IA/master/datasets/housing.csv', delimiter=',')
housing = pd.read_csv('housing.csv', delimiter=',')

# - Quick exploration of the dataset (commented out) -
#housing.head()
#housing.info()
#housing['ocean_proximity'].sample(100)
#housing["ocean_proximity"].value_counts()
#housing.describe()
#housing.hist(bins=50, figsize=(20,15))
#plt.show()

print("======================== Splitting [...]")
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
print("======================== Splitting [OK]")

# - Sanity checks on the split (commented out) -
#print(type(train_set))
#print(type(test_set))
#train_set.info()
#test_set.info()
#print(train_set)

### === Data analysis === ###
# - Bucket median_income into 5 categories so we can stratify the split on income -
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])
#housing["income_cat"].hist()
#plt.show()

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

print(strat_test_set["income_cat"].value_counts() / len(strat_test_set))

for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

housing = strat_train_set.copy()

# --- Data visualization ---
#housing.plot(kind="scatter", x="longitude", y="latitude")
#housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
#plt.show()

# --- Correlation matrix ---
# - numeric_only=True keeps the text column out of the computation
#   (required since pandas 2.0, where corr() no longer drops it silently) -
corr_matrix = housing.corr(numeric_only=True)
print(corr_matrix['median_house_value'].sort_values(ascending=False))
#housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)
#plt.show()

# - Derived attributes often correlate better with the target than the raw ones -
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]

corr_matrix = housing.corr(numeric_only=True)
print(corr_matrix["median_house_value"].sort_values(ascending=False))
# --- End of data analysis ---

### === Prepare Data for ML === ###
# --- Split Features and Labels ---
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

# --- Replace null values with the median value ---
# - tell the imputer to use the median as the replacement strategy -
imputer = SimpleImputer(strategy="median")
# - drop the non-numeric feature: we can't compute the median of a word -
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)
print(imputer.statistics_)
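# - Optional sanity check (a sketch, not part of the original flow): the
#   imputer's learned statistics should match the medians pandas computes
#   directly. The name `medians_check` is illustrative, introduced here. -
medians_check = housing_num.median().values
assert np.allclose(imputer.statistics_, medians_check)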
# - transform housing_num, replacing nulls with the medians learned above -
X = imputer.transform(housing_num)
# - X is a NumPy array; convert it back to a pandas DataFrame,
#   keeping the original column names and index -
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

# --- Handling text and categorical attributes ---
housing_cat = housing[["ocean_proximity"]]
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
# - just to check the category encoding -
print(housing_cat_1hot.toarray())
print(cat_encoder.categories_)
print(housing.head(3))

# --- Custom Transformers ---
# - column indices in the numeric array (housing_num column order) -
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

# --- Transformation pipelines ---
# -- Numeric pipeline --
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)

# -- Full pipeline, including categorical attributes, with the custom transformer --
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)

### === Select and Train Model === ###
# --- Train and evaluate on the training set ---
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

# - just a quick test on a few training samples -
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions: ", lin_reg.predict(some_data_prepared))
print("Labels: ", list(some_labels))

# - not very good; let's measure the error with RMSE -
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print("RMSE [lin_reg] (lower is better): ", lin_rmse)

# - Linear regression is not well suited to this dataset -
# - Let's try another model (DecisionTreeRegressor) -
# - Train the model ... -
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
# - Compute the error on the training predictions ... -
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print("RMSE [tree_reg] (lower is better): ", tree_rmse)
# - The RMSE is 0: the tree memorized the training set, so it is badly overfitting -
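# - Optional sketch (illustrative, not in the original flow): a single
#   hold-out split already exposes the overfitting before running full
#   cross-validation. The names X_tr/X_val/quick_tree are hypothetical. -
X_tr, X_val, y_tr, y_val = train_test_split(housing_prepared, housing_labels,
                                            test_size=0.2, random_state=42)
quick_tree = DecisionTreeRegressor(random_state=42)
quick_tree.fit(X_tr, y_tr)
val_rmse = np.sqrt(mean_squared_error(y_val, quick_tree.predict(X_val)))
print("Hold-out RMSE [tree_reg, sketch]: ", val_rmse)  # far from 0 on unseen data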
# --- Cross-validation: split the training set into 10 folds and evaluate on each validation fold ---
# ---- CV score for tree_reg ----
score = cross_val_score(tree_reg, housing_prepared, housing_labels,
                        scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-score)
print("CV RMSE mean [tree_reg]: ", tree_rmse_scores.mean())

# ---- CV score for lin_reg ----
score = cross_val_score(lin_reg, housing_prepared, housing_labels,
                        scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-score)
print("CV RMSE mean [lin_reg]: ", lin_rmse_scores.mean())

# ---- CV score for forest_reg ----
forest_reg = RandomForestRegressor(n_jobs=-1)
#forest_reg = RandomForestRegressor()
print("Training model [...]")
forest_reg.fit(housing_prepared, housing_labels)
print("Training model [DONE]")

print("Cross-validating model [...]")
score = cross_val_score(forest_reg, housing_prepared, housing_labels,
                        scoring="neg_mean_squared_error", cv=10, n_jobs=-1)
#score = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
print("Cross-validating model [DONE]")

print("--- Scores on validation folds ---")
forest_rmse_scores = np.sqrt(-score)
print("CV RMSE mean [forest_reg]: ", forest_rmse_scores.mean())
print("CV RMSE standard deviation [forest_reg]: ", forest_rmse_scores.std())

housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
print("Training RMSE [forest_reg] (lower is better): ", forest_rmse)
print("Predictions: ", forest_reg.predict(some_data_prepared))
print("Labels: ", list(some_labels))

# ============== Test set ==============
print("=========== TestSet ===========")
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

X_test_prepared = full_pipeline.transform(X_test)
X_test_prepared_5 = full_pipeline.transform(X_test.iloc[:5])
y_test_5 = y_test.iloc[:5]

final_prediction = forest_reg.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_prediction)
final_rmse = np.sqrt(final_mse)
print("Final RMSE on the test set (lower is better): ", final_rmse)
print("Predictions: ", forest_reg.predict(X_test_prepared_5))
print("Labels: ", list(y_test_5))
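# - Optional sketch: persist the fitted pipeline and model for later reuse.
#   The file names are illustrative; joblib ships with scikit-learn's dependencies. -
import joblib
joblib.dump(full_pipeline, "full_pipeline.pkl")
joblib.dump(forest_reg, "forest_reg.pkl")
#full_pipeline = joblib.load("full_pipeline.pkl")
#forest_reg = joblib.load("forest_reg.pkl")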