import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.ensemble import AdaBoostClassifier 
from sklearn.naive_bayes import GaussianNB 
import joblib 


# - For features selection 
from sklearn.ensemble import ExtraTreesClassifier 
from sklearn.feature_selection import SelectFromModel 

print("Loading dataset in memory [ ... ]")
files = pd.read_csv('dataset/dataset_clean.txt',delimiter=',', low_memory=False) 
print("Loading dataset in memory [ DONE ]")

# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-= 

files = files.drop("ID", axis=1) 
files = files.drop("md5", axis=1) 

# ==== Split DataSet again but lets stratify with Machine feature ==== 
# - Add tmp cat in order to be able to stratify data while splitting it 
#files["legitimate_cat"] = pd.cut(files["legitimate"], bins=[0, 1], labels=[1,2]) 

split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42) 
for train_index, test_index in split.split(files, files["legitimate"]): 
	strat_train_set = files.loc[train_index]  
	strat_test_set = files.loc[test_index] 

# - Remove tmp created cat, now the data is splitted, we dont need it anymore 
#for set_ in (strat_train_set, strat_test_set): 
#	set_.drop("legitimate_cat", axis=1, inplace=True)
 
files = strat_train_set.copy() 

# ==== Split Features and Labels ==== 

print("Splitting dataset Features and Labels [ ... ]")
#files_without_labels = files_without_useless_features.drop("legitimate", axis=1)
files_without_labels = files.drop("legitimate", axis=1)
files_labels = files["legitimate"].copy() 
print("Splitting dataset Features and Labels [ DONE ]")

imputer = SimpleImputer(strategy="median")

# - Features manual transformer - 
class ManualFeatureSelector(TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # -- corresponding values to these indices are -> ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy'] -- 
        return X.values[:,[1,24,34,18,0,23,48,47,53,14,46,32]] 
	# - If ID and md5 not dropped juste after dataset import use the other return - 
        #return X.values[:,[3,26,36,20,2,25,50,49,55,16,48,34]]
# - Create the pipeline - 

pipeline = Pipeline([
	('features_remap', ManualFeatureSelector()), 
	('imputer', SimpleImputer(strategy="median")), 
]) 

# - Prepare dataset, pass it through the pipeline - 
print("Dataset passing through the pipeline [ ... ]")
features_columns_name = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMax    Entropy', 'ResourcesMinEntropy', 'VersionInformationSize', 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy'] 
files_prepared = pipeline.fit_transform(files_without_labels) 
files_prepared = pd.DataFrame(files_prepared, columns=features_columns_name) 
print("Dataset passing through the pipeline [ DONE ]") 
print("Describe of 'files_prepared': ")
print(files_prepared.describe())
 
# =-=-=-=-=-=-= Select the algo Model  =-=-=-=-=-=-= 

algos = {
        "DecisionTree": DecisionTreeClassifier(max_depth=10),
        "RandomForest": RandomForestClassifier(n_estimators=50, n_jobs=-1),
        "GradientBoosting": GradientBoostingClassifier(n_estimators=50),
        "AdaBoost": AdaBoostClassifier(n_estimators=100),
        "GNB": GaussianNB()
    }

X_train = files_prepared 
y_train = files_labels  

X_test = pipeline.fit_transform(strat_test_set.drop("legitimate", axis=1)) 
y_test = strat_test_set['legitimate'].copy() 

results = {}
print("Testing 5 algo [ ... ]")
for algo in algos:
    cur_algo = algos[algo]
    cur_algo.fit(X_train, y_train)
    #score = cur_algo.score(X_test, y_test)
    score = cur_algo.score(X_test, y_test)
    print("%s : %f %%" % (algo, score*100))
    results[algo] = score

winner = max(results, key=results.get)
print('Winner algorithm is %s with a %f %% success' % (winner, results[winner]*100)) 

# =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-= 
#print("Saving the model [ ... ]")
#joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_5.pkl") 
#print("Saving the model [ DONE ]")