import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import joblib

# - For features selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

print("Loading dataset in memory [ ... ]")
file_to_test = pd.read_csv('dataset/dataset_clean.txt', delimiter=',', low_memory=False)
print("Loading dataset in memory [ DONE ]")

# The positional indices used by ManualFeatureSelector assume that these two
# columns have been removed.
file_to_test = file_to_test.drop("ID", axis=1)
file_to_test = file_to_test.drop("md5", axis=1)
# --- remove labels ---
# Left disabled: the prediction tester at the bottom of the file reads the
# expected answer from each dataset row.
#file_to_test = file_to_test.drop("legitimate", axis=1)


# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=

# - Features manual transformer -
class ManualFeatureSelector(TransformerMixin, BaseEstimator):
    """Keep only the hand-picked feature columns of a 2-D array.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    returns the columns listed in ``FEATURE_INDICES``, in that order.
    Inheriting from ``BaseEstimator`` gives it ``get_params``/``set_params``
    like any other pipeline step.
    """

    # Column positions to keep once "ID" and "md5" have been dropped.
    # According to the original author's note they correspond, in order, to:
    #   'Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy',
    #   'MajorSubsystemVersion', 'Machine', 'Subsystem',
    #   'ResourcesMaxEntropy', 'ResourcesMinEntropy',
    #   'VersionInformationSize', 'MajorOperatingSystemVersion',
    #   'ResourcesMeanEntropy', 'SectionsMeanEntropy'
    FEATURE_INDICES = [1, 24, 34, 18, 0, 23, 48, 47, 53, 14, 46, 32]
    # If ID and md5 are NOT dropped after the dataset import, the original
    # author's alternative list is:
    #   [4, 27, 37, 21, 3, 26, 51, 50, 56, 17, 49, 35]
    # NOTE(review): the two lists differ by 3 positions although only two
    # columns are dropped above -- confirm against the real column layout.

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (there is nothing to learn)."""
        return self

    def transform(self, X):
        """Return the selected columns of ``X``.

        ``X`` must support NumPy-style ``X[:, [i, j, ...]]`` indexing.
        The ``X.transpose()`` call that appeared here in the original was
        dropped: its result was never used, so it had no effect.
        """
        return X[:, self.FEATURE_INDICES]


# =-=-=-=-=-=-= Pipelines =-=-=-=-=-=-=

# - This single pipeline selects the useful features, then fills missing
#   values with the column median. It replaces the former separate
#   `pipeline_features_selection` pipeline. No scaling step is applied. -
pipeline = Pipeline([
    ('features_remap', ManualFeatureSelector()),
    ('imputer', SimpleImputer(strategy="median")),
])


def full_pipeline(data):
    """Run feature selection and median imputation on ``data``.

    The original body called ``pipeline_features_selection``, which only
    exists in commented-out code, so any call raised ``NameError``. Feature
    selection is now a step of ``pipeline`` itself, so a single call covers
    both stages (running the selection twice would also index past the 12
    remaining columns).

    Args:
        data: 2-D NumPy array holding complete dataset rows.

    Returns:
        The array produced by ``pipeline.fit_transform(data)``.
    """
    return pipeline.fit_transform(data)


# =-=-=-=-=-=-= Load previously saved model =-=-=-=-=-=-=
# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
saved_model = joblib.load("models/malware_classifier_5.pkl")


# =-=-=-=-=-=-= Prediction core =-=-=-=-=-=-=-=

def predict_one_line(model, line):
    """Predict the class of one dataset row with ``model``.

    Args:
        model: fitted estimator exposing ``predict``.
        line: 2-D NumPy array containing a single complete row; build it
            with ``np.array([['info1', 'info2', 'infox']])``.

    Returns:
        The first (and only) element of ``model.predict(...)``.
    """
    # NOTE(review): the pipeline is re-fitted on the row being predicted, so
    # the imputer's medians come from that single row only. Confirm this is
    # intended rather than reusing statistics fitted on the training data.
    X_unknown = full_pipeline(line)
    # NOTE(review): ' MajorOperatingSystemVersion' carries a leading space.
    # It is left untouched because the saved model may have been trained with
    # exactly these column names -- verify before correcting it.
    X_unknown_columns = [
        'Characteristics',
        'DllCharacteristics',
        'SectionsMaxEntropy',
        'MajorSubsystemVersion',
        'Machine',
        'Subsystem',
        'ResourcesMaxEntropy',
        'ResourcesMinEntropy',
        'VersionInformationSize',
        ' MajorOperatingSystemVersion',
        'ResourcesMeanEntropy',
        'SectionsMeanEntropy',
    ]
    X_unknown = pd.DataFrame(X_unknown, columns=X_unknown_columns)
    ans = model.predict(X_unknown)
    #ans_type = ['malicious', 'legitimate']
    return ans[0]


# =-=-=-=-=-=-=-= Semi-auto prediction tester =-=-=-=-=-=-=
# - Takes each line of the dataset
# - Parse it as a numpy array
# - Send it to the prediction function (predict_one_line)
# - Compare the result with the expected value
# - Save the result
# - At the end, print the prediction accuracy result
res = []
# Rows [first_row_to_test, nb_malware_to_test) of the dataset are evaluated.
# Set first_row_to_test to 0 to start from the beginning of the dataset.
first_row_to_test = 34179
#nb_malware_to_test = 50
nb_malware_to_test = 34199
# Number of rows actually evaluated. The original reported the score against
# nb_malware_to_test, which overstated the denominator whenever the start
# index was not 0.
nb_rows_tested = max(nb_malware_to_test - first_row_to_test, 0)
# Position of the expected answer inside a dataset row.
label_column = 54
good_ans = 0
features = None
# Build the NumPy view of the dataset once instead of on every iteration.
dataset_rows = file_to_test.values

for i in range(first_row_to_test, nb_malware_to_test):
    # Progress is counted relative to the evaluated range so the percentage
    # starts at 0 even when the range does not begin at row 0.
    done = i - first_row_to_test
    # The 1.2 factor is presumably an estimate of seconds per prediction.
    print(" =-=-=-= Prediction {} out of {} ({}%) [ ERT ~ {} min ] =-=-=-=".format(
        done,
        nb_rows_tested,
        round((done / nb_rows_tested) * 100, 1),
        round(((nb_malware_to_test - i) * 1.2) / 60, 1),
    ))

    row = dataset_rows[i]
    # Go through a Python list so NumPy re-infers the dtype of the one-row,
    # 2-D array, exactly as the original code did.
    features = np.array([row.tolist()])

    prediction = predict_one_line(saved_model, features)
    res.append(prediction)
    if prediction == row[label_column]:
        good_ans += 1

print(features)
print(res)
print(" ===> Got {} / {} good answers".format(good_ans, nb_rows_tested))

l1 = dataset_rows[2]