"""Evaluate a saved malware classifier against a cleaned dataset.

Loads ``dataset/dataset_clean.txt``, re-applies the manual feature
selection that was used at training time, and reports how many of the
first ``nb_malware_to_test`` rows the saved model classifies correctly
(ground-truth label assumed to be column 56 of the dataset — TODO
confirm against the dataset-cleaning script).
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    AdaBoostClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    RandomForestRegressor,
)
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import (
    StratifiedShuffleSplit,
    cross_val_score,
    train_test_split,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# Column indices of the features the model was trained on, and their
# names, kept in ONE place so the selector class, the per-row helper and
# the DataFrame columns can never drift apart.  Order matters: index
# FEATURE_INDICES[k] corresponds to name FEATURE_NAMES[k].
FEATURE_INDICES = [3, 26, 36, 20, 2, 25, 50, 49, 55, 16, 48, 34]
FEATURE_NAMES = [
    'Characteristics',
    'DllCharacteristics',
    'SectionsMaxEntropy',
    'MajorSubsystemVersion',
    'Machine',
    'Subsystem',
    'ResourcesMaxEntropy',
    'ResourcesMinEntropy',
    'VersionInformationSize',
    # NOTE(review): the original prediction path spelled this with a
    # leading space (' MajorOperatingSystemVersion'), which mismatches
    # the training-time feature name — fixed here.
    'MajorOperatingSystemVersion',
    'ResourcesMeanEntropy',
    'SectionsMeanEntropy',
]

print("Loading dataset in memory [ ... ]")
file_to_test = pd.read_csv('dataset/dataset_clean.txt', delimiter=',',
                           low_memory=False)
print("Loading dataset in memory [ DONE ]")

# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=


class ManualFeatureSelector(BaseEstimator, TransformerMixin):
    """Select the training-time feature columns from a raw DataFrame.

    Inherits BaseEstimator (in addition to TransformerMixin) so that
    ``get_params``/``clone`` work when the selector sits in a Pipeline.
    """

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X):
        # X is expected to be a DataFrame; pick the model's 12 feature
        # columns by position (see FEATURE_INDICES / FEATURE_NAMES).
        return X.values[:, FEATURE_INDICES]


# Preprocessing pipeline matching the training setup (kept for parity
# with the training script; prediction below feeds the model directly).
pipeline = Pipeline([
    ('features_remap', ManualFeatureSelector()),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

saved_model = joblib.load("models/malware_classifier_1.pkl")


def list_transformer(l):
    """Return the 12 model features extracted from one raw dataset row."""
    return [l[i] for i in FEATURE_INDICES]


def predict_one_line(model, line):
    """Predict the class of a single raw dataset row.

    Parameters
    ----------
    model : fitted classifier with a ``predict`` method.
    line : one raw row of the dataset (sequence indexable by column).

    Returns
    -------
    The model's predicted label for that row.
    """
    X_unknown = pd.DataFrame([list_transformer(line)], columns=FEATURE_NAMES)
    ans = model.predict(X_unknown)
    # NOTE(review): label-to-name mapping assumed by the original code;
    # verify 0 == malicious / 1 == legitimate against the training data.
    # ans_type = ['malicious', 'legitimate']
    return ans[0]


res = []
labels = []
nb_malware_to_test = 2000
good_ans = 0
for i in range(nb_malware_to_test):
    # ERT = estimated remaining time, assuming ~1.2 s per prediction.
    print(" =-=-=-= Prediction {} out of {} ({}%) [ ERT ~ {} min ] =-=-=-=".format(
        i, nb_malware_to_test,
        round((i / nb_malware_to_test) * 100, 1),
        round(((nb_malware_to_test - i) * 1.2) / 60, 1)))
    features = file_to_test.values[i, ]
    res.append(predict_one_line(saved_model, features))
    # Column 56 holds the ground-truth label for this row.
    labels.append(features[56])
    if res[i] == features[56]:
        good_ans += 1

print(" ===> Got {} / {} good answers".format(good_ans, nb_malware_to_test))

l1 = file_to_test.values[2, ]
print(l1)