canap
This commit is contained in:
parent
72478eccb9
commit
a4236e68d0
@ -27,7 +27,7 @@ from sklearn.ensemble import ExtraTreesClassifier
|
||||
from sklearn.feature_selection import SelectFromModel
|
||||
|
||||
print("Loading dataset in memory [ ... ]")
|
||||
files = pd.read_csv('dataset_clean.txt',delimiter=',', low_memory=False)
|
||||
files = pd.read_csv('dataset/dataset_clean.txt',delimiter=',', low_memory=False)
|
||||
print("Loading dataset in memory [ DONE ]")
|
||||
# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=
|
||||
# ==== Split DataSet again but lets stratify with Machine feature ====
|
||||
@ -115,6 +115,6 @@ print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]
|
||||
|
||||
# =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-=
|
||||
print("Saving the model [ ... ]")
|
||||
joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_1.pkl")
|
||||
#joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_1.pkl")
|
||||
print("Saving the model [ DONE ]")
|
||||
|
||||
|
||||
@ -27,34 +27,12 @@ from sklearn.ensemble import ExtraTreesClassifier
|
||||
from sklearn.feature_selection import SelectFromModel
|
||||
|
||||
print("Loading dataset in memory [ ... ]")
|
||||
files = pd.read_csv('dataset_clean.txt',delimiter=',', low_memory=False)
|
||||
file_to_test = pd.read_csv('dataset/dataset_clean.txt',delimiter=',', low_memory=False)
|
||||
print("Loading dataset in memory [ DONE ]")
|
||||
|
||||
# --- remove labels ---
|
||||
#file_to_test = file_to_test.drop("legitimate", axis=1)
|
||||
# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=
|
||||
# ==== Split DataSet again but lets stratify with Machine feature ====
|
||||
# - Add tmp cat in order to be able to stratify data while splitting it
|
||||
files["Machine_cat"] = pd.cut(files["Machine"], bins=[0., 30000., np.inf], labels=[1,2])
|
||||
|
||||
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
|
||||
for train_index, test_index in split.split(files, files["Machine_cat"]):
|
||||
strat_train_set = files.loc[train_index]
|
||||
strat_test_set = files.loc[test_index]
|
||||
|
||||
# - Remove tmp created cat, now the data is splitted, we dont need it anymore
|
||||
for set_ in (strat_train_set, strat_test_set):
|
||||
set_.drop("Machine_cat", axis=1, inplace=True)
|
||||
|
||||
files = strat_train_set.copy()
|
||||
|
||||
# ==== Split Features and Labels ====
|
||||
|
||||
print("Splitting dataset Features and Labels [ ... ]")
|
||||
#files_without_labels = files_without_useless_features.drop("legitimate", axis=1)
|
||||
files_without_labels = files.drop("legitimate", axis=1)
|
||||
files_labels = files["legitimate"].copy()
|
||||
print("Splitting dataset Features and Labels [ DONE ]")
|
||||
|
||||
imputer = SimpleImputer(strategy="median")
|
||||
|
||||
# - Features manual transformer -
|
||||
class ManualFeatureSelector(TransformerMixin):
|
||||
|
||||
@ -75,46 +53,54 @@ pipeline = Pipeline([
|
||||
('std_scaler', StandardScaler()),
|
||||
])
|
||||
|
||||
# - Prepare dataset, pass it through the pipeline -
|
||||
print("Dataset passing through the pipeline [ ... ]")
|
||||
features_columns_name = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMax Entropy', 'ResourcesMinEntropy', 'VersionInformationSize', 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy']
|
||||
files_prepared = pipeline.fit_transform(files_without_labels)
|
||||
files_prepared = pd.DataFrame(files_prepared, columns=features_columns_name)
|
||||
print("Dataset passing through the pipeline [ DONE ]")
|
||||
print("Describe of 'files_prepared': ")
|
||||
print(files_prepared.describe())
|
||||
saved_model = joblib.load("models/malware_classifier_1.pkl")
|
||||
|
||||
# =-=-=-=-=-=-= Select the algo Model =-=-=-=-=-=-=
|
||||
def list_transformer(l):
|
||||
l_t = []
|
||||
|
||||
algos = {
|
||||
"DecisionTree": DecisionTreeClassifier(max_depth=10),
|
||||
"RandomForest": RandomForestClassifier(n_estimators=50, n_jobs=-1),
|
||||
"GradientBoosting": GradientBoostingClassifier(n_estimators=50),
|
||||
"AdaBoost": AdaBoostClassifier(n_estimators=100),
|
||||
"GNB": GaussianNB()
|
||||
}
|
||||
l_t.append(l[3])
|
||||
l_t.append(l[26])
|
||||
l_t.append(l[36])
|
||||
l_t.append(l[20])
|
||||
l_t.append(l[2])
|
||||
l_t.append(l[25])
|
||||
l_t.append(l[50])
|
||||
l_t.append(l[49])
|
||||
l_t.append(l[55])
|
||||
l_t.append(l[16])
|
||||
l_t.append(l[48])
|
||||
l_t.append(l[34])
|
||||
|
||||
X_train = files_prepared
|
||||
y_train = files_labels
|
||||
return l_t
|
||||
|
||||
X_test = pipeline.fit_transform(strat_test_set.drop("legitimate", axis=1))
|
||||
y_test = strat_test_set['legitimate'].copy()
|
||||
def predict_one_line(model,line):
|
||||
|
||||
results = {}
|
||||
print("Testing 5 algo [ ... ]")
|
||||
for algo in algos:
|
||||
cur_algo = algos[algo]
|
||||
cur_algo.fit(X_train, y_train)
|
||||
#score = cur_algo.score(X_test, y_test)
|
||||
score = cur_algo.score(X_test, y_test)
|
||||
print("%s : %f %%" % (algo, score*100))
|
||||
results[algo] = score
|
||||
#X_unknown = pipeline.fit_transform(line)
|
||||
X_unknown = list_transformer(line)
|
||||
X_unknown = pd.DataFrame([X_unknown])
|
||||
X_unknown.columns = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', ' MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy']
|
||||
ans = model.predict(X_unknown)
|
||||
ans_type = ['malicious', 'legitimate']
|
||||
#print(ans[0])
|
||||
#print("This file is: ", ans_type[ans[0]])
|
||||
return ans[0]
|
||||
|
||||
winner = max(results, key=results.get)
|
||||
print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]*100))
|
||||
res = []
|
||||
labels = []
|
||||
nb_malware_to_test = 2000
|
||||
good_ans = 0
|
||||
for i in range(nb_malware_to_test):
|
||||
print(" =-=-=-= Prediction {} out of {} ({}%) [ ERT ~ {} min ] =-=-=-=".format(i, nb_malware_to_test, round((i/nb_malware_to_test)*100,1), round(((nb_malware_to_test-i)*1.2)/60,1)))
|
||||
features = file_to_test.values[i,]
|
||||
res.append(predict_one_line(saved_model, features))
|
||||
labels.append(file_to_test.values[i,][56])
|
||||
if res[i] == file_to_test.values[i,][56]:
|
||||
good_ans +=1
|
||||
|
||||
# =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-=
|
||||
print("Saving the model [ ... ]")
|
||||
joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_1.pkl")
|
||||
print("Saving the model [ DONE ]")
|
||||
print(" ===> Got {} / {} good answers".format(good_ans, nb_malware_to_test))
|
||||
|
||||
l1 = file_to_test.values[2,]
|
||||
#print(file_to_test.values[1,])
|
||||
print(l1)
|
||||
#predict_one_line(saved_model, l1)
|
||||
#predict_one_line(saved_model, file_to_test)
|
||||
|
||||
1
dataset/malware_header.csv
Normal file
1
dataset/malware_header.csv
Normal file
@ -0,0 +1 @@
|
||||
ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize,legitimate
|
||||
|
@ -1 +0,0 @@
|
||||
4,156a0bb069f94d1e7c2508318805f2a4,332,224,8450,10,0,108544,15872,0,105021,4096,114688,268435456,4096,512,6,1,8,0,6,1,143360,1024,165754,3,320,1048576,4096,1048576,4096,0,16,5,3.40483134511,0.160328725899,6.66271801901,24883.2,512,108544,25645.4,85,108180,12,66,0,105,2,3.27055919863,3.03418784123,3.50693055603,1032,972,1092,72,0
|
||||
|
2
dataset/one_malware_2.csv
Normal file
2
dataset/one_malware_2.csv
Normal file
@ -0,0 +1,2 @@
|
||||
ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize
|
||||
4,156a0bb069f94d1e7c2508318805f2a4,332,224,8450,10,0,108544,15872,0,105021,4096,114688,268435456,4096,512,6,1,8,0,6,1,143360,1024,165754,3,320,1048576,4096,1048576,4096,0,16,5,3.40483134511,0.160328725899,6.66271801901,24883.2,512,108544,25645.4,85,108180,12,66,0,105,2,3.27055919863,3.03418784123,3.50693055603,1032,972,1092,72,0
|
||||
|
2
dataset/one_malware_3.csv
Normal file
2
dataset/one_malware_3.csv
Normal file
@ -0,0 +1,2 @@
|
||||
ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize
|
||||
55599,26e634af4a03b524e293fec7f70ef740,332,224,33166,2,25,29696,10752,0,32996,4096,36864,4194304,4096,512,4,0,0,0,4,0,110592,1024,0,2,0,1048576,16384,1048576,4096,0,16,8,3.31141157945,0,6.51167217489,5056,0,29696,10213.625,8,43161,10,85,0,0,4,4.63428968486,1.78149632953,6.90278161507,1118,16,4264,0,0
|
||||
|
2
dataset/one_malware_4.csv
Normal file
2
dataset/one_malware_4.csv
Normal file
@ -0,0 +1,2 @@
|
||||
ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize
|
||||
55571,15aacc9fb50d342921fc27828d818dc0,332,224,271,6,0,98304,131072,0,98902,4096,102400,4194304,4096,4096,4,0,0,0,4,0,233472,4096,0,2,0,1048576,4096,1048576,4096,0,16,4,3.54693957609,1.04392074096,5.65629913697,32768,4096,98304,55010.5,968,115944,18,233,1,0,1,3.5634550578,3.5634550578,3.5634550578,868,868,868,0,0
|
||||
|
2
dataset/one_malware_5.csv
Normal file
2
dataset/one_malware_5.csv
Normal file
@ -0,0 +1,2 @@
|
||||
ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize
|
||||
55699,57f64bb19554df5905e284e1c6d36e90,332,224,271,6,0,23040,119808,1024,12491,4096,28672,4194304,4096,512,4,0,6,0,4,0,225280,1024,0,2,32768,1048576,4096,1048576,4096,0,16,5,4.47217657988,0,6.43310034828,11878.4,0,30720,42597.2,4496,110456,8,155,1,0,18,4.28749013773,2.47653802279,6.71583396325,1622.72222222,132,9640,0,0
|
||||
|
2
dataset/one_malware_6.csv
Normal file
2
dataset/one_malware_6.csv
Normal file
@ -0,0 +1,2 @@
|
||||
ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize
|
||||
199999,c2780cf8eea998e4f3672159926b8757,332,224,783,2,56,29184,14848,110592,14764,4096,36864,4194304,4096,512,4,0,6,0,4,0,221184,1024,0,2,32768,2097152,4096,1048576,4096,0,16,7,3.34528001443,0,6.30323831549,8118.85714286,0,29184,28755.4285714,140,110088,8,155,0,0,8,4.0912689525,2.45849222582,5.86996923095,2088.75,48,9640,0,0
|
||||
|
2
dataset/one_malware_7.csv
Normal file
2
dataset/one_malware_7.csv
Normal file
@ -0,0 +1,2 @@
|
||||
ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize
|
||||
34181,e4647c31a5fcfde4a964e820d4ecae30,332,224,271,6,0,25600,98816,0,14588,4096,32768,4194304,4096,512,4,0,0,0,4,0,135168,1024,0,2,0,1048576,4096,1048576,4096,0,16,3,6.07048588959,5.03156401487,6.77656630478,19285.3333333,2560,29696,41197.3333333,2284,96056,1,49,0,0,0,0,0,0,0,0,0,0,0
|
||||
|
Loading…
Reference in New Issue
Block a user