canap
This commit is contained in:
parent
72478eccb9
commit
a4236e68d0
@ -27,7 +27,7 @@ from sklearn.ensemble import ExtraTreesClassifier
|
|||||||
from sklearn.feature_selection import SelectFromModel
|
from sklearn.feature_selection import SelectFromModel
|
||||||
|
|
||||||
print("Loading dataset in memory [ ... ]")
|
print("Loading dataset in memory [ ... ]")
|
||||||
files = pd.read_csv('dataset_clean.txt',delimiter=',', low_memory=False)
|
files = pd.read_csv('dataset/dataset_clean.txt',delimiter=',', low_memory=False)
|
||||||
print("Loading dataset in memory [ DONE ]")
|
print("Loading dataset in memory [ DONE ]")
|
||||||
# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=
|
# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=
|
||||||
# ==== Split DataSet again but lets stratify with Machine feature ====
|
# ==== Split DataSet again but lets stratify with Machine feature ====
|
||||||
@ -115,6 +115,6 @@ print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]
|
|||||||
|
|
||||||
# =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-=
|
# =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-=
|
||||||
print("Saving the model [ ... ]")
|
print("Saving the model [ ... ]")
|
||||||
joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_1.pkl")
|
#joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_1.pkl")
|
||||||
print("Saving the model [ DONE ]")
|
print("Saving the model [ DONE ]")
|
||||||
|
|
||||||
|
|||||||
@ -27,34 +27,12 @@ from sklearn.ensemble import ExtraTreesClassifier
|
|||||||
from sklearn.feature_selection import SelectFromModel
|
from sklearn.feature_selection import SelectFromModel
|
||||||
|
|
||||||
print("Loading dataset in memory [ ... ]")
|
print("Loading dataset in memory [ ... ]")
|
||||||
files = pd.read_csv('dataset_clean.txt',delimiter=',', low_memory=False)
|
file_to_test = pd.read_csv('dataset/dataset_clean.txt',delimiter=',', low_memory=False)
|
||||||
print("Loading dataset in memory [ DONE ]")
|
print("Loading dataset in memory [ DONE ]")
|
||||||
|
|
||||||
|
# --- remove labels ---
|
||||||
|
#file_to_test = file_to_test.drop("legitimate", axis=1)
|
||||||
# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=
|
# =-=-=-=-=-=-=-= Data Prepare Work =-=-=-=-=-=-=
|
||||||
# ==== Split DataSet again but lets stratify with Machine feature ====
|
|
||||||
# - Add tmp cat in order to be able to stratify data while splitting it
|
|
||||||
files["Machine_cat"] = pd.cut(files["Machine"], bins=[0., 30000., np.inf], labels=[1,2])
|
|
||||||
|
|
||||||
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)
|
|
||||||
for train_index, test_index in split.split(files, files["Machine_cat"]):
|
|
||||||
strat_train_set = files.loc[train_index]
|
|
||||||
strat_test_set = files.loc[test_index]
|
|
||||||
|
|
||||||
# - Remove tmp created cat, now the data is splitted, we dont need it anymore
|
|
||||||
for set_ in (strat_train_set, strat_test_set):
|
|
||||||
set_.drop("Machine_cat", axis=1, inplace=True)
|
|
||||||
|
|
||||||
files = strat_train_set.copy()
|
|
||||||
|
|
||||||
# ==== Split Features and Labels ====
|
|
||||||
|
|
||||||
print("Splitting dataset Features and Labels [ ... ]")
|
|
||||||
#files_without_labels = files_without_useless_features.drop("legitimate", axis=1)
|
|
||||||
files_without_labels = files.drop("legitimate", axis=1)
|
|
||||||
files_labels = files["legitimate"].copy()
|
|
||||||
print("Splitting dataset Features and Labels [ DONE ]")
|
|
||||||
|
|
||||||
imputer = SimpleImputer(strategy="median")
|
|
||||||
|
|
||||||
# - Features manual transformer -
|
# - Features manual transformer -
|
||||||
class ManualFeatureSelector(TransformerMixin):
|
class ManualFeatureSelector(TransformerMixin):
|
||||||
|
|
||||||
@ -75,46 +53,54 @@ pipeline = Pipeline([
|
|||||||
('std_scaler', StandardScaler()),
|
('std_scaler', StandardScaler()),
|
||||||
])
|
])
|
||||||
|
|
||||||
# - Prepare dataset, pass it through the pipeline -
|
saved_model = joblib.load("models/malware_classifier_1.pkl")
|
||||||
print("Dataset passing through the pipeline [ ... ]")
|
|
||||||
features_columns_name = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMax Entropy', 'ResourcesMinEntropy', 'VersionInformationSize', 'MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy']
|
|
||||||
files_prepared = pipeline.fit_transform(files_without_labels)
|
|
||||||
files_prepared = pd.DataFrame(files_prepared, columns=features_columns_name)
|
|
||||||
print("Dataset passing through the pipeline [ DONE ]")
|
|
||||||
print("Describe of 'files_prepared': ")
|
|
||||||
print(files_prepared.describe())
|
|
||||||
|
|
||||||
# =-=-=-=-=-=-= Select the algo Model =-=-=-=-=-=-=
|
def list_transformer(l):
|
||||||
|
l_t = []
|
||||||
|
|
||||||
algos = {
|
l_t.append(l[3])
|
||||||
"DecisionTree": DecisionTreeClassifier(max_depth=10),
|
l_t.append(l[26])
|
||||||
"RandomForest": RandomForestClassifier(n_estimators=50, n_jobs=-1),
|
l_t.append(l[36])
|
||||||
"GradientBoosting": GradientBoostingClassifier(n_estimators=50),
|
l_t.append(l[20])
|
||||||
"AdaBoost": AdaBoostClassifier(n_estimators=100),
|
l_t.append(l[2])
|
||||||
"GNB": GaussianNB()
|
l_t.append(l[25])
|
||||||
}
|
l_t.append(l[50])
|
||||||
|
l_t.append(l[49])
|
||||||
|
l_t.append(l[55])
|
||||||
|
l_t.append(l[16])
|
||||||
|
l_t.append(l[48])
|
||||||
|
l_t.append(l[34])
|
||||||
|
|
||||||
X_train = files_prepared
|
return l_t
|
||||||
y_train = files_labels
|
|
||||||
|
|
||||||
X_test = pipeline.fit_transform(strat_test_set.drop("legitimate", axis=1))
|
def predict_one_line(model,line):
|
||||||
y_test = strat_test_set['legitimate'].copy()
|
|
||||||
|
|
||||||
results = {}
|
#X_unknown = pipeline.fit_transform(line)
|
||||||
print("Testing 5 algo [ ... ]")
|
X_unknown = list_transformer(line)
|
||||||
for algo in algos:
|
X_unknown = pd.DataFrame([X_unknown])
|
||||||
cur_algo = algos[algo]
|
X_unknown.columns = ['Characteristics', 'DllCharacteristics', 'SectionsMaxEntropy', 'MajorSubsystemVersion', 'Machine', 'Subsystem', 'ResourcesMaxEntropy', 'ResourcesMinEntropy', 'VersionInformationSize', ' MajorOperatingSystemVersion', 'ResourcesMeanEntropy', 'SectionsMeanEntropy']
|
||||||
cur_algo.fit(X_train, y_train)
|
ans = model.predict(X_unknown)
|
||||||
#score = cur_algo.score(X_test, y_test)
|
ans_type = ['malicious', 'legitimate']
|
||||||
score = cur_algo.score(X_test, y_test)
|
#print(ans[0])
|
||||||
print("%s : %f %%" % (algo, score*100))
|
#print("This file is: ", ans_type[ans[0]])
|
||||||
results[algo] = score
|
return ans[0]
|
||||||
|
|
||||||
winner = max(results, key=results.get)
|
res = []
|
||||||
print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]*100))
|
labels = []
|
||||||
|
nb_malware_to_test = 2000
|
||||||
|
good_ans = 0
|
||||||
|
for i in range(nb_malware_to_test):
|
||||||
|
print(" =-=-=-= Prediction {} out of {} ({}%) [ ERT ~ {} min ] =-=-=-=".format(i, nb_malware_to_test, round((i/nb_malware_to_test)*100,1), round(((nb_malware_to_test-i)*1.2)/60,1)))
|
||||||
|
features = file_to_test.values[i,]
|
||||||
|
res.append(predict_one_line(saved_model, features))
|
||||||
|
labels.append(file_to_test.values[i,][56])
|
||||||
|
if res[i] == file_to_test.values[i,][56]:
|
||||||
|
good_ans +=1
|
||||||
|
|
||||||
# =-=-=-=-=-=-= Save the current Model =-=-=-=-=-=-=
|
print(" ===> Got {} / {} good answers".format(good_ans, nb_malware_to_test))
|
||||||
print("Saving the model [ ... ]")
|
|
||||||
joblib.dump(algos[winner], "/home/ubuntu/bigData/projet_big_data/models/malware_classifier_1.pkl")
|
|
||||||
print("Saving the model [ DONE ]")
|
|
||||||
|
|
||||||
|
l1 = file_to_test.values[2,]
|
||||||
|
#print(file_to_test.values[1,])
|
||||||
|
print(l1)
|
||||||
|
#predict_one_line(saved_model, l1)
|
||||||
|
#predict_one_line(saved_model, file_to_test)
|
||||||
|
|||||||
1
dataset/malware_header.csv
Normal file
1
dataset/malware_header.csv
Normal file
@ -0,0 +1 @@
|
|||||||
|
ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize,legitimate
|
||||||
|
@ -1 +0,0 @@
|
|||||||
4,156a0bb069f94d1e7c2508318805f2a4,332,224,8450,10,0,108544,15872,0,105021,4096,114688,268435456,4096,512,6,1,8,0,6,1,143360,1024,165754,3,320,1048576,4096,1048576,4096,0,16,5,3.40483134511,0.160328725899,6.66271801901,24883.2,512,108544,25645.4,85,108180,12,66,0,105,2,3.27055919863,3.03418784123,3.50693055603,1032,972,1092,72,0
|
|
||||||
|
2
dataset/one_malware_2.csv
Normal file
2
dataset/one_malware_2.csv
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize
|
||||||
|
4,156a0bb069f94d1e7c2508318805f2a4,332,224,8450,10,0,108544,15872,0,105021,4096,114688,268435456,4096,512,6,1,8,0,6,1,143360,1024,165754,3,320,1048576,4096,1048576,4096,0,16,5,3.40483134511,0.160328725899,6.66271801901,24883.2,512,108544,25645.4,85,108180,12,66,0,105,2,3.27055919863,3.03418784123,3.50693055603,1032,972,1092,72,0
|
||||||
|
2
dataset/one_malware_3.csv
Normal file
2
dataset/one_malware_3.csv
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize
|
||||||
|
55599,26e634af4a03b524e293fec7f70ef740,332,224,33166,2,25,29696,10752,0,32996,4096,36864,4194304,4096,512,4,0,0,0,4,0,110592,1024,0,2,0,1048576,16384,1048576,4096,0,16,8,3.31141157945,0,6.51167217489,5056,0,29696,10213.625,8,43161,10,85,0,0,4,4.63428968486,1.78149632953,6.90278161507,1118,16,4264,0,0
|
||||||
|
2
dataset/one_malware_4.csv
Normal file
2
dataset/one_malware_4.csv
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize
|
||||||
|
55571,15aacc9fb50d342921fc27828d818dc0,332,224,271,6,0,98304,131072,0,98902,4096,102400,4194304,4096,4096,4,0,0,0,4,0,233472,4096,0,2,0,1048576,4096,1048576,4096,0,16,4,3.54693957609,1.04392074096,5.65629913697,32768,4096,98304,55010.5,968,115944,18,233,1,0,1,3.5634550578,3.5634550578,3.5634550578,868,868,868,0,0
|
||||||
|
2
dataset/one_malware_5.csv
Normal file
2
dataset/one_malware_5.csv
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize
|
||||||
|
55699,57f64bb19554df5905e284e1c6d36e90,332,224,271,6,0,23040,119808,1024,12491,4096,28672,4194304,4096,512,4,0,6,0,4,0,225280,1024,0,2,32768,1048576,4096,1048576,4096,0,16,5,4.47217657988,0,6.43310034828,11878.4,0,30720,42597.2,4496,110456,8,155,1,0,18,4.28749013773,2.47653802279,6.71583396325,1622.72222222,132,9640,0,0
|
||||||
|
2
dataset/one_malware_6.csv
Normal file
2
dataset/one_malware_6.csv
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize
|
||||||
|
199999,c2780cf8eea998e4f3672159926b8757,332,224,783,2,56,29184,14848,110592,14764,4096,36864,4194304,4096,512,4,0,6,0,4,0,221184,1024,0,2,32768,2097152,4096,1048576,4096,0,16,7,3.34528001443,0,6.30323831549,8118.85714286,0,29184,28755.4285714,140,110088,8,155,0,0,8,4.0912689525,2.45849222582,5.86996923095,2088.75,48,9640,0,0
|
||||||
|
2
dataset/one_malware_7.csv
Normal file
2
dataset/one_malware_7.csv
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize
|
||||||
|
34181,e4647c31a5fcfde4a964e820d4ecae30,332,224,271,6,0,25600,98816,0,14588,4096,32768,4194304,4096,512,4,0,0,0,4,0,135168,1024,0,2,0,1048576,4096,1048576,4096,0,16,3,6.07048588959,5.03156401487,6.77656630478,19285.3333333,2560,29696,41197.3333333,2284,96056,1,49,0,0,0,0,0,0,0,0,0,0,0
|
||||||
|
Loading…
Reference in New Issue
Block a user