First, extract the permission features from each AndroidManifest.xml.
import glob
import xml.etree.ElementTree as ET

all_attr = []            # every permission name seen across the training set
x1_attr = [None] * 1000  # per-sample permission lists for the benign apps
f = glob.iglob(r'C:/project/ML/train_ben/*/AndroidManifest.xml')
i = 0
for xml in f:
    tree = ET.parse(xml)
    root = tree.getroot()
    # extract the permission features of each sample
    # t collects the permissions of the current sample
    t = []
    for d in root.iter('uses-permission'):
        pms = d.attrib
        for key, value in pms.items():
            # keep only the last component, e.g. 'android.permission.INTERNET' -> 'INTERNET'
            value = value.split('.')[-1]
            all_attr.append(value)
            t.append(value)
    x1_attr[i] = t
    i = i + 1
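The malicious-sample features x2_attr, which are stacked with the benign ones later, are gathered the same way. A minimal sketch, assuming the malware manifests live under a train_mal directory and that there are likewise 1000 samples (both are assumptions):

# Sketch (assumption): extract the malicious-sample features the same way.
# The train_mal path and the count of 1000 samples are assumptions.
x2_attr = [None] * 1000
j = 0
for xml in glob.iglob(r'C:/project/ML/train_mal/*/AndroidManifest.xml'):
    root = ET.parse(xml).getroot()
    t = []
    for d in root.iter('uses-permission'):
        for key, value in d.attrib.items():
            value = value.split('.')[-1]
            all_attr.append(value)  # let the binarizer vocabulary cover both classes
            t.append(value)
    x2_attr[j] = t
    j = j + 1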
Next, turn the permission lists into numeric feature vectors.
from sklearn import preprocessing
import numpy as np

# Binarize the permission features: one column per distinct permission name
lb = preprocessing.LabelBinarizer()
lb.fit(all_attr)
n_classes = len(lb.classes_)  # 563 distinct permissions in the original run
for i in range(1000):
    try:
        t = lb.transform(x1_attr[i])
        # sum the one-hot rows into a single bag-of-permissions vector
        x1_attr[i] = np.sum(t, axis=0)
    except Exception:
        # samples with no permissions fall back to a zero vector
        x1_attr[i] = np.zeros(n_classes)
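The malicious-sample lists in x2_attr need the same binarization before the two sets can be stacked; a minimal sketch under the same assumptions as above:

# Sketch (assumption): binarize the malicious-sample permission lists the same way
for i in range(1000):
    try:
        x2_attr[i] = np.sum(lb.transform(x2_attr[i]), axis=0)
    except Exception:
        # samples with no permissions fall back to a zero vector
        x2_attr[i] = np.zeros(n_classes)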
Then merge the two feature sets into one labelled data set.
### Add the label column
## benign samples get label 0
y1 = np.zeros((1000, 1))
## malicious samples get label 1
y2 = np.ones((1000, 1))
# stack the arrays vertically
x = np.vstack((x1_attr, x2_attr))
y = np.vstack((y1, y2))
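A quick sanity check on the merged shapes (the width equals the number of distinct permission names, 563 in the original run):

print(x.shape, y.shape)  # expected: (2000, n_classes) and (2000, 1) for 1000 benign + 1000 malicious samples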
Next come cross-validation and dimensionality reduction.
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.ensemble import RandomForestClassifier

x_train, x_test, y_train, y_test = train_test_split(x, y.ravel(), test_size=0.3, random_state=0)
# Reduce dimensionality: keep the 25% of features with the highest chi2 score,
# fitting the selector on the training split only
slc = SelectPercentile(chi2, percentile=25)
x_train = slc.fit_transform(x_train, y_train)
x_test = slc.transform(x_test)
# Random-forest model
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
clf.fit(x_train, y_train)
print('clf.score = ', clf.score(x_test, y_test))
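The split above is a single hold-out split rather than true cross-validation; if k-fold scores are wanted, a short sketch with sklearn's cross_val_score (the 5-fold setting is an assumption):

from sklearn.model_selection import cross_val_score

# Sketch (assumption): 5-fold cross-validation on the reduced training features
scores = cross_val_score(RandomForestClassifier(n_estimators=100, n_jobs=-1),
                         x_train, y_train, cv=5)
print('cv mean accuracy =', scores.mean())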
# Re-stack the full feature matrix for the feature-importance experiment
x = np.vstack((x1_attr, x2_attr))
y = np.vstack((y1, y2))
x_train, x_test, y_train, y_test = train_test_split(x, y.ravel(), test_size=0.35, random_state=0)
# Select features by random-forest importance
tmp = []
rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1)
rfc.fit(x_train, y_train)
importances = rfc.feature_importances_
indices = np.argsort(importances)[::-1]
# number of features to keep
for f in range(x.shape[1]):
    if f < 70:
        tmp.append(indices[f])
print(len(tmp), "features are selected")
import matplotlib.pyplot as plt

# Tune the parameter: how many of the most important features to keep (70 here)
n_top = 70
plt.title("Feature Importance")
plt.bar(range(n_top),
        importances[indices[:n_top]],
        color='lightblue',
        align='center')
plt.xticks(range(n_top),
           tmp,
           rotation=90)
plt.xlim([-1, n_top])
plt.tight_layout()
plt.show()
print(rfc.score(x_test, y_test))
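The indices collected in tmp are only plotted above; to actually use the selection, the forest can be retrained on those 70 columns. A minimal sketch (this retraining step is an assumption, not shown in the original):

# Sketch (assumption): retrain using only the 70 most important feature columns
x_train_sel = x_train[:, tmp]
x_test_sel = x_test[:, tmp]
rfc_sel = RandomForestClassifier(n_estimators=100, n_jobs=-1)
rfc_sel.fit(x_train_sel, y_train)
print('score with top-70 features =', rfc_sel.score(x_test_sel, y_test))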
Finally, predict the unlabeled samples and write the classification result to a txt file for submission.
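The snippets above never build x_test_attr, the feature matrix of the samples to be classified; a minimal sketch of how it could be constructed with the fitted binarizer (the test path and directory layout are assumptions):

# Sketch (assumption): build the submission feature matrix from the test manifests
test_files = sorted(glob.glob(r'C:/project/ML/test/*/AndroidManifest.xml'))
x_test_attr = []
for xml in test_files:
    root = ET.parse(xml).getroot()
    t = [v.split('.')[-1] for d in root.iter('uses-permission') for v in d.attrib.values()]
    try:
        x_test_attr.append(np.sum(lb.transform(t), axis=0))
    except Exception:
        x_test_attr.append(np.zeros(n_classes))
x_test_attr = np.vstack(x_test_attr)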
# Predict; the samples must pass through the same feature selector the model was trained with
flag = clf.predict(slc.transform(x_test_attr))
with open('C:/project/result.txt', 'w+') as fp:
    for i in range(len(flag)):
        fp.write(str(i) + '\t' + str(int(flag[i])) + '\n')