1. Data cleaning
(1) Do the data cleaning first; the cleaned result is what we call "clean data".
The concrete process: decide, in light of the business scenario, which features are worth extracting;
if you are not familiar with the business scenario yourself, consult someone around you with more experience.
Example: compare how these two sentences differ after word segmentation:
① 我司/代开/发票 ("our company / issues / invoices on request") → 1
② 月底/了/,/请/将/本月/发票/统一/装订/ ("it's the end of the month, please bind this month's invoices together") → 0
(2) During cleaning you can also extract every feature you suspect might influence the result,
analyse each feature's influence on the result one by one, and then drop the features that turn out to be irrelevant.
(3) First, build a forward dictionary from the label file (mail path → label value).
Next, extract the content of each previously defined feature from the unstructured emails.
Finally, turn all the extracted features into rows and save the 216*300 rows into one large table.
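A minimal sketch of the label-dictionary step; the logic mirrors the 制作标签字典 function in dataProcess.py below, the index line format "spam ../data/000/000" comes from that script, and the helper name build_label_dict is only illustrative:

def build_label_dict(index_path):
    # each index line looks like: "spam ../data/000/000"
    type_dict = {"spam": "1", "ham": "0"}
    label_dict = {}
    with open(index_path) as f:
        for line in f:
            parts = line.split(" ")
            if len(parts) == 2:
                kind, path = parts
                label_dict[path.replace("../data", "").strip()] = type_dict[kind.lower()]
    return label_dict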
2. Feature engineering
(1) Match the domain names in the from & to fields and look for a relationship between them and the label.
If the relationship is close and obvious, convert them into numeric features;
if there is little relationship, drop them.
(2) Extract the values in the date field with a regular expression and split the matched, formatted data into time buckets:
8~13=0; 13~19=1; 19~24=2; 24~8=3. Then look for a relationship between the time bucket and the label;
if the relationship is clear and obvious, keep the time bucket as a numeric feature, otherwise drop it (see the sketch after the format list below).
Date values of length 16: ['2005-9-2 上午11:04', '2005-9-2 上午10:55', '2005-9-2 上午10:55',
'2005-9-2 上午10:55', '2005-9-2 上午10:55', '2005-9-2 上午10:55',
'2005-9-2 上午10:55']
Date values of length 19: ['Sep 23 2005 1:04 AM']
Date values of length 21: ['August 24 2005 5:00pm', 'August 24 2005 5:00pm', 'August 24 2005 5:00pm']
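For the longer formats that do contain an hour, the bucketing in (2) amounts to the sketch below; the thresholds mirror the 根据日期长度提取日期特征 function in fetureExtract.py later in this post, while the regex and function name here are only illustrative:

import re

def hour_to_time_quantum(date_str):
    # pull the hour out of strings such as 'Tue 30 Aug 2005 10:08:15 +0800'
    m = re.search(r"(\d{1,2}):\d{2}:\d{2}", str(date_str))
    if not m:
        return "unknown"
    hour = int(m.group(1))
    if hour < 8:         # 24~8
        return "3"
    elif hour < 13:      # 8~13
        return "0"
    elif hour < 19:      # 13~19
        return "1"
    else:                # 19~24
        return "2"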
The analysis shows that the time bucket itself does not help separate spam from ham, but a normal email always carries a date
while a spam email may not, so we construct a new 'has_date' column and keep it as one of the features fed into training and prediction.
Counting email lengths shows that very short and very long emails are very likely to be spam, so we can fit base-e exponential and
logarithmic functions to express the amount of information carried by the email length. Although this value can exceed 1, it behaves
like a probability: the larger the value, the more likely the email is spam; the smaller the value, the less likely.
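A sketch of one such hand-fitted curve; it uses the same piecewise expression as the 长度信息量计算 function in fetureExtract.py below (larger values mean "more likely spam"), and the function name here is only illustrative:

import numpy as np

def length_to_sema(x):
    # anchored at a length of 500: mails much shorter or much longer than that score higher
    base = 0.5 / np.exp(np.log10(x) - np.log10(500)) + np.log(abs(x - 500) + 1)
    if x > 10000:
        return base - np.log(abs(x - 10000)) + 1
    return base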
Finally, segment the email content into words.
3. Modeling
(1) Apply a tf-idf transformation to the segmented email content (a bag-of-words model also works).
(2) Reduce the dimensionality of the tf-idf matrix with PCA or SVD.
(3) Feed the reduced data into the model for training and testing.
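As a compact sketch, these three steps can be chained with an sklearn Pipeline; bayes.py below does the same thing step by step and, unlike this text-only pipeline, also appends the has_date and content_length_sema columns after the SVD (train_texts / train_labels are placeholder names):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import BernoulliNB

text_clf = Pipeline([
    ("tfidf", TfidfVectorizer(norm="l2", use_idf=True)),  # step (1): tf-idf
    ("svd", TruncatedSVD(n_components=20)),               # step (2): reduce to 20 dimensions
    ("nb", BernoulliNB(alpha=1.0, binarize=0.0005)),      # step (3): Naive Bayes
])
# text_clf.fit(train_texts, train_labels); text_clf.predict(test_texts)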
Data processing script dataProcess.py
#encoding:utf-8
import os
import sys  # used for debugging
import time  # used for timing
print("kaishi")
def 制作标签字典(file_path):
type_dict = {"spam": "1", "ham": "0"}
index_file = open(file_path)#./date/full/index
index_dict = {}#要创建的新字典
try:
for line in index_file:
arr = line.split(" ")
#[spam,../data/000/000]
if len(arr) == 2:
key, value = arr
#(spam),(../data/000/000)
# add the entry to the dictionary
value = value.replace("../data", "").replace("\n", "")
#(spam),(/000/000)
index_dict[value] = type_dict[key.lower()]
#{/000/000: 1}
#{/000/001: 0}
finally:
index_file.close()
return index_dict
# for file_path in path_list:
# read the content data of one email file
def 字典化邮件文本内容(file_path):#dll
# e.g. './data/data/000/000'
file = open(file_path, "r", encoding="gb2312", errors='ignore')
content_dict = {}
try:
is_content = False#初始化为False后
for line in file:
# strip the whitespace at both ends, moving step by step towards formatted data
line = line.strip()
if line.startswith("From:"):
#From: "yan"<(8月27-28,上海)培训课程>,
content_dict['from'] = line[5:]
# "yan"<(8月27-28,上海)培训课程>#
#{'from':"yan"<(8月27-28,上海)培训课程>}
elif line.startswith("To:"):
content_dict['to'] = line[3:]
#[email protected]
elif line.startswith("Date:"):
content_dict['date'] = line[5:]
# Tue, 30 Aug 2005 10:08:15 +0800
elif not line:
is_content = True
# the first blank line marks the end of the headers; is_content turns True and everything after it is collected as the mail body
if is_content:
if 'content' in content_dict:
content_dict['content'] += line
#content_dict['content'] = content_dict['content']+line
else:
content_dict['content'] = line
finally:
file.close()
# close the file and release the handle
return content_dict
# email data processing: turn one parsed mail dictionary into a single comma-separated line
def 字典转文本(file_path):
# first read the content into a dictionary
content_dict = 字典化邮件文本内容(file_path)#先把非格式化邮件转化为字典
# then turn the dictionary into one comma-separated string
result_str = content_dict.get('from', 'unknown').replace(',', '').strip() + ","
result_str += content_dict.get('to', 'unknown').replace(',', '').strip() + ","
result_str += content_dict.get('date', 'unknown').replace(',', '').strip() + ","
result_str += content_dict.get('content', 'unknown').replace(',', ' ').strip()
return result_str
# run the data processing with the functions above
start = time.time()  # start timing
index_dict = 制作标签字典('./data/full/index')
#{/000/000: 1}
#{/000/001: 0}
# index_dict = 制作标签字典('C:\\Users/Administrator/Desktop/index')#('./data/full/index')
# print(index_dict)
# sys.exit(0)
list0 = os.listdir('./data/data')  # names of the mail sub-folders
#{000,001,002,215}
for l1 in list0:  # write the files of the N folders out through N writers
    # loop over the folders
l1_path = './data/data/' + l1#000
#l1_path = './data/data/000', i.e. the path of one folder
print('开始处理文件夹' + l1_path)#开始处理第000文件夹
list1 = os.listdir(l1_path)#获取000文件夹内的所有文件名列表
#[000,001]
#list1 is the list of files inside this folder
write_file_path = './data/process01_' + l1
#./data/process01_000
#./data/process01_001
#there are 216 of these output files in total
#each one holds the ~300 processed mails of one folder
with open(write_file_path, "w", encoding= 'utf-8') as writer:
for l2 in list1:
# loop over the files in this folder
l2_path = l1_path + "/" + l2#得到要处理文件的具体路径
index_key = "/" + l1 + "/" + l2
if index_key in index_dict:
#{/000/000: 1}
#{/000/001: 0}
content_str = 字典转文本(l2_path)
content_str += "," + index_dict[index_key] + "\n"
writer.writelines(content_str)
with open('./data/result_process01',"w", encoding ='utf-8') as writer:
for l1 in list0:
file_path= './data/process01_' + l1
print("开始合并文件:" + file_path)
with open(file_path, encoding = 'utf-8') as file:
for line in file:
writer.writelines(line)
#the two nested loops handle about 216*300 = 60,000+ mails in total
end = time.time()
print('数据处理总共耗时%.2f'%(end- start))
Feature engineering script fetureExtract.py
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
# import matplotlib.pyplot as plt
import re
import time
import jieba
import sys
# mpl.rcParams['font.sans-serif'] = [u'simHei']  # use the SimHei font so Chinese labels display correctly
# mpl.rcParams['axes.unicode_minus'] = False  # keep the minus sign from being rendered as a box
# plt.title(u'我是中文')
# get_ipython().magic(u'matplotlib tk')
# columns: from, to, date, content, label
df = pd.read_csv('./data/result_process01', sep = ',', header = None, names= ['from','to', 'date', 'content','label'])
# print(df.head(10))
# print(df.tail(10))
# print(df.info())
# sys.exit("第20行")
# analyse how the sender/receiver mail server relates to the label
def 获取邮件收发地址(strl):  # extract the sending/receiving mail server from an address
it = re.findall(r"@([A-Za-z0-9]*\.[A-Za-z0-9\.]+)", str(strl))#正则匹配
#[^d]
result = ''
# result =[]
# result = {}
if len(it)>0:
result = it[0]
else:
result = 'unknown'
return result
'''result0 = 获取邮件收发地址(df['from'])
print(df['from'])
print(df['from'].shape)
print(result0)
sys.exit(0)'''
df['from_address'] = pd.Series(map(lambda str : 获取邮件收发地址(str), df['from']))#map映射并添加
'''print(df.from_address.value_counts())
sys.exit(0)'''
df['to_address'] = pd.Series(map(lambda str: 获取邮件收发地址(str), df['to']))
print("="*10 + 'to address' + "="*20)#也可以这样写
print(df.to_address.value_counts().head(5))#
print("总邮件接受服务器类别数量为:" + str(df.to_address.unique().shape))#计算服务器的个数
print("="*10 + 'from address' + "= "*20)
print(df.from_address.value_counts().head(10))
print(df[['from_address', 'label']].groupby(['from_address', 'label'])['label'].count())
print(df[['to_address', 'label']].groupby(['to_address', 'label'])['label'].count())
print("邮件发送服务器类别数量为:" + str(df.from_address.unique().shape))
from_address_df = df.from_address.value_counts().to_frame()#转为结构化的输出,带出列名
len_less_10_from_address_count = from_address_df[from_address_df.from_address<=10].shape
print("发送邮件数量小于10封的服务器数量为:" + str(len_less_10_from_address_count))
# from_address_df[from_address_df.from_address<=10].to_csv('./data/fromToResult.csv')
# df.from_address.value_counts().to_csv('./data/fromToResultNoneFrame.csv')
# conclusion: the from and to features are not useful on their own and will be dropped at the end
#===================================================================================================
# np.unique(list(map(lambda t: len(str(t).strip()), df['date'])))#转换为list再去做
print(np.unique(list(map(lambda t: len(str(t).strip()), df['date']))))
# np.unique(list(filter(lambda t: len(str(t).strip())==30, df['date'])))
print((list(filter(lambda t: len(str(t).strip())==3, df['date']))))
#nan
print((list(filter(lambda t: len(str(t).strip())==7, df['date']))))
#unknown
print((list(filter(lambda t: len(str(t).strip())==16, df['date']))))
#2005-9-2 上午11:04
print((list(filter(lambda t: len(str(t).strip())==19, df['date']))))
#Sep 23 2005 1:04 AM
print((list(filter(lambda t: len(str(t).strip())==21, df['date']))))
#August 24 2005 5:00pm
print((list(filter(lambda t: len(str(t).strip())==23, df['date']))))
#Thu 1 Sep 2005 09:42:01
print((list(filter(lambda t: len(str(t).strip())==24, df['date']))))
#Mon 15 Aug 2005 07:04:08
print((list(filter(lambda t: len(str(t).strip())==26, df['date']))))
#Sat 1 Oct 2005 00:12:07 UT
print((list(filter(lambda t: len(str(t).strip())==27, df['date']))))
#Mon 1 Jan 2001 21:40:47 GMT
print((list(filter(lambda t: len(str(t).strip())==28, df['date']))))
#Sun 14 Aug 2005 11:59:22 GMT
print((list(filter(lambda t: len(str(t).strip())==61, df['date']))))
#[ 3 7 16 19 21 23 24 26 27 28 29 30 31 32 33 34 35 36 45 46 57 58 61 62]
# the printouts show that lengths 3, 7, 16, 19 and 21 carry no weekday; those formats are incomplete and need special handling
def 根据日期长度提取日期特征(str1):#Tue 30 Aug 2005 10:08:15 +0800
'''
8~13=0;13~19=1;19~24=2;24~8=3;
'''
if not isinstance(str1, str):#如果不是字符串
str1 = str(str1)
str_len = len(str1)
week = ""
hour = ""
time_quantum = ""
if str_len < 10:
week = "unknown"
hour = "unknown"
time_quantum = "unknown"
pass
elif str_len == 16:#2005-9-2 上午11:04
rex = r"(\d{2}):\d{2}"#只取冒号前面的
it = re.findall(rex, str1)
if len(it) == 1:
hour = it[0]
else:
hour = "unknown"
week = "Fri"
time_quantum = "0"
pass
#['2005-9-2 上午11:04', '2005-9-2 上午10:55', '2005-9-2 上午10:55', '2005-9-2 上午10:55', '2005-9-2 上午10:55', '2005-9-2 上午10:55', '2005-9-2 上午10:55']
elif str_len == 19: #['Sep 23 2005 1:04 AM']
week = "Sep"
hour = "01"
time_quantum = "3"
pass
elif str_len == 21: #['August 24 2005 5:00pm']
week ="Wed"
hour = "17"
time_quantum = "1"
pass
else: #'Fri 2 Sep 2005 08:17:50' Wed 31 Aug 2005 15:06:36
rex = r"([A-Za-z]+\d?[A-Za-z]*) .*?(\d{2}):\d{2}:\d{2}.*"# 加问号保险些# 'Fri 23 Sep 2005 09:39:39 +0800 X-Priority: 3 X-Mailer: FoxMail'
it = re.findall(rex, str1)
# print(it)
# print(len(it))
# print(len(it[0]))
# sys.exit('129')
if len(it) == 1 and len(it[0]) ==2:
week = it[0][0][-3:]#it是list
hour = it[0][1]
int_hour = int(hour)
#24~8=3;8~13=0;13~19=1;19~24=2;
if int_hour <8:
time_quantum = "3"
elif int_hour <13:
time_quantum = "0"
elif int_hour <19:
time_quantum = "1"
else:
time_quantum = "2"
pass
else:
week = "unknown"
hour = "unknown"
time_quantum = 'unknown'
week = week.lower()
hour = hour.lower()
time_quantum = time_quantum.lower()
return(week, hour, time_quantum)
# data conversion: derive week / hour / time-quantum columns from the date
date_time_extract_result = list(map(lambda st: 根据日期长度提取日期特征(st), df['date']))
df['date_week'] = pd.Series(map(lambda t: t[0], date_time_extract_result))#匿名函数传出的是最后结构里面的值,是子集
df['date_hour'] = pd.Series(map(lambda t: t[1], date_time_extract_result))
df['date_time_quantum'] = pd.Series(map(lambda t: t[2], date_time_extract_result))
print("======星期属性字段的描述==========")
print(df.date_week.value_counts().head(3))
print(df[['date_week', 'label']].groupby(['date_week', 'label'])['label'].count())#先取data_week 和 label,然后按照label去排
print("======小时属性字段的描述==========")
print(df.date_hour.value_counts().head(3))
print(df[['date_hour', 'label']].groupby(['date_hour', 'label'])['label'].count())
print("======时间段属性字段的描述==========")
print(df.date_hour.value_counts().head(3))
print(df[['date_time_quantum', 'label']].groupby(['date_time_quantum', 'label'])['label'].count())
df['has_date'] = df.apply(lambda c: 0 if c['date_week'] == 'unknown' else 1, axis=1)  # axis=1 applies the lambda row by row
# conclusion: the date value itself says little about the label, but spam mails usually carry no date at all
# start word segmentation ==============================================
print('='*30 + '现在开始分词 ,请 耐心等待 5分钟 。。。' + '='*20)
df['content'] = df['content'].astype('str')#类型转换
df['jieba_cut_content'] = list(map(lambda st: " ".join(jieba.cut(st)), df['content']))
df.head(4)
# feature engineering, part 4: length extraction
def 邮件长度统计(lg):
# bucket the mail length into increasingly coarse ranges
if lg <= 10:
return 0
elif lg <= 100:
return 1
elif lg <= 500:
return 2
elif lg <= 1000:
return 3
elif lg <= 1500:
return 4
elif lg <= 2000:
return 5
elif lg <= 2500:
return 6
elif lg <= 3000:
return 7
elif lg <= 4000:
return 8
elif lg <= 5000:
return 9
elif lg <= 10000:
return 10
elif lg <= 20000:
return 11
elif lg <= 30000:
return 12
elif lg <= 50000:
return 13
else:
return 14
print(df['content'])
df['content_length'] = pd.Series(map(lambda st:len(st),df['jieba_cut_content']))#content为切开之后的词向量 df['jieba_cut_content']
df['content_length_type'] = pd.Series(map(lambda st: 邮件长度统计(st), df['content_length']))
# print(df.head(10)) #如果不count就按照自然顺序排
df2 = df.groupby(['content_length_type', 'label'])['label'].agg(['count']).reset_index()#agg 计算并且添加count,类似于eval
print(df2)
df3 = df2[df2.label == 1][['content_length_type', 'count']].rename(columns = {'count' : 'c1'})
df4 = df2[df2.label == 0][['content_length_type', 'count']].rename(columns = {'count' : 'c2'})
print(df3)
print(df4)
df5 = pd.merge(df3, df4)#注意pandas中merge与concat的区别
df5['c1_rage'] = df5.apply(lambda r: r['c1'] / (r['c1'] + r['c2']), axis = 1)  # share of spam (label 1)
df5['c2_rage'] = df5.apply(lambda r: r['c2'] / (r['c1'] + r['c2']), axis = 1)  # share of ham (label 0)
print(df5)
# plot the two ratios against length bucket to prepare for adding the signal feature
plt.plot(df5['content_length_type'], df5['c1_rage'], label = u'垃圾邮件比例')#长度与概率的图像
plt.plot(df5['content_length_type'], df5['c2_rage'], label = u'正常邮件比例')
plt.grid(True)
plt.legend(loc = 0)#加入图例
plt.show()
# add the signal feature: a hand-fitted curve that mimics the observed spam ratio
def 长度信息量计算(x):
'''返回值是是否是垃圾邮件的信息量,值越大,则是垃圾邮件的概率越大,反之。。。'''
if x > 10000:
return 0.5 / np.exp(np.log10(x) - np.log10(500)) + np.log(abs(x - 500) + 1) - np.log(abs(x - 10000)) + 1
else:
return 0.5 / np.exp(np.log10(x) - np.log10(500)) + np.log(abs(x - 500) + 1)
a = np.arange(1, 20000)
plt.plot(a, list(map(lambda t: 长度信息量计算(t) ,a)), label = u'信息量')
# plt.plot(df['content_length'], list(map(lambda t: 长度信息量计算(t) ,df['content_length'])), label = u'信息量')
plt.grid(True)
plt.legend(loc = 0)
plt.show()
df['content_length_sema'] = list(map(lambda st: 长度信息量计算(st), df['content_length']))
# print(df.dtypes) #可以查看每一列的数据类型,也可以查看每一列的名称
df.drop(['from', 'to', 'date', 'from_address', 'to_address', \
'date_week','date_hour', 'date_time_quantum', 'content', \
'content_length', 'content_length_type'], axis = 1, inplace=True)
# print(df.info())
# print(df.head(10))
df.to_csv('./data/result_process02', encoding='utf-8', index = False)
df.to_csv('./data/result_process02.csv', encoding='utf-8', index = False)
Naive Bayes classification script bayes.py
#encoding:utf-8
import pandas as pd
import numpy as np
# import matplotlib as mpl
# import matplotlib.pyplot as plt
import sys
import time
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
# mpl.rcParams['font.sans-serif'] = [u'simHei']
# mpl.rcParams['axes.unicode_minus'] = False
df = pd.read_csv('./data/result_process02.csv', sep =',')
# print(df.head(5))
df.dropna(axis = 0, how ='any', inplace = True)  # drop rows that contain any missing value
# print(df.head(5))
# print(df.info())
x_train, x_test, y_train, y_test = train_test_split(df[['has_date','jieba_cut_content',\
'content_length_sema']],df['label'],\
test_size = 0.2, random_state = 0)
# print("训练集实例的个数是%d" % x_train.shape[0])
# print("测试集实例的个数是%d" % x_test.shape[0])
# print(x_train.head(10))
# print(x_test.head(10))
#================================================================================================
print('='*30 + '对分词后的邮件内容做tf-idf转化' + '='*30)
jieba_cut_content = list(x_train['jieba_cut_content'].astype('str'))
transformer = TfidfVectorizer(norm = 'l2', use_idf = True)#加载tf-idf模型
transformer_model = transformer.fit(jieba_cut_content)
df1 = transformer_model.transform(jieba_cut_content)# fit_transform(jieba_cut_content)
# df1 = transformer.fit_transform(jieba_cut_content)
print('='*30 + '对tf-idf后的数值矩阵进行svd降维' + '='*30)
svd = TruncatedSVD(n_components=20)#降成20维
svd_model = svd.fit(df1)
df2 = svd_model.transform(df1)
data = pd.DataFrame(df2)
print('='*30 + '合并处理后的矩阵' + '='*30)
data['has_date'] = list(x_train['has_date'])
data['content_length_sema'] = list(x_train['content_length_sema'])
print('='*30 + '朴素贝叶斯模型的加载及训练' + '='*30)
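# binarize=0.0005: feature values above this threshold are treated as 1, the rest as 0, before the Bernoulli model is fit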
nb = BernoulliNB(alpha = 1.0, binarize = 0.0005)
model = nb.fit(data, y_train)#训练模型
print('='*30 + '合并测试集数据矩阵' + '='*30)
jieba_cut_content_test = list(x_test['jieba_cut_content'].astype('str'))
data_test = pd.DataFrame(svd_model.transform(transformer_model.transform(jieba_cut_content_test)))
data_test['has_date'] = list(x_test['has_date'])
data_test['content_length_sema'] = list(x_test['content_length_sema'])
print('='*30 + '测试数据' + '='*30)
start = time.time()
y_predict = model.predict(data_test)
end = time.time()
print('测试模型共消耗时间为:%0.2f'%(end-start))
print('='*30 + '评估模型效果' + '='*30)
precision = precision_score(y_test, y_predict)
recall = recall_score(y_test, y_predict)
f1mean = f1_score(y_test, y_predict)
print('='*30 + '打印预测结果如下' + '='*30)
print('模型精确率为%0.5f' % precision)
print('模型召回率为%0.5f' % recall)
print('F1_mean为%0.5f' % f1mean)
========================= Tests of other models below ============================
decision_tree.py
import pandas as pd
import numpy as np
import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
df = pd.read_csv('./data/result_process02', sep =',')
# print(df.head(5))
df.dropna(axis = 0, how ='any', inplace = True)
print(df.head(5))
print(df.info())
x_train, x_test, y_train, y_test = train_test_split(df[['has_date','jieba_cut_content',\
'content_length_sema']],df['label'],\
test_size = 0.2, random_state = 0)
# print("训练集大小%d" % x_train.shape[0])
# print("测试集大小%d" % x_test.shape[0])
# print(x_train.head(1000))
# print(x_test.head(10))
#================================================================================================
print('='*30 + '开始训练集的特征工程' + '='*30)
transformer = TfidfVectorizer(norm = 'l2', use_idf = True)
svd = TruncatedSVD(n_components=20)
jieba_cut_content = list(x_train['jieba_cut_content'].astype('str'))
transformer_model = transformer.fit(jieba_cut_content)
df1 = transformer_model.transform(jieba_cut_content)
svd_model = svd.fit(df1)
df2 = svd_model.transform(df1)
data = pd.DataFrame(df2)
# print(data.head(10))
# print(data.info())
data['has_date'] = list(x_train['has_date'])
data['content_length_sema'] = list(x_train['content_length_sema'])
# print(data.head(10))
# print(data.info())
tree = DecisionTreeClassifier(criterion='gini', max_depth = 5, random_state = 0)#'entropy'
model = tree.fit(data, y_train)
jieba_cut_content_test = list(x_test['jieba_cut_content'].astype('str'))
data_test = pd.DataFrame(svd_model.transform(transformer_model.transform(jieba_cut_content_test)))
data_test['has_date'] = list(x_test['has_date'])
data_test['content_length_sema'] = list(x_test['content_length_sema'])
# print(data_test.head(10))
# print(data_test.info())
# start prediction
y_predict = model.predict(data_test)
precision = precision_score(y_test, y_predict)
recall = recall_score(y_test, y_predict)
f1mean = f1_score(y_test, y_predict)
print('精确率为:%0.5f' % precision)
print('召回率为:%0.5f' % recall)
print('F1均值为:%0.5f' % f1mean)
# list01 = list(zip(data[0:5], tree.feature_importances_))
# list02 = sorted(list01, key = lambda x: x[1], reverse = True)
#
# print(list02)
gradient_boost_decision_tree.py
'''
Created on 2018-01-26
@author: Administrator
'''
#-*- coding:utf-8 -*-
import pandas as pd
import numpy as np
import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
df = pd.read_csv('./data/result_process02.csv', sep =',')
# print(df.head(5))
df.dropna(axis = 0, how ='any', inplace = True)
# print(df.head(5))
# print(df.info())
x_train, x_test, y_train, y_test = train_test_split(df[['has_date','jieba_cut_content',\
'content_length_sema']],df['label'],\
test_size = 0.2, random_state = 0)
# print("训练集大小%d" % x_train.shape[0])
# print("测试集大小%d" % x_test.shape[0])
# print(x_train.head(1000))
# print(x_test.head(10))
#================================================================================================
print('='*30 + '开始训练集的特征工程' + '='*30)
transformer = TfidfVectorizer(norm = 'l2', use_idf = True)
svd = TruncatedSVD(n_components=20)
jieba_cut_content = list(x_train['jieba_cut_content'].astype('str'))
transformer_model = transformer.fit(jieba_cut_content)
df1 = transformer_model.transform(jieba_cut_content)
svd_model = svd.fit(df1)
df2 = svd_model.transform(df1)
data = pd.DataFrame(df2)
# print(data.head(10))
# print(data.info())
data['has_date'] = list(x_train['has_date'])
data['content_length_sema'] = list(x_train['content_length_sema'])
# print(data.head(10))
# print(data.info())
gbdt = GradientBoostingClassifier(learning_rate=0.01, n_estimators =100, max_depth=3,\
min_samples_split = 50, loss = 'deviance', random_state = 0)
# loss='deviance' is the log-likelihood loss; 'exponential' would give the exponential (AdaBoost-style) loss
model = gbdt.fit(data, y_train)
jieba_cut_content_test = list(x_test['jieba_cut_content'].astype('str'))
data_test = pd.DataFrame(svd_model.transform(transformer_model.transform(jieba_cut_content_test)))
data_test['has_date'] = list(x_test['has_date'])
data_test['content_length_sema'] = list(x_test['content_length_sema'])
# print(data_test.head(10))
# print(data_test.info())
# start prediction
y_predict = model.predict(data_test)
precision = precision_score(y_test, y_predict)
recall = recall_score(y_test, y_predict)
f1mean = f1_score(y_test, y_predict)
print('精确率为:%0.5f' % precision)
print('召回率为:%0.5f' % recall)
print('F1均值为:%0.5f' % f1mean)
k_nearest_neighbor.py
'''
Created on 2018-01-26
@author: Administrator
'''
import pandas as pd
import numpy as np
import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
df = pd.read_csv('./data/result_process02', sep =',')
# print(df.head(5))
df.dropna(axis = 0, how ='any', inplace = True)
print(df.head(5))
print(df.info())
x_train, x_test, y_train, y_test = train_test_split(df[['has_date','jieba_cut_content',\
'content_length_sema']],df['label'],\
test_size = 0.2, random_state = 0)
print("训练集大小%d" % x_train.shape[0])
print("测试集大小%d" % x_test.shape[0])
print(x_train.head(1000))
print(x_test.head(10))
#================================================================================================
print('='*30 + '开始训练集的特征工程' + '='*30)
transformer = TfidfVectorizer(norm = 'l2', use_idf = True)
svd = TruncatedSVD(n_components=20)
jieba_cut_content = list(x_train['jieba_cut_content'].astype('str'))
transformer_model = transformer.fit(jieba_cut_content)
df1 = transformer_model.transform(jieba_cut_content)
svd_model = svd.fit(df1)
df2 = svd_model.transform(df1)
data = pd.DataFrame(df2)
print(data.head(10))
print(data.info())
data['has_date'] = list(x_train['has_date'])
data['content_length_sema'] = list(x_train['content_length_sema'])
print(data.head(10))
print(data.info())
knn = KNeighborsClassifier(n_neighbors=5)
model = knn.fit(data, y_train)
jieba_cut_content_test = list(x_test['jieba_cut_content'].astype('str'))
data_test = pd.DataFrame(svd_model.transform(transformer_model.transform(jieba_cut_content_test)))
data_test['has_date'] = list(x_test['has_date'])
data_test['content_length_sema'] = list(x_test['content_length_sema'])
print(data_test.head(10))
print(data_test.info())
# start prediction
y_predict = model.predict(data_test)
precision = precision_score(y_test, y_predict)
recall = recall_score(y_test, y_predict)
f1mean = f1_score(y_test, y_predict)
print('精确率为:%0.5f' % precision)
print('召回率为:%0.5f' % recall)
print('F1均值为:%0.5f' % f1mean)
random_forest.py
'''
Created on 2018-01-26
@author: Administrator
'''
import pandas as pd
import numpy as np
import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
df = pd.read_csv('./data/result_process02.csv', sep =',')
# print(df.head(5))
df.dropna(axis = 0, how ='any', inplace = True)
# print(df.head(5))
# print(df.info())
x_train, x_test, y_train, y_test = train_test_split(df[['has_date','jieba_cut_content',\
'content_length_sema']],df['label'],\
test_size = 0.2, random_state = 0)
print("训练集大小%d" % x_train.shape[0])
print("测试集大小%d" % x_test.shape[0])
# print(x_train.head(1000))
# print(x_test.head(10))
#================================================================================================
print('='*30 + '开始训练集的特征工程' + '='*30)
transformer = TfidfVectorizer(norm = 'l2', use_idf = True)
svd = TruncatedSVD(n_components=20)
jieba_cut_content = list(x_train['jieba_cut_content'].astype('str'))
transformer_model = transformer.fit(jieba_cut_content)
df1 = transformer_model.transform(jieba_cut_content)
svd_model = svd.fit(df1)
df2 = svd_model.transform(df1)
data = pd.DataFrame(df2)
# print(data.head(10))
# print(data.info())
data['has_date'] = list(x_train['has_date'])
data['content_length_sema'] = list(x_train['content_length_sema'])
# print(data.head(10))
# print(data.info())
forest = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=3, random_state=0)
model = forest.fit(data, y_train)
jieba_cut_content_test = list(x_test['jieba_cut_content'].astype('str'))
data_test = pd.DataFrame(svd_model.transform(transformer_model.transform(jieba_cut_content_test)))
data_test['has_date'] = list(x_test['has_date'])
data_test['content_length_sema'] = list(x_test['content_length_sema'])
# print(data_test.head(10))
# print(data_test.info())
# start prediction
y_predict = model.predict(data_test)
precision = precision_score(y_test, y_predict)
recall = recall_score(y_test, y_predict)
f1mean = f1_score(y_test, y_predict)
print('精确率为:%0.5f' % precision)
print('召回率为:%0.5f' % recall)
print('F1均值为:%0.5f' % f1mean)
support_vector_machine.py
'''
Created on 2018-01-26
@author: Administrator
'''
import pandas as pd
import numpy as np
import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
df = pd.read_csv('./data/result_process02', sep =',')
# print(df.head(5))
df.dropna(axis = 0, how ='any', inplace = True)
print(df.head(5))
print(df.info())
x_train, x_test, y_train, y_test = train_test_split(df[['has_date','jieba_cut_content',\
'content_length_sema']],df['label'],\
test_size = 0.2, random_state = 0)
print("训练集大小%d" % x_train.shape[0])
print("测试集大小%d" % x_test.shape[0])
print(x_train.head(1000))
print(x_test.head(10))
#================================================================================================
print('='*30 + '开始训练集的特征工程' + '='*30)
transformer = TfidfVectorizer(norm = 'l2', use_idf = True)
svd = TruncatedSVD(n_components=20)
jieba_cut_content = list(x_train['jieba_cut_content'].astype('str'))
transformer_model = transformer.fit(jieba_cut_content)
df1 = transformer_model.transform(jieba_cut_content)
svd_model = svd.fit(df1)
df2 = svd_model.transform(df1)
data = pd.DataFrame(df2)
print(data.head(10))
print(data.info())
data['has_date'] = list(x_train['has_date'])
data['content_length_sema'] = list(x_train['content_length_sema'])
print(data.head(10))
print(data.info())
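# with kernel='rbf' the degree parameter is ignored (it only applies to a polynomial kernel); gamma=0.001 controls the RBF width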
svm = SVC(C = 1, kernel='rbf', degree = 3, gamma = 0.001)
model = svm.fit(data, y_train)
jieba_cut_content_test = list(x_test['jieba_cut_content'].astype('str'))
data_test = pd.DataFrame(svd_model.transform(transformer_model.transform(jieba_cut_content_test)))
data_test['has_date'] = list(x_test['has_date'])
data_test['content_length_sema'] = list(x_test['content_length_sema'])
print(data_test.head(10))
print(data_test.info())
# start prediction
y_predict = model.predict(data_test)
precision = precision_score(y_test, y_predict)
recall = recall_score(y_test, y_predict)
f1mean = f1_score(y_test, y_predict)
print('精确率为:%0.5f' % precision)
print('召回率为:%0.5f' % recall)
print('F1均值为:%0.5f' % f1mean)