这应该是我能想到的最简单的方法了,也是我自己能想明白的方法。
这种方法不仅仅可以用于新闻分类,还可以扩展为标签分类。这里仅用新闻做实验。
鬼知道我参考了多少博客, 一个能直接跑起来的都没有 ,非常抱歉博客参考太多找不到了
由于爬虫爬取的新闻还需要自己分类 =。=,于是偷懒使用 Sogou(搜狗)2008 年的数据(http://www.sogou.com/labs/resource/cs.php)
将不同类型的文章 通过 jieba 提取关键词, 将关键词进行存储
(ps: 下载下来的数据, 字符集编码是 gb18030的, 手动转了一次 utf-8)
import os
import jieba
from jieba import analyse
from bs4 import BeautifulSoup
def jieba_content(contnet):
    """Return the keywords that jieba extracts from ``contnet`` as a list."""
    return list(analyse.extract_tags(contnet))
def file_read(file_dir):
    """Extract keywords from every news file found under ``file_dir``.

    Each file is parsed as a sequence of ``<doc>`` elements. For every
    document with non-empty ``<content>``, the ``<url>`` is split on ``/``
    and each piece that is a key of the module-level ``dicurl`` mapping
    selects a category; the document's keywords are then appended to that
    category's file through ``data_write_csv``.

    Args:
        file_dir: Root directory that is walked recursively.
    """
    for root, _dirs, files in os.walk(file_dir):
        for name in files:
            # Use the directory os.walk is actually visiting rather than a
            # hard-coded "/a/" sub-folder, so files in any sub-directory
            # are opened from where they really live.
            file_path2 = os.path.join(root, name)
            print(file_path2)
            # Decode explicitly as UTF-8 instead of relying on the platform
            # default encoding.
            with open(file_path2, encoding="utf-8") as f2:
                content = f2.read()
            # The markup is already decoded text, so no from_encoding hint
            # is passed (BeautifulSoup ignores it for str input).
            soup = BeautifulSoup(content, "html.parser")
            for doc in soup.find_all('doc'):
                content_tag = doc.find('content')
                url_tag = doc.find('url')
                if content_tag is None or url_tag is None:
                    # Malformed record: nothing to classify or no label.
                    continue
                text = content_tag.text.strip()
                if text == '':
                    continue
                for item in url_tag.text.split('/'):
                    if item in dicurl:
                        category = str(dicurl[item].strip())
                        keys = jieba_content(text)
                        data = {'type': category, 'key_list': keys}
                        data_write_csv(category, data)
def data_write_csv(filename, datas):
    """Append one article's keywords to ``test/<filename>.txt``.

    The category name is recorded in the module-level ``file_list`` the
    first time it is seen. Keywords are stored comma-separated, and a comma
    is also written between successive articles so that the last keyword of
    one article is not fused with the first keyword of the next.

    Args:
        filename: Category name, used as the output file's base name.
        datas: Mapping whose ``'key_list'`` entry is the list of keywords.
    """
    if filename not in file_list:
        file_list.append(filename)
    keywords = datas['key_list']
    # Make sure the output directory exists before appending to it.
    os.makedirs('test', exist_ok=True)
    target = 'test/' + filename + '.txt'
    with open(target, 'a+', encoding='utf-8') as f:
        if not keywords:
            # Nothing to add; avoid writing a dangling separator.
            return
        f.seek(0, os.SEEK_END)
        if f.tell() > 0:
            # The file already holds keywords from earlier articles.
            f.write(',')
        f.write(','.join(keywords))
def write_file_list(file_list):
    """Write the category names to ``filename.txt``, one per line.

    Args:
        file_list: Iterable of category names.
    """
    with open('filename.txt', 'w', encoding='utf-8') as f:
        # file.write() only accepts str, so the names are joined first;
        # passing the list itself raises TypeError.
        f.write('\n'.join(str(name) for name in file_list))
if __name__ == "__main__":
    # Root directory of the corpus to walk.
    path = "SogouCS"
    # Category names seen so far; filled in by data_write_csv.
    file_list = []
    # Mapping from URL host to category name.
    dicurl = {
        'auto.sohu.com': 'qiche',
        'it.sohu.com': 'hulianwang',
        'health.sohu.com': 'jiankang',
        'sports.sohu.com': 'tiyu',
        'travel.sohu.com': 'lvyou',
        'learning.sohu.com': 'jiaoyu',
        'career.sohu.com': 'zhaopin',
        'cul.sohu.com': 'wenhua',
        'mil.news.sohu.com': 'junshi',
        'house.sohu.com': 'fangchan',
        'yule.sohu.com': 'yule',
        'women.sohu.com': 'shishang',
        'media.sohu.com': 'chuanmei',
        'gongyi.sohu.com': 'gongyi',
        '2008.sohu.com': 'aoyun',
        'business.sohu.com': 'shangye',
        'news.sohu.com': 'other',
    }
    # Configure jieba with the custom dictionary and stop-word list before
    # any keyword extraction happens.
    jieba.load_userdict("user.txt")
    analyse.set_stop_words("stopword.txt")
    file_read(path)
    write_file_list(file_list)
通过上一步处理过的数据 进行 语料库训练
import os
import jieba
from jieba import analyse
from collections import defaultdict
from gensim import corpora, models, similarities
def file_read(file_dir):
    """Load the comma-separated keyword files found under ``file_dir``.

    For every file, its name is appended to the module-level ``key_list``
    and its keywords are stored in the module-level ``dic`` under that name.

    Args:
        file_dir: Directory that is walked recursively.
    """
    for root, _dirs, files in os.walk(file_dir):
        for name in files:
            key_list.append(name)
            # Join with the directory os.walk is visiting so files in
            # sub-directories are opened from their real location.
            file_path2 = os.path.join(root, name)
            # The keyword files are written as UTF-8, so read them back as
            # UTF-8 rather than with the platform default encoding.
            with open(file_path2, encoding='utf-8') as f2:
                content = f2.read()
            # Drop empty tokens (empty file, doubled or trailing commas) so
            # '' never becomes a dictionary term.
            dic[name] = [token for token in content.split(',') if token]
def jieba_content(contnet):
    """Return the keywords that jieba extracts from ``contnet`` as a list."""
    return list(analyse.extract_tags(contnet))
if __name__ == '__main__':
    # Filled in by file_read: file name -> keyword list, and the file names
    # in the order they were read.
    dic = {}
    key_list = []
    file_read('test')
    texts = dic.values()
    # Count how often each token occurs across all category files.
    # NOTE(review): the counts are not used below; no low-frequency
    # filtering is actually applied to the texts.
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            # Increment the count; "= +1" assigned 1 on every occurrence.
            frequency[token] += 1
    dictionary = corpora.Dictionary(texts)
    dictionary.save('dictionary.txt')
    texts = [list(text) for text in texts]
    # Build the bag-of-words corpus, one document per category file.
    corpus = [dictionary.doc2bow(text) for text in texts]
    # Persist the corpus in Matrix Market format.
    corpora.MmCorpus.serialize("XinYU.mm", corpus)
import os
from jieba import analyse
from gensim import corpora, models, similarities
def jieba_content(contnet):
    """Return the keywords that jieba extracts from ``contnet`` as a list."""
    return list(analyse.extract_tags(contnet))
def load():
    """Load the saved dictionary, the saved corpus and the category files.

    Returns:
        A ``(dictionary, corpus, file_list)`` tuple, where ``file_list`` is
        the list of file names directly inside the ``test`` directory.
    """
    dictionary = corpora.Dictionary.load('dictionary.txt')
    corpus = corpora.MmCorpus('XinYU.mm')
    # Collect the file names of every directory visited, then keep only the
    # first entry, i.e. the files at the top level of 'test'.
    names_per_dir = []
    for _root, _dirs, names in os.walk('test'):
        names_per_dir.append(names)
    file_list = names_per_dir[0]
    return dictionary, corpus, file_list
def get2(dictionary, corpus, content, file_list):
    """Return the label of the corpus document most similar to ``content``.

    Args:
        dictionary: gensim dictionary used to vectorise the text.
        corpus: Bag-of-words corpus to compare against.
        content: Raw text to classify.
        file_list: File names, indexed like the documents of ``corpus``.

    Returns:
        The value the module-level ``dic`` holds for the best-matching file
        name, or ``None`` if that name is not a key of ``dic``.
    """
    # Turn the text's keywords into a bag-of-words vector.
    keywords = jieba_content(content)
    new_vec = dictionary.doc2bow(keywords)
    # Fit a TF-IDF model on the corpus.
    tfidf = models.TfidfModel(corpus)
    # The number of features is the size of the vocabulary.
    feature_count = len(dictionary.token2id)
    # Index the TF-IDF-weighted corpus for sparse similarity queries.
    index = similarities.SparseMatrixSimilarity(
        tfidf[corpus], num_features=feature_count
    )
    # Similarity of the query against every document of the corpus.
    scores = list(index[tfidf[new_vec]])
    # Position of the first document with the highest similarity.
    best = scores.index(max(scores))
    return dic.get(file_list[best])
if __name__ == '__main__':
    # Display label for each category file. 'zhaopin' (career.sohu.com) is
    # recruitment and 'shishang' (women.sohu.com) is fashion; they were
    # previously labelled as "photo" and "market".
    dic = {
        'qiche.txt': '汽车',
        'hulianwang.txt': '互联网',
        'jiankang.txt': '健康',
        'tiyu.txt': '体育',
        'lvyou.txt': '旅游',
        'jiaoyu.txt': '教育',
        'zhaopin.txt': '招聘',
        'wenhua.txt': '文化',
        'junshi.txt': '军事',
        'fangchan.txt': '房产',
        'yule.txt': '娱乐',
        'shishang.txt': '时尚',
        'chuanmei.txt': '传媒',
        'gongyi.txt': '公益',
        'aoyun.txt': '奥运',
        'shangye.txt': '商业',
        'other.txt': 'other',
    }
    dictionary, corpus, file_list = load()
    # Sample text to classify.
    content = """
凡孕卵在子宫腔以外的任何部位着床者,统称为异位妊娠,习称为宫外孕。根据着床部位不同,有输卵管妊娠、卵巢妊娠、腹腔妊娠、宫颈妊娠及子宫残角妊娠等。 """
    key = get2(dictionary, corpus, content, file_list)
    print('内容: {}\n 预测属于: {} 类新闻'.format(content, key))