中国共产党第十八、十九、二十次全国代表大会
报告实录文本挖掘与分析(简单版)
爬取18大报告实录文本
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Scrape the 18th CPC National Congress report transcript. The article is
# paginated (12 pages); each page's text is appended to one raw-text file.
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get('http://cpc.people.com.cn/n/2012/1118/c64094-19612151.html')  # open page 1

# Page 1: read the content. (Bug fix: the original wrote `data` to the file
# before `data` was ever assigned, which raises NameError at runtime.)
data = driver.find_element(By.XPATH, '//*[@id="p_content"]').text
with open('18da_report_raw.txt', 'w', encoding='utf-8') as file:
    file.write(data)

# On page 1 the pager contains a single link ("next page"); click it.
wait.until(EC.presence_of_element_located(
    (By.XPATH, '//*[@id="p_content"]/center/table/tbody/tr/td/a')
)).click()

# Pages 2-11: read the content, then click the second pager link ("next").
# find_element_by_xpath was removed in Selenium 4; use find_element(By.XPATH, …)
# consistently (the last-page read below already did).
for _ in range(10):
    data = driver.find_element(By.XPATH, '//*[@id="p_content"]').text
    with open('18da_report_raw.txt', 'a', encoding='utf-8') as file:
        file.write(data)
    time.sleep(2)  # be polite to the server and let the next page settle
    wait.until(EC.presence_of_element_located(
        (By.XPATH, '//*[@id="p_content"]/center/table/tbody/tr/td[2]/a')
    )).click()

# Last page: read the content; there is no further "next" link.
data = driver.find_element(By.XPATH, '//*[@id="p_content"]').text
with open('18da_report_raw.txt', 'a', encoding='utf-8') as file:
    file.write(data)
driver.quit()
爬取19大报告文本
# Scrape the 19th CPC National Congress report (single page) and save it.
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('http://finance.sina.com.cn/china/gncj/2017-10-18/doc-ifymvuyt4098830.shtml')
# find_element_by_xpath was removed in Selenium 4; use the By-based API.
data = driver.find_element(By.XPATH, '//*[@id="articleContent"]/div[1]').text
with open('19da_report_raw.txt', 'w', encoding='utf-8') as file:
    file.write(data)
driver.quit()  # release the browser once the text is saved (original leaked it)
爬取20大报告文本
# Scrape the 20th CPC National Congress report (single page) and save it.
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://news.ifeng.com/c/8K9l4qcZtaw')
# find_element_by_xpath was removed in Selenium 4; use the By-based API.
data = driver.find_element(By.XPATH, '//*[@id="root"]/div/div[2]/div[2]/div/div[1]/div/div').text
with open('20da_report_raw.txt', 'w', encoding='utf-8') as file:
    file.write(data)
driver.quit()  # release the browser once the text is saved (original leaked it)
收集数据后进行数据处理
高频词分析以及绘制词云图(替换输入的 txt 文件名即可分析不同报告)
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# High-frequency-word analysis and word cloud. To analyse a different report,
# point the first open() at another .txt file (18.txt / 19.txt / 20.txt).
with open("18.txt", encoding="utf-8") as f:
    txt = f.read()
# strip() removes leading/trailing whitespace (spaces/newlines) from each stopword line.
with open("baidu_stopwords.txt", encoding="utf-8") as f:
    stopwords = [line.strip() for line in f]
stop_set = set(stopwords)  # O(1) membership test instead of O(n) list scan per word

words = jieba.lcut(txt, cut_all=False)  # precise segmentation mode

# Count every segmented word that is not a stopword and is longer than one
# character. (Removed the leftover debug print that dumped the running count
# on every single increment.)
counts = {}
for word in words:
    if word not in stop_set and len(word) > 1:
        counts[word] = counts.get(word, 0) + 1

# Print the 30 most frequent words with their counts.
items = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
for word, count in items[:30]:
    print("{:<10}{:>7}".format(word, count))

text = ' '.join(words)
wc = WordCloud(
    background_color='white',  # canvas background colour
    font_path='msyh.ttc',      # a Chinese font is required, otherwise glyphs render as boxes
    scale=2,                   # scale factor: 2 doubles the rendered canvas dimensions
    max_words=100,             # maximum number of words drawn
    max_font_size=80,          # largest font size used
    stopwords=stopwords,       # wordcloud filters these out itself
)
wc.generate(text)
# Display the word cloud, then save it to disk.
plt.imshow(wc)
plt.axis('off')
plt.show()
wc.to_file('词频统计.jpg')
关键词分析(简易版)
import jieba
import jieba.analyse as analyse

# Keyword extraction (simple version): TF-IDF keywords from the 20th
# congress report. Default open() encoding is platform-dependent, so declare
# utf-8 explicitly (every other open() in this project already does).
with open('20.txt', encoding='utf-8') as f:
    contents = f.read()
# Drop full-width spaces (U+3000), newlines, carriage returns and spaces.
# (The original also called contents.rstrip() and discarded the result — a
# no-op; all whitespace is already removed by the replaces above.)
contents = contents.replace(u'\u3000', u'').replace('\n', '').replace('\r', '').replace(" ", "")

# Remove generic high-frequency words from jieba's dictionary so they do
# not dominate the keyword ranking.
for generic_word in ("我们", "实现", "伟大", "坚持", "加强", "全面", "中国",
                     "完善", "加快", "健全", "社会", "推进", "推动", "提高"):
    jieba.del_word(generic_word)

words = jieba.lcut(contents, cut_all=False)  # precise segmentation mode
text = ' '.join(words)
# Top 10 keywords with TF-IDF weights; allowPOS=() applies no part-of-speech filter.
keywords = analyse.extract_tags(text, topK=10, withWeight=True, allowPOS=())
for keyword, weight in keywords:
    print(keyword, weight)