爬取三国演义小说全文进行词频统计并生成词云

from bs4 import BeautifulSoup
import requests
from multiprocessing import Pool
import time
import jieba
from PIL import Image
from wordcloud import WordCloud

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

def get_info(url):
    """Fetch one chapter page and append its paragraph text to the novel file.

    url: chapter URL like http://www.zggdwx.com/sanguo/1.html
    Side effect: appends text to D:/三国演义.txt (gbk-encoded).
    """
    # NOTE(review): verify=False disables TLS certificate checking — acceptable
    # for a throwaway scraper, but do not copy this into production code.
    resp = requests.get(url, headers=headers, verify=False)
    soup = BeautifulSoup(resp.text, 'lxml')
    contents = soup.select('div.content > p')
    # Fix: open the output file once per page, not once per paragraph.
    # NOTE(review): gbk cannot encode every character; if the site ever emits
    # one outside gbk this write raises UnicodeEncodeError — consider utf-8.
    with open('D:/三国演义.txt', 'a+', encoding='gbk') as fp:
        for content in contents:
            fp.write(content.get_text())


def get_words():
    """Read the downloaded novel, count word frequencies with jieba, and
    return the top 100 words joined by single spaces (also printed)."""
    # Fix: use a context manager so the file handle is closed deterministically.
    with open("D:/三国演义.txt", "r", encoding='gbk') as f:
        txt = f.read()
    words = jieba.lcut(txt)
    # Frequent but uninformative words to drop from the statistics.
    stopwords = ['二人', '却说', '不能', '不可', '如此', '左右', '次日',
                 '大喜', '忽然', '此人', '今日', '于是', '一人']
    counts = dict()
    for word in words:
        # Single characters and stopwords are noise for this analysis.
        if len(word) == 1 or word in stopwords:
            continue
        counts[word] = counts.get(word, 0) + 1
    # Sort by frequency, descending.
    items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    # Fix: slice instead of indexing range(100) — no IndexError when fewer
    # than 100 distinct words survive filtering.
    mylist = [word for word, _count in items[:100]]
    text1 = ' '.join(mylist)  # wordcloud expects space-separated tokens
    print(text1)
    return text1


def create(imgFile, s):
    """Render the word string *s* as a word cloud shaped by *imgFile*.

    White pixels of the template image are forced white in the output so the
    cloud follows the template's silhouette. Result saved to D:/result.jpg.
    """
    im = Image.open(imgFile)
    w, h = im.size
    wc = WordCloud(
        r'C:\windows\fonts\simfang.ttf',  # a font with CJK glyphs is required
        width=w,
        height=h,
        background_color='white',
        font_step=3,
        random_state=False,
        prefer_horizontal=0.9,
    )
    t = wc.generate(s).to_image()
    # Mask: wherever the template is pure white, blank the rendered cloud.
    for w1 in range(w):
        for h1 in range(h):
            if im.getpixel((w1, h1))[:3] == (255, 255, 255):
                t.putpixel((w1, h1), (255, 255, 255))
    t.save('D:/result.jpg')


if __name__ == '__main__':
    # 120 chapters of Romance of the Three Kingdoms.
    urls = ['http://www.zggdwx.com/sanguo/{}.html'.format(i)
            for i in range(1, 121)]
    # Single-process download; the author measured multiprocessing at ~10s
    # vs ~13s here — not worth the extra machinery for a one-off script.
    one2 = time.time()
    for url in urls:
        get_info(url)
    two2 = time.time()
    print('爬取花费时间', two2 - one2)
    text1 = get_words()
    create('D:/12345.jpg', text1)

原图
爬取三国演义小说全文进行词频统计并生成词云_第1张图片
结果
爬取三国演义小说全文进行词频统计并生成词云_第2张图片

你可能感兴趣的:(爬虫)