Python crawler project: scraping NetEase News text to make a word cloud

1. Scrape the news text from NetEase News

- Import the third-party libraries

import requests
from lxml import etree

- Input the URL of the news page

url = input('Enter the NetEase News URL you want to scrape:\n')

- Spoof the User-Agent (UA)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0'
}
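
Many sites serve an error page to the default python-requests User-Agent, which is why the request above pretends to be Firefox. A quick way to confirm the request actually went through (a minimal sketch; requests' built-in raise_for_status aborts on 4xx/5xx responses):

response = requests.get(url=url, headers=headers, timeout=10)
# Raise an exception on 4xx/5xx instead of silently parsing an error page
response.raise_for_status()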

- Scrape the news text and persist it to disk

response = requests.get(url=url, headers=headers)
page_text = response.text
tree = etree.HTML(page_text)
p_list = tree.xpath('//*[@id="content"]/div[2]/p')
print('Scraping the news...')
# Open with an explicit encoding so the file matches the UTF-8 read later on
with open('news.txt', 'w', encoding='utf-8') as fp:
    for p in p_list:
        text = p.xpath('./text()')
        if text:
            fp.write(text[0])
            # newline after each paragraph
            fp.write('\n')
print('Scraping finished!')
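
If the saved text comes out garbled, the page's charset may not match what requests assumed. A hedged tweak (apparent_encoding is requests' own guess from the response body) before reading .text:

response = requests.get(url=url, headers=headers)
# Let requests re-detect the charset from the body, in case the page is not UTF-8
response.encoding = response.apparent_encoding
page_text = response.text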

2. Make a word cloud from the scraped text

- Import the required libraries

import jieba
import wordcloud

- Read the text file

fp = open('news.txt', 'r', encoding='utf-8')
text = fp.read()
fp.close()

- Build the word cloud

# Segment the Chinese text with jieba, then join with spaces for wordcloud
ls = jieba.lcut(text)
txt = " ".join(ls)
# msyh.ttf (Microsoft YaHei) must be on hand so Chinese glyphs render correctly
w = wordcloud.WordCloud(font_path='msyh.ttf', width=1000, height=700, background_color='white')
w.generate(txt)
w.to_file('new.png')
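
The raw cloud tends to be dominated by function words such as 的, 了, 是. A minimal filtering sketch (the stopword list below is illustrative only, and collocations=False stops wordcloud from pairing adjacent tokens):

stop_words = {'的', '了', '是', '在', '和', '也', '有'}  # illustrative, extend as needed
words = [w for w in jieba.lcut(text) if w not in stop_words and len(w) > 1]
wc = wordcloud.WordCloud(font_path='msyh.ttf', width=1000, height=700,
                         background_color='white', collocations=False)
wc.generate(' '.join(words))
wc.to_file('new_filtered.png')  # hypothetical output name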

3. Wire the modules together

- Scrape the news text

spider_wangyi_news.py

# spider_wangyi_news.py
import requests
from lxml import etree

def spider():
    url = input('Enter the NetEase News URL you want to scrape:\n')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0'
    }
    response = requests.get(url=url, headers=headers)
    page_text = response.text
    tree = etree.HTML(page_text)
    p_list = tree.xpath('//*[@id="content"]/div[2]/p')
    print('Scraping the news...')
    with open('news.txt', 'w', encoding='utf-8') as fp:
        for p in p_list:
            text = p.xpath('./text()')
            if text:
                fp.write(text[0])
                # newline after each paragraph
                fp.write('\n')
    print('Scraping finished!')

- Make the word cloud

make_wordcloud.py

# make_wordcloud.py
import jieba
import wordcloud

def make():
    with open('news.txt', 'r', encoding='utf-8') as fp:
        text = fp.read()
    # Segment the text, join with spaces, and render the word cloud
    ls = jieba.lcut(text)
    txt = " ".join(ls)
    w = wordcloud.WordCloud(font_path='msyh.ttf', width=1000, height=700, background_color='white')
    w.generate(txt)
    w.to_file('new.png')
    print('Word cloud created!')

- Entry point

start.py

Put spider_wangyi_news.py, make_wordcloud.py, start.py, and msyh.ttf in the same folder.
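
Expected layout (the folder name is arbitrary; news.txt and new.png are created at run time):

project/
├── start.py
├── spider_wangyi_news.py
├── make_wordcloud.py
└── msyh.ttf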

# start.py
import make_wordcloud as a
import spider_wangyi_news as b

# Scrape the news text
b.spider()
# Build the word cloud
a.make()
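
One extra safeguard worth considering (not in the original post): if either module ever gains top-level code, importing it from start.py would execute that code immediately. Guarding direct-run code keeps imports side-effect free:

# e.g. at the bottom of spider_wangyi_news.py
if __name__ == '__main__':
    spider()  # runs only when executed directly, not when imported by start.py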
