爬虫作业

爬取百度

爬取慕课网所有关于python的课程名及描述信息, 并通过词云进行分析展示;
- 网址: https://www.imooc.com/search/course?words=python

import re
import requests
from bs4 import BeautifulSoup
import jieba
import numpy
from PIL import Image
import wordcloud

def get_pagehtml(url):
    """Download *url* and return the response body as decoded text.

    :param url: page URL to fetch
    :return: HTML text of the response
    :raises requests.HTTPError: on a 4xx/5xx response
    """
    # requests has no default timeout; without one a dead server
    # hangs the crawler forever.
    resp = requests.get(url, timeout=10)
    # Fail loudly instead of silently parsing an error page.
    resp.raise_for_status()
    return resp.text

'''
					Python操作三大主流数据库
				'''

def getname(text):
    """Extract course names and descriptions from a search-result page.

    :param text: HTML source of an imooc course-search result page
    :return: list of dicts; each dict has 'name' (course title string)
             and 'info' (list of remaining Chinese text fragments)
    """
    soup = BeautifulSoup(text, 'html5lib')
    nametag_li = soup.find_all('div', class_="course-item-detail")
    info_li = []
    for tag in nametag_li:
        # Pull every run of CJK characters out of the tag's markup;
        # the first run is the course title, the rest is description.
        words = re.findall(r'[\u4E00-\u9FA5]+', str(tag))
        if not words:
            # Guard: a tag with no Chinese text would otherwise raise
            # IndexError on words[0].
            continue
        info = {}
        info['name'] = words[0]
        info['info'] = words[1:]
        info_li.append(info)
    return info_li


def creat_wordcloud(text):
    """Cut *text* with jieba and render a word cloud to ./python.png.

    :param text: raw Chinese text to segment and visualize
    """
    # The cloud is shaped by the mask image img1.jpg.
    imgobj = Image.open('img1.jpg')
    cloud_mask = numpy.array(imgobj)
    result = jieba.lcut(text)
    wc = wordcloud.WordCloud(
        width=500,
        mask=cloud_mask,
        max_font_size=100,
        min_font_size=10,
        background_color='snow',
        # A CJK-capable font is required, or Chinese renders as boxes.
        font_path='./font/msyh.ttf',
    )
    wc.generate(','.join(result))
    wc.to_file('./python.png')


def main():
    """Fetch the first two result pages, collect text, build the cloud."""
    li = []
    for i in range(2):
        url = 'https://www.imooc.com/search/course?words=python&page=%d' % (i + 1)
        info = getname(get_pagehtml(url))
        for j in info:
            li.append(j['name'])
            li.append(''.join(j['info']))
            print(j)
    creat_wordcloud(''.join(li))


if __name__ == '__main__':
    main()

结果:
爬虫作业_第1张图片

获取今日百度热点

from bs4 import BeautifulSoup
from urllib.request import urlopen


def getPagehtml(url):
    """Download *url* and return its body decoded as Chinese text.

    :param url: page URL to fetch
    :return: decoded HTML text
    """
    # 获取网页源代码
    raw = urlopen(url).read()
    # The page declares gb2312, but GBK is a strict superset of gb2312,
    # so decoding as gbk loses fewer characters than gb2312 + 'ignore'.
    return raw.decode('gbk', 'ignore')


def deal_text(text):
    """Parse the fetched Baidu hot-search page.

    NOTE(review): the body is truncated in this source — only the soup
    is built; the extraction/printing code was cut off by the blog.
    """
    soup = BeautifulSoup(text,'html5lib')
    # 

结果:
爬虫作业_第2张图片

你可能感兴趣的:(爬虫作业)