20170523 Assignment 001 -- Learning Python Web Scraping

Scrape posts from Qiushibaike (糗事百科), including: author, gender, age, joke content, laugh count, and comment count.
Tools used:
1. Win10
2. PyCharm
3. Python 3.5
The code is as follows:

import requests
from bs4 import BeautifulSoup
import time

url='http://www.qiushibaike.com/text/'

#fetch a page; send a browser-like User-Agent, since some sites reject the default one used by requests
def get_html(url):
    headers={'User-Agent':'Mozilla/5.0'}
    html=requests.get(url,headers=headers)
    #check the response status before using the body
    if html.status_code in (200,304):
        return html.text
    else:
        return None

#parse the page and extract post information
def parse_html(html):
    soup=BeautifulSoup(html,'html.parser')
    Information=list()
    listdata=soup.find_all(name='div',class_='article block untagged mb15')
    for i in listdata:
        #author name
        author_name=i.find(name='h2').text.strip()
        # infer gender from the div's class attribute; anonymous users carry no gender div
        # get('class') returns two classes: 'articleGender' plus 'manIcon' or 'womanIcon'
        # get('class')[-1] picks the 'manIcon'/'womanIcon' string
        # [:-4] then drops the trailing 'Icon', leaving 'man' or 'woman'
        gender_div=i.find(name='div',class_='articleGender')
        author_sex=gender_div.get('class')[-1][:-4] if gender_div is not None else 'unknown'
        # anonymous users have no age either
        author_age=gender_div.text if gender_div is not None else '0'
        joke_content=i.find(name='div',class_='content').text.strip()
        # in the stats text, the first token is the laugh count and the second-to-last is the comment count
        laugher_count=i.find(name='div',class_='stats').text.split()[0]
        comment_count=i.find(name='div',class_='stats').text.split()[-2]
        Information.append({'author_name':author_name,
                            'author_sex':author_sex,
                            'author_age':author_age,
                            'joke_content':joke_content,
                            'laugher_count':laugher_count,
                            'comment_count':comment_count})
    return Information

def Next_page(html):
    soup=BeautifulSoup(html,'html.parser')
    #check whether a link to the next page exists; the 'next' span sits inside the <a> tag that carries the href
    next_span=soup.find(name='span',class_='next')
    if next_span is not None:
        return next_span.parent.get('href')
    else:
        return None

def crawl_XiuBai(url):
    baseurl='http://www.qiushibaike.com'
    html=get_html(url)
    #bail out if the request failed
    if html is None:
        return []
    Information=parse_html(html)
    next_url=Next_page(html)
    #recursively crawl the next page, pausing between requests to stay polite
    if next_url is not None:
        next_url=baseurl+next_url
        time.sleep(5)
        Information.extend(crawl_XiuBai(next_url))
    return Information
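
#a loop-based sketch of the same crawl (an alternative I added, not part of the original logic):
#it collects the same records but avoids deep recursion when the pagination runs long
def crawl_XiuBai_loop(url):
    baseurl='http://www.qiushibaike.com'
    Information=[]
    while url is not None:
        html=get_html(url)
        if html is None:
            break
        Information.extend(parse_html(html))
        next_url=Next_page(html)
        #build the absolute URL of the next page, or stop if there is none
        url=baseurl+next_url if next_url is not None else None
        time.sleep(5)
    return Information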

if __name__=='__main__':
    Information=crawl_XiuBai(url)
    for i in Information:
        print(i)
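
The script above only prints each record. To persist the results, here is a minimal sketch using csv.DictWriter (the file name and utf-8 encoding are assumptions on my part, not from the original):

import csv

def save_csv(Information,path='qiushibaike.csv'):
    #column order matches the dict keys built in parse_html
    fields=['author_name','author_sex','author_age',
            'joke_content','laugher_count','comment_count']
    #newline='' prevents blank rows on Windows
    with open(path,'w',newline='',encoding='utf-8') as f:
        writer=csv.DictWriter(f,fieldnames=fields)
        writer.writeheader()
        writer.writerows(Information)

Calling save_csv(Information) at the end of the __main__ block writes everything the crawl returned to one file.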

Running the script prints results like the following:

[Screenshot: a sample of the scraped data]
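
A quick way to check the parsing logic without hitting the site is to feed parse_html a small hand-written snippet. The markup below is a simplified guess based on the selectors used in the code, not Qiushibaike's real HTML:

sample_html='''
<div class="article block untagged mb15">
  <h2>SomeUser</h2>
  <div class="articleGender womanIcon">28</div>
  <div class="content">A short joke goes here.</div>
  <div class="stats">1024 laughs 36 comments</div>
</div>
'''

#should print one dict with author_sex == 'woman' and author_age == '28'
for record in parse_html(sample_html):
    print(record)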
