爬虫豆瓣

Python 爬虫爬取豆瓣读书信息

程序主要是由 6 个函数组成:

get_html():请求页面,返回页面 html 源码。

get_pageurl(base_url):从 html 源码中提取分页链接部分字段。

bookinfo(url):提取图书信息,以列表形式返回。

get_num(person):判断评价人数,没有评价人数的按 10 人处理。

write2csv():将图书信息保存为 csv 文件。

main():程序执行的主函数。




程序思路:

1.分析豆瓣读书链接,分析分页链接规律。

2.循环提取链接中书本信息。

3.将书本信息保存为 csv 文件。
'''python
# Split the '/'-separated publication line of one book entry
def _split_pub(pub_text):
    """Split a ``div.pub`` text into (author, translator, press, date, price).

    Five fields are read as author / translator / press / date / price; the
    split is capped at four slashes, so anything after the fourth slash stays
    inside the price field. Exactly four fields are read as
    author / press / date / price with an empty translator.

    Returns None when the text has fewer than four fields.
    """
    fields = pub_text.split('/', 4)
    if len(fields) == 5:
        author, translator, press, date, price = fields
    elif len(fields) == 4:
        author, press, date, price = fields
        translator = ""
    else:
        return None
    # Only author and price are trimmed of surrounding newlines/spaces;
    # the date keeps just the part before its first '-'.
    return (author.strip('\n '), translator, press,
            date.split('-')[0], price.strip('\n '))


# Parse the book information out of one listing page
def bookinfo(url):
    """Fetch the page at *url* and return its books as a list of dicts.

    Each dict carries the tag (last path segment of *url*), title, author,
    translator, rating, press, publication date, price, number of raters and
    introduction. Entries whose publication line has fewer than four fields,
    or that raise IndexError/TypeError while being parsed, are skipped.
    """
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'lxml')
    # The tag is the last path segment of the URL, query string dropped.
    tag = url.split("?")[0].split("/")[-1]
    booknames = soup.select('#subject_list ul  div.info h2 a')
    details = soup.select('#subject_list ul div.info div.pub')
    ratings = soup.select('#subject_list div.info div.star.clearfix span.rating_nums')  # rating
    peoples = soup.select('#subject_list  div.star.clearfix span.pl')  # number of raters
    intros = soup.select('#subject_list  ul  div.info p')
    data = []
    # NOTE(review): zip pairs the five lists by position and stops at the
    # shortest one, so an entry missing one of these elements shifts the
    # pairing for the entries after it -- confirm against the page markup.
    for bookname, detail, rating, person, intro in zip(booknames, details, ratings, peoples, intros):
        try:
            booktitle = bookname.get_text().split()[0]
            pub = _split_pub(detail.get_text())
            if pub is None:
                continue
            author, translator, press, date, price = pub
            info = {
                '类型': tag,
                '书籍名称': booktitle,
                '作者': author,
                '译者': translator,
                '豆瓣评分': rating.get_text(),
                '出版社': press,
                '出版日期': date,
                '价格': price,
                '评价人数': get_num(person),
                '简介': intro.get_text(),
            }
        except (IndexError, TypeError):
            continue
        # Books without a translator are appended too; the original fallback
        # branch built the record but never added it to the result.
        data.append(info)
    return data

'''
'''python
# Work out the number of raters; entries without a parsable count count as 10
def get_num(person):
    """Return the rater count parsed from *person*'s text, or 10 if not numeric.

    The first whitespace-separated token of ``person.get_text()`` is taken,
    its first character and its last four characters are cut off, and the
    remainder is converted with ``int``. A ValueError from that conversion
    yields the default of 10.
    """
    try:
        token = person.get_text().split()[0]
        count = int(token[1:len(token) - 4])
    except ValueError:
        count = 10
    return count

'''

'''python

def main():
    """Crawl every URL yielded by get_pageurl() and save the book data.

    For each URL, 50 page URLs are built with ``start`` offsets
    0, 20, ..., 980. Each page is passed to bookinfo() and write2csv(),
    followed by a random pause. The elapsed time is printed at the end.
    """
    base_url = 'https://book.douban.com/tag/?view=cloud'
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # supported clock for measuring elapsed intervals.
    start = time.perf_counter()
    for tag_url in get_pageurl(base_url):
        for offset in range(0, 1000, 20):
            url = tag_url + "?start={}&type=T".format(offset)
            # NOTE(review): the result of bookinfo() is not used here and
            # write2csv() receives the URL -- presumably write2csv() fetches
            # the page itself; confirm and drop one of the two calls.
            data = bookinfo(url)
            write2csv(url)
            # Wait a random 0-9 seconds after each page as an anti-blocking measure.
            time.sleep(random.randint(0, 9))
    end = time.perf_counter()
    print('Time Usage:', end - start)    # crawl finished, report the elapsed time

'''

主要代码部分如上所示:全部代码

你可能感兴趣的:(reptile)