Python3爬取豆瓣图书Top250并存入csv文件中

本文抓取豆瓣图书Top250中的相关资讯,可以练习对于网页结构的分析及提取。

下面先导入相关的包,并伪装成浏览器访问:

import requests
from lxml import etree
import re

# Pretend to be a desktop Chrome browser so Douban does not reject the requests.
header={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'}

下面是提取出具体图书页面的内容简介及作者简介:

# Fetch the content introduction and author introduction from a detail page.
def Books(url):
    """Fetch a book's Douban detail page and return its two introductions.

    Parameters
    ----------
    url : str
        Detail-page URL of a single book on book.douban.com.

    Returns
    -------
    tuple[str, str]
        (content introduction, author introduction); paragraphs are joined
        with '\r\n', and a missing section yields an empty string.

    Raises
    ------
    requests.RequestException
        On network failure, timeout, or a non-2xx HTTP response.
    """
    # timeout keeps the crawler from hanging forever on a stalled server;
    # raise_for_status stops us from parsing an error/captcha page as data.
    response = requests.get(url, headers=header, timeout=10)
    response.raise_for_status()
    selector = etree.HTML(response.text)
    # Content introduction lives in the "indent" div with id="link-report".
    content_paras = selector.xpath('//div[@class="indent" and @id="link-report"]//div[@class="intro"]/p/text()')
    content_intro = '\r\n'.join(content_paras)
    # Author introduction is in the div whose class is "indent " (note the
    # trailing space in Douban's markup) and which has no id attribute.
    author_paras = selector.xpath('//div[@class="indent " and not(@id)]//div[@class="intro"]/p/text()')
    author_intro = '\r\n'.join(author_paras)
    return content_intro, author_intro

下面提取Top250里相关图书的资讯:

def getBookList():
    """Scrape the Douban Top-250 book list, yielding one tuple per book.

    Yields
    ------
    tuple
        (book name, detail url, rating, number of ratings, publication
        info, content introduction, author introduction), all strings.

    Raises
    ------
    requests.RequestException
        On network failure, timeout, or a non-2xx HTTP response.
    """
    url_template = u'https://book.douban.com/top250?start={0}'
    for start in range(0, 250, 25):  # 10 pages, 25 books per page
        url = url_template.format(start)
        # timeout + raise_for_status: fail fast instead of hanging or
        # silently parsing an error page.
        response = requests.get(url, headers=header, timeout=10)
        response.raise_for_status()
        selector = etree.HTML(response.text)
        # Book names: strip newlines/spaces, then drop the empty leftovers.
        raw_names = selector.xpath('//td[@valign="top" and not(@width)]/div[@class="pl2"]/a/text()')
        cleaned = [name.replace('\n', '').replace(' ', '').strip() for name in raw_names]
        booknames = [name for name in cleaned if name]
        # Detail-page URLs.
        urls = selector.xpath('//td[@valign="top" and not (@width)]/div[@class="pl2"]/a/@href')
        # Ratings.
        rates = selector.xpath('//td[@valign="top" and not(@width)]/div[@class="star clearfix"]/span[@class="rating_nums"]/text()')
        # Number of ratings: keep only the digit run from text like "(12345人评价)".
        raw_hots = selector.xpath('//td[@valign="top" and not(@width)]/div[@class="star clearfix"]/span[@class="pl"]/text()')
        hots = [re.findall(r"\d+", h.replace('\n', '').replace(' ', '').strip())[0] for h in raw_hots]
        # Author / publisher / price line.
        infos = selector.xpath('//td[@valign="top" and not(@width)]/p[@class="pl"]/text()')
        # Visit every detail page for the long introductions.
        content_intros = []
        author_intros = []
        for detail_url in urls:
            content_intro, author_intro = Books(detail_url)
            content_intros.append(content_intro)
            author_intros.append(author_intro)
        # zip truncates to the shortest sequence, so a partially failed
        # parse silently drops trailing books on that page.
        for book in zip(booknames, urls, rates, hots, infos, content_intros, author_intros):
            yield book

下面是把上面提取出的内容写入csv文件中,实际运用中,可以保存在数据库,使用的时候提取即可。

# Save the scraped rows into a csv file.
import csv
# Output location; raw string so the backslash is not treated as an escape.
path=r'E:\douban.csv'
def SaveCsv():
    """Crawl the Top-250 list and write every book as one csv row at ``path``.

    Uses a ``with`` block so the file is flushed and closed even if the
    crawl raises part-way through (the original leaked the file handle).
    """
    with open(path, 'w', newline='', encoding='utf-8') as douban:
        writer = csv.writer(douban)
        # Header row; the Chinese column names are program output, kept as-is.
        writer.writerow(['书名','链接','评分','评价人数','资讯','内容简介','作者简介'])
        for book in getBookList():
            writer.writerow(book)

下面运行即可爬取需要的内容并写入文件:

# Run the crawl and write the csv.  NOTE(review): consider wrapping this in
# `if __name__ == "__main__":` so importing the module does not start a crawl.
SaveCsv()

你可能感兴趣的:(Python)