本文抓取豆瓣图书Top250中的相关资讯,可以练习对网页结构的分析与内容提取。
下面先导入相关的包,并伪装成浏览器访问:
import requests
from lxml import etree
import re
# Fake a desktop-Chrome User-Agent so Douban serves the normal HTML page
# instead of blocking the default python-requests client string.
header={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'}
下面是提取出具体图书页面的内容简介及作者简介:
# Fetch a book detail page and pull out the content intro and author intro.
def Books(url):
    """Return ``(content_intro, author_intro)`` scraped from a Douban book detail page.

    Parameters
    ----------
    url : str
        Absolute URL of a book's detail page on book.douban.com.

    Returns
    -------
    tuple[str, str]
        Two CRLF-joined text blocks; either may be an empty string when the
        corresponding section is missing from the page.
    """
    # timeout prevents the whole crawl from hanging forever on one slow page
    # (the original call had no timeout at all).
    response = requests.get(url, headers=header, timeout=10)
    selector = etree.HTML(response.text)
    # Content intro: lives in the indent div that carries id="link-report".
    title1 = selector.xpath('//div[@class="indent" and @id="link-report"]//div[@class="intro"]/p/text()')
    title1 = '\r\n'.join(title1)
    # Author intro: the other indent div (note the trailing space in the
    # class attribute, copied verbatim from the page markup) with no id.
    title2 = selector.xpath('//div[@class="indent " and not(@id)]//div[@class="intro"]/p/text()')
    title2 = '\r\n'.join(title2)
    return title1, title2
下面提取Top250里相关图书的资讯:
def getBookList():
    """Yield one record per book in the Douban Top 250 list.

    Walks the 10 listing pages (25 books each) and, for every book, also
    fetches its detail page via :func:`Books`.

    Yields
    ------
    tuple
        ``(书名, 链接, 评分, 评价人数, 资讯, 内容简介, 作者简介)`` — all strings.
    """
    url0 = u'https://book.douban.com/top250?start={0}'
    for start in range(0, 250, 25):
        url = url0.format(start)
        # timeout keeps one stalled listing page from blocking the crawl.
        response = requests.get(url, headers=header, timeout=10)
        selector = etree.HTML(response.text)
        # Book titles: strip the whitespace/newlines Douban puts inside <a>,
        # then drop entries that collapse to the empty string.
        booknames = selector.xpath('//td[@valign="top" and not(@width)]/div[@class="pl2"]/a/text()')
        booknames = [name.replace('\n', '').replace(' ', '').strip() for name in booknames]
        booknames = [name for name in booknames if name]
        # Detail-page URLs.
        urls = selector.xpath('//td[@valign="top" and not (@width)]/div[@class="pl2"]/a/@href')
        # Ratings.
        rates = selector.xpath('//td[@valign="top" and not(@width)]/div[@class="star clearfix"]/span[@class="rating_nums"]/text()')
        # Vote counts: text looks like "(286957人评价)" — keep just the digits.
        hots = selector.xpath('//td[@valign="top" and not(@width)]/div[@class="star clearfix"]/span[@class="pl"]/text()')
        hots = [text.replace('\n', '').replace(' ', '').strip() for text in hots]
        hots = [re.findall(r"\d+", text)[0] for text in hots]
        # Author / publisher / price info line.
        infos = selector.xpath('//td[@valign="top" and not(@width)]/p[@class="pl"]/text()')
        # Detail pages: content intro + author intro for every book on this page.
        title1s = []
        title2s = []
        for detail_url in urls:
            title1, title2 = Books(detail_url)
            title1s.append(title1)
            title2s.append(title2)
        # Zip the parallel lists into per-book records and stream them out.
        books = list(zip(booknames, urls, rates, hots, infos, title1s, title2s))
        for book in books:
            yield book
下面把上面提取出的内容写入CSV文件;实际应用中也可以保存到数据库,使用时再提取即可。
# Save the results to a CSV file.
import csv
# Hard-coded Windows output path — adjust for your own machine/OS.
path=r'E:\douban.csv'
def SaveCsv():
    """Scrape the full Top 250 list and write it to the CSV file at ``path``.

    Writes a header row followed by one row per book, UTF-8 encoded.
    """
    # `with` guarantees the handle is closed and buffers are flushed even if
    # the crawl raises midway — the original opened the file and never closed it.
    with open(path, 'w', newline='', encoding='utf-8') as douban:
        w = csv.writer(douban)
        w.writerow(['书名','链接','评分','评价人数','资讯','内容简介','作者简介'])
        for book in getBookList():
            w.writerow(book)
下面运行即可爬取需要的内容并写入文件:
# Guard the entry point so importing this module doesn't kick off a crawl.
if __name__ == "__main__":
    SaveCsv()