我的个人博客:https://www.blog.kamisamak.com/
robots.txt 信息查询
from urllib.robotparser import RobotFileParser
# Location of the robots.txt file that is checked below.
UrlRobots = 'https://book.douban.com/robots.txt'


def GetRobotsTxt(url):
    """Read the robots.txt at *url* and print crawl permissions.

    For each of three fixed book.douban.com pages, prints whether the
    wildcard user agent ``*`` is allowed to fetch it (one boolean per line).
    """
    parser = RobotFileParser(url)
    parser.read()
    probe_pages = (
        'https://book.douban.com/tag/?view=type&icn=index-sorttags-all',
        'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4',
        'https://book.douban.com/',
    )
    for page in probe_pages:
        print(parser.can_fetch('*', page))


GetRobotsTxt(UrlRobots)
code
# 导入相关的库
from lxml import etree
import requests
import xlwt
import os
import fake_useragent
# ua = UserAgent(use_cache_server=False)
# ua = UserAgent(cache=False)
# ua = UserAgent(verify_ssl=False)
# Accumulator for the scraped data: one entry per book.
all_info_list = list()
# Column titles for the spreadsheet's header row.
header = [
    'name',
    'url',
    'author',
    'publisher',
    'date',
    'price',
    'rate',
    'comment',
    'bookPic',
]
# One listing URL per page of 25 entries: start=0, 25, ..., 225.
urls = list(map('https://book.douban.com/top250?start={}'.format, range(0, 250, 25)))
# Build a random request header value.
# Data source: https://fake-useragent.herokuapp.com/browsers/0.1.11
def get_header():
    """Return the ``random`` attribute of a ``fake_useragent.UserAgent``.

    The ``UserAgent`` is constructed from ``data/fake_useragent.json``
    located under the current working directory.
    """
    return fake_useragent.UserAgent(
        path=os.getcwd() + '/data/fake_useragent.json'
    ).random
# Download a book cover image.
def save_image_file(url, path):
    """Fetch *url* and write the response body to the file at *path*.

    Nothing is written unless the response status code is 200.
    """
    response = requests.get(url)
    if response.status_code != 200:
        return
    # The ``with`` block closes the file on exit.
    with open(path, 'wb') as out:
        out.write(response.content)
# Scrape every listing page and collect one row per book in all_info_list.
for url in urls:
    # Fetch the page with requests and parse the HTML with lxml.
    # The original passed an undefined name ``headers`` here (NameError);
    # send the User-Agent produced by get_header() instead.
    html = requests.get(url, headers={'User-Agent': get_header()})
    selector = etree.HTML(html.text)
    # Each book is one <tr class="item"> element.
    infos = selector.xpath('//tr[@class="item"]')
    for info in infos:
        # Book title
        name = info.xpath('td/div/a/@title')[0]
        # Book link (kept in its own name so the outer loop's ``url``
        # is not overwritten)
        book_url = info.xpath('td/div/a/@href')[0]
        # Slash-separated info line; split once and index the pieces.
        fields = info.xpath('td/p/text()')[0].split('/')
        # Author: first field
        author = fields[0]
        # Publisher: third field from the end
        publisher = fields[-3]
        # Date: second field from the end
        date = fields[-2]
        # Price: last field
        price = fields[-1]
        # Rating
        rate = info.xpath('td[2]/div[2]/span[2]/text()')[0]
        # Comment text; not every row has one, so fall back to a placeholder.
        comments = info.xpath('td/p/span/text()')
        comment = comments[0] if comments else "空"
        # Cover image URL
        bookPic = info.xpath('td[1]/a/img/@src')[0]
        row = [name, book_url, author, publisher, date, price, rate, comment, bookPic]
        print(*row)
        all_info_list.append(row)
# Write the collected rows to an .xls workbook.
book = xlwt.Workbook(encoding='utf_8')
# Create the worksheet.
sheet = book.add_sheet('Shee1')
# Row 0 holds the column titles.
for col, title in enumerate(header):
    sheet.write(0, col, title)
# Data rows start at row 1, one row per entry in all_info_list.
# (Avoids the original loop variable ``list``, which shadowed the builtin.)
for row_idx, row in enumerate(all_info_list, start=1):
    for col, value in enumerate(row):
        sheet.write(row_idx, col, value)
# Make sure the output directory exists before saving into it.
os.makedirs('doubanbookTop250', exist_ok=True)
# Save the workbook to the configured path.
book.save('doubanbookTop250/doubanbookTop250.xls')
# Iterate over the collected rows and store each cover image.
# Make sure the output directory exists before writing into it.
os.makedirs('doubanbookTop250', exist_ok=True)
for info in all_info_list:
    # info[0] is the book name and info[8] the cover URL (see ``header``).
    # A '/' in the name would be treated as a directory separator by
    # open(), so replace it to keep the file inside the output directory.
    file_name = info[0].replace('/', '_') + ".jpg"
    save_image_file(info[8], "doubanbookTop250/" + file_name)
本