Python crawlers don't really involve any deep technical magic; the effort goes into parsing what the site returns and working around its anti-crawling mechanisms.
The following example crawls Qidian's free novels. Let's go straight to the code (this example is adapted from reference [1]):
# coding=utf-8
import urllib2
import sys
from bs4 import BeautifulSoup

# Set the default encoding to utf-8
reload(sys)
sys.setdefaultencoding('utf-8')


class YuewenFreeSpider:
    def __init__(self):
        pass

    # Fetch the content of one chapter, then recurse into the next chapter
    def get_chapter_content(self, file, url):
        try:
            book_content_res = urllib2.urlopen(url)
            book_content_soup = BeautifulSoup(book_content_res.read(), "html.parser")
            file.write(book_content_soup.select("h3[class='j_chapterName'] span")[0].string + '\n')
            for p in book_content_soup.select(".j_readContent p"):
                file.write(p.next + '\n')
        except Exception, e:
            # If something went wrong, retry the same chapter
            print(e)
            self.get_chapter_content(file, url)
        else:
            chapter_next = book_content_soup.select("a#j_chapterNext")[0]
            if chapter_next.string != "书末页":  # "书末页" marks the end of the book
                next_url = "https:" + chapter_next["href"]
                self.get_chapter_content(file, next_url)

    # Fetch every book on the current list page
    def get_current_url_books(self, url):
        response = urllib2.urlopen(url)
        the_page = response.read()
        soup = BeautifulSoup(the_page, "html.parser")
        book_arr = soup.select("ul[class='all-img-list cf'] > li")
        global start_index
        if start_index > 0:
            book_arr = book_arr[start_index:]
            start_index = 0
        for book in book_arr:
            book_cover = book.select("div[class='book-mid-info'] h4 > a")[0]
            print "书名:" + book_cover.string
            # Create the .txt file first, then fetch the chapters and write them into it
            book_file = open("/home/username/crawler/books/" + book_cover.string + ".txt", "a+")
            bres = urllib2.urlopen("https:" + book_cover['href'])
            bsoup = BeautifulSoup(bres.read(), "html.parser")
            book_content_href = bsoup.select("a[class='red-btn J-getJumpUrl']")[0]["href"]
            self.get_chapter_content(book_file, "https:" + book_content_href)
            book_file.close()
        next_page = soup.select("a.lbf-pagination-next")[0]
        return next_page["href"]


# Start URL; start_index controls how far into the list to start downloading
url = "//www.qidian.com/free/all?orderId=&vip=hidden&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=1&page=1"

if __name__ == '__main__':
    start_index = 0
    # Keep looping until there is no next page
    while True:
        if url.startswith("//"):
            url = YuewenFreeSpider().get_current_url_books("https:" + url)
        else:
            break
Two libraries do the main work here: urllib2 and BeautifulSoup. urllib2 is an HTTP request library; the official description of BeautifulSoup reads:
Beautiful Soup provides simple, Pythonic functions for navigating, searching, and modifying the parse tree. It is a toolkit that parses a document and hands you the data you need to extract; because it is so simple, a complete application takes very little code.
Beautiful Soup automatically converts the input document to Unicode and the output to UTF-8. You don't need to think about encodings at all, unless the document doesn't declare one; in that case Beautiful Soup cannot detect the encoding automatically and you only have to tell it the original encoding.
Beautiful Soup sits alongside excellent Python parsers such as lxml and html5lib, letting you flexibly choose between different parsing strategies and raw speed.
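To make that concrete, here is a minimal sketch of the pattern the spider above relies on: parse an HTML string and pull data out with CSS selectors via select(). The HTML fragment below is made up for illustration, loosely shaped like Qidian's book-list markup:

# coding=utf-8
from bs4 import BeautifulSoup

# Made-up HTML fragment, for illustration only
html = """
<ul class="all-img-list cf">
  <li><div class="book-mid-info"><h4><a href="//book.qidian.com/info/123">Book A</a></h4></div></li>
  <li><div class="book-mid-info"><h4><a href="//book.qidian.com/info/456">Book B</a></h4></div></li>
</ul>
"""

soup = BeautifulSoup(html, "html.parser")
# The same kind of CSS selector used in get_current_url_books above
for a in soup.select("div[class='book-mid-info'] h4 > a"):
    print(a.string + " -> https:" + a["href"])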
The example in the previous section covered crawling free novels. What if we want to crawl the details of contracted (paid) novels? Here is an example:
# coding=utf-8
import urllib2
import sys
import time
import logging
import requests
from bs4 import BeautifulSoup
from lxml import etree
import json


class YuewenSpider:
    def __init__(self):
        self.session = requests.session()

    @staticmethod
    def get_url(url_key):
        url = {
            'search': 'https://www.qidian.com/search?kw=',
            'category': 'https://book.qidian.com/ajax/book/category?',
            'vipreader': 'https://vipreader.qidian.com/chapter/',
            'subscribe': 'https://vipreader.qidian.com/ajax/subscribe/subscribe?',
            'getSubscribe': 'https://vipreader.qidian.com/ajax/subscribe/getSubscribe?'
        }
        return url[url_key]

    @staticmethod
    def get_header_and_token():
        header = dict()
        header['Accept'] = 'text/html,application/xhtml+xml,application/xml;' \
                           'q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3'
        header['Accept-Encoding'] = 'gzip, deflate, br'
        header['Accept-Language'] = 'zh-CN,zh;q=0.9'
        header['Cache-Control'] = 'max-age=0'
        header['Connection'] = 'keep-alive'
        header['cookie'] = '_csrfToken=AYnkW4AoqnnkKTIjgbgbndUM8qQ2rikJA2gqBWvt; newstatisticUUID=1564129077_1845515318; qdrs=0%7C3%7C0%7C0%7C1; showSectionCommentGuide=1; qdgd=1; se_ref=baidu; se_ref_bid=1015221208; gender=male; e1=%7B%22pid%22%3A%22qd_P_Searchresult%22%2C%22eid%22%3A%22qd_S05%22%2C%22l1%22%3A3%7D; e2=%7B%22pid%22%3A%22qd_P_Searchresult%22%2C%22eid%22%3A%22qd_S05%22%2C%22l1%22%3A3%7D; rcr=1013432302%2C1015221208%2C1015741318%2C1014139104%2C1015129326%2C1015055967%2C1015235392%2C1015336641%2C1015835395%2C1010868264%2C1015444718%2C1010144088; lrbc=1013432302%7C432810477%7C0; pageOps=1'
        header['Host'] = 'www.qidian.com'
        header['Upgrade-Insecure-Requests'] = '1'
        header['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' \
                               'Chrome/75.0.3770.100 Safari/537.36'
        # Pull the _csrfToken out of the cookie string
        token = ''
        cookie = header['cookie']
        if cookie:
            cookie_pair = cookie.split(";")
            for p in cookie_pair:
                item = p.split("=")
                if item[0].strip() == "_csrfToken":
                    token = item[1]
        return header, token

    # Based on the update time, get the book id and the list of chapters that need updating
    def spider_chapter_list(self, book_title, book_author, updated_time):
        try:
            book = self.search_book(book_title, book_author)
            if not book:
                return None
            data = self.get_header_and_token()
            header, token = data[0], data[1]
            spider_book_chapters = dict()
            chapter_list = []
            # Crawl the chapter list and check whether there are newly updated chapters
            url = self.get_url(url_key='category') + '_csrfToken=' + token + '&bookId=' + book['id']
            r = self.session.get(url, headers=header)
            detail = json.loads(r.content)
            if detail['code'] == 1:
                raise Exception("get chapter list info failed")
            else:
                # Walk the volumes, newest first
                for d in detail['data']['vs'][::-1]:
                    # Walk the chapters, newest first
                    for c in d['cs'][::-1]:
                        t = int(time.mktime(time.strptime(c['uT'], '%Y-%m-%d %H:%M:%S')))
                        if t > updated_time:
                            chapter = dict()
                            chapter['chapter_title'] = c['cN']
                            chapter['chapter_url'] = str(book['id']) + "/" + str(c['id'])
                            chapter['chapter_cnt'] = c['cnt']  # chapter word count
                            chapter['chapter_id'] = c['id']
                            chapter['free'] = c['sS']
                            chapter['updated_time'] = t
                            chapter['uuid'] = c['uuid']
                            chapter_list.append(chapter)
                        else:
                            # Chapters are ordered by time, so stop at the first old one
                            break
                    else:
                        continue
                    break
            if chapter_list:
                spider_book_chapters['book_id'] = int(book['id'])
                spider_book_chapters['chapters'] = chapter_list[::-1]
            return spider_book_chapters
        except Exception, e:
            logging.error("YueWenSpider-spider_chapter_list except: %s", e.message)
            return None

    # Search for a book by title and author
    def search_book(self, book_title, book_author):
        url = self.get_url(url_key='search') + book_title
        r = self.session.get(url, headers=self.get_header_and_token()[0])
        selector = etree.HTML(r.content)
        # Get the book ids
        book_ids = selector.xpath('//li[@class="res-book-item"]/@data-bid')
        # Get the book titles and author info
        book_titles = selector.xpath('//li[@class="res-book-item"]//div[@class="book-mid-info"]/h4/a//text()')
        book_authors = selector.xpath('//div[@class="book-mid-info"]/p[@class="author"]/a[1]/text()')
        if not book_ids or not book_titles or not book_authors:
            return None
        for i in range(len(book_ids)):
            book = dict()
            book['id'] = book_ids[i] if book_ids[i] else ''
            book['author'] = book_authors[i] if book_authors[i] else ''
            book['title'] = book_titles[i] if book_titles[i] else ''
            if book['title'] == book_title and book['author'] == book_author:
                return book
        return None


if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format='%(asctime)s %(levelno)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    start_time = int(time.time()) - 3600 * 24
    new_chapter_list = YuewenSpider().spider_chapter_list(u"九星毒奶", u"育", start_time)
    print new_chapter_list
Once we have the list of chapters that need updating, the next step is to subscribe to the actual chapter content. The full code is not pasted here, but the main flow is: get the chapter price, query the account balance, and then initiate the subscription.
Get the price:
# Read the chapter price from the VIP reader page (it sits inside the subscribe button)
url = self.get_url('vipreader') + spider_chapter['chapter_url']
header['Host'] = 'vipreader.qidian.com'
r = self.session.get(url, headers=header, verify=False)
selector = etree.HTML(r.text)
price = selector.xpath('//a[@class="single j_subscribeBtn"]/span/i/text()')
Query the account balance:
price = int(price[0])
# Prepare the request parameters
data = dict()
data['bookId'] = book_id
data['chapterPrice'] = price
chapters = dict()
chapters['chapterId'] = spider_chapter['chapter_id']
chapters['chapterCnt'] = spider_chapter['chapter_cnt']
chapters['price'] = price
chapters['uuid'] = spider_chapter['uuid']
data['chapters'] = [chapters]
data['isRenew'] = 0
data['chapterCnt'] = 1
data['isBuyAll'] = 0
# Query the balance
url = self.get_url(url_key='getSubscribe') + token
r = self.session.post(url, headers=header, data=json.dumps(data))
Finally, subscribe to the specific chapter:
# Initiate the subscription
url = self.get_url(url_key='subscribe') + token
r = self.session.post(url, headers=header, data=json.dumps(data))
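For reference only, the three fragments above could be stitched into a single method on YuewenSpider. This is just a sketch under the same assumptions as the snippets (header, token and spider_chapter are prepared the way spider_chapter_list prepares them); it is not the author's omitted code:

# Sketch: combine price lookup, balance query and subscription into one call on YuewenSpider
def subscribe_chapter(self, book_id, spider_chapter):
    header, token = self.get_header_and_token()
    header['Host'] = 'vipreader.qidian.com'
    # 1. Read the chapter price from the VIP reader page
    url = self.get_url('vipreader') + spider_chapter['chapter_url']
    r = self.session.get(url, headers=header, verify=False)
    selector = etree.HTML(r.text)
    price = int(selector.xpath('//a[@class="single j_subscribeBtn"]/span/i/text()')[0])
    # 2. Build the request body and query the balance
    chapters = {'chapterId': spider_chapter['chapter_id'],
                'chapterCnt': spider_chapter['chapter_cnt'],
                'price': price,
                'uuid': spider_chapter['uuid']}
    data = {'bookId': book_id, 'chapterPrice': price, 'chapters': [chapters],
            'isRenew': 0, 'chapterCnt': 1, 'isBuyAll': 0}
    self.session.post(self.get_url(url_key='getSubscribe') + token,
                      headers=header, data=json.dumps(data))
    # 3. Initiate the subscription itself
    return self.session.post(self.get_url(url_key='subscribe') + token,
                             headers=header, data=json.dumps(data))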
In this section, lxml is used to parse the HTML.
BeautifulSoup is a library, whereas XPath is a query technique; the most commonly used XPath library in Python is lxml.
Comparing the two, lxml wins on performance, while BeautifulSoup is somewhat easier to use.
On performance, BeautifulSoup and lxml work differently. BeautifulSoup is DOM-based: it loads the whole document and builds the full parse tree, so both time and memory overhead are much larger. lxml only traverses the parts it needs, and it is written in C while BeautifulSoup is written in Python, so lxml is naturally much faster.
On ease of use, BeautifulSoup is simple to work with, has a very friendly API, and supports CSS selectors. XPath expressions for lxml are more tedious to write, so development with it is slower than with BeautifulSoup.
For example, with BeautifulSoup:
title = soup.select('.content div.title h3')
The equivalent XPath is noticeably more verbose:
title = tree.xpath("//*[@class='content']/div[@class='title']/h3")
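To make the comparison concrete, here is a minimal, self-contained sketch (the HTML fragment and class names are invented for illustration) that extracts the same heading with both libraries:

# coding=utf-8
from bs4 import BeautifulSoup
from lxml import etree

# Made-up HTML fragment matching the selectors above
html = """
<div class="content">
  <div class="title"><h3>Chapter One</h3></div>
</div>
"""

# BeautifulSoup: CSS selector
soup = BeautifulSoup(html, "html.parser")
print(soup.select('.content div.title h3')[0].string)  # Chapter One

# lxml: XPath
tree = etree.HTML(html)
print(tree.xpath("//*[@class='content']/div[@class='title']/h3/text()")[0])  # Chapter One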
This post walked through practical crawler examples. In many cases, though, content sites have anti-crawling mechanisms, such as blocking an IP that makes a large number of requests in a short time. That is where proxies come in; they will be covered in the next blog post.
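As a small preview of where a proxy plugs in with urllib2 (the address below is only a placeholder), the standard ProxyHandler is used roughly like this:

import urllib2

# Placeholder proxy address, for illustration only
proxy_handler = urllib2.ProxyHandler({'http': 'http://127.0.0.1:8080',
                                      'https': 'http://127.0.0.1:8080'})
opener = urllib2.build_opener(proxy_handler)
urllib2.install_opener(opener)
# From here on, urllib2.urlopen(...) calls go through the proxy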
[1] https://www.jianshu.com/p/1d658f67fbdf