参考文章:https://www.jianshu.com/p/fbc99cf4d557
我个人比较喜欢看小说,于是想爬取小说网站——起点中文网。在定位页面元素的过程中遇到了字体反爬,请教了身边的前端大神之后,在这里说一下解决方法。
当前页面接口返回的 HTML 源码中,字数部分并不是普通数字,而是自定义字体里的字符编码,例如:
𘟠𘟙𘟠𘟜𘟛𘟝𘟟万字
start_url = 'https://www.qidian.com/finish?action=hidden&orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=2&page=1'
# Fetch the HTML of the current page.
response = requests.get(start_url).text
# Extract the font-file URL for the current page with a regular expression.
# NOTE(review): `fonturl` is not defined anywhere in this fragment; get_index()
# further down obtains it from the text of the page's <style> element.
url = re.search('woff.*?url.*?\'(.+?)\'.*?truetype',fonturl) .group(1)
def get_font(url):
    """Download the font file at *url* and return its character map.

    Args:
        url: Address of the font file to download.

    Returns:
        The mapping produced by ``TTFont.getBestCmap()`` (code point ->
        glyph name).
    """
    response = requests.get(url)
    # Parse the font straight from memory; nothing is written to disk.
    font = TTFont(BytesIO(response.content))
    try:
        return font.getBestCmap()
    finally:
        # Release the font even when reading the cmap fails.
        font.close()
def get_encode(cmap, values):
    """Decode a run of obfuscated character references into digits.

    Args:
        cmap: Mapping from integer code point to glyph name
            (e.g. ``'one'``, ``'period'``).
        values: ``';'``-separated tokens; the first two characters of each
            token are dropped and the remainder is parsed as a decimal
            code point (presumably tokens look like ``'&#100233'``).

    Returns:
        The decoded text, built from the digits ``0``-``9`` and ``'.'``.

    Raises:
        ValueError: If a token's numeric part is not a decimal integer.
        KeyError: If a code point is missing from *cmap*, or its glyph name
            is not one of the known digit names.
    """
    WORD_MAP = {'zero': '0', 'one': '1', 'two': '2', 'three': '3',
                'four': '4', 'five': '5', 'six': '6', 'seven': '7',
                'eight': '8', 'nine': '9', 'period': '.'}
    decoded = []
    for token in values.split(';'):
        token = token.strip()
        if not token:
            # Empty input or a trailing ';' yields an empty token; skip it
            # instead of failing on int('').
            continue
        glyph_name = cmap[int(token[2:])]
        decoded.append(WORD_MAP[glyph_name])
    return ''.join(decoded)
def get_index(start_url):
    """Fetch one listing page and decode the word count of every book on it.

    The page is downloaded, the font referenced in its inline ``<style>`` is
    fetched through ``get_font`` and each book's obfuscated word count is
    decoded with ``get_encode``.

    NOTE(review): the capture pattern below is a reconstruction. The
    published snippet read ``'(.*?)' % classattr``, which has no ``%s``
    placeholder and raises ``TypeError``; the surrounding HTML tags appear
    to have been stripped from the literal. Confirm against the live markup.

    NOTE(review): each ``item`` is built but not stored or returned by this
    version of the function.

    Args:
        start_url: Full URL of the listing page to scrape.
    """
    # Fetch the HTML of the current page.
    response = requests.get(start_url).text
    doc = pq(response)
    # Class attribute of the nested span (the original comment calls this the
    # font file name).
    classattr = doc('p.update > span > span').attr('class')
    pattern = '</style><span.*?%s.*?>(.*?)</span>' % classattr
    # Every obfuscated word-count string on the page, in document order.
    numberlist = re.findall(pattern, response)
    # Text of the inline stylesheet that contains the font-file links.
    fonturl = doc('p.update > span > style').text()
    # Pull the font URL out of the stylesheet text.
    url = re.search('woff.*?url.*?\'(.+?)\'.*?truetype', fonturl).group(1)
    cmap = get_font(url)
    books = doc('.all-img-list li').items()
    for i, book in enumerate(books):
        item = {}
        item['img'] = 'http:' + book('.book-img-box a img').attr('src')
        item['bookname'] = book('.book-mid-info h4 a').text()
        item['author'] = book('.name').text()
        item['classes'] = book('p.author > a:nth-child(4)').text()
        item['content'] = book('.intro').text()
        # Drop the last character (presumably the trailing ';') before decoding.
        item['number'] = get_encode(cmap, numberlist[i][:-1])
# MongoDB handles: server at 127.0.0.1, database "qidian", collection "finish".
client = pymongo.MongoClient('127.0.0.1')
db = client['qidian']
p = db['finish']
def mongo(item):
    """Insert one scraped book record into the module-level collection ``p``.

    Args:
        item: Dictionary describing a single book.
    """
    # Collection.insert() was deprecated in PyMongo 3 and removed in PyMongo 4;
    # insert_one() is the supported call for a single document.
    p.insert_one(item)
#coding=utf-8
'''
Created on 2018年8月23日
@author: Administrator
'''
import requests,json,time,re
from requests.exceptions import RequestException
from pyquery import PyQuery as pq
from fontTools.ttLib import TTFont
from io import BytesIO
import pymongo
# MongoDB handles: server at 127.0.0.1, database "qidian", collection "finish".
client = pymongo.MongoClient('127.0.0.1')
db = client['qidian']
p = db['finish']
# Listing URL without the page number; main() appends it.
start_url = 'https://www.qidian.com/finish?action=hidden&orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=2&page='
def get_font(url):
    """Download the font file at *url* and return its character map.

    Args:
        url: Address of the font file to download.

    Returns:
        The mapping produced by ``TTFont.getBestCmap()`` (code point ->
        glyph name).
    """
    response = requests.get(url)
    # Parse the font straight from memory; nothing is written to disk.
    font = TTFont(BytesIO(response.content))
    try:
        return font.getBestCmap()
    finally:
        # Release the font even when reading the cmap fails.
        font.close()
def get_encode(cmap, values):
    """Decode a run of obfuscated character references into digits.

    Args:
        cmap: Mapping from integer code point to glyph name
            (e.g. ``'one'``, ``'period'``).
        values: ``';'``-separated tokens; the first two characters of each
            token are dropped and the remainder is parsed as a decimal
            code point (presumably tokens look like ``'&#100233'``).

    Returns:
        The decoded text, built from the digits ``0``-``9`` and ``'.'``.

    Raises:
        ValueError: If a token's numeric part is not a decimal integer.
        KeyError: If a code point is missing from *cmap*, or its glyph name
            is not one of the known digit names.
    """
    WORD_MAP = {'zero': '0', 'one': '1', 'two': '2', 'three': '3',
                'four': '4', 'five': '5', 'six': '6', 'seven': '7',
                'eight': '8', 'nine': '9', 'period': '.'}
    decoded = []
    for token in values.split(';'):
        token = token.strip()
        if not token:
            # Empty input or a trailing ';' yields an empty token; skip it
            # instead of failing on int('').
            continue
        glyph_name = cmap[int(token[2:])]
        decoded.append(WORD_MAP[glyph_name])
    return ''.join(decoded)
def get_index(start_url):
    """Scrape one listing page and store every book on it via ``mongo``.

    The page is downloaded, the font referenced in its inline ``<style>`` is
    fetched through ``get_font`` and each book's obfuscated word count is
    decoded with ``get_encode`` before the record is saved.

    NOTE(review): the capture pattern below is a reconstruction. The
    published snippet read ``'(.*?)' % classattr``, which has no ``%s``
    placeholder and raises ``TypeError``; the surrounding HTML tags appear
    to have been stripped from the literal. Confirm against the live markup.

    Args:
        start_url: Full URL of the listing page to scrape.
    """
    # Fetch the HTML of the current page.
    response = requests.get(start_url).text
    doc = pq(response)
    # Class attribute of the nested span (the original comment calls this the
    # font file name).
    classattr = doc('p.update > span > span').attr('class')
    pattern = '</style><span.*?%s.*?>(.*?)</span>' % classattr
    # Every obfuscated word-count string on the page, in document order.
    numberlist = re.findall(pattern, response)
    # Text of the inline stylesheet that contains the font-file links.
    fonturl = doc('p.update > span > style').text()
    # Pull the font URL out of the stylesheet text.
    url = re.search('woff.*?url.*?\'(.+?)\'.*?truetype', fonturl).group(1)
    cmap = get_font(url)
    books = doc('.all-img-list li').items()
    for i, book in enumerate(books):
        item = {}
        item['img'] = 'http:' + book('.book-img-box a img').attr('src')
        item['bookname'] = book('.book-mid-info h4 a').text()
        item['author'] = book('.name').text()
        item['classes'] = book('p.author > a:nth-child(4)').text()
        item['content'] = book('.intro').text()
        # Drop the last character (presumably the trailing ';') before decoding.
        item['number'] = get_encode(cmap, numberlist[i][:-1])
        mongo(item)
def mongo(item):
    """Insert one scraped book record into the module-level collection ``p``.

    Args:
        item: Dictionary describing a single book.
    """
    # Collection.insert() was deprecated in PyMongo 3 and removed in PyMongo 4;
    # insert_one() is the supported call for a single document.
    p.insert_one(item)
def main(first_page=1, stop_page=1000):
    """Scrape listing pages ``first_page`` up to, but excluding, ``stop_page``.

    Each page URL is ``start_url`` with the page number appended and is
    handed to ``get_index``. The defaults reproduce the original fixed range
    of pages 1 through 999.

    Args:
        first_page: Number of the first page to scrape.
        stop_page: Page number at which to stop (exclusive).
    """
    for page in range(first_page, stop_page):
        get_index(start_url + str(page))


if __name__ == '__main__':
    main()
def get_index(start_url):
    """Scrape one listing page and store every book on it via ``mongo``.

    This variant reads the font family name from the inline ``<style>``
    under ``div.total``, builds the ``.woff`` URL from it, and records that
    URL on every item as ``font_url``.

    NOTE(review): the capture pattern below is a reconstruction. The
    published snippet read ``'(.*?)' % addr``, which has no ``%s``
    placeholder and raises ``TypeError``; the surrounding HTML tags appear
    to have been stripped from the literal. Confirm against the live markup.

    Args:
        start_url: Full URL of the listing page to scrape.
    """
    # urllib3 is referenced here but not imported at the top of the file, so
    # import it locally to avoid a NameError.
    import urllib3

    # Fetch the HTML of the current page.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    response = requests.get(start_url).text
    doc = pq(response)
    # Text of the inline stylesheet that declares the font.
    fonturl = doc('div.total > p > span > style').text()
    # The font family name doubles as the font file name.
    addr = re.search('font-family: (.+?); src', fonturl).group(1)
    url = 'https://qidian.gtimg.com/qd_anti_spider/{addr}.woff'.format(addr=addr)
    cmap = get_font(url)
    print(cmap)
    pattern = '</style><span.*?%s.*?>(.*?)</span>' % addr
    # Every obfuscated word-count string on the page, in document order.
    numberlist = re.findall(pattern, response)
    print('numberlist: ', numberlist)
    books = doc('.book-img-text li').items()
    i = 0
    print('i: ', i)
    for i, book in enumerate(books):
        item = {}
        item['img'] = 'http:' + book('.book-img-box a img').attr('src')
        item['bookname'] = book('.book-mid-info h4 a').text()
        item['author'] = book('.name').text()
        item['classes'] = book('p.author > a:nth-child(4)').text()
        item['content'] = book('.intro').text()
        # Drop the last character (presumably the trailing ';') before decoding.
        item['number'] = get_encode(cmap, numberlist[i][:-1])
        item['font_url'] = url
        mongo(item)