爬取起点中文网字体反爬取

     参考文章:https://www.jianshu.com/p/fbc99cf4d557 

       个人比较喜欢看小说,于是想爬取小说网站——起点中文网。在定位页面数据的过程中遇到了字体反爬,请教了旁边的前端大神之后,把解决方法记录如下。

当前页面接口返回的html源码(字数部分使用了自定义字体的码位,不加载该字体时直接显示为乱码):

𘟠𘟙𘟠𘟜𘟛𘟝𘟟万字

爬取起点中文网字体反爬取_第1张图片

 

第一步:获取当前页面的字体文件链接,可以通过正则获取

    start_url = 'https://www.qidian.com/finish?action=hidden&orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=2&page=1'
    # Fetch the HTML of the listing page.
    response = requests.get(start_url).text
    # The @font-face rule sits in an inline <style> next to each word count;
    # take its text so the regex below has a defined string to search.
    fonturl = pq(response)('p.update > span > style').text()
    # Pull the font URL that is quoted between the "woff" and "truetype"
    # entries of the @font-face src list.
    url = re.search(r"woff.*?url.*?'(.+?)'.*?truetype", fonturl).group(1)

爬取起点中文网字体反爬取_第2张图片

第二步:通过fontTools模块获取当前字体映射关系

def get_font(url):
    """Download the font file at *url* and return its best Unicode cmap.

    The returned mapping is what ``TTFont.getBestCmap()`` produces; the rest
    of this code looks it up by integer code point and expects glyph names
    such as ``'one'`` or ``'period'`` as values.

    Raises whatever ``requests`` raises for a failed download, including an
    HTTP error status.
    """
    response = requests.get(url)
    # Fail on HTTP errors instead of handing an error page to the font parser.
    response.raise_for_status()
    font = TTFont(BytesIO(response.content))
    try:
        return font.getBestCmap()
    finally:
        # Release the font even if reading the cmap table raises.
        font.close()

第三步:通过当前映射关系,页面中的字符编码会被对应到数字的英文名称(如 one、two),然后创建dict把英文名称转换成数字

def get_encode(cmap, values):
    """Decode a run of numeric character references into a digit string.

    ``values`` is text such as ``'&#100288;&#100289'``: decimal character
    references separated by ``';'``.  Each code point is looked up in
    ``cmap`` (code point -> glyph name) and the glyph name is translated to
    the character it draws.

    Empty segments (for example from a trailing ``';'`` or an empty input)
    are skipped instead of crashing in ``int('')``.

    Raises ``KeyError`` for a code point missing from ``cmap`` or a glyph
    name that is not a digit/period, and ``ValueError`` for a segment that
    is not a decimal number.
    """
    WORD_MAP = {'zero':'0','one':'1','two':'2','three':'3','four':'4','five':'5','six':'6','seven':'7','eight':'8','nine':'9','period':'.'}
    decoded = []
    for token in values.split(';'):
        token = token.strip()
        if not token:
            continue
        # Drop the "&#" marker only when it is really there.
        if token.startswith('&#'):
            token = token[2:]
        decoded.append(WORD_MAP[cmap[int(token)]])
    return ''.join(decoded)

第四步:然后就是通过pyquery进行数据提取

def get_index(start_url):
    """Scrape one listing page and return its book records.

    Fetches ``start_url``, locates the per-page obfuscation font, decodes the
    obfuscated word counts with it and builds one dict per book with the keys
    ``img``, ``bookname``, ``author``, ``classes``, ``content`` and
    ``number``.

    Returns the list of record dicts.  Raises ``ValueError`` when the page
    does not contain the expected font markup or has fewer encoded word
    counts than books.
    """
    # Fetch the HTML of the listing page.
    response = requests.get(start_url).text
    doc = pq(response)

    # The word-count <span> carries the font's name as its class.
    classattr = doc('p.update > span > span').attr('class')
    if not classattr:
        raise ValueError('word-count span not found on page: %s' % start_url)

    # Capture the raw entity run inside every word-count span.
    # NOTE(review): the tag parts of this pattern were reconstructed from the
    # page structure the selectors imply -- confirm against the live HTML.
    pattern = '</style><span.*?%s.*?>(.*?)</span>' % re.escape(classattr)
    numberlist = re.findall(pattern, response)

    # The font URL is quoted inside the inline <style> next to the count.
    fonturl = doc('p.update > span > style').text()
    font_match = re.search(r"woff.*?url.*?'(.+?)'.*?truetype", fonturl)
    if font_match is None:
        raise ValueError('font URL not found on page: %s' % start_url)
    cmap = get_font(font_match.group(1))

    books = list(doc('.all-img-list li').items())
    if len(numberlist) < len(books):
        raise ValueError(
            'found %d encoded word counts for %d books on page: %s'
            % (len(numberlist), len(books), start_url))

    items = []
    for book, encoded in zip(books, numberlist):
        # The src is concatenated after "http:"; keep None when it is absent
        # rather than failing the whole page.
        img_src = book('.book-img-box a img').attr('src')
        items.append({
            'img': 'http:' + img_src if img_src else None,
            'bookname': book('.book-mid-info h4 a').text(),
            'author': book('.name').text(),
            'classes': book('p.author > a:nth-child(4)').text(),
            'content': book('.intro').text(),
            # Drop the final ';' so the entity run splits cleanly.
            'number': get_encode(cmap, encoded[:-1]),
        })
    return items

第五步:将数据存入mongodb

# Local MongoDB server; records go to database "qidian", collection "finish".
client = pymongo.MongoClient('127.0.0.1')
db = client['qidian']
p = db['finish']
def mongo(item):
    """Store one scraped record in the module-level collection ``p``."""
    # insert_one replaces Collection.insert, which PyMongo deprecated in 3.0
    # and removed in 4.0.
    p.insert_one(item)

 

附当前爬虫文件源码

#coding=utf-8
'''
Created on 2018年8月23日

@author: Administrator
'''
import requests,json,time,re
from requests.exceptions import RequestException
from pyquery import PyQuery as pq
from fontTools.ttLib import TTFont
from io import BytesIO
import pymongo

# Local MongoDB server; records go to database "qidian", collection "finish".
client = pymongo.MongoClient('127.0.0.1')
db = client['qidian']
p = db['finish']


# Listing URL without the page number; the crawler appends it.
start_url = 'https://www.qidian.com/finish?action=hidden&orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=2&page='

def get_font(url):
    """Download the font file at *url* and return its best Unicode cmap.

    The returned mapping is what ``TTFont.getBestCmap()`` produces; the rest
    of this script looks it up by integer code point and expects glyph names
    such as ``'one'`` or ``'period'`` as values.

    Raises whatever ``requests`` raises for a failed download, including an
    HTTP error status.
    """
    response = requests.get(url)
    # Fail on HTTP errors instead of handing an error page to the font parser.
    response.raise_for_status()
    font = TTFont(BytesIO(response.content))
    try:
        return font.getBestCmap()
    finally:
        # Release the font even if reading the cmap table raises.
        font.close()

def get_encode(cmap, values):
    """Decode a run of numeric character references into a digit string.

    ``values`` is text such as ``'&#100288;&#100289'``: decimal character
    references separated by ``';'``.  Each code point is looked up in
    ``cmap`` (code point -> glyph name) and the glyph name is translated to
    the character it draws.

    Empty segments (for example from a trailing ``';'`` or an empty input)
    are skipped instead of crashing in ``int('')``.

    Raises ``KeyError`` for a code point missing from ``cmap`` or a glyph
    name that is not a digit/period, and ``ValueError`` for a segment that
    is not a decimal number.
    """
    WORD_MAP = {'zero':'0','one':'1','two':'2','three':'3','four':'4','five':'5','six':'6','seven':'7','eight':'8','nine':'9','period':'.'}
    decoded = []
    for token in values.split(';'):
        token = token.strip()
        if not token:
            continue
        # Drop the "&#" marker only when it is really there.
        if token.startswith('&#'):
            token = token[2:]
        decoded.append(WORD_MAP[cmap[int(token)]])
    return ''.join(decoded)

def get_index(start_url):
    """Scrape one listing page and store every book on it via ``mongo``.

    Fetches ``start_url``, locates the per-page obfuscation font, decodes the
    obfuscated word counts with it and passes one dict per book (keys
    ``img``, ``bookname``, ``author``, ``classes``, ``content``, ``number``)
    to ``mongo``.

    Raises ``ValueError`` when the page does not contain the expected font
    markup or has fewer encoded word counts than books.
    """
    # Fetch the HTML of the listing page.
    response = requests.get(start_url).text
    doc = pq(response)

    # The word-count <span> carries the font's name as its class.
    classattr = doc('p.update > span > span').attr('class')
    if not classattr:
        raise ValueError('word-count span not found on page: %s' % start_url)

    # Capture the raw entity run inside every word-count span.
    # NOTE(review): the tag parts of this pattern were reconstructed from the
    # page structure the selectors imply -- confirm against the live HTML.
    pattern = '</style><span.*?%s.*?>(.*?)</span>' % re.escape(classattr)
    numberlist = re.findall(pattern, response)

    # The font URL is quoted inside the inline <style> next to the count.
    fonturl = doc('p.update > span > style').text()
    font_match = re.search(r"woff.*?url.*?'(.+?)'.*?truetype", fonturl)
    if font_match is None:
        raise ValueError('font URL not found on page: %s' % start_url)
    cmap = get_font(font_match.group(1))

    books = list(doc('.all-img-list li').items())
    if len(numberlist) < len(books):
        raise ValueError(
            'found %d encoded word counts for %d books on page: %s'
            % (len(numberlist), len(books), start_url))

    for book, encoded in zip(books, numberlist):
        # The src is concatenated after "http:"; keep None when it is absent
        # rather than failing the whole page.
        img_src = book('.book-img-box a img').attr('src')
        item = {
            'img': 'http:' + img_src if img_src else None,
            'bookname': book('.book-mid-info h4 a').text(),
            'author': book('.name').text(),
            'classes': book('p.author > a:nth-child(4)').text(),
            'content': book('.intro').text(),
            # Drop the final ';' so the entity run splits cleanly.
            'number': get_encode(cmap, encoded[:-1]),
        }
        mongo(item)


def mongo(item):
    """Store one scraped record in the module-level collection ``p``."""
    # insert_one replaces Collection.insert, which PyMongo deprecated in 3.0
    # and removed in 4.0.
    p.insert_one(item)


    
def main(first_page=1, last_page=999):
    """Scrape listing pages ``first_page`` through ``last_page`` inclusive.

    The defaults reproduce the original fixed range of pages 1..999; each
    page URL is ``start_url`` with the page number appended.
    """
    for page in range(first_page, last_page + 1):
        get_index(start_url + str(page))


if __name__ == '__main__':
    main()
    
    
    

 

针对月票榜月票数字体反爬修改

def get_index(start_url):
    """Scrape one monthly-ticket ranking page and store its books via ``mongo``.

    Fetches ``start_url``, reads the obfuscation font's family name from the
    inline ``<style>``, downloads that font, decodes the obfuscated ticket
    counts and passes one dict per book (keys ``img``, ``bookname``,
    ``author``, ``classes``, ``content``, ``number``, ``font_url``) to
    ``mongo``.

    Raises ``ValueError`` when the font family cannot be found or the page
    has fewer encoded numbers than books.
    """
    # urllib3 is used directly below, so bring it into scope here.
    import urllib3
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    # Fetch the HTML of the ranking page.
    response = requests.get(start_url).text
    doc = pq(response)

    # The inline <style> declares the font; its family name doubles as the
    # font file name and as the class of the obfuscated <span>.
    fonturl = doc('div.total > p > span > style').text()
    family_match = re.search('font-family: (.+?); src', fonturl)
    if family_match is None:
        raise ValueError('font family not found on page: %s' % start_url)
    addr = family_match.group(1)
    url = 'https://qidian.gtimg.com/qd_anti_spider/{addr}.woff'.format(addr=addr)
    cmap = get_font(url)
    print(cmap)

    # Capture the raw entity run inside every obfuscated span.
    # NOTE(review): the tag parts of this pattern were reconstructed from the
    # page structure the selectors imply -- confirm against the live HTML.
    pattern = '</style><span.*?%s.*?>(.*?)</span>' % re.escape(addr)
    numberlist = re.findall(pattern, response)
    print('numberlist: ', numberlist)

    books = list(doc('.book-img-text li').items())
    if len(numberlist) < len(books):
        raise ValueError(
            'found %d encoded numbers for %d books on page: %s'
            % (len(numberlist), len(books), start_url))

    for book, encoded in zip(books, numberlist):
        # The src is concatenated after "http:"; keep None when it is absent
        # rather than failing the whole page.
        img_src = book('.book-img-box a img').attr('src')
        item = {
            'img': 'http:' + img_src if img_src else None,
            'bookname': book('.book-mid-info h4 a').text(),
            'author': book('.name').text(),
            'classes': book('p.author > a:nth-child(4)').text(),
            'content': book('.intro').text(),
            # Drop the final ';' so the entity run splits cleanly.
            'number': get_encode(cmap, encoded[:-1]),
            'font_url': url,
        }
        mongo(item)

 

你可能感兴趣的:(爬虫)