1. 打开页面，观察内容是否为动态加载
2. 查找用于加载正文内容的隐藏链接
3. 找出该链接所需的请求参数
4. 在网页源码中找到对应的参数
5. 从源码中正则提取相关参数
# Pull the request signature and the book id out of the page source.
# The dot in "RP.sign" is escaped so that it only matches a literal dot.
sign = re.search(r'RP\.sign = "(.*?)";', html, re.M | re.I)
book_id = re.search(r'book_id":"(.*?)",', html, re.M | re.I)
# Both match objects are dereferenced below, so both must have matched;
# checking only `sign` would crash on book_id.group(1) when book_id is None.
if sign and book_id:
    print('book_id %s' % book_id.group(1))
    print('sign %s' % sign.group(1))
else:
    print('no match')
6. 拼接隐藏链接
# Hidden content endpoint: the two %s slots take the book id and the sign
# captured from the page source; chapter_id is fixed to 1 in this example.
content_url_template = 'http://book.km.com/index.php?c=catch&a=getContent&book_id=%s&chapter_id=1&sign=%s'
contentURL = content_url_template % (book_id.group(1), sign.group(1))
7. 开始正式的抓取页面中的数据
import requests
import htmllib
import re
import urllib2
import json
import leancloud
import codecs
import string
from bs4 import BeautifulSoup
from lxml import etree
import sys
# Python 2 idiom: reloading sys makes setdefaultencoding callable again, so the
# default codec used for implicit str/unicode conversions can be set to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')
class bookcom:
    """Scrape novels from book.km.com and store them in LeanCloud.

    The crawl runs in three stages, each method calling the next:
    ``bookcomGetList`` (library index page) -> ``bookcomGetNovelList``
    (chapter list of one novel) -> ``bookcomGetNovelContent`` (one chapter).
    The attributes below hold the novel/chapter currently being processed and
    are what the two ``*Save`` methods write out.
    """

    # State of the item currently being crawled; overwritten as the crawl advances.
    NovelName = ''
    NovelImageUrl = ''
    NovelType = ''
    NovelChapter = ''
    NovelChapterContent = ''
    NovelChapterId = ''

    def bookcomGetList(self):
        """Crawl the library index page and process every novel found on it.

        For each cover link the novel's name and image URL are recorded, the
        list entry is saved, and the novel's chapter list is crawled.  Entries
        that lack a name, an image or a recognisable book link are skipped so
        that one malformed entry does not abort the whole crawl.
        """
        html = requests.get('http://book.km.com/shuku_0_0_0_1_0_0_1.html')
        pythonEtree = etree.HTML(html.text)
        for each in pythonEtree.xpath('//div[@class="imgbox"]/a'):
            names = each.xpath('img/@alt')
            imageUrls = each.xpath('img/@_src')
            hrefs = each.xpath('@href')
            if not (names and imageUrls and hrefs):
                continue
            # The book slug sits between "/shuku/" and the ".html" suffix.
            searchObjOne = re.search(r'/shuku/(.*?)\.html', hrefs[0], re.M | re.I | re.S)
            if searchObjOne is None:
                continue
            self.NovelName = names[0]
            self.NovelImageUrl = imageUrls[0]
            self.NovelType = '免费'
            URLTwo = 'http://book.km.com/chapterlist/%s.html' % searchObjOne.group(1)
            self.bookcomGetListSave()
            self.bookcomGetNovelList(URLTwo)

    def bookcomGetNovelList(self, URL):
        """Crawl one novel's chapter-list page and fetch each chapter.

        Args:
            URL: address of the chapter-list page; it is also forwarded as the
                referer for the chapter page requests.
        """
        # requests.get's second positional argument is `params`; passing the
        # string 'GET' there would append "?GET" to the URL, so pass only the URL.
        html = requests.get(URL)
        pythonEtree = etree.HTML(html.text)
        for each in pythonEtree.xpath('//ul[@class="catalog_list clearfix"]/li/a'):
            titles = each.xpath('text()')
            hrefs = each.xpath('@href')
            if not (titles and hrefs):
                # Anchor without text or target: nothing to fetch.
                continue
            self.NovelChapter = titles[0]
            novelContentUrl = 'http://book.km.com%s' % hrefs[0]
            self.bookcomGetNovelContent(novelContentUrl, URL)

    @staticmethod
    def _extractContentParams(pageHtml):
        """Parse the parameters of the hidden content request from a chapter page.

        Args:
            pageHtml: source text of the chapter page.

        Returns:
            A ``(book_id, chapter_id, sign)`` tuple of strings, or ``None``
            when any of the three values cannot be found.
        """
        sign = re.search(r'RP\.sign = "(.*?)";', pageHtml, re.M | re.I)
        book_id = re.search(r'book_id":"(.*?)",', pageHtml, re.M | re.I)
        chapter_id = re.search(r'"id":"(.*?)",', pageHtml, re.M | re.I)
        if sign is None or book_id is None or chapter_id is None:
            return None
        return book_id.group(1), chapter_id.group(1), sign.group(1)

    def bookcomGetNovelContent(self, URL, SuperURl):
        """Fetch one chapter's text through the hidden content endpoint and save it.

        The chapter page is requested first to read the book id, chapter id
        and sign embedded in its source; those are then used to build the
        content request.  If they cannot be found the chapter is skipped.

        Args:
            URL: address of the chapter page.
            SuperURl: address of the chapter-list page, sent as the referer.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:53.0) Gecko/20100101 Firefox/53.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Cookie': 'HTTP_REFERER=book.km.com; _ga=GA1.2.1574141621.1496282865; UM_distinctid=15c61689cad25e-04f45024755e92-49526a-fa000-15c61689cae39d; CNZZDATA30085487=cnzz_eid%3D2096924603-1496280564-%26ntime%3D1496291364; book_history=%5B%22b1343939%22%2C%22b1413544%22%5D; bdshare_firstime=1496282865098; Hm_lvt_b2e5ac9401b5820ffa4e9fa608593a5b=1496282865; Hm_lpvt_b2e5ac9401b5820ffa4e9fa608593a5b=1496296670; HTTP_REFERER=book.km.com',
            'Referer': SuperURl,
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate',
            'Host': 'book.km.com',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0',
        }
        html = requests.get(URL, headers=headers)

        params = self._extractContentParams(html.text)
        if params is None:
            # Without all three values the content URL cannot be built.
            print('no match: %s' % URL)
            return
        bookId, chapterId, sign = params
        self.NovelChapterId = chapterId

        contentHeader = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Connection': 'keep-alive',
            'Cookie': '_ga=GA1.2.1574141621.1496282865; UM_distinctid=15c61689cad25e-04f45024755e92-49526a-fa000-15c61689cae39d; CNZZDATA30085487=cnzz_eid%3D2096924603-1496280564-%26ntime%3D1496800002; book_history=%5B%22b1343939%22%2C%22b1413544%22%5D; bdshare_firstime=1496282865098; Hm_lvt_b2e5ac9401b5820ffa4e9fa608593a5b=1496282865,1496631345; _gat=1',
            'Host': 'book.km.com',
            'Referer': URL,
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:53.0) Gecko/20100101 Firefox/53.0',
            'X-Requested-With': 'XMLHttpRequest',
        }
        contentURL = 'http://book.km.com/index.php?c=catch&a=getContent&book_id=%s&chapter_id=%s&sign=%s' % (
            bookId, chapterId, sign)
        contentHtml = requests.get(contentURL, headers=contentHeader)
        self.NovelChapterContent = contentHtml.text

        # Progress output; the single-argument parenthesised form prints the
        # same text as the Python 2 print statement.
        print(self.NovelName)
        print(self.NovelImageUrl)
        print(self.NovelType)
        print(self.NovelChapter)
        print(self.NovelChapterId)
        print(self.NovelChapterContent)
        self.bookcomChapterSave()

    def bookcomChapterSave(self):
        """Save the current chapter as a LeanCloud 'XuanHuanContent' object."""
        Todo = leancloud.Object.extend('XuanHuanContent')
        todo = Todo()
        todo.set('NovelName', self.NovelName)
        todo.set('NovelChapterId', self.NovelChapterId)
        todo.set('NovelChapter', self.NovelChapter)
        todo.set('NovelChapterContent', self.NovelChapterContent)
        todo.save()

    def bookcomGetListSave(self):
        """Save the current novel's list entry as a LeanCloud 'XuanHuanList' object."""
        Todo = leancloud.Object.extend('XuanHuanList')
        todo = Todo()
        todo.set('NovelImageUrl', self.NovelImageUrl)
        todo.set('NovelType', self.NovelType)
        todo.set('NovelName', self.NovelName)
        todo.save()
# Initialise the LeanCloud SDK. Both arguments are empty strings here;
# presumably they are the app id and app key and must be filled in - TODO confirm.
leancloud.init("", "")
# Create the crawler and start from the library index page.
Book = bookcom()
Book.bookcomGetList()