An example of crawling Tencent Comics
# encoding: utf-8
from __future__ import print_function
from __future__ import unicode_literals
import requests
import base64
import re
import json
import os
requestSession = requests.session()
def getId(url):
    numRE = re.compile(r'\d+')  # the r prefix makes this a raw string: backslashes stay literal, so \d reaches the regex engine instead of being treated as a Python escape
id = numRE.findall(url)[0]
return id
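# For illustration: getId('http://m.ac.qq.com/comic/index/id/623622') returns '623622',
# the first run of digits in the URL.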
def getContent(id):
getComicInfoUrl = 'http://m.ac.qq.com/GetData/getComicInfo?id={}'.format(id)
getComicInfo = requestSession.get(getComicInfoUrl)
comicInfoJson = getComicInfo.text
comicInfo = json.loads(comicInfoJson)
    comicName = comicInfo['title']  # the comic's title
getChapterListUrl = 'http://m.ac.qq.com/GetData/getChapterList?id={}'.format(id)
getChapterList = requestSession.get(getChapterListUrl)
contentJson = json.loads(getChapterList.text)
    count = contentJson['length']  # number of chapters
sortedContentList = []
    # collect the chapters in order of their seq field
for i in range(count + 1):
for item in contentJson:
if isinstance(contentJson[item], dict) and contentJson[item]['seq'] == i:
sortedContentList.append({item: contentJson[item]})
break
return (comicName, count, sortedContentList)
def getImgList(contentJson, id):
    #cid = contentJson.keys()[0]  # the original author used Python 2
    # In Python 2, dict.keys() returns a list; in Python 3 it returns a dict_keys view, which behaves
    # more like a set and does not support indexing. Workaround: list(dict.keys())[index].
    cid = list(contentJson.keys())[0]
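    # e.g. in Python 3: list({'123': {}}.keys())[0] evaluates to '123',
    # whereas {'123': {}}.keys()[0] raises a TypeError.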
    #getPicHashURL = 'http://m.ac.qq.com/View/mGetPicHash?id={}&cid={}'.format(id, cid)  # the original author's endpoint no longer works
    # The chapter page below loads its images asynchronously, so this request is reported to still fail;
    # it is kept here so the script at least runs, but the endpoint may need to be replaced.
    getPicHashURL = 'http://m.ac.qq.com/chapter/index/id/{}/cid/{}'.format(id, cid)
    picJsonPage = requestSession.get(getPicHashURL).text
picJson = json.loads(picJsonPage)
    count = picJson['pCount']  # number of images in this chapter
pHash = picJson['pHash']
sortedImgDictList = []
for i in range(1, count + 1):
for item in pHash:
if pHash[item]['seq'] == i:
sortedImgDictList.append(pHash[item])
break
imgList = []
for imgDict in sortedImgDictList:
k = imgDict['cid']
m = imgDict['pid']
j = int(id)
uin = max(j + k + m, 10001)
        l = [j % 1000 // 100, j % 100, j, k]  # use //: the Python 2 original relied on / being integer division
        n = '/mif800/' + '/'.join(str(x) for x in l) + '/'  # avoid reusing the name j inside the generator
h = str(m) + '.mif2'
g="http://ac.tc.qq.com/store_file_download?buid=15017&uin="+str(uin)+"&dir_path="+n+"&name="+h
imgList.append(g)
return imgList
def downloadImg(imgUrlList, contentPath):
count = len(imgUrlList)
    print('This chapter has {} images'.format(count))
i = 1
for imgUrl in imgUrlList:
        print('\rDownloading image {}...'.format(i), end='')
imgPath = os.path.join(contentPath, '{0:0>3}.jpg'.format(i))
downloadRequest = requestSession.get(imgUrl, stream=True)
with open(imgPath, 'wb') as f:
for chunk in downloadRequest.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
f.flush()
i += 1
    print('Done!\n')
def main():
    url = 'http://m.ac.qq.com/comic/index/id/623622'  # home page of the comic to crawl
    path = 'E:\\WorkSpace\\test'  # local directory for the downloaded images
if not os.path.isdir(path):
os.mkdir(path)
id = getId(url)
comicName,count,contentList = getContent(id)
contentNameList = []
for item in contentList:
for k in item:
contentNameList.append(item[k]['t'])
    print('Comic name: {}'.format(comicName))
    print('Number of chapters: {}'.format(count))
    print('Chapter list:')
    print('\n'.join(contentNameList))
comicPath = os.path.join(path, comicName)
if not os.path.isdir(comicPath):
os.mkdir(comicPath)
print()
i = 0
for content in contentList:
        print('Downloading chapter {0:0>4}: {1}'.format(i + 1, contentNameList[i]))
        contentPath = os.path.join(comicPath, 'Chapter {0:0>4} - {1}'.format(i + 1, contentNameList[i]))
if not os.path.isdir(contentPath):
os.mkdir(contentPath)
imgList = getImgList(content, id)
downloadImg(imgList, contentPath)
i += 1
if __name__ == '__main__':
main()
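As a quick check of the URL assembly in getImgList, here is a minimal sketch with hypothetical cid/pid values (the real ones come from the pHash JSON); the comic id 623622 is the one used in main above:
# Hypothetical cid/pid values, for illustration only
j, k, m = 623622, 1, 3  # comic id, chapter cid, picture pid
l = [j % 1000 // 100, j % 100, j, k]
n = '/mif800/' + '/'.join(str(x) for x in l) + '/'
h = str(m) + '.mif2'
print(n + h)  # /mif800/6/22/623622/1/3.mif2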
#coding=utf-8
#Working out the image URL scheme to collect the picture data
"""
http://api.yyhao.com/app_api/v3/getcomicinfo/?comic_id=27284  # returns the comic's catalogue information
"""
import requests
import base64
import re
import json
import os
import urllib
#fetch a page's HTML source
def getHtmlText(url):
try:
head={}
head['User-Agent'] ="Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
r=requests.get(url,headers=head)
        r.raise_for_status()  # raise HTTPError if the status code is not 200
r.encoding=r.apparent_encoding
return r.text
except:
print("获取网页产生异常")
#fetch the comic's metadata and chapter list
def getComicInfo(comicId):
url='http://api.yyhao.com/app_api/v3/getcomicinfo/?comic_id={}'.format(comicId)
info=json.loads(getHtmlText(url))
comicName=info['comic_name']
comicAuthor=info['comic_author']
comicDesc=info['comic_desc']
chapterList=info['comic_chapter'][1]['chapter_list']
return(comicName,comicAuthor,comicDesc,chapterList)
#download the image at the given url
def downloadImag(url,picPath,num):
head={}
head['User-Agent'] ="Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
image = requests.get(url,headers=head)
if image.status_code == 200:
picPath=picPath+str(num)+'.jpg'
f=open(picPath, 'wb')
f.write(image.content)
f.close()
else:
print("下载图片出错!")
#下载一部漫画所有的图片
def downloadAll(path,chapterList):
for chapter in chapterList:
chapterPath=path+chapter['chapter_name']+'/'
if not os.path.exists(chapterPath):
os.mkdir(chapterPath)
start=chapter['chapter_source'][0]['start_num']
end=chapter['chapter_source'][0]['end_num']
ofDomain=chapter['chapter_source'][0]['chapter_domain']
if ofDomain=='':
domain='mhpic.samanlehua.com'
else:
domain='mhpic.'+ofDomain
rule=urllib.parse.quote(chapter['chapter_source'][0]['rule'])
imageUrl='http://'+domain+rule
imageUrl=imageUrl.replace("%24%24",'{}')
#imageUrl=urllib.parse.quote(imageUrl)
while(start<=end):
imageUrlAll=imageUrl.format(start)#+'-kmw.middle'
downloadImag(imageUrlAll,chapterPath,start)
start+=1
        print(chapter['chapter_name']+" has been downloaded")
if __name__ == '__main__':
comicId=3130
    comicName,comicAuthor,comicDesc,chapterList=getComicInfo(comicId)
print(comicName+' downloading...')
    #local directory for this comic
path='E:/WorkSpace/comic/'+comicName+'/'
if not os.path.exists(path):
os.mkdir(path)
downloadAll(path,chapterList)
print("All finished!")
Other resources
The site www.manhuagui.com loads its pages dynamically, so static extraction failed on it.
Reference blog posts
[1]http://blog.sina.com.cn/s/blog_700376570102x80k.html # static scraping with scrapy
[2]http://blog.sina.com.cn/s/blog_700376570102x8er.html
[3]http://blog.sina.com.cn/s/blog_700376570102x96k.html # dynamic scraping
For data like this, the usual approach is Ajax: the page requests the data asynchronously and then renders the result into the page locally. So let us check whether such an Ajax request exists by
looking for related entries under the Network tab of the browser's developer tools.
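If such a request does show up in the Network tab, it can usually be replayed directly with requests instead of rendering the whole page. A minimal sketch, where the endpoint and parameters are placeholders to be copied from the developer tools:
import requests
# Placeholder endpoint copied from the browser's Network tab (hypothetical)
ajaxUrl = 'http://example.com/api/getChapterImages?id=12345'
headers = {'User-Agent': 'Mozilla/5.0', 'Referer': 'http://example.com/comic/12345'}
resp = requests.get(ajaxUrl, headers=headers)
resp.raise_for_status()
data = resp.json()  # asynchronous responses are typically JSON
print(data)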
Using PhantomJS
#coding=utf-8
from selenium import webdriver
import time
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
#create a PhantomJS-backed browser and set a user agent, otherwise pages may not render correctly
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = ("Mozilla/4.0 (compatible; MSIE 5.5; windows NT)" )
browser = webdriver.PhantomJS(desired_capabilities=dcap)
#open the first page of the comic
browser.get("http://ac.qq.com/ComicView/index/id/539443/cid/1")
#this loop scrolls the page step by step; the simulated scrolling triggers the lazy-loaded image resources
for i in range(10):
js='window.scrollTo('+str(i*1280)+','+str((i+1)*1280)+')'
browser.execute_script(js)
time.sleep(1)
#save a screenshot of the rendered page for inspection
a=browser.get_screenshot_as_file("E:/WorkSpace/Python/test.jpg")
#grab the full page source (which now includes the asynchronously loaded resources)
data=browser.page_source
#write the page source to a local file for analysis
fh=open("E:/WorkSpace/Python/dongman.html","w",encoding="utf-8")
fh.write(data)
fh.close()
#once we are done with PhantomJS, close the browser
browser.quit()
#the image URLs can then be extracted from the page source with a regular expression
import re
import urllib.request
#build the regular expression that extracts the comic image URLs
pat=''  # NOTE: the pattern was truncated in the original notes; it should capture the <img src> URLs (ac.tc.qq.com/store_file_download...) from the page source
#find every comic image URL
allid=re.compile(pat).findall(data)
for i in range(0,len(allid)):
    #the current image URL
thisurl=allid[i]
    #strip the redundant "amp;" left over from the HTML-escaped &amp;
thisurl2=thisurl.replace("amp;","")+".jpg"
    #print the URL being fetched
print(thisurl2)
    #local path where this image will be stored
localpath="E:/WorkSpace/dongman/"+str(i)+".jpg"
    #download the image with urllib
urllib.request.urlretrieve(thisurl2,filename=localpath)
An example of crawling a novel
#coding=utf-8
#BeautifulSoup can search and traverse HTML tags, but turning an HTML document into readable text is browser-rendering behaviour that bs generally does not provide; possible workarounds are regular expressions, nltk, or JavaScript's innerHTML
#import nltk
#content=nltk.clean_html(contentHTML)  # newer nltk versions no longer support clean_html() and clean_url()
#[s.extract() for s in contentHTML.find_all('p',text="")]  # removes the empty <p> tags
import requests
from bs4 import BeautifulSoup
#from HTMLParser import HTMLParser #python2
from html.parser import HTMLParser #python3
from re import sub
from sys import stderr
from traceback import print_exc
import os
class MyHTMLParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.texts=""#存放处理的结果
def handle_data(self, data):
        data=data.strip()  # strip leading/trailing whitespace
        data=' '+data  # indent the start of each paragraph with a space
self.texts+=data
    #tag handling
    #def handle_starttag(self, tag, attrs):  # called when an opening tag is encountered
    #def handle_startendtag(self, tag, attrs):  # called for self-closing tags such as <br/>
    def handle_endtag(self,tag):  # called when a closing tag is encountered
if tag == 'p':
self.texts+='\n\n'
def text(self):
return self.texts
#convert HTML to plain text
def dehtml(text):
try:
parser = MyHTMLParser()
parser.feed(text)
parser.close()
return parser.text()
except:
print_exc(file=stderr)
return text
#fetch a page's HTML source
def getHTMLText(url):
try:
head={}
head['User-Agent'] ="Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
r=requests.get(url,headers=head)
        r.raise_for_status()  # raise HTTPError if the status code is not 200
r.encoding=r.apparent_encoding
return r.text
except:
return "获取网页产生异常"
if __name__=="__main__":
print('starting...')
bookUrl="http://www.shuhuanghuang.com/book/44674.html"#小说首页
path="E:/WorkSpace/novel/"#下载小说位置
soup=BeautifulSoup(getHTMLText(bookUrl),'lxml')
infoSoup=soup.find('div',class_='info-box')
bookTitle=infoSoup.find('p',class_='title').text
bookDescription=infoSoup.get_text()
    bookPath=path+bookTitle
    if not os.path.exists(bookPath):  # note: Python negation is written as not, not !
os.makedirs(bookPath)
descriptionFile=bookPath+'/'+'Description.txt'
if not os.path.exists(descriptionFile):
open(descriptionFile,'w+').close()
f=open(descriptionFile,'w')
f.write(bookDescription)
f.close()
print('downloading...')
    chapterUrl=soup.find('a',class_='read',text='阅读')['href']  # URL of the novel's first chapter; '阅读' ("read") is the link text on the site, so it must stay in Chinese
    hasNext=True
while(hasNext):
chapterSoup = BeautifulSoup(getHTMLText(chapterUrl),'lxml')
        title=chapterSoup.find('div',class_='title').string  # chapter title
title=title.replace('\r\n','').replace('\t','')
contentHTML=chapterSoup.find('div',class_='content')
        contentStr=contentHTML.prettify().replace('\n','').replace('\xa0','')  # the second replace target was garbled in the original notes; a non-breaking space is assumed here
content=dehtml(contentStr)
chapterPath=bookPath+'/'+title+'.txt'
if not os.path.exists(chapterPath):
open(chapterPath,'w+').close()
f=open(chapterPath,'w')
f.write(content)
f.close()
        print(title+' has been downloaded!')
nextChapter=chapterSoup.find('a',class_='next')
if not nextChapter:
hasNext=False
break
chapterUrl=nextChapter['href']
print('All finished!')
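As a quick check of the HTML-to-text conversion, dehtml can be exercised on a small snippet:
sample='<div><p>First paragraph</p><p>Second paragraph</p></div>'
print(dehtml(sample))
# -> ' First paragraph\n\n Second paragraph\n\n' (each <p> becomes an indented paragraph followed by a blank line)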