Originally I just wanted to learn the basics of the selenium module, but I kept adding features until it felt like it had turned into a small project, and that is how this article came about.
The code is rough and slow, but that doesn't change the fact that this is my first attempt. If the masters here take the time to look over my code and offer some advice, I'd be very grateful.
First, a disclaimer: this cannot scrape paid comics. As the saying goes, only what you can see can be scraped, so it only fetches comics that are viewable on Tencent Comics. I did also write a login function so that paid chapters you have already purchased can be fetched. If someone says that scraping Tencent Comics is pointless because you can already read everything there, then I can only say: true, it is pointless.
GitHub repo: https://github.com/y0un9er/tencentComic/
import re
import os
import time
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
import Browser     # creates the browser via selenium
import AllChapter  # fetches info on all chapters of a comic
class Download:
    browser = ''
    wait = ''
    name = ''
    total = 0

    def __init__(self, url=None):
        self.browser = Browser.Browser().browser
        self.wait = WebDriverWait(self.browser, 30)
        if url is not None:
            self.comic_info(url)

    def login(self):
        pass

    def loading(self):
        pass

    def comic_info(self, url):
        pass

    def getImg(self, chapter_info):
        pass
The Browser() class is used to create the browser: Browser.Browser().browser creates a headless (windowless) browser, while Browser.Browser('Window').browser creates one with a visible window, which is convenient for debugging.
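For example (using the Browser module listed further down):

import Browser

browser = Browser.Browser().browser                 # headless, for normal runs
debug_browser = Browser.Browser('Window').browser   # visible window, handy while debugging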
def login(self):
    if input('Log in? (y/n) ') != 'y':
        return
    url = 'http://ac.qq.com'
    self.browser.get(url)
    self.wait.until(ec.element_to_be_clickable((By.XPATH, '//*[@id="sidebarComTabMe"]')))
    self.browser.find_element(By.XPATH, '//*[@id="sidebarComTabMe"]').click()
    self.browser.switch_to.frame('iframe_qq')
    self.browser.switch_to.frame('ptlogin_iframe')
    if input('Choose login method (0: quick login (QQ already signed in), 1: account/password login (web login protection must be disabled); default 0): ') != '1':
        # quick login
        self.browser.find_element(By.XPATH, '//*[@id="qlogin_list"]/a[1]').click()
    else:
        # account/password login
        self.browser.find_element(By.XPATH, '//*[@id="switcher_plogin"]').click()
        username = input('Account: ')
        password = input('Password: ')
        # password = getpass.getpass('Password: ')  # not usable in PyCharm's run window
        self.browser.find_element(By.XPATH, '//*[@id="u"]').send_keys(username)
        self.browser.find_element(By.XPATH, '//*[@id="p"]').send_keys(password)
        self.browser.find_element(By.XPATH, '//*[@id="p"]').send_keys(Keys.ENTER)
Two login methods are provided: quick login and account/password login. To use account/password login you first have to disable QQ's web login protection, and keep in mind that the password is typed in plain text, so be careful about exposing it.
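To reduce that exposure, one option is to read the credentials from environment variables instead of typing them. A minimal sketch, using the hypothetical variable names QQ_USER and QQ_PASS (not part of the original code); as the comment in login() notes, getpass does not work in PyCharm's run window, so input() remains as a fallback:

import os
import getpass

# Hypothetical environment variable names; set them in the shell beforehand,
# e.g.  export QQ_USER=...  and  export QQ_PASS=...
username = os.environ.get('QQ_USER') or input('Account: ')
password = os.environ.get('QQ_PASS') or getpass.getpass('Password: ')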
def loading(self):
    while True:
        try:
            # placeholder images that have not been lazy-loaded yet
            elements = self.browser.find_elements(
                By.CSS_SELECTOR, 'img[src="//ac.gtimg.com/media/images/pixel.gif"]')
            if not elements:
                break
            for ele in elements:
                # scroll each placeholder into view so the real image gets requested
                self.browser.execute_script("arguments[0].scrollIntoView();", ele)
                time.sleep(0.2)  # wait time; if it is too short, loading may fail
        except Exception:
            break
Tencent Comics loads pages with Ajax lazy loading: an image is not fetched until you scroll to it, and until then it only shows a "loading" placeholder GIF. So I wrote the loading() function to simulate scrolling in the browser until all of the comic images have been loaded.
This is where Selenium shows its strength: analysing the JSON requests by hand would be a real headache, and if there were encrypted parameters it is not even certain they could be worked out.
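As an alternative to scrolling placeholder by placeholder, the page can also be scrolled one viewport at a time until no placeholders are left. A rough sketch under the same pixel.gif selector assumption, not part of the original code:

import time
from selenium.webdriver.common.by import By

PLACEHOLDER = 'img[src="//ac.gtimg.com/media/images/pixel.gif"]'

def load_all_images(browser, max_rounds=200, pause=0.2):
    """Scroll down one viewport at a time until no lazy-load placeholders remain."""
    for _ in range(max_rounds):
        # any remaining pixel.gif placeholders mean some pages are still unloaded
        if not browser.find_elements(By.CSS_SELECTOR, PLACEHOLDER):
            return True
        browser.execute_script('window.scrollBy(0, window.innerHeight);')
        time.sleep(pause)  # give the Ajax request a moment to replace the placeholder
    return False  # gave up after max_rounds scrolls

The max_rounds cap just keeps it from spinning forever if the page gets stuck; whether it ends up faster than loading() depends on how quickly each image loads.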
def comic_info(self, url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    total = soup.select('#catalogueContain > span')
    total = re.search('\\d+', str(total))[0]
    name = soup.select('#chapter')
    name = re.search('>(\\S+)<', str(name))[1]
    if not os.path.exists(name):
        os.mkdir(name)
    self.name = name
    self.total = int(total)
This records the name and total chapter count of the comic being scraped, and creates a folder with the same name to hold the downloads.
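A quick way to check what it extracts, assuming this runs in the same file as the Download class above (the URL is the Naruto example chapter used later in the article):

# assumes it is run alongside the Download class defined above
d = Download('https://ac.qq.com/ComicView/index/id/505432/cid/1')
print(d.name, d.total)  # comic title and total chapter count; a folder named after d.name now exists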
def getImg(self, chapter_info):
    comic_name = self.name
    chapter_num = chapter_info[0]
    chapter_name = chapter_info[1].strip().replace(' ', '-')
    chapter_url = chapter_info[2]
    self.browser.get(chapter_url)
    self.loading()
    source = self.browser.page_source
    soup = BeautifulSoup(source, 'lxml')
    lis = soup.select('#comicContain > li')
    urls = []
    num = []
    for li in lis:
        try:
            num.append(re.search('>(\\d+)/(\\d+)<', str(li))[1])
            urls.append(re.search('src="(.*?)"', str(li))[1])
        except Exception:
            continue
    path = comic_name + '/' + chapter_num + '.' + chapter_name
    if not os.path.exists(path):
        os.mkdir(path)
    for i in range(len(urls)):
        print('\rCurrent chapter {}. {} : {}/{}'.format(chapter_num, chapter_name, i + 1, len(urls)), end='')
        response = requests.get(urls[i])
        image = response.content
        path_ = path + '/' + num[i] + '.jpg'
        with open(path_, 'wb') as f:
            f.write(image)
Saves the comic images. The function takes one argument, which should be a list of the form [chapter_num, chapter_name, chapter_url].
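For example, to download just one chapter by hand (the chapter entry here is hypothetical; normally the list comes from AllChapter.ChaptersInfo, as in the main block below):

# hypothetical single-chapter call; the list normally comes from AllChapter.ChaptersInfo
D = Download('https://ac.qq.com/ComicView/index/id/505432/cid/1')
D.getImg(['1', 'Chapter-1', 'https://ac.qq.com/ComicView/index/id/505432/cid/1'])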
if __name__ == '__main__':
    url = input('Enter the link to any chapter of the comic to download: ')
    D = Download(url)
    D.login()
    all_info = AllChapter.ChaptersInfo(url)
    for chapter_info in all_info:
        D.getImg(chapter_info)
To scrape a comic, just enter the link to any chapter of it, e.g. Naruto: https://ac.qq.com/ComicView/index/id/505432/cid/1
# coding=utf-8
from selenium import webdriver

'''
When creating a Browser instance, choose whether or not to show the browser window:
browser2 = Browser('Window').browser     # visible window
browser1 = Browser('Nowindow').browser   # headless (default)
'''

class Browser:
    browser = ''

    def __init__(self, mode='Nowindow'):
        if mode == 'Nowindow':
            opt = webdriver.ChromeOptions()
            opt.add_argument('--headless')  # run Chrome without a window
            self.browser = webdriver.Chrome(options=opt)
        elif mode == 'Window':
            self.browser = webdriver.Chrome()
import re
import requests
from bs4 import BeautifulSoup

'''
Fetches the info of all chapters of a comic and returns it as a 2-D list:
[[chapter number, chapter name, link to that chapter], [...], ...]
'''

def ChaptersInfo(url):
    response = requests.get(url)
    content = response.content
    soup = BeautifulSoup(content, 'lxml')
    lis = soup.select('#catalogueList > li')
    all_chapter_info = []
    for li in lis:
        chapter_num = re.search('\\d+', li.find_all('span')[0].get_text())[0]
        chapter_name = li.find_all('span')[1].get_text()
        chapter_url = 'http://ac.qq.com' + re.search('href="(.*?)"', str(li))[1]
        chapter_info = [chapter_num, chapter_name, chapter_url]
        all_chapter_info.append(chapter_info)
    return all_chapter_info
Pass in the link to any chapter of a comic and it fetches the info for all of that comic's chapters.
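A quick way to see the returned structure (again using the Naruto example chapter; what it actually prints depends on the live page):

import AllChapter

chapters = AllChapter.ChaptersInfo('https://ac.qq.com/ComicView/index/id/505432/cid/1')
for num, name, link in chapters[:3]:  # only peek at the first three entries
    print(num, name, link)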
import re
import requests
from bs4 import BeautifulSoup

'''
Collects the links of the free comics
'''

class FreeUrl:
    def __init__(self, url='', start=1, end=116):
        self.url = url
        self.start = start
        self.end = end

    # Collects the links of all comics on one listing page.
    # The url looks like https://ac.qq.com/Comic/all/page/1 or https://ac.qq.com/Comic/all?page=1
    def OnePageUrls(self, url=''):
        if self.url != '':
            url = self.url
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'lxml')
        urls = []
        for i in range(1, 13):
            a = soup.select(f'body > div.ui-wm.ui-mb40.ui-mt40.clearfix > div.ret-main-wr.ui-mb40.ui-left > div > '
                            f'div.ret-search-result > ul > li:nth-child({i}) > div.ret-works-info > a')
            try:
                url = re.search('href="(.*?)"', str(a))[1]
                url = 'https://ac.qq.com' + url.replace('Comic/comicInfo', 'ComicView/index') + '/cid/1'
                urls.append(url)
            except Exception:
                continue
        return urls

    # Links of all free comics:
    # https://ac.qq.com/Comic/all/search/hot/vip/1/page/{i}  with i = 1..116
    def AllUrls(self):
        self.url = ''
        urls = []
        for i in range(self.start, self.end + 1):
            page_url = f'https://ac.qq.com/Comic/all/search/hot/vip/1/page/{i}'
            urls += self.OnePageUrls(page_url)
        return urls
This crawls Tencent Comics' listing pages and returns the links of all the free comics.
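To tie the pieces together, a rough sketch of batch-downloading every free comic. It assumes the Download and FreeUrl classes are saved in files importable as Download and FreeUrl (the article does not give those file names), and it will run for a very long time:

import AllChapter
import Download   # assumed module name for the file containing the Download class
import FreeUrl    # assumed module name for the file containing the FreeUrl class

free_links = FreeUrl.FreeUrl().AllUrls()   # one chapter-1 link per free comic

D = Download.Download()                    # one shared browser for every comic
D.login()                                  # optional; free comics do not require it
for link in free_links:
    D.comic_info(link)                     # sets the comic name and creates its folder
    for chapter_info in AllChapter.ChaptersInfo(link):
        D.getImg(chapter_info)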
And with that, a simple crawler is done. It has no shortage of small problems, but it more or less works.
Calling it a small project at the start was probably flattering myself, but it really is my first attempt, and I hope the masters will point out where it can be improved.