The result:
The crawl target is a comic series on kuaikanmanhua.com.
After each chapter is crawled, a folder is created for its image tiles; once they are stitched into a long image, the folder is deleted.
Only a few chapters are shown here as a demo.
Because the stitched image runs into a length limit in a single column, the tiles are split across two columns.
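To put a rough number on that limit (assuming the cap in play is the JPEG format's 65535 px per dimension, which Pillow enforces when saving; the 120-tile chapter below is a made-up example):

IMAGE_SIZE = 750              # tile height used in the code below
print(65535 // IMAGE_SIZE)    # 87 -> at most ~87 tiles fit in a single column
print(120 * IMAGE_SIZE)       # 90000 px: a 120-tile chapter overflows one column
print(120 * IMAGE_SIZE // 2)  # 45000 px: split across two columns it fits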
Let's begin:
First, open the site and pick a comic. Right-clicking is disabled, so inspect the page through the browser's developer tools instead.
Next problem: each comic page is sliced into several small images, and their URLs are loaded dynamically by JavaScript, so parsing the raw HTML with etree alone can't extract them; the quick check below illustrates this.
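A minimal sketch of the difference, not the final code (the chapter URL is a placeholder, and the exact counts depend on the live page):

import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

url = 'https://www.kuaikanmanhua.com/web/comic/XXXX/'  # placeholder chapter URL
html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text
print(len(etree.HTML(html).xpath('//img/@data-src')))  # static HTML: expect 0 usable URLs

opts = Options()
opts.add_argument('--headless')
browser = webdriver.Chrome(options=opts)
browser.get(url)  # the page's JavaScript fills in the <img data-src=...> attributes
print(len(etree.HTML(browser.page_source).xpath('//img/@data-src')))  # now non-zero
browser.quit()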
Then there is the file-ordering problem: the tiles must be in the right order for the long-image stitching later, as the small sketch below shows.
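The downloaded tiles are named like 96836_0.jpg, 96836_1.jpg, ... (chapter id, underscore, slice index; the id here is made up). A plain string sort puts _10 before _2, so the sort key must be the numeric index:

names = ['96836_0.jpg', '96836_10.jpg', '96836_2.jpg']
print(sorted(names))
# ['96836_0.jpg', '96836_10.jpg', '96836_2.jpg']  <- lexicographic, wrong order
print(sorted(names, key=lambda x: int(x[:-4].split('_')[-1])))
# ['96836_0.jpg', '96836_2.jpg', '96836_10.jpg']  <- numeric, correct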
OK, now we can write the code.
Single-threaded version:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/12/19 12:39
# @Author : huni
# @File : 快看漫画.py
# @Software: PyCharm
import requests
from lxml import etree
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep
import PIL.Image as Image
import shutil
# Image stitching function (uses the module-level settings defined in __main__)
def image_compose():
    to_image = Image.new('RGB', (IMAGE_COLUMN * IMAGE_SIZE, IMAGE_ROW * IMAGE_SIZE))  # blank canvas
    # Walk the grid column by column and paste each tile into its slot
    total_num = 0
    for y in range(1, IMAGE_COLUMN + 1):  # columns
        for x in range(1, IMAGE_ROW + 1):  # rows
            from_image = Image.open(IMAGES_PATH + image_names[IMAGE_ROW * (y - 1) + x - 1]).resize(
                (IMAGE_SIZE, IMAGE_SIZE), Image.ANTIALIAS)  # ANTIALIAS was removed in Pillow 10; use Image.LANCZOS there
            to_image.paste(from_image, ((y - 1) * IMAGE_SIZE, (x - 1) * IMAGE_SIZE))
            total_num += 1
            if total_num == len(image_names):
                break  # last (possibly short) column is done; the outer loop is on its final pass anyway
    return to_image.save(IMAGE_SAVE_PATH)  # write the stitched image
if __name__ == '__main__':
    m_path = './快看漫画'
    if not os.path.exists(m_path):
        os.mkdir(m_path)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
    }
    url = 'https://www.kuaikanmanhua.com/web/topic/4832/'
    resp = requests.get(url=url, headers=headers).text
    tree = etree.HTML(resp)
    div_list = tree.xpath('//*[@id="__layout"]/div/div/div[2]/div/div[1]/div[2]/div')
    for div in div_list:
        href = 'https://www.kuaikanmanhua.com' + div.xpath('./div[2]/a/@href')[0]
        if href == 'https://www.kuaikanmanhua.comjavascript:void(0);':
            continue  # locked chapters expose javascript:void(0); instead of a real link
        # Headless browser renders the page so the dynamically loaded <img> tags exist
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        browser = webdriver.Chrome(options=chrome_options)
        browser.get(href)
        sleep(2)  # crude wait for the JS to populate the images
        resp1 = browser.page_source
        browser.quit()  # close Chrome, otherwise each loop iteration leaks a process
        tree1 = etree.HTML(resp1)
        img_list = tree1.xpath('//*[@id="__layout"]/div/div/div[2]/div[1]/div[4]/img')
        title = tree1.xpath('/html/head/title/text()')[0][:4].replace(' ', '')
        title_path = m_path + f'/{title}'
        if not os.path.exists(title_path):
            os.mkdir(title_path)
        for i, img in enumerate(img_list):
            jpg_url = img.xpath('./@data-src')[0]
            jpg_data = requests.get(url=jpg_url, headers=headers).content
            jpg_name = href.split('/')[-1] + f'_{i}' + '.jpg'
            jpg_path = title_path + f'/{jpg_name}'
            with open(jpg_path, 'wb') as fp:
                fp.write(jpg_data)
            print(jpg_name, 'downloaded')
        # Stitch the tiles into one long image
        IMAGES_PATH = title_path + '/'              # folder holding this chapter's tiles
        IMAGES_FORMAT = ['.jpg', '.JPG']            # accepted extensions
        IMAGE_SIZE = 750                            # side length of each tile in the grid
        IMAGE_COLUMN = 2                            # number of columns in the stitched image
        IMAGE_SAVE_PATH = m_path + f'/{title}.jpg'  # where the stitched image is written
        # Collect every image file name in the chapter folder
        image_names = [name for name in os.listdir(IMAGES_PATH) for item in IMAGES_FORMAT if
                       os.path.splitext(name)[1] == item]
        # Sort by the numeric slice index after the underscore, so tile 10 follows tile 9
        image_names.sort(key=lambda x: int(x[:-4].split('_')[-1]))
        # Rows = ceil(number of tiles / columns)
        IMAGE_ROW_yu = len(image_names) % IMAGE_COLUMN
        if IMAGE_ROW_yu == 0:
            IMAGE_ROW = len(image_names) // IMAGE_COLUMN
        else:
            IMAGE_ROW = len(image_names) // IMAGE_COLUMN + 1
        print('Sorted file names:')
        print("image_names", image_names)
        image_compose()  # build and save the long image
        print('Stitching done')
        shutil.rmtree(title_path)  # remove the per-chapter tile folder
        print('Folder removed')
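To see how image_compose places tiles, here is a dry run of the grid math with made-up numbers: the canvas is filled column by column, and tile index IMAGE_ROW*(y-1) + x-1 lands at pixel ((y-1)*IMAGE_SIZE, (x-1)*IMAGE_SIZE):

# Dry run with 5 tiles and 2 columns (hypothetical numbers)
IMAGE_SIZE, IMAGE_COLUMN, num_tiles = 750, 2, 5
IMAGE_ROW = -(-num_tiles // IMAGE_COLUMN)  # ceiling division -> 3 rows
for y in range(1, IMAGE_COLUMN + 1):
    for x in range(1, IMAGE_ROW + 1):
        idx = IMAGE_ROW * (y - 1) + x - 1
        if idx >= num_tiles:
            break
        print(f'tile {idx} -> top-left pixel ({(y - 1) * IMAGE_SIZE}, {(x - 1) * IMAGE_SIZE})')
# tiles 0-2 fill column 1 top to bottom, tiles 3-4 fill column 2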
My machine is rather weak, and when running the multithreaded version some threads die, so I won't dwell on it here (see the note after the code).
The multithreaded version:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/12/19 15:50
# @Author : huni
# @File : 快看漫画多线程.py
# @Software: PyCharm
import requests
from lxml import etree
import os
from threading import Thread
from queue import Queue
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep
import PIL.Image as Image
import shutil
# Producer: fetch rendered page sources and push them onto html_queue
class CrawlInfo(Thread):
    def __init__(self, url_queue, html_queue):
        Thread.__init__(self)
        self.url_queue = url_queue
        self.html_queue = html_queue

    def run(self):
        while not self.url_queue.empty():
            url = self.url_queue.get()
            # Headless browser renders the dynamically loaded images
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            browser = webdriver.Chrome(options=chrome_options)
            browser.get(url)
            sleep(2)  # crude wait for the JS to populate the images
            resp1 = browser.page_source
            browser.quit()  # close Chrome to avoid leaking a process per URL
            self.html_queue.put(resp1)
# Consumer: parse each page source, download the tiles, then stitch them
class ParseInfo(Thread):
    def __init__(self, html_queue):
        Thread.__init__(self)
        self.html_queue = html_queue

    def run(self):
        while not self.html_queue.empty():
            tree1 = etree.HTML(self.html_queue.get())
            img_list = tree1.xpath('//*[@id="__layout"]/div/div/div[2]/div[1]/div[4]/img')
            title = tree1.xpath('/html/head/title/text()')[0][:4].replace(' ', '')
            href0 = tree1.xpath('/html/head/meta[12]/@content')[0]  # chapter URL taken from the page's meta tags
            title_path = m_path + f'/{title}'
            if not os.path.exists(title_path):
                os.mkdir(title_path)
            for i, img in enumerate(img_list):
                jpg_url = img.xpath('./@data-src')[0]
                jpg_data = requests.get(url=jpg_url, headers=headers).content
                jpg_name = href0.split('/')[-1] + f'_{i}' + '.jpg'
                jpg_path = title_path + f'/{jpg_name}'
                with open(jpg_path, 'wb') as fp:
                    fp.write(jpg_data)
                print(jpg_name, 'downloaded')
            # Stitch the tiles into one long image (same logic as the single-threaded version)
            IMAGES_PATH = title_path + '/'              # folder holding this chapter's tiles
            IMAGES_FORMAT = ['.jpg', '.JPG']            # accepted extensions
            IMAGE_SIZE = 750                            # side length of each tile in the grid
            IMAGE_COLUMN = 2                            # number of columns in the stitched image
            IMAGE_SAVE_PATH = m_path + f'/{title}.jpg'  # where the stitched image is written
            # Collect every image file name in the chapter folder
            image_names = [name for name in os.listdir(IMAGES_PATH) for item in IMAGES_FORMAT if
                           os.path.splitext(name)[1] == item]
            # Sort by the numeric slice index after the underscore
            image_names.sort(key=lambda x: int(x[:-4].split('_')[-1]))
            # Rows = ceil(number of tiles / columns)
            IMAGE_ROW_yu = len(image_names) % IMAGE_COLUMN
            if IMAGE_ROW_yu == 0:
                IMAGE_ROW = len(image_names) // IMAGE_COLUMN
            else:
                IMAGE_ROW = len(image_names) // IMAGE_COLUMN + 1
            print('Sorted file names:')
            print("image_names", image_names)
            to_image = Image.new('RGB', (IMAGE_COLUMN * IMAGE_SIZE, IMAGE_ROW * IMAGE_SIZE))  # blank canvas
            # Paste each tile into its grid slot, column by column
            total_num = 0
            for y in range(1, IMAGE_COLUMN + 1):  # columns
                for x in range(1, IMAGE_ROW + 1):  # rows
                    from_image = Image.open(IMAGES_PATH + image_names[IMAGE_ROW * (y - 1) + x - 1]).resize(
                        (IMAGE_SIZE, IMAGE_SIZE), Image.ANTIALIAS)
                    to_image.paste(from_image, ((y - 1) * IMAGE_SIZE, (x - 1) * IMAGE_SIZE))
                    total_num += 1
                    if total_num == len(image_names):
                        break
            to_image.save(IMAGE_SAVE_PATH)  # write the stitched image
            print('Stitching done')
            shutil.rmtree(title_path)  # remove the per-chapter tile folder
            print('Folder removed')
if __name__ == '__main__':
    url_queue = Queue()
    html_queue = Queue()
    m_path = './快看漫画'
    if not os.path.exists(m_path):
        os.mkdir(m_path)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
    }
    url0 = 'https://www.kuaikanmanhua.com/web/topic/4832/'
    resp = requests.get(url=url0, headers=headers).text
    tree = etree.HTML(resp)
    div_list = tree.xpath('//*[@id="__layout"]/div/div/div[2]/div/div[1]/div[2]/div')
    for div in div_list:
        href = 'https://www.kuaikanmanhua.com' + div.xpath('./div[2]/a/@href')[0]
        if href == 'https://www.kuaikanmanhua.comjavascript:void(0);':
            continue  # skip locked chapters
        url_queue.put(href)
    crawl_list = []
    for i in range(5):
        Crawl = CrawlInfo(url_queue, html_queue)
        crawl_list.append(Crawl)
        Crawl.start()
    for crawl in crawl_list:
        crawl.join()
    parse_list = []
    for i in range(5):
        parse = ParseInfo(html_queue)
        parse_list.append(parse)
        parse.start()
    for parse in parse_list:
        parse.join()
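A note on why threads can die here: checking queue.empty() and then calling get() is not atomic, so two workers may both see one item left; one takes it and the other blocks on get() forever. Likewise, a ParseInfo thread that starts before any page source has arrived sees an empty queue and exits immediately. A common fix is to feed each worker a sentinel value; here is a minimal sketch of that pattern (illustrative only, not the code above):

from queue import Queue
from threading import Thread

NUM_WORKERS = 5
SENTINEL = None  # pushed once per worker to signal "no more work"

def worker(q):
    while True:
        item = q.get()  # blocks instead of racing on empty()
        if item is SENTINEL:
            break  # clean shutdown
        print('processing', item)

q = Queue()
for item in ['page1', 'page2', 'page3']:  # stand-ins for page sources
    q.put(item)
for _ in range(NUM_WORKERS):
    q.put(SENTINEL)  # one sentinel per worker

threads = [Thread(target=worker, args=(q,)) for _ in range(NUM_WORKERS)]
for t in threads:
    t.start()
for t in threads:
    t.join()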
Let today's unhappiness stop right here; tomorrow this bald head will shine bright as ever, baby!