
Comics/Comics/spiders/comics.py

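The spider below starts from the comics list at http://www.xeall.com/shenshi, collects the link of every comic on each list page and follows the pagination to the next list page in parse, then walks through each comic page by page in comics_parse and saves every page image to a local folder in save_img.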

# coding: utf-8

import scrapy
import os
import urllib.request
import zlib
from bs4 import BeautifulSoup


class Comics(scrapy.Spider):

    name = "comics"
    # allowed_domains = ['http://www.xeall.com']
    # start_urls = ['http://www.xeall.com/shenshi/p20.html']

    def start_requests(self):
        # urls = ['http://www.xeall.com/shenshi/6458.html']
        urls = ['http://www.xeall.com/shenshi']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)
            # yield scrapy.Request(url=url, callback=self.comics_parse)

    def parse(self, response):
        # HTML source returned by the request
        content = response.body
        if not content:
            self.log('parse body error.')
            return

        # Parse the page with BeautifulSoup
        soup = BeautifulSoup(content, "html5lib")

        # Tag containing the comics list
        listcon_tag = soup.find('ul', class_='listcon')
        if listcon_tag is None:
            self.log('extract comics list error')
            return

        # Tag of each comic in the list
        com_a_list = listcon_tag.find_all('a', attrs={'href': True})
        if len(com_a_list) < 1:
            self.log('Cannot find any tag containing an href attribute.')
            return

        # Collect the URL of every comic on the current page
        comics_url_list = []
        base = 'http://www.xeall.com'
        for tag_a in com_a_list:
            url = base + tag_a['href']
            comics_url_list.append(url)

        print('\n>>>>>>>>>>>>>>>>>>> current page comics list <<<<<<<<<<<<<<<<<<<<')
        print(comics_url_list)

        # Handle every comic on the current page
        for url in comics_url_list:
            print('>>>>>>>>  parse comics: ' + url)
            yield scrapy.Request(url=url, callback=self.comics_parse)

        # # Crawl only the current page
        # return

        # Pagination bar below the comics list
        page_tag = soup.find('ul', class_='pagelist')
        if page_tag is None:
            self.log('extract page list error')
            return

        # URL of the next page
        page_a_list = page_tag.find_all('a', attrs={'href': True})
        if len(page_a_list) < 2:
            self.log('extract page tag a error.')
            return

        # Use the select control to decide whether this is the last page
        select_tag = soup.find('select', attrs={'name': 'sldd'})
        option_list = select_tag.find_all('option')

        # If the last option tag carries the selected attribute, this is the last page
        last_option = option_list[-1]
        current_option = select_tag.find('option', attrs={'selected': True})
        is_last = (last_option.string == current_option.string)

        if not is_last:
            # The last link is "last page", the second to last is "next page"
            next_page = 'http://www.xeall.com/shenshi/' + page_a_list[-2]['href']
            if next_page is not None:
                print('\n------ parse next page --------')
                print(next_page)
                yield scrapy.Request(next_page, callback=self.parse)
        else:
            print('========= Last page ==========')

    def comics_parse(self, response):
        # Extract the data of a single comic
        content = response.body
        if not content:
            self.log('parse comics body error.')
            return

        # Note the BeautifulSoup parser argument: 'html.parser' is not reliable here,
        # because some of the pages need lxml-style parsing, so html5lib is used instead
        soup = BeautifulSoup(content, "html5lib")

        # Pagination control tag
        page_list_tag = soup.find('ul', class_='pagelist')

        # Current page number
        current_li = page_list_tag.find('li', class_='thisclass')
        page_num = current_li.a.string
        self.log('current page = ' + page_num)

        # Tag showing the image of the current page
        li_tag = soup.find('li', id='imgshow')
        img_tag = li_tag.find('img')

        # URL of the current image
        img_url = img_tag['src']
        self.log('img url: ' + img_url)

        # Comic title
        # title = soup.title.string
        title = img_tag['alt']

        # Save the image to the local disk
        self.save_img(page_num, title, img_url)

        # URL of the next page's image; when the href of the "next page" tag is '#',
        # the current page is the last page of the comic
        a_tag_list = page_list_tag.find_all('a')
        next_page = a_tag_list[-1]['href']
        if next_page == '#':
            self.log('parse comics: ' + title + ' finished.')
        else:
            next_page = 'http://www.xeall.com/shenshi/' + next_page
            yield scrapy.Request(next_page, callback=self.comics_parse)

    def save_img(self, img_num, title, img_url):
        # Save the image to the local disk
        self.log('saving pic: ' + img_url)

        # Folder where the comics are stored
        document = '/Users/moshuqi/Desktop/cartoon'

        # Each comic's folder is named after its title
        comics_path = document + '/' + title
        exists = os.path.exists(comics_path)
        if not exists:
            self.log('create document: ' + title)
            os.makedirs(comics_path)

        # Each image is named after its page number
        pic_name = comics_path + '/' + img_num + '.jpg'

        # Skip the download if the image already exists locally
        exists = os.path.exists(pic_name)
        if exists:
            self.log('pic exists: ' + pic_name)
            return

        try:
            user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            headers = {'User-Agent': user_agent}
            req = urllib.request.Request(img_url, headers=headers)
            response = urllib.request.urlopen(req, timeout=30)

            # Data returned by the request
            data = response.read()

            # If the response is gzip-compressed, decompress it first
            if response.info().get('Content-Encoding') == 'gzip':
                data = zlib.decompress(data, 16 + zlib.MAX_WBITS)

            # Write the image to the local file
            with open(pic_name, "wb") as fp:
                fp.write(data)
            self.log('save image finished: ' + pic_name)
            # urllib.request.urlretrieve(img_url, pic_name)
        except Exception as e:
            self.log('save image error.')
            self.log(e)
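Inside a Scrapy project the spider is started with the usual scrapy crawl comics command. As a minimal sketch (assuming Scrapy is installed and the import path below is adjusted to wherever this module actually lives), it can also be launched directly from a script with Scrapy's CrawlerProcess:

# run_comics.py -- minimal sketch for running the spider from a script
from scrapy.crawler import CrawlerProcess

# Hypothetical import path; adjust to match the project layout
# (here: Comics/Comics/spiders/comics.py).
from Comics.spiders.comics import Comics

process = CrawlerProcess()
process.crawl(Comics)   # schedule the spider class defined above
process.start()         # start the reactor and block until crawling finishes

Note that CrawlerProcess created this way uses Scrapy's default settings, whereas scrapy crawl comics run inside the project picks up the project's settings.py.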

