import logging
import os
import sys
import time

import bs4
import scrapy
import wget

logger = logging.getLogger(__name__)
ENCODE = 'GB18030'
HTML_ENCODE = 'utf8'
DIR_NAME = r'e:\meinv\images'
ATTR_HREF = 'href'
ATTR_SRC = 'src'
DOWNLOAD_FAIL_LST = []
MAX_FAIL_COUNT = 50
FAIL_WAIT_COUNT = 5
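
# Failure policy implied by the constants above: the crawl aborts outright
# once more than MAX_FAIL_COUNT downloads have failed, and download() pauses
# for a minute after every FAIL_WAIT_COUNT-th failure to back off the server.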
def init_dir(tag):
    # Ensure the per-tag image directory exists and return its path.
    path = DIR_NAME + os.sep + tag + os.sep
    if not os.path.exists(path):
        os.makedirs(path)
    return path

def download(tag, src):
    path = init_dir(tag)
    filename = path + src.split('/')[-1]
    try:
        # Skip files that were already fetched on a previous run.
        if not os.path.isfile(filename):
            wget.download(src, filename)
    except Exception:
        DOWNLOAD_FAIL_LST.append(src)
        if len(DOWNLOAD_FAIL_LST) > MAX_FAIL_COUNT:
            print('too many downloads have failed, giving up')
            sys.exit(0)
        if (len(DOWNLOAD_FAIL_LST) % FAIL_WAIT_COUNT) == 0:
            print('pausing for a minute after repeated download failures ...')
            time.sleep(60)
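
# Example with a hypothetical tag and URL: download('xxx', 'http://host/a/1.jpg')
# would save the image as e:\meinv\images\xxx\1.jpg, skipping it if present.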

class MeinvSpider(scrapy.Spider):
    name = 'meinv'
    allowed_domains = ['www.4j4j.cn']
    start_urls = [
        'http://www.4j4j.cn/beauty/',
    ]
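
    # Crawl flow: parse() reads the tag index, read_tag() walks each tag's
    # paginated album list, ReadAlbum.read_album() opens every album, and
    # read_page() hands each image URL to download().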
    class ReadAlbum(object):
        def __init__(self, tag):
            self.tag = tag

        # Download the picture shown on a single photo page.
        def read_page(self, response):
            b = bs4.BeautifulSoup(response.body)
            r = b.findAll(name='div', attrs={'class': 'pic-image-wrapper'})
            if not r:
                logger.error('read page error!')
                return
            imgs = r[0].findAll('img')
            if imgs:
                download(self.tag, imgs[0][ATTR_SRC].encode(ENCODE))
        # Walk an album page and fetch every photo page it links to.
        def read_album(self, response):
            b = bs4.BeautifulSoup(response.body)
            r = b.findAll(name='ul', attrs={'class': 'pic-thumb-list'})
            if not r:
                logger.error('read album error!')
                return
            for a in r[0].findAll('a'):
                href = a.get(ATTR_HREF)
                if href:
                    yield scrapy.Request(href, callback=self.read_page)

    def read_tag(self, response):
        b = bs4.BeautifulSoup(response.body)
        r = b.findAll(name='ul', attrs={'id': 'pic-list'})
        if len(r) == 0:
            self.log('read tag error!')
            return
        current = b.findAll(name='li', attrs={'class': 'current'})
        if not current:
            self.log('read current tag error!')
            return
        tag = current[0].findAll('a')[0].string.encode(ENCODE)
        # Queue every album reachable through a magnifier link on this page.
        for span in r[0].findAll(name='span', attrs={'class': 'magnifier'}):
            links = span.findAll('a')
            if not links:
                continue
            href = links[0].get(ATTR_HREF)
            if href:
                ra = self.ReadAlbum(tag)
                yield scrapy.Request(href.encode(ENCODE), callback=ra.read_album)
        # Jump to the next tag page, if one exists.
        after = b.findAll('a', attrs={'class': 'after'})
        if after:
            href = after[0].get(ATTR_HREF)
            if href:
                yield scrapy.Request(href, callback=self.read_tag)

    def read_tags(self, body):
        b = bs4.BeautifulSoup(body)
        r = b.findAll(name='ul', attrs={'class': 'tags clearfix'})
        if len(r) != 1:
            self.log('read tags error!')
            return {}
        # Map each tag name to the URL of its picture index.
        ret = {}
        for a in r[0].findAll(name='a'):
            href = a.get(ATTR_HREF)
            if a.string and href:
                ret[a.string.encode(ENCODE)] = href.encode(ENCODE)
        return ret

    def parse(self, response):
        tags = self.read_tags(response.body)
        for name in tags:
            yield scrapy.Request(tags[name], callback=self.read_tag)
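
# Sketch of how to run (assuming a standard Scrapy project layout): place this
# file in the project's spiders/ package and start the crawl with
#   scrapy crawl meinv
# Note this is Python 2-era code: the .encode(ENCODE) calls produce GB18030
# byte strings for the Chinese tag names used in directory paths, and the
# selectors assume the 4j4j.cn markup at the time the spider was written.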