import requests                   # 模块导入的俩种方法
from multiprocessing import Pool
import re

def get(url, timeout=10):
    """Download a page and return its text decoded as GBK.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded as GBK when the server answers with
        status 200; ``None`` for any other status code.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # Without a timeout a stalled server would block the caller forever.
    ret = requests.get(url, timeout=timeout)
    if ret.status_code != 200:
        return None
    # Bytes that are not valid GBK are replaced instead of aborting the
    # whole page with UnicodeDecodeError.
    return ret.content.decode('gbk', errors='replace')

def call_back(arg):
    """Extract picture entries from a page, then download and save each one.

    Args:
        arg: HTML text of a listing page, or ``None`` when the page could
            not be fetched (``get`` returns ``None`` for non-200 replies).

    Returns:
        A list of dicts with the keys ``'png'``, ``'name'`` and ``'place'``,
        one per match of the module-level pattern ``com``; an empty list
        when ``arg`` is ``None``.
    """
    # A failed page arrives as None; finditer(None) would raise TypeError,
    # so bail out before touching the pattern.
    if arg is None:
        return []

    dict_lst = [
        {
            'png': match.group('png'),
            'name': match.group('name'),
            'place': match.group('place'),
        }
        for match in com.finditer(arg)
    ]
    for entry in dict_lst:
        res = subget(entry['png'])
        # subget() yields None for a non-200 reply; there is nothing to save.
        if res is None:
            continue
        write_func(entry['name'], entry['place'], res)

    return dict_lst

def subget(url, timeout=10):
    """Download the raw bytes of a picture.

    Args:
        url: Picture address as found in the page; either an absolute URL
            or a path relative to ``http://www.xiaohuar.com``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as ``bytes`` when the server answers with status
        200; ``None`` for any other status code.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    from urllib.parse import urljoin

    # urljoin leaves absolute URLs (http:// as well as https://) untouched
    # and resolves relative paths against the site root. A plain substring
    # test for 'https' would glue the site prefix onto http:// URLs.
    target = urljoin('http://www.xiaohuar.com', url)
    ret = requests.get(target, timeout=timeout)
    if ret.status_code == 200:
        return ret.content
    return None

def write_func(path, place, picture):
    """Save picture bytes as ``<path>-<place>.png`` in the output folder.

    Args:
        path: First part of the file name (the caller passes the scraped
            ``name`` field here).
        place: Second part of the file name.
        picture: Raw image bytes. When ``None`` nothing is written.

    Returns:
        None.
    """
    # A failed download arrives as None; writing it would raise TypeError
    # after an empty file had already been created.
    if picture is None:
        return

    def _clean(part):
        # The parts come from scraped HTML: replace path separators,
        # characters Windows forbids in file names, and control characters
        # so the text cannot break or redirect the target path.
        return re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', part)

    with open(r'E:\text1\爬虫\text_png\%s-%s.png' % (_clean(path), _clean(place)), 'wb') as f:
        f.write(picture)

'''Shape of the page URLs I want to crawl'''
'''http://www.xiaohuar.com/list-1-0.html'''
'''http://www.xiaohuar.com/list-1-43.html'''

if __name__ == '__main__':
    # Pattern that call_back() applies to every listing page. It must expose
    # the named groups 'png', 'name' and 'place', which call_back() reads.
    # NOTE(review): the literal HTML markup that anchored these groups is
    # missing from this copy of the file, and the group names were restored
    # from the keys call_back() uses (their order is assumed).
    # TODO: re-insert the tag text around each group before relying on the
    # matches; without it 'name' and 'place' match empty strings.
    com = re.compile(
        '(?:.*?)src="(?P<png>.*?)"(?:.*?)(?P<name>.*?)(?:.*?)'
        '(?P<place>.*?)', re.S)

    pool = Pool(3)
    res_lst = []
    for i in range(40):
        # NOTE(review): per the multiprocessing docs a callback runs in the
        # parent's result-handling thread, so the picture downloads done by
        # call_back() are serialized there -- a likely reason the crawl is
        # slow; confirm.
        pool.apply_async(
            get,
            args=('http://www.xiaohuar.com/list-1-%s.html' % i,),
            callback=call_back,
        )
    pool.close()
    pool.join()

# Drawback: crawling is slow; at most 17 pages (frustrating).