Process pool + shared memory: crawling photo galleries from the mm131 site

The code is rough; treat it as a toy.
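
The pattern behind the title is worth isolating before the full script: a multiprocessing.Pool runs the workers, and a Manager().list() acts as the shared memory they all read and mutate. Here is a minimal standalone sketch of just that pattern (the square() worker is hypothetical, not part of the crawler):

import multiprocessing

def square(x, results):
    # Every worker appends into the same Manager-backed list proxy.
    results.append(x * x)

if __name__ == '__main__':
    results = multiprocessing.Manager().list()  # shared, process-safe list
    pool = multiprocessing.Pool(4)
    for i in range(10):
        pool.apply_async(square, (i, results))
    pool.close()
    pool.join()  # wait for all workers to finish
    print(sorted(results))  # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

The crawler below uses this same mechanism for all three of its stages, passing the shared lists into each worker as arguments.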

import multiprocessing

import requests
from lxml import etree

def download(a, b, url1, headers, n, items2_list):
    # The image host checks the Referer, so it must point at a gallery page.
    u = 'http://www.mm131.com/xinggan/' + b
    headers1 = {
        'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'img1.mm131.me',
        'Referer': u,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    }
    print(u)
    # Fetch the per-image page and extract the <img src> URL from it.
    req3 = requests.get(url1 + a, headers=headers)
    url_f = etree.HTML(req3.content).xpath("//div/div/a/img/@src")
    url_str = ''.join(url_f)

    with open("F:\\娱乐\\爬图片\\131mm\\xgmm2\\" + "xgmm" + str(n) + ".jpg", "wb") as f:
        print("保存第{}张,还剩{}张".format(n,len(items2_list)))
        f.write((requests.get(url_str, headers=headers1)).content)
    items2_list.remove(a)

def htmlparser2(i, headers, items2_list, items_list):
    # Parse one gallery page and collect links to its per-image pages.
    resp2 = requests.get(i, headers=headers)
    resp2.encoding = 'utf8'
    root2 = etree.HTML(resp2.content)
    items2 = root2.xpath('//div[@class="content-page"]/a/@href')
    items2 = list(set(items2))  # de-duplicate the pager links
    items2_list.extend(items2)
    items_list.remove(i)  # one gallery leaves the shared list per parse
    print("Pages in this gallery:", len(items2))

def htmlparser(i, headers, items_list):
    # Parse one list page and collect the gallery links on it.
    url = "http://www.mm131.com/xinggan/list_6_{}.html".format(i)
    req = requests.get(url, headers=headers)
    root = etree.HTML(req.content)
    items = root.xpath('//dl[@class="list-left public-box"]/dd/a[@target="_blank"]/@href')
    print("List page {}".format(i))
    items_list.extend(items)

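# Pipeline: stage 1 gathers gallery links from the list pages, stage 2 expands
# each gallery into per-image page links, stage 3 downloads the images.
# All stages communicate through Manager-backed shared lists.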
if __name__ == '__main__':

    url1 = "http://www.mm131.com/xinggan/"
    headers = {
        'User-Agent': 'Baiduspider+(+http://www.baidu.com/search/spider.html)',
        'Referer': 'https://www.baidu.com/link?url=mPARC6e0QgmXiBEX1UCXo62Hsl1XIxYOsAVJUsS9R_SumSXwtLn3_XcPCIxWUC7U&wd=&eqid=af85b81300074c38000000025ae83304',
        'Cookie': 'UM_distinctid=162299332741c7-07936edc87635f-7b113d-100200-162299332753a2; bdshare_firstime=1521115935504; CNZZDATA3866066=cnzz_eid%3D306650475-1494676185-https%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1494676185; Hm_lvt_9a737a8572f89206db6e9c301695b55a=1525162468,1525165327,1525165615,1525165652; Hm_lpvt_9a737a8572f89206db6e9c301695b55a=1525165916',
        'Upgrade-Insecure-Requests': '1',
        'Host': 'www.mm131.com',
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9'
    }
    n = 1
    # Stage 1: collect gallery links from list pages 2..139 in parallel.
    items_list = multiprocessing.Manager().list()
    pool = multiprocessing.Pool(40)
    for i in range(2, 140):
        pool.apply_async(htmlparser, (i, headers, items_list))

    pool.close()
    pool.join()
    pool1 = multiprocessing.Pool(40)
    pool2 = multiprocessing.Pool(40)  # 40 worker processes for downloading
    # Stage 2: expand each gallery into links to its per-image pages.
    print(len(items_list))
    items2_list = multiprocessing.Manager().list()
    # Caution: htmlparser2 workers remove entries from items_list while this
    # loop iterates it, so the remove() is bookkeeping rather than a safe queue.
    for i in items_list:
        print("Running stage 2")
        print(i)
        pool1.apply_async(htmlparser2, (i, headers, items2_list, items_list))
    pool1.close()
    pool1.join()
    print(items2_list)
    # Stage 3: download every per-image page collected in stage 2.
    for a in items2_list:
        print("*************************** downloading *****************************")
        # Hrefs presumably look like '3952_2.html'; this yields '3952.html',
        # the gallery's first page, which download() uses as the Referer.
        b = a[:4] + a[-5:]
        print(b)
        n += 1
        pool2.apply_async(download, (a, b, url1, headers, n, items2_list))
    pool2.close()
    pool2.join()
    print("over")
