# Python concurrent image crawler (gevent coroutines -- "multithreaded" in the original title, but these are greenlets, not OS threads)

from gevent import monkey, joinall, spawn

monkey.patch_all()
import requests
import re
import os

# Directory that all downloaded albums are stored under: <script dir>/meizitu2.
_script_dir = os.path.dirname(os.path.abspath(__file__))
BASE_DIR = os.path.join(_script_dir, 'meizitu2')

# Accumulates album-page URLs discovered by the listing scrapers below.
girl_list = []


def save_imgs(name, url):
    """Download the image referenced by the album page *url* into BASE_DIR/<name>.

    Fetches *url*, pulls the first ``img src="..."`` attribute out of the HTML,
    and writes that image to a file named after the last URL path segment.

    :param name: album title, used as the sub-directory name (sanitized here)
    :param url:  album page URL to scrape the image link from
    """
    # Strip characters that are illegal in Windows directory names.
    name = name.replace('?', '').replace(':', ' ')
    target_dir = os.path.join(BASE_DIR, name)
    if not os.path.isdir(target_dir):
        # BUG FIX: os.mkdir() raised FileNotFoundError when BASE_DIR itself
        # did not exist yet; makedirs creates the whole chain.
        os.makedirs(target_dir)
        print('create path', target_dir)
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36',
        # BUG FIX: the original keys/values contained stray spaces
        # ('cache - control': 'no - cache'), which are not valid HTTP
        # header names/values and would be rejected or ignored by servers.
        'cache-control': 'no-cache',
        'pragma': 'no-cache',
        'upgrade-insecure-requests': '1',
        # The Referer is required by this site's hotlink protection.
        'Referer': "https://www.mzitu.com/1",
    }
    # BUG FIX: the page fetch previously sent no headers and had no timeout,
    # so an anti-bot block or a stalled connection hung the greenlet forever.
    res = requests.get(url, headers=headers, timeout=10)
    match = re.search(r'img src="(.*?)"', res.text)
    if match is None:
        # BUG FIX: .group(1) on a failed search raised AttributeError and
        # killed the greenlet; skip pages without an image instead.
        print('no image found on', url)
        return
    source = match.group(1)
    with open(os.path.join(target_dir, source.split('/')[-1]), 'wb') as f:
        f.write(requests.get(source, headers=headers, timeout=3).content)
    print('download successful:', source)


# NOTE(review): everything from here to the end of the file was corrupted when
# this source was scraped from a web page.  The HTML-tag content of the regex
# patterns was stripped out (leaving unterminated r'...' literals), and the
# bodies of three functions -- get_girl_pics, get_all_grils, get_url_lists --
# were collapsed onto single lines; get_url_lists is truncated mid-statement.
# The code below is NOT valid Python.  It is kept verbatim as a reference;
# the original regex patterns must be recovered from the source post
# (presumably matching the site's <h2>/<li>/<a> markup) before this can run.
def get_girl_pics(url):
    # presumably: scrape the album title and page count, then call
    # save_imgs() for every page of the album -- TODO confirm against the
    # original post; the title regex below lost its pattern text.
    res = requests.get(url)
    last_page = re.findall(r'(\d+)', res.text)[-1]
    title = re.search(r'

(.*?)

', res.text).group(1) save_imgs(title, url) for i in range(2, int(last_page) + 1): save_imgs(title, ''.join([url, '/', str(i)])) def get_all_grils(url): global girl_list res = requests.get(url) pages = re.findall(r'
  • ',res.text) for i in pages: girl_list.append(i) def get_url_lists(): url = 'https://www.mzitu.com/mm' url_pages = [url] res = requests.get(url) girl_pages = re.findall(r'(.*?)
  • (blog footer, not code) You may also be interested in: (personal practice)