python多线程爬虫爬取图片

# 先导入所需模块
import os
import re
import threading

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
# Build request headers with a randomized User-Agent so requests look
# like they come from an ordinary browser (basic anti-crawl evasion).
# NOTE(review): UserAgent() may hit the network on first use to fetch
# its UA database — confirm this is acceptable at import time.
headers = {
    "User-Agent": UserAgent().random
}

# Crawl the target listing pages.

def crawl(n):  # n is the last listing page number to crawl
    """Crawl listing pages 2..n and spawn one download thread per image.

    The site names its listing pages index_2.html, index_3.html, ...
    (page 1 is plain index.html), so crawling starts at page 2 like the
    original implementation.

    n -- last listing page number to fetch (inclusive).
    """
    for page in range(2, n + 1):
        url = "http://pic.netbian.com/4kfengjing/index_" + str(page) + ".html"
        res = requests.get(url=url, headers=headers)
        # The site is served in GBK, not UTF-8.
        html = res.content.decode('gbk')
        bs = BeautifulSoup(html, "html.parser")
        for item in bs.select(".clearfix li"):
            # BUG FIX: the original used re.findall(r'', str(i))[0] — an
            # EMPTY regex pattern — which always yields '' and made every
            # detail URL just the site root. Read the anchor's href
            # directly from the parsed tree instead.
            link = item.find('a')
            if link is None or not link.get('href'):
                continue  # skip malformed list entries instead of crashing
            img_url = "http://pic.netbian.com" + link.get('href')
            th = threading.Thread(target=crawlImg, args=(img_url,))  # one worker thread per image
            th.start()

# Parse a detail page and download its image.

def crawlImg(url):
    """Download the full-size wallpaper behind one detail page into img/.

    url -- absolute URL of an image detail page on pic.netbian.com.

    Silently returns if the expected page structure is missing (e.g. an
    anti-crawl interstitial or a layout change).
    """
    res = requests.get(url=url, headers=headers)
    # The site is served in GBK, not UTF-8.
    html = res.content.decode('gbk')
    bs = BeautifulSoup(html, "html.parser")
    name_nodes = bs.select(".view .photo-hd h1")
    img_nodes = bs.select(".view .photo-pic #img img")
    if not name_nodes or not img_nodes:
        # Guard the original unchecked [0] indexing: missing selectors
        # previously raised IndexError and killed the worker thread.
        return
    img_name = name_nodes[0].text
    img_url = "http://pic.netbian.com" + img_nodes[0].get('src')
    print(img_url, end=" ")
    print(img_name)
    con = requests.get(img_url, headers=headers)  # send the same UA as other requests
    # Save the image into the img folder. The original assumed img/
    # already existed (open() raised FileNotFoundError otherwise) and
    # leaked the file handle on a write error — create the directory and
    # use a context manager so the handle is always closed.
    os.makedirs('img', exist_ok=True)
    with open('img/' + img_name + '.jpg', 'wb') as f:
        f.write(con.content)

# Run the project.

if __name__ == "__main__":
    # Entry point: crawl the first 10 listing pages.
    LAST_PAGE = 10
    crawl(LAST_PAGE)

 

你可能感兴趣的:(爬虫)