此次爬取淘宝商家图片,是为了对相关行业(此处以激光雷达为例)的产品结构以及外观设计进行对比。由于淘宝的反爬机制非常强大,而笔者能力有限,只能用比较简单的办法进行爬取。淘宝每一页的数据都存放在对应的 js 文件(JSONP 接口的返回结果)里面,所以我们只需获取这些 js 文件对应的 url,从中解析出图片链接,最后使用多线程下载并保存图片即可。
# -*- coding: utf-8 -*-
# @Time : 2021/4/16 17:31
# @Author : 陈良兴
# @File : 多线程获取激光雷达图片.py
# @Software : PyCharm
import requests
import json # 数据交换格式
import os
import threading
if not os.path.exists(r'test/pictures'):
os.mkdir(r'test/pictures')
# 设置最大线程锁
thread_lock = threading.BoundedSemaphore(value=20)
# 获取图片链接
def get_url():
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4455.2 Safari/537.36',
'referer': 'https://ai.taobao.com/',
'cookie': 'cna=7RLoGCIMsTMCAd9KVjT44b5z; xlly_s=1; _m_h5_tk=2b4885f8cb7a934289c07d8f8f1a4dad_1618576985077; _m_h5_tk_enc=ecec9968703e0eceb2376fa5af7eaedd; tfstk=ctBhB64tX6RCXfOGGJ9CAXUuyWFOZrneRIdNQTDW3DzNMs6NiPlZg2PERFrgIb1..; l=eBx1oRSejwHJF1fMBOfanurza77OSIRYSuPzaNbMiOCP961p5uOcW6asCHY9C3GVh6y9R3u2eEtTBeYBqI2jPGK3X2uPIODmn; isg=BP7-BAP8EtDkEUb05ICumJyPTxRAP8K5G-WoWqgHasE8S54lEM8SySQtwxeH9rrR',
}
url_list = [
'https://h5api.m.taobao.com/h5/mtop.alimama.union.xt.en.api.entry/1.0/?jsv=2.5.1&appKey=12574478&t=1618567358196&sign=99afe8a1b1edd608cd6aaedb7f348e44&api=mtop.alimama.union.xt.en.api.entry&v=1.0&AntiCreep=true&timeout=20000&AntiFlood=true&type=jsonp&dataType=jsonp&callback=mtopjsonp2&data=%7B%22pNum%22%3A0%2C%22pSize%22%3A%2260%22%2C%22refpid%22%3A%22mm_43125636_4246598_110178550458%22%2C%22variableMap%22%3A%22%7B%5C%22q%5C%22%3A%5C%22%E6%BF%80%E5%85%89%E9%9B%B7%E8%BE%BE%5C%22%2C%5C%22navigator%5C%22%3Atrue%2C%5C%22union_lens%5C%22%3A%5C%22recoveryid%3A201_11.1.240.219_2704532_1618558195481%3Bprepvid%3A201_11.23.83.213_2737140_1618567341667%5C%22%2C%5C%22recoveryId%5C%22%3A%5C%22201_11.186.139.24_2738689_1618567356079%5C%22%7D%22%2C%22qieId%22%3A%2234374%22%2C%22spm%22%3A%22a2e1u.19484427.29996460%22%2C%22app_pvid%22%3A%22201_11.186.139.24_2738689_1618567356079%22%2C%22ctm%22%3A%22spm-url%3Aa2e1u.19484427.search.1%3Bpage_url%3Ahttps%253A%252F%252Fai.taobao.com%252Fsearch%252Findex.htm%253Fkey%253D%2525E6%2525BF%252580%2525E5%252585%252589%2525E9%25259B%2525B7%2525E8%2525BE%2525BE%2526pid%253Dmm_43125636_4246598_110178550458%2526union_lens%253Drecoveryid%25253A201_11.1.240.219_2704532_1618558195481%25253Bprepvid%25253A201_11.23.83.213_2737140_1618567341667%2526spm%253Da2e1u.19484427.search.1%22%7D',
'https://h5api.m.taobao.com/h5/mtop.alimama.union.xt.en.api.entry/1.0/?jsv=2.5.1&appKey=12574478&t=1618567450790&sign=f51540e32e131c78eca058441759c2bc&api=mtop.alimama.union.xt.en.api.entry&v=1.0&AntiCreep=true&timeout=20000&AntiFlood=true&type=jsonp&dataType=jsonp&callback=mtopjsonp2&data=%7B%22pNum%22%3A1%2C%22pSize%22%3A%2260%22%2C%22refpid%22%3A%22mm_43125636_4246598_110178550458%22%2C%22variableMap%22%3A%22%7B%5C%22q%5C%22%3A%5C%22%E6%BF%80%E5%85%89%E9%9B%B7%E8%BE%BE%5C%22%2C%5C%22navigator%5C%22%3Atrue%2C%5C%22union_lens%5C%22%3A%5C%22recoveryid%3A201_11.1.240.219_2704532_1618558195481%3Bprepvid%3A201_11.23.83.213_2737140_1618567341667%5C%22%2C%5C%22recoveryId%5C%22%3A%5C%22201_11.1.81.73_2737071_1618567449080%5C%22%7D%22%2C%22qieId%22%3A%2234374%22%2C%22spm%22%3A%22a2e1u.19484427.29996460%22%2C%22app_pvid%22%3A%22201_11.1.81.73_2737071_1618567449080%22%2C%22ctm%22%3A%22spm-url%3Aa2e1u.19484427.29996460.1%3Bpage_url%3Ahttps%253A%252F%252Fai.taobao.com%252Fsearch%252Findex.htm%253Fkey%253D%2525E6%2525BF%252580%2525E5%252585%252589%2525E9%25259B%2525B7%2525E8%2525BE%2525BE%2526pid%253Dmm_43125636_4246598_110178550458%2526union_lens%253Drecoveryid%25253A201_11.1.240.219_2704532_1618558195481%25253Bprepvid%25253A201_11.23.83.213_2737140_1618567341667%2526spm%253Da2e1u.19484427.29996460.1%2526pnum%253D1%22%7D',
'https://h5api.m.taobao.com/h5/mtop.alimama.union.xt.en.api.entry/1.0/?jsv=2.5.1&appKey=12574478&t=1618567475602&sign=61ea96453b2bc13c04b7012249d609d4&api=mtop.alimama.union.xt.en.api.entry&v=1.0&AntiCreep=true&timeout=20000&AntiFlood=true&type=jsonp&dataType=jsonp&callback=mtopjsonp2&data=%7B%22pNum%22%3A2%2C%22pSize%22%3A%2260%22%2C%22refpid%22%3A%22mm_43125636_4246598_110178550458%22%2C%22variableMap%22%3A%22%7B%5C%22q%5C%22%3A%5C%22%E6%BF%80%E5%85%89%E9%9B%B7%E8%BE%BE%5C%22%2C%5C%22navigator%5C%22%3Atrue%2C%5C%22union_lens%5C%22%3A%5C%22recoveryid%3A201_11.1.240.219_2704532_1618558195481%3Bprepvid%3A201_11.23.83.213_2737140_1618567341667%5C%22%2C%5C%22recoveryId%5C%22%3A%5C%22201_11.23.94.254_2738523_1618567473696%5C%22%7D%22%2C%22qieId%22%3A%2234374%22%2C%22spm%22%3A%22a2e1u.19484427.29996460%22%2C%22app_pvid%22%3A%22201_11.23.94.254_2738523_1618567473696%22%2C%22ctm%22%3A%22spm-url%3Aa2e1u.19484427.29996460.1%3Bpage_url%3Ahttps%253A%252F%252Fai.taobao.com%252Fsearch%252Findex.htm%253Fkey%253D%2525E6%2525BF%252580%2525E5%252585%252589%2525E9%25259B%2525B7%2525E8%2525BE%2525BE%2526pid%253Dmm_43125636_4246598_110178550458%2526union_lens%253Drecoveryid%25253A201_11.1.240.219_2704532_1618558195481%25253Bprepvid%25253A201_11.23.83.213_2737140_1618567341667%2526spm%253Da2e1u.19484427.29996460.1%2526pnum%253D2%22%7D',
'https://h5api.m.taobao.com/h5/mtop.alimama.union.xt.en.api.entry/1.0/?jsv=2.5.1&appKey=12574478&t=1618567492113&sign=7fd6710bc06905e82c2bb84bcffab2d8&api=mtop.alimama.union.xt.en.api.entry&v=1.0&AntiCreep=true&timeout=20000&AntiFlood=true&type=jsonp&dataType=jsonp&callback=mtopjsonp2&data=%7B%22pNum%22%3A3%2C%22pSize%22%3A%2260%22%2C%22refpid%22%3A%22mm_43125636_4246598_110178550458%22%2C%22variableMap%22%3A%22%7B%5C%22q%5C%22%3A%5C%22%E6%BF%80%E5%85%89%E9%9B%B7%E8%BE%BE%5C%22%2C%5C%22navigator%5C%22%3Atrue%2C%5C%22union_lens%5C%22%3A%5C%22recoveryid%3A201_11.1.240.219_2704532_1618558195481%3Bprepvid%3A201_11.23.83.213_2737140_1618567341667%5C%22%2C%5C%22recoveryId%5C%22%3A%5C%22201_11.15.95.103_2736173_1618567490394%5C%22%7D%22%2C%22qieId%22%3A%2234374%22%2C%22spm%22%3A%22a2e1u.19484427.29996460%22%2C%22app_pvid%22%3A%22201_11.15.95.103_2736173_1618567490394%22%2C%22ctm%22%3A%22spm-url%3Aa2e1u.19484427.29996460.1%3Bpage_url%3Ahttps%253A%252F%252Fai.taobao.com%252Fsearch%252Findex.htm%253Fkey%253D%2525E6%2525BF%252580%2525E5%252585%252589%2525E9%25259B%2525B7%2525E8%2525BE%2525BE%2526pid%253Dmm_43125636_4246598_110178550458%2526union_lens%253Drecoveryid%25253A201_11.1.240.219_2704532_1618558195481%25253Bprepvid%25253A201_11.23.83.213_2737140_1618567341667%2526spm%253Da2e1u.19484427.29996460.1%2526pnum%253D3%22%7D',
'https://h5api.m.taobao.com/h5/mtop.alimama.union.xt.en.api.entry/1.0/?jsv=2.5.1&appKey=12574478&t=1618567513427&sign=013c7476a124c3634b3fa7d4119d3186&api=mtop.alimama.union.xt.en.api.entry&v=1.0&AntiCreep=true&timeout=20000&AntiFlood=true&type=jsonp&dataType=jsonp&callback=mtopjsonp2&data=%7B%22pNum%22%3A4%2C%22pSize%22%3A%2260%22%2C%22refpid%22%3A%22mm_43125636_4246598_110178550458%22%2C%22variableMap%22%3A%22%7B%5C%22q%5C%22%3A%5C%22%E6%BF%80%E5%85%89%E9%9B%B7%E8%BE%BE%5C%22%2C%5C%22navigator%5C%22%3Atrue%2C%5C%22union_lens%5C%22%3A%5C%22recoveryid%3A201_11.1.240.219_2704532_1618558195481%3Bprepvid%3A201_11.23.83.213_2737140_1618567341667%5C%22%2C%5C%22recoveryId%5C%22%3A%5C%22201_11.224.245.240_2737827_1618567511468%5C%22%7D%22%2C%22qieId%22%3A%2234374%22%2C%22spm%22%3A%22a2e1u.19484427.29996460%22%2C%22app_pvid%22%3A%22201_11.224.245.240_2737827_1618567511468%22%2C%22ctm%22%3A%22spm-url%3Aa2e1u.19484427.29996460.1%3Bpage_url%3Ahttps%253A%252F%252Fai.taobao.com%252Fsearch%252Findex.htm%253Fkey%253D%2525E6%2525BF%252580%2525E5%252585%252589%2525E9%25259B%2525B7%2525E8%2525BE%2525BE%2526pid%253Dmm_43125636_4246598_110178550458%2526union_lens%253Drecoveryid%25253A201_11.1.240.219_2704532_1618558195481%25253Bprepvid%25253A201_11.23.83.213_2737140_1618567341667%2526spm%253Da2e1u.19484427.29996460.1%2526pnum%253D4%22%7D',
'https://h5api.m.taobao.com/h5/mtop.alimama.union.xt.en.api.entry/1.0/?jsv=2.5.1&appKey=12574478&t=1618567532551&sign=e5746826967196e6c44883a221a421f6&api=mtop.alimama.union.xt.en.api.entry&v=1.0&AntiCreep=true&timeout=20000&AntiFlood=true&type=jsonp&dataType=jsonp&callback=mtopjsonp2&data=%7B%22pNum%22%3A5%2C%22pSize%22%3A%2260%22%2C%22refpid%22%3A%22mm_43125636_4246598_110178550458%22%2C%22variableMap%22%3A%22%7B%5C%22q%5C%22%3A%5C%22%E6%BF%80%E5%85%89%E9%9B%B7%E8%BE%BE%5C%22%2C%5C%22navigator%5C%22%3Atrue%2C%5C%22union_lens%5C%22%3A%5C%22recoveryid%3A201_11.1.240.219_2704532_1618558195481%3Bprepvid%3A201_11.23.83.213_2737140_1618567341667%5C%22%2C%5C%22recoveryId%5C%22%3A%5C%22201_11.11.105.189_2739043_1618567530561%5C%22%7D%22%2C%22qieId%22%3A%2234374%22%2C%22spm%22%3A%22a2e1u.19484427.29996460%22%2C%22app_pvid%22%3A%22201_11.11.105.189_2739043_1618567530561%22%2C%22ctm%22%3A%22spm-url%3Aa2e1u.19484427.29996460.1%3Bpage_url%3Ahttps%253A%252F%252Fai.taobao.com%252Fsearch%252Findex.htm%253Fkey%253D%2525E6%2525BF%252580%2525E5%252585%252589%2525E9%25259B%2525B7%2525E8%2525BE%2525BE%2526pid%253Dmm_43125636_4246598_110178550458%2526union_lens%253Drecoveryid%25253A201_11.1.240.219_2704532_1618558195481%25253Bprepvid%25253A201_11.23.83.213_2737140_1618567341667%2526spm%253Da2e1u.19484427.29996460.1%2526pnum%253D5%22%7D',
'https://h5api.m.taobao.com/h5/mtop.alimama.union.xt.en.api.entry/1.0/?jsv=2.5.1&appKey=12574478&t=1618567552146&sign=869114a9a679dc2b09c8aa999c4f6e61&api=mtop.alimama.union.xt.en.api.entry&v=1.0&AntiCreep=true&timeout=20000&AntiFlood=true&type=jsonp&dataType=jsonp&callback=mtopjsonp2&data=%7B%22pNum%22%3A6%2C%22pSize%22%3A%2260%22%2C%22refpid%22%3A%22mm_43125636_4246598_110178550458%22%2C%22variableMap%22%3A%22%7B%5C%22q%5C%22%3A%5C%22%E6%BF%80%E5%85%89%E9%9B%B7%E8%BE%BE%5C%22%2C%5C%22navigator%5C%22%3Atrue%2C%5C%22union_lens%5C%22%3A%5C%22recoveryid%3A201_11.1.240.219_2704532_1618558195481%3Bprepvid%3A201_11.23.83.213_2737140_1618567341667%5C%22%2C%5C%22recoveryId%5C%22%3A%5C%22201_11.224.245.240_2737924_1618567550261%5C%22%7D%22%2C%22qieId%22%3A%2234374%22%2C%22spm%22%3A%22a2e1u.19484427.29996460%22%2C%22app_pvid%22%3A%22201_11.224.245.240_2737924_1618567550261%22%2C%22ctm%22%3A%22spm-url%3Aa2e1u.19484427.29996460.1%3Bpage_url%3Ahttps%253A%252F%252Fai.taobao.com%252Fsearch%252Findex.htm%253Fkey%253D%2525E6%2525BF%252580%2525E5%252585%252589%2525E9%25259B%2525B7%2525E8%2525BE%2525BE%2526pid%253Dmm_43125636_4246598_110178550458%2526union_lens%253Drecoveryid%25253A201_11.1.240.219_2704532_1618558195481%25253Bprepvid%25253A201_11.23.83.213_2737140_1618567341667%2526spm%253Da2e1u.19484427.29996460.1%2526pnum%253D6%22%7D',
'https://h5api.m.taobao.com/h5/mtop.alimama.union.xt.en.api.entry/1.0/?jsv=2.5.1&appKey=12574478&t=1618567569287&sign=0c2354128f1ed8fa69364efd089be122&api=mtop.alimama.union.xt.en.api.entry&v=1.0&AntiCreep=true&timeout=20000&AntiFlood=true&type=jsonp&dataType=jsonp&callback=mtopjsonp2&data=%7B%22pNum%22%3A7%2C%22pSize%22%3A%2260%22%2C%22refpid%22%3A%22mm_43125636_4246598_110178550458%22%2C%22variableMap%22%3A%22%7B%5C%22q%5C%22%3A%5C%22%E6%BF%80%E5%85%89%E9%9B%B7%E8%BE%BE%5C%22%2C%5C%22navigator%5C%22%3Atrue%2C%5C%22union_lens%5C%22%3A%5C%22recoveryid%3A201_11.1.240.219_2704532_1618558195481%3Bprepvid%3A201_11.23.83.213_2737140_1618567341667%5C%22%2C%5C%22recoveryId%5C%22%3A%5C%22201_11.11.105.189_2742567_1618567567387%5C%22%7D%22%2C%22qieId%22%3A%2234374%22%2C%22spm%22%3A%22a2e1u.19484427.29996460%22%2C%22app_pvid%22%3A%22201_11.11.105.189_2742567_1618567567387%22%2C%22ctm%22%3A%22spm-url%3Aa2e1u.19484427.29996460.1%3Bpage_url%3Ahttps%253A%252F%252Fai.taobao.com%252Fsearch%252Findex.htm%253Fkey%253D%2525E6%2525BF%252580%2525E5%252585%252589%2525E9%25259B%2525B7%2525E8%2525BE%2525BE%2526pid%253Dmm_43125636_4246598_110178550458%2526union_lens%253Drecoveryid%25253A201_11.1.240.219_2704532_1618558195481%25253Bprepvid%25253A201_11.23.83.213_2737140_1618567341667%2526spm%253Da2e1u.19484427.29996460.1%2526pnum%253D7%22%7D',
'https://h5api.m.taobao.com/h5/mtop.alimama.union.xt.en.api.entry/1.0/?jsv=2.5.1&appKey=12574478&t=1618567587130&sign=2b059ade121542e28c25254351e25905&api=mtop.alimama.union.xt.en.api.entry&v=1.0&AntiCreep=true&timeout=20000&AntiFlood=true&type=jsonp&dataType=jsonp&callback=mtopjsonp2&data=%7B%22pNum%22%3A8%2C%22pSize%22%3A%2260%22%2C%22refpid%22%3A%22mm_43125636_4246598_110178550458%22%2C%22variableMap%22%3A%22%7B%5C%22q%5C%22%3A%5C%22%E6%BF%80%E5%85%89%E9%9B%B7%E8%BE%BE%5C%22%2C%5C%22navigator%5C%22%3Atrue%2C%5C%22union_lens%5C%22%3A%5C%22recoveryid%3A201_11.1.240.219_2704532_1618558195481%3Bprepvid%3A201_11.23.83.213_2737140_1618567341667%5C%22%2C%5C%22recoveryId%5C%22%3A%5C%22201_11.21.57.63_2741260_1618567585209%5C%22%7D%22%2C%22qieId%22%3A%2234374%22%2C%22spm%22%3A%22a2e1u.19484427.29996460%22%2C%22app_pvid%22%3A%22201_11.21.57.63_2741260_1618567585209%22%2C%22ctm%22%3A%22spm-url%3Aa2e1u.19484427.29996460.1%3Bpage_url%3Ahttps%253A%252F%252Fai.taobao.com%252Fsearch%252Findex.htm%253Fkey%253D%2525E6%2525BF%252580%2525E5%252585%252589%2525E9%25259B%2525B7%2525E8%2525BE%2525BE%2526pid%253Dmm_43125636_4246598_110178550458%2526union_lens%253Drecoveryid%25253A201_11.1.240.219_2704532_1618558195481%25253Bprepvid%25253A201_11.23.83.213_2737140_1618567341667%2526spm%253Da2e1u.19484427.29996460.1%2526pnum%253D8%22%7D',
'https://h5api.m.taobao.com/h5/mtop.alimama.union.xt.en.api.entry/1.0/?jsv=2.5.1&appKey=12574478&t=1618567602748&sign=42adf8c4d738d9dc2bf9dd8ab8d68709&api=mtop.alimama.union.xt.en.api.entry&v=1.0&AntiCreep=true&timeout=20000&AntiFlood=true&type=jsonp&dataType=jsonp&callback=mtopjsonp2&data=%7B%22pNum%22%3A9%2C%22pSize%22%3A%2260%22%2C%22refpid%22%3A%22mm_43125636_4246598_110178550458%22%2C%22variableMap%22%3A%22%7B%5C%22q%5C%22%3A%5C%22%E6%BF%80%E5%85%89%E9%9B%B7%E8%BE%BE%5C%22%2C%5C%22navigator%5C%22%3Atrue%2C%5C%22union_lens%5C%22%3A%5C%22recoveryid%3A201_11.1.240.219_2704532_1618558195481%3Bprepvid%3A201_11.23.83.213_2737140_1618567341667%5C%22%2C%5C%22recoveryId%5C%22%3A%5C%22201_11.23.83.213_2741107_1618567600862%5C%22%7D%22%2C%22qieId%22%3A%2234374%22%2C%22spm%22%3A%22a2e1u.19484427.29996460%22%2C%22app_pvid%22%3A%22201_11.23.83.213_2741107_1618567600862%22%2C%22ctm%22%3A%22spm-url%3Aa2e1u.19484427.29996460.1%3Bpage_url%3Ahttps%253A%252F%252Fai.taobao.com%252Fsearch%252Findex.htm%253Fkey%253D%2525E6%2525BF%252580%2525E5%252585%252589%2525E9%25259B%2525B7%2525E8%2525BE%2525BE%2526pid%253Dmm_43125636_4246598_110178550458%2526union_lens%253Drecoveryid%25253A201_11.1.240.219_2704532_1618558195481%25253Bprepvid%25253A201_11.23.83.213_2737140_1618567341667%2526spm%253Da2e1u.19484427.29996460.1%2526pnum%253D9%22%7D',
]
pic_url_list = []
for url in url_list:
html = requests.get(url=url, headers=headers).text
text = html.replace('mtopjsonp2(', '').replace(')', '')
info_dic = json.loads(text)
pic_url = info_dic["data"]["recommend"]["resultList"]
for i in range(len(pic_url)):
url = 'https:' + pic_url[i]["pic"]
pic_url_list.append(url)
return pic_url_list
# 下载并保存图片
def pic_download(pic_url_list, n):
try:
response = requests.get(pic_url_list)
path = r'./pictures/%s.jpg' % n
with open(path, 'wb') as fp:
fp.write(response.content)
except (requests.exceptions.InvalidURL):
pass
def main():
# 保存
print("saving......")
pic_url_list = get_url()
num = 0
for pic_url in pic_url_list:
num += 1
print("正在保存第{}张图片".format(num))
# 上锁
thread_lock.acquire()
# 下载 这个方法丢进线程池
t = threading.Thread(target=pic_download, args=(pic_url, num))
t.start()
pic_download(pic_url, num)
# 下载完成就解锁
thread_lock.release()
print("爬取完毕!!!")
# Run the crawler only when this file is executed as a script, not on import.
if "__main__" == __name__:
    main()