Scraping Chebiaow (chebiaow.com, a car-logo site) with Python 3: never meet a car badge you don't recognize again

Import the required libraries

"""
 -*- coding:utf-8 -*-
 author:Air
 datetime:2019/7/25 17:40
 software: PyCharm
 学习交流qq群:916696436
"""
import requests
from parsel import Selector
from multiprocessing import Pool
import os

Analyzing the page

All of the data is static (server-rendered), so plain requests is enough.
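A quick way to confirm this is to fetch the index page and check that the category links are already present in the raw HTML. The sketch below is a minimal check, assuming the cate-list markup used in the rest of this post:

import requests
from parsel import Selector

# minimal check: if the category links appear in the raw HTML,
# the page is server-rendered and no JavaScript execution is needed
res = requests.get('http://www.chebiaow.com/logo',
                   headers={'User-Agent': 'Mozilla/5.0'})
sel = Selector(text=res.content.decode())
print(sel.xpath('//ul[@class="cate-list"]/li/h2/a/@href').getall()[:5])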

1. Get the category URLs


# inside cate_url(): collect the per-letter category links from the index page
url = 'http://www.chebiaow.com/logo'
res = requests.get(url, headers=headers)
s = Selector(text=res.content.decode())
urls = ['http://www.chebiaow.com/{}'.format(i) for i in s.xpath('//ul[@class="cate-list"]/li/h2/a/@href').getall()]
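Note: headers here is the module-level dict defined in the full source (section 5); it carries a random User-Agent string generated by fake_useragent.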

2. Get the logo image URLs in each category

# inside get_img(): each <li class="clearfix"> holds one brand's logo and name
res = requests.get(url, headers=headers)
s = Selector(text=res.content.decode())
li_list = s.xpath('//li[@class="clearfix"]')
for li in li_list:
    img_url = li.xpath('./div[1]/a/img/@src').get()
    name = li.xpath('./div[2]/a[1]/text()').get()
    print(name, end='    ')

3. Download the images locally

# inside down_img(): make sure the output directory exists, then save the image
if not os.path.exists(root):
    os.makedirs(root)
res = requests.get(url, headers=headers)
path = root + name + '.jpg'
with open(path, 'wb') as f:
    f.write(res.content)
    print('saved successfully')
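One caveat: the brand names come straight from the page, so a name containing a character that is not valid in a file name would make open() fail. A hypothetical helper such as safe_name below (not part of the original script) could clean the name before building the path:

import re

def safe_name(name):
    # hypothetical helper: replace characters that are unsafe in file names
    return re.sub(r'[\\/:*?"<>|\s]+', '_', name).strip('_')

# usage sketch: path = root + safe_name(name) + '.jpg'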

4. Result

The downloaded logos end up as individual .jpg files under the ./img/ directory.

5. Full source code

"""
 -*- coding:utf-8 -*-
 author:Air
 datetime:2019/7/25 17:40
 software: PyCharm
 学习交流qq群:916696436
"""
import requests
from parsel import Selector
from multiprocessing import Pool
import os
from fake_useragent import UserAgent

root = './img/'  # output directory for the downloaded logos

# create a UserAgent object and pick a random User-Agent string
useragent = UserAgent()
headers = {
    'User-Agent': useragent.random
}
def get_img(url):
    """Scrape one category page and download every logo on it."""
    res = requests.get(url, headers=headers)
    s = Selector(text=res.content.decode())
    li_list = s.xpath('//li[@class="clearfix"]')
    for li in li_list:
        img_url = li.xpath('./div[1]/a/img/@src').get()
        name = li.xpath('./div[2]/a[1]/text()').get()
        print(name, end='    ')
        down_img(name, img_url)

def down_img(name, url):
    """Save one logo image as <root><name>.jpg."""
    if not os.path.exists(root):
        os.makedirs(root)
    res = requests.get(url, headers=headers)
    path = root + name + '.jpg'
    with open(path, 'wb') as f:
        f.write(res.content)
        print('saved successfully')

def cate_url():
    """Collect the per-letter category URLs from the logo index page."""
    url = 'http://www.chebiaow.com/logo'
    res = requests.get(url, headers=headers)
    s = Selector(text=res.content.decode())
    # build the full category URLs with a list comprehension
    urls = ['http://www.chebiaow.com/{}'.format(i) for i in s.xpath('//ul[@class="cate-list"]/li/h2/a/@href').getall()]
    return urls

if __name__ == '__main__':
    urls = cate_url()
    p = Pool(16)          # a pool of 16 worker processes
    p.map(get_img, urls)  # blocks until every category page has been scraped
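To run the script, install the third-party packages it relies on (requests, parsel, and fake-useragent, e.g. via pip); the logos are then saved under ./img/ next to the script.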

 
