Python 爬取百度图片上的图像

from fake_useragent import UserAgent
import requests
import re
# HTTP request headers sent with every request issued by this script.
headers = {
    # Randomly generated User-Agent value, picked once when the module loads.
    "User-agent": UserAgent().random,
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Connection": "keep-alive",
}

# Captures the value of every "thumbURL" field found in the raw response text.
img_re = re.compile(r'"thumbURL":"(.*?)"')
# Captures the text between "f=" and a later "w".
# NOTE(review): not referenced anywhere in the visible part of this file.
img_format = re.compile(r"f=(.*).*?w")
# Directory the script originally hard-coded as its save location; kept as the
# default so existing callers keep writing to the same place.
_DEFAULT_SAVE_DIR = "/home/wang/hewuchun/CarPlateIdentity/Car_Plate_Identity/code/baidu_car_plate_imagess"


def file_op(img, num, save_dir=_DEFAULT_SAVE_DIR):
    """Write one downloaded image to ``<save_dir>/<num>.jpg``.

    Saving is best-effort: an ``OSError`` raised while creating the directory,
    opening the file or writing to it is reported on stdout and swallowed, so
    one unwritable file does not abort the caller's loop.

    Args:
        img: Raw image bytes to write.
        num: Value used as the file name stem (formatted with ``%s``).
        save_dir: Directory that receives the file. It is created when it
            does not exist yet. Defaults to the originally hard-coded path.
    """
    import os  # local import: the file's import block does not provide os

    tem_file_name = os.path.join(save_dir, "%s.jpg" % num)
    try:
        # An empty save_dir means "current directory"; nothing to create then.
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        with open(file=tem_file_name, mode="wb") as file:
            file.write(img)
    except OSError as exc:
        # Only file-system errors are tolerated; anything else (including
        # KeyboardInterrupt) propagates instead of being silently dropped.
        print("保存图片失败:", tem_file_name, exc)
def xhr_url(url_xhr, start_num=0, page=5, timeout=10):
    """Fetch result pages and save every thumbnail they reference.

    For each offset in steps of 30, starting at ``start_num``, the offset is
    appended to ``url_xhr`` and the page is requested. Every ``thumbURL``
    matched by ``img_re`` in the response text is downloaded and handed to
    ``file_op`` with a running counter that starts at 1.

    Args:
        url_xhr: Request URL prefix; the numeric offset is appended to it.
        start_num: First offset to request.
        page: Number of result pages to request.
        timeout: Seconds to wait on each HTTP request before giving up.
    """
    page_size = 30  # step between consecutive offsets
    # The stop value has to be relative to start_num; otherwise a non-zero
    # start silently yields fewer pages than requested (or none at all).
    end_num = start_num + page * page_size
    count = 1
    for page_num in range(start_num, end_num, page_size):
        try:
            resp = requests.get(url=url_xhr + str(page_num), headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Treated like a non-200 answer: stop paging, keep what was saved.
            print("请求结果页失败:", exc)
            break
        if resp.status_code != 200:
            break
        # Make resp.text decode the body as UTF-8. The original decoded
        # resp.content and threw the result away, which had no effect on text.
        resp.encoding = "utf-8"
        for img_url in img_re.findall(resp.text):
            if not img_url:
                # An empty thumbURL cannot be requested; skip it.
                continue
            try:
                img_rsp = requests.get(url=img_url, headers=headers, timeout=timeout)
                # Do not save an HTTP error body as if it were an image.
                img_rsp.raise_for_status()
            except requests.RequestException as exc:
                # One bad image must not abort the whole crawl.
                print("跳过无法下载的图片:", img_url, exc)
                continue
            file_op(img=img_rsp.content, num=count)
            print("正在爬取第", count, "张")
            count = count + 1
    print("内容已经全部爬取")
if __name__ == '__main__':
    from urllib.parse import quote  # local import: only the script entry point needs it

    # Percent-encode the search term so characters such as "&", "#", "=" or
    # spaces cannot truncate or alter the query string.
    keyword = quote(input("输入你想要检索的内容:"), safe="")
    org_url = "https://image.baidu.com/search/acjson?tn=resultjson_com&word={text}&pn=".format(text=keyword)
    # A result page usually holds about 30 images.
    # NOTE(review): the prompt asks for a start page, but the value is passed
    # as start_num, which xhr_url appends to the URL as-is — confirm whether a
    # page index or a raw offset is intended.
    start = int(input("开始页:"))
    pages = int(input("所爬取的页数:"))
    xhr_url(url_xhr=org_url, start_num=start, page=pages)

python爬取百度图片上的图像_第1张图片
python爬取百度图片上的图像_第2张图片

你可能感兴趣的:(python,开发语言,ubuntu)