python脚本——数据集获取:图片爬取,剔除无效图片,统一分辨率,提取出标记过的json和png

1.图片爬取

(以下爬取代码为转载,非原创。)

import requests
from bs4 import BeautifulSoup
import time
import json
import os
import socket

# Set a default socket timeout (8 seconds) so that a single request
# cannot stall the script for a long time.
socket.setdefaulttimeout(8)


def sougou_pic_url(num, keyword):
    """Collect image URLs for ``keyword`` from the Sogou picture search API.

    Args:
        num: Number of images the caller wants. It only decides how many
            result pages are requested (``num // 48 + 1``); the returned
            list may be longer or shorter than ``num``.
        keyword: Search term. It is sent as a query parameter, so it is
            URL-encoded by ``requests``.

    Returns:
        A list with the ``pic_url`` value of every item found.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
        ValueError: If a response body is not valid JSON.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    params = {
        'query': keyword,
        'mode': 1,
        'start': 0,
        'reqType': 'ajax',
        'reqFrom': 'result',
        'tn': 0,
    }
    pic_url = []
    for i in range((num // 48) + 1):
        # NOTE(review): pages are counted in steps of 48 above, but the
        # offset advances by 47 -- kept as in the original; confirm against
        # the API whether this overlap is intended.
        params['start'] = i * 47
        # requests applies no timeout unless one is passed explicitly, and
        # the browser-like User-Agent is only useful if it is actually sent.
        response = requests.get('https://pic.sogou.com/pics', params=params,
                                headers=headers, timeout=8)
        response.raise_for_status()
        items = json.loads(response.text).get('items') or []
        if not items:
            # No items on this page: further pages would be empty as well.
            break
        for item in items:
            pic_url.append(item['pic_url'])

    return pic_url


def down_img(num, keyword):
    """Download up to ``num`` images for ``keyword`` into ``D:/p_images/<keyword>/``.

    Files are named ``<index>.png`` after their position in the URL list,
    regardless of the real image format. A URL that cannot be downloaded or
    written is reported and skipped, so the remaining URLs are still tried.

    Args:
        num: Maximum number of URLs to download.
        keyword: Search term; also used as the sub-directory name.
    """
    pic_url = sougou_pic_url(num, keyword)

    save_dir = 'D:/p_images/' + keyword
    os.makedirs(save_dir, exist_ok=True)

    for index, url in enumerate(pic_url[:max(num, 0)]):
        filename = save_dir + '/' + str(index) + '.png'
        print(filename)
        # Download first and open the file only afterwards, so a failed
        # request does not leave an empty .png behind.
        try:
            response = requests.get(url, timeout=8)
            response.raise_for_status()
        except requests.RequestException as exc:
            print('下载失败,已跳过:', url, exc)
            continue
        try:
            with open(filename, 'wb') as f:
                f.write(response.content)
        except OSError as exc:
            print('保存失败,已跳过:', filename, exc)


if __name__ == '__main__':
    # Interactive menu: keep prompting until the user chooses to quit.
    while True:
        print("1.搜索图片")
        print("2.退出程序")
        print("提示:图片默认存储路径为 D:/p_images/")
        try:
            choose = int(input("请选择:"))
        except ValueError:
            # Non-numeric input is treated like any other invalid choice
            # instead of crashing the program.
            print("输入有误,请重新输入!")
            continue
        if choose == 1:
            keyword = input('请输入图片关键词:')
            try:
                num = int(input('请输入爬取图片数目:'))
            except ValueError:
                print("输入有误,请重新输入!")
                continue
            down_img(num, keyword)
        elif choose == 2:
            break
        else:
            print("输入有误,请重新输入!")

2.剔除无效图片

import imghdr
import os


def delect_webp_and_none_type(path):
    """Delete every file under ``path`` that is WebP or not a recognised image.

    The directory tree is walked recursively. Each file's type is detected
    with ``imghdr.what``; files reported as ``"webp"`` and files whose type
    cannot be determined (``None``) are printed and then removed.

    Args:
        path: Root directory to clean.
    """
    for folder, _subfolders, filenames in os.walk(path):
        for filename in filenames:
            candidate = os.path.join(folder, filename)
            detected = imghdr.what(candidate)
            # Keep anything that was recognised as a non-WebP image.
            if detected is not None and detected != "webp":
                continue
            print(candidate)
            os.remove(candidate)


if __name__ == "__main__":
    # Raw string: the backslashes in this Windows path are literal path
    # separators, not escape sequences.
    delect_webp_and_none_type(r"D:\p_images\交通路标")

3.统一分辨率

import os
import cv2


def resizeImg(w, h, imgs_floder, save_folder):
    """Resize every readable image in ``imgs_floder`` to ``w`` x ``h``.

    Each result is written to ``save_folder`` under the original file name.
    Sub-directories and files that OpenCV cannot decode are skipped instead
    of aborting the whole batch. ``save_folder`` is created if missing.

    Args:
        w: Target width in pixels.
        h: Target height in pixels.
        imgs_floder: Directory containing the source images.
        save_folder: Directory that receives the resized images.

    Returns:
        Always 0.
    """
    os.makedirs(save_folder, exist_ok=True)
    for img in os.listdir(imgs_floder):
        img_full_path = os.path.join(imgs_floder, img)
        if not os.path.isfile(img_full_path):
            continue
        ori_img = cv2.imread(img_full_path)
        if ori_img is None:
            # cv2.imread signals an unreadable file by returning None;
            # passing that to cv2.resize would raise.
            print('跳过无法读取的文件:', img)
            continue
        img_r = cv2.resize(ori_img, (w, h))
        print(img)
        cv2.imwrite(os.path.join(save_folder, img), img_r)

    return 0


if __name__ == '__main__':
    print('----进行照片处理以及标签转换----')
    # Per the original author's note, w and h must both satisfy % 16 == 0.
    # Raw strings keep the backslashes of the Windows paths literal.
    resizeImg(800, 560, r"D:\BaiduNetdiskDownload\l01000~01999", r"D:\BaiduNetdiskDownload\image2")
    print('---处理完成---')

4.提取标记过的json和png

import os
import cv2
import shutil

def resizeImg(imgs_floder, save_folder):
    """Extract labelled samples: copy each PNG that has a matching JSON file.

    A PNG counts as labelled when ``imgs_floder`` also contains a file with
    the same base name and a ``.json`` extension. The image is re-encoded
    through OpenCV (``imread`` + ``imwrite``); the JSON file is copied
    unchanged. Despite its name, this function does not resize anything.

    Args:
        imgs_floder: Directory holding the mixed PNG / JSON files.
        save_folder: Directory that receives the labelled pairs; created if
            it does not exist.

    Returns:
        Always 0.
    """
    imgs = os.listdir(imgs_floder)
    print(imgs)
    # Base names of all annotation files. splitext copes with names that
    # have no extension or contain several dots.
    labelled = {
        os.path.splitext(name)[0]
        for name in imgs
        if os.path.splitext(name)[1] == '.json'
    }
    os.makedirs(save_folder, exist_ok=True)
    for img in imgs:
        stem, ext = os.path.splitext(img)
        if ext != '.png' or stem not in labelled:
            continue
        ori_img = cv2.imread(os.path.join(imgs_floder, img))
        if ori_img is None:
            # cv2.imread returns None for files it cannot decode.
            print('跳过无法读取的文件:', img)
            continue
        print(img)
        cv2.imwrite(os.path.join(save_folder, img), ori_img)
        json_name = stem + '.json'
        shutil.copyfile(os.path.join(imgs_floder, json_name),
                        os.path.join(save_folder, json_name))
    return 0


if __name__ == '__main__':
    # Script entry point: copy the labelled PNG/JSON pairs from the source
    # directory into the target directory.
    source_dir, target_dir = "D:/p_images/test", "D:/p_images/test2"
    print('----start out----')
    resizeImg(source_dir, target_dir)
    print('---处理完成---')

你可能感兴趣的:(python)