(1) 学会使用爬虫爬取图像和视频,从视频中提取图片。
(2) 对获得的图片数据进行整理,包括重命名,格式统一,去重。
有些任务没有直接对应的开源数据集,或者开源数据集中的数据比较少,这就需要我们通过搜索引擎自行爬取图片。
百度图片爬虫
Download images from Google, Bing, Baidu. 谷歌、百度、必应图片下载
Google, Naver multiprocess image web crawler (Selenium)
爬取的如果是视频需要先转换成图片,如果是图片就要做好统一格式、数据清洗的工作。
使用爬虫爬取数据,如果是视频可以使用 python getimagefromvideo.py
将视频转换为图片
#coding:utf8
import cv2
import dlib
import numpy as np
import sys
import os
# Dump every frame of the video named by the first command-line argument
# into a directory named after the video path minus its extension.
# Frames are written as 0.jpg, 1.jpg, ... in reading order.
if len(sys.argv) < 2:
    sys.exit("usage: python getimagefromvideo.py <video_file>")

video_path = sys.argv[1]
# splitext strips only the final extension, so "./clip.mp4" -> "./clip" and
# "my.video.mp4" -> "my.video"; split('.')[0] would give "" and "my".
video_id = os.path.splitext(video_path)[0]
# Re-running the script on the same video must not crash on an existing
# output directory.
if not os.path.isdir(video_id):
    os.makedirs(video_id)

video_capture = cv2.VideoCapture(video_path)
count = 0
try:
    while True:
        is_successfully_read, im = video_capture.read()
        if not is_successfully_read:
            # End of stream (or the file could not be decoded).
            break
        cv2.imwrite(os.path.join(video_id, str(count) + '.jpg'), im)
        # Single-argument print() produces the same text on Python 2 and 3.
        print("image shape= %s" % (im.shape,))
        count = count + 1
finally:
    # Always free the underlying decoder/file handle.
    video_capture.release()
# Total number of frames written.
print(count)
统一后缀格式可以减少以后写数据 API 时的压力,也可以测试图片是不是可以正常地读取,及时防止未知问题的出现,这很重要。
使用 python reformat_image.py
将图片全部转换为 jpg 格式,这也是所有框架支持的格式。
import os
import sys
import cv2
import numpy as np
def listfiles(rootDir):
    """Convert every readable image under ``rootDir`` to a ``.jpg`` file.

    The tree is walked recursively. Each sub-directory path is printed.
    Every file is loaded with ``cv2.imread`` (flag 1, i.e. colour):

    * if it cannot be decoded, the file is deleted;
    * otherwise it is re-encoded as JPEG and stored next to the original as
      ``<name without last extension>.jpg``, and the original is removed.

    Args:
        rootDir: directory whose files are converted in place.

    Returns:
        None. The function works purely through side effects on disk.
    """
    for root, dirs, files in os.walk(rootDir):
        for d in dirs:
            print(os.path.join(root, d))
        for f in files:
            filepath = os.path.join(root, f)
            # splitext keeps multi-dot stems intact ("a.b.png" -> "a.b"),
            # avoiding the name collisions caused by split('.')[0].
            target = os.path.join(root, os.path.splitext(f)[0] + ".jpg")

            # imread signals failure by returning None rather than raising.
            src = cv2.imread(filepath, 1)
            if src is None:
                os.remove(filepath)
                continue
            print("src= %s %s" % (filepath, src.shape))

            # Encode in memory first so the original is only deleted once a
            # valid JPEG payload exists.
            encoded_ok, buf = cv2.imencode(".jpg", src)
            if not encoded_ok:
                # Keep the original untouched when encoding fails.
                continue
            os.remove(filepath)
            with open(target, "wb") as out:
                out.write(buf.tobytes())


if __name__ == '__main__':
    # Usage: python reformat_image.py <image_dir>
    listfiles(sys.argv[1])
统一格式的命名有利于区分和整理数据
mkdir tmp
./rename_files_function.sh ./tmp/
#!/bin/bash
# Rename the image files matched by a path prefix to sequential names.
#
# Usage: rename_files_function.sh <dir> [resultdir] [app]
#   dir        path prefix of the files to rename; normally a directory
#              ending in "/" (the files matched are "$dir"*)
#   resultdir  path prefix used while staging the renamed files; when it is
#              omitted a private temporary directory is used
#   app        text placed in front of the sequence number
#
# Files whose last extension is jpeg/png/jpg/bmp are moved to
# <resultdir><app><n>.<ext> (n = 1, 2, ...), then everything matching
# "$resultdir"* is moved back into <dir>. Other files are left untouched.
i=0
dir=$1
resultdir=$2
app=$3

# With an empty staging prefix the final "mv * dir" would sweep every file of
# the current working directory into <dir>; stage in a temp directory instead.
staging_created=""
if [ -z "$resultdir" ]; then
    resultdir="$(mktemp -d)/" || exit 1
    staging_created=$resultdir
fi

for file in "$dir"*
do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$file" ] || continue

    # Last path component, text before the first dot, text after the last dot.
    filename=${file##*/}
    fileid=${filename%%.*}
    fileformat=${filename##*.}
    echo "file=$file"
    echo "fileid=$fileid"
    echo "fileformat=$fileformat"
    case "$fileformat" in
        jpeg|png|jpg|bmp)
            i=$((i + 1))
            resultfile="${resultdir}${app}${i}.${fileformat}"
            echo "file=$file,resultfile=$resultfile"
            mv -- "$file" "$resultfile"
            ;;
        *)
            echo "${file}not good"
            ;;
    esac
done

echo "执行删除${dir}*"
# Deleting the remaining files in <dir> is intentionally disabled:
#rm "$dir"*
echo "执行mv${resultdir}*"
# Only move back when something was staged; otherwise the glob has no match.
if [ "$i" -gt 0 ]; then
    mv -- "$resultdir"* "$dir"
fi
if [ -n "$staging_created" ]; then
    rmdir -- "$staging_created"
fi
如果你使用多个关键词,或者在不同的搜索引擎中使用同样的关键词,或者从视频中提取图片,那么爬取回来的图片很可能有重复或者非常相似,这样的样本应该被去除。
去重有很多种方法,比如直接比较两幅图像是不是完全相同,或者通过 hash 等方法计算两幅图像的相似度。这里我们提供一个利用相似度来进行去重的方法。
# sudo pip install python-Levenshtein
conda install -c conda-forge python-levenshtein
python remove_repeat.py
https://anaconda.org/conda-forge/python-levenshtein
#!/usr/bin/env python
#coding:utf8
import math
from PIL import Image
import Levenshtein
class BWImageCompare(object):
    """Compare two images after reducing them to a common small size.

    Both images are resized to ``min(width_a, width_b, maxsize)`` by
    ``min(height_a, height_b, maxsize)``. In this base class they are then
    converted to mode ``'I'`` (one integer per pixel). Subclasses that set
    ``_colour = True`` get RGB images instead and must override ``_img_int``.

    All metrics are lazy properties cached on the instance after first use.
    """

    # Divisor used to normalise pixel differences in psnr and nrmsd.
    _pixel = 255
    # When False the images are converted to single-channel integers.
    _colour = False

    def __init__(self, imga, imgb, maxsize=64):
        """Store rescaled copies of both images.

        Args:
            imga: first PIL image.
            imgb: second PIL image.
            maxsize: upper bound for the width and height of the copies.
        """
        sizea, sizeb = imga.size, imgb.size
        newx = min(sizea[0], sizeb[0], maxsize)
        newy = min(sizea[1], sizeb[1], maxsize)
        # Rescale to a common size so pixels can be compared one to one.
        imga = imga.resize((newx, newy), Image.BICUBIC)
        imgb = imgb.resize((newx, newy), Image.BICUBIC)
        if self._colour:
            # Colour subclasses index pixel[0..2]; palette or grayscale
            # images return a bare int from getpixel and would crash.
            imga = imga.convert('RGB')
            imgb = imgb.convert('RGB')
        else:
            # Store the images in B/W Int format.
            imga = imga.convert('I')
            imgb = imgb.convert('I')
        self._imga = imga
        self._imgb = imgb
        # Common image size, used to normalise the metrics.
        self.x, self.y = newx, newy

    def _img_int(self, img):
        """Yield the pixels of ``img`` column by column (x outer, y inner)."""
        x, y = img.size
        # range() works on both Python 2 and 3 (xrange is Python 2 only).
        for i in range(x):
            for j in range(y):
                yield img.getpixel((i, j))

    @property
    def imga_int(self):
        """Return the pixels of the first image as a tuple (cached)."""
        if not hasattr(self, '_imga_int'):
            self._imga_int = tuple(self._img_int(self._imga))
        return self._imga_int

    @property
    def imgb_int(self):
        """Return the pixels of the second image as a tuple (cached)."""
        if not hasattr(self, '_imgb_int'):
            self._imgb_int = tuple(self._img_int(self._imgb))
        return self._imgb_int

    @property
    def mse(self):
        """Return the mean square error between the two images."""
        if not hasattr(self, '_mse'):
            tmp = sum((a - b) ** 2 for a, b in zip(self.imga_int, self.imgb_int))
            self._mse = float(tmp) / self.x / self.y
        return self._mse

    @property
    def psnr(self):
        """Return the peak signal-to-noise ratio.

        Raises:
            ZeroDivisionError: when the images are identical (``mse == 0``).
        """
        if not hasattr(self, '_psnr'):
            self._psnr = 20 * math.log(self._pixel / math.sqrt(self.mse), 10)
        return self._psnr

    @property
    def nrmsd(self):
        """Return the root mean square deviation divided by ``_pixel``."""
        if not hasattr(self, '_nrmsd'):
            self._nrmsd = math.sqrt(self.mse) / self._pixel
        return self._nrmsd

    @property
    def levenshtein(self):
        """Return the Levenshtein distance per pixel between the images.

        Each pixel value is mapped to one character with ``chr``; the edit
        distance of the two strings is divided by the pixel count.
        """
        if not hasattr(self, '_lv'):
            stra = ''.join(chr(x) for x in self.imga_int)
            strb = ''.join(chr(x) for x in self.imgb_int)
            lv = Levenshtein.distance(stra, strb)
            self._lv = float(lv) / self.x / self.y
        return self._lv
class ImageCompare(BWImageCompare):
    """Compare two colour images.

    Each pixel is packed into one integer as
    ``pixel[0] | pixel[1] << 8 | pixel[2] << 16``; the metrics inherited from
    ``BWImageCompare`` then operate on those packed values.
    """

    # Divisor used by the inherited psnr/nrmsd.
    # NOTE(review): packed values can reach 0xFFFFFF, slightly above
    # 255 ** 3 -- confirm this constant is the intended normaliser.
    _pixel = 255 ** 3
    # Keep the images in colour (no conversion to mode 'I').
    _colour = True

    def _img_int(self, img):
        """Yield each pixel of ``img`` packed into a single integer.

        Pixels are visited column by column (x outer, y inner). Only the
        first three channels of every pixel are used.
        """
        x, y = img.size
        # range() works on both Python 2 and 3 (xrange is Python 2 only).
        for i in range(x):
            for j in range(y):
                pixel = img.getpixel((i, j))
                yield pixel[0] | (pixel[1] << 8) | (pixel[2] << 16)

    @property
    def levenshtein(self):
        """Return the per-pixel Levenshtein distance averaged over channels.

        For each of the three 8-bit fields of the packed pixels, the values
        are turned into a string with ``chr`` and the edit distance between
        both images is computed; the three distances are averaged and divided
        by the pixel count.
        """
        if not hasattr(self, '_lv'):
            total = 0
            # Bit offsets of the three 8-bit channels inside a packed pixel.
            for shift in (16, 8, 0):
                stra = ''.join(chr((x >> shift) & 0xff) for x in self.imga_int)
                strb = ''.join(chr((x >> shift) & 0xff) for x in self.imgb_int)
                total += Levenshtein.distance(stra, strb)
            self._lv = total / 3. / self.x / self.y
        return self._lv
class FuzzyImageCompare(object):
    """Compare two images at growing resolutions until the metrics settle.

    ``compare`` builds an ``ImageCompare`` at sizes 2, 4, 8, ... and records
    the levenshtein, nrmsd and psnr values. A metric stops being sampled once
    its latest change is no larger than the change before it.
    """

    # Metrics sampled by compare(), in evaluation order.
    _METRICS = ('levenshtein', 'nrmsd', 'psnr')

    def __init__(self, imga, imgb, lb=1, tol=15):
        """Store the images and tuning parameters.

        Args:
            imga: first PIL image.
            imgb: second PIL image.
            lb: look-back distance used by the convergence test.
            tol: stored as ``_tol``; not used by the active similarity formula.
        """
        self._imga, self._imgb, self._lb, self._tol = imga, imgb, lb, tol

    def _converged(self, history):
        """Return True when the last step of ``history`` no longer grows."""
        lb = self._lb
        if len(history) < lb + 2:
            return False
        latest = abs(history[-1] - history[-lb - 1])
        previous = abs(history[-lb - 1] - history[-lb - 2])
        return latest <= previous

    def compare(self):
        """Run all comparisons and return their scores (cached).

        Returns:
            dict with keys ``'levenshtein'`` and ``'nrmsd'`` (``100 - value *
            100``) and ``'psnr'`` (the raw value, or 100.0 when the images
            were identical).
        """
        if hasattr(self, '_compare'):
            return self._compare
        diffs = dict((name, []) for name in self._METRICS)
        stop = dict((name, False) for name in self._METRICS)
        size = 2
        while not all(stop.values()):
            comparison = ImageCompare(self._imga, self._imgb, size)
            for name in self._METRICS:
                history = diffs[name]
                if self._converged(history):
                    stop[name] = True
                    continue
                if name == 'psnr':
                    try:
                        value = comparison.psnr
                    except ZeroDivisionError:
                        # mse == 0: the images are identical at this size.
                        value = -1
                else:
                    value = getattr(comparison, name)
                history.append(value)
            size *= 2
        last_psnr = diffs['psnr'][-1]
        self._compare = {
            'levenshtein': 100 - diffs['levenshtein'][-1] * 100,
            'nrmsd': 100 - diffs['nrmsd'][-1] * 100,
            'psnr': 100.0 if last_psnr == -1 else last_psnr,
        }
        return self._compare

    def similarity(self):
        """Return the mean of the levenshtein and nrmsd scores.

        The psnr-weighted variant
        ``min(lnrmsd * psnr / tol, 100.0)`` was already unreachable in the
        original code (marked "TODO: fix psnr!") and is therefore not applied.
        """
        scores = self.compare()
        return (scores['levenshtein'] + scores['nrmsd']) / 2
if __name__ == '__main__':
    import sys
    import os

    # Usage: python remove_repeat.py <image_dir>
    #
    # Files are sorted by name and each image is compared with its successor.
    # When the two are similar enough the later file is deleted and the same
    # image is compared with the next remaining one.
    image_dir = sys.argv[1]
    srcimages = os.listdir(image_dir)
    srcimages.sort()
    tot = len(srcimages)
    # Number of unordered pairs; // keeps it an int on Python 2 and 3.
    tot = (tot ** 2 - tot) // 2
    print('Comparing %d images:' % tot)

    # Similarity threshold (0..1); above it the later image is treated as a
    # duplicate and removed.
    similarity_thresh = 0.5
    i = 0
    while i < len(srcimages) - 1:
        print("i= %d num of srcimages %d" % (i, len(srcimages)))
        patha = os.path.join(image_dir, srcimages[i])
        pathb = os.path.join(image_dir, srcimages[i + 1])
        # Close both files before a possible os.remove: open handles leak and
        # prevent deletion on some platforms.
        with Image.open(patha) as imga, Image.open(pathb) as imgb:
            sim = FuzzyImageCompare(imga, imgb).similarity() / 100
        print("image  %s  and image %s  sim= %s" % (patha, pathb, sim))
        if sim > similarity_thresh:
            print("delete  %s" % pathb)
            os.remove(pathb)
            srcimages.pop(i + 1)
        else:
            i = i + 1
在此之后还需要自己手动筛选图片,工作量其实也不小,不过经过去重还是可以减少不少工作量的。
爬取的图片需要自己标注,可以使用下面这些标注工具。
https://github.com/tzutalin/labelImg
LabelImg is a graphical image annotation tool and label object bounding boxes in images https://youtu.be/p0nR2YsCY_U
https://github.com/wkentaro/labelme
Image Polygonal Annotation with Python (polygon, rectangle, circle, line, point and image-level flag annotation).
https://github.com/Microsoft/VoTT
Visual Object Tagging Tool: An electron app for building end to end Object Detection Models from Images and Videos.
一般会按照 8:1:1 将数据集划分为训练集、验证集、测试集。这个要根据自己的情况编写 shell 脚本,下面是我用 darknet 训练 yolov3 模型时划分数据的脚本。
#!/bin/sh
# Flatten a dataset directory and split its labelled images 8:1:1 into
# train.txt, val.txt and test.txt (written to the current directory).
#
# Usage: <script> <dataset_dir>
if [ $# -ne 1 ]; then
    echo "Usage: $0 <dataset_dir>"
    exit 1
fi
path=$1

# Move the contents of every direct sub-directory up into $path, then delete
# the sub-directory.
# NOTE: files with the same name in different sub-directories overwrite each
# other; the original comment mentions adding a prefix, but no prefixing is
# implemented.
for sub_dir_path in "$path"/*
do
    if [ -d "$sub_dir_path" ]
    then
        mv "$sub_dir_path"/* "$path"
        rm -rf "$sub_dir_path"
    fi
done

# Start from an empty list, whether or not a previous run left one behind.
: > tmp.txt

# Append the path of every image that has a matching <name>.txt label file.
# Matching is done on the file extension only, so directory names containing
# "jpg"/"png" no longer select unrelated files.
find "$path" -type f \( -name '*.jpg' -o -name '*.jpeg' -o -name '*.png' \
    -o -name '*.JPG' -o -name '*.JPEG' -o -name '*.PNG' \) |
while IFS= read -r image
do
    txt="${image%.*}.txt"
    if [ -f "$txt" ]
    then
        echo "$image"
        echo "$image" >> tmp.txt
    fi
done

# Split tmp.txt 8:1:1:
#   1. count the lines of tmp.txt (it has no trailing empty line, so every
#      line is a real entry and none may be dropped)
#   2. compute the last line number of the train and val parts
#   3. append each line to the file of the part it belongs to
total=$(($(wc -l < tmp.txt)))
train_end=$((total * 8 / 10))
val_end=$((train_end + total / 10))

# Make sure all three lists exist even when a part is empty.
touch train.txt val.txt test.txt
awk -v train_end="$train_end" -v val_end="$val_end" '
    NR <= train_end { print >> "train.txt"; next }
    NR <= val_end   { print >> "val.txt"; next }
                    { print >> "test.txt" }
' tmp.txt
yolov3 的标注格式如下所示
9 0.732955 0.591102 0.270317 0.193503
统计标签的时候可以使用
awk '{print $1}' *.txt | sort -g | uniq -c
以上就是自己建立一个数据集的流程:爬取图片->整理图片->标注图片->训练。