读取jpg文件的exif信息

照片有点多,准备按拍摄时间重新整理一下。于是用 Python 写了一个只读取自己感兴趣的 EXIF 信息的脚本,看看速度怎么样。



以前用.net写过一个,用的现成的exif类库:大约开启1~5个线程来分析所有目标图片文件,读取exif信息,放入一个队列中;大约开启20个左右的线程来进行重命名、拷贝和删除原始文件。


这个程序运行起来,速度会越来越慢,一直没找到关键原因在哪,觉得可能有的原因:

  1. 瓶颈在硬盘的IO上,copy操作比较多,感觉应该是主要原因,但是无法解释为什么运行速度会越来越慢
  2. 硬件问题,老笔记本Thinkpad R60
  3. .net本身比较慢,程序里加了强制垃圾回收,占用内存空间比较稳定,cpu占用率也不是特别高,线程比较多,但是基本互相之间没有什么资源竞争
  4. 由于照片都是单反拍的,文件大小比较大,exif信息比较多,用exifLib会读取整个图片信息并格式化所有exif信息,但是感觉对程序的性能影响比较小

刚试了一下这个 Python 脚本:没有使用多线程,速度还挺快。测试文件大约 2G,2 秒以内就能完成;全部 80 多 G 的照片整理下来不到 4 分钟。



# http://www.codeproject.com/Articles/43665/ExifLibrary-for-NET
# http://www.exiv2.org/tags.html
# http://www.awaresystems.be/imaging/tiff/tifftags.html

import os
import struct
import random
import datetime
import sys
import traceback

class exiftags:
	"""Numeric IDs of the TIFF/EXIF tags this script looks up.

	See the tag reference links in the comments above the imports.
	"""
	# DateTime tag.
	datetime = 0x0132
	# DateTimeOriginal tag; visitjpg uses it to name and sort the photos.
	datetime_original = 0x9003
	# DateTimeDigitized tag (the attribute name keeps its historical spelling).
	datetime_digited = 0x9004
	# Tag whose value is the offset of the EXIF IFD, relative to the TIFF header.
	exifpointer = 0x8769

class log:
	"""Module-wide run statistics shared by mgmjpg and visitjpg."""
	# Count of .jpg candidates visitjpg has attempted; mgmjpg resets it to 0
	# at the start of a run and prints it at the end.
	visited = 0

class jpg:
	"""Minimal JPEG reader that extracts selected raw EXIF field values.

	Only the tags the caller asks for are collected; everything else in the
	file is skipped. Values are handed back exactly as stored in the file
	(``str`` on Python 2, ``bytes`` on Python 3) with no decoding applied.
	"""

	# Tag holding the offset of the EXIF IFD (same value as
	# exiftags.exifpointer); kept local so the parser is self-contained.
	_EXIF_IFD_TAG = 0x8769
	# Identifier that opens an APP1 segment carrying EXIF data.
	_EXIF_HEADER = b'Exif\x00\x00'
	# Bytes per component for each TIFF field type (1=BYTE ... 12=DOUBLE).
	_TYPE_SIZES = {
		1: 1, 2: 1, 3: 2, 4: 4, 5: 8, 6: 1,
		7: 1, 8: 2, 9: 4, 10: 8, 11: 4, 12: 8,
	}

	def __init__(self, file_path):
		"""Remember *file_path*; the file is opened lazily by getEXIF."""
		self.__file_path = file_path
		self.__fo = None
		self.__endian = '>'
		self.__baseoffset = None
		self.exif = {}

	def __del__(self):
		self.__close()

	def __close(self):
		"""Close the underlying file if it is open."""
		if self.__fo is not None:
			self.__fo.close()
			self.__fo = None

	def __getfo(self):
		"""Return the file object, opening it in binary mode on first use."""
		if self.__fo is None:
			self.__fo = open(self.__file_path, 'rb')
		return self.__fo

	def __read_exact(self, size):
		"""Read exactly *size* bytes or raise IOError on a truncated file."""
		data = self.__getfo().read(size)
		if len(data) < size:
			raise IOError("unexpected end of file: %s" % (self.__file_path,))
		return data

	def __isjpg(self):
		"""Return True when the file starts with the JPEG SOI marker."""
		return self.__getfo().read(2) == b'\xff\xd8'

	def __find_exif_segment(self):
		"""Skip leading segments until the EXIF APP1 payload is reached.

		Returns True with the file positioned on the TIFF header, or False
		when image data starts (or the marker stream ends) without one.
		"""
		fo = self.__getfo()
		while True:
			marker = fo.read(2)
			if len(marker) < 2 or marker[0:1] != b'\xff':
				return False
			code = marker[1:2]
			# SOS / EOI: no metadata segments can follow.
			if code in (b'\xda', b'\xd9'):
				return False
			size = struct.unpack('>H', self.__read_exact(2))[0]
			# The length field counts its own two bytes; less is malformed.
			if size < 2:
				return False
			payload_start = fo.tell()
			if code == b'\xe1':
				if fo.read(6) == self._EXIF_HEADER:
					return True
				# APP1 is also used for other payloads; keep looking.
				print("NOT EXIF!")
			fo.seek(payload_start + size - 2, 0)

	def __read_tiff(self):
		"""Parse the TIFF header, IFD0 and the EXIF IFD into self.exif."""
		fo = self.__getfo()
		# All IFD offsets are relative to the start of the TIFF header.
		self.__baseoffset = fo.tell()
		order = self.__read_exact(2)
		if order == b'II':
			self.__endian = '<'
		elif order == b'MM':
			self.__endian = '>'
		else:
			print("Failed to get big-/little-endian")
			raise IOError("bad TIFF byte-order mark: %s" % (self.__file_path,))
		# TIFF marker, should always be 0x002A.
		self.__read_exact(2)
		ifd0 = struct.unpack(self.__endian + 'L', self.__read_exact(4))[0]
		if ifd0 == 0:
			print("Read 0th ifd failed...")
			return
		# IFD0 carries the EXIF pointer and may also hold requested tags
		# (e.g. DateTime), so collect both in a single pass.
		found = dict.fromkeys(self.exif)
		found[self._EXIF_IFD_TAG] = None
		fo.seek(self.__baseoffset + ifd0, 0)
		self.__read_IFD(found)
		for tagid in list(self.exif):
			if found[tagid] is not None:
				self.exif[tagid] = found[tagid]
		pointer = found[self._EXIF_IFD_TAG]
		if pointer is None:
			print("Read EXIF IFD offset failed...")
			return
		if len(pointer) != 4:
			raise IOError("malformed EXIF IFD pointer: %s" % (self.__file_path,))
		exif_ifd = struct.unpack(self.__endian + 'L', pointer)[0]
		if exif_ifd == 0:
			print("exif pointer is 0")
			return
		fo.seek(self.__baseoffset + exif_ifd, 0)
		self.__read_IFD(self.exif)

	def __read_IFD(self, tags):
		"""Read every field of the IFD at the current file position."""
		fieldcount = struct.unpack(self.__endian + 'H', self.__read_exact(2))[0]
		for _ in range(fieldcount):
			self.__read_IFD_Field(tags)

	def __read_IFD_Field(self, tags):
		"""Read one 12-byte IFD entry; store its value if its tag is in *tags*.

		Values of at most four bytes are stored as the raw 4-byte value slot;
		longer values are fetched from the offset the slot points to.
		Fields with an unknown type are skipped, leaving the caller's default.
		"""
		entry = self.__read_exact(12)
		tagid, fieldtype, count = struct.unpack(self.__endian + 'HHL', entry[:8])
		if tagid not in tags:
			return
		unit = self._TYPE_SIZES.get(fieldtype)
		if unit is None:
			return
		value = entry[8:]
		nbytes = unit * count
		if nbytes > 4:
			fo = self.__getfo()
			resume = fo.tell()
			offset = struct.unpack(self.__endian + 'L', value)[0]
			fo.seek(self.__baseoffset + offset, 0)
			value = self.__read_exact(nbytes)
			fo.seek(resume, 0)
		tags[tagid] = value

	def getEXIF(self, tags):
		"""Fill *tags* with the raw values of the EXIF fields it names.

		*tags* maps tag IDs to default values. Entries found in IFD0 or the
		EXIF IFD are overwritten in place and the same dict is exposed as
		``self.exif``. Entries that are not found keep their defaults.

		Raises IOError if the file cannot be read or is truncated/corrupt.
		"""
		self.exif = tags
		try:
			if not self.__isjpg():
				print("file " + self.__file_path + " is not jpg file")
				return
			if self.__find_exif_segment():
				self.__read_tiff()
		finally:
			# Release the handle right away; a later call simply reopens it.
			self.__close()

def testjpg1():
	"""Manual smoke check: print the DateTime tag of /tmp/1.jpg as hex-id=value."""
	wanted = {exiftags.datetime: ''}
	reader = jpg('/tmp/1.jpg')
	reader.getEXIF(wanted)
	for tagid in wanted:
		label = hex(tagid)
		print(label + "=" + reader.exif[tagid])
	# Drop the reader explicitly so its file handle is released now.
	del reader

def _exif_datetime(raw):
	"""Parse a raw EXIF 'YYYY:MM:DD HH:MM:SS' value, ignoring trailing NUL padding."""
	if not isinstance(raw, str):
		# The value arrives as bytes on Python 3; decode it once here.
		raw = raw.decode('ascii', 'replace')
	return datetime.datetime.strptime(raw.rstrip("\0"), '%Y:%m:%d %H:%M:%S')


def _free_path(path):
	"""Return *path*, or *path* with a numeric suffix if that name is already taken."""
	if not os.path.exists(path):
		return path
	stem, ext = os.path.splitext(path)
	serial = 1
	while os.path.exists("%s_%d%s" % (stem, serial, ext)):
		serial += 1
	return "%s_%d%s" % (stem, serial, ext)


def visitjpg(destdir, dirname, names):
	"""Move every .jpg in one directory into a per-day folder under *destdir*.

	The signature follows the directory-walk callback convention:
	*destdir* is the user argument, *dirname* the directory being visited
	and *names* the entries inside it.

	Each photo is renamed to ``<date>_<time>.jpg`` using its
	DateTimeOriginal EXIF tag and placed in ``destdir/<date>/``. Files with
	no usable date are reported and left where they are. I/O failures on
	one file are reported and do not stop the remaining files; any other
	unexpected exception is reported and re-raised.
	"""
	# Local import: the module is not among the file-level imports.
	import shutil

	for name in names:
		# Match on the real extension, case-insensitively.
		if not name.lower().endswith('.jpg'):
			continue
		origpath = os.path.join(dirname, name)
		tags = {exiftags.datetime_original: None}
		try:
			log.visited += 1
			j = jpg(origpath)
			j.getEXIF(tags)
			raw = j.exif[exiftags.datetime_original]
			del j
			if raw is None:
				print("failed to get exif of: " + dirname + "/" + name)
				continue
			try:
				dt = _exif_datetime(raw)
			except ValueError:
				# A blank or corrupt date must not abort the whole run.
				print("failed to get exif of: " + dirname + "/" + name)
				continue
			# Get date aggregate folder
			day = dt.date().isoformat()
			dtdir = os.path.join(destdir, day)
			if not os.path.exists(dtdir):
				os.mkdir(dtdir)
			if not os.path.isdir(dtdir):
				print("failed to initialize dir: " + dtdir)
				continue
			stamp = dt.time().isoformat().replace(":", "-")
			newpath = _free_path(os.path.join(dtdir, day + "_" + stamp + ".jpg"))
			# shutil.move renames when possible and falls back to
			# copy + delete when source and destination are on different
			# file systems.
			shutil.move(origpath, newpath)
		except (IOError, OSError):
			print("IOError: failed to rename: " + dirname + "/" + name)
			traceback.print_exc(file=sys.stdout)
		except Exception:
			print("Filed while rename: " + dirname + "/" + name)
			traceback.print_exc(file=sys.stdout)
			raise

def mgmjpg(srcdir, destdir):
	"""Sort every .jpg under *srcdir* into date folders below *destdir*.

	Walks *srcdir* recursively, hands each directory to visitjpg, then
	prints the number of files visited and the elapsed time. *destdir* is
	created if missing. Prints a message and returns without doing any work
	if *srcdir* is not a directory or *destdir* exists but is not one.
	"""
	if not os.path.isdir(srcdir):
		print("src: " + str(srcdir) + " is not a directory")
		return
	start = datetime.datetime.now()
	log.visited = 0
	if not os.path.exists(destdir):
		os.makedirs(destdir)
	elif not os.path.isdir(destdir):
		print("dest: " + str(destdir) + " is not a directory")
		return
	# os.walk exists on both Python 2 and 3; os.path.walk was removed in 3.
	for dirname, _subdirs, filenames in os.walk(srcdir):
		visitjpg(destdir, dirname, filenames)
	# Sample the clock once so the finish time and the duration agree.
	finished = datetime.datetime.now()
	print("visited %s files" % log.visited)
	print("started when: %s, finished when: %s, cost: %s" % (str(start), str(finished), str(finished - start)))

def testexif():
	"""Manual check: print the three date tags of one known sample photo."""
	sample = '/Volumes/DATA/Pictures/Dudu1/2013-01-03/2013-01-03_19-53-22_47.jpg'
	wanted = {
		exiftags.datetime: None,
		exiftags.datetime_original: None,
		exiftags.datetime_digited: None,
	}
	j = jpg(sample)
	j.getEXIF(wanted)
	# Same three lines, in the same order, driven from a small table.
	report = (
		("datetime: %s", exiftags.datetime),
		("original: %s", exiftags.datetime_original),
		("digited: %s", exiftags.datetime_digited),
	)
	for template, tagid in report:
		print(template % j.exif[tagid])

if __name__ == '__main__':
	# Script entry point. The commented-out call is an alternative
	# source/destination pair kept for reference.
	#mgmjpg('/root/src', '/tmp')
	# Sort the photos under the first directory into date folders in the second.
	mgmjpg('/Volumes/DATA/Pictures/Dudu1', '/Volumes/DATA/Pictures/Dudu')


你可能感兴趣的:(Python)