初次写 python 应用,比较简单的文件编码转换工具,用来将文件转换到指定编码,主要利用了 open 文件操作,os 目录遍历,chardet 编码探测,解决 movist(多字幕播放器) 只能正确读取 utf-8 字幕文件的问题,一次将目录下所有字幕都转换成 utf-8 编码.
ps:发现使用多线程后,时间反而会上升一倍,看来对于小任务线程还是开销比较大的
/Users/yiminghe/code/python/tools/encode.py :
# -*- coding: utf-8 -*-
"""Convert text files (typically .srt subtitles) to a target encoding in place.

Usage::

    python encode.py <file-or-directory> [target-encoding]

The target encoding defaults to utf-8.  When a directory is given, every
``.srt`` file below it is converted.  The source encoding is detected with
chardet and the original file is kept as ``<name>.bak`` before it is
overwritten.  The code runs on both Python 2.6+ and Python 3, which is why
every ``print`` call takes exactly one pre-formatted argument.
"""
import os
import shutil
import sys
import threading
import time
import traceback

try:
    from chardet.universaldetector import UniversalDetector
except ImportError:
    # Reported by main(); keeps the pure helpers importable without chardet.
    UniversalDetector = None

# Legacy Chinese charsets are decoded with gb18030, which is a superset of
# both gbk and gb2312.
encodes = {
    "gb2312": "gb18030",
    "gbk": "gb18030",
}

# Number of worker threads used when converting a directory.
# Measured by the author: 10/20/30 threads took 40.6 s, one thread 28.1 s.
THREAD_NUM = 10

# Guards the shared work list handed to fetchAndProcess().
_queue_lock = threading.Lock()


class HeEncodingEx(Exception):
    """Raised when the encoding of a file cannot be determined."""


def gb(encoding):
    """Normalise an encoding name reported by the detector.

    @param encoding{string|None}: encoding name, or None when detection failed
    @return{string}: lower-cased, stripped name; gb2312/gbk become gb18030
    @raise HeEncodingEx: when encoding is None
    """
    if encoding is None:
        raise HeEncodingEx("unknown encoding")
    encoding = encoding.strip().lower()
    return encodes.get(encoding, encoding)


def _detect_encoding(data):
    """Return chardet's guess for the encoding of the bytes in data (or None)."""
    if UniversalDetector is None:
        raise HeEncodingEx("chardet is not installed")
    detector = UniversalDetector()
    for line in data.splitlines(True):
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    return detector.result['encoding']


def transferToEncoding(filename, toCode):
    """Re-encode filename in place using the toCode text encoding.

    The file is read as bytes and decoded as a whole, so multi-byte encodings
    are not broken at line boundaries.  Decoding and encoding both happen
    before the file is opened for writing, so a failure there leaves the file
    untouched.  A backup ``filename.bak`` is created once, before the first
    overwrite.

    @param filename{string}: text file
    @param toCode{string}: text encoding code, gbk, utf-8 ... etc
    @return{boolean}: True on success (including "nothing to do"), False when
        filename is a directory or the conversion failed
    """
    if os.path.isdir(filename):
        print("error:not file")
        return False

    backup = filename + ".bak"
    modified = False
    data = b""
    try:
        with open(filename, "rb") as source:
            data = source.read()
        if not data:
            # An empty file is valid in every encoding: nothing to convert.
            return True

        encode = gb(_detect_encoding(data))
        if encode.lower() != toCode.lower():
            converted = data.decode(encode).encode(toCode)
            # Back up the original file, but never overwrite an older backup.
            if not os.path.exists(backup):
                shutil.copy(filename, backup)
            modified = True
            with open(filename, "wb") as target:
                target.write(converted)
    except BaseException as error:
        if modified:
            # Only undo our own write; put back exactly what was read.
            with open(filename, "wb") as target:
                target.write(data)
        if not isinstance(error, Exception):
            # Let Ctrl-C / SystemExit propagate instead of swallowing them.
            raise
        traceback.print_exc()
        return False
    finally:
        # Two blank lines after every processed file, as the original printed.
        print("")
        print("")
    return True


def fetchAndProcess(files, func, toCode="utf-8"):
    """Worker loop: take one entry at a time from files and run func on it.

    The lock is held only while the shared list is inspected and popped, and
    is always released (``with``), so a worker that finds the list empty can
    no longer leave the other workers blocked.

    @param files{list}: shared list of pending file names (consumed in place)
    @param func{function}: called as func(file_name, toCode)
    @param toCode{string}: target encoding forwarded to func
    """
    while True:
        with _queue_lock:
            if not files:
                return
            file_ = files.pop()
            print("%s  got :  %s" % (threading.current_thread().ident, file_))
        func(file_, toCode)


def _collect_subtitles(root):
    """Return the paths of all .srt files (any letter case) below root."""
    found = []
    for base, _folders, names in os.walk(root):
        for name in names:
            if name.lower().endswith(".srt"):
                found.append(os.path.join(base, name))
    return found


def main(argv=None):
    """Command-line entry point: convert one file or a whole directory tree.

    @param argv{list}: argument vector, defaults to sys.argv
    """
    if argv is None:
        argv = sys.argv
    start = time.time()
    if len(argv) < 2:
        print("erro argv! \nfilename toCoding")
        sys.exit(1)
    if UniversalDetector is None:
        print("error: the chardet package is required (pip install chardet)")
        sys.exit(1)

    # Default target encoding is utf-8.
    toCode = argv[2] if len(argv) > 2 else "utf-8"
    filename = argv[1]

    if os.path.isfile(filename):
        transferToEncoding(filename, toCode)
    else:
        # A folder: walk it and let THREAD_NUM workers drain the list.
        all_files = _collect_subtitles(filename)
        threads = [
            threading.Thread(
                target=fetchAndProcess,
                args=(all_files, transferToEncoding, toCode),
            )
            for _ in range(THREAD_NUM)
        ]
        for thread_ in threads:
            thread_.start()
        for thread_ in threads:
            thread_.join()

    print("consume time : %s" % (time.time() - start,))


if __name__ == "__main__":
    main()
/Users/yiminghe/code/python/tools/he_encode.sh :
#!/bin/bash -
# Thin launcher for encode.py.
# Arguments are forwarded as "$@" (rather than $*, $@ or "$*") so that each
# one reaches the script as a single word, even if it contains whitespace.
encoder=/Users/yiminghe/code/python/tools/encode.py
python "$encoder" "$@"
创建软链接
# Make the launcher executable; 755 is enough (777 would leave a script on
# PATH writable by every user).
chmod 755 /Users/yiminghe/code/python/tools/he_encode.sh
# Give the link the explicit name "he_encode" so it can be invoked as
# `he_encode`; linking into the directory alone would create
# /usr/bin/he_encode.sh instead.
ln -s /Users/yiminghe/code/python/tools/he_encode.sh /usr/bin/he_encode
运行:
定位在某个目录下运行即可:转换目录下的所有字幕文件为 utf-8 格式
he_encode .