http://www.cnblogs.com/joyeecheung/p/3842002.html
用python + openpyxl处理excel(07+)文档 + 一些中文处理的技巧
最近要帮做RA的老姐写个合并excel工作表的脚本……源数据是4000+个excel工作表，分布在9个xlsm文件里，文件内容是中英文混杂的一些数据，需要从每张表中提取需要的部分，分门别类合并到多个大的表里。
寻觅工具
确定任务之后第一步就是找个趁手的库来干活。 Python Excel上列出了xlrd、xlwt、xlutils这几个包，但是
它们都比较老，xlwt甚至不支持07版以后的excel
它们的文档不太友好，都可能需要去读源代码，而老姐的任务比较紧，加上我当时在期末，没有这个时间细读源代码
再一番搜索后我找到了openpyxl，支持07+的excel，一直有人在维护，文档清晰易读，参照Tutorial和API文档很快就能上手，就是它了~
安装
这个很容易，直接pip install openpyxl，呵呵呵~
因为我不需要处理图片，就没有装pillow。
一些考虑
源文件大约一个在1~2MB左右，比较小，所以可以直接读入内存处理。
既然是处理excel，何况他们整个组显然都是win下干活（数据都用excel存了= =，商科的人啊……），这个脚本还是在win下做吧
这个任务完全不需要我对现有的文件做修改！囧……我只要读入、处理、再写出另一个文件就行了
学习使用
嗯，就是打开cmd，然后用python的shell各种玩这个模块来上手……（win下没有装ipython，囧）
做这个小脚本基本上我只需要import两个东西
from openpyxl import Workbook
from openpyxl import load_workbook
load_workbook顾名思义是把文件导入到内存，Workbook是最基本的一个类，用来在内存里创建文件最后写进磁盘的。
干活
首先我需要导入这个文件
inwb = load_workbook(filename)
得到的就是一个workbook对象
然后我需要创建一个新的文件
outwb = Workbook()
接着在这个新文件里，用create_sheet新建几个工作表，比如
# Insert a worksheet titled 'career' at position 0 (the front of the workbook).
# Keyword arguments are used so the call does not depend on the positional
# order of create_sheet's parameters.
# NOTE(review): the positional order of (index, title) appears to differ
# between openpyxl releases — confirm against the installed version.
careerSheet = outwb.create_sheet(title='career', index=0)
就会从头部插入一个叫career的工作表（也就是说用法类似python list的insert）
接下来我需要遍历输入文件的每个工作表，并且按照表名做一些工作（e.g.如果表名不是数字，我不需要处理），openpyxl支持用字典一样的方式通过表名获取工作表，获取一个工作簿的表名的方法是get_sheet_names
# Walk every worksheet of the input workbook by name.
# NOTE(review): newer openpyxl releases may expose the names as a
# `sheetnames` property instead — confirm against the installed version.
for sheetName in inwb.get_sheet_names():
    # Sheets whose name is not purely numeric do not need processing.
    if not sheetName.isdigit():
        continue
    # Workbooks support dict-style lookup of a worksheet by its name.
    sheet = inwb[sheetName]
得到工作表之后，就是按列和行处理了。openpyxl会根据工作表里实际有数据的区域来确定行数和列数，获取行和列的方法是sheet.rows和sheet.columns，它们都可以像list一样用。比如，如果我想跳过数据少于2列的表，可以写
if len(sheet.columns) < 2:
continue
如果我想获取这个工作表的前两列，可以写
colA, colB = sheet.columns[:2]
除了用columns和rows来得到这个工作表的行列之外，还可以用excel的单元格编码来获取一个区域，比如
cells = sheet['A1':'B20']
有点像excel自己的函数，可以拉出一块二维的区域~
为了方便处理，遇到一个没有C列的工作表，我要创建一个和A列等长的空的C列出来，那么我可以用sheet.cell这个方法，通过传入单元格编号和添加空值来创建新列。
# Create an empty column C with as many cells as column A.
alen = len(colA)
for i in range(1, alen + 1):  # Excel row numbers start at 1, not 0
    # Touching the cell by its coordinate and assigning None creates it empty.
    sheet.cell('C%s' % (i)).value = None
注意：excel的单元格命名是从1开始的~
上面的代码也显示出来了，获取单元格的值是用cell.value（可以是左值也可以是右值），它的类型可以是字符串、浮点数、整数、或者时间（datetime.datetime），excel文件里也会生成对应类型的数据。
得到每个单元格的值之后，就可以进行操作了~openpyxl会自动将字符串用unicode编码，所以字符串都是unicode类型的。
除了逐个逐个单元格用cell.value修改值以外，还可以一行行append到工作表里
# append() adds one row and takes the row's cell values as a single iterable,
# so the three values are wrapped in a list rather than passed separately.
sheet.append([strA, dateB, numC])
最后，等新的文件写好，直接用workbook.save保存就行
outwb.save("test.xlsx")
这个会覆盖当前已有的文件，甚至你之前读取到内存的那个文件。
一些要注意的地方
如果要在遍历一列的每个单元格的时候获取当前单元格的在这个column对象里的下标
# enumerate() yields each cell together with its 0-based index in the column.
for idx, cell in enumerate(colA):
    # do something...
    pass
为了防止获取的数据两端有看不见的空格（excel文件里很常见的坑），记得strip()
如果工作表里的单元格没有数据，openpyxl会让它的值为None，所以如果要基于单元格的值做处理，不能预先假定它的类型，最好用
if not cell.value
continue
之类的语句来先行判断
如果要处理的excel文件里有很多noise，比如当你预期一个单元格是时间的时候，有些表的数据可能是字符串，这时候可以用
if isinstance(cell.value, unicode):
break
之类的语句处理。
win下的cmd似乎不太好设定用utf-8的code page，如果是简体中文的话可以用936（GBK），print的时候会自动从unicode转换到GBK输出到终端。
一些帮忙处理中文问题的小函数
我处理的表有一些超出GBK范围的字符，当我需要把一些信息print出来监控处理进度的时候非常麻烦，好在它们都是可以无视的，我直接用空格替换再print也行，所以加上一些我本来就要替换掉的分隔符，我可以：
复制代码
# annoying separators
dot = u'\u00b7'   # middle dot
dash = u'\u2014'  # em dash
emph = u'\u2022'  # bullet
dot2 = u'\u2027'  # hyphenation point
seps = (u'.', dot, dash, emph, dot2)


def get_clean_ch_string(chstring):
    """Replace the annoying separators in a Chinese string with spaces.

    Every occurrence of a character listed in ``seps`` (the ASCII full
    stop plus several dot/dash characters) is replaced by one space, so
    the result can afterwards be split on whitespace.

    Usage:
        cleanstring = get_clean_ch_string(chstring)

    :param chstring: the text to clean.
    :returns: a copy of ``chstring`` with each separator replaced by u' '.
    """
    cleanstring = chstring
    for sep in seps:
        cleanstring = cleanstring.replace(sep, u' ')
    return cleanstring
复制代码
此外我还有一个需求，是把英文名[空格]中文名分成英文姓、英文名、中文姓、中文名。
首先我需要能把英文和中文分割开，我的办法是用正则匹配，按照常见中英文字符在unicode的范围来套。匹配英文和中文的正则pattern如下：
# regex pattern matching all printable ascii characters (space through '~').
# An explicit range is used instead of pasting every character into the
# class: pasted raw, the adjacent '\' and ']' form the escape '\]', which
# silently drops the backslash from the set.
asciiPattern = u'[\\x20-\\x7e]+'
# regex pattern matching all common Chinese characters and separators
# (the CJK range U+4E00..U+9FFF, '.', space, and everything in seps)
chinesePattern = u'[\u4e00-\u9fff. %s]+' % (u''.join(seps))
英文就用ASCII可打印字符的范围替代，常见中文字符的范围是\u4e00-\u9fff，那个seps是前面提到过的超出GBK范围的一些字符。 除了简单的分割，我还需要处理只有中文名没有英文名、只有英文名没有中文名等情况，判断逻辑如下：
复制代码
def split_name(name):
    """Split "English name[space]Chinese name" into its two parts.

    If one of them is missing, None will be returned instead; a None or
    empty ``name`` yields ``(None, None)``.

    Usage:
        engName, chName = split_name(name)

    :param name: the raw name text, e.g. a cell value.
    :returns: tuple ``(engName, chName)``, each stripped of surrounding
        whitespace, or None when that part is absent.
    """
    import re  # local import: the surrounding file does not import re

    # Empty cells are read as None, so bail out before touching the regexes.
    if not name:
        return None, None

    matches = re.match(u'(%s) (%s)' % (asciiPattern, chinesePattern), name)
    if matches:  # English name + Chinese name
        return matches.group(1).strip(), matches.group(2).strip()

    chinese = u''.join(re.findall(chinesePattern, name)).strip()
    # chinesePattern also matches '.' and separators, so require at least
    # one real CJK character; otherwise "J. Smith" would be reported as a
    # Chinese-only name consisting of '.'.
    if re.search(u'[\u4e00-\u9fff]', chinese):  # Chinese name only
        return None, chinese

    # English name only (or nothing recognisable at all -> None)
    english = u''.join(re.findall(asciiPattern, name)).strip()
    return english or None, None
复制代码
得到了中文名之后，我需要分割成姓和名，因为任务要求不需要把姓名分割得很明确，我就按照常见的中文名姓名分割方式来分——两个字or三个字的第一个字是姓，四个字的前两个字是姓，名字带分隔符的（少数民族名字）分隔符前是姓（这里用到了前面的get_clean_ch_string函数来移除分隔符），名字再长一些又不带分割符的，假设整个字符串都是名字。（注意英语的first name 指的是名，last name指的是姓，2333）
复制代码
def split_ch_name(chName):
    """Split the Chinese name into first name and last name.

    * If the name is XY or XYZ, X will be returned as the last name.
    * If the name is WXYZ, WX will be returned as the last name.
    * If the name is ...WXYZ, the whole name will be returned
      as the last name (the first name is None).
    * If the name is ..ABC * XYZ..., the part before the separator
      will be returned as the last name and the part after it as the
      first name; any further parts are ignored.
    * If the name is None, empty, or consists only of separators,
      (None, None) is returned.

    Usage:
        chFirstName, chLastName = split_ch_name(chName)
    """
    # A missing Chinese name arrives as None (or an empty string).
    if not chName:
        return None, None

    if len(chName) < 4:  # XY or XYZ
        return chName[1:], chName[0]
    if len(chName) == 4:  # WXYZ
        return chName[2:], chName[:2]

    # longer: separators become spaces so the name can be split on them
    cleanName = get_clean_ch_string(chName)
    nameParts = cleanName.split()
    # progress output; the parenthesised form works as a print statement
    # and as a print function call alike
    print(u' '.join(nameParts))
    if not nameParts:  # nothing but separators / whitespace
        return None, None
    if len(nameParts) < 2:  # ...WXYZ
        return None, nameParts[0]
    chLastName, chFirstName = nameParts[:2]  # ..ABC * XYZ...
    return chFirstName, chLastName
复制代码
分割英文名就很简单了，空格分开，第一部分是名，第二部分是姓，其他情况暂时不管就行。