1. 判断字符串中是否含有汉字。
def has_hz(text):
    """Return True if *text* contains at least one East Asian wide character.

    A character counts as a hanzi here when its Unicode East Asian Width
    class is 'W' (wide) or 'F' (fullwidth).  This is a width-based test, so
    other double-width characters (kana, hangul, fullwidth punctuation) are
    reported as well.

    Neutral, ambiguous and half-width characters (tab, newline, accented
    Latin letters, ...) are not reported, unlike a plain "not narrow" test.

    Items that carry no Unicode data, such as the characters of a Python 2
    byte string, are skipped.
    """
    for ch in text:
        try:
            width_class = unicodedata.east_asian_width(ch)
        except TypeError:
            # Not a Unicode character (e.g. Python 2 byte string): there is
            # no width information to inspect, so it cannot match.
            continue
        if width_class in ('W', 'F'):
            return True
    return False
单元测试:
# Inputs without any Chinese character must all be rejected.
assert not any(has_hz(sample) for sample in ("", " ", "123", u"123abc"))
# Inputs containing Chinese characters must all be accepted.
assert all(has_hz(sample) for sample in (u"123abc汉字", u"汉字"))
2. 每隔指定长度插入一个换行符(\n),一个汉字算2个字符长。
def _hz_char_width(ch):
    """Return the display width of one character: 2 if wide, otherwise 1."""
    try:
        is_wide = unicodedata.east_asian_width(ch) in ('W', 'F')
    except TypeError:
        # Not a Unicode character (e.g. Python 2 byte string): count it as
        # a single column.
        return 1
    return 2 if is_wide else 1


def get_hz_string_width(text):
    """Return the display width of *text*, counting each hanzi as 2.

    Characters whose East Asian Width class is 'W' or 'F' count as two
    columns; every other character (including tabs, newlines and accented
    Latin letters) counts as one.
    """
    return sum(_hz_char_width(ch) for ch in text)


def get_hz_sub_string(text, startat, sub_len=None):
    """Return a substring of *text* addressed by display columns.

    Columns are counted with each hanzi taking 2.  Characters are collected
    from the first one whose starting column is >= *startat* until the
    collected width reaches *sub_len*; a wide character that straddles the
    limit is kept, so the result may be one column wider than *sub_len*.
    With ``sub_len=None`` the rest of the string is returned.

    Usage:
        get_hz_sub_string(record, 0, 44)  # columns 0 to 43
        get_hz_sub_string(record, 44)     # column 44 to the end
    """
    if sub_len is not None and sub_len <= 0:
        # Nothing was requested; do not return the first character.
        return ''
    picked = []
    picked_width = 0
    pos = 0
    for ch in text:
        width = _hz_char_width(ch)
        if pos >= startat:
            picked.append(ch)
            picked_width += width
        pos += width
        # Track the width incrementally instead of re-measuring the joined
        # result on every character.
        if sub_len is not None and picked_width >= sub_len:
            break
    return ''.join(picked)


def insert_line_feed(my_str, interval, line_feed="\n"):
    """Insert *line_feed* into *my_str* every *interval* display columns.

    Each hanzi counts as 2 columns.  A line is closed as soon as its width
    reaches *interval*, so a wide character that straddles the limit stays
    on the current line and that line is one column wider than *interval*.
    No empty trailing segment is produced.

    Raises:
        ValueError: if *my_str* is not empty and *interval* is not positive.
    """
    if len(my_str) == 0:
        return ""
    if interval <= 0:
        raise ValueError(
            "interval must be a positive number, got %r" % (interval,))
    lines = []
    current = []
    current_width = 0
    # Single pass: stop when the text is exhausted rather than after a
    # pre-computed number of chunks, which over-counts when wide characters
    # overshoot the interval.
    for ch in my_str:
        current.append(ch)
        current_width += _hz_char_width(ch)
        if current_width >= interval:
            lines.append(''.join(current))
            current = []
            current_width = 0
    if current:
        lines.append(''.join(current))
    return line_feed.join(lines)
单元测试:
# Each case is (input text, interval, expected result); one hanzi is two
# columns wide.
assert all(
    insert_line_feed(text, interval) == expected
    for text, interval, expected in (
        ("", 1, ""),
        ("1", 1, "1"),
        ("1234567890", 5, "12345\n67890"),
        (u"汉字1汉字234567890", 5, u"汉字1\n汉字2\n34567\n890"),
        (u"汉字1汉字234567890", 4, u"汉字\n1汉字\n2345\n6789\n0"),
    )
)
3. 按指定长度为文字块分行(类似Word效果),并取消末尾的空行。
def wrap_text_block(text, line_length, do_trim=True):
    """Wrap every line of a text block to *line_length* display columns.

    The block is split on newline characters and each line is wrapped with
    insert_line_feed(), so existing line breaks are preserved and long
    lines are broken further.

    When *do_trim* is true, trailing whitespace of the whole block is
    removed first, which also removes any trailing blank lines.
    """
    if do_trim:
        # rstrip() removes trailing newlines together with other trailing
        # whitespace, so no separate scan for trailing blank lines is needed.
        text = text.rstrip()
    wrapped = []
    for line in text.split('\n'):
        wrapped.extend(insert_line_feed(line, line_length).split('\n'))
    # The u'' prefix keeps the historical return type: on Python 2 the
    # result is always a unicode object, even for byte-string input.
    return u'' + '\n'.join(wrapped)
单元测试:
# An empty block stays empty, with or without trimming.
assert wrap_text_block("", 1) == ""
assert wrap_text_block("", 1, do_trim=False) == ""
# Each case is (input text, line length, expected result) with the default
# trimming of trailing whitespace.
assert all(
    wrap_text_block(text, line_length) == expected
    for text, line_length, expected in (
        (u"文字1234", 2, u"文\n字\n12\n34"),
        (u"文字12345 ", 2, u"文\n字\n12\n34\n5"),
        (u"文字1\n234", 2, u"文\n字\n1\n23\n4"),
        (u"文字1\n2345 ", 2, u"文\n字\n1\n23\n45"),
    )
)