关键字: 汉字字符串
1. 判断字符串中是否含有汉字。
- def has_hz(text):
- hz_yes = False
- for ch in text:
- if isinstance(ch, unicode):
- if unicodedata.east_asian_width(ch)!= 'Na' :
- hz_yes = True
- break
- else :
- continue
-
- return hz_yes
def has_hz(text):
    """Return True if *text* contains at least one wide East Asian character.

    A character counts when its Unicode East Asian Width class is 'W' (wide)
    or 'F' (fullwidth); this is used as a proxy for "contains Chinese
    characters". Items that are not text characters (for example the
    characters of a Python 2 byte string) are never counted.

    Args:
        text: The string to inspect. May be empty.

    Returns:
        bool: True when a wide/fullwidth character is found, else False.
    """
    # ``unicode`` on Python 2, ``str`` on Python 3 -- avoids referencing the
    # Python 2-only ``unicode`` builtin by name.
    text_type = type(u'')
    # Only 'W' and 'F' are accepted: a looser "anything but 'Na'" test would
    # also match neutral/ambiguous-width characters such as accented Latin
    # letters, which are not Chinese characters.
    return any(
        isinstance(ch, text_type)
        and unicodedata.east_asian_width(ch) in ('W', 'F')
        for ch in text
    )
单元测试:
- assert not has_hz("")
- assert not has_hz( " " )
- assert not has_hz( "123" )
- assert not has_hz(u "123abc" )
- assert has_hz(u "123abc汉字" )
- assert has_hz(u "汉字" )
# Unit checks for has_hz, written as (input, expected result) pairs.
_has_hz_cases = (
    ("", False),
    (" ", False),
    ("123", False),
    (u"123abc", False),
    (u"123abc汉字", True),
    (u"汉字", True),
)
for _sample, _expected in _has_hz_cases:
    assert bool(has_hz(_sample)) == _expected
2. 每隔指定长度插入一个换行符(/n),一个汉字按 2 个字符长度计算。
- def get_hz_string_width(text):
-
-
-
- s = 0
- for ch in text:
- if isinstance(ch, unicode):
- if unicodedata.east_asian_width(ch)!= 'Na' :
- s += 2
- else :
- s += 1
- else :
- s += 1
- return s
-
- def get_hz_sub_string(text,startat,sub_len= None ):
-
-
-
-
-
-
-
- s = []
- pos = 0
- for ch in text:
- if pos >= startat:
- s.append(ch)
- if isinstance(ch, unicode):
- if unicodedata.east_asian_width(ch)!= 'Na' :
- pos += 2
- else :
- pos += 1
- else :
- pos += 1
- if sub_len!= None and get_hz_string_width( '' .join(s))>=sub_len:
- break
- return '' .join(s)
-
- def insert_line_feed(my_str,interval,line_feed= "/n" ):
-
- if len(my_str)== 0 :
- return ""
-
- n = int((get_hz_string_width(my_str)-1 )/interval)+ 1
- str_list = []
- k = 1
- pos_start = 0
- while k <= n:
- sub_str = get_hz_sub_string(my_str,pos_start,interval)
- str_list.append(sub_str)
- k = k + 1
- pos_start = pos_start + get_hz_string_width(sub_str)
-
- return line_feed.join(str_list)
def get_hz_string_width(text):
    """Return the display width of a string that may contain Chinese text.

    Each text character whose East Asian Width class is anything other than
    'Na' (narrow) counts as 2 columns; every other item counts as 1.

    Args:
        text: The string to measure. May be empty.

    Returns:
        int: The total width in columns (0 for an empty string).
    """
    # ``unicode`` on Python 2, ``str`` on Python 3 -- avoids referencing the
    # Python 2-only ``unicode`` builtin by name.
    text_type = type(u'')
    # NOTE(review): "not 'Na'" also covers the neutral ('N'), ambiguous ('A')
    # and halfwidth ('H') classes, not only wide CJK characters -- confirm
    # that counting those as 2 columns is intended.
    return sum(
        2 if (isinstance(ch, text_type)
              and unicodedata.east_asian_width(ch) != 'Na') else 1
        for ch in text
    )
def get_hz_sub_string(text, startat, sub_len=None):
    """Return a substring of *text* addressed by display column.

    Columns are counted with the same rule as get_hz_string_width: a text
    character whose East Asian Width class is not 'Na' occupies 2 columns,
    anything else occupies 1.

    Usage:
        get_hz_sub_string(record, 0, 44)  # columns 0 through 43
        get_hz_sub_string(record, 44)     # column 44 to the end

    Args:
        text: The source string.
        startat: Column at which the substring starts. Characters that begin
            before this column are skipped.
        sub_len: Target width in columns, or None to take everything up to
            the end of *text*.

    Returns:
        The selected characters joined into one string. The character that
        reaches or crosses *sub_len* is still included, so the result can be
        one column wider than *sub_len* when that character is 2 columns wide.
    """
    # ``unicode`` on Python 2, ``str`` on Python 3 -- avoids referencing the
    # Python 2-only ``unicode`` builtin by name.
    text_type = type(u'')
    picked = []
    picked_width = 0  # running width of ``picked``; avoids re-measuring it
    pos = 0           # column at which the current character starts
    for ch in text:
        if isinstance(ch, text_type) and unicodedata.east_asian_width(ch) != 'Na':
            ch_width = 2
        else:
            ch_width = 1
        if pos >= startat:
            picked.append(ch)
            picked_width += ch_width
        pos += ch_width
        # Checked after appending, so the character that fills (or overshoots)
        # the requested width is part of the result.
        if sub_len is not None and picked_width >= sub_len:
            break
    return ''.join(picked)
def insert_line_feed(my_str, interval, line_feed="/n"):
    """Insert *line_feed* into *my_str* every *interval* display columns.

    Width is counted with the same rule as get_hz_string_width: a text
    character whose East Asian Width class is not 'Na' occupies 2 columns,
    anything else occupies 1. A segment is closed as soon as its width
    reaches *interval*; the character that reaches it stays in that segment,
    so a segment can be one column wider than *interval* when it ends with a
    2-column character.

    Args:
        my_str: The text to break up. An empty value yields "".
        interval: Target segment width in columns; must be at least 1.
        line_feed: Separator placed between segments.

    Returns:
        The segments joined by *line_feed*. No separator is added after the
        last segment.

    Raises:
        ValueError: If *my_str* is non-empty and *interval* is less than 1.
    """
    # NOTE(review): the default separator is the two-character string "/n",
    # not a newline. It is presumably a mangled newline escape, but the unit
    # tests in this file assert on "/n", so it is kept as is -- confirm.
    if not my_str:
        return ""
    if interval < 1:
        raise ValueError("interval must be a positive integer")

    # ``unicode`` on Python 2, ``str`` on Python 3 -- avoids referencing the
    # Python 2-only ``unicode`` builtin by name.
    text_type = type(u'')
    segments = []
    current = []
    current_width = 0
    # Single pass: segments are cut where the text actually runs out of room,
    # so no empty trailing segment (and no trailing separator) is produced
    # when wide characters make earlier segments overshoot.
    for ch in my_str:
        current.append(ch)
        if isinstance(ch, text_type) and unicodedata.east_asian_width(ch) != 'Na':
            current_width += 2
        else:
            current_width += 1
        if current_width >= interval:
            segments.append(''.join(current))
            current = []
            current_width = 0
    if current:
        segments.append(''.join(current))
    return line_feed.join(segments)
单元测试:
- assert insert_line_feed(" ",1)==" "
- assert insert_line_feed( "1" , 1 )== "1"
- assert insert_line_feed( "1234567890" , 5 )== "12345/n67890"
- assert insert_line_feed(u "汉字1汉字234567890" , 5 )==u "汉字1/n汉字2/n34567/n890"
- assert insert_line_feed(u "汉字1汉字234567890" , 4 )==u "汉字/n1汉字/n2345/n6789/n0"
# Unit checks for insert_line_feed: (text, interval, expected result).
_insert_line_feed_cases = (
    ("", 1, ""),
    ("1", 1, "1"),
    ("1234567890", 5, "12345/n67890"),
    (u"汉字1汉字234567890", 5, u"汉字1/n汉字2/n34567/n890"),
    (u"汉字1汉字234567890", 4, u"汉字/n1汉字/n2345/n6789/n0"),
)
for _text, _interval, _expected in _insert_line_feed_cases:
    assert insert_line_feed(_text, _interval) == _expected
3. 按指定长度为文字块分行(类似Word效果),并取消末尾的空行。
- def wrap_text_block(text,line_length,do_trim= True ):
- if do_trim:
- str_list = split(text.rstrip(),'/n' )
- else :
- str_list = split(text,'/n' )
-
-
- text_to_line = -1
- if do_trim:
- i = len(str_list)-1
- while i > 0 :
- line_str = str_list[i]
- if len(line_str.strip())== 0 :
- text_to_line = i
- i -= 1
- else :
- break
-
- new_str_list = []
- i = 0
- for obj in str_list:
- if do_trim and i == text_to_line:
- break
- new_str_list += split(insert_line_feed(obj,line_length),'/n' )
- i += 1
-
-
- return u '' + '/n' .join(new_str_list)
def wrap_text_block(text, line_length, do_trim=True):
    """Re-wrap a block of text so each line is about *line_length* columns.

    The block is split on the "/n" separator, every resulting line is wrapped
    with insert_line_feed, and the pieces are joined with "/n" again.

    Args:
        text: The text block; lines are separated by the string "/n".
        line_length: Target line width in columns, passed to
            insert_line_feed as its interval.
        do_trim: When true, trailing whitespace is stripped from *text* and
            blank lines at the end of the block are dropped. The first line
            is always kept, even if it is blank.

    Returns:
        The wrapped block as a text (unicode) string.
    """
    # Use the string's own split method rather than a free ``split``
    # function, which only exists in the Python 2 ``string`` module.
    source_lines = (text.rstrip() if do_trim else text).split('/n')

    if do_trim:
        # Find where the trailing blank lines begin and cut them off; the
        # ``> 1`` bound means index 0 is never removed.
        keep = len(source_lines)
        while keep > 1 and not source_lines[keep - 1].strip():
            keep -= 1
        source_lines = source_lines[:keep]

    wrapped = []
    for line in source_lines:
        wrapped.extend(insert_line_feed(line, line_length).split('/n'))

    # Original author's note: without the u'' prefix this raised
    # "'unicode' object is not callable" (cause not visible here); the prefix
    # also makes the result a text string.
    return u'' + '/n'.join(wrapped)
单元测试:
- assert wrap_text_block(" ",1)==" "
- assert wrap_text_block(" ",1,do_trim=False)==" "
-
- assert wrap_text_block(u "文字1234" , 2 )==u "文/n字/n12/n34"
- assert wrap_text_block(u "文字12345 " , 2 )==u "文/n字/n12/n34/n5"
-
- assert wrap_text_block(u "文字1/n234" , 2 )==u "文/n字/n1/n23/n4"
- assert wrap_text_block(u "文字1/n2345 " , 2 )==u "文/n字/n1/n23/n45"
# Unit checks for wrap_text_block: (text, line_length, do_trim, expected).
# do_trim=True is the function's default, so passing it explicitly is
# equivalent to omitting it.
_wrap_text_block_cases = (
    ("", 1, True, ""),
    ("", 1, False, ""),
    (u"文字1234", 2, True, u"文/n字/n12/n34"),
    (u"文字12345 ", 2, True, u"文/n字/n12/n34/n5"),
    (u"文字1/n234", 2, True, u"文/n字/n1/n23/n4"),
    (u"文字1/n2345 ", 2, True, u"文/n字/n1/n23/n45"),
)
for _text, _length, _trim, _expected in _wrap_text_block_cases:
    assert wrap_text_block(_text, _length, do_trim=_trim) == _expected