#!/usr/bin/env python
#encoding=utf-8
import redis,codecs,sys,time,datetime,doctest,re
# Python 2 only: re-import sys so that setdefaultencoding is reachable, then
# make UTF-8 the process-wide default codec. The implicit str <-> unicode
# conversions further down in this file rely on this setting.
reload(sys)
sys.setdefaultencoding('utf8')
class Unbuffered:
    """Stream proxy that flushes the wrapped stream after every write."""

    def __init__(self, stream):
        # The real stream every call is forwarded to.
        self.stream = stream

    def write(self, data):
        """Write data to the wrapped stream and flush it right away."""
        target = self.stream
        target.write(data)
        target.flush()

    def __getattr__(self, attr):
        # Anything not defined on the proxy is looked up on the wrapped stream.
        return getattr(self.stream, attr)
# Route stdout through the flushing proxy so every print is flushed at once.
sys.stdout = Unbuffered(sys.stdout)
def read_keys():
    """Dump every key of redis db 6 on localhost to query_keys.txt.

    The number of keys and each key are echoed to stdout; the file receives
    one key per line, UTF-8 encoded.
    """
    # The client has to exist before it is queried: the original called
    # r.keys() before assigning r, which raised UnboundLocalError.
    r = redis.Redis(host='localhost', db=6)
    keys = r.keys()
    print(len(keys))
    # "with" guarantees the file is closed even if a write fails.
    with codecs.open("query_keys.txt", "w", "utf-8") as f:
        for key in keys:
            print(key)
            f.write("%s\n" % (key,))
def read_relevent_words():
    """Dump the value of every key of redis db 6 to query_relevent_words.txt.

    Values are written one per line, in the order redis returns the keys.
    """
    # The client has to exist before it is queried: the original called
    # r.keys() before assigning r, which raised UnboundLocalError.
    r = redis.Redis(host='localhost', db=6)
    keys = r.keys()
    print(len(keys))
    # "with" guarantees the file is closed even if a lookup or write fails.
    with codecs.open("query_relevent_words.txt", "w", "utf-8") as f:
        for key in keys:
            f.write("%s\n" % (r.get(key),))
def parser_one_line_one_words():
    """Explode query_relevent_words.txt into one word per line.

    Each input line is stripped and split on "*"; every piece is written on
    its own line to parser_one_line_one_words.txt.
    """
    # Both handles are context-managed; the original never closed the input.
    with codecs.open("parser_one_line_one_words.txt", "w", "utf-8") as ff:
        with codecs.open("query_relevent_words.txt", "r", "utf-8") as f:
            for line in f.readlines():
                for elem in line.strip().split("*"):
                    ff.write("%s\n" % (elem,))
def parser_one_line_one_words2():
    """Explode query_relevent_words.txt and report the number of distinct words.

    Same output file as parser_one_line_one_words(); in addition the stripped
    pieces are collected in a set whose size is printed at the end.
    """
    s = set()
    # Both handles are context-managed; the original never closed the input.
    with codecs.open("parser_one_line_one_words.txt", "w", "utf-8") as ff:
        with codecs.open("query_relevent_words.txt", "r", "utf-8") as f:
            for line in f.readlines():
                for elem in line.strip().split("*"):
                    # The set holds the stripped form; the file gets the piece as split.
                    s.add(elem.strip())
                    ff.write("%s\n" % (elem,))
    print(len(s))
def compare_pareser_one_line_one_words_result_lost_line_for_tmp():
f1=codecs.open("parser_one_line_one_words_uniq.txt","r","utf-8")
f2=codecs.open("parser_one_line_one_words_uniq_result.txt","r","utf-8")
count=0
for a,b in zip(f1.readlines(),f2.readlines()):
count+=1
if a.strip()<>b.replace(" ","").strip():
print count,a,b
time.sleep(5)
def build_invert_index():
    """
    Build an inverted index over the segmented wname file.

    Every space-separated token of ../output_result_process becomes a redis
    set (db 1) holding the 1-based numbers of the lines it occurs on.
    Commands are queued on a pipeline and sent every 10000 lines, with a
    final flush for the remainder.
    """
    r = redis.Redis(db=1)
    p = r.pipeline()
    count = 0
    # Context-managed so the input handle is released (the original leaked it).
    with codecs.open("../output_result_process", "r", "utf-8") as src:
        for line in src.readlines():
            count += 1
            for elem in line.strip().split(" "):
                p.sadd(elem.strip(), count)
            if count % 10000 == 0:
                print(count)
                print("batch insert to redis ...")
                s = datetime.datetime.now()
                p.execute()
                e = datetime.datetime.now()
                # .seconds is the whole-second part of the elapsed time.
                print("done:%s" % ((e - s).seconds))
    # Send whatever is still queued after the last full batch.
    p.execute()
def is_chinese(uchar):
"""
判断一个unicode是否是汉字
>>> is_chinese(u"人")
True
>>> is_chinese("人")
True
>>> is_chinese("1")
False
>>> is_chinese(" ")
False
"""
if type(uchar)==type(""):
u=uchar.decode("utf-8","ignore")
else:
u=uchar.encode("utf-8","ignore")
if len(u)!=len(uchar):
return True
else:
return False
#if uchar >= u'\u4e00' and uchar<=u'\u9fa5':
# return True
#else:
# return False
def is_number(uchar):
    """Return True when uchar compares within u'0'..u'9' (inclusive).

    This is a plain string comparison, so it is meant for a single character.
    """
    return uchar >= u'\u0030' and uchar <= u'\u0039'
def is_alphabet(uchar):
    """
    Return True when uchar compares within A-Z or a-z (inclusive).

    This is a plain string comparison, so it is meant for a single character.
    Disabled examples:
    #>>> is_alphabet(u"t")
    #True
    #>>> is_alphabet("t")
    #True
    """
    in_upper = uchar >= u'\u0041' and uchar <= u'\u005a'
    if in_upper:
        return True
    in_lower = uchar >= u'\u0061' and uchar <= u'\u007a'
    return in_lower
def is_other(uchar):
    """Return True when uchar passes none of is_chinese, is_number, is_alphabet."""
    # Checks run in the same order as before and stop at the first hit.
    for check in (is_chinese, is_number, is_alphabet):
        if check(uchar):
            return False
    return True
def _filter(line):
    """
    Drop the tokens of a segmented wname line that is_other() rejects.

    The line is stripped and split on single spaces; byte-string tokens are
    decoded from UTF-8 first. The surviving tokens are re-joined with spaces.
    """
    kept = []
    for raw in line.strip().split(" "):
        token = raw.strip()
        if type(token) != type(u""):
            token = token.decode("utf-8", "ignore")
        if not is_other(token):
            kept.append(token)
    return " ".join(kept)
def post_process_wname_segments_illegal_characters():
    """Filter every line of ../output_result into ../output_result_process.

    Each line goes through _filter(); the result is echoed to stdout and
    written followed by a newline.
    """
    # Both handles are context-managed; the original leaked the input handle.
    with codecs.open("../output_result_process", "w", "utf-8") as f:
        with codecs.open("../output_result", "r", "utf-8") as src:
            for line in src.readlines():
                # Filter once and reuse the result (it used to be computed twice).
                s = _filter(line)
                print(s)
                f.write(s + "\n")
def _pipeline_set_aligned_files(p, keys_path, values_path):
    """Queue SET key -> value for two line-aligned files, flushing every 10000 pairs."""
    # Read both files up front so the handles are closed (they used to leak).
    with codecs.open(keys_path, "r", "utf-8") as f1:
        key_lines = f1.readlines()
    with codecs.open(values_path, "r", "utf-8") as f2:
        value_lines = f2.readlines()
    count = 0
    # zip stops at the shorter file, exactly as before.
    for a, b in zip(key_lines, value_lines):
        count += 1
        p.set(a.strip(), b.strip())
        if count % 10000 == 0:
            print(count)
            print("batch insert to redis ...")
            s = datetime.datetime.now()
            p.execute()
            e = datetime.datetime.now()
            print("done:%s" % ((e - s).seconds))
    # Send whatever is still queued after the last full batch.
    p.execute()


def build_word_segments_hash_map():
    """
    Store the word -> segmentation mapping in redis db 2.

    Two line-aligned file pairs are loaded: the relevant words with their pku
    segmentation, then the query keys with theirs. A background save is
    requested once both pairs are loaded.
    """
    r2 = redis.Redis(db=2)
    p = r2.pipeline()
    _pipeline_set_aligned_files(
        p,
        "parser_one_line_one_words_uniq.txt",
        "parser_one_line_one_words_uniq_result_pku.txt",
    )
    _pipeline_set_aligned_files(
        p,
        "query_keys.txt",
        "query_keys_result_pku.txt",
    )
    r2.bgsave()
def _build_list_for_inter_args(s1,s2):
"""
将分词后的字符串组合成一个list形式反加给r.sinter使用,去除无用的东西
"""
r=[]
r.extend(s1.split(" "))
r.extend(s2.split(" "))
return [elem.strip() for elem in r if elem.strip()<>""]
def final_find_synomns_out():
    """
    Write the (query, relevant word) pairs whose segments share index entries.

    query_keys.txt and query_relevent_words.txt are read in lockstep. For each
    "*"-separated relevant word, the segmentations of the query and of the
    word are fetched from redis db 2 and all their tokens are intersected
    with SINTER on db 1. A non-empty intersection writes "query|||word" to
    synomns_pku.txt.
    """
    r1 = redis.Redis(db=1)
    r2 = redis.Redis(db=2)
    # Read the inputs up front so their handles are closed (they used to leak).
    with codecs.open("query_keys.txt", "r", "utf-8") as f1:
        query_lines = f1.readlines()
    with codecs.open("query_relevent_words.txt", "r", "utf-8") as f2:
        relevant_lines = f2.readlines()
    validateCount = 0
    # "with" closes the output even when a redis call raises part-way through.
    with codecs.open("synomns_pku.txt", "w", "utf-8") as f:
        for a, b in zip(query_lines, relevant_lines):
            query = a.strip()
            query_segments = r2.get(query)
            for elem in b.split("*"):
                word = elem.strip()
                if word == "":
                    continue
                sinter_args = _build_list_for_inter_args(query_segments, r2.get(word))
                if len(r1.sinter(*sinter_args)) > 0:
                    validateCount += 1
                    if validateCount % 1000 == 0:
                        print("validateCount:%s\n" % validateCount)
                    f.write("%s|||%s\n" % (query, word))
                    f.flush()
def interactive_mode():
while(True):
r1=redis.Redis(db=1)
r2=redis.Redis(db=2)
input=raw_input("input query|||relevent_word:\n")
a,b=input.strip().split("|||")
query_segments=r2.get(a.strip())
print a.strip(),"==>",query_segments
print b.strip(),"==>",r2.get(b.strip())
print r1.sinter(*_build_list_for_inter_args(query_segments,r2.get(b.strip())))
print "========="
def c1(line):
    """
    Space check: True when either side of "a|||b" contains a space.

    >>> c1("执手|||把手")
    False
    """
    left, right = line.strip().split("|||")
    for side in (left, right):
        if len(side.split(" ")) > 1:
            return True
    return False
# Module-level client for redis db 2; c4() reads it as a global.
# NOTE(review): db 2 looks like the word -> segmentation map written by
# build_word_segments_hash_map() — confirm.
r2=redis.Redis(db=2)
def c4(s1,s2):
    """
    Return True when the two words are equal or one token set covers the other.

    Each word is replaced by the value stored under it in redis db 2 when the
    key exists, then split on spaces into a token set. The result is True
    only when the intersection is non-empty and at least as large as the
    smaller of the two sets.

    NOTE(review): the lines starting with "#>>>" are disabled examples. Where
    they directly follow an active example, doctest presumably reads them as
    part of that example's expected output — confirm and separate them.

    #>>> c4("尤利西斯","追忆逝水年华")
    False
    #>>> c4("A B C","A B")
    True
    >>> c4("无线鼠套装","无线键鼠套装")
    False
    #>>> c4("A B","A C")
    False
    #>>> c4("A B","A C")
    False
    #>>> c4("A","A")
    True
    >>> c4("行政职业能力测验真题","行测真题")
    False
    #>>> c4("B","C")
    False
    """
    # Identical strings match without touching redis.
    if s1==s2:
        return True
    global r2
    set1=set()
    set2=set()
    # Swap the word for its stored segmentation when redis has one.
    if r2.exists(s1):
        s1=r2.get(s1).strip()
    if s1.find(" ")>-1:
        set1=set([elem.strip() for elem in s1.split(" ") if elem.strip()<>""])
    else:
        set1=set([s1.strip()])
    if r2.exists(s2):
        s2=r2.get(s2).strip()
    if s2.find(" ")>-1:
        set2=set([elem.strip() for elem in s2.split(" ") if elem.strip()<>""])
    else:
        set2=set([s2.strip()])
    inster=set1 & set2
    # No shared token at all.
    if inster.__len__()==0:
        return False
    # Some token of the smaller set is missing from the other set.
    if inster.__len__()<min(set1.__len__(),set2.__len__()):
        return False
    else:
        return True
def c3(line):
    """
    Flag "a|||b" pairs whose multi-token sides contain a non-letter token.

    A side counts only when it has more than one space-separated token; such
    a side is flagged when any of its tokens is not made solely of ASCII
    letters. With two multi-token sides both must be flagged; with none the
    result is False.

    >>> c3("执手|||把手")
    False
    >>> c3("the north face|||tnf")
    False
    >>> c3("the 大north face|||tnf")
    True
    >>> c3("wd1tb|||i5 2320")
    True
    """
    def is_letters_only(token):
        # Letters only; digits do not qualify.
        return re.match(r"[a-zA-Z]{1,}\Z", token.strip()) is not None

    def has_non_letter_token(tokens):
        # Only ever called with two or more tokens.
        for token in tokens:
            if not is_letters_only(token):
                return True
        return False

    a, b = line.strip().split("|||")
    a_tokens = a.split(" ")
    b_tokens = b.split(" ")
    a_multi = len(a_tokens) > 1
    b_multi = len(b_tokens) > 1
    if a_multi and b_multi:
        return has_non_letter_token(a_tokens) & has_non_letter_token(b_tokens)
    if b_multi:
        return has_non_letter_token(b_tokens)
    if a_multi:
        return has_non_letter_token(a_tokens)
    return False
def c2(line):
    """
    Substring check: True when one side of "a|||b" contains the other.

    >>> c2("执手|||把手")
    False
    >>> c2("浓缩咖啡|||咖啡")
    True
    """
    left, right = line.strip().split("|||")
    if left in right:
        return True
    return right in left
def filter_synonym_result():
    """
    Filter synomns_pku.txt into synomns_pku_filter.txt.

    The input holds the query / relevant-word pairs found through the pku
    segmentation. A line is kept only when neither side contains a space
    (c1) and neither side is a substring of the other (c2).
    """
    # Both handles are context-managed; the original leaked the input handle.
    with codecs.open("synomns_pku_filter.txt", "w", "utf-8") as f:
        with codecs.open("synomns_pku.txt", "r", "utf-8") as src:
            for line in src.readlines():
                if not c1(line) and not c2(line):
                    f.write(line)
def test_redis_is_ready():
    """
    Check that redis is up by printing the INFO of the default connection.
    """
    server_info = redis.Redis().info()
    print(server_info)
def pivot_query_relvent_word_order_and_intersation_size():
    """
    Pivot the filtered synonym pairs into redis db 3 and dump them to a file.

    Layout in db 3:
        key   = query
        field = relevant word
        value = list [relevant word order, intersection size]
                (step2 stores the order, step3 appends the size)

    The work is split into nested steps that are enabled by hand; as written
    only step4() is executed.

    NOTE(review): the lists are read back with eval(), so they are presumably
    stored as their str() form — confirm against the redis client in use.
    """
    debug=False
    r1=redis.Redis(db=1)
    r2=redis.Redis(db=2)
    r3=redis.Redis(db=3)
    #r3.flushdb()
    p=r3.pipeline()
    def step1():
        """
        Initialise the hashes from synomns_pku_filter.txt with empty lists.
        """
        count=0
        for line in codecs.open("synomns_pku_filter.txt","r","utf-8").readlines():
            count+=1
            a,b=line.split("|||")
            a=a.strip()
            b=b.strip()
            p.hset(a,b,[])
            # Send the queued commands every 10000 lines.
            if count%10000==0:
                p.execute()
                print "执行一次批量提交redis操作"
            # Debug mode stops after the first line.
            if count==1 and debug==True:
                break
        p.execute()
    #step1()
    def step2():
        """
        Store the 1-based position of each relevant word in its hash value.
        """
        count=0
        exists_count=0
        not_exists_count=0
        f1=codecs.open("query_keys.txt","r","utf-8")
        f2=codecs.open("query_relevent_words.txt","r","utf-8")
        for a,b in zip(f1.readlines(),f2.readlines()):
            count+=1
            a=a.strip()
            b=b.strip()
            for idx,elem in enumerate(b.split("*")):
                element=elem.strip()
                if element=="":
                    continue
                # Only pairs already created by step1 are updated.
                if r3.hexists(a,element):
                    exists_count+=1
                    r3.hset(a,element,[idx+1])
                else:
                    not_exists_count+=1
            if count%10000==0:
                print "exists_count:%s"%exists_count
                print "not_exists_count:%s"%not_exists_count
            # Debug mode stops after the first line.
            if count==1 and debug==True:
                break
        print "exists_count:%s"%exists_count
        print "not_exists_count:%s"%not_exists_count
        print "step2 finished"
    #step2()
    def test_step1_and_step2_is_ok():
        """
        Spot-check one stored pair: its value must evaluate to a list.
        """
        result=r3.hget("透明茶杯","茶具")
        # NOTE(review): eval() on data read from redis — only safe while the
        # database content is trusted.
        if type([])==type(eval(result)):
            print "正确"
        else:
            print "不正确"
    #test_step1_and_step2_is_ok()
    def step3():
        """
        Recompute the intersection of every filtered pair and append its size
        to the stored list.
        """
        count=0
        validateCount=0
        for line in codecs.open("synomns_pku_filter.txt","r","utf-8").readlines():
            a,b=line.strip().split("|||")
            a=a.strip()
            b=b.strip()
            count+=1
            query_segments=r2.get(a)
            intersation_len=r1.sinter(*_build_list_for_inter_args(query_segments,r2.get(b))).__len__()
            if intersation_len>0:
                # NOTE(review): eval() on data read from redis — only safe
                # while the database content is trusted.
                list_result=eval(r3.hget(a,b))
                # Exactly one element (the order from step2) is expected here;
                # anything else aborts the whole run.
                if len(list_result)<>1:
                    print a,b
                    print type(a),type(b)
                    print "ERROR"
                    exit(-1)
                list_result.append(intersation_len)
                r3.hset(a,b,list_result)
                validateCount+=1
                if validateCount%1000==0:
                    print "validateCount:%s\n"%validateCount
        print "final validateCount %s"%validateCount
    #step3()
    def step4():
        """
        Write every hash of db 3 to synomns_pku_filter_process.txt, one query
        per line followed by "|||word,[order, size]" entries.
        """
        # Comparator for (word, [order, size]) items: larger size first, ties
        # broken by larger order first. Shadows the builtin cmp locally.
        def cmp(x,y):
            if x[1][1]<y[1][1]:
                return 1
            elif x[1][1]>y[1][1]:
                return -1
            else:
                if x[1][0]<y[1][0]:
                    return 1
                elif x[1][0]>y[1][0]:
                    return -1
                else:
                    return 0
        f=codecs.open("synomns_pku_filter_process.txt","w","utf-8")
        # Example of the sorted items: [('b', [2, 4]), ('a', [1, 3])]
        count=0
        for key in r3.keys():
            count+=1
            print count
            f.write("%s"%key)
            z=r3.hgetall(key)
            # Turn the stored text back into lists.
            # NOTE(review): eval() on data read from redis — only safe while
            # the database content is trusted.
            for k,v in z.iteritems():
                z[k]=eval(v)
            for elem in sorted(z.items(),cmp):
                word,orders=elem
                f.write("|||%s,%s"%(word,str(orders)))
            f.write("\n")
            f.flush()
        f.close()
    step4()
def _find_short_name(s1,s2):
"""
>>> _find_short_name("行测","行政能力测试")
True
>>> _find_short_name("AB","ABC")
False
>>> _find_short_name("A","D")
False
"""
if len(s1)>=len(s2):
return False
if s1 in s2:
return False
return set(s1).issubset(set(s2))
def find_short_name():
    """
    Print the pairs of synomns_pku_filter.txt where one side abbreviates the other.

    A pair "a|||b" is printed when _find_short_name() holds in either
    direction.
    """
    # Context-managed so the handle is closed (the original leaked it).
    with codecs.open("synomns_pku_filter.txt", "r", "utf-8") as src:
        lines = src.readlines()
    for line in lines:
        a, b = line.strip().split("|||")
        a = a.strip()
        b = b.strip()
        if _find_short_name(a, b) or _find_short_name(b, a):
            print("%s|||%s" % (a, b))
def find_short_name2():
    """
    Collect candidate pairs from the raw query / relevant-word files.

    query_keys.txt and query_relevent_words.txt are read in lockstep. Every
    non-empty "*"-separated relevant word forms a "query|||word" line, which
    is printed and written to short_name_global.txt when c3, c2 and c4 all
    reject it. Progress is printed every 10000 queries and totals at the end.
    """
    # Read the inputs up front so their handles are closed (they used to leak).
    with codecs.open("query_keys.txt", "r", "utf-8") as f1:
        query_lines = f1.readlines()
    with codecs.open("query_relevent_words.txt", "r", "utf-8") as f2:
        relevant_lines = f2.readlines()
    count = 0
    validateCount = 0
    # "with" closes the output even when a check raises part-way through.
    with codecs.open("short_name_global.txt", "w", "utf-8") as f:
        for a, b in zip(query_lines, relevant_lines):
            count += 1
            a = a.strip()
            b = b.strip()
            for elem in b.split("*"):
                element = elem.strip()
                if element == "":
                    continue
                line = "%s|||%s\n" % (a, element)
                if not c3(line) and not c2(line) and not c4(a, element):
                    validateCount += 1
                    print("validateCount:%s" % validateCount)
                    print(line)
                    f.write(line)
                    f.flush()
            if count % 10000 == 0:
                print("cout===========>%s" % count)
    print("validateCount:%s" % validateCount)
    print("cout===========>%s" % count)
def test_sorted():
    """Print a sample list ordered like the output of the pivot step.

    Items are (name, [first, second]) pairs, ordered by second descending and
    then by first descending.
    """
    a = [('a', [1, 2]), ("b", [0, 2]), ("c", [-1, 3])]
    ranked = sorted(a, key=lambda item: (item[1][1], item[1][0]), reverse=True)
    print(ranked)
def _find_only_one_word_difference(line):
    """
    Placeholder: always returns True and ignores its argument.

    >>> _find_only_one_word_difference("毛领毛衣|||毛领衣服")
    True
    """
    return True
def find_only_one_word_difference():
    """Keep the pairs of short_name_global.txt that differ in one character.

    A line "a|||b" is written to short_name_global_filter.txt when a and b
    have the same length but are not equal, their character sets share
    exactly len(a)-1 members, and at least one side has a leftover character
    whose first sampled member is not a digit.
    """
    # Both handles are context-managed; the original leaked the input handle.
    with codecs.open("./short_name_global_filter.txt", "w", "utf-8") as f:
        with codecs.open("./short_name_global.txt", "r", "utf-8") as src:
            for line in src.readlines():
                a, b = line.strip().split("|||")
                if len(a) != len(b) or a == b:
                    continue
                set1 = set(a)
                set2 = set(b)
                m = len(a) - 1
                if m <= 0 or len(set1 & set2) != m:
                    continue
                only_a = set1 - set2
                only_b = set2 - set1
                # str() keeps the digit test limited to what str.isdigit accepts.
                a_differs = len(only_a) > 0 and not str(list(only_a)[0]).isdigit()
                b_differs = len(only_b) > 0 and not str(list(only_b)[0]).isdigit()
                if a_differs or b_differs:
                    f.write(line)
def find_human_names():
    """
    Pick likely person-name pairs out of short_name_global.txt.

    Example of the kind of pair looked for: 苏轼 / 苏东坡.

    A line "a|||b" is written to short_name_global_xin.txt when both sides
    are shorter than five characters and start with the same surname, taken
    as either their first two characters or their first character. The size
    of the de-duplicated raw surname list is printed first.
    """
    xins=['白','毕','卞','蔡','曹','岑','常','车','陈','成','程','池','邓','丁','范','方','樊','费','冯','符','傅','甘','高','葛','龚','古','关','郭','韩','何','贺','洪','侯','胡','华','黄','霍','姬','简','江','姜','蒋','金','康','柯','孔','赖','郎','乐','雷','黎','李','连','','梁','廖','林','凌','刘','柳','龙','卢','鲁','陆','路','吕','罗','骆','马','梅','孟','莫','母','穆','倪','宁','欧','区','潘','彭','','皮','齐','戚','钱','强','秦','丘','邱','饶','任','沈','盛','施','石','时','史','司徒','苏','孙','谭','汤','唐','陶','田','童','涂','王','危','韦','卫','魏','温','文','翁','巫','邬','吴','伍','武','席','夏','萧','谢','辛','邢','徐','许','薛','严','颜','杨','叶','易','殷','尤','于','余','俞','虞','元','袁','岳','云','曾','詹','张','章','赵','郑','钟','周','邹','朱','褚','庄','卓']
    xins+=['李','王','张','刘','陈','黄','周','吴','徐','孙','胡','朱','高','林','何','郭','马','罗','梁','宋','郑','谢','韩','唐','冯','于','董','萧','程','曹','袁','邓','许','傅','沈','曾','彭','吕','苏','卢','蒋','蔡','贾','丁','魏','薛','叶','阎','余','潘','杜','戴','夏','','汪','田','任','姜','范','方','石','姚','谭','廖','邹','熊','金','陆','郝','孔','白','崔','康','毛','邱','秦','江','史','顾','侯','','孟','龙','万','段','章','钱','汤','尹','黎','易','常','武','乔','贺','赖','龚','文']
    xins+=['鲍俎','百里','碧鲁','伯赏','北堂','陈林','淳于','第五','东方','东郭','东门','段干','独孤','端木','范姜','哥舒','公良','公孙','公西','公冶','公羊','缑亢','谷梁','归海','赫连','胡母','呼延','黄方','皇甫','即墨','夹谷','晋楚','况后','梁丘','令狐','陆费','闾丘','闾邱','明哲','墨哈','慕容','万俟','南宫','南郭','南门','年爱','欧阳','濮阳','漆雕','亓官','屈突','壤驷','汝鄢','司马','司空','司寇','司徒','官','商牟','申屠','侍其','疏束','叔孙','太史','太叔','澹台','涂钦','拓拔','完完','完颜','王子','闻人','微生','巫马','乌雅','铁笔','夏','许世','轩辕','闫法','羊舌','阳佟','耶律','有琴','尉迟','余佴','宇文','岳帅','乐正','宰父','子车','子阳','宗政','左丘','张简','章佳','长孙','郑余','仲孙','钟离','诸葛','颛孙']
    xins+=['付']
    xins+=['李','王','张','刘','陈','杨','黄','孙','周','吴','徐','赵','朱','马','胡','郭','林','何','高','梁','郑','罗','宋','谢','唐','韩','曹','许','邓','萧','冯','曾','程','蔡','彭','潘','袁','于','董','余','苏','叶','吕','魏','蒋','田','杜','丁','沈','姜','范','江','傅','','卢','汪','戴','崔','任','陆','廖','姚','方','金','邱','夏','谭','韦','贾','邹','石','熊','孟','秦','阎','薛','侯','雷','白','龙','','郝','孔','邵','史','毛','常','万','顾','赖','武','康','贺','严','尹','钱','施','牛','洪','龚','汤','陶','黎','温','莫','易','樊','','文','安','殷','颜','庄','章','鲁','倪','庞','邢','俞','翟','蓝','聂','齐','向','申','葛','柴','伍','覃','骆','关','焦','柳','欧','','纪','尚','毕','耿','芦','左','季','管','符','辛','苗','詹','曲','欧阳','靳','祁','路','涂','兰','甘','裴','梅','童','翁','霍','游','阮','尤','岳','柯','牟','滕','谷','舒','卜','成','饶','宁','凌','盛','查','单','冉','鲍','华','包','屈','房','喻','解','蒲','卫','简','时','连','车','项','闵','邬','吉','党','阳','司','费','蒙','席','晏','隋','古','强','穆','姬','宫','景','米','麦','谈','柏','瞿','艾','沙','鄢','桂','窦','郁','缪','畅','巩','卓','褚','栾','戚','全','娄','甄','郎','池','丛','边','岑','农','苟','迟','保','商','臧','','卞','虞','刁','冷','应','匡','栗','仇','练','楚','揭','师','官','佟','封','燕','桑','巫','敖','原','植','邝','仲','荆','储','宗','','干','苑','寇','盖','南','屠','鞠','荣','井','乐','银','奚','明','麻','雍','花','闻','冼','木','郜','廉','衣','蔺','和','冀','占','','门','帅','利','满','陈生']
    xins=set(xins)
    print(len(xins))
    # Normalise the surnames to text and drop the empty placeholders. Decoding
    # here makes the membership test independent of the process-wide default
    # encoding, and a set keeps each lookup cheap.
    surnames = set()
    for elem in xins:
        name = elem.strip()
        if name == "":
            continue
        if isinstance(name, bytes):
            name = name.decode("utf-8")
        surnames.add(name)
    # Both handles are context-managed; the original leaked the input handle.
    with codecs.open("./short_name_global_xin.txt", "w", "utf-8") as f:
        with codecs.open("./short_name_global.txt", "r", "utf-8") as src:
            for line in src.readlines():
                a, b = line.strip().split("|||")
                a = a.strip()
                b = b.strip()
                same_surname = (a[:2] == b[:2] and a[:2] in surnames) or \
                               (a[:1] == b[:1] and a[:1] in surnames)
                # The length limit applies to both surname forms. The original
                # "A or B and len(a)<5 and len(b)<5" only limited the
                # single-character case because "and" binds tighter than "or".
                if same_surname and len(a) < 5 and len(b) < 5:
                    f.write(line)
def extrace_names():
    """
    One-off helper: print surname lists as Python list literals.

    Four pasted sources (an HTML table, a numbered list, a list of compound
    surnames with pinyin, and a space-separated table) are parsed in turn and
    each result is printed in the form ['x','y',...].
    """
    # Source 1: HTML table cells; the surname is the text after "target=_blank>".
    subject="""
<TD width=90><a href=/zaobao/chinese/surname/pages/story_bai2.html target=_blank>白</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_bi4.html target=_blank>毕</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/bian040600.html target=_blank>卞</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_cai4.html target=_blank>蔡</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_cao2.html target=_blank>曹</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_cen2.html target=_blank>岑</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/chang040600.html target=_blank>常</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_che.html target=_blank>车</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_chen2.html target=_blank>陈</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/cheng030100.html target=_blank>成</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_cheng2.html target=_blank>程</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_chi2.html target=_blank>池</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_deng4.html target=_blank></a>邓</TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ding.html target=_blank>丁</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_fan4.html target=_blank>范</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_fang.html target=_blank></a>方</TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/fan140600.html target=_blank>樊</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/fei140600.html target=_blank>费</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_feng2.html target=_blank>冯</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_fu2.html target=_blank>符</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_fu4.html target=_blank>傅</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_gan.html target=_blank>甘</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_gao.html target=_blank>高</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/ge170100.html target=_blank>葛</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_gong.html target=_blank>龚</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_gu3.html target=_blank>古</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_guan.html target=_blank>关</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_guo.html target=_blank>郭</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_han2.html target=_blank>韩</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_he2.html target=_blank>何</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/he140600.html target=_blank>贺</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_hong2.html target=_blank>洪</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_hou2.html target=_blank>侯</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_hu2.html target=_blank>胡</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_hua4.html target=_blank>华</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_huang2.html target=_blank>黄</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/huo010600.html target=_blank>霍</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/ji030100.html target=_blank>姬</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_jian3.html target=_blank>简</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_jiang1.html target=_blank>江</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_jiang.html target=_blank>姜</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_jiang3.html target=_blank>蒋</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_jin.html target=_blank>金</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_kang.html target=_blank>康</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ke.html target=_blank>柯</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_kong3.html target=_blank>孔</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_lai4.html target=_blank>赖</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/lang170100.html target=_blank>郎</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/le140600.html target=_blank>乐</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_lei2.html target=_blank>雷</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_li2.html target=_blank>黎</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_li3.html target=_blank>李</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_lian2.html target=_blank>连</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/lian140600.html target=_blank>廉</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/liang030100.html target=_blank>梁</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_liao4.html target=_blank>廖</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_lin2.html target=_blank>林</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ling2.html target=_blank>凌</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_liu2.html target=_blank>刘</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_liu3.html target=_blank>柳</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_long2.html target=_blank>龙</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_lu2.html target=_blank>卢</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/lu170100.html target=_blank>鲁</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_lu4.html target=_blank>陆</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/lu140600.html target=_blank>路</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_lv3.html target=_blank>吕</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_luo2.html target=_blank>罗</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_luo4.html target=_blank>骆</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ma3.html target=_blank>马</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_mei2.html target=_blank>梅</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/meng140600.html target=_blank>孟</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_mo4.html target=_blank>莫</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/mu030100.html target=_blank>母</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/mu130700.html target=_blank>穆</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ni2.html target=_blank>倪</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ning2.html target=_blank>宁</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ou.html target=_blank>欧</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/ou030100.html target=_blank>区</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_pan.html target=_blank>潘</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_peng2.html target=_blank>彭</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_pu2.html target=_blank>蒲</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/pi130700.html target=_blank>皮</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/qi130700.html target=_blank>齐</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/qi030100.html target=_blank>戚</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_qian2.html target=_blank>钱</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/qiang310500.html target=_blank>强</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_qing2.html target=_blank>秦</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/qiu030100.html target=_blank>丘</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_qiu.html target=_blank>邱</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_rao2.html target=_blank>饶</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ren2.html target=_blank>任</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_shen3.html target=_blank>沈</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/sheng010600.html target=_blank>盛</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_shi.html target=_blank>施</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_shi2.html target=_blank>石</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/shi300500.html target=_blank>时</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_shi3.html target=_blank>史</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/situ030100.html target=_blank>司徒</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_su.html target=_blank>苏</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_sun.html target=_blank>孙</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_tan2.html target=_blank>谭</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_tang.html target=_blank>汤</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_tang2.html target=_blank>唐</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_tao2.html target=_blank>陶</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_tian2.html target=_blank>田</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/tong040600.html target=_blank>童</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_tu2.html target=_blank>涂</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wang2.html target=_blank>王</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/wei010600.html target=_blank>危</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wei3.html target=_blank>韦</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/wei180100a.html target=_blank>卫</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wei4.html target=_blank>魏</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wen.html target=_blank>温</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wen2.html target=_blank>文</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_weng.html target=_blank>翁</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wu.html target=_blank>巫</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wu1.html target=_blank>邬</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wu2.html target=_blank>吴</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wu3.html target=_blank>伍</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wu3a.html target=_blank>武</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/xi040600.html target=_blank>席</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xia4.html target=_blank>夏</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xiao.html target=_blank>萧</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xie4.html target=_blank>谢</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xin.html target=_blank>辛</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xing2.html target=_blank>邢</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xu2.html target=_blank>徐</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xu3.html target=_blank>许</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xue.html target=_blank>薛</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_yan2.html target=_blank>严</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_yan2a.html target=_blank>颜</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_yang2.html target=_blank>杨</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ye4.html target=_blank>叶</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_yi4.html target=_blank>易</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/yin020600.html target=_blank>殷</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_you2.html target=_blank>尤</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/yu010600.html target=_blank>于</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_yu2.html target=_blank>余</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/yu020600a.html target=_blank>俞</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/yu020600.html target=_blank>虞</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/yuan310500.html target=_blank>元</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_yuan2.html target=_blank>袁</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/yue030100.html target=_blank>岳</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_yun2.html target=_blank>云</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zeng.html target=_blank>曾</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhan.html target=_blank>詹</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhang.html target=_blank>张</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhang1.html target=_blank>章</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhao4.html target=_blank>赵</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zheng4.html target=_blank>郑</a></TD>
</TR>
<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhong.html target=_blank>钟</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhou.html target=_blank>周</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhou1.html target=_blank>邹</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhu.html target=_blank>朱</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/zhu180100.html target=_blank>褚</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhuang.html target=_blank>庄</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhuo.html target=_blank>卓</a></TD>
"""
    # Capture everything between "target=_blank>" and "</TD>", then strip the
    # closing "</a>" from the joined text before printing.
    result = re.findall(r"target=_blank>(?P<name>[\s\S]*?)</TD>", subject)
    print ("['"+"','".join(result)+"']").replace("</a>","")
    # Source 2: numbered list; the surname is the last character of each entry.
    html=u"""
1李
2王
3张
4刘
5陈
6杨
7赵
8黄
9周
10吴
11徐
12孙
13胡
14朱
15高
16林
17何
18郭
19马
20罗
21梁
22宋
23郑
24谢
25韩
26唐
27冯
28于
29董
30萧
31程
32曹
33袁
34邓
35许
36傅
37沈
38曾
39彭
40吕
41苏
42卢
43蒋
44蔡
45贾
46丁
47魏
48薛
49叶
50阎
51余
52潘
53杜
54戴
55夏
56钟
57汪
58田
59任
60姜
61范
62方
63石
64姚
65谭
66廖
67邹
68熊
69金
70陆
71郝
72孔
73白
74崔
75康
76毛
77邱
78秦
79江
80史
81顾
82侯
83邵
84孟
85龙
86万
87段
88章
89钱
90汤
91尹
92黎
93易
94常
95武
96乔
97贺
98赖
99龚
100文
"""
    list2=[]
    # NOTE(review): the separator is a space followed by a newline, so this
    # presumably expects every pasted entry to end with a trailing space —
    # not visible here, confirm against the real file.
    for line in html.strip().split(" \n"):
        list2.append("'"+line[-1]+"'")
    print "[" + ",".join(list2) + "]"
    # Source 3: compound surnames with pinyin in parentheses, separated by "、".
    html=u"""
鲍俎(bao zu)、百里(bai li)、碧鲁(bi lu)、伯赏(bo shang)、北堂(bei tang)
单于(chan yu)、陈林(chen lin)、淳于(chun yu)、
第五(di wu)、 东方(dong fang)、东郭(dong guo)、东门(dong men)、段干(duan gan)、独孤(du gu)、端木(duan mu)、
范姜(fan jiang)、
哥舒(ge shu)、公良(gong liang)、公孙(gong sun)、公西(gong xi)、公冶(gong yan)、公羊(gong yang)、缑亢(gou kang)、谷梁(gu liang)、归海(gui hai)、
赫连(he lian)、胡母(hu mu)、呼延(hu yan)、黄方(huang fang)、皇甫(huang fu)、
即墨(ji mo)、夹谷(jia gu)、晋楚(jin chu)、
况后(kuang hou)、
梁丘(liang qiu)、令狐(ling hu)、陆费(lu fei)、闾丘(lv qiu)、闾邱(lv qiu)、
明哲(ming zhe)、墨哈(mo ha)、慕容(mu rong)、万俟(mò qí)
钠兰(na lan)、南宫(nan gong)、南郭(nan guo)、南门(nan men)、年爱(nian ai)、
欧阳(ou yang)、
濮阳(pu yang)、
漆雕(qi diao)、亓官(qi guan)、屈突(qu tu)、
壤驷(rang si)、汝鄢(ru yan)、
司马(si ma)、司空(si kong)、司寇(si kou)、司徒(si tu)、上官(shang guan)、商牟(shang mou)、申屠(shen tu)、侍其(shi qi)、疏束(shu su)、叔孙(shu sun)、
太史(tai shi)、太叔(tai shu)、澹台(tan tai)、涂钦(tu qin)、拓拔(tuo ba)、
完完(wan wan)、完颜(wan yan)、王子(wang zi)、闻人(wen ren)、微生(wei sheng)、巫马(wu ma)、乌雅(wu ya)、铁笔(tie bi)
西门(xi men)、夏侯(xia hou)、许世(xu shi)、轩辕(xuan yuan)、
闫法(yan fa)、羊舌(yang she)、阳佟(yang tong)、耶律(ye lv)、有琴(you qin)、尉迟(yu chi)、余佴(yu er)、宇文(yu wen)、岳帅(yue shuai)、乐正(yue zheng)、
宰父(zai fu)、子车(zi che)、子阳(zi yang)、宗政(zong zheng)、左丘(zuo qiu)、张简(zhang jian)、章佳(zhang jia)、长孙(zhang sun)、郑余(zheng yu)、仲孙(zhong sun)、钟离(zhong li)、诸葛(zhu ge)、颛孙(zhuan sun)、
"""
    list3=[]
    # Lines are split on "\r\n"; within a line each "、"-separated entry
    # contributes its first two characters.
    for line in html.strip().split("\r\n"):
        list3.extend(["'" + elem.strip()[:2] + "'" for elem in line.split("、") if elem.strip()<>""])
    for elem in list3:
        print elem
    print "[" + ",".join(list3) + "]"
    # Source 4: space-separated surname table.
    html=u"""
李 王 张 刘 陈 杨 黄 孙 周 吴
徐 赵 朱 马 胡 郭 林 何 高 梁
郑 罗 宋 谢 唐 韩 曹 许 邓 萧
冯 曾 程 蔡 彭 潘 袁 于 董 余
苏 叶 吕 魏 蒋 田 杜 丁 沈 姜
范 江 傅 钟 卢 汪 戴 崔 任 陆
廖 姚 方 金 邱 夏 谭 韦 贾 邹
石 熊 孟 秦 阎 薛 侯 雷 白 龙
段 郝 孔 邵 史 毛 常 万 顾 赖
武 康 贺 严 尹 钱 施 牛 洪 龚
汤 陶 黎 温 莫 易 樊 乔 文 安
殷 颜 庄 章 鲁 倪 庞 邢 俞 翟
蓝 聂 齐 向 申 葛 柴 伍 覃 骆
关 焦 柳 欧 祝 纪 尚 毕 耿 芦
左 季 管 符 辛 苗 詹 曲 欧阳 靳
祁 路 涂 兰 甘 裴 梅 童 翁 霍
游 阮 尤 岳 柯 牟 滕 谷 舒 卜
成 饶 宁 凌 盛 查 单 冉 鲍 华
包 屈 房 喻 解 蒲 卫 简 时 连
车 项 闵 邬 吉 党 阳 司 费 蒙
席 晏 隋 古 强 穆 姬 宫 景 米
麦 谈 柏 瞿 艾 沙 鄢 桂 窦 郁
缪 畅 巩 卓 褚 栾 戚 全 娄 甄
郎 池 丛 边 岑 农 苟 迟 保 商
臧 佘 卞 虞 刁 冷 应 匡 栗 仇
练 楚 揭 师 官 佟 封 燕 桑 巫
敖 原 植 邝 仲 荆 储 宗 楼 干
苑 寇 盖 南 屠 鞠 荣 井 乐 银
奚 明 麻 雍 花 闻 冼 木 郜 廉
衣 蔺 和 冀 占 公 门 帅 利 满
陈生
"""
    list4=[]
    # Split on single spaces only; empty pieces and pure-digit pieces are skipped.
    for line in html.split(" "):
        if line.strip()<>"" and line.strip().isdigit()==False:
            list4.append("'" + line.strip()+"'")
    print list4.__len__()
    print "[" + ",".join(list4) + "]"
def is_chinese_or_space(str):
    """Return True when every character of ``str`` is Chinese or a space.

    Each character is classified by ``_is_chinese_or_space``. An empty
    input contains no offending character and therefore yields True.

    Byte strings are decoded from UTF-8 first (undecodable bytes are
    dropped), so the check runs over characters rather than over the
    individual bytes of a multi-byte sequence.

    >>> is_chinese_or_space(u"中国 人")
    True
    >>> is_chinese_or_space(u"中国 人1")
    False
    >>> is_chinese_or_space(u"华为huawei")
    False
    >>> is_chinese_or_space(u"游泳裤xxxl")
    False
    """
    # The parameter name shadows the builtin ``str``; it is kept so existing
    # callers are unaffected, which is why the type is spelled ``bytes`` here
    # (``bytes`` is the byte-string type on Python 2.6+).
    if isinstance(str, bytes):
        # Previously this called ``encode``, which left a byte string as
        # bytes and made the loop below inspect single bytes.
        str = str.decode("utf-8", "ignore")
    return all(_is_chinese_or_space(char) for char in str)
def is_english_or_space(str):
    """Return True when every character of ``str`` is a letter or a space.

    Each character is classified by ``_is_english_or_space``. An empty
    input contains no offending character and therefore yields True.

    Byte strings are decoded from UTF-8 first (undecodable bytes are
    dropped), so the check runs over characters rather than over the
    individual bytes of a multi-byte sequence.

    >>> is_english_or_space(u"abc def1")
    False
    >>> is_english_or_space(u"abc def")
    True
    >>> is_english_or_space(u"游泳裤xxxl")
    False
    >>> is_english_or_space(u"茶具")
    False
    """
    # The parameter name shadows the builtin ``str``; it is kept so existing
    # callers are unaffected, which is why the type is spelled ``bytes`` here
    # (``bytes`` is the byte-string type on Python 2.6+).
    if isinstance(str, bytes):
        # Previously this called ``encode``, which left a byte string as
        # bytes and made the loop below inspect single bytes.
        str = str.decode("utf-8", "ignore")
    return all(_is_english_or_space(char) for char in str)
def _is_chinese_or_space(uchar):
    """Return True if the single character ``uchar`` is Chinese or a space.

    "Chinese" means whatever the ``is_chinese`` helper (defined outside
    this block) accepts; only the plain space ``u" "`` counts as a space.

    The examples call this helper directly; they used to call the public
    ``is_chinese_or_space`` wrapper, which left this function untested.

    >>> _is_chinese_or_space(u"人")
    True
    >>> _is_chinese_or_space(u"1")
    False
    >>> _is_chinese_or_space(u" ")
    True
    """
    # bool() keeps the strict True/False result the callers rely on,
    # whatever type is_chinese itself returns.
    return bool(is_chinese(uchar) or uchar == u" ")
def _is_english_or_space(uchar):
    """Classify one character as "letter or space" (True) or not (False).

    A character accepted by ``is_chinese`` is rejected outright. Anything
    else passes when ``uchar.isalpha()`` holds or when it is the plain
    space ``u" "``.

    NOTE(review): ``isalpha()`` on a unicode character is not limited to
    A-Z/a-z -- confirm whether letters of other alphabets should pass.

    >>> _is_english_or_space(u"1")
    False
    >>> _is_english_or_space(u"a")
    True
    >>> _is_english_or_space(u" ")
    True
    >>> _is_english_or_space(u"中")
    False
    """
    # The Chinese test is evaluated first and short-circuits the rest.
    letter_or_blank = (not is_chinese(uchar)) and (
        uchar.isalpha() or uchar == u" ")
    return bool(letter_or_blank)
def find_one_side_chinese_and_another_side_is_english():
    """Copy the Chinese/English pairs of ./short_name_global.txt to a new file.

    Every input line is expected to have the form ``left|||right``. A line
    is written unchanged to ./short_name_global_chinese_english.txt when
    ``is_chinese_or_space`` accepts one side and ``is_english_or_space``
    accepts the other, in either order. Both sides are stripped of
    surrounding whitespace before being classified.

    Blank lines are skipped. Any other line that does not split into
    exactly two fields raises ValueError.

    Both files are opened as UTF-8 and are closed even if processing fails.
    """
    # The output is opened (and truncated) before the input is opened.
    with codecs.open("./short_name_global_chinese_english.txt", "w", "utf-8") as target:
        with codecs.open("./short_name_global.txt", "r", "utf-8") as source:
            for line in source.readlines():
                record = line.strip()
                if not record:
                    # Nothing to classify; unpacking below would fail on it.
                    continue
                left, right = [part.strip() for part in record.split("|||")]
                if ((is_chinese_or_space(left) and is_english_or_space(right)) or
                        (is_chinese_or_space(right) and is_english_or_space(left))):
                    # Keep the original line, including its line ending.
                    target.write(line)
if __name__ == "__main__":
    # Run the docstring examples of this module before the actual job.
    doctest.testmod()

    # One-off jobs that are currently disabled; uncomment the one to run.
    #   read_relevent_words()
    #   parser_one_line_one_words2()
    #   compare_pareser_one_line_one_words_result_lost_line_for_tmp()
    #   build_invert_index()
    #   build_word_segments_hash_map()
    #   final_find_synomns_out()
    #   interactive_mode()
    #   print _filter("龟 鹿 补 肾丸 4.5 g*12 袋 水 蜜丸 / 盒 [ 补 肾 失眠 体弱 疲乏 壮 阳 ]")
    #   print _filter("龟 牌 ( turtle ) 硬壳 防水 全效 蜡 g-223r")
    #   post_process_wname_segments_illegal_characters()
    #   filter_synonym_result()
    #   test_redis_is_ready()
    #   pivot_query_relvent_word_order_and_intersation_size()
    #   find_short_name()
    #   find_short_name2()
    #   test_sorted()
    #   find_only_one_word_difference()
    #   extrace_names()
    #   find_human_names()

    # The job that is currently active.
    find_one_side_chinese_and_another_side_is_english()

    # Disabled ad-hoc check:
    #   print is_english_or_space(u"茶具")