export.py

#!/usr/bin/env python
#encoding=utf-8
import redis,codecs,sys,time,datetime,doctest,re
# Python 2 only: site.py removes sys.setdefaultencoding at start-up, so the
# module is reloaded to get it back. UTF-8 then becomes the codec used for
# implicit str <-> unicode conversions throughout this script.
reload(sys)
sys.setdefaultencoding('utf8')
class Unbuffered:
    """Stream wrapper that flushes the underlying stream after every write."""

    def __init__(self, stream):
        # The wrapped stream; attributes not defined here are looked up on it.
        self.stream = stream

    def write(self, data):
        """Write *data* to the wrapped stream and flush immediately."""
        target = self.stream
        target.write(data)
        target.flush()

    def __getattr__(self, attr):
        """Forward any attribute the wrapper lacks to the wrapped stream."""
        return getattr(self.stream, attr)

# Replace stdout with the flushing wrapper so every print is flushed right away.
sys.stdout = Unbuffered(sys.stdout)

def read_keys():
    """Dump every key of Redis db 6 on localhost to ``query_keys.txt``.

    Prints the number of keys, then echoes each key to stdout while writing
    it to the output file, one key per line (UTF-8).
    """
    # The client must be created before it is queried; the original called
    # r.keys() first and therefore always failed with UnboundLocalError.
    r=redis.Redis(host='localhost',db=6)
    keys=r.keys()
    print(len(keys))
    # The with-block closes the file even if a write fails part-way.
    with codecs.open("query_keys.txt","w","utf-8") as f:
        for key in keys:
            print(key)
            f.write("%s\n"%(key,))

def read_relevent_words():
    """Dump the value of every key in Redis db 6 to ``query_relevent_words.txt``.

    Prints the number of keys, then writes ``r.get(key)`` for each key, one
    value per line (UTF-8), in the order returned by ``r.keys()``.
    """
    # The client must be created before it is queried; the original called
    # r.keys() first and therefore always failed with UnboundLocalError.
    r=redis.Redis(host='localhost',db=6)
    keys=r.keys()
    print(len(keys))
    # The with-block closes the file even if a lookup or write fails.
    with codecs.open("query_relevent_words.txt","w","utf-8") as f:
        for key in keys:
            f.write("%s\n"%(r.get(key),))

def parser_one_line_one_words():
    """Split ``query_relevent_words.txt`` on ``*`` into one word per line.

    Each input line is stripped, split on ``*`` and every piece is written on
    its own line to ``parser_one_line_one_words.txt`` (UTF-8).
    """
    # Both handles are closed by the with-blocks; the original never closed
    # the input file. The output is still opened first, as before.
    with codecs.open("parser_one_line_one_words.txt","w","utf-8") as out:
        with codecs.open("query_relevent_words.txt","r","utf-8") as src:
            for line in src.readlines():
                for elem in line.strip().split("*"):
                    out.write("%s\n"%(elem,))


def parser_one_line_one_words2():
    """Like parser_one_line_one_words, but also print the distinct word count.

    Every ``*``-separated piece of each stripped input line is written on its
    own line to ``parser_one_line_one_words.txt``. The stripped pieces are
    collected in a set whose size is printed at the end.
    """
    s=set()
    # Both handles are closed by the with-blocks; the original never closed
    # the input file.
    with codecs.open("parser_one_line_one_words.txt","w","utf-8") as out:
        with codecs.open("query_relevent_words.txt","r","utf-8") as src:
            for line in src.readlines():
                for elem in line.strip().split("*"):
                    # The set holds the stripped form; the file gets the piece as-is.
                    s.add(elem.strip())
                    out.write("%s\n"%(elem,))
    print(len(s))

def compare_pareser_one_line_one_words_result_lost_line_for_tmp():
    f1=codecs.open("parser_one_line_one_words_uniq.txt","r","utf-8")
    f2=codecs.open("parser_one_line_one_words_uniq_result.txt","r","utf-8")
    count=0
    for a,b in zip(f1.readlines(),f2.readlines()):
        count+=1
        if a.strip()<>b.replace(" ","").strip():
            print count,a,b
            time.sleep(5)   

def build_invert_index():
    """Build an inverted index over the segmented wname lines in Redis db 1.

    For every space-separated token of ``../output_result_process`` the
    1-based line number is added to the Redis set named after the token.
    Commands are queued on a pipeline and executed every 10000 lines, with
    the elapsed whole seconds of each batch printed.
    """
    r=redis.Redis(db=1)
    p=r.pipeline()
    count=0
    # The with-block closes the input file; the original never closed it.
    with codecs.open("../output_result_process","r","utf-8") as src:
        for line in src.readlines():
            count+=1
            for elem in line.strip().split(" "):
                p.sadd(elem.strip(),count)
            if count%10000==0:
                print(count)
                print("batch insert to redis ...")
                s=datetime.datetime.now()
                p.execute()
                e=datetime.datetime.now()
                print("done:%s"%((e-s).seconds))
    # Flush whatever is left from the last partial batch.
    p.execute()

def is_chinese(uchar):
    """
    Return True when *uchar* contains at least one non-ASCII character.

    The check compares the length of the value with the length of its UTF-8
    counterpart: byte strings are decoded, text strings are encoded. Any
    difference means a multi-byte character is present. Despite the name the
    test is not restricted to Chinese characters.

    >>> is_chinese(u"人")
    True

    >>> is_chinese("人")
    True

    >>> is_chinese("1")
    False

    >>> is_chinese(" ")
    False
    """
    # isinstance replaces the original type(x)==type("") comparison; ``bytes``
    # is the byte-string type (an alias of ``str`` on Python 2).
    if isinstance(uchar,bytes):
        converted=uchar.decode("utf-8","ignore")
    else:
        converted=uchar.encode("utf-8","ignore")
    return len(converted)!=len(uchar)

def is_number(uchar):
    """Return True when *uchar* compares between u'0' and u'9' inclusive."""
    return u'\u0030' <= uchar <= u'\u0039'
   
def is_alphabet(uchar):
    """
    Return True when *uchar* compares within u'A'..u'Z' or u'a'..u'z'.

    Disabled examples kept from the original:

        #>>> is_alphabet(u"t")
        #True

        #>>> is_alphabet("t")
        #True
    """
    if u'\u0041' <= uchar <= u'\u005a':
        return True
    return u'\u0061' <= uchar <= u'\u007a'

def is_other(uchar):
    """Return True when none of is_chinese, is_number or is_alphabet accepts *uchar*."""
    # Predicates are tried in the same order as before and stop at the first hit.
    for predicate in (is_chinese,is_number,is_alphabet):
        if predicate(uchar):
            return False
    return True

def _filter(line):
    """
    Drop the tokens of a segmented wname line that is_other() rejects.

    The stripped line is split on single spaces. Each token is stripped and,
    if it is not already unicode, decoded as UTF-8 (undecodable bytes are
    ignored). Tokens for which is_other() is false are kept and re-joined
    with single spaces.
    """
    kept=[]
    for elem in line.strip().split(" "):
        element=elem.strip()
        # ``!=`` replaces the deprecated ``<>`` operator.
        if type(element)!=type(u""):
            element=element.decode("utf-8","ignore")
        if not is_other(element):
            kept.append(element)
    return " ".join(kept)

def post_process_wname_segments_illegal_characters():
    """Filter every line of ``../output_result`` into ``../output_result_process``.

    Each line is passed through _filter(); the result is printed and written
    to the output file followed by a newline.
    """
    # The output is opened first, as before. Both handles are closed by the
    # with-blocks; the original leaked the input handle.
    with codecs.open("../output_result_process","w","utf-8") as out:
        with codecs.open("../output_result","r","utf-8") as src:
            for line in src.readlines():
                # Filter once and reuse the result; the original ran _filter twice.
                s=_filter(line)
                print(s)
                out.write(s+"\n")

def build_word_segments_hash_map():
    """
    Store a word -> segmentation-result mapping in Redis db 2.

    Two file pairs are loaded, each pairing line *i* of a word file with
    line *i* of its segmentation result file:

    * ``parser_one_line_one_words_uniq.txt`` / ``parser_one_line_one_words_uniq_result_pku.txt``
    * ``query_keys.txt`` / ``query_keys_result_pku.txt``

    SET commands are queued on one pipeline and executed every 10000 pairs.
    After both pairs are loaded a background save is requested.
    """
    r2=redis.Redis(db=2)
    p=r2.pipeline()

    def load_pairs(words_path,segments_path):
        # Load one (word file, result file) pair; this was duplicated inline before.
        # Reading both files up front lets the handles be closed immediately.
        with codecs.open(words_path,"r","utf-8") as f1:
            with codecs.open(segments_path,"r","utf-8") as f2:
                pairs=zip(f1.readlines(),f2.readlines())
        count=0
        for a,b in pairs:
            count+=1
            p.set(a.strip(),b.strip())
            if count%10000==0:
                print(count)
                print("batch insert to redis ...")
                s=datetime.datetime.now()
                p.execute()
                e=datetime.datetime.now()
                print("done:%s"%((e-s).seconds))
        # Flush the last partial batch of this pair.
        p.execute()

    load_pairs("parser_one_line_one_words_uniq.txt","parser_one_line_one_words_uniq_result_pku.txt")
    load_pairs("query_keys.txt","query_keys_result_pku.txt")
    r2.bgsave()

def _build_list_for_inter_args(s1,s2):
    """
    将分词后的字符串组合成一个list形式反加给r.sinter使用,去除无用的东西
    """
    r=[]
    r.extend(s1.split(" "))
    r.extend(s2.split(" "))
    return [elem.strip() for elem in r if elem.strip()<>""]

def final_find_synomns_out():
    """
    Write query/relevant-word pairs whose segments co-occur to ``synomns_pku.txt``.

    Line *i* of ``query_keys.txt`` is paired with line *i* of
    ``query_relevent_words.txt``, whose words are separated by ``*``. The
    segmentations of the query and of each word are looked up in Redis db 2.
    The Redis db 1 sets named after all those segments are intersected, and
    if the intersection is non-empty ``query|||word`` is written out.
    """
    r1=redis.Redis(db=1)
    r2=redis.Redis(db=2)
    validateCount=0
    with codecs.open("synomns_pku.txt","w","utf-8") as f:
        # Read both inputs up front so the handles are closed right away;
        # the original never closed them.
        with codecs.open("query_keys.txt","r","utf-8") as f1:
            with codecs.open("query_relevent_words.txt","r","utf-8") as f2:
                pairs=zip(f1.readlines(),f2.readlines())
        for a,b in pairs:
            query=a.strip()
            query_segments=r2.get(query)
            for elem in b.split("*"):
                word=elem.strip()
                if word=="":
                    continue
                # NOTE(review): if either lookup yields None the helper below
                # raises AttributeError, exactly as before - confirm whether
                # missing entries should be skipped instead.
                common=r1.sinter(*_build_list_for_inter_args(query_segments,r2.get(word)))
                if len(common)>0:
                    validateCount+=1
                    if validateCount%1000==0:
                        print("validateCount:%s\n"%validateCount)
                    f.write("%s|||%s\n"%(query,word))
                    f.flush()

def interactive_mode():
    while(True):
        r1=redis.Redis(db=1)
        r2=redis.Redis(db=2)
        input=raw_input("input query|||relevent_word:\n")
        a,b=input.strip().split("|||")
        query_segments=r2.get(a.strip())
        print a.strip(),"==>",query_segments
        print b.strip(),"==>",r2.get(b.strip())
        print r1.sinter(*_build_list_for_inter_args(query_segments,r2.get(b.strip())))
        print "========="

def c1(line):
    """
    Return True when either side of ``a|||b`` contains a space.

    >>> c1("执手|||把手")
    False
    """
    left,right=line.strip().split("|||")
    # Splitting on " " yields more than one piece exactly when a space is present.
    return (" " in left) or (" " in right)

# Module-level client for Redis db 2; c4() below reads its lookups from it.
r2=redis.Redis(db=2)
def c4(s1,s2):
    """
    Return True when the segment set of one word is contained in the other's.

    Equal inputs are True straight away. Otherwise each word is replaced by
    the value stored for it in Redis db 2 (module-level ``r2``) when there is
    one, and turned into a set of its space-separated tokens. The result is
    True when the two sets share at least one token and the intersection is
    as large as the smaller set.

    #>>> c4("尤利西斯","追忆逝水年华")
    False

    #>>> c4("A B C","A B")
    True

    >>> c4("无线鼠套装","无线键鼠套装")
    False

    #>>> c4("A B","A C")
    False

    #>>> c4("A B","A C")
    False

    #>>> c4("A","A")
    True

    >>> c4("行政职业能力测验真题","行测真题")
    False

    #>>> c4("B","C")
    False
    """
    if s1==s2:
        return True

    def segment_set(word):
        # One GET replaces the original EXISTS + GET pair.
        stored=r2.get(word)
        if stored is not None:
            word=stored.strip()
        if word.find(" ")>-1:
            return set([tok.strip() for tok in word.split(" ") if tok.strip()!=""])
        return set([word.strip()])

    set1=segment_set(s1)
    set2=segment_set(s2)
    inster=set1 & set2
    if len(inster)==0:
        return False
    return len(inster)>=min(len(set1),len(set2))
   
   
def c3(line):
    """
    Return True when a multi-word side of ``a|||b`` has a non-letter token.

    A side is multi-word when splitting it on single spaces gives more than
    one token. A token is "plain" when, once stripped, it consists only of
    ASCII letters. If both sides are multi-word, both must contain a
    non-plain token. If exactly one side is multi-word, only that side is
    checked. If neither is, the result is False.

    >>> c3("执手|||把手")
    False

    >>> c3("the north face|||tnf")
    False
   
    >>> c3("the 大north face|||tnf")
    True
   
    >>> c3("wd1tb|||i5 2320")
    True
    """
    def is_en_or_num(s):
        # Despite the name, only ASCII letters match; digits do not.
        return re.match(r"[a-zA-Z]{1,}\Z",s.strip()) is not None

    def has_non_plain_token(tokens):
        # Equivalent to the original reduce() over consecutive pairs: with two
        # or more tokens every token takes part in some pair, so the pairwise
        # check fails exactly when any single token is non-plain.
        if len(tokens)<2:
            return False
        return not all(is_en_or_num(tok) for tok in tokens)

    a,b=line.strip().split("|||")
    a_tokens=a.split(" ")
    b_tokens=b.split(" ")
    a_multi=len(a_tokens)>1
    b_multi=len(b_tokens)>1
    if a_multi and b_multi:
        return has_non_plain_token(a_tokens) and has_non_plain_token(b_tokens)
    if b_multi:
        return has_non_plain_token(b_tokens)
    if a_multi:
        return has_non_plain_token(a_tokens)
    return False

       
def c2(line):
    """
    Return True when one side of ``a|||b`` is a substring of the other.

    >>> c2("执手|||把手")
    False
   
    >>> c2("浓缩咖啡|||咖啡")
    True
    """
    first,second=line.strip().split("|||")
    if first in second:
        return True
    return second in first

def filter_synonym_result():
    """
    Filter ``synomns_pku.txt`` into ``synomns_pku_filter.txt``.

    The input holds the query / relevant-word pairs whose pku segmentations
    intersect. A line is kept only when both c1(line) and c2(line) are false:
    neither side contains a space, and neither side is a substring of the
    other.
    """
    # The output is opened first, as before. Both handles are closed by the
    # with-blocks; the original leaked the input handle.
    with codecs.open("synomns_pku_filter.txt","w","utf-8") as out:
        with codecs.open("synomns_pku.txt","r","utf-8") as src:
            for line in src.readlines():
                if not c1(line) and not c2(line):
                    out.write(line)
   
def test_redis_is_ready():
    """
    Check that Redis is up by printing the INFO of the default server.
    """
    client=redis.Redis()
    server_info=client.info()
    print(server_info)
   
def pivot_query_relvent_word_order_and_intersation_size():
    """
    Pivot the filtered synonym pairs into one Redis hash per query (db 3).

    Layout built by the nested steps:
    key   = query
    value = hash whose field is the relevent word and whose value is a list.
            step2 stores [relevent word order] and step3 appends the
            intersection size, so the final list is
            [relevent word order, intersation_size].

    Only step4() is invoked at present; the calls to step1, step2, step3 and
    the sanity check are commented out.
    """   
    debug=False
    r1=redis.Redis(db=1)
    r2=redis.Redis(db=2)
    r3=redis.Redis(db=3)
    #r3.flushdb()
    p=r3.pipeline()
    def step1():
        """
        Initialise the hashes from synomns_pku_filter.txt: one field per
        (query, relevent word) pair, set to an empty list.
        """
        count=0
        for line in codecs.open("synomns_pku_filter.txt","r","utf-8").readlines():
            count+=1
            a,b=line.split("|||")
            a=a.strip()
            b=b.strip()
            #print type(a),type(b)
            #print a,b
            p.hset(a,b,[])
            # Commit the queued commands every 10000 lines.
            if count%10000==0:
                p.execute()
                print "执行一次批量提交redis操作"
            # In debug mode stop after the first line.
            if count==1 and debug==True:
                break
        p.execute()   
    #step1()

    def step2():
        """
        Write the 1-based position of each relevent word (within its query's
        '*'-separated list) into the hash field created by step1.
        """
        count=0
        exists_count=0
        not_exists_count=0
        f1=codecs.open("query_keys.txt","r","utf-8")
        f2=codecs.open("query_relevent_words.txt","r","utf-8")
        for a,b in zip(f1.readlines(),f2.readlines()):
            count+=1
            a=a.strip()
            b=b.strip()
            for idx,elem in enumerate(b.split("*")):
                element=elem.strip()
                if element=="":
                    continue
                #print type(a),type(element)
                #print a,b,element
                # Only pairs that already have a field (i.e. survived the
                # filtering) get their order recorded.
                if r3.hexists(a,element):
                    exists_count+=1
                    r3.hset(a,element,[idx+1])
                else:
                    not_exists_count+=1
                    #print "%s,%s not exists in redis"%(a,element)
                if count%10000==0:
                    print "exists_count:%s"%exists_count
                    print "not_exists_count:%s"%not_exists_count
                   
            # In debug mode stop after the first query.
            if count==1 and debug==True:
                break       
        print "exists_count:%s"%exists_count
        print "not_exists_count:%s"%not_exists_count
        print "step2 finished"

    #step2()

    def test_step1_and_step2_is_ok():
        """
        Spot-check one known pair: its stored value must evaluate to a list.
        """
        result=r3.hget("透明茶杯","茶具")
        # NOTE(review): eval() runs whatever text is stored in Redis; only
        # safe while db 3 is written exclusively by this script.
        if type([])==type(eval(result)):
            print "正确"
        else:
            print "不正确"
    #test_step1_and_step2_is_ok()

    def step3():
        """
        Re-run the intersection for every filtered pair and append the
        intersection size to the list stored in the pair's hash field.
        """
        count=0
        validateCount=0
        for line in codecs.open("synomns_pku_filter.txt","r","utf-8").readlines():
            a,b=line.strip().split("|||")
            a=a.strip()
            b=b.strip()
            count+=1
            #print count
            query_segments=r2.get(a)
            intersation_len=r1.sinter(*_build_list_for_inter_args(query_segments,r2.get(b))).__len__()
            if intersation_len>0:
                    # NOTE(review): eval() on data read back from Redis - see
                    # the caveat in test_step1_and_step2_is_ok.
                    list_result=eval(r3.hget(a,b))
                    # The stored list must hold exactly one element (the order
                    # written by step2); anything else aborts the run.
                    if len(list_result)<>1:
                        print a,b
                        print type(a),type(b)
                        print "ERROR"
                        exit(-1)
                    #print type(list_result)==type([])
                    list_result.append(intersation_len)
                    r3.hset(a,b,list_result)
                    validateCount+=1
                    if validateCount%1000==0:
                        print "validateCount:%s\n"%validateCount
        print "final validateCount %s"%validateCount

    #step3()
   
    def step4():
        """
        Dump the hashes stored in Redis db 3 to synomns_pku_filter_process.txt,
        one line per query: query|||word,[list]|||word,[list]...
        """
        # Comparator for (word, list) items: descending by list[1], ties
        # broken descending by list[0]. Shadows the builtin cmp locally.
        def cmp(x,y):
            if x[1][1]<y[1][1]:
                return 1
            elif x[1][1]>y[1][1]:
                return -1
            else:
                if x[1][0]<y[1][0]:
                    return 1
                elif x[1][0]>y[1][0]:
                    return -1
                else:
                    return 0
        f=codecs.open("synomns_pku_filter_process.txt","w","utf-8")
        #[('b', [2, 4]), ('a', [1, 3])]
        count=0
        for key in r3.keys():
            count+=1
            print count
            f.write("%s"%key)
            z=r3.hgetall(key)
            # Turn each stored value back into a Python list in place.
            # NOTE(review): eval() on data read back from Redis.
            for k,v in z.iteritems():
                z[k]=eval(v)
            for elem in sorted(z.items(),cmp):
                word,orders=elem
                f.write("|||%s,%s"%(word,str(orders)))
            f.write("\n")
            f.flush()
        f.close()
       
    step4()

def _find_short_name(s1,s2):
    """
    >>> _find_short_name("行测","行政能力测试")
    True

    >>> _find_short_name("AB","ABC")
    False

    >>> _find_short_name("A","D")
    False
    """
    if len(s1)>=len(s2):
        return False
    if s1 in s2:
        return False
    return set(s1).issubset(set(s2))   

def find_short_name():
    """
    Print the abbreviation candidates found in ``synomns_pku_filter.txt``.

    A ``a|||b`` line is printed when _find_short_name() accepts the pair in
    either direction.
    """
    # The with-block closes the input file; the original never closed it.
    with codecs.open("synomns_pku_filter.txt","r","utf-8") as src:
        for line in src.readlines():
            a,b=line.strip().split("|||")
            a=a.strip()
            b=b.strip()
            if _find_short_name(a,b) or _find_short_name(b,a):
                print("%s|||%s"%(a,b))

def find_short_name2():
    """
    Collect candidate pairs from the raw query / relevent-word files.

    Line *i* of ``query_keys.txt`` is paired with line *i* of
    ``query_relevent_words.txt``, whose words are separated by ``*``. A
    ``query|||word`` pair is printed and written to ``short_name_global.txt``
    when c3, c2 and c4 are all false for it. Progress is printed every 10000
    queries and the totals at the end.
    """
    count=0
    validateCount=0
    with codecs.open("short_name_global.txt","w","utf-8") as f:
        # Read both inputs up front so the handles are closed right away;
        # the original never closed them.
        with codecs.open("query_keys.txt","r","utf-8") as f1:
            with codecs.open("query_relevent_words.txt","r","utf-8") as f2:
                pairs=zip(f1.readlines(),f2.readlines())
        for a,b in pairs:
            count+=1
            a=a.strip()
            b=b.strip()
            for elem in b.split("*"):
                element=elem.strip()
                if element=="":
                    continue
                line="%s|||%s\n"%(a,element)
                if not c3(line) and not c2(line) and not c4(a,element):
                    validateCount+=1
                    print("validateCount:%s"%validateCount)
                    print(line)
                    f.write(line)
                    f.flush()

            if count%10000==0:
                print("cout===========>%s"%count)
    print("validateCount:%s"%validateCount)
    print("cout===========>%s"%count)

def test_sorted():
    """Print a sample list of (name, [x, y]) pairs sorted by y, then x, both descending."""
    a=[('a',[1,2]),("b",[0,2]),("c",[-1,3])]
    # A key function with reverse=True gives the same order as the original
    # comparison function, without shadowing the builtin ``cmp``. The sort is
    # stable, so items with equal keys keep their input order, as before.
    print(sorted(a,key=lambda item:(item[1][1],item[1][0]),reverse=True))

def _find_only_one_word_difference(line):
    """
    >>> _find_only_one_word_difference("毛领毛衣|||毛领衣服")
    True
    """
    return True

def find_only_one_word_difference():
    """
    Keep the pairs of ``./short_name_global.txt`` that differ in one character.

    A ``a|||b`` line is written to ``./short_name_global_filter.txt`` when:

    * both sides have the same length and are not equal,
    * their character sets share exactly ``len(a)-1`` characters, and that
      number is positive,
    * at least one side has a character missing from the other side that is
      not a digit.
    """
    # The output is opened first, as before. Both handles are closed by the
    # with-blocks; the original leaked the input handle.
    with codecs.open("./short_name_global_filter.txt","w","utf-8") as out:
        with codecs.open("./short_name_global.txt","r","utf-8") as src:
            for line in src.readlines():
                a,b=line.strip().split("|||")
                if len(a)!=len(b) or a==b:
                    continue
                set1=set(a)
                set2=set(b)
                m=len(a)-1
                if m<=0 or len(set1&set2)!=m:
                    continue
                only_a=set1-set2
                only_b=set2-set1
                # str() is kept so the digit test behaves as in the original.
                if (len(only_a)>0 and not str(list(only_a)[0]).isdigit()) or \
                   (len(only_b)>0 and not str(list(only_b)[0]).isdigit()):
                    out.write(line)
           
def find_human_names():
    """
    Pick likely person-name pairs out of ``./short_name_global.txt``.

    Example of the kind of pair being looked for: 苏轼 / 苏东坡.

    A ``a|||b`` line is written to ``./short_name_global_xin.txt`` when both
    sides start with the same known surname (compared on the first two
    characters, then on the first character) and both sides are shorter than
    five characters. The number of distinct surname entries is printed first.
    """
    xins=['白','毕','卞','蔡','曹','岑','常','车','陈','成','程','池','邓','丁','范','方','樊','费','冯','符','傅','甘','高','葛','龚','古','关','郭','韩','何','贺','洪','侯','胡','华','黄','霍','姬','简','江','姜','蒋','金','康','柯','孔','赖','郎','乐','雷','黎','李','连','','梁','廖','林','凌','刘','柳','龙','卢','鲁','陆','路','吕','罗','骆','马','梅','孟','莫','母','穆','倪','宁','欧','区','潘','彭','','皮','齐','戚','钱','强','秦','丘','邱','饶','任','沈','盛','施','石','时','史','司徒','苏','孙','谭','汤','唐','陶','田','童','涂','王','危','韦','卫','魏','温','文','翁','巫','邬','吴','伍','武','席','夏','萧','谢','辛','邢','徐','许','薛','严','颜','杨','叶','易','殷','尤','于','余','俞','虞','元','袁','岳','云','曾','詹','张','章','赵','郑','钟','周','邹','朱','褚','庄','卓']
    xins+=['李','王','张','刘','陈','黄','周','吴','徐','孙','胡','朱','高','林','何','郭','马','罗','梁','宋','郑','谢','韩','唐','冯','于','董','萧','程','曹','袁','邓','许','傅','沈','曾','彭','吕','苏','卢','蒋','蔡','贾','丁','魏','薛','叶','阎','余','潘','杜','戴','夏','','汪','田','任','姜','范','方','石','姚','谭','廖','邹','熊','金','陆','郝','孔','白','崔','康','毛','邱','秦','江','史','顾','侯','','孟','龙','万','段','章','钱','汤','尹','黎','易','常','武','乔','贺','赖','龚','文']
    xins+=['鲍俎','百里','碧鲁','伯赏','北堂','陈林','淳于','第五','东方','东郭','东门','段干','独孤','端木','范姜','哥舒','公良','公孙','公西','公冶','公羊','缑亢','谷梁','归海','赫连','胡母','呼延','黄方','皇甫','即墨','夹谷','晋楚','况后','梁丘','令狐','陆费','闾丘','闾邱','明哲','墨哈','慕容','万俟','南宫','南郭','南门','年爱','欧阳','濮阳','漆雕','亓官','屈突','壤驷','汝鄢','司马','司空','司寇','司徒','官','商牟','申屠','侍其','疏束','叔孙','太史','太叔','澹台','涂钦','拓拔','完完','完颜','王子','闻人','微生','巫马','乌雅','铁笔','夏','许世','轩辕','闫法','羊舌','阳佟','耶律','有琴','尉迟','余佴','宇文','岳帅','乐正','宰父','子车','子阳','宗政','左丘','张简','章佳','长孙','郑余','仲孙','钟离','诸葛','颛孙']
    xins+=['付']
    xins+=['李','王','张','刘','陈','杨','黄','孙','周','吴','徐','赵','朱','马','胡','郭','林','何','高','梁','郑','罗','宋','谢','唐','韩','曹','许','邓','萧','冯','曾','程','蔡','彭','潘','袁','于','董','余','苏','叶','吕','魏','蒋','田','杜','丁','沈','姜','范','江','傅','','卢','汪','戴','崔','任','陆','廖','姚','方','金','邱','夏','谭','韦','贾','邹','石','熊','孟','秦','阎','薛','侯','雷','白','龙','','郝','孔','邵','史','毛','常','万','顾','赖','武','康','贺','严','尹','钱','施','牛','洪','龚','汤','陶','黎','温','莫','易','樊','','文','安','殷','颜','庄','章','鲁','倪','庞','邢','俞','翟','蓝','聂','齐','向','申','葛','柴','伍','覃','骆','关','焦','柳','欧','','纪','尚','毕','耿','芦','左','季','管','符','辛','苗','詹','曲','欧阳','靳','祁','路','涂','兰','甘','裴','梅','童','翁','霍','游','阮','尤','岳','柯','牟','滕','谷','舒','卜','成','饶','宁','凌','盛','查','单','冉','鲍','华','包','屈','房','喻','解','蒲','卫','简','时','连','车','项','闵','邬','吉','党','阳','司','费','蒙','席','晏','隋','古','强','穆','姬','宫','景','米','麦','谈','柏','瞿','艾','沙','鄢','桂','窦','郁','缪','畅','巩','卓','褚','栾','戚','全','娄','甄','郎','池','丛','边','岑','农','苟','迟','保','商','臧','','卞','虞','刁','冷','应','匡','栗','仇','练','楚','揭','师','官','佟','封','燕','桑','巫','敖','原','植','邝','仲','荆','储','宗','','干','苑','寇','盖','南','屠','鞠','荣','井','乐','银','奚','明','麻','雍','花','闻','冼','木','郜','廉','衣','蔺','和','冀','占','','门','帅','利','满','陈生']
    # Deduplicate, report the count (the empty entry is still included here),
    # then drop empty entries.
    xins=set(xins)
    print(len(xins))
    # Kept as a list on purpose: in Python 2 these literals are byte strings
    # while the file contents are unicode, and the two only match through an
    # equality comparison, not through hash-based set lookup.
    xins=[elem.strip() for elem in xins if elem.strip()!=""]
    # The output is opened first, as before. Both handles are closed by the
    # with-blocks; the original leaked the input handle.
    with codecs.open("./short_name_global_xin.txt","w","utf-8") as out:
        with codecs.open("./short_name_global.txt","r","utf-8") as src:
            for line in src.readlines():
                a,b=line.strip().split("|||")
                a=a.strip()
                b=b.strip()
                same_double_surname=a[:2]==b[:2] and a[:2] in xins
                same_single_surname=a[:1]==b[:1] and a[:1] in xins
                # Bug fix: ``and`` binds tighter than ``or``, so the original
                # applied the length limit to the single-character branch
                # only. The parentheses make it apply to both branches.
                if (same_double_surname or same_single_surname) and len(a)<5 and len(b)<5:
                    out.write(line)

def extrace_names():
    """Print Python-style list literals of surnames taken from pasted samples.

    Four text samples are embedded in the function and handled in turn:

    1. an HTML table fragment: the text between each ``target=_blank>`` and
       the following ``</TD>`` is collected and any ``</a>`` is removed;
    2. a numbered ranking, one ``<number><surname>`` entry per line: the
       last character of every line is kept;
    3. compound surnames followed by a romanisation in parentheses: the
       first two characters of every entry are kept (each one is also
       printed on its own line);
    4. a whitespace-separated grid of surnames: every token that is not
       purely numeric is kept, and the number of tokens is printed first.

    Nothing is returned; all results go to standard output.
    """
    subject="""
    <TD width=90><a href=/zaobao/chinese/surname/pages/story_bai2.html target=_blank>白</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_bi4.html target=_blank>毕</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/bian040600.html target=_blank>卞</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_cai4.html target=_blank>蔡</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_cao2.html target=_blank>曹</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_cen2.html target=_blank>岑</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/chang040600.html target=_blank>常</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_che.html target=_blank>车</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_chen2.html target=_blank>陈</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/cheng030100.html target=_blank>成</a></TD>
</TR>

<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_cheng2.html target=_blank>程</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_chi2.html target=_blank>池</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_deng4.html target=_blank></a>邓</TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ding.html target=_blank>丁</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_fan4.html target=_blank>范</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_fang.html target=_blank></a>方</TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/fan140600.html target=_blank>樊</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/fei140600.html target=_blank>费</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_feng2.html target=_blank>冯</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_fu2.html target=_blank>符</a></TD>
</TR>

<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_fu4.html target=_blank>傅</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_gan.html target=_blank>甘</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_gao.html target=_blank>高</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/ge170100.html target=_blank>葛</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_gong.html target=_blank>龚</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_gu3.html target=_blank>古</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_guan.html target=_blank>关</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_guo.html target=_blank>郭</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_han2.html target=_blank>韩</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_he2.html target=_blank>何</a></TD>
</TR>

<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/he140600.html target=_blank>贺</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_hong2.html target=_blank>洪</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_hou2.html target=_blank>侯</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_hu2.html target=_blank>胡</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_hua4.html target=_blank>华</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_huang2.html target=_blank>黄</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/huo010600.html target=_blank>霍</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/ji030100.html target=_blank>姬</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_jian3.html target=_blank>简</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_jiang1.html target=_blank>江</a></TD>
</TR>

<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_jiang.html target=_blank>姜</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_jiang3.html target=_blank>蒋</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_jin.html target=_blank>金</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_kang.html target=_blank>康</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ke.html target=_blank>柯</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_kong3.html target=_blank>孔</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_lai4.html target=_blank>赖</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/lang170100.html target=_blank>郎</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/le140600.html target=_blank>乐</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_lei2.html target=_blank>雷</a></TD>
</TR>

<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_li2.html target=_blank>黎</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_li3.html target=_blank>李</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_lian2.html target=_blank>连</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/lian140600.html target=_blank>廉</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/liang030100.html target=_blank>梁</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_liao4.html target=_blank>廖</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_lin2.html target=_blank>林</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ling2.html target=_blank>凌</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_liu2.html target=_blank>刘</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_liu3.html target=_blank>柳</a></TD>
</TR>

<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_long2.html target=_blank>龙</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_lu2.html target=_blank>卢</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/lu170100.html target=_blank>鲁</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_lu4.html target=_blank>陆</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/lu140600.html target=_blank>路</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_lv3.html target=_blank>吕</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_luo2.html target=_blank>罗</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_luo4.html target=_blank>骆</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ma3.html target=_blank>马</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_mei2.html target=_blank>梅</a></TD>
</TR>

<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/meng140600.html target=_blank>孟</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_mo4.html target=_blank>莫</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/mu030100.html target=_blank>母</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/mu130700.html target=_blank>穆</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ni2.html target=_blank>倪</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ning2.html target=_blank>宁</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ou.html target=_blank>欧</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/ou030100.html target=_blank>区</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_pan.html target=_blank>潘</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_peng2.html target=_blank>彭</a></TD>
</TR>

<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_pu2.html target=_blank>蒲</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/pi130700.html target=_blank>皮</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/qi130700.html target=_blank>齐</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/qi030100.html target=_blank>戚</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_qian2.html target=_blank>钱</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/qiang310500.html target=_blank>强</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_qing2.html target=_blank>秦</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/qiu030100.html target=_blank>丘</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_qiu.html target=_blank>邱</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_rao2.html target=_blank>饶</a></TD>
</TR>

<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ren2.html target=_blank>任</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_shen3.html target=_blank>沈</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/sheng010600.html target=_blank>盛</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_shi.html target=_blank>施</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_shi2.html target=_blank>石</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/shi300500.html target=_blank>时</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_shi3.html target=_blank>史</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/situ030100.html target=_blank>司徒</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_su.html target=_blank>苏</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_sun.html target=_blank>孙</a></TD>
</TR>

<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_tan2.html target=_blank>谭</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_tang.html target=_blank>汤</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_tang2.html target=_blank>唐</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_tao2.html target=_blank>陶</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_tian2.html target=_blank>田</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/tong040600.html target=_blank>童</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_tu2.html target=_blank>涂</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wang2.html target=_blank>王</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/wei010600.html target=_blank>危</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wei3.html target=_blank>韦</a></TD>
</TR>

<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/wei180100a.html target=_blank>卫</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wei4.html target=_blank>魏</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wen.html target=_blank>温</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wen2.html target=_blank>文</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_weng.html target=_blank>翁</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wu.html target=_blank>巫</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wu1.html target=_blank>邬</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wu2.html target=_blank>吴</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wu3.html target=_blank>伍</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_wu3a.html target=_blank>武</a></TD>
</TR>

<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/xi040600.html target=_blank>席</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xia4.html target=_blank>夏</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xiao.html target=_blank>萧</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xie4.html target=_blank>谢</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xin.html target=_blank>辛</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xing2.html target=_blank>邢</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xu2.html target=_blank>徐</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xu3.html target=_blank>许</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_xue.html target=_blank>薛</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_yan2.html target=_blank>严</a></TD>
</TR>

<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_yan2a.html target=_blank>颜</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_yang2.html target=_blank>杨</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_ye4.html target=_blank>叶</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_yi4.html target=_blank>易</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/yin020600.html target=_blank>殷</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_you2.html target=_blank>尤</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/yu010600.html target=_blank>于</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_yu2.html target=_blank>余</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/yu020600a.html target=_blank>俞</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/yu020600.html target=_blank>虞</a></TD>
</TR>

<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/yuan310500.html target=_blank>元</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_yuan2.html target=_blank>袁</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/yue030100.html target=_blank>岳</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_yun2.html target=_blank>云</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zeng.html target=_blank>曾</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhan.html target=_blank>詹</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhang.html target=_blank>张</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhang1.html target=_blank>章</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhao4.html target=_blank>赵</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zheng4.html target=_blank>郑</a></TD>
</TR>

<TR ALIGN=CENTER>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhong.html target=_blank>钟</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhou.html target=_blank>周</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhou1.html target=_blank>邹</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhu.html target=_blank>朱</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/zhu180100.html target=_blank>褚</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhuang.html target=_blank>庄</a></TD>
<TD width=90><a href=/zaobao/chinese/surname/pages/story_zhuo.html target=_blank>卓</a></TD>
"""
    # Sample 1: the capture runs up to </TD>, so it still contains the closing
    # </a> tag (before or after the surname); it is stripped after joining.
    result = re.findall(r"target=_blank>(?P<name>[\s\S]*?)</TD>", subject)
    print(("['"+"','".join(result)+"']").replace("</a>",""))
    html=u"""
1李
2王
3张
4刘
5陈
6杨
7赵
8黄
9周
10吴
11徐
12孙
13胡
14朱
15高
16林
17何
18郭
19马
20罗
21梁
22宋
23郑
24谢
25韩
26唐
27冯
28于
29董
30萧
31程
32曹
33袁
34邓
35许
36傅
37沈
38曾
39彭
40吕
41苏
42卢
43蒋
44蔡
45贾
46丁
47魏
48薛
49叶
50阎
51余
52潘
53杜
54戴
55夏
56钟
57汪
58田
59任
60姜
61范
62方
63石
64姚
65谭
66廖
67邹
68熊
69金
70陆
71郝
72孔
73白
74崔
75康
76毛
77邱
78秦
79江
80史
81顾
82侯
83邵
84孟
85龙
86万
87段
88章
89钱
90汤
91尹
92黎
93易
94常
95武
96乔
97贺
98赖
99龚
100文
"""
    # Sample 2: one entry per line.  Splitting on line boundaries (instead of
    # the former space+newline separator) no longer requires every line to
    # end with a trailing space.
    list2=[]
    for line in html.splitlines():
        line=line.strip()
        if line:
            list2.append("'"+line[-1]+"'")
    print("[" +  ",".join(list2) + "]")
    html=u"""
鲍俎(bao zu)、百里(bai li)、碧鲁(bi lu)、伯赏(bo shang)、北堂(bei tang)
单于(chan yu)、陈林(chen lin)、淳于(chun yu)、
第五(di wu)、 东方(dong fang)、东郭(dong guo)、东门(dong men)、段干(duan gan)、独孤(du gu)、端木(duan mu)、
范姜(fan jiang)、
哥舒(ge shu)、公良(gong liang)、公孙(gong sun)、公西(gong xi)、公冶(gong yan)、公羊(gong yang)、缑亢(gou kang)、谷梁(gu liang)、归海(gui hai)、
赫连(he lian)、胡母(hu mu)、呼延(hu yan)、黄方(huang fang)、皇甫(huang fu)、
即墨(ji mo)、夹谷(jia gu)、晋楚(jin chu)、
况后(kuang hou)、
梁丘(liang qiu)、令狐(ling hu)、陆费(lu fei)、闾丘(lv qiu)、闾邱(lv qiu)、
明哲(ming zhe)、墨哈(mo ha)、慕容(mu rong)、万俟(mò qí)
钠兰(na lan)、南宫(nan gong)、南郭(nan guo)、南门(nan men)、年爱(nian ai)、
欧阳(ou yang)、
濮阳(pu yang)、
漆雕(qi diao)、亓官(qi guan)、屈突(qu tu)、
壤驷(rang si)、汝鄢(ru yan)、
司马(si ma)、司空(si kong)、司寇(si kou)、司徒(si tu)、上官(shang guan)、商牟(shang mou)、申屠(shen tu)、侍其(shi qi)、疏束(shu su)、叔孙(shu sun)、
太史(tai shi)、太叔(tai shu)、澹台(tan tai)、涂钦(tu qin)、拓拔(tuo ba)、
完完(wan wan)、完颜(wan yan)、王子(wang zi)、闻人(wen ren)、微生(wei sheng)、巫马(wu ma)、乌雅(wu ya)、铁笔(tie bi)
西门(xi men)、夏侯(xia hou)、许世(xu shi)、轩辕(xuan yuan)、
闫法(yan fa)、羊舌(yang she)、阳佟(yang tong)、耶律(ye lv)、有琴(you qin)、尉迟(yu chi)、余佴(yu er)、宇文(yu wen)、岳帅(yue shuai)、乐正(yue zheng)、
宰父(zai fu)、子车(zi che)、子阳(zi yang)、宗政(zong zheng)、左丘(zuo qiu)、张简(zhang jian)、章佳(zhang jia)、长孙(zhang sun)、郑余(zheng yu)、仲孙(zhong sun)、钟离(zhong li)、诸葛(zhu ge)、颛孙(zhuan sun)、
"""
    # Sample 3: split into lines first, then into entries.  The former split
    # on CRLF left LF-only text in one piece, so the first entry of a line
    # that followed a line without a trailing separator was glued to the
    # previous entry and dropped by the two-character slice.
    list3=[]
    for line in html.splitlines():
        for elem in line.split(u"、"):
            elem=elem.strip()
            if elem:
                list3.append("'" + elem[:2] + "'")
    for elem in list3:
        print(elem)
    print("[" +  ",".join(list3) + "]")

    html=u"""
李 王 张 刘 陈 杨 黄 孙 周 吴
徐 赵 朱 马 胡 郭 林 何 高 梁
郑 罗 宋 谢 唐 韩 曹 许 邓 萧
冯 曾 程 蔡 彭 潘 袁 于 董 余
苏 叶 吕 魏 蒋 田 杜 丁 沈 姜
范 江 傅 钟 卢 汪 戴 崔 任 陆
廖 姚 方 金 邱 夏 谭 韦 贾 邹
石 熊 孟 秦 阎 薛 侯 雷 白 龙
段 郝 孔 邵 史 毛 常 万 顾 赖
武 康 贺 严 尹 钱 施 牛 洪 龚
汤 陶 黎 温 莫 易 樊 乔 文 安
殷 颜 庄 章 鲁 倪 庞 邢 俞 翟
蓝 聂 齐 向 申 葛 柴 伍 覃 骆
关 焦 柳 欧 祝 纪 尚 毕 耿 芦
左 季 管 符 辛 苗 詹 曲 欧阳 靳
祁 路 涂 兰 甘 裴 梅 童 翁 霍
游 阮 尤 岳 柯 牟 滕 谷 舒 卜
成 饶 宁 凌 盛 查 单 冉 鲍 华
包 屈 房 喻 解 蒲 卫 简 时 连
车 项 闵 邬 吉 党 阳 司 费 蒙
席 晏 隋 古 强 穆 姬 宫 景 米
麦 谈 柏 瞿 艾 沙 鄢 桂 窦 郁
缪 畅 巩 卓 褚 栾 戚 全 娄 甄
郎 池 丛 边 岑 农 苟 迟 保 商
臧 佘 卞 虞 刁 冷 应 匡 栗 仇
练 楚 揭 师 官 佟 封 燕 桑 巫
敖 原 植 邝 仲 荆 储 宗 楼 干
苑 寇 盖 南 屠 鞠 荣 井 乐 银
奚 明 麻 雍 花 闻 冼 木 郜 廉
衣 蔺 和 冀 占 公 门 帅 利 满
陈生
"""
    # Sample 4: split on any whitespace run so that the last name of one row
    # and the first name of the next row are separate tokens (splitting on a
    # single space joined them across the newline).
    list4=["'" + token + "'" for token in html.split() if not token.isdigit()]
    print(len(list4))
    print("[" +  ",".join(list4) + "]")

def is_chinese_or_space(str):
    """Return True when every character of ``str`` is Chinese or a space.

    Each character is checked with ``_is_chinese_or_space``.  Byte strings
    are decoded from UTF-8 first (undecodable bytes are dropped) so that the
    check runs per character rather than per byte.  An empty string yields
    True because no character fails the check.

    >>> is_chinese_or_space(u"中国 人")
    True

    >>> is_chinese_or_space(u"中国 人1")
    False

    >>> is_chinese_or_space(u"华为huawei")
    False

    >>> is_chinese_or_space(u"游泳裤xxxl")
    False
    """
    # The parameter name shadows the builtin ``str``; it is kept so that
    # existing keyword callers keep working.
    text = str
    if isinstance(text, bytes):
        # The earlier code called .encode() here, which left a byte string as
        # bytes and made the loop below walk over raw UTF-8 bytes.
        text = text.decode("utf-8", "ignore")
    return all(_is_chinese_or_space(char) for char in text)

def is_english_or_space(str):
    """Return True when every character of ``str`` is a letter or a space.

    Each character is checked with ``_is_english_or_space``, which rejects
    Chinese characters, digits and punctuation.  Byte strings are decoded
    from UTF-8 first (undecodable bytes are dropped) so that the check runs
    per character rather than per byte.  An empty string yields True because
    no character fails the check.

    >>> is_english_or_space(u"abc def1")
    False

    >>> is_english_or_space(u"abc def")
    True

    >>> is_english_or_space(u"游泳裤xxxl")
    False

    >>> is_english_or_space(u"茶具")
    False
    """
    # The parameter name shadows the builtin ``str``; it is kept so that
    # existing keyword callers keep working.
    text = str
    if isinstance(text, bytes):
        # The earlier code called .encode() here, which left a byte string as
        # bytes and made the loop below walk over raw UTF-8 bytes.
        text = text.decode("utf-8", "ignore")
    return all(_is_english_or_space(char) for char in text)

def _is_chinese_or_space(uchar):
    """Return True when the single character ``uchar`` is Chinese or a space.

    "Chinese" means whatever ``is_chinese`` (defined outside this excerpt)
    accepts.  The examples call this helper directly; they previously
    exercised the public ``is_chinese_or_space`` wrapper instead.

    >>> _is_chinese_or_space(u"人")
    True

    >>> _is_chinese_or_space(u"1")
    False

    >>> _is_chinese_or_space(u" ")
    True
    """
    # bool() keeps the return value a strict True/False even if is_chinese
    # returns some other truthy object.
    return bool(is_chinese(uchar) or uchar == u" ")

def _is_english_or_space(uchar):
    """Tell whether one character is a non-Chinese letter or a space.

    ``is_chinese`` is consulted first; a character it accepts is rejected
    here.  Otherwise the character passes when ``isalpha()`` is true or when
    it is a single space.

    >>> _is_english_or_space(u"1")
    False

    >>> _is_english_or_space(u"a")
    True

    >>> _is_english_or_space(u" ")
    True

    >>> _is_english_or_space(u"中")
    False
    """
    letter_or_blank = lambda ch: ch.isalpha() or ch == u" "
    return (not is_chinese(uchar)) and letter_or_blank(uchar)
   
def find_one_side_chinese_and_another_side_is_english():
    """Copy the Chinese/English name pairs of short_name_global.txt to a new file.

    Reads ``./short_name_global.txt`` as UTF-8; each line holds two names
    separated by ``|||``.  A line is written unchanged to
    ``./short_name_global_chinese_english.txt`` when one name passes
    ``is_chinese_or_space`` and the other passes ``is_english_or_space``,
    in either order.

    Blank lines are skipped.  Both files are closed even when an error
    occurs.

    Raises:
        ValueError: if a non-blank line does not contain exactly one
            ``|||`` separator (raised by the tuple unpacking).
    """
    # The output is opened first, as before, so it is truncated even when the
    # input file turns out to be missing.
    with codecs.open("./short_name_global_chinese_english.txt","w","utf-8") as out:
        with codecs.open("./short_name_global.txt","r","utf-8") as src:
            for line in src:
                stripped = line.strip()
                if not stripped:
                    # e.g. a trailing empty line; it cannot hold a pair
                    continue
                a, b = [part.strip() for part in stripped.split("|||")]
                chinese_then_english = is_chinese_or_space(a) and is_english_or_space(b)
                if chinese_then_english or (is_chinese_or_space(b) and is_english_or_space(a)):
                    # keep the original line, including its line ending
                    out.write(line)

               
# Script entry point.  Runs only when the file is executed directly, not when
# it is imported.
if __name__=="__main__":
    # Execute the doctests embedded in this module's docstrings first.
    doctest.testmod()
    # The commented-out calls below are presumably the other jobs of this
    # script, enabled by hand one at a time; most of them are defined outside
    # this part of the file -- verify a name exists before re-enabling it.
#    read_relevent_words()
#    parser_one_line_one_words2()
#    compare_pareser_one_line_one_words_result_lost_line_for_tmp()
#    build_invert_index()
#    build_word_segments_hash_map()
#    final_find_synomns_out()
#    interactive_mode()
#    print _filter("龟 鹿 补 肾丸 4.5 g*12 袋 水 蜜丸 / 盒 [ 补 肾 失眠 体弱 疲乏 壮 阳 ]")
#    print _filter("龟 牌 ( turtle ) 硬壳 防水 全效 蜡 g-223r")
#    post_process_wname_segments_illegal_characters()
#    filter_synonym_result()
#    test_redis_is_ready()
#    pivot_query_relvent_word_order_and_intersation_size()
#    find_short_name()
#    find_short_name2()
#    test_sorted()
#    find_only_one_word_difference()
#    extrace_names()
#    find_human_names()
    # Currently enabled job: filter the Chinese/English name pairs.
    find_one_side_chinese_and_another_side_is_english()
#    print is_english_or_space(u"茶具")

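# 你可能感兴趣的:(Export)  -- web page footer ("You may also be interested in: Export") left over from the page this script was copied from; not part of the program.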