Text Cleaning and Word Segmentation

Text Cleaning

'''
Code source: https://blog.csdn.net/qq_43814415/article/details/119517978?spm=1001.2101.3001.6650.15&utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7ECTRLIST%7ERate-15-119517978-blog-122389948.pc_relevant_recovery_v2&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7ECTRLIST%7ERate-15-119517978-blog-122389948.pc_relevant_recovery_v2&utm_relevant_index=16
'''
import re

import jieba
def clean(line):
    """对一个文件的数据进行清洗"""
    rep=['【】','【','】','','',
        '','','','','❤️','………','','...、、',',,','..','','',
         '⚕️','','','','','','','','','','✧٩(ˊωˋ*)و✧','','????','//','','','','',
         '(ღ˘⌣˘ღ)','✧\٩(눈౪눈)و//✧','','','',
         '','','','','','(ง•̀_•́)ง!','️','',
         '','⊙∀⊙!','','【?','+1','','','','','',
         '','!!!!','','\(^▽^)/','','','',
         '','','','','0371-12345','☕️','','','','','','\U0001f92e\U0001f92e','','+1','','','','➕1',
         '','::','','√','x','!!!','','♂️','','','o(^o^)o','mei\u2006sha\u2006shi','','','',
         '','关注','……','(((╹д╹;)))','⚠️','Ծ‸Ծ','⛽️','','',
         '️','','…','','[]','[',']','→_→','','','"','','ฅ۶•ﻌ•♡','','️',
         '','','(ง•̀_•́)ง','','✊','','','','',':','','(*^▽^)/★*☆','','','','','(✪▽✪)','(❁´ω`❁)','1⃣3⃣','(^_^)/','☀️',
	     '','','','','→_→','','✨','❄️','•','','','','','','⊙∀⊙!','','✌(̿▀̿\u2009̿Ĺ̯̿̿▀̿̿)✌',
         '','','','','','','','','♡♪..•͈ᴗ•͈✩‧₊˚','','','','','','','','(✪▽✪)','','','','♂️','','✌️','',' ̄ ̄)σ',
         '','','','','✊','','','','','✔️','','','','❤','','','','丨','✅','','ノ','☀','5⃣⏺1⃣0⃣','','','','',
         '',
         ]
    pattern_0 = re.compile(r'#.*?#')  # match hashtag topics of the form #...#
    pattern_1 = re.compile(r'【.*?】')  # match text wrapped in 【】
    pattern_2 = re.compile(r'肺炎@([\u4e00-\u9fa5\w\-]+)')  # match @mentions glued to a topic word, e.g. 肺炎@环球时报
    pattern_3 = re.compile(r'@([\u4e00-\u9fa5\w\-]+)')  # match @mentions
    pattern_4 = re.compile(u'[\U00010000-\U0010ffff\uD800-\uDBFF\uDC00-\uDFFF]')  # match emoji (astral-plane code points and surrogates)
    pattern_5 = re.compile('(.*?)')  # match some kaomoji (the intended delimiters of this pattern appear to have been lost)
    pattern_7 = re.compile(r'L.*?的微博视频')  # match "L...的微博视频" video-link text
    pattern_8 = re.compile('(.*?)')  # as above, the intended delimiters appear to have been lost
    #pattern_9 = re.compile(u"\|[\u4e00-\u9fa5]*\|")  # match Chinese text between vertical bars

    line = line.replace('O网页链接', '')
    line = line.replace('-----', '')
    line = line.replace('①', '')
    line = line.replace('②', '')
    line = line.replace('③', '')
    line = line.replace('④', '')
    line = line.replace('>>', '')
    line = re.sub(pattern_0, '', line)  # remove hashtag topics
    line = re.sub(pattern_1, '', line)  # remove 【】 blocks
    line = re.sub(pattern_2, '', line)  # remove @mentions (topic-glued form)
    line = re.sub(pattern_3, '', line)  # remove @mentions
    line = re.sub(pattern_4, '', line)  # remove emoji
    line = re.sub(pattern_5, '', line)  # remove some kaomoji
    line = re.sub(pattern_7, '', line)  # remove "L...的微博视频" link text
    line = re.sub(pattern_8, '', line)
    line = re.sub(r'\[\S+\]', '', line)  # remove bracketed Weibo emoticon codes
    
    # finally, strip the remaining literal substrings listed in rep
    for i in rep:
        line = line.replace(i, '')
    return line
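
A quick sanity check of clean() on a single made-up post (the sample text and expected output below are illustrative only, not from the original dataset):

sample = '【最新通报】#疫情# 今天新增5例 @环球时报 O网页链接 L央视新闻的微博视频 [doge]'
print(clean(sample))
# -> roughly ' 今天新增5例 ' (with some leftover whitespace where the noise was removed)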

Word Segmentation

def seg_sentence(sentence):
    """Segment one cleaned sentence, dropping stopwords and normalising synonyms."""
    sentence = re.sub(r'[0-9.]+', '', sentence)  # strip digits and decimal points
    jieba.load_userdict('自建词表.txt')  # load the user-defined dictionary (done on every call here)
    #jieba.suggest_freq((), tune=True)  # tune word frequencies so a phrase gets segmented as desired
    #jieba.add_word('知识集成')  # add a user-defined word to supplement jieba's dictionary
    sentence_seged = jieba.cut(sentence.strip(), cut_all=False)  # precise mode (the default)
    #sentence_seged = jieba.cut_for_search(sentence.strip(), HMM=True)  # search-engine mode
    #keywords = jieba.analyse.extract_tags(sentence, topK=30, withWeight=True, allowPOS=('n', 'v', 'nr', 'ns'))  # keyword mode
    #sentence_seged = [item[0] for item in keywords]
    stopwords = stopwordslist('停用词表.txt')  # path to the stopword list
    synwords = synwordslist('近义词表.txt')  # path to the synonym list
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords and len(word) > 1 and word != '\t':  # keep non-stopword tokens longer than one character
            if word in synwords:  # replace a synonym with its canonical word
                word = synwords[word]
            outstr += word + " "
    return outstr
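
seg_sentence relies on two helper functions, stopwordslist and synwordslist, that are not shown in the post. A minimal sketch of what they might look like, assuming the stopword file has one word per line and the synonym file has whitespace-separated lines whose first word is the canonical form and the rest are its synonyms (both file formats are assumptions):

def stopwordslist(filepath):
    """Read a stopword file (assumed: one word per line, UTF-8) into a list."""
    with open(filepath, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]

def synwordslist(filepath):
    """Read a synonym file (assumed format: '标准词 同义词1 同义词2 ...' per line)
    into a dict mapping each synonym to its canonical word."""
    syn = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            words = line.strip().split()
            if len(words) >= 2:
                for w in words[1:]:
                    syn[w] = words[0]
    return syn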
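
Putting the two steps together on a small in-memory example, assuming 自建词表.txt, 停用词表.txt, and 近义词表.txt exist in the working directory (the sample posts are made up, and the exact tokens in the output depend on those dictionary files):

raw_posts = ['【提醒】#出行# 今天气温骤降 @气象局 O网页链接', '转发微博 注意防护!!!']
cleaned = [clean(p) for p in raw_posts]      # strip topics, mentions, links, emoticons
segmented = [seg_sentence(p) for p in cleaned]  # tokenize, drop stopwords, map synonyms
print(segmented)  # rough idea: ['气温 骤降 ', '转发 微博 注意 防护 ']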
