Text cleaning
'''
Code source: https://blog.csdn.net/qq_43814415/article/details/119517978?spm=1001.2101.3001.6650.15&utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7ECTRLIST%7ERate-15-119517978-blog-122389948.pc_relevant_recovery_v2&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7ECTRLIST%7ERate-15-119517978-blog-122389948.pc_relevant_recovery_v2&utm_relevant_index=16
'''
import re
import jieba
def clean(line):
    """Clean one line of a file's data."""
    rep = ['【】','【','】','','',
'','','','','❤️','………','','...、、',',,','..','','',
'⚕️','','','','','','','','','','✧٩(ˊωˋ*)و✧','','????','//','','','','',
'(ღ˘⌣˘ღ)','✧\٩(눈౪눈)و//✧','','','',
'','','','','','(ง•̀_•́)ง!','️','',
'','⊙∀⊙!','','【?','+1','','','','','',
'','!!!!','','\(^▽^)/','','','',
'','','','','0371-12345','☕️','','','','','','\U0001f92e\U0001f92e','','+1','','','','➕1',
'','::','','√','x','!!!','','♂️','','','o(^o^)o','mei\u2006sha\u2006shi','','','',
'','关注','……','(((╹д╹;)))','⚠️','Ծ‸Ծ','⛽️','','',
'️','','…','','[]','[',']','→_→','','','"','','ฅ۶•ﻌ•♡','','️',
'','','(ง•̀_•́)ง','','✊','','','','',':','','(*^▽^)/★*☆','','','','','(✪▽✪)','(❁´ω`❁)','1⃣3⃣','(^_^)/','☀️',
'','','','','→_→','','✨','❄️','•','','','','','','⊙∀⊙!','','✌(̿▀̿\u2009̿Ĺ̯̿̿▀̿̿)✌',
'','','','','','','','','♡♪..•͈ᴗ•͈✩‧₊˚','','','','','','','','(✪▽✪)','','','','♂️','','✌️','',' ̄ ̄)σ',
'','','','','✊','','','','','✔️','','','','❤','','','','丨','✅','','ノ','☀','5⃣⏺1⃣0⃣','','','','',
'',
]
    pattern_0 = re.compile('#.*?#')  # match hashtag topic names like #...#
    pattern_1 = re.compile('【.*?】')  # match bracketed headlines 【...】
    pattern_2 = re.compile('肺炎@([\u4e00-\u9fa5\w\-]+)')  # match mentions of the form 肺炎@环球时报
    pattern_3 = re.compile('@([\u4e00-\u9fa5\w\-]+)')  # match @username mentions
    pattern_4 = re.compile(u'[\U00010000-\U0010ffff\uD800-\uDBFF\uDC00-\uDFFF]')  # match emoji (astral-plane characters and surrogates)
    pattern_5 = re.compile('(.*?)')  # match part of the kaomoji (the enclosing delimiters were lost in the source, so as written this is effectively a no-op)
    pattern_7 = re.compile('L.*?的微博视频')  # match "L...的微博视频" video links
    pattern_8 = re.compile('(.*?)')
    #pattern_9 = re.compile(u"\|[\u4e00-\u9fa5]*\|")  # match Chinese text between vertical bars
    line = line.replace('O网页链接', '')
    line = line.replace('-----', '')
    line = line.replace('①', '')
    line = line.replace('②', '')
    line = line.replace('③', '')
    line = line.replace('④', '')
    line = line.replace('>>', '')
    line = re.sub(pattern_0, '', line)  # remove hashtag topics
    line = re.sub(pattern_1, '', line)  # remove 【】 headlines
    line = re.sub(pattern_2, '', line)  # remove 肺炎@ mentions
    line = re.sub(pattern_3, '', line)  # remove @ mentions
    line = re.sub(pattern_4, '', line)  # remove emoji
    line = re.sub(pattern_5, '', line)  # remove part of the kaomoji
    line = re.sub(pattern_7, '', line)  # remove Weibo video links
    line = re.sub(pattern_8, '', line)
    line = re.sub(r'\[\S+\]', '', line)  # remove bracketed emoticon codes such as [笑]
    for i in rep:
        line = line.replace(i, '')
    return line
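A minimal sanity-check sketch for clean(); the sample post below is made up for illustration and is not from the original dataset:

# Illustrative only: the hashtag, the 【】 headline, the @-mention, the [笑]
# emoticon code and the 'O网页链接' placeholder should all be stripped by clean().
sample = '#疫情速报#【最新通报】@环球时报 今天新增病例持续下降[笑] O网页链接'
print(clean(sample))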
Word segmentation
def seg_sentence(sentence):
    """Segment one sentence, dropping stopwords and mapping synonyms to a canonical word."""
    sentence = re.sub(u'[0-9\.]+', u'', sentence)  # strip digits and decimal points
    jieba.load_userdict('自建词表.txt')  # load the user-defined dictionary
    #jieba.suggest_freq((), tune=True)  # adjust word frequency so a phrase can be segmented out
    #jieba.add_word('知识集成')  # add a user-defined word to supplement the jieba dictionary
    sentence_seged = jieba.cut(sentence.strip(), cut_all=False)  # precise mode (the default)
    #sentence_seged = jieba.cut_for_search(sentence.strip(), HMM=True)  # search-engine mode
    #keywords = jieba.analyse.extract_tags(sentence, topK=30, withWeight=True, allowPOS=('n', 'v', 'nr', 'ns'))  # keyword mode
    #sentence_seged = [item[0] for item in keywords]
    stopwords = stopwordslist('停用词表.txt')  # path of the stopword list
    synwords = synwordslist('近义词表.txt')  # path of the synonym list
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords and len(word) > 1:  # keep non-stopwords longer than one character
            if word != '\t':
                if word in synwords:  # map a synonym to its canonical word
                    word = synwords[word]
                outstr += word
                outstr += " "
    return outstr
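stopwordslist() and synwordslist() are called above but not defined in this excerpt. Below is a minimal sketch of what they might look like, assuming 停用词表.txt holds one stopword per line and 近义词表.txt holds space-separated synonym groups whose first word is the canonical form; both file formats are assumptions rather than something stated in the source.

def stopwordslist(filepath):
    # Assumed format: one stopword per line.
    with open(filepath, 'r', encoding='utf-8') as f:
        return set(w.strip() for w in f if w.strip())

def synwordslist(filepath):
    # Assumed format: each line is "canonical synonym1 synonym2 ...";
    # returns a dict mapping every synonym to its canonical word.
    synwords = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            words = line.strip().split()
            for w in words[1:]:
                synwords[w] = words[0]
    return synwords

With these in place, seg_sentence(clean(line)) returns a space-separated token string with topics, mentions, emoji, digits, stopwords and single-character tokens removed.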