Text cleaning + Python + regular expressions + word frequency counting

Clean the text and export it to a file

import re

# make English text clean
def clean_en_text(text):
    # keep English letters, digits and spaces; everything else becomes a space
    # note: inside [...] each character stands for itself, so only one leading ^ is needed
    comp = re.compile('[^A-Za-z0-9 ]')
    return comp.sub(' ', text)

# make Chinese text clean
def clean_zh_text(text):
    # keep English letters, digits and Chinese characters; everything else becomes a space
    comp = re.compile('[^A-Za-z0-9\u4e00-\u9fa5]')
    return comp.sub(' ', text)

def file_en_clean(r_file_ad, w_file_ad):
    f = open(r_file_ad,'rt')
    print('Reading from file:', f.name)
    lines = f.readlines()
    output = []
    for line in lines:
        line = clean_en_text(line)
        output.append(line)
    f.close()
    f = open(w_file_ad,'w')
    print('Writing to file:', f.name)
    for o in output:
        f.write(o)
        f.write('\n')
    f.close()
if __name__ == '__main__':
    # this script and the two .txt files are in the same directory
    file_en_clean('./e2.txt', './new_test.txt')
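
A quick way to sanity-check the two cleaning patterns is to run them on a short made-up sample string; the snippet below is an illustrative sketch, not part of the original script:

import re

# same keep-sets as above: English letters, digits and spaces / plus Chinese characters
en_pattern = re.compile('[^A-Za-z0-9 ]')
zh_pattern = re.compile('[^A-Za-z0-9\u4e00-\u9fa5]')

sample = 'Hello, world! 你好,世界 (2024)?'
print(en_pattern.sub(' ', sample))  # punctuation and Chinese characters turn into spaces
print(zh_pattern.sub(' ', sample))  # punctuation turns into spaces, Chinese characters are kept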

Add word frequency counting and export the results to files

import re

# make English text clean
def clean_en_text(text):
    # keep English letters, digits and spaces; everything else becomes a space
    comp = re.compile('[^A-Za-z0-9 ]')
    return comp.sub(' ', text)

# make Chinese text clean
def clean_zh_text(text):
    # keep English letters, digits and Chinese characters; everything else becomes a space
    comp = re.compile('[^A-Za-z0-9\u4e00-\u9fa5]')
    return comp.sub(' ', text)

def dealed_list(filename):
    f = open(filename,'rt')
    print('Reading from file:', f.name)
    lines = f.readlines()
    output = []
    for line in lines:
        line = clean_en_text(line)
        output.append(line)
    f.close()
    return output

def readlist(dealed_lines):
    wordsL = []  # use this list to collect the words
    for line in dealed_lines:
        line = line.lower()
        line = line.strip()
        words = line.split()
        wordsL = wordsL + words
    return wordsL

# count the frequency of every word, store it in a dictionary,
# and sort the dictionary items by value from large to small
def count(wordsL):
    wordsD = {}
    for x in wordsL:
        # skip the tokens we don't need
        if Judge(x):
            continue
        # count (start at 0 so the first occurrence is counted once)
        if x not in wordsD:
            wordsD[x] = 0
        wordsD[x] += 1
    # sort dictionary items by value from large to small
    wordsInorder = sorted(wordsD.items(), key=lambda x: x[1], reverse=True)
    return wordsInorder

# judge whether the word is one we want to remove, such as punctuation or a single letter
# you can modify this function to remove more tokens, such as numbers
def Judge(word):
    punctList = [' ', '\t', '\n', ',', '.', ':', '?']  # punctuation tokens to drop
    letterList = ['a', 'b', 'c', 'd', 'm', 'n', 'x', 'p', 't']  # single letters to drop
    if word in punctList:
        return True
    elif word in letterList:
        return True
    else:
        return False

if __name__ == '__main__':
    for x in range(1, 6):
        filename = 'e' + str(x) + '.txt'
        # strip the characters we don't need
        L = dealed_list(filename)
        wordsL = readlist(L)
        words = count(wordsL)
        # the ./results directory must already exist
        fw = open('./results/words_e' + str(x) + '.txt', 'w')
        for item in words:
            fw.write(item[0] + '\t' + str(item[1]) + '\n')
        fw.close()
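
Note that open() raises FileNotFoundError if the ./results directory does not exist yet. A minimal guard (an addition, not part of the original script) is to create it before the loop with os.makedirs:

import os

# create the output directory if it is missing; exist_ok avoids an error when it already exists
os.makedirs('./results', exist_ok=True)

Each words_eX.txt then contains one token per line, followed by a tab and its count, sorted from most to least frequent.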

Reference posts:
用Python实现针对英文论文的词频分析
Python正则表达式做文本预处理,去掉特殊符号
