批量处理CSV文件


有必要先介绍两个很棒的库

1)glob        #用于查找符合特定规则的文件路径(如果你对 os 模块很熟练,用 os 模块也可以)

def get_all_content():
    """Return the list of CSV paths matched by the glob pattern below."""
    # Placeholder pattern: fill in your own directory here.
    return glob.glob(r'D:/../../*.csv')
这样就搜索到了文件夹中所有的CSV文件

2)re        #用于匹配

note:

[\u4E00-\u9FA5]  #用来匹配中文        
[\d{4}]        #用来匹配数字(注意:方括号内是字符集,实际只匹配单个数字或"{"、"}"字符;若要精确匹配连续四个数字,应写成 \d{4})
        # Search the current file path for a name ending in "说明会"; the part
        # inside the parentheses is capture group 1.
        # NOTE(review): [\d{4}] is a character class (a digit, "{", "4" or "}"),
        # not "exactly four digits" - confirm this is what is intended.
        pattern = "\\.*?(\w+[\u4E00-\u9FA5]+[\d{4}]+[\u4E00-\u9FA5]+说明会)"
        string =  all_path_[j]
        match_obj = re.search(pattern,string)
完整代码:
"""
author:suxue
date:2018/6/20
version:1.0
copyright:[email protected]

"""
"""
#1.提取问题和答案
#2.分词并对数据做简单清洗
#3.计算TF-IDF,提取关键词
#4.词袋向量化,对于每一对问答,计算出一个余弦相似度
#5.阈值判断,归一化处理
"""
import  glob          #查找符合特定规则的文件路径
import codecs
from os.path import exists,isdir,basename,join,splitext
import pandas as pd
import numpy as np
import csv
import os
import re
import time
import warnings
import gensim
from gensim import corpora,models,similarities
import jieba                    #引入结巴
import jieba.posseg as pseg     #引入结巴词性标注
warnings.filterwarnings(action='ignore',category=UserWarning,module='gensim')
warnings.filterwarnings(action='ignore',category=UserWarning,module='pandas')

#获取所有文件
def all_path(dirname):
    """Recursively collect the path of every file under ``dirname``.

    Args:
        dirname: Root directory to walk.

    Returns:
        A list of file paths (each directory path joined with a file name),
        in the order ``os.walk`` yields them. Empty if nothing is found.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(dirname)
        for name in names
    ]
def get_all_content(pattern=r'D:/GFZQ/GFZQ/xuexuesu/*.csv'):
    """Return the paths of all files matching a glob pattern.

    Args:
        pattern: Glob pattern to search. Defaults to the CSV directory that
            was previously hard-coded, so existing no-argument calls behave
            exactly as before.

    Returns:
        A list of matching paths, in the order ``glob.glob`` returns them
        (empty when nothing matches).
    """
    return glob.glob(pattern)

#获取停用词库
def loadPoorEnt(path2 = './stopwords.csv'):
    """Load the stop-word file into a set, one entry per line.

    Args:
        path2: Path of a UTF-8 text file with one stop word per line.

    Returns:
        A set of str holding every line with its carriage-return and
        newline characters removed (a blank line contributes ``''``).
    """
    with open(path2, 'r', encoding='utf-8') as handle:
        return {raw.replace('\r', '').replace('\n', '') for raw in handle}
# Module-level stop-word set, loaded once at import time from the default
# path './stopwords.csv'; cut() reads this global when filtering tokens.
stop_words=loadPoorEnt()

#1.提取问题和答案
def extra_data(path):
    """Read one CSV file and return its question and answer columns.

    Args:
        path: Path of a UTF-8 encoded CSV file that has ``Question`` and
            ``Answer`` columns.

    Returns:
        A tuple ``(Questions, Answers, trainFile)``: the two columns as
        pandas Series, plus the full DataFrame.

    Raises:
        KeyError: If either expected column is missing.
    """
    # Use a context manager so the handle is closed once parsing finishes;
    # it was previously opened and never closed.
    with open(path, encoding='utf-8') as f:
        trainFile = pd.read_csv(f)
    Questions = trainFile['Question']
    Answers = trainFile['Answer']
    return Questions, Answers, trainFile

#2.分词并做简单清洗
def cut(data):
    """Tokenize each text with jieba and drop stop words.

    Args:
        data: Iterable of text strings (for example a column of questions).

    Returns:
        A list with one entry per input text; each entry is the list of
        token strings that are not in the module-level ``stop_words`` set.
    """
    result = []
    for line in data:
        # Compare the str token itself. stop_words is read in text mode and
        # therefore holds str entries, so the former .encode('utf8') produced
        # bytes that never matched and no stop word was ever removed.
        tokens = [
            item.word for item in pseg.cut(line)
            if item.word not in stop_words
        ]
        result.append(tokens)
    return result

#3.使用TF-IDF对语料库进行建模
def model(all_data):
    """Fit a gensim TF-IDF model on a tokenized corpus.

    Args:
        all_data: Iterable of documents, each an iterable of tokens.

    Returns:
        The ``models.TfidfModel`` built from the bag-of-words corpus.
    """
    documents = [list(text) for text in all_data]
    vocabulary = corpora.Dictionary(documents)
    bow_corpus = [vocabulary.doc2bow(doc) for doc in documents]
    return models.TfidfModel(bow_corpus)

#用于将词汇向量化
def list2vec(list,length):
    """Expand sparse (index, value) pairs into a dense vector.

    Args:
        list: Iterable of entries whose element 0 is a position and whose
            element 1 is the value to store there.
        length: Size of the resulting vector.

    Returns:
        A NumPy array of ``length`` zeros with each listed position set to
        its value; when a position repeats, the last value wins.
    """
    vector = np.zeros(length)
    for entry in list:
        position, weight = entry[0], entry[1]
        vector[position] = weight
    return vector

#计算余弦值
def cal_cos(vec_a,vec_b):
    """Return the cosine similarity of two vectors.

    Args:
        vec_a: First NumPy vector.
        vec_b: Second NumPy vector of the same length.

    Returns:
        ``dot(vec_a, vec_b) / (|vec_a| * |vec_b|)``, or ``0.0`` when either
        vector has zero norm.
    """
    num = np.dot(vec_a, vec_b.T)
    denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    # A zero vector has no direction, so 0/0 would yield NaN together with
    # a RuntimeWarning; report "no similarity" instead.
    if denom == 0:
        return 0.0
    return num / denom
#计算时间
def cal_time(time):
    """Format a duration given in seconds as a short readable string.

    Args:
        time: Elapsed time in seconds.

    Returns:
        The value followed by ``'secs '`` when under a minute, the value in
        minutes followed by ``' mins '`` when under an hour, otherwise the
        value in hours followed by ``' hours '``.
    """
    one_minute = 60
    one_hour = 60 * 60
    if time < one_minute:
        return str(time) + 'secs '
    if time < one_hour:
        minutes = time / 60.0
        return str(minutes) + ' mins '
    hours = time / (60 * 60.0)
    return str(hours) + ' hours '
#主函数
if __name__ =='__main__':
    # Script entry point: for each input CSV, compute the TF-IDF cosine
    # similarity of every question/answer pair and write it to a result CSV.

    # time.clock() was removed in Python 3.8; time.perf_counter() is the
    # replacement for timing intervals.
    start = time.perf_counter()
    all_path_ = get_all_content()
    # Process at most the first five files (same limit as before). Slicing
    # instead of indexing over range(5) avoids an IndexError when fewer
    # files are found.
    max_files = 5
    # Raw string with the same regex semantics as the former non-raw
    # literal, which relied on invalid escape sequences ("\w", "\d").
    # Compiled once here instead of on every iteration.
    # NOTE(review): [\d{4}] is a character class (a digit, "{", "4" or "}"),
    # not "exactly four digits"; kept as-is so the same names still match.
    name_pattern = re.compile(
        r"\.*?(\w+[\u4E00-\u9FA5]+[\d{4}]+[\u4E00-\u9FA5]+说明会)")
    for j, csv_path in enumerate(all_path_[:max_files]):
        Question, Answer, _ = extra_data(csv_path)
        print ("读取数据完成...")
        cut_question = cut(Question)
        cut_answer = cut(Answer)
        # Concatenate into a new list: append()/extend() mutate in place and
        # return None, so their return value must not be assigned.
        raw_data = cut_question + cut_answer
        print ("分词完成....")
        dictionary = corpora.Dictionary(raw_data)
        corpus = [dictionary.doc2bow(doc) for doc in raw_data]
        tf_idf = models.TfidfModel(corpus)
        # Token ids run from 0 to len(dictionary) - 1, so this is the exact
        # dense vector size. The old flattened-corpus length was only an
        # upper bound; the extra zero padding never affected the cosine.
        length = len(dictionary)
        result = []
        for q_tokens, a_tokens in zip(cut_question, cut_answer):
            # Indexing the model with a bag-of-words yields (id, tf-idf) pairs.
            vec1 = tf_idf[dictionary.doc2bow(q_tokens)]
            vec2 = tf_idf[dictionary.doc2bow(a_tokens)]
            vec_a = list2vec(vec1, length)
            vec_b = list2vec(vec2, length)
            result.append(cal_cos(vec_a, vec_b))
        match_obj = name_pattern.search(csv_path)
        if match_obj:
            file_name = match_obj.group(1)
            df2 = pd.DataFrame({'Cosin':result})
            # Raw string: same path as before, without invalid "\G" / "\C"
            # escape sequences.
            df2.to_csv(r'D:\GFZQ\GFZQ\Cosin\%s.csv' % file_name)
        else:
            # Previously the scores were dropped without any notice.
            print("未匹配到文件名,跳过保存: %s" % csv_path)
        print("finish第%d家company" % (j + 1))
    elapsed = (time.perf_counter() - start)
    print('Time use', cal_time(elapsed))



你可能感兴趣的:(NLP)