NLP关键词提取(一)

NLP关键词提取(一)

      • jieba分词
      • 将csv文件的数据按照一定规则放入dataframe中
      • 提取role、key_word、end_time、begin_time列值

import jieba
import pandas as pd
import re
data=pd.read_csv(r'/Users/atsushi/Desktop/python/data_analysis/data/data.csv',encoding='gbk',header=None)
#header=None: the CSV file has no header row, so pandas assigns default integer column names
#give the two columns meaningful names: foreign_id (record key) and text (raw call transcript)
data.columns=['foreign_id','text']

jieba分词

1.自定义添加少数词和字典
jieba.add_word("长江黄河")
2.自定义添加很多词和字典
jieba.load_userdict('user_dict.txt')函数
user_dict.txt格式如下:
jieba.cut方法接收三个输入参数:需要分词的字符串;cut_all参数用来控制是否采用全模式,HMM参数用来控制是否使用HMM模型

# Register each domain-specific term with jieba so the tokenizer keeps it
# as a single token instead of splitting it into characters/sub-words.
key_words=['柜员机','分期','有效期']
for kw in key_words:
    jieba.add_word(kw)

将csv文件的数据按照一定规则放入dataframe中

# Build one row per utterance: each record's text holds several parenthesised
# tuples ("(sentence- role- ...- begin- end)"), one per spoken sentence.
df=pd.DataFrame(columns=['foreign_id','sentence','role','key_word','begin_time','end_time'])
# BUG FIX: the original `df1=df=pd.DataFrame(...)` rebound `df` to a fresh
# empty frame on every iteration, so only the LAST record's sentences survived.
# Collect the partial frames and concatenate once at the end instead
# (DataFrame.append was also removed in pandas 2.0; pd.concat replaces it).
parts=[]
for i in range(data.shape[0]):
    df1=pd.DataFrame(columns=['foreign_id','sentence','role','key_word','begin_time','end_time'])
    # .*? is a non-greedy match, so consecutive "(...)" tuples are not merged
    df1.sentence=re.findall(r'\(.*?\)',data['text'][i])
    df1.foreign_id=data['foreign_id'][i]
    parts.append(df1)
df=pd.concat([df]+parts)
# .*? 为正则表达式中的非贪婪匹配规则
#re.findall(r'\(.*?\)',data['text'][0])的返回结果为一个list:["('您好,请问有什么可以帮你'- '坐席'- ''- ''- '0.18'- '1.32')","('你好,请问是**公司吗'- '客户'- ''- ''- '2.00'- '5.32')","('是的,请说明你的问题,很高兴为你服务'- '坐席'- ''- ''- '6.76'- '10.32')"]

提取role、key_word、end_time、begin_time列值

#extract the speaker role: strip quotes/parentheses from the raw tuple and
#take the second '-'-separated field
def get_role(sentence):
    cleaned=re.sub(r"['\(\)]",'',sentence)
    return cleaned.split('-')[1].strip()

#extract key words: tokenize the utterance text (first '-'-separated field)
#with jieba and keep only tokens present in the module-level key_words list,
#joined with commas; returns '' when no key word is found
def get_keyword(sentence):
    words=re.sub(r"['\(\)]",'',sentence).split('-')[0].strip()
    # (removed the unused `keywords=[]` accumulator from the original)
    return ','.join(w for w in jieba.lcut(words) if w in key_words)

#extract begin_time: strip quotes/parentheses from the raw tuple and take
#the fifth '-'-separated field
def get_begin_time(sentence):
    fields=re.sub(r"['\(\)]",'',sentence).split('-')
    return fields[4].strip()

#extract end_time: strip quotes/parentheses from the raw tuple and take
#the sixth '-'-separated field
def get_end_time(sentence):
    # BUG FIX: the original indexed field 4, which is begin_time;
    # end_time is the last field, index 5
    end_time=re.sub(r"['\(\)]",'',sentence).split('-')[5].strip()
    return end_time

# Derive the structured columns from each raw sentence tuple.
df['role']=df['sentence'].apply(get_role)
df['key_word']=df['sentence'].apply(get_keyword)
df['begin_time']=df['sentence'].apply(get_begin_time)
df['end_time']=df['sentence'].apply(get_end_time)

# Drop rows whose sentence contained none of the registered key words
# (get_keyword returns '' in that case).
df=df[df['key_word']!='']
# Reset to a clean 0..n-1 index after filtering.
df.reset_index(drop=True,inplace=True)
# Explode comma-separated key_word values into one row per key word:
# split into columns, stack into a long Series (building a MultiIndex),
# drop the inner index level, rename, and join back on the original row
# index so the other columns are duplicated per key word.
result=df.drop('key_word',axis=1).join(df['key_word'].str.split(',',expand=True).stack().reset_index(level=1,drop=True).rename('key_word'))

你可能感兴趣的:(自然语言学习)