A Chinese Sentiment Classification Example

A simple binary sentiment classification walkthrough

Sentiment classification assigns a text to one of two (or more) classes, such as positive or negative, according to the meaning and emotional content it expresses. It characterizes the author's leanings, opinions, and attitude, and is therefore sometimes called opinion analysis.
As a special case of classification, it shares the usual problems of general pattern classification but also has difficulties of its own, such as implicit or ambiguous expressions of sentiment and weak polarity.

I. Task Description

This write-up is for readers who split the problem into two subtasks: negative-sentiment classification (Sentiment.py) and key-entity identification (Entity.py).
1. The first task is sentiment classification: decide for each piece of financial news whether it is a negative financial report; if so, label it 1, otherwise 0. The main steps are as follows:

  • a. jieba segmentation + stop-word removal + dropping low-frequency words + some simple cleanup of the tables
  • b. chi-square statistics to pick the most informative features, ranked from high to low (the number of features
    is a tunable parameter; the current value suits my dataset, so adjust it for your own)
  • c. a linear classifier trained on the data (you can try other classifiers and compare the results)

2. Task 1 produces a result file. Simply feed that (id, negative) file into the task-2 code to obtain the final result (id, negative, key_entity).
3. Task 2 is named-entity identification. Only two filters are applied to the entities: "longest string" and "NIKE entity filtering" (see the sketch after this list).

  • a. The longest-string method: if one entity's string contains another entity's string, keep only the containing entity.
    For example, given 小资易贷 and 小资易贷有限公司, keep only 小资易贷有限公司.

  • b. NIKE stands for "Not In Key Entity" (but present in entity). For each entity, count how many times it
    appears in Entity and also in KeyEntity, and how many times it appears in Entity without appearing in KeyEntity. Entities whose in-KeyEntity proportion falls below a chosen threshold are marked as "NIKE" entities and filtered out directly.
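To make the NIKE rule concrete, here is a minimal worked example of the ratio test with made-up counts (the 0.1 threshold matches the self.ratio default in Entity.py below):

# Hypothetical counts for one entity across the negative training rows:
num_in = 1    # times it appeared in entity AND in key_entity
num_out = 9   # times it appeared in entity but NOT in key_entity

freq_in = num_in / (num_in + num_out)  # 1 / 10 = 0.1
ratio = 0.1                            # threshold used in Entity.py

# The filter uses a strict comparison, so 0.1 < 0.1 is False
# and this entity would be kept; with num_in = 0 it would be dropped.
print(freq_in < ratio)  # False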

II. Preparing the Dataset

The data takes the following form (the original screenshot is omitted):
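A sketch of the assumed schema, inferred from the columns the scripts actually read (anything beyond these column names is a guess):

Train_Data.csv: text, entity, negative, key_entity (an id column presumably exists as well)
Test_Data.csv:  id, text, entity

Here entity holds candidate entities separated by ";", negative is 1 for negative financial news and 0 otherwise, and key_entity lists the entities the negative news is actually about.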

III. Code Implementation

Task 1, the sentiment classification code (Sentiment.py):

import pandas as pd
import jieba
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.metrics import BigramAssocMeasures
from random import shuffle
import csv
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression


# Task 1: sentiment classification
# Train on all labeled training data, then predict a 0/1 negative label for each test row
# 1. Read in the data
data = pd.read_csv("MY_DATAA/Train_Data.csv")
test = pd.read_csv("MY_DATAA/Test_Data.csv")
neg_text = data[data["negative"] == 1]["text"]
pos_text = data[data["negative"] == 0]["text"]
test_text = test["text"]
test_id = test["id"]
tag = data["negative"]

# Read texts: segment each line with jieba and drop stop words
def read_text(text):
    with open('E:/untitled/CCF_NSJ/NSJ_DATA/stop.txt', 'r', encoding='utf-8') as f:
        stop = [line.strip() for line in f.readlines()]
    texts = []
    for line in text:
        s = line.split('\t')
        fenci = jieba.cut(s[0], cut_all=False)  # accurate (non-full) segmentation mode
        texts.append(list(set(fenci) - set(stop)))
    return texts


# Get the most informative features, ranked high to low by chi-square statistics
def jieba_feature(number):
    posWords = []
    negWords = []
    for items in read_text(pos_text):
        for item in items:
            posWords.append(item)
    for items in read_text(neg_text):
        for item in items:
            negWords.append(item)

    word_fd = FreqDist()  # frequency of every word over all texts
    cond_word_fd = ConditionalFreqDist()  # word frequencies within positive and within negative texts

    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['0'][word] += 1

    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['1'][word] += 1

    pos_word_count = cond_word_fd['0'].N()  # total word tokens in positive texts
    neg_word_count = cond_word_fd['1'].N()  # total word tokens in negative texts
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}  # maps each word to its information score

    for word, freq in word_fd.items():
        # Chi-square score of the word for each class; other measures such as
        # mutual information could be used instead. chi_sq takes the word's count
        # in the class, (its total count, the class's token total), and the grand token total.
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['0'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['1'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score  # a word's score is the sum of its positive and negative chi-square scores

    # Sort the words by information score in descending order; number is the feature dimension and can be tuned until optimal
    best_vals = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    best_words = set([w for w, s in best_vals])
    return dict([(word, True) for word in best_words])


def build_features():
    feature = jieba_feature(4700)  # number of chi-square features to keep; tune this for your own dataset
    posFeatures = []

    for items in read_text(pos_text):
        a = {}
        for item in items:
            if item in feature:
                a[item] = True
        posWords = [a, '0']  # label positive texts with '0'
        posFeatures.append(posWords)
    negFeatures = []
    for items in read_text(neg_text):
        a = {}
        for item in items:
            if item in feature:
                a[item] = True
        negWords = [a, '1']  # label negative texts with '1'
        negFeatures.append(negWords)
    testFeatures = []
    for items in read_text(test_text):
        a = {}
        for item in items:
            if item in feature:
                a[item] = True
        testWords = [a, '0']  # placeholder label; the real test labels are unknown
        testFeatures.append(testWords)
    train = posFeatures + negFeatures
    return train, testFeatures


# Build the feature sets
train, testFeatures = build_features()
# Randomize the order of the training examples
shuffle(train)
data = testFeatures
data, tag2 = zip(*data)  # separate the test features from their placeholder labels


# Create the CSV result file with a header row
def create_csv():
    path = "MY_DATAA/neg.csv"
    with open(path, 'w', newline='') as f:  # newline='' avoids blank rows on Windows
        csv_write = csv.writer(f)
        csv_head = ["id", "negative"]
        csv_write.writerow(csv_head)


# Append one row to the result file
def write_csv(count, state):
    path = "MY_DATAA/neg.csv"
    with open(path, 'a+', newline='') as f:
        csv_write = csv.writer(f)
        data_row = [count, state]
        csv_write.writerow(data_row)


# Create the result file
create_csv()


def score(classifier):
    classifier = SklearnClassifier(classifier)
    classifier.train(train)  # train the classifier
    pred = classifier.classify_many(data)  # predict a label for each test text
    count_1 = 0
    for i, k in test_id.items():
        write_csv(k, pred[count_1])  # write (id, predicted negative flag)
        count_1 = count_1 + 1


# Linear classifier (try other classifiers and compare); writes the predictions to neg.csv
score(LogisticRegression())
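Note that score() only writes predictions and does not compute accuracy, since the test labels are unknown. If you want a rough accuracy estimate first, one option is to hold out part of the shuffled training set. A minimal sketch (the 70/30 split and the dev_accuracy helper are my own additions, not part of the original scripts):

from nltk.classify import accuracy as nltk_accuracy

def dev_accuracy(sk_classifier, train_set, dev_ratio=0.3):
    # Train on the first 70% of the (already shuffled) examples
    # and evaluate on the remaining 30%.
    cut = int(len(train_set) * (1 - dev_ratio))
    clf = SklearnClassifier(sk_classifier)
    clf.train(train_set[:cut])
    return nltk_accuracy(clf, train_set[cut:])

# print('LogisticRegression dev accuracy:', dev_accuracy(LogisticRegression(), train))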

Task 2, Entity.py: the named-entity identification code is implemented as follows.

import pandas as pd


class PickKeyEntity(object):
    # Initialize with the task-1 result file and the train/test data paths
    def __init__(self, neg_filepath, train_filepath, test_filepath):
        self.train_filepath = train_filepath
        self.test_filepath = test_filepath
        self.neg_filepath = neg_filepath
        self.ratio = 0.1  # entities whose in-key_entity proportion is below this threshold are not key entities
        self.nike = self._generate_nike_data()

    def run(self):
        test_data = pd.read_csv(self.test_filepath)
        test_data = test_data.loc[:, ["id", "entity"]]
        neg_data = pd.read_csv(self.neg_filepath)
        test_data = pd.merge(neg_data, test_data, on="id", how="inner")
        test_data["key_entity"] = test_data.apply(self.func_on_row, axis=1)
        test_data.to_csv("MY_DATAA/result.csv", index=False,
                         columns=["id", "negative", "key_entity"])

    # Add your own ideas here to filter entities further if you want to improve the results.
    def func_on_row(self, row):
        key_entitys = []
        if row["negative"] == 1:
            if not isinstance(row["entity"], float):  # empty (NaN) entity cells are floats
                entitys = row["entity"].split(";")
                for entity in entitys:
                    if entity not in self.nike:
                        key_entitys.append(entity)
            key_entitys = self._remove_substring(key_entitys)
        return ";".join(key_entitys)

    # A 'NIKE' entity is one that is "not in key_entity" but appears in entity in the training data
    def _generate_nike_data(self):
        nike = []
        numsOfEntitiyAsKey = {}
        train_data = pd.read_csv(self.train_filepath)
        train_data = train_data.loc[:, ["negative", "entity", "key_entity"]]
        train_data = train_data[train_data.negative == 1]
        for index, row in train_data.iterrows():
            entitys = row["entity"]
            key_entitys = row["key_entity"]
            entitys = entitys.split(";")
            key_entitys = key_entitys.split(";")
            for entity in entitys:
                if entity not in numsOfEntitiyAsKey:
                    if entity in key_entitys:
                        numsOfEntitiyAsKey[entity] = {"in": 1, "out": 0}
                    else:
                        numsOfEntitiyAsKey[entity] = {"in": 0, "out": 1}
                else:
                    if entity in key_entitys:
                        numsOfEntitiyAsKey[entity]["in"] += 1
                    else:
                        numsOfEntitiyAsKey[entity]["out"] += 1
        for entity, nums in numsOfEntitiyAsKey.items():
            num_in = nums["in"]
            num_out = nums["out"]
            freq_in = num_in / (num_in + num_out)  # share of occurrences where the entity was a key entity
            if freq_in < self.ratio:  # below the threshold: treat as a NIKE entity
                nike.append(entity)
        return nike

    # Remove entities that are substrings of other entities.
    # e.g. given 资易贷, 小资易贷, 资易贷有限公司 we keep 小资易贷 and 资易贷有限公司
    def _remove_substring(self, entities):
        entities = list(set(entities))
        longest_entities = []
        for entity in entities:
            contained = False
            for entity_ in entities:
                if entity == entity_:
                    continue
                if entity in entity_:  # entity is a substring of another entity
                    contained = True
                    break
            if not contained:
                longest_entities.append(entity)
        return longest_entities


if __name__ == "__main__":
    pickKeyEntity = PickKeyEntity("MY_DATAA/neg.csv", "MY_DATAA/Train_Data.csv", "MY_DATAA/Test_Data.csv")
    pickKeyEntity.run()
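As a usage note: Entity.py depends on the (id, negative) file that Sentiment.py writes, so run Sentiment.py first to produce MY_DATAA/neg.csv, then run Entity.py to merge it with the test entities and write the final (id, negative, key_entity) rows to MY_DATAA/result.csv.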

I hope this is helpful. I am just getting started and am a beginner myself, so I will keep exploring this complicated field.
