信息增益提取文本分类特征词

信息增益提取文本分类特征词

# -*- coding: utf-8 -*-
# @Time    : 2018/11/5 10:26
# @Author  : lidra
# @File    : IG.py
# @Software: PyCharm
import cPickle as pickle
import numpy as np
import pandas as pd
import nltk
path = ''



def _load_recent_titles(csv_name, min_year=2015):
    """Return lower-cased titles from ``csv_name`` dated ``min_year`` or later."""
    # The context manager closes the CSV handle once pandas has parsed it.
    with open(csv_name, 'r') as handle:
        frame = pd.read_csv(handle)
    frame = frame[frame.year >= min_year]
    # Skip missing titles: a NaN cell is a float and has no .lower().
    return [title.lower() for title in frame['title'].dropna()]


def getvec():
    """Build the vocabulary and labelled term-count vectors for the two corpora.

    Reads ``dblp_Aconfs_acl.csv`` and ``dblp_Aconfs_cvpr.csv`` from the
    working directory, keeps the rows whose ``year`` is 2015 or later,
    lower-cases each title and tokenizes it with ``nltk.word_tokenize``.

    Side effects:
        * prints the number of CVPR titles followed by the number of ACL
          titles;
        * pickles the vocabulary (a sorted list of distinct tokens) to
          ``path + 'words.pkl'``;
        * pickles the vector matrix to ``path + 'tfidf-rolevector.pkl'``.
          Despite the file name, the values are raw term counts.

    Returns:
        numpy.ndarray of shape ``(n_titles, n_words + 1)``. Column ``j``
        counts occurrences of vocabulary word ``j`` in the title; the last
        column is the class label (0 for ACL rows, 1 for CVPR rows). ACL rows
        come first, then CVPR rows.
    """
    titles_a = _load_recent_titles('dblp_Aconfs_acl.csv')
    titles_c = _load_recent_titles('dblp_Aconfs_cvpr.csv')
    # A single formatted argument prints identically on Python 2 and 3.
    print('%d %d' % (len(titles_c), len(titles_a)))

    # Tokenize each title once and reuse the tokens for both the vocabulary
    # and the count vectors.
    tokens_a = [nltk.word_tokenize(title) for title in titles_a]
    tokens_c = [nltk.word_tokenize(title) for title in titles_c]

    # Sorting makes the column order reproducible from run to run.
    words = sorted({w for tokens in tokens_a + tokens_c for w in tokens})
    # Written under `path` and in binary mode so that it matches how
    # IG_word() reads it back.
    with open(path + 'words.pkl', 'wb') as handle:
        pickle.dump(words, handle)

    # Dict lookup is O(1); scanning the `words` list per token was O(n).
    wordid = {w: idx for idx, w in enumerate(words)}

    n_docs = len(tokens_a) + len(tokens_c)
    rolevectores = np.zeros((n_docs, len(words) + 1))
    row = 0
    for token_lists, label in ((tokens_a, 0), (tokens_c, 1)):
        for tokens in token_lists:
            for w in tokens:
                rolevectores[row, wordid[w]] += 1
            rolevectores[row, -1] = label
            row += 1

    with open(path + 'tfidf-rolevector.pkl', 'wb') as handle:
        pickle.dump(rolevectores, handle)
    return rolevectores


def calc_ent(x):
    """Return the Shannon entropy (in bits) of the values in ``x``.

    Args:
        x: one-dimensional array-like of discrete values (a numpy array or
            any sequence ``numpy.asarray`` accepts).

    Returns:
        float: ``-sum(p * log2(p))`` over the distinct values of ``x``, where
        ``p`` is the relative frequency of each value. Empty input yields
        ``0.0``.
    """
    values = np.asarray(x).ravel()
    total = values.size
    if total == 0:
        return 0.0
    # One pass yields every distinct value together with its frequency,
    # instead of rescanning the array once per distinct value.
    _, counts = np.unique(values, return_counts=True)
    probs = counts / float(total)
    # Subtracting from 0.0 (rather than negating) keeps a zero result as
    # +0.0 instead of -0.0.
    return float(0.0 - np.sum(probs * np.log2(probs)))


def calc_condition_ent(x, y):
    """Return the conditional entropy H(y|x) in bits.

    Args:
        x: one-dimensional array-like holding the conditioning variable.
        y: one-dimensional array-like of the same length as ``x`` holding the
            variable whose entropy is measured.

    Returns:
        The sum, over each distinct value ``v`` of ``x``, of
        ``P(x == v) * H(y | x == v)``. Empty input yields ``0.0``.
    """
    x = np.asarray(x).ravel()
    y = np.asarray(y).ravel()
    total = y.shape[0]
    ent = 0.0
    # np.unique replaces the element-by-element Python loop that collected
    # the distinct values of x.
    for x_value in np.unique(x):
        sub_y = y[x == x_value]
        # Weight each group's entropy by the share of samples in that group.
        ent += (float(sub_y.shape[0]) / total) * calc_ent(sub_y)
    return ent


def IG_word(top_n=300):
    """Rank vocabulary words by information gain and save the best ones.

    Loads the vocabulary from ``path + 'words.pkl'`` and the labelled vector
    matrix from ``path + 'tfidf-rolevector.pkl'``. For every word column it
    computes ``IG = H(label) - H(label | column)``, prints the ``top_n`` words
    with the highest gain, and pickles them as a list of ``(word, gain)``
    tuples to ``path + 'igwords.pkl'``.

    Args:
        top_n: number of highest-gain words to keep (default 300). ``None``
            keeps every word.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # files this script wrote itself.
    with open(path + 'words.pkl', 'rb') as handle:
        words = pickle.load(handle)
    with open(path + 'tfidf-rolevector.pkl', 'rb') as handle:
        role = pickle.load(handle)

    # After transposing, each row is one feature column and the last row
    # holds the class labels.
    role = role.T
    label = role[-1]
    enty = calc_ent(label)

    gains = [
        (idx, enty - calc_condition_ent(role[idx], label))
        for idx in range(len(role) - 1)
    ]
    # Highest gain first: the most discriminative words are the ones worth
    # keeping. The previous ascending sort kept the least informative ones.
    gains.sort(key=lambda item: item[1], reverse=True)

    IGW = []
    for idx, gain in gains[:top_n]:
        # Row idx of the matrix corresponds to words[idx].
        print(words[idx])
        IGW.append((words[idx], gain))
    with open(path + 'igwords.pkl', 'wb') as handle:
        pickle.dump(IGW, handle)


# Run the pipeline: build and pickle the vocabulary and the labelled count
# vectors, then compute the information gain of every word and pickle the
# selected words.
# NOTE(review): these calls sit at module top level, so they also execute
# whenever this file is imported, not only when it is run as a script.
getvec()
IG_word()

getvec():根据不同类别的语料库构建特征词典,并构建文本向量:[文本 one-hot 向量(实际为词频计数)] concat [文本类别标签]
IG_word():计算每个特征词的信息增益,并排序。

你可能感兴趣的:(机器学习)