An attempt at rumor detection with TensorFlow

The data comes from Tsinghua University's Chinese rumor dataset.

I only took 1,000 rumors, segmented them with jieba, computed TF-IDF features, attached labels, and used that as the training set. The results are not very good and there is a lot that could be optimized. I am a beginner and still learning.
- This is the data-processing script. The code is messy because it was modified and debugged over and over, so it still has a lot of leftover comments and parts I am not happy with.
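
Stripped of the bookkeeping, the preprocessing idea is roughly the following sketch. It omits the stopword and low-frequency filtering that the real script below performs, and the label handling here is only illustrative; only the file path and the "rumorText" field come from the actual script.

import json

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

texts, labels = [], []
# rumors: one JSON object per line with a "rumorText" field
with open("../data/rumors_v170613.json", encoding="utf-8") as f:
    for line in f:
        texts.append(" ".join(jieba.cut(json.loads(line)["rumorText"])))
        labels.append(1)          # 1 = rumor
        if len(texts) >= 1000:
            break
# ... read about 1000 news lines the same way and append them with label 0 ...
tfidf = TfidfVectorizer(token_pattern=r"\b\w+\b").fit_transform(texts)
print(tfidf.shape)                # (number of documents, vocabulary size)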

# -*- coding: utf-8 -*-
import jieba
import numpy
import json
import copy
import os
import re
import time
import logging
import csv
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import PCA,KernelPCA

######### Configure logging for convenient output #############

LOG_FORMAT = "%(asctime)s -%(filename)s[line:%(lineno)d]- %(name)s - %(levelname)s - %(message)s"
DATE_FORMAT = "%m-%d-%Y %H:%M:%S"

logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT, datefmt=DATE_FORMAT)

logger = logging.getLogger(__name__)

#------------------ Data processing -----------------------#


# stopword_list = []
# rumor_corpus = []
# unrumor_corpus = []
# training_data = []
# validation_data = []
# test_data = []
# bag_of_word_count = {}


def readrumorfile(filename, bag_of_word_count, stopword_list, rumor_corpus):
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            text_json = json.loads(line)
            # strip Latin letters, digits and some punctuation from the rumor text
            s = jieba.cut(re.sub(r"[A-Za-z0-9!%\[\],。]", "", text_json["rumorText"]))
            # drop stopwords
            cp_line = [word for word in s if word not in stopword_list]
            # update the word counts in the bag of words
            for word in cp_line:
                if word not in bag_of_word_count:
                    bag_of_word_count[word] = 1
                else:
                    bag_of_word_count[word] += 1
            rumor_corpus.append(",".join(cp_line))
            if len(rumor_corpus) >= 1000:
                break


def readnewsfile(filename, bag_of_word_count, stopword_list, unrumor_corpus):
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            # skip very short lines, they carry no useful content, e.g.
            # len("①柔道冠军称父亲深夜被带走真相是这样http://t.cn/EJuoyyO") = 38
            if len(line) <= 43:
                continue
            s = jieba.cut(re.sub(r"[A-Za-z0-9!%\[\],。]", "", line.strip('\n')))
            # drop stopwords
            cp_line = [word for word in s if word not in stopword_list]
            # update the word counts in the bag of words
            for word in cp_line:
                if word not in bag_of_word_count:
                    bag_of_word_count[word] = 1
                else:
                    bag_of_word_count[word] += 1
            unrumor_corpus.append(",".join(cp_line))
            if len(unrumor_corpus) >= 1000:
                break

# Drop words whose count in the bag of words is below frequ

def removeWord(rumor_corpus,unrumor_corpus,bag_of_word_count,frequ):
    rumor_cor = []
    unrumor_cor = []
    for s_r,s_u in zip(rumor_corpus,unrumor_corpus):
        list_s_r = s_r.split(",")
        list_s_u = s_u.split(",")

        list_r = copy.deepcopy(list_s_r)
        list_u = copy.deepcopy(list_s_u)

        for w in list_s_r:
            if w not in bag_of_word_count:
                logger.info(w)
                continue
            if bag_of_word_count[w] < frequ:
                list_r.remove(w)
        for w in list_s_u:
            if w not in bag_of_word_count:
                logger.info(w)
                continue
            if bag_of_word_count[w] < frequ:
                list_u.remove(w)

        # keep a document only if some words survive the frequency filtering
        if list_r:
            rumor_cor.append(",".join(list_r))
        if list_u:
            unrumor_cor.append(",".join(list_u))

    return rumor_cor,unrumor_cor



def getdata(stopword_list, bag_of_word_count, rumor_corpus, unrumor_corpus):
    # remove stopwords from list_corpus

    with open("../data/stopword.txt", "r", encoding="utf-8") as fp:
        for line in fp:
            stopword_list.append(line[:-1])
        fp.close()

    logger.info("读取停用词,构造stopword_list集合")

    # 谣言
    # 数据处理    list_corpus = [rumorText,rumorText,rumorText,...]
    readrumorfile("../data/rumors_v170613.json", bag_of_word_count, stopword_list, rumor_corpus)

    logger.info("从 rumors_v170613.json 谣言文本中获取 %g条数据" % (len(rumor_corpus)))
    # 非谣言
    readnewsfile("../data/news20190407-214236.txt", bag_of_word_count, stopword_list, unrumor_corpus)
    if len(unrumor_corpus) <= 1000:
        readnewsfile("../data/news20190407-214412.txt", bag_of_word_count, stopword_list, unrumor_corpus)
    # 释放堆内存
    stopword_list.clear()

    logger.info("从 news20190407-214236.txt | news20190407-214412.txt文本中获取到 %g" % (len(unrumor_corpus)))
    logger.info("词袋长度:%s" % (len(bag_of_word_count)))
    corpus = rumor_corpus + unrumor_corpus

    return corpus,bag_of_word_count, rumor_corpus, unrumor_corpus


def Sklearn_getfeature(corpus):
    # vectorize all the short texts in corpus and build the bag of words
    vectorizer = CountVectorizer(min_df=1, max_df=1.0, token_pattern=r'\b\w+\b')

    X = vectorizer.fit_transform(corpus)

    # compute TF-IDF
    tfidf_transformer = TfidfTransformer()

    tfidf = tfidf_transformer.fit_transform(X)

    logger.info("Built the bag of words with sklearn; TF-IDF computed")
    # logger.info(tfidf[0][0])
    # logger.info(type(tfidf.toarray()))

    # building labels for a quick test:
    # label = numpy.zeros((1000, 2))
    # for i in range(0, 500):
    #     label[i][0] = 1
    # for i in range(500, 1000):
    #     label[i][1] = 1
    # label = numpy.asarray(label)
    data_tfidf = tfidf.toarray()

    with open('../data/roumordataset.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(data_tfidf)

    # dimensionality reduction with PCA
    # 841 = 29 * 29, so each sample can later be reshaped into a 29x29 "image"
    pca = PCA(n_components=841)
    data_pca = pca.fit_transform(data_tfidf)
    with open('../data/roumordatasetPCA.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(data_pca)

    # dimensionality reduction with kernel PCA (RBF kernel)
    kpca = KernelPCA(kernel="rbf")
    data_kpca = kpca.fit_transform(data_tfidf)
    with open('../data/roumordatasetKPCA.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(data_kpca)
    return tfidf


def gensim_getfeature(corpus):
    return


# helper used while testing; not really needed
def WriteFile(data, target):
    if os.path.exists(target):
        path, suffix = os.path.splitext(target)
        s = time.strftime("%Y%m%d-%H%M%S", time.localtime())
        target = path + s + suffix
    with open(target, 'w', encoding="utf-8") as fp:
        for item in data:
            fp.write(item)
            fp.write("\n")

# Build the dataset; planned split is training set : validation set = 4:1 (see the split sketch after this script)

if __name__ == '__main__':
    stopword_list = []
    rumor_corpus = []
    unrumor_corpus = []
    training_data = []
    validation_data = []
    test_data = []
    bag_of_word_count = {}
    frequ = 2  # minimum word frequency kept by removeWord

    corpus,bag_of_word_count,rumor_corpus, unrumor_corpus = getdata(stopword_list,bag_of_word_count,rumor_corpus,unrumor_corpus)
    logger.info(len(rumor_corpus))
    logger.info(len(unrumor_corpus))

    rumor_cor,unrumor_cor = removeWord(rumor_corpus,unrumor_corpus,bag_of_word_count,frequ)
    logger.info(len(rumor_cor))
    logger.info(len(unrumor_cor))

    with open("../data/bag_word.json","w",encoding='utf-8') as f:
        json.dump(bag_of_word_count,f,ensure_ascii=False)

    Sklearn_getfeature(rumor_cor+unrumor_cor)
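
The 4:1 train/validation split mentioned in the comment above is never actually performed in this script. One possible way to do it on the saved PCA features is sketched below; loading the CSV with numpy and the 1000/1000 label layout are assumptions based on how the script writes its output, not code from the script itself.

import numpy as np
from sklearn.model_selection import train_test_split

# load the PCA-reduced features written by Sklearn_getfeature above
data_pca = np.loadtxt("../data/roumordatasetPCA.csv", delimiter=",")
labels = np.array([1] * 1000 + [0] * 1000)   # first 1000 rows are rumors, last 1000 are news
X_train, X_val, y_train, y_val = train_test_split(
    data_pca, labels, test_size=0.2, stratify=labels, random_state=42)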

- This is the training script; the results are poor.

"""
X: 841 = 29*29-dimensional text features   Y: label [1., 0.] or [0., 1.]
Layer 1: convolution     input is the 29*29*1 text feature "image"
    filter 3*3, depth 5, no zero padding ('VALID'), stride 1
    output: 29-3+1 = 27, i.e. 27*27, depth 5
    parameters: w = 3*3*1*5, b = 5
Layer 2: pooling         input is a 27*27*5 tensor
    filter 3*3, stride 3
    output 9*9*5
Layer 3: convolution     input is a 9*9*5 tensor
    filter 2*2, depth 12, no zero padding, stride 1
    parameters: w = 2*2*5*12, b = 12
    output: 9-2+1 = 8, i.e. 8*8*12
Layer 4: pooling         input 8*8*12
    filter 2*2, stride 2
    output 4*4*12
Layer 5: fully connected layer   input 4*4*12
    filter 4*4*80, no zero padding, stride 1
    parameters: w = 4*4*12*80, b = 80
    output 1*1*80
Layer 6: fully connected layer
    input 80
    w = 80*56, b = 56
    output 56
Output layer:
    input 56
    w = 56*2, b = 2
    output 2
"""
import tensorflow as tf
import numpy as np
import csv
import logging
######### Configure logging for convenient output #############

LOG_FORMAT = "%(asctime)s -%(filename)s[line:%(lineno)d]- %(name)s - %(levelname)s - %(message)s"
DATE_FORMAT = "%m-%d-%Y %H:%M:%S"

logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT, datefmt=DATE_FORMAT)

logger = logging.getLogger(__name__)


num_input = 841
num_classes = 2
dropout = 0.5

learning_rate = 0.001
batch_size = 100     # note: not actually used below, the full dataset is fed at every step
num_steps = 10000
display_step = 10    # note: not used either, the loop prints every 100 steps

X = tf.placeholder(tf.float32, [None, num_input])
Y = tf.placeholder(tf.float32, [None, num_classes])


# weights and biases
# (tf.random_normal defaults to stddev=1.0, which gives very large initial
#  weights; a smaller stddev such as 0.1 is more common)
weights = {
    "w1": tf.Variable(tf.random_normal([3, 3, 1, 5])),
    "w2": tf.Variable(tf.random_normal([2, 2, 5, 12])),
    "w3": tf.Variable(tf.random_normal([4*4*12, 80])),
    "w4": tf.Variable(tf.random_normal([80, 56])),
    "w5": tf.Variable(tf.random_normal([56, 2]))
}
bias = {
    "b1":tf.Variable(tf.random_normal([5])),
    "b2":tf.Variable(tf.random_normal([12])),
    "b3":tf.Variable(tf.random_normal([80])),
    "b4":tf.Variable(tf.random_normal([56])),
    "b5":tf.Variable(tf.random_normal([2]))
}

def conv2d(x, W, b, strides=1):
    # Conv2D wrapper, with bias and relu activation
    x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='VALID')
    x = tf.nn.bias_add(x, b)
    return tf.nn.relu(x)

def maxpool2d(x, k=2):
    # MaxPool2D wrapper
    return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],padding='VALID')

# define the forward pass of the network
def conv_net(x, weights, biases, dropout):
    # reshape each 841-dimensional feature vector into a 29x29x1 "image"
    x = tf.reshape(x, shape=[-1, 29, 29, 1])

    conv1 = conv2d(x,weights['w1'],biases['b1'])
    conv1 = maxpool2d(conv1,k=3)

    conv2 = conv2d(conv1, weights['w2'], biases['b2'])
    conv2 = maxpool2d(conv2, k=2)

    fc3 = tf.reshape(conv2,[-1,weights['w3'].get_shape().as_list()[0]])
    fc3 = tf.add(tf.matmul(fc3, weights['w3']), biases['b3'])
    fc3 = tf.nn.relu(fc3)
    fc3 = tf.nn.dropout(fc3, dropout)

    fc4 = tf.add(tf.matmul(fc3, weights['w4']), biases['b4'])
    fc4 = tf.nn.relu(fc4)
    fc4 = tf.nn.dropout(fc4, dropout)

    fc5 = tf.add(tf.matmul(fc4, weights['w5']), biases['b5'])
    # fc5 = tf.nn.relu(fc5)

    return fc5

# Construct model
logits = conv_net(X, weights, bias, dropout)

prediction = tf.nn.softmax(logits)
loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

init = tf.global_variables_initializer()

# build labels: [0, 1] means rumor, [1, 0] means non-rumor
# (the first 1000 rows of the feature matrix are rumors, the last 1000 are news)
r = np.zeros((1000, 1), dtype=float)
c = np.ones((1000, 1), dtype=float)
a = np.hstack((r, c))
b = np.hstack((c, r))
label = np.vstack((a, b))

with tf.Session() as sess:
    sess.run(init)

    # load the PCA-reduced TF-IDF features written by the data-processing script
    rows = []
    with open("D:/WorkSpace/pyWorkspace/deepLearning/GradientDescent/data/roumordatasetPCA.csv", "r") as f:
        csv_reader = csv.reader(f)
        for line in csv_reader:
            rows.append(line)

    data_x = np.reshape(np.array([list(map(float, x)) for x in rows]), (-1, 841))
    data_y = np.reshape(label, (-1, 2))

    for step in range(1, num_steps + 1):
        # note: the whole dataset (all 2000 examples) is fed at every step
        sess.run(train_op, feed_dict={X: data_x, Y: data_y})

        if step % 100 == 0 or step == 1:
            loss, acc = sess.run([loss_op, accuracy], feed_dict={X: data_x, Y: data_y})
            print("Step " + str(step) + ", Minibatch Loss= " +
                  "{:.4f}".format(loss) + ", Training Accuracy= " +
                  "{:.3f}".format(acc))

- Finally, the results. I actually wanted to build three datasets (training, validation, and test), but when I first ran on all the data the results were already so poor that I lost the motivation to go further. I will improve it later and work out where things went wrong; the dataset being too small is probably the most fundamental cause.

Step 1, Minibatch Loss= 1520.1038, Training Accuracy= 0.491
Step 100, Minibatch Loss= 262.6792, Training Accuracy= 0.549
Step 200, Minibatch Loss= 23.2395, Training Accuracy= 0.686
Step 300, Minibatch Loss= 2.4426, Training Accuracy= 0.615
Step 400, Minibatch Loss= 2.0057, Training Accuracy= 0.592
Step 500, Minibatch Loss= 1.4990, Training Accuracy= 0.586
Step 600, Minibatch Loss= 1.2961, Training Accuracy= 0.589
Step 700, Minibatch Loss= 1.1176, Training Accuracy= 0.576
Step 800, Minibatch Loss= 1.0612, Training Accuracy= 0.584
Step 900, Minibatch Loss= 0.8949, Training Accuracy= 0.595
Step 1000, Minibatch Loss= 0.8426, Training Accuracy= 0.595
Step 1100, Minibatch Loss= 0.8437, Training Accuracy= 0.592
Step 1200, Minibatch Loss= 0.7606, Training Accuracy= 0.576
Step 1300, Minibatch Loss= 0.7009, Training Accuracy= 0.609
Step 1400, Minibatch Loss= 0.6559, Training Accuracy= 0.609
Step 1500, Minibatch Loss= 0.6449, Training Accuracy= 0.598
Step 1600, Minibatch Loss= 0.6219, Training Accuracy= 0.603
Step 1700, Minibatch Loss= 0.5932, Training Accuracy= 0.608
Step 1800, Minibatch Loss= 0.5810, Training Accuracy= 0.623
Step 1900, Minibatch Loss= 0.5983, Training Accuracy= 0.608
Step 2000, Minibatch Loss= 0.5709, Training Accuracy= 0.607
Step 2100, Minibatch Loss= 0.5430, Training Accuracy= 0.626
Step 2200, Minibatch Loss= 0.5401, Training Accuracy= 0.642
Step 2300, Minibatch Loss= 0.5308, Training Accuracy= 0.630
Step 2400, Minibatch Loss= 0.5409, Training Accuracy= 0.627
Step 2500, Minibatch Loss= 0.5284, Training Accuracy= 0.638
Step 2600, Minibatch Loss= 0.5743, Training Accuracy= 0.627
Step 2700, Minibatch Loss= 0.5084, Training Accuracy= 0.649
Step 2800, Minibatch Loss= 0.5221, Training Accuracy= 0.643
Step 2900, Minibatch Loss= 0.5110, Training Accuracy= 0.651
Step 3000, Minibatch Loss= 0.5214, Training Accuracy= 0.663
Step 3100, Minibatch Loss= 0.4978, Training Accuracy= 0.663
Step 3200, Minibatch Loss= 0.5084, Training Accuracy= 0.647
Step 3300, Minibatch Loss= 0.4945, Training Accuracy= 0.677
Step 3400, Minibatch Loss= 0.4991, Training Accuracy= 0.660
Step 3500, Minibatch Loss= 0.4948, Training Accuracy= 0.667
Step 3600, Minibatch Loss= 0.4915, Training Accuracy= 0.660
Step 3700, Minibatch Loss= 0.4986, Training Accuracy= 0.670
Step 3800, Minibatch Loss= 0.4813, Training Accuracy= 0.674
Step 3900, Minibatch Loss= 0.5162, Training Accuracy= 0.682
Step 4000, Minibatch Loss= 0.5086, Training Accuracy= 0.680
Step 4100, Minibatch Loss= 0.4827, Training Accuracy= 0.677
Step 4200, Minibatch Loss= 0.4798, Training Accuracy= 0.686
Step 4300, Minibatch Loss= 0.4738, Training Accuracy= 0.682
Step 4400, Minibatch Loss= 0.4889, Training Accuracy= 0.679
Step 4500, Minibatch Loss= 0.4631, Training Accuracy= 0.690
Step 4600, Minibatch Loss= 0.4766, Training Accuracy= 0.681
Step 4700, Minibatch Loss= 0.4778, Training Accuracy= 0.686
Step 4800, Minibatch Loss= 0.4525, Training Accuracy= 0.704
Step 4900, Minibatch Loss= 0.4552, Training Accuracy= 0.692
Step 5000, Minibatch Loss= 0.4411, Training Accuracy= 0.701
Step 5100, Minibatch Loss= 0.4653, Training Accuracy= 0.694
Step 5200, Minibatch Loss= 0.4400, Training Accuracy= 0.709
Step 5300, Minibatch Loss= 0.4426, Training Accuracy= 0.698
Step 5400, Minibatch Loss= 0.4385, Training Accuracy= 0.705
Step 5500, Minibatch Loss= 0.4365, Training Accuracy= 0.705
Step 5600, Minibatch Loss= 0.4332, Training Accuracy= 0.711
Step 5700, Minibatch Loss= 0.4404, Training Accuracy= 0.708
Step 5800, Minibatch Loss= 0.4188, Training Accuracy= 0.715
Step 5900, Minibatch Loss= 0.4118, Training Accuracy= 0.722
Step 6000, Minibatch Loss= 0.4032, Training Accuracy= 0.713
Step 6100, Minibatch Loss= 0.4179, Training Accuracy= 0.711
Step 6200, Minibatch Loss= 0.4081, Training Accuracy= 0.714
Step 6300, Minibatch Loss= 0.4038, Training Accuracy= 0.713
Step 6400, Minibatch Loss= 0.4081, Training Accuracy= 0.719
Step 6500, Minibatch Loss= 0.3908, Training Accuracy= 0.742
Step 6600, Minibatch Loss= 0.3901, Training Accuracy= 0.735
Step 6700, Minibatch Loss= 0.3915, Training Accuracy= 0.729
Step 6800, Minibatch Loss= 0.3782, Training Accuracy= 0.721
Step 6900, Minibatch Loss= 0.3917, Training Accuracy= 0.712
Step 7000, Minibatch Loss= 0.3819, Training Accuracy= 0.734
Step 7100, Minibatch Loss= 0.3765, Training Accuracy= 0.738
Step 7200, Minibatch Loss= 0.3544, Training Accuracy= 0.749
Step 7300, Minibatch Loss= 0.3634, Training Accuracy= 0.748
Step 7400, Minibatch Loss= 0.3551, Training Accuracy= 0.758
Step 7500, Minibatch Loss= 0.3613, Training Accuracy= 0.746
Step 7600, Minibatch Loss= 0.3574, Training Accuracy= 0.753
Step 7700, Minibatch Loss= 0.3532, Training Accuracy= 0.758
Step 7800, Minibatch Loss= 0.3456, Training Accuracy= 0.762
Step 7900, Minibatch Loss= 0.3695, Training Accuracy= 0.747
Step 8000, Minibatch Loss= 0.3646, Training Accuracy= 0.768
Step 8100, Minibatch Loss= 0.3573, Training Accuracy= 0.756
Step 8200, Minibatch Loss= 0.3461, Training Accuracy= 0.760
Step 8300, Minibatch Loss= 0.3557, Training Accuracy= 0.759
Step 8400, Minibatch Loss= 0.3514, Training Accuracy= 0.756
Step 8500, Minibatch Loss= 0.3472, Training Accuracy= 0.768
Step 8600, Minibatch Loss= 0.3538, Training Accuracy= 0.757
Step 8700, Minibatch Loss= 0.3424, Training Accuracy= 0.763
Step 8800, Minibatch Loss= 0.3516, Training Accuracy= 0.754
Step 8900, Minibatch Loss= 0.3555, Training Accuracy= 0.762
Step 9000, Minibatch Loss= 0.3448, Training Accuracy= 0.767
Step 9100, Minibatch Loss= 0.3467, Training Accuracy= 0.761
Step 9200, Minibatch Loss= 0.3319, Training Accuracy= 0.777
Step 9300, Minibatch Loss= 0.3444, Training Accuracy= 0.765
Step 9400, Minibatch Loss= 0.3430, Training Accuracy= 0.762
Step 9500, Minibatch Loss= 0.3375, Training Accuracy= 0.766
Step 9600, Minibatch Loss= 0.3355, Training Accuracy= 0.768
Step 9700, Minibatch Loss= 0.3285, Training Accuracy= 0.780
Step 9800, Minibatch Loss= 0.3374, Training Accuracy= 0.772
Step 9900, Minibatch Loss= 0.3304, Training Accuracy= 0.781
Step 10000, Minibatch Loss= 0.3401, Training Accuracy= 0.768
