I only took one thousand rumor texts, segmented them, computed TF-IDF features, attached labels, and used that as the training set, but the results are not very good; there is a lot here that still needs optimizing. I am a beginner and still learning.
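To make the pipeline concrete, here is a minimal, self-contained sketch of the segmentation + TF-IDF step (the two toy sentences are made up purely for illustration; the full program below reads the real rumor/news files instead):
# Minimal sketch of jieba segmentation -> bag of words -> TF-IDF.
# The toy sentences are hypothetical; only the technique matches the code below.
import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

toy_texts = ["今天天气很好", "网传某地发生地震,纯属谣言"]
toy_corpus = [",".join(jieba.cut(t)) for t in toy_texts]   # same comma-joined format as the real corpus

vectorizer = CountVectorizer(token_pattern='\\b\\w+\\b')   # build vocabulary / count matrix
counts = vectorizer.fit_transform(toy_corpus)
tfidf = TfidfTransformer().fit_transform(counts)           # weight counts by TF-IDF
print(tfidf.toarray())                                     # one row of weights per sentence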
-This is the data-processing program. Because the code was written messily, with constant modification and debugging, it still contains a lot of leftover comments and parts I am not satisfied with.
# -*- coding: utf-8 -*-
import jieba
import numpy
import json
import copy
import os
import re
import time
import logging
import csv
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import PCA,KernelPCA
######### Configure logging for easier debugging output #############
LOG_FORMAT = "%(asctime)s -%(filename)s[line:%(lineno)d]- %(name)s - %(levelname)s - %(message)s"
DATE_FORMAT = "%m-%d-%Y %H:%M:%S"
logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT, datefmt=DATE_FORMAT)
logger = logging.getLogger(__name__)
#------------------ data processing -----------------------#
def readrumorfile(filename, bag_of_word_count, stopword_list, rumor_corpus):
    # Read rumor texts (JSON lines), segment them, drop stopwords and count word frequencies
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            text_json = json.loads(line)
            # Strip letters, digits and some punctuation before segmentation
            s = jieba.cut(re.sub("[A-Za-z0-9\!\%\[\]\,\。]", "", text_json["rumorText"]))
            line_list = list(s)
            cp_line = copy.deepcopy(line_list)
            for word in line_list:
                if word in stopword_list:
                    cp_line.remove(word)  # remove stopwords
            for word in cp_line:
                if word not in bag_of_word_count:
                    bag_of_word_count[word] = 1
                else:
                    bag_of_word_count[word] += 1
            rumor_corpus.append(",".join(cp_line))
            if len(rumor_corpus) >= 1000:
                break
def readnewsfile(filename, bag_of_word_count, stopword_list, unrumor_corpus):
    # Read normal news texts, segment them, drop stopwords and count word frequencies
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            # Skip very short, meaningless lines,
            # e.g. len("①柔道冠军称父亲深夜被带走真相是这样http://t.cn/EJuoyyO") == 38
            if len(line) <= 43:
                continue
            s = jieba.cut(re.sub("[A-Za-z0-9\!\%\[\]\,\。]", "", line.strip('\n')))
            line_list = list(s)
            cp_line = copy.deepcopy(line_list)
            for word in line_list:
                if word in stopword_list:
                    cp_line.remove(word)  # remove stopwords
            for word in cp_line:
                if word not in bag_of_word_count:
                    bag_of_word_count[word] = 1
                else:
                    bag_of_word_count[word] += 1
            unrumor_corpus.append(",".join(cp_line))
            if len(unrumor_corpus) >= 1000:
                break
# Drop words whose bag-of-words frequency is below frequ
def removeWord(rumor_corpus,unrumor_corpus,bag_of_word_count,frequ):
rumor_cor = []
unrumor_cor = []
for s_r,s_u in zip(rumor_corpus,unrumor_corpus):
list_s_r = s_r.split(",")
list_s_u = s_u.split(",")
list_r = copy.deepcopy(list_s_r)
list_u = copy.deepcopy(list_s_u)
for w in list_s_r:
if w not in bag_of_word_count:
logger.info(w)
continue
if bag_of_word_count[w] < frequ:
list_r.remove(w)
for w in list_s_u:
if w not in bag_of_word_count:
logger.info(w)
continue
if bag_of_word_count[w] < frequ:
list_u.remove(w)
if list_s_r:
rumor_cor.append(",".join(list_r))
if list_s_u:
unrumor_cor.append(",".join(list_u))
return rumor_cor,unrumor_cor
def getdata(stopword_list, bag_of_word_count, rumor_corpus, unrumor_corpus):
    # Build the stopword list used to filter the corpora
    with open("../data/stopword.txt", "r", encoding="utf-8") as fp:
        for line in fp:
            stopword_list.append(line.strip('\n'))
    logger.info("Read stopwords and built the stopword_list")
    # Rumors: rumor_corpus = [rumorText, rumorText, rumorText, ...]
    readrumorfile("../data/rumors_v170613.json", bag_of_word_count, stopword_list, rumor_corpus)
    logger.info("Got %g rumor texts from rumors_v170613.json" % (len(rumor_corpus)))
    # Non-rumors (news); read the second file only if the first one was not enough
    readnewsfile("../data/news20190407-214236.txt", bag_of_word_count, stopword_list, unrumor_corpus)
    if len(unrumor_corpus) < 1000:
        readnewsfile("../data/news20190407-214412.txt", bag_of_word_count, stopword_list, unrumor_corpus)
    # The stopword list is no longer needed
    stopword_list.clear()
    logger.info("Got %g non-rumor texts from news20190407-214236.txt | news20190407-214412.txt" % (len(unrumor_corpus)))
    logger.info("Bag-of-words size: %s" % (len(bag_of_word_count)))
    corpus = rumor_corpus + unrumor_corpus
    return corpus, bag_of_word_count, rumor_corpus, unrumor_corpus
def Sklearn_getfeature(corpus):
    # Vectorize all short texts in the corpus and build the vocabulary
    vectorizer = CountVectorizer(min_df=1, max_df=1.0, token_pattern='\\b\\w+\\b')
    X = vectorizer.fit_transform(corpus)
    # Compute TF-IDF
    tfidf_transformer = TfidfTransformer()
    tfidf = tfidf_transformer.fit_transform(X)
    logger.info("Built the bag of words with sklearn, TF-IDF computed")
    data_tfidf = tfidf.toarray()
    with open('../data/roumordataset.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(data_tfidf)
    # Reduce dimensionality with PCA (841 = 29*29, so each sample can later be reshaped to a 29x29 "image")
    pca = PCA(n_components=841)
    data_pca = pca.fit_transform(data_tfidf)
    with open('../data/roumordatasetPCA.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(data_pca)
    # Reduce dimensionality with kernel PCA (RBF kernel)
    kpca = KernelPCA(kernel="rbf")
    data_kpca = kpca.fit_transform(data_tfidf)
    with open('../data/roumordatasetKPCA.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(data_kpca)
    return tfidf
def gensim_getfeature(corpus):
return
# Helper used only while testing -- not really needed
def WriteFile(data, target):
    if os.path.exists(target):
        path, suffix = os.path.splitext(target)
        s = time.strftime("%Y%m%d-%H%M%S", time.localtime())
        target = path + s + suffix
    with open(target, 'w', encoding="utf-8") as fp:
        for text in data:
            fp.write(text)
            fp.write("\n")
# Build the dataset, training : validation = 4:1
if __name__ == '__main__':
stopword_list = []
rumor_corpus = []
unrumor_corpus = []
training_data = []
validation_data = []
test_data = []
bag_of_word_count = {}
frequ = 2
corpus,bag_of_word_count,rumor_corpus, unrumor_corpus = getdata(stopword_list,bag_of_word_count,rumor_corpus,unrumor_corpus)
logger.info(len(rumor_corpus))
logger.info(len(unrumor_corpus))
rumor_cor,unrumor_cor = removeWord(rumor_corpus,unrumor_corpus,bag_of_word_count,frequ)
logger.info(len(rumor_cor))
logger.info(len(unrumor_cor))
with open("../data/bag_word.json","w",encoding='utf-8') as f:
json.dump(bag_of_word_count,f,ensure_ascii=False)
Sklearn_getfeature(rumor_cor+unrumor_cor)
-This is the training program; the results are very poor.
"""
X 841=29*29维特征的文本 Y label[1.,0.] [0.,1.]
第一层:卷积层 输入的是29*29*1的文本特征
过滤器尺寸 3*3 深度为5 不使用全零填充 步长为1
输出为29-3+1=27*27 深度为5
参数w = 3*3*1*5 b = 5
第二层:池化层 输入27*27*5的矩阵
过滤器大小 3*3 步长为 3
输出9*9*5
第三层:卷积层 输入9*9*5的矩阵
过滤器尺寸 2*2 深度为12 不使用全零填充 步长为1
参数w = 2*2*5*12 b = 12
输出9-2+1=8*8*12
第四层:池化层 输入8*8*12
过滤器大小 2*2 步长 2
输出4*4*12
第五层:全连接层 输入4*4*12
过滤器尺寸 4*4*80 不使用全零填充 步长为1
参数w = 4*4*12*80 b = 80
输出1*1*80
第六层:全连接层
输入80
w = 80*56 b = 56
输出56
输出层:
输入56
w = 56*2 b=2
输出2
"""
import tensorflow as tf
import numpy as np
import csv
import logging
######### Configure logging for easier debugging output #############
LOG_FORMAT = "%(asctime)s -%(filename)s[line:%(lineno)d]- %(name)s - %(levelname)s - %(message)s"
DATE_FORMAT = "%m-%d-%Y %H:%M:%S"
logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT, datefmt=DATE_FORMAT)
logger = logging.getLogger(__name__)
num_input = 841
num_classes = 2
dropout = 0.5
learning_rate = 0.001
batch_size = 100
num_steps = 10000
display_step = 10
X = tf.placeholder(tf.float32, [None, num_input])
Y = tf.placeholder(tf.float32, [None, num_classes])
X_batch = tf.Variable(tf.constant(0.0), dtype=tf.float32)
Y_batch = tf.Variable(tf.constant(0.0), dtype=tf.float32)
# Weights and biases
weights = {
    "w1": tf.Variable(tf.random_normal([3, 3, 1, 5])),
    "w2": tf.Variable(tf.random_normal([2, 2, 5, 12])),
    "w3": tf.Variable(tf.random_normal([4*4*12, 80])),
    "w4": tf.Variable(tf.random_normal([80, 56])),
    "w5": tf.Variable(tf.random_normal([56, 2]))
}
bias = {
"b1":tf.Variable(tf.random_normal([5])),
"b2":tf.Variable(tf.random_normal([12])),
"b3":tf.Variable(tf.random_normal([80])),
"b4":tf.Variable(tf.random_normal([56])),
"b5":tf.Variable(tf.random_normal([2]))
}
def conv2d(x, W, b, strides=1):
# Conv2D wrapper, with bias and relu activation
x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='VALID')
x = tf.nn.bias_add(x, b)
return tf.nn.relu(x)
def maxpool2d(x, k=2):
# MaxPool2D wrapper
return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],padding='VALID')
# Define the network
def conv_net(x, weights, biases, dropout):
x = tf.reshape(x, shape=[-1, 29, 29, 1])
conv1 = conv2d(x,weights['w1'],biases['b1'])
conv1 = maxpool2d(conv1,k=3)
conv2 = conv2d(conv1, weights['w2'], biases['b2'])
conv2 = maxpool2d(conv2, k=2)
fc3 = tf.reshape(conv2,[-1,weights['w3'].get_shape().as_list()[0]])
fc3 = tf.add(tf.matmul(fc3, weights['w3']), biases['b3'])
fc3 = tf.nn.relu(fc3)
fc3 = tf.nn.dropout(fc3, dropout)
fc4 = tf.add(tf.matmul(fc3, weights['w4']), biases['b4'])
fc4 = tf.nn.relu(fc4)
fc4 = tf.nn.dropout(fc4, dropout)
fc5 = tf.add(tf.matmul(fc4, weights['w5']), biases['b5'])
# fc5 = tf.nn.relu(fc5)
return fc5
# Construct model
logits = conv_net(X, weights, bias, dropout)
prediction = tf.nn.softmax(logits)
loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
init = tf.global_variables_initializer()
# Initialize labels: [0, 1] means rumor, [1, 0] means non-rumor
r = np.zeros((1000, 1), dtype=float)
c = np.ones((1000, 1), dtype=float)
a = np.hstack((r, c))       # [0, 1] -> rumor
b = np.hstack((c, r))       # [1, 0] -> non-rumor
label = np.vstack((a, b))   # first 1000 rows are rumors, last 1000 are news
with tf.Session() as sess:
    logger.info("-----------")
    s = []
    sess.run(init)
    # Load the PCA-reduced TF-IDF features written out by the preprocessing script
    with open("D:/WorkSpace/pyWorkspace/deepLearning/GradientDescent/data/roumordatasetPCA.csv", "r") as f:
        csv_reader = csv.reader(f)
        for row in csv_reader:
            s.append(row)
    data_x = np.reshape(np.array([list(map(float, x)) for x in s]), (-1, 841))
    data_y = np.reshape(label, (-1, 2))
    for step in range(1, num_steps + 1):
        # Train on the full dataset every step (no mini-batching yet)
        sess.run(train_op, feed_dict={X: data_x, Y: data_y})
        if step % 100 == 0 or step == 1:
            loss, acc = sess.run([loss_op, accuracy], feed_dict={X: data_x, Y: data_y})
            print("Step " + str(step) + ", Minibatch Loss= " +
                  "{:.4f}".format(loss) + ", Training Accuracy= " +
                  "{:.3f}".format(acc))
-Finally, here are the results. I actually wanted to build three datasets (training, validation, and test), but when I first ran on all the data the results were already poor, so I lost the motivation to continue. I will improve it later and figure out where the problem is; it may simply be that the dataset is too small, which is probably the root cause.
Step 1, Minibatch Loss= 1520.1038, Training Accuracy= 0.491
Step 100, Minibatch Loss= 262.6792, Training Accuracy= 0.549
Step 200, Minibatch Loss= 23.2395, Training Accuracy= 0.686
Step 300, Minibatch Loss= 2.4426, Training Accuracy= 0.615
Step 400, Minibatch Loss= 2.0057, Training Accuracy= 0.592
Step 500, Minibatch Loss= 1.4990, Training Accuracy= 0.586
Step 600, Minibatch Loss= 1.2961, Training Accuracy= 0.589
Step 700, Minibatch Loss= 1.1176, Training Accuracy= 0.576
Step 800, Minibatch Loss= 1.0612, Training Accuracy= 0.584
Step 900, Minibatch Loss= 0.8949, Training Accuracy= 0.595
Step 1000, Minibatch Loss= 0.8426, Training Accuracy= 0.595
Step 1100, Minibatch Loss= 0.8437, Training Accuracy= 0.592
Step 1200, Minibatch Loss= 0.7606, Training Accuracy= 0.576
Step 1300, Minibatch Loss= 0.7009, Training Accuracy= 0.609
Step 1400, Minibatch Loss= 0.6559, Training Accuracy= 0.609
Step 1500, Minibatch Loss= 0.6449, Training Accuracy= 0.598
Step 1600, Minibatch Loss= 0.6219, Training Accuracy= 0.603
Step 1700, Minibatch Loss= 0.5932, Training Accuracy= 0.608
Step 1800, Minibatch Loss= 0.5810, Training Accuracy= 0.623
Step 1900, Minibatch Loss= 0.5983, Training Accuracy= 0.608
Step 2000, Minibatch Loss= 0.5709, Training Accuracy= 0.607
Step 2100, Minibatch Loss= 0.5430, Training Accuracy= 0.626
Step 2200, Minibatch Loss= 0.5401, Training Accuracy= 0.642
Step 2300, Minibatch Loss= 0.5308, Training Accuracy= 0.630
Step 2400, Minibatch Loss= 0.5409, Training Accuracy= 0.627
Step 2500, Minibatch Loss= 0.5284, Training Accuracy= 0.638
Step 2600, Minibatch Loss= 0.5743, Training Accuracy= 0.627
Step 2700, Minibatch Loss= 0.5084, Training Accuracy= 0.649
Step 2800, Minibatch Loss= 0.5221, Training Accuracy= 0.643
Step 2900, Minibatch Loss= 0.5110, Training Accuracy= 0.651
Step 3000, Minibatch Loss= 0.5214, Training Accuracy= 0.663
Step 3100, Minibatch Loss= 0.4978, Training Accuracy= 0.663
Step 3200, Minibatch Loss= 0.5084, Training Accuracy= 0.647
Step 3300, Minibatch Loss= 0.4945, Training Accuracy= 0.677
Step 3400, Minibatch Loss= 0.4991, Training Accuracy= 0.660
Step 3500, Minibatch Loss= 0.4948, Training Accuracy= 0.667
Step 3600, Minibatch Loss= 0.4915, Training Accuracy= 0.660
Step 3700, Minibatch Loss= 0.4986, Training Accuracy= 0.670
Step 3800, Minibatch Loss= 0.4813, Training Accuracy= 0.674
Step 3900, Minibatch Loss= 0.5162, Training Accuracy= 0.682
Step 4000, Minibatch Loss= 0.5086, Training Accuracy= 0.680
Step 4100, Minibatch Loss= 0.4827, Training Accuracy= 0.677
Step 4200, Minibatch Loss= 0.4798, Training Accuracy= 0.686
Step 4300, Minibatch Loss= 0.4738, Training Accuracy= 0.682
Step 4400, Minibatch Loss= 0.4889, Training Accuracy= 0.679
Step 4500, Minibatch Loss= 0.4631, Training Accuracy= 0.690
Step 4600, Minibatch Loss= 0.4766, Training Accuracy= 0.681
Step 4700, Minibatch Loss= 0.4778, Training Accuracy= 0.686
Step 4800, Minibatch Loss= 0.4525, Training Accuracy= 0.704
Step 4900, Minibatch Loss= 0.4552, Training Accuracy= 0.692
Step 5000, Minibatch Loss= 0.4411, Training Accuracy= 0.701
Step 5100, Minibatch Loss= 0.4653, Training Accuracy= 0.694
Step 5200, Minibatch Loss= 0.4400, Training Accuracy= 0.709
Step 5300, Minibatch Loss= 0.4426, Training Accuracy= 0.698
Step 5400, Minibatch Loss= 0.4385, Training Accuracy= 0.705
Step 5500, Minibatch Loss= 0.4365, Training Accuracy= 0.705
Step 5600, Minibatch Loss= 0.4332, Training Accuracy= 0.711
Step 5700, Minibatch Loss= 0.4404, Training Accuracy= 0.708
Step 5800, Minibatch Loss= 0.4188, Training Accuracy= 0.715
Step 5900, Minibatch Loss= 0.4118, Training Accuracy= 0.722
Step 6000, Minibatch Loss= 0.4032, Training Accuracy= 0.713
Step 6100, Minibatch Loss= 0.4179, Training Accuracy= 0.711
Step 6200, Minibatch Loss= 0.4081, Training Accuracy= 0.714
Step 6300, Minibatch Loss= 0.4038, Training Accuracy= 0.713
Step 6400, Minibatch Loss= 0.4081, Training Accuracy= 0.719
Step 6500, Minibatch Loss= 0.3908, Training Accuracy= 0.742
Step 6600, Minibatch Loss= 0.3901, Training Accuracy= 0.735
Step 6700, Minibatch Loss= 0.3915, Training Accuracy= 0.729
Step 6800, Minibatch Loss= 0.3782, Training Accuracy= 0.721
Step 6900, Minibatch Loss= 0.3917, Training Accuracy= 0.712
Step 7000, Minibatch Loss= 0.3819, Training Accuracy= 0.734
Step 7100, Minibatch Loss= 0.3765, Training Accuracy= 0.738
Step 7200, Minibatch Loss= 0.3544, Training Accuracy= 0.749
Step 7300, Minibatch Loss= 0.3634, Training Accuracy= 0.748
Step 7400, Minibatch Loss= 0.3551, Training Accuracy= 0.758
Step 7500, Minibatch Loss= 0.3613, Training Accuracy= 0.746
Step 7600, Minibatch Loss= 0.3574, Training Accuracy= 0.753
Step 7700, Minibatch Loss= 0.3532, Training Accuracy= 0.758
Step 7800, Minibatch Loss= 0.3456, Training Accuracy= 0.762
Step 7900, Minibatch Loss= 0.3695, Training Accuracy= 0.747
Step 8000, Minibatch Loss= 0.3646, Training Accuracy= 0.768
Step 8100, Minibatch Loss= 0.3573, Training Accuracy= 0.756
Step 8200, Minibatch Loss= 0.3461, Training Accuracy= 0.760
Step 8300, Minibatch Loss= 0.3557, Training Accuracy= 0.759
Step 8400, Minibatch Loss= 0.3514, Training Accuracy= 0.756
Step 8500, Minibatch Loss= 0.3472, Training Accuracy= 0.768
Step 8600, Minibatch Loss= 0.3538, Training Accuracy= 0.757
Step 8700, Minibatch Loss= 0.3424, Training Accuracy= 0.763
Step 8800, Minibatch Loss= 0.3516, Training Accuracy= 0.754
Step 8900, Minibatch Loss= 0.3555, Training Accuracy= 0.762
Step 9000, Minibatch Loss= 0.3448, Training Accuracy= 0.767
Step 9100, Minibatch Loss= 0.3467, Training Accuracy= 0.761
Step 9200, Minibatch Loss= 0.3319, Training Accuracy= 0.777
Step 9300, Minibatch Loss= 0.3444, Training Accuracy= 0.765
Step 9400, Minibatch Loss= 0.3430, Training Accuracy= 0.762
Step 9500, Minibatch Loss= 0.3375, Training Accuracy= 0.766
Step 9600, Minibatch Loss= 0.3355, Training Accuracy= 0.768
Step 9700, Minibatch Loss= 0.3285, Training Accuracy= 0.780
Step 9800, Minibatch Loss= 0.3374, Training Accuracy= 0.772
Step 9900, Minibatch Loss= 0.3304, Training Accuracy= 0.781
Step 10000, Minibatch Loss= 0.3401, Training Accuracy= 0.768
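As mentioned above, the plan is to split the data into training, validation, and test sets. A minimal sketch of such a split with scikit-learn's train_test_split might look like the following (the placeholder arrays, ratios, and variable names are assumptions, not part of the current code):
import numpy as np
from sklearn.model_selection import train_test_split

# Placeholder arrays with the same shapes as data_x / data_y in the training script;
# in practice these would be the PCA features and one-hot labels built above.
data_x = np.random.rand(2000, 841)
data_y = np.vstack([np.tile([0., 1.], (1000, 1)), np.tile([1., 0.], (1000, 1))])

# 80% / 10% / 10% split; a plain 4:1 train/validation split would use a single call
x_train, x_tmp, y_train, y_tmp = train_test_split(data_x, data_y, test_size=0.2,
                                                  random_state=42, shuffle=True)
x_val, x_test, y_val, y_test = train_test_split(x_tmp, y_tmp, test_size=0.5,
                                                random_state=42, shuffle=True)
print(len(x_train), len(x_val), len(x_test))   # 1600 200 200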