代码基于Text-GCN作者论文和基于Bible的实践,保留原作者信息,具体代码请移步原作者仓库
https://github.com/plkmo/Bible_Text_GCN
https://github.com/yao8839836/text_gcn
这是我从无到有学习GCN的一个过程,当时很多python的用法都不懂,所以打上了一部分注释,希望能给和我一样的小白一点帮助
# 下面的代码关键部分打了注释,觉得有帮助的同学不妨给我点个免费的赞让我开心一下
# -*- coding: utf-8 -*-
"""
Created on Thu May 9 10:28:24 2019
@author: WT
"""
import os
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import numpy as np
import networkx as nx
from collections import OrderedDict
from itertools import combinations
import math
from tqdm import tqdm
import logging
import jieba
# Configure log output: timestamp, level and message, at INFO verbosity.
logging.basicConfig(
    format='%(asctime)s [%(levelname)s]: %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=logging.INFO,
)
logger = logging.getLogger(__file__)

# Stop words, one per line, read from the working directory at import time.
# The context manager closes the file handle instead of leaving it open for
# the lifetime of the process.
with open('stop_words.txt') as stop_file:
    stop = [line.strip() for line in stop_file]
def cut_words(text):
    """Segment *text* with jieba and drop every stop word.

    Args:
        text: Value to segment; it is converted with ``str()`` first.

    Returns:
        list: The jieba tokens in their original order, without any token
        that appears in the module-level ``stop`` collection.
    """
    stop_words = set(stop)
    # Build a new list instead of calling ``remove`` on the list being
    # iterated: removing in place shifts the remaining items left, so the
    # token right after each removed one was skipped and stop words could
    # survive in the result.
    return [word for word in jieba.cut(str(text)) if word not in stop_words]
def load_pickle(filename):
    """Load and return the object pickled in ``./data/<filename>``.

    Args:
        filename: File name relative to the ``./data/`` directory.

    Returns:
        The unpickled object.
    """
    path = os.path.join("./data/", filename)
    with open(path, 'rb') as source:
        return pickle.load(source)
def save_as_pickle(filename, data):
    """Pickle *data* into ``./data/<filename>``.

    Args:
        filename: File name relative to the ``./data/`` directory.
        data: Any picklable object.
    """
    target = os.path.join("./data/", filename)
    with open(target, 'wb') as sink:
        pickle.dump(data, sink)
def nCr(n, r):
    """Return the number of ways to choose *r* items out of *n*.

    Args:
        n: Size of the pool.
        r: Number of items chosen.

    Returns:
        int: The exact binomial coefficient; 0 when ``r`` exceeds a
        non-negative ``n``.

    Raises:
        ValueError: Propagated from ``math.factorial`` for negative input.
    """
    if 0 <= n < r:
        # There is no way to pick more items than exist; without this guard
        # factorial(n - r) would be asked for a negative number.
        return 0
    f = math.factorial
    # Floor division keeps the arithmetic in exact integers; true division
    # goes through a float and loses precision for large results.
    return f(n) // (f(r) * f(n - r))
# Remove meaningless words and symbols.
def filter_tokens(tokens, stopwords):
    """Return the tokens that are neither stop words nor listed symbols.

    Args:
        tokens: Iterable of tokens to filter.
        stopwords: Container of tokens to drop (tested with ``in``).

    Returns:
        list: The surviving tokens, in their original order.
    """
    symbols = (".", ",", ";", "&", "'s", ":", "?", "!", "(", ")",
               "'", "'m", "'no", "***", "--", "...", "[", "]")
    return [tok for tok in tokens if not (tok in stopwords or tok in symbols)]
def dummy_fun(doc):
    """Identity function: hand *doc* back untouched.

    It is passed to ``TfidfVectorizer`` as both tokenizer and preprocessor
    in ``generate_text_graph``.
    """
    unchanged = doc
    return unchanged
def word_word_edges(p_ij):
    """Build the weighted word-word edges of the text graph.

    Args:
        p_ij: ``pandas.DataFrame`` of pairwise word scores. Cells are read by
            label as ``p_ij.at[w1, w2]`` using the stringified column labels,
            so the row index is expected to carry the same labels.

    Returns:
        list: One ``(w1, w2, {"weight": score})`` tuple for every unordered
        pair of columns whose score is strictly positive, in
        ``itertools.combinations`` order.
    """
    cols = [str(w) for w in p_ij.columns]
    n_cols = len(cols)
    # Number of unordered pairs, used only for the progress bar. Computed
    # inline so fewer than two columns yields 0 instead of raising.
    total_pairs = n_cols * (n_cols - 1) // 2
    word_word = []
    for w1, w2 in tqdm(combinations(cols, 2), total=total_pairs):
        # Read each cell once; ``.at`` is the scalar label accessor.
        weight = p_ij.at[w1, w2]
        if weight > 0:
            word_word.append((w1, w2, {"weight": weight}))
    return word_word
def _fitted_vocabulary(vectorizer):
    """Return the fitted vocabulary as an array on old and new scikit-learn."""
    getter = getattr(vectorizer, "get_feature_names_out", None)
    if getter is None:
        # Older scikit-learn releases only provide the original accessor.
        getter = vectorizer.get_feature_names
    return np.array(list(getter()))


def _count_cooccurrences(documents, word2index, window):
    """Count per-word and per-pair window hits over all sliding windows."""
    vocab_size = len(word2index)
    word_counts = np.zeros(vocab_size, dtype=np.int64)
    pair_counts = np.zeros((vocab_size, vocab_size), dtype=np.int32)
    no_windows = 0
    for tokens in tqdm(documents, total=len(documents)):
        if not tokens:
            continue
        # A document of length n has n - window + 1 windows. The previous
        # ``range(len(tokens) - window)`` dropped the last one and skipped
        # documents shorter than the window entirely; such a document is now
        # counted as a single window.
        last_start = max(len(tokens) - window, 0)
        for start in range(last_start + 1):
            no_windows += 1
            # Each distinct word counts once per window.
            in_window = {word2index[w] for w in tokens[start:start + window]}
            for i in in_window:
                word_counts[i] += 1
            for i1, i2 in combinations(in_window, 2):
                pair_counts[i1, i2] += 1
                pair_counts[i2, i1] += 1
    return word_counts, pair_counts, no_windows


def _log_pmi(word_counts, pair_counts, no_windows):
    """Return log(p_ij / (p_i * p_j) + 1e-9) as a dense array."""
    # Division by zero yields inf/nan instead of raising, as the pandas
    # arithmetic it replaces did; the warnings are silenced.
    with np.errstate(divide="ignore", invalid="ignore"):
        p_i = word_counts / no_windows
        ratio = pair_counts / no_windows
        ratio /= p_i[np.newaxis, :]  # divide every column by its word's p_i
        ratio /= p_i[:, np.newaxis]  # divide every row by its word's p_i
        ratio += 1E-9                # keep the log argument above zero
        return np.log(ratio)


def generate_text_graph(window=10):
    """Build the document/word graph of the corpus and pickle it.

    Reads ``./data/data3.csv`` (columns ``t``, ``c``, ``b``; per the original
    notes: text, chapter and book), joins the text of each ``c`` group within
    each ``b`` value into one document, tokenises it with ``cut_words`` and
    labels it with ``b``. Document-word edges are weighted by TF-IDF and
    word-word edges by the log PMI computed over sliding windows; only
    word pairs with a positive score get an edge.

    Files written under ``./data/``: ``df_data.pkl`` (documents and labels),
    ``word_word_edges.pkl`` and ``text_graph.pkl`` (the networkx graph).

    Args:
        window: Sliding-window size, in tokens, used to count word
            co-occurrences for the PMI.
    """
    logger.info("Preparing data...")
    datafolder = "./data/"
    df = pd.read_csv(os.path.join(datafolder, "data3.csv"))
    # Keep only the text, chapter and book columns.
    df = df[["t", "c", "b"]]

    # One document per chapter, labelled by book.
    df_data = pd.DataFrame(columns=["c", "b"])
    for book in df["b"].unique():
        dum = pd.DataFrame(columns=["c", "b"])
        dum["c"] = df[df["b"] == book].groupby("c").apply(lambda x: (" ".join(x["t"])))
        dum["c"] = dum["c"].apply(cut_words)
        dum["b"] = book
        df_data = pd.concat([df_data, dum], ignore_index=True)
    del df
    save_as_pickle("df_data.pkl", df_data)

    # TF-IDF over the pre-tokenised documents; the identity tokenizer and
    # preprocessor make the vectorizer use the token lists as they are.
    logger.info("Calculating Tf-idf...")
    vectorizer = TfidfVectorizer(input="content", max_features=None, tokenizer=dummy_fun, preprocessor=dummy_fun)
    vectorizer.fit(df_data["c"])
    tfidf = vectorizer.transform(df_data["c"]).toarray()
    vocab = _fitted_vocabulary(vectorizer)
    df_tfidf = pd.DataFrame(tfidf, columns=vocab)

    # PMI (point-wise mutual information) between words.
    names = vocab
    word2index = {name: index for index, name in enumerate(names)}
    logger.info("Calculating co-occurences...")
    word_counts, occurrences, no_windows = _count_cooccurrences(df_data["c"], word2index, window)

    logger.info("Calculating PMI*...")
    p_ij = pd.DataFrame(_log_pmi(word_counts, occurrences, no_windows), index=names, columns=names)
    del occurrences
    del word_counts

    # Build the graph: document nodes are the row labels of the TF-IDF
    # table, word nodes are the vocabulary entries.
    logger.info("Building graph (No. of document, word nodes: %d, %d)...", len(df_tfidf.index), len(vocab))
    G = nx.Graph()
    logger.info("Adding document nodes to graph...")
    G.add_nodes_from(df_tfidf.index)
    logger.info("Adding word nodes to graph...")
    G.add_nodes_from(vocab)

    # NOTE(review): every document-word pair gets an edge, including those
    # whose TF-IDF weight is 0. Kept as-is because downstream code is not
    # visible here; confirm whether zero-weight edges are intended.
    logger.info("Building document-word edges...")
    document_word = [
        (doc, word, {"weight": weight})
        for doc, row in zip(tqdm(df_tfidf.index, total=len(df_tfidf.index)), tfidf)
        for word, weight in zip(df_tfidf.columns, row)
    ]

    logger.info("Building word-word edges...")
    word_word = word_word_edges(p_ij)
    save_as_pickle("word_word_edges.pkl", word_word)

    logger.info("Adding document-word and word-word edges...")
    G.add_edges_from(document_word)
    G.add_edges_from(word_word)
    save_as_pickle("text_graph.pkl", G)
    logger.info("Done and saved!")
# Script entry point: build and save the text graph with the default window.
if __name__ == "__main__":
    generate_text_graph()
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 3 10:58:01 2019
@author: WT
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
class gcn(nn.Module):
    """Two-layer graph convolutional network followed by a linear classifier.

    ``forward`` computes
    ``fc1(relu(A_hat @ (relu(A_hat @ (X @ W1 + b1)) @ W2 + b2)))``; the bias
    terms are skipped when the module is built with ``bias=False``.

    Args:
        X_size: Number of input features (columns of ``X``).
        A_hat: Square adjacency matrix, as a tensor or anything
            ``torch.tensor`` accepts. A float32 copy is stored.
            NOTE(review): presumably the normalised adjacency; it is built
            by the caller.
        args: Object exposing ``hidden_size_1``, ``hidden_size_2`` and
            ``num_classes``.
        bias: When false, neither layer gets a bias term.
    """

    def __init__(self, X_size, A_hat, args, bias=True):
        super(gcn, self).__init__()
        if isinstance(A_hat, torch.Tensor):
            adjacency = A_hat.detach().clone().float()
        else:
            adjacency = torch.tensor(A_hat, requires_grad=False).float()
        # A non-persistent buffer follows the module through .to()/.cuda()/
        # .double() like the weights do, but stays out of state_dict so
        # existing checkpoints keep loading.
        self.register_buffer("A_hat", adjacency, persistent=False)

        # NOTE(review): ``var`` is 2 / (fan_in + fan_out) and is handed to
        # ``normal_`` as the standard deviation, not the variance. Kept
        # unchanged to preserve the existing initialisation.
        self.weight = nn.parameter.Parameter(
            torch.empty(X_size, args.hidden_size_1, dtype=torch.float32))
        var = 2. / (self.weight.size(1) + self.weight.size(0))
        self.weight.data.normal_(0, var)
        self.weight2 = nn.parameter.Parameter(
            torch.empty(args.hidden_size_1, args.hidden_size_2, dtype=torch.float32))
        var2 = 2. / (self.weight2.size(1) + self.weight2.size(0))
        self.weight2.data.normal_(0, var2)

        if bias:
            self.bias = nn.parameter.Parameter(
                torch.empty(args.hidden_size_1, dtype=torch.float32))
            self.bias.data.normal_(0, var)
            self.bias2 = nn.parameter.Parameter(
                torch.empty(args.hidden_size_2, dtype=torch.float32))
            self.bias2.data.normal_(0, var2)
        else:
            # Register both names: forward() reads ``bias2`` as well, and an
            # unregistered attribute raises AttributeError on an nn.Module.
            self.register_parameter("bias", None)
            self.register_parameter("bias2", None)

        # Final fully connected layer mapping to class scores.
        self.fc1 = nn.Linear(args.hidden_size_2, args.num_classes)

    def forward(self, X):
        """Run the two graph-convolution layers and the classifier on *X*.

        Args:
            X: Feature matrix with one row per graph node and ``X_size``
                columns.

        Returns:
            Tensor of shape ``(number of nodes, args.num_classes)``.
        """
        # Layer 1: linear transform, optional bias, propagate, ReLU.
        X = torch.mm(X, self.weight)
        if self.bias is not None:
            X = (X + self.bias)
        X = F.relu(torch.mm(self.A_hat, X))
        # Layer 2: same pattern with the second weight matrix.
        X = torch.mm(X, self.weight2)
        if self.bias2 is not None:
            X = (X + self.bias2)
        X = F.relu(torch.mm(self.A_hat, X))
        return self.fc1(X)