本文主要介绍我在做实验时对 networkx 所做的一个初步封装。这一版写得还不是很规范,目前正在写第二版,这里先把之前的代码贴上来。主要参考的文档是 networkx 的官方文档。
[networkx-reference]
我需要说明一点,下面的代码针对的是无向图。
下面这一部分代码是对networkx的初步封装。
#-*- coding:utf-8 -*-
import networkx as nx
import matplotlib.pyplot as plt
import traceback
'''
我对networkx 的封装
还是一个图操作-工具类
'''
class GraphOperation:
    """Thin convenience wrapper around a networkx graph.

    The wrapped graph lives in ``self.graph`` and starts out as an undirected
    ``nx.Graph``.  Apart from the constructor and the ``convert_to_*``
    methods, every method is best-effort: if the underlying call raises, the
    traceback is printed and the method returns ``None``.
    """

    # ----------------- graph operation -----------------
    def __init__(self):
        """Create an empty undirected graph."""
        self.graph = nx.Graph()

    def convert_to_directed_graph(self):
        """Replace the wrapped graph with a directed copy of itself."""
        # Building from the current graph keeps its nodes and edges; creating
        # a fresh empty graph here would silently discard them.
        self.graph = nx.DiGraph(self.graph)

    def convert_to_multi_graph(self):
        """Replace the wrapped graph with a multigraph copy of itself."""
        self.graph = nx.MultiGraph(self.graph)

    def convert_to_undirected_graph(self):
        """Replace the wrapped graph with an undirected copy of itself."""
        self.graph = nx.Graph(self.graph)

    def clear_graph(self):
        """Remove every node and edge from the graph."""
        try:
            self.graph.clear()
        except Exception:
            traceback.print_exc()

    # ------------------ node operation ------------------
    def add_node(self, node):
        """Add a single node."""
        try:
            self.graph.add_node(node)
        except Exception:
            traceback.print_exc()

    def add_nodes_by_list(self, node_list):
        """Add every node in *node_list*."""
        try:
            self.graph.add_nodes_from(node_list)
        except Exception:
            traceback.print_exc()

    def remove_node(self, node):
        """Remove a single node."""
        try:
            self.graph.remove_node(node)
        except Exception:
            traceback.print_exc()

    def remove_nodes_by_list(self, node_list):
        """Remove every node in *node_list*."""
        try:
            self.graph.remove_nodes_from(node_list)
        except Exception:
            traceback.print_exc()

    def get_number_of_nodes(self):
        """Return the number of nodes."""
        try:
            return self.graph.number_of_nodes()
        except Exception:
            traceback.print_exc()

    def get_nodes(self):
        """Return the nodes as a list."""
        try:
            # networkx 2.x returns a view here; materialise it so callers
            # always receive a real list.
            return list(self.graph.nodes())
        except Exception:
            traceback.print_exc()

    def get_neighbors(self, v):
        """Return the neighbours of node *v* as a list."""
        try:
            return list(self.graph.neighbors(v))
        except Exception:
            traceback.print_exc()

    # ------------------ edge operation ------------------
    def add_edge(self, u, v):
        """Add the edge (u, v)."""
        try:
            self.graph.add_edge(u, v)
        except Exception:
            traceback.print_exc()

    def add_edge_by_tuple(self, e):
        """Add the edge described by the tuple *e*."""
        try:
            self.add_edge(*e)  # unpack the edge tuple
        except Exception:
            traceback.print_exc()

    def add_edges_by_list(self, edge_list):
        """Add every edge in *edge_list* (a list of tuples)."""
        try:
            self.graph.add_edges_from(edge_list)
        except Exception:
            traceback.print_exc()

    def remove_edge(self, u, v):
        """Remove the edge (u, v)."""
        try:
            self.graph.remove_edge(u, v)
        except Exception:
            traceback.print_exc()

    def remove_edge_by_tuple(self, e):
        """Remove the edge described by the tuple *e*."""
        try:
            self.remove_edge(*e)
        except Exception:
            traceback.print_exc()

    def remove_edges_by_list(self, edge_list):
        """Remove every edge in *edge_list* (a list of tuples)."""
        try:
            # The removal has to go through the wrapped graph; this class has
            # no remove_edges_from method of its own.
            self.graph.remove_edges_from(edge_list)
        except Exception:
            traceback.print_exc()

    def get_number_of_edges(self):
        """Return the number of edges."""
        try:
            return self.graph.number_of_edges()
        except Exception:
            traceback.print_exc()

    def get_edges(self):
        """Return the edges as a list of tuples."""
        try:
            return list(self.graph.edges())
        except Exception:
            traceback.print_exc()

    def add_weighted_edge(self, weighted_edge_list):
        """Add weighted edges from a list of (u, v, weight) tuples."""
        try:
            self.graph.add_weighted_edges_from(weighted_edge_list)
        except Exception:
            traceback.print_exc()

    def get_weighted_edge(self):
        """Return the edges as a list of (u, v, weight) tuples."""
        try:
            return list(self.graph.edges(data='weight'))
        except Exception:
            traceback.print_exc()

    # ------------------ degree analysis ------------------
    def get_degree(self):
        """Return a dict mapping every node to its degree."""
        try:
            # dict() accepts both the plain dict of networkx 1.x and the
            # (node, degree) view of networkx 2.x.
            return dict(self.graph.degree())
        except Exception:
            traceback.print_exc()

    def get_degree_by_node(self, node_id):
        """Return the degree of a single node."""
        try:
            return self.graph.degree(node_id)
        except Exception:
            traceback.print_exc()

    def get_degree_based_on_weight_by_node(self, node_id):
        """Return the weighted degree of a node (sum of incident edge weights).

        Example: with edges (1, 2, 0.5) and (3, 1, 0.75) the weighted degree
        of node 1 is 0.5 + 0.75 = 1.25, not 2.
        """
        try:
            return self.graph.degree(node_id, weight="weight")
        except Exception:
            traceback.print_exc()

    def get_sorted_degrees(self):
        """Return the degree values of all nodes, largest first."""
        try:
            return sorted(dict(self.graph.degree()).values(), reverse=True)
        except Exception:
            traceback.print_exc()

    def get_in_degree(self):
        """Return a dict mapping every node to its in-degree (directed graphs)."""
        try:
            return dict(self.graph.in_degree())
        except Exception:
            traceback.print_exc()

    def get_in_degree_by_node(self, node_id):
        """Return the in-degree of a single node."""
        try:
            return self.graph.in_degree(node_id)
        except Exception:
            traceback.print_exc()

    def get_in_degree_based_on_weight_by_node(self, node_id):
        """Return the weighted in-degree of a single node."""
        try:
            return self.graph.in_degree(node_id, weight="weight")
        except Exception:
            traceback.print_exc()

    def get_out_degree(self):
        """Return a dict mapping every node to its out-degree (directed graphs)."""
        try:
            return dict(self.graph.out_degree())
        except Exception:
            traceback.print_exc()

    def get_out_degree_by_node(self, node_id):
        """Return the out-degree of a single node."""
        try:
            return self.graph.out_degree(node_id)
        except Exception:
            traceback.print_exc()

    def get_out_degree_based_on_weight_by_node(self, node_id):
        """Return the weighted out-degree of a single node."""
        try:
            return self.graph.out_degree(node_id, weight="weight")
        except Exception:
            traceback.print_exc()

    # ------------------ component analysis ------------------
    def get_connected_components(self):
        """Return the connected components as a list (one entry per component)."""
        try:
            return list(nx.connected_components(self.graph))
        except Exception:
            traceback.print_exc()

    # ------------------ drawing graph ------------------
    def draw_graph(self, title):
        """Draw the graph with the given window title and show it."""
        try:
            plt.title(title)
            nx.draw(self.graph)
            # show() takes no title; the title is already set above.
            plt.show()
        except Exception:
            traceback.print_exc()

    def draw_network(self):
        """Draw the graph with labels using a spring layout and show it."""
        try:
            # draw_networkx expects computed positions, not the layout function.
            positions = nx.spring_layout(self.graph)
            nx.draw_networkx(self.graph, pos=positions)
            plt.show()
        except Exception:
            traceback.print_exc()

    def draw_graph_random_layout(self):
        """Draw the graph with a random layout and show it."""
        try:
            nx.draw_random(self.graph)
            plt.show()
        except Exception:
            traceback.print_exc()

    def draw_graph_spring_layout(self):
        """Draw the graph with a spring layout and show it."""
        try:
            nx.draw_spring(self.graph)
            plt.show()
        except Exception:
            traceback.print_exc()

    # ------------------ graph metrics ------------------
    def get_degree_distribution(self):
        """Return the degree histogram of the graph.

        Unlike the ``get_degree*`` methods, which report the degree of each
        node, this counts how many nodes have each degree value.
        """
        try:
            return nx.degree_histogram(self.graph)
        except Exception:
            traceback.print_exc()

    def get_density(self):
        """Return the density of the graph."""
        try:
            return nx.density(self.graph)
        except Exception:
            traceback.print_exc()

    def get_transitivity(self):
        """Return the transitivity (global clustering coefficient)."""
        try:
            return nx.transitivity(self.graph)
        except Exception:
            traceback.print_exc()

    def get_average_clustering(self):
        """Return the average clustering coefficient."""
        try:
            return nx.average_clustering(self.graph)
        except Exception:
            traceback.print_exc()

    # Backward-compatible alias for the original, misspelled public name.
    get_averate_clustering = get_average_clustering

    def get_average_shortest_path_length(self):
        """Return the average shortest path length of the graph."""
        try:
            return nx.average_shortest_path_length(self.graph)
        except Exception:
            traceback.print_exc()

    def write_to_pajek(self, pajek_net_path):
        """Write the graph to *pajek_net_path* in Pajek format."""
        try:
            nx.write_pajek(self.graph, pajek_net_path)
        except Exception:
            traceback.print_exc()

    # ------------------ centrality ------------------
    def get_degree_centrality(self):
        """Return the degree centrality of every node.

        The degree centrality of a node is the fraction of nodes it is
        connected to.
        """
        try:
            return nx.degree_centrality(self.graph)
        except Exception:
            traceback.print_exc()

    def get_betweenness_centrality(self):
        """Return the betweenness centrality of every node.

        It is the sum of the fraction of all-pairs shortest paths that pass
        through the node.
        """
        try:
            return nx.betweenness_centrality(self.graph)
        except Exception:
            traceback.print_exc()

    def get_load_centrality(self):
        """Return the load centrality of every node.

        It is the fraction of all shortest paths that pass through the node.
        """
        try:
            return nx.load_centrality(self.graph)
        except Exception:
            traceback.print_exc()

    def get_eigenvector_centrality(self):
        """Return the eigenvector centrality of every node.

        A node's centrality is based on the centrality of its neighbours.
        """
        try:
            return nx.eigenvector_centrality(self.graph)
        except Exception:
            traceback.print_exc()
#-*- coding:utf-8 -*-
from GraphOperation import*
'''
基于我自己的工具类GraphOperation
写一个图的操作类MyGraph,实现图的各种操作
'''
class MyGraph:
    """Second-level graph class built on top of ``GraphOperation``.

    It maps arbitrary node names to sequential integers, builds a graph from
    a list of cliques, draws it, and writes network statistics to files under
    ``output_path``.  Methods are best-effort: on failure the traceback is
    printed and ``None`` is returned.
    """

    def __init__(self):
        """Create an empty graph and empty name/number mappings."""
        self.my_graph = GraphOperation()
        self.map_name_to_number = dict()
        self.map_number_to_name = dict()
        # Prefix prepended to every output file name.
        self.output_path = ""
        # Cliques expressed as node numbers; kept for draw_community.
        self.clique_list = []
        self.max_connected_component_subgraph = None

    # ------------------ private helpers ------------------
    def _write_output(self, file_name, lines):
        """Write each item of *lines* on its own line to output_path + file_name."""
        with open(self.output_path + file_name, "w") as outfile:
            for line in lines:
                outfile.write(str(line) + '\n')

    def _ensure_max_component(self):
        """Compute the largest connected component if it has not been set yet."""
        if self.max_connected_component_subgraph is None:
            self.set_max_connected_component_subgraph()
        return self.max_connected_component_subgraph

    # ------------------ construction ------------------
    def construct_graph(self, clique_list):
        """Build the graph from *clique_list*, a list of lists of node names.

        Every name is mapped to an integer id, and every pair of distinct
        members of a clique is connected by an edge.
        """
        try:
            # Continue numbering after the ids already handed out, so that a
            # second call cannot reuse the number of an existing name.
            next_number = len(self.map_name_to_number) + 1
            new_clique_list = []
            for clique in clique_list:
                new_clique = []
                for name in clique:
                    if name not in self.map_name_to_number:
                        self.map_name_to_number[name] = next_number
                        next_number += 1
                    new_clique.append(self.map_name_to_number[name])
                new_clique_list.append(new_clique)
            # Rebuild the reverse mapping from scratch.
            self.map_number_to_name = dict(
                (number, name)
                for name, number in self.map_name_to_number.items()
            )
            self.clique_list = new_clique_list
            for clique in new_clique_list:
                for u in clique:
                    # Add the node explicitly so single-member cliques still
                    # appear in the graph.
                    self.my_graph.add_node(u)
                    for v in clique:
                        if u == v:
                            continue
                        self.my_graph.add_edge_by_tuple((u, v))
            print("[INFO]: construct_graph is finished!")
        except Exception:
            traceback.print_exc()

    def add_edge(self, u, v):
        """Add the edge (u, v) to the underlying graph."""
        try:
            self.my_graph.add_edge(u, v)
        except Exception:
            traceback.print_exc()

    def get_all_edges(self):
        """Return all edges of the underlying graph."""
        try:
            return self.my_graph.get_edges()
        except Exception:
            traceback.print_exc()

    def set_output_path(self, output_path):
        """Set the path prefix used for every statistics file."""
        try:
            self.output_path = output_path
            print("[INFO]: set_output_path is finished!")
        except Exception:
            traceback.print_exc()

    def set_max_connected_component_subgraph(self):
        """Compute and store the largest connected component as a graph.

        This has to run after the whole graph has been built.
        """
        try:
            graph = self.my_graph.graph
            # nx.connected_component_subgraphs is not available in recent
            # networkx releases; build the subgraph from the node set instead.
            largest_component = max(nx.connected_components(graph), key=len)
            self.max_connected_component_subgraph = graph.subgraph(largest_component).copy()
            print("[INFO]: set_max_connected_component_subgraph is finished!")
        except Exception:
            traceback.print_exc()

    def get_max_connected_component_subgraph(self):
        """Return the stored largest component (a raw networkx graph) or None."""
        return self.max_connected_component_subgraph

    # ------------------ draw the network ------------------
    def draw_community(self):
        """Draw the whole graph, colouring the nodes of each clique differently."""
        try:
            pos = nx.spring_layout(self.my_graph.graph)
            node_size_ = 100
            color_list = ["red", "yellow", "blue", "green", "pink", "orange", "purple"]
            color_list_len = len(color_list)
            for i, node_list in enumerate(self.clique_list):
                edge_list = self.get_edges_for_community(node_list)
                # Cycle through the palette so that more cliques than colours
                # does not raise an IndexError.
                nx.draw_networkx_nodes(
                    self.my_graph.graph, pos, nodelist=node_list,
                    node_size=node_size_,
                    node_color=color_list[i % color_list_len],
                    label="hello")
                nx.draw_networkx_edges(self.my_graph.graph, pos, edgelist=edge_list)
            title = "people relation by train"
            plt.title(title)
            plt.show()
            print("[INFO]: draw_community is finished!")
        except Exception:
            traceback.print_exc()

    def get_edges_for_community(self, node_list):
        """Return every ordered pair (u, v) of distinct nodes in *node_list*."""
        try:
            return [(u, v) for u in node_list for v in node_list if u != v]
        except Exception:
            traceback.print_exc()

    def draw_graph(self, title):
        """Draw the graph with the given title."""
        try:
            self.my_graph.draw_graph(title)
            print("[INFO]: draw_graph is finished!")
        except Exception:
            traceback.print_exc()

    def draw_network(self):
        """Draw the graph with node labels."""
        try:
            # Delegate to the wrapped object; calling self.draw_network()
            # here would recurse forever.
            self.my_graph.draw_network()
        except Exception:
            traceback.print_exc()

    def draw_graph_random_layout(self):
        """Draw the graph with a random layout."""
        try:
            self.my_graph.draw_graph_random_layout()
        except Exception:
            traceback.print_exc()

    def draw_graph_spring_layout(self):
        """Draw the graph with a spring layout."""
        try:
            self.my_graph.draw_graph_spring_layout()
            print("[INFO]: draw_graph is finished!")
        except Exception:
            traceback.print_exc()

    # ------------------ network analysis ------------------
    def cal_num_of_nodes(self):
        """Write the number of nodes to number_of_nodes.txt."""
        try:
            num_nodes = self.my_graph.get_number_of_nodes()
            self._write_output("number_of_nodes.txt", [num_nodes])
            print("[INFO]: cal_num_of_nodes is finished!")
        except Exception:
            traceback.print_exc()

    def cal_num_of_edges(self):
        """Write the number of edges to number_of_edges.txt."""
        try:
            num_edges = self.my_graph.get_number_of_edges()
            self._write_output("number_of_edges.txt", [num_edges])
            print("[INFO]: cal_num_of_edges is finished!")
        except Exception:
            traceback.print_exc()

    def cal_degree_distribution(self):
        """Write the degree histogram, one value per line, to degree_distribution.txt."""
        try:
            degree_distribution_list = self.my_graph.get_degree_distribution()
            self._write_output("degree_distribution.txt", degree_distribution_list)
            print("[INFO]: cal_degree_distribution is finished!")
        except Exception:
            traceback.print_exc()

    def cal_density(self):
        """Write the graph density to graph_density.txt."""
        try:
            density = self.my_graph.get_density()
            self._write_output("graph_density.txt", [density])
            print("[INFO]: cal_density is finished!")
        except Exception:
            traceback.print_exc()

    def cal_transitivity(self):
        """Write the transitivity to transitivity.txt."""
        try:
            transitivity = self.my_graph.get_transitivity()
            self._write_output("transitivity.txt", [transitivity])
            print("[INFO]: cal_transitivity is finished!")
        except Exception:
            traceback.print_exc()

    def cal_average_clustering(self):
        """Write the average clustering coefficient to average_clustering.txt."""
        try:
            average_clustering = self.my_graph.get_averate_clustering()
            self._write_output("average_clustering.txt", [average_clustering])
            print("[INFO]: cal_average_clustering is finished!")
        except Exception:
            traceback.print_exc()

    def cal_average_shortest_path_length(self):
        """Write the average shortest path length to average_shortest_path_length.txt."""
        try:
            aver_shortest_path = self.my_graph.get_average_shortest_path_length()
            self._write_output("average_shortest_path_length.txt", [aver_shortest_path])
            print("[INFO]: cal_average_shortest_path_length is finished!")
        except Exception:
            traceback.print_exc()

    def write_to_pajek_net(self):
        """Write the graph by hand as a Pajek .net file, labelling nodes by name."""
        try:
            output_path = self.output_path + "graph_of_author_relation.net"
            nodes_list = self.my_graph.get_nodes()
            edges_list = self.my_graph.get_edges()
            nodes_num = self.my_graph.get_number_of_nodes()
            edges_num = self.my_graph.get_number_of_edges()
            with open(output_path, "w") as outfile:
                outfile.write("*Vertices " + str(nodes_num) + '\n')
                for node in nodes_list:
                    # Nodes added through add_edge have no recorded name;
                    # fall back to the node itself as its label.
                    label = self.map_number_to_name.get(node, node)
                    outfile.write(str(node) + ' ' + "\"" + str(label) + "\"" + '\n')
                outfile.write("*Edges " + str(edges_num) + '\n')
                for edge in edges_list:
                    outfile.write(str(edge[0]) + ' ' + str(edge[1]) + '\n')
            print("[INFO]: write_to_pajek_net is finished!")
        except Exception:
            traceback.print_exc()

    def write_to_pajek_net1(self):
        """Write the graph as a Pajek .net file using networkx's own writer."""
        try:
            pajek_net_path = self.output_path + "graph_of_author_relation.net"
            self.my_graph.write_to_pajek(pajek_net_path)
            print("[INFO]: write_to_pajek_net1 is finished!")
        except Exception:
            traceback.print_exc()

    # ------------------ centrality ------------------
    def get_degree_centrality(self):
        """Return the degree centrality of every node."""
        try:
            result = self.my_graph.get_degree_centrality()
            print("[INFO]: get_degree_centrality is finished!")
            return result
        except Exception:
            traceback.print_exc()

    def get_betweenness_centrality(self):
        """Return the betweenness centrality of every node."""
        try:
            result = self.my_graph.get_betweenness_centrality()
            print("[INFO]: get_betweenness_centrality is finished!")
            return result
        except Exception:
            traceback.print_exc()

    def get_load_centrality(self):
        """Return the load centrality of every node."""
        try:
            result = self.my_graph.get_load_centrality()
            print("[INFO]: get_load_centrality is finished!")
            return result
        except Exception:
            traceback.print_exc()

    def get_eigenvector_centrality(self):
        """Return the eigenvector centrality of every node."""
        try:
            result = self.my_graph.get_eigenvector_centrality()
            print("[INFO]: get_eigenvector_centrality is finished!")
            return result
        except Exception:
            traceback.print_exc()

    # ------------------ component ------------------
    def draw_max_connected_component_subgraph(self):
        """Draw the largest connected component without node labels."""
        try:
            nx.draw_networkx(self._ensure_max_component(), with_labels=False)
            title = "Max connected subgraph of Collaboration Network"
            plt.title(title)
            plt.show()
            print("[INFO]: draw_max_connected_component_subgraph is finished!")
        except Exception:
            traceback.print_exc()

    def get_average_shortest_path_length_in_max_connected_component_subgraph(self):
        """Return the average shortest path length inside the largest component."""
        try:
            res = nx.average_shortest_path_length(self._ensure_max_component())
            print("[INFO]: get_average_shortest_path_length_in_max_connected_component_subgraph is finished!")
            return res
        except Exception:
            traceback.print_exc()

    def cal_average_shortest_path_length_in_max_connected_component_subgraph(self):
        """Write the largest component's average shortest path length to a file."""
        try:
            aver_shortest_path = self.get_average_shortest_path_length_in_max_connected_component_subgraph()
            self._write_output(
                "average_shortest_path_length_in_max_connected_subgraph.txt",
                [aver_shortest_path])
            print("[INFO]: cal_average_shortest_path_length_in_max_connected_component_subgraph is finished!")
        except Exception:
            traceback.print_exc()
#----------------------------------------------------------------------------
下面这一部分代码就不针对networkx了,主要是xml的封装类,以及测试部分的代码
- XmlParser
#-*- coding:utf-8
import xml.etree.ElementTree as et
import traceback
'''
基于XML的数据提取以及分析
其实我只可以负责数据提取
但是毕竟是同一个XML,所以把数据分析写进来我认为也是合理的
'''
class XmlParser:
    """Extract fields from a PubMed / PMC XML file and compute simple statistics.

    Extraction is best-effort: a record that lacks an expected element is
    skipped.  Public methods that wrap their work in ``try`` print the
    traceback and return ``None`` when something unexpected fails.
    """

    def __init__(self, xml_path, stop_words_path):
        """Parse *xml_path* and remember the path of the stop-word file."""
        self.stop_words_path = stop_words_path
        tree = et.parse(xml_path)
        self.root = tree.getroot()

    # ------------------ private helpers ------------------
    @staticmethod
    def _descend(element, *tags):
        """Follow *tags* through first-matching children.

        Raises LookupError when a child is missing, so callers can skip the
        record with a single ``except``.
        """
        for tag in tags:
            # Compare with None explicitly: an Element without children is falsy.
            child = element.find(tag)
            if child is None:
                raise LookupError("missing <%s> element" % tag)
            element = child
        return element

    @staticmethod
    def _tally(values):
        """Return a dict mapping each value to its number of occurrences."""
        counter = dict()
        for value in values:
            counter[value] = counter.get(value, 0) + 1
        return counter

    @staticmethod
    def _sorted_by_count(counter):
        """Return (count, key) pairs sorted in descending order."""
        return sorted(((count, key) for key, count in counter.items()), reverse=True)

    def _count_words(self, texts):
        """Count lower-cased alphabetic words in *texts*, ignoring stop words."""
        # A set gives constant-time membership tests for every word.
        stop_words = set(self.get_stop_words(self.stop_words_path))
        word_counter = dict()
        for text in texts:
            try:
                # Replace every non-letter with a space, then split on spaces.
                letters_only = "".join(ch if ch.isalpha() else ' ' for ch in text)
                for word in letters_only.split():
                    word = word.lower()
                    if word in stop_words:
                        continue
                    word_counter[word] = word_counter.get(word, 0) + 1
            except Exception:
                # Skip texts that cannot be processed (e.g. a None entry).
                continue
        return word_counter

    @staticmethod
    def _write_word_counts(write_path, items):
        """Write "word<TAB>count" lines plus two summary lines for the top ten.

        The summary lines list the ten most frequent words and their counts,
        least frequent first, as quoted comma-separated lists in brackets.
        """
        top_ten = list(reversed(items[:10]))
        final_str = '[' + ','.join("'" + word + "'" for _, word in top_ten) + ']'
        final_freq = '[' + ','.join("'" + str(count) + "'" for count, _ in top_ten) + ']'
        with open(write_path, "w") as outfile:
            for count, word in items:
                outfile.write(str(word) + "\t" + str(count) + '\n')
            outfile.write(final_str + '\n')
            outfile.write(final_freq + '\n')

    @staticmethod
    def _write_name_counts(write_path, items):
        """Write "name<TAB>count" lines for (count, name) pairs."""
        with open(write_path, "w") as outfile:
            for count, name in items:
                outfile.write(str(name) + '\t' + str(count) + '\n')

    # ------------------ extraction ------------------
    def get_article_author(self):
        """Return one list of "LastName Initials" strings per PubMed article."""
        try:
            res_list = []
            for pubmed_article in self.root:
                try:
                    author_list_node = self._descend(
                        pubmed_article, "MedlineCitation", "Article", "AuthorList")
                except Exception:
                    continue
                current_author_list = []
                for author in author_list_node.findall("Author"):
                    try:
                        last_name = self._descend(author, "LastName")
                        initials = self._descend(author, "Initials")
                        current_author_list.append(
                            str(last_name.text) + ' ' + str(initials.text))
                    except Exception:
                        # Skip authors without a last name or initials.
                        continue
                res_list.append(current_author_list)
            return res_list
        except Exception:
            traceback.print_exc()

    def get_article_author1(self):
        """Return one list of "given-names surname" strings per PMC article."""
        try:
            res_list = []
            for article in self.root:
                try:
                    contrib_group = self._descend(
                        article, "front", "article-meta", "contrib-group")
                    author_list = []
                    for contrib in contrib_group.findall("contrib"):
                        name = self._descend(contrib, "name")
                        surname = self._descend(name, "surname")
                        given_name = self._descend(name, "given-names")
                        author_list.append(
                            str(given_name.text) + " " + str(surname.text))
                    res_list.append(author_list)
                except Exception:
                    # One incomplete contributor drops the whole article.
                    continue
            return res_list
        except Exception:
            traceback.print_exc()

    def get_article_title(self, root):
        """Return the title of every PubMed article under *root*."""
        try:
            article_title_list = []
            for pubmed_article in root:
                try:
                    title = self._descend(
                        pubmed_article, "MedlineCitation", "Article", "ArticleTitle")
                    article_title_list.append(str(title.text))
                except Exception:
                    continue
            return article_title_list
        except Exception:
            traceback.print_exc()

    def get_article_year(self, root):
        """Return the publication year (as a string) of every article under *root*."""
        try:
            article_year_list = []
            for pubmed_article in root:
                try:
                    year = self._descend(
                        pubmed_article, "MedlineCitation", "Article", "Journal",
                        "JournalIssue", "PubDate", "Year")
                    article_year_list.append(str(year.text))
                except Exception:
                    continue
            return article_year_list
        except Exception:
            traceback.print_exc()

    def get_article_journal_title(self, root):
        """Return the journal title of every article under *root*."""
        try:
            journal_title_list = []
            for pubmed_article in root:
                try:
                    title = self._descend(
                        pubmed_article, "MedlineCitation", "Article", "Journal", "Title")
                    journal_title_list.append(str(title.text))
                except Exception:
                    continue
            return journal_title_list
        except Exception:
            traceback.print_exc()

    def get_article_abstract(self, root):
        """Return the first AbstractText of every PubMed article that has one."""
        try:
            article_abstract_list = []
            for pubmed_article in root:
                try:
                    abstract_text = self._descend(
                        pubmed_article, "MedlineCitation", "Article",
                        "Abstract", "AbstractText")
                    article_abstract_list.append(str(abstract_text.text))
                except Exception:
                    # Articles without an abstract are skipped.
                    continue
            return article_abstract_list
        except Exception:
            traceback.print_exc()

    def get_article_abstract1(self):
        """Return the first abstract paragraph of every PMC article that has one."""
        try:
            res_list = []
            for article in self.root:
                try:
                    abstract_p = self._descend(
                        article, "front", "article-meta", "abstract", "p")
                    res_list.append(abstract_p.text)
                except Exception:
                    continue
            return res_list
        except Exception:
            traceback.print_exc()

    def get_article_journal_info(self, root):
        """Return a "MedlineTA,Country" string for every article under *root*."""
        try:
            journal_info_list = []
            for pubmed_article in root:
                try:
                    journal_info = self._descend(
                        pubmed_article, "MedlineCitation", "MedlineJournalInfo")
                    journal_country = str(self._descend(journal_info, "Country").text)
                    journal_name = str(self._descend(journal_info, "MedlineTA").text)
                    journal_info_list.append(journal_name + ',' + journal_country)
                except Exception:
                    continue
            return journal_info_list
        except Exception:
            traceback.print_exc()

    # ------------------ statistics ------------------
    def cal_num_of_article_in_each_year(self, write_path):
        """Write "year<TAB>count" lines, latest year first, to *write_path*."""
        try:
            counter = self._tally(self.get_article_year(self.root))
            pairs = sorted(counter.items(), reverse=True)
            with open(write_path, "w") as outfile:
                for year, count in pairs:
                    outfile.write(str(year) + "\t" + str(count) + '\n')
        except Exception:
            traceback.print_exc()

    def cal_word_occurence_in_article_title(self, output_path):
        """Write word frequencies of PubMed titles under the *output_path* prefix."""
        try:
            word_counter = self._count_words(self.get_article_title(self.root))
            items = self._sorted_by_count(word_counter)
            write_path = output_path + "word_occurence_in_article_title.txt"
            self._write_word_counts(write_path, items)
        except Exception:
            traceback.print_exc()

    def cal_word_occurence_in_article_abstract(self, output_path):
        """Write word frequencies of PubMed abstracts under the *output_path* prefix."""
        try:
            word_counter = self._count_words(self.get_article_abstract(self.root))
            items = self._sorted_by_count(word_counter)
            write_path = output_path + "word_occurence_in_article_abstract.txt"
            self._write_word_counts(write_path, items)
        except Exception:
            traceback.print_exc()

    def cal_word_occurence_in_article_abstract1(self, write_path):
        """Write word frequencies of PMC abstracts to the file *write_path*."""
        try:
            word_counter = self._count_words(self.get_article_abstract1())
            items = self._sorted_by_count(word_counter)
            with open(write_path, "w") as outfile:
                for count, word in items:
                    try:
                        outfile.write(str(word) + '\t' + str(count) + '\n')
                    except Exception as ex:
                        # Report the word that could not be written and go on.
                        print(ex)
        except Exception:
            traceback.print_exc()

    def cal_journal_name_and_country_ouucrence(self, country_path, name_path):
        """Write journal-name counts to *name_path* and country counts to *country_path*."""
        try:
            names = []
            countries = []
            for item in self.get_article_journal_info(self.root):
                # Split on the first comma only, so a comma inside the
                # country does not truncate it.
                journal_name, _, journal_country = item.partition(',')
                names.append(journal_name)
                countries.append(journal_country)
            self._write_name_counts(
                name_path, self._sorted_by_count(self._tally(names)))
            self._write_name_counts(
                country_path, self._sorted_by_count(self._tally(countries)))
        except Exception:
            traceback.print_exc()

    def cal_num_in_diff_area(self, input_path, out_path):
        """Sum the count column per area from *input_path* and write it to *out_path*.

        The first line of the input is skipped; every other line is split on
        single spaces, with the count in field 1 and the area in field 3.
        """
        try:
            area_counter = {}
            with open(input_path, "r") as infile:
                for line_number, line in enumerate(infile):
                    if line_number == 0:
                        continue  # header line
                    fields = line.rstrip('\n').split(' ')
                    num = int(fields[1])
                    area = fields[3]
                    area_counter[area] = area_counter.get(area, 0) + num
            with open(out_path, "w") as outfile:
                for area in area_counter:
                    outfile.write(str(area) + " " + str(area_counter[area]) + '\n')
        except Exception:
            traceback.print_exc()

    def cal_aver_if_factor(self, input_path):
        """Print the count-weighted average of the factor column in *input_path*.

        The first line is skipped; the count is read from field 1 and the
        factor from field 2 of every other space-separated line.
        """
        try:
            total_num = 0
            total_factor = 0.0
            with open(input_path, "r") as infile:
                for line_number, line in enumerate(infile):
                    if line_number == 0:
                        continue  # header line
                    fields = line.rstrip('\n').split(' ')
                    num = int(fields[1])
                    factor = float(fields[2])
                    total_num += num
                    total_factor += factor * num
            print(total_factor / total_num)
        except Exception:
            traceback.print_exc()

    def get_stop_words(self, stop_words_path):
        """Return the stop words in *stop_words_path* as a list, one per line."""
        result_list = []
        with open(stop_words_path, "r") as infile:
            for line in infile:
                result_list.append(line.rstrip('\r\n'))
        return result_list

    def test(self):
        """Print the number of journal-info entries, then each entry."""
        journal_info_list = self.get_article_journal_info(self.root)
        print(len(journal_info_list))
        for entry in journal_info_list:
            print(entry)
#-*- coding:utf-8 -*-
from XmlParser import*
from MyGraph import*
# Path of the stop-word file (read one word per line by XmlParser.get_stop_words).
STOP_WORDS_PATH = "../file/stop_words.txt"
# PubMed XML input; only referenced from commented-out code in main().
XML_PATH1 = "../data/PUBMED/LANCET/2006/lancet_2006_1570.xml"
#XML_PATH2 = "../data/PUBMED/LANCET/2009/lancet_2009_1516.xml"
#OUTPUT_PATH1 = "../output/network_analysis/PUBMED/LANCET/2006/"
#OUTPUT_PATH2 = "../output/network_analysis/PUBMED/LANCET/2009/"
# NOTE(review): not referenced by any function visible in this script - confirm it is still needed.
OUTPUT_PATH3 = "../output/src_output/edge.txt"
# Edge-list file read by test_for_srx() and test_for_jcx().
INPUT_PATH = "../data/src_input/citation.csv"
# Path prefix handed to MyGraph.set_output_path() by the test_for_* functions.
OUTPUT_PATH = "../output/src_output/"
def statical_analysis(xml_parser_obj, OUTPUT_PATH):
    """Write word-frequency statistics for article abstracts and titles.

    xml_parser_obj: parser object exposing the two cal_word_occurence_* methods.
    OUTPUT_PATH: output path prefix passed to both methods.

    Errors are reported by printing the traceback; nothing is raised.
    """
    try:
        xml_parser_obj.cal_word_occurence_in_article_abstract(OUTPUT_PATH)
        xml_parser_obj.cal_word_occurence_in_article_title(OUTPUT_PATH)
        print("[INFO]: statical_analysis is finished!")
    except Exception:
        # print_exc() already writes the traceback; printing its return
        # value would add a stray "None" line.
        traceback.print_exc()
def author_collaboration_network_analysis(xml_parser_obj, OUTPUT_PATH):
    """Build the author collaboration network of one XML file and analyse it.

    xml_parser_obj: parser object whose get_article_author() yields one
        author list (clique) per article.
    OUTPUT_PATH: output path prefix for the statistics files.

    Errors are reported by printing the traceback; nothing is raised.
    """
    try:
        # Get the author clique list and build the graph from it.
        author_clique_list = xml_parser_obj.get_article_author()
        graph = MyGraph()
        graph.construct_graph(author_clique_list)
        graph.set_output_path(OUTPUT_PATH)
        # Calculate the statistics.
        graph.cal_num_of_nodes()
        graph.cal_num_of_edges()
        graph.cal_degree_distribution()
        graph.cal_density()
        # The collaboration network is usually not connected, so the average
        # shortest path length of the whole graph is not computed.
        #graph.cal_average_shortest_path_length()
        graph.cal_average_clustering()
        graph.write_to_pajek_net1()
        # This does not really detect communities: it draws the whole graph
        # and only colours the individual cliques differently.
        graph.draw_community()
        graph.set_max_connected_component_subgraph()
        graph.draw_max_connected_component_subgraph()
        graph.cal_average_shortest_path_length_in_max_connected_component_subgraph()
        #graph.draw_graph()
        #graph.draw_graph_spring_layout()
        #graph.draw_graph_random()
        print("[INFO]: author_collaboration_network_analysis is finished!")
    except Exception:
        # print_exc() already writes the traceback; printing its return
        # value would add a stray "None" line.
        traceback.print_exc()
def author_collaboration_network_analysis1(xml_parser_obj1, xml_parser_obj2, OUTPUT_PATH):
    """Build one author collaboration network from two XML files and analyse it.

    xml_parser_obj1, xml_parser_obj2: parser objects whose
        get_article_author() yields one author list (clique) per article.
    OUTPUT_PATH: output path prefix for the statistics files.

    Errors are reported by printing the traceback; nothing is raised.
    """
    try:
        # Merge the author clique lists of both inputs.
        author_clique_list = xml_parser_obj1.get_article_author()
        author_clique_list.extend(xml_parser_obj2.get_article_author())
        # Construct the graph from the merged clique list.
        graph = MyGraph()
        graph.construct_graph(author_clique_list)
        graph.set_output_path(OUTPUT_PATH)
        # Calculate the statistics.
        graph.cal_num_of_nodes()
        graph.cal_num_of_edges()
        graph.cal_degree_distribution()
        graph.cal_density()
        graph.cal_average_shortest_path_length()
        graph.cal_average_clustering()
        graph.write_to_pajek_net1()
        graph.draw_community()
        #graph.draw_graph()
        #graph.draw_graph_spring_layout()
        #graph.draw_graph_random()
        # Report this function's own name in the completion message.
        print("[INFO]: author_collaboration_network_analysis1 is finished!")
    except Exception:
        # print_exc() already writes the traceback; printing its return
        # value would add a stray "None" line.
        traceback.print_exc()
def test_for_srx():
    """Build a graph from the comma-separated edge list in INPUT_PATH and analyse it.

    The first two fields of each line are used as the edge's end points.
    Errors are reported by printing the traceback; nothing is raised.
    """
    try:
        graph = MyGraph()
        graph.set_output_path(OUTPUT_PATH)
        with open(INPUT_PATH, "r") as infile:
            for line in infile:
                # Strip the line ending first; otherwise the second field
                # keeps its trailing newline and becomes part of the node name.
                fields = line.strip().split(',')
                if len(fields) < 2:
                    continue  # blank or malformed line
                graph.add_edge(fields[0], fields[1])
        print("[INFO]: graph is finished!")
        graph.cal_average_clustering()
        # The largest component has to be computed before its average
        # shortest path length can be measured.
        graph.set_max_connected_component_subgraph()
        graph.cal_average_shortest_path_length_in_max_connected_component_subgraph()
        graph.cal_degree_distribution()
        graph.cal_density()
        graph.cal_transitivity()
    except Exception:
        # print_exc() already writes the traceback; printing its return
        # value would add a stray "None" line.
        traceback.print_exc()
def test_for_jcx(max_edges=10000):
    """Build a graph from the whitespace-separated edge list in INPUT_PATH and draw it.

    max_edges: stop reading after this many edges have been added
        (defaults to the previously hard-coded 10000).

    Errors are reported by printing the traceback; nothing is raised.
    """
    try:
        graph = MyGraph()
        graph.set_output_path(OUTPUT_PATH)
        cnt = 0
        with open(INPUT_PATH, "r") as infile:
            for line in infile:
                fields = line.split()
                if len(fields) < 2:
                    continue  # blank or malformed line
                graph.add_edge(fields[0], fields[1])
                cnt += 1
                if cnt == max_edges:
                    break
        print("[INFO]: graph is finished!")
        # Statistics currently disabled:
        #graph.cal_average_clustering()
        #graph.cal_average_shortest_path_length_in_max_connected_component_subgraph()
        #graph.cal_degree_distribution()
        #graph.cal_density()
        #graph.cal_transitivity()
        title = "Social Network - Live Journal"
        graph.draw_graph(title)
    except Exception:
        # print_exc() already writes the traceback; printing its return
        # value would add a stray "None" line.
        traceback.print_exc()
def main():
    """Script entry point: currently runs test_for_srx().

    Errors are reported by printing the traceback; nothing is raised.
    """
    try:
        print("[INFO]: Programme is running......")
        # Parse the xml and get the result (currently disabled).
        #a_obj1 = XmlParser(XML_PATH1, STOP_WORDS_PATH)
        #a_obj2 = XmlParser(XML_PATH2, STOP_WORDS_PATH)
        #statical_analysis(a_obj1, OUTPUT_PATH1)
        #statical_analysis(a_obj2, OUTPUT_PATH2)
        #author_collaboration_network_analysis(a_obj1, OUTPUT_PATH1)
        test_for_srx()
        print("[INFO]: Programme terminated successfully!")
    except Exception:
        # print_exc() already writes the traceback; printing its return
        # value would add a stray "None" line.
        traceback.print_exc()


main()