Python3处理文档中的作者之间的关系示例

我们在做学术文档数据处理的过程中,难免会遇到数据预处理的问题,最近我用python处理了社交网络的图结构的数据,我把数据处理过程分享给大家,我的数据格式为:

4661 5803 1
165223 16335 1
70014 592188 1
127668 31434 1
121512 139585 1
69671 69669 1
584423 123509 1
62950 30194 1
4059 159837 1
149579 133853 1
357503 469035 1
457747 68752 1
19572 29581 1

我解释一下数据:每一行数据依次为(结点,结点,权重),整个文档存的是一个带权重的有向图的网络结构。我的python文件名为network_graph_filter.py

我的数据预处理代码为:

import json
class Nodes_Util(object):
    """Degree statistics and above-average filtering for a weighted edge list.

    Every edge is one text line of the form ``"node1 node2 weight"``
    (whitespace separated).  The first field is tallied in ``in_node`` and
    the second field in ``out_node``.

    NOTE(review): if the edge list follows the usual "source target"
    convention, the first field would normally contribute to the
    *out*-degree.  The naming is kept as-is for compatibility -- confirm
    the intended direction against the data set.
    """

    def __init__(self):
        # node id (str) -> number of edges whose first field is that node
        self.in_node = {}
        # node id (str) -> number of edges whose second field is that node
        self.out_node = {}
        # number of edges counted so far (one increment per edge, each)
        self.in_degree_total = 0
        self.out_degree_total = 0
        # last dictionary read back by load_json()
        self.load_data = {}

    @staticmethod
    def _parse_edge(edge):
        """Split one edge line into ``(node1, node2, weight)``.

        Returns ``None`` for blank / whitespace-only lines so callers can
        skip them.  Raises ``ValueError`` when a non-blank line does not
        contain exactly three fields.
        """
        # split() without an argument strips the trailing newline and
        # tolerates tabs or repeated spaces between the fields.
        fields = edge.split()
        if not fields:
            return None
        node1, node2, weight = fields
        return node1, node2, weight

    @staticmethod
    def _keep_above(degrees, threshold, path):
        """Return the entries of *degrees* strictly above *threshold*.

        The kept entries are also written to *path*, one ``"node count"``
        pair per line.
        """
        kept = {}
        with open(path, "w") as out_file:
            for node, count in degrees.items():
                if count > threshold:
                    kept[node] = count
                    out_file.write(node + " " + str(count) + "\n")
        return kept

    def calculate_frequency(self, edges):
        """Accumulate per-node counts from an iterable of edge lines.

        Blank lines are ignored.  Counts are added on top of whatever has
        already been accumulated on this instance.

        Raises:
            ValueError: if a non-blank line does not have three fields.
        """
        for edge in edges:
            parsed = self._parse_edge(edge)
            if parsed is None:
                continue
            node1, node2, _weight = parsed

            # first field -> "in" statistics
            self.in_node[node1] = self.in_node.get(node1, 0) + 1
            self.in_degree_total += 1

            # second field -> "out" statistics
            self.out_node[node2] = self.out_node.get(node2, 0) + 1
            self.out_degree_total += 1

    def filter(self):
        """Keep only nodes whose count is strictly above the mean.

        Prints both means, writes the surviving nodes to ``in_degree.txt``
        and ``out_degree.txt``, and replaces ``in_node`` / ``out_node`` with
        the reduced dictionaries.
        """
        mean_in_degree_value = self.mean_in_degree()
        mean_out_degree_value = self.mean_out_degree()
        print("mean in value is " + str(mean_in_degree_value))
        print("mean out value is " + str(mean_out_degree_value))

        self.in_node = self._keep_above(
            self.in_node, mean_in_degree_value, "in_degree.txt")
        self.out_node = self._keep_above(
            self.out_node, mean_out_degree_value, "out_degree.txt")

    def mean_in_degree(self):
        """Return ``in_degree_total / len(in_node)``, or 0.0 when empty."""
        if not self.in_node:
            return 0.0
        return self.in_degree_total / float(len(self.in_node))

    def mean_out_degree(self):
        """Return ``out_degree_total / len(out_node)``, or 0.0 when empty."""
        if not self.out_node:
            return 0.0
        return self.out_degree_total / float(len(self.out_node))

    def is_in_node_Exists(self, node):
        """Return True if *node* is a key of ``in_node``."""
        return node in self.in_node

    def is_out_node_Exists(self, node):
        """Return True if *node* is a key of ``out_node``."""
        return node in self.out_node

    def export_json(self):
        """Dump ``in_node`` to in_node.json and ``out_node`` to out_node.json."""
        # "with" guarantees the handles are closed even if a write fails.
        with open('in_node.json', 'w') as file_object:
            file_object.write(json.dumps(self.in_node))
        with open('out_node.json', 'w') as file_object:
            file_object.write(json.dumps(self.out_node))

    def load_json(self):
        """Read in_node.json into ``load_data`` and return it."""
        with open('in_node.json', 'r') as f:
            self.load_data = json.load(f)
        # The message now names the file that is actually loaded.
        print("load...in_node.json")
        return self.load_data

    def fileter_weights_graph(self):
        """Copy the edges that touch a surviving node to the final file.

        Reads ``weights_graph_relationship.txt`` and writes every edge whose
        first field is in ``in_node`` or whose second field is in
        ``out_node`` to ``weights_graph_relationship_final.txt``.  Blank
        lines are skipped.

        Raises:
            ValueError: if a non-blank line does not have three fields.
        """
        # The input is opened first so a missing input file does not leave
        # behind a freshly truncated output file.
        with open("weights_graph_relationship.txt", "r") as source, \
                open("weights_graph_relationship_final.txt", "w") as final_output:
            for edge in source:
                parsed = self._parse_edge(edge)
                if parsed is None:
                    continue
                node1, node2, _weight = parsed
                if node1 in self.in_node or node2 in self.out_node:
                    final_output.write(edge)
        print("convert finished!")



def main():
    """Run the pipeline on weights_graph_relationship.txt.

    Reads every edge line, accumulates the per-node counts, drops the
    nodes that are not above the mean, then writes the filtered edge list.
    """
    util = Nodes_Util()
    with open("weights_graph_relationship.txt", "r") as source:
        edge_lines = source.readlines()
        util.calculate_frequency(edge_lines)
        util.filter()
        util.fileter_weights_graph()

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

代码的主要功能为计算每个结点的入度和出度,然后计算所有结点的平均入度和平均出度,只保留入度或出度大于平均值的结点,读者可以根据需要进行修改。代码中每个方法基本只负责一件事,代码不长,命名都很清楚地表达了作用和功能。



你可能感兴趣的:(python学习)