# coding: utf-8
import random
import time
import csv
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
# Step 1: read the source edge list and write the selected columns back out
# to raw_data.csv.  Later steps compare rows read from raw_data.csv with rows
# read from sele.csv, so both files must have gone through the same CSV
# round trip to carry identical numeric precision.
header = ['Nodeid1','Nodeid2','author_degree1','author_degree2','No','isBD']

data = pd.read_csv('/home/henson/Desktop/huanping/huanping.csv_EDGE_NBD.csv', encoding='gb18030')  # dataset path
X = np.array(data[header])

# Column views; the following step of the script reads these names.
Nodeid1 = X[:, 0]
Nodeid2 = X[:, 1]
author_degree1 = X[:, 2]
author_degree2 = X[:, 3]
No = X[:, 4]
isBD = X[:, 5]

# One tuple per input row, in header order.
data1 = list(zip(Nodeid1, Nodeid2, author_degree1, author_degree2, No, isBD))

# "w" instead of the former "a+": in append mode every re-run added another
# header row and a duplicate copy of the data to the file.  newline='' is
# what the csv module asks for on files handed to csv.writer.  The `with`
# block closes the file even if a write fails.
with open("raw_data.csv", "w", encoding='utf-8', newline='') as f1:
    writer_csv_1 = csv.writer(f1)
    writer_csv_1.writerow(header)
    writer_csv_1.writerows(data1)
# Step 2: write only the rows labelled isBD == 1 to sele.csv, using the same
# values (and therefore the same precision) that were written to
# raw_data.csv, so the two files can be compared row by row afterwards.
data2 = [
    row
    for row in zip(Nodeid1, Nodeid2, author_degree1, author_degree2, No, isBD)
    if row[-1] == 1  # last field is the isBD label
]

# "w" instead of the former "a+": appending on a re-run would stack a second
# header and stale rows into sele.csv.  newline='' is what the csv module
# asks for; the `with` block guarantees the file is closed.
with open("sele.csv", "w", encoding='utf-8', newline='') as f2:
    writer_csv_2 = csv.writer(f2)
    writer_csv_2.writerow(header)
    writer_csv_2.writerows(data2)
# Step 3: append to sele.csv every row of raw_data.csv that has label
# isBD == 0 AND whose feature vector (author_degree1, author_degree2, No)
# is not equal to the feature vector of any row already in sele.csv.
# After this step sele.csv holds the label-1 rows followed by the selected
# label-0 rows.
feature_cols = ['author_degree1', 'author_degree2', 'No']

# Rows currently in sele.csv.
sele_frame = pd.read_csv('sele.csv', encoding='utf-8')
X1 = np.array(sele_frame[feature_cols])
# Label column as a 1-D array.  (The original rebound this name to the whole
# feature matrix via `X1[:,]`, which only worked because just its length
# was used.)
isBD1 = np.array(sele_frame[['isBD']])[:, 0]
degreex1, degreex2, Nox = X1[:, 0], X1[:, 1], X1[:, 2]

# Rows of the full data set, re-read from raw_data.csv.
data4 = pd.read_csv('raw_data.csv', encoding='utf-8')
nodeid1 = np.array(data4[['Nodeid1']])[:, 0]
nodeid2 = np.array(data4[['Nodeid2']])[:, 0]
X = np.array(data4[feature_cols])
isBD = np.array(data4[['isBD']])[:, 0]
degree1, degree2, No = X[:, 0], X[:, 1], X[:, 2]

# Set of feature tuples found in sele.csv.  Membership in this set replaces
# the former row-by-row scan of X1 for every candidate row, and it also
# covers the case where sele.csv has no data rows (the old flag variable
# was never assigned in that case).
positive_features = set(map(tuple, X1.tolist()))

# Keep the values taken from the numpy arrays for writing, so the appended
# rows are formatted the same way as the rows written in the earlier steps.
data3 = [
    (nodeid1[i], nodeid2[i], X[i, 0], X[i, 1], X[i, 2], isBD[i])
    for i, features in enumerate(X.tolist())
    if isBD[i] == 0 and tuple(features) not in positive_features
]

# Append (no header) so the label-0 rows follow the existing label-1 rows.
with open("sele.csv", "a", encoding='utf-8', newline='') as f3:
    writer_csv_3 = csv.writer(f3)
    writer_csv_3.writerows(data3)
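# Summary of the pipeline:
#   1. Write the raw data out to CSV once more (raw_data.csv) so that numeric
#      precision is consistent and rows can be compared.
#   2. Select the rows with label 1, at the same precision as raw_data.csv,
#      and write them to sele.csv for comparison.
#   3. From raw_data.csv and sele.csv, pick the rows whose input feature
#      vector differs from every label-1 row (i.e. rows with label 0 whose
#      feature vector matches no label-1 row) and append them to sele.csv.
# The resulting sele.csv can be used as a training set to fit a model.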