1. Python 代码生成节点文件 dwords.csv、cwords.csv,以及关系文件 drelations_*.csv(daixiang、yongxiang、shangweici、xiaweici、xiongdi、xiangguan)和 crelations_*.csv(daixiang、yongxiang、xiangguan)。注意:每种关系单独导入!
# Generate the (left_word, right_word, relation) triples and the word -> id
# map from the d2019.txt record dump.
with open('d2019.txt', 'r', encoding='utf-8') as f:
    content = f.read()
records = content.split('*NEWRECORD')
tuples = []  # (left_word, right_word, relation)
mapd = {}    # word -> id
# records[0] is whatever precedes the first '*NEWRECORD' marker, so it is
# skipped and record numbering starts at 1.  The counter keeps the name `i`
# because the relation-building step that follows continues numbering from it.
for i, record in enumerate(records[1:], 1):
    idcount = 0  # per-record counter for the "_<n>" suffix of entry-term ids
    for item in record.split('\n'):
        if item.startswith('MH = '):
            # Main heading of the record: gets the record-level id "D<i>".
            heading = re.findall(r'MH = ([^"]+)', item)[0]
            mapd[heading] = "D" + str(i)
        elif item.startswith(('ENTRY = ', 'PRINT ENTRY = ')):
            # Only the text before the first '|' is the term itself.
            # Raw strings avoid the invalid "\|" string escape.
            term_field = re.findall(r'[^|]+', item)[0]
            entry = re.findall(r'ENTRY = ([^"]+)', term_field)[0]
            tuples.append((heading, entry, '代项'))
            tuples.append((entry, heading, '用项'))
            if entry not in mapd:
                # First time this term is seen: derive its id from the heading.
                idcount = idcount + 1
                mapd[entry] = mapd[heading] + "_" + str(idcount)
        elif item.startswith('MN = '):
            nid = re.findall(r'MN = ([^"]+)', item)[0]
            # find_hypernym / find_hyponym / find_homoionym are defined
            # elsewhere; each is iterated as a collection of words.
            for hypernym in find_hypernym(nid):
                tuples.append((heading, hypernym, '上位词'))
            for hyponym in find_hyponym(nid):
                tuples.append((heading, hyponym, '下位词'))
            for homoionym in find_homoionym(nid):
                tuples.append((heading, homoionym, '兄弟'))
        elif item.startswith('FX = '):
            relate = re.findall(r'FX = ([^"]+)', item)[0]
            tuples.append((heading, relate, '相关'))
logger.debug('tuples(left_word, right_word, relation)已生成')
logger.debug('mapd{word:id}已生成')
# Build one [[from_id, type, to_id], ...] list per relation type.
relations_daixiang = []
relations_yongxiang = []
relations_shangweici = []
relations_xiaweici = []
relations_xiongdi = []
relations_xiangguan = []
relations_by_type = {
    "代项": relations_daixiang,
    "用项": relations_yongxiang,
    "上位词": relations_shangweici,
    "下位词": relations_xiaweici,
    "兄弟": relations_xiongdi,
    "相关": relations_xiangguan,
}
# For these relation types a right-hand word without an id gets a fresh
# "D<n>" id, continuing the counter `i` left over from the record loop.
hierarchy_types = ("上位词", "下位词", "兄弟")
for left_word, right_word, relation in tuples:
    bucket = relations_by_type.get(relation)
    if bucket is None:
        # Unknown relation type: nothing to record.
        continue
    if relation in hierarchy_types and not mapd.get(right_word, 0):
        i = i + 1
        mapd[right_word] = "D" + str(i)
    bucket.append([mapd[left_word], relation, mapd[right_word]])
logger.debug('relations[[from_id, type, to_id]...]已生成')
# Flatten mapd into words [[name, identification], ...] and write dwords.csv.
words = [[name, identification] for name, identification in mapd.items()]
logger.debug('words[[name, identification]...]已生成')
with open("dwords.csv", "w", newline='', encoding='utf-8') as words_file:
    words_writer = csv.writer(words_file)
    # Header row first, then one row per word.
    words_writer.writerows([["name", "id"], *words])
logger.debug('dwords.csv已生成')
# Write one CSV per relation type, each with the same three-column header.
relation_files = (
    ("drelations_daixiang.csv", relations_daixiang),
    ("drelations_yongxiang.csv", relations_yongxiang),
    ("drelations_shangweici.csv", relations_shangweici),
    ("drelations_xiaweici.csv", relations_xiaweici),
    ("drelations_xiongdi.csv", relations_xiongdi),
    ("drelations_xiangguan.csv", relations_xiangguan),
)
for filename, rows in relation_files:
    with open(filename, "w", newline='', encoding='utf-8') as relations_file:
        relations_writer = csv.writer(relations_file)
        relations_writer.writerow(["from_id", "type", "to_id"])
        relations_writer.writerows(rows)
logger.debug('drelations.csv已生成')
# Generate the (left_word, right_word, relation) triples and the word -> id
# map from the c2019.txt record dump.
with open('c2019.txt', 'r', encoding='utf-8') as f:
    content = f.read()
records = content.split('*NEWRECORD')
tuples = []  # (left_word, right_word, relation)
# NOTE: `map` shadows the builtin; the name is kept because the
# relation-building and CSV-writing steps below read it.
map = {}     # word -> id
# records[0] is whatever precedes the first '*NEWRECORD' marker, so it is
# skipped and record numbering starts at 1.
for i, record in enumerate(records[1:], 1):
    idcount = 0  # per-record counter for the "_<n>" suffix of derived ids
    for item in record.split('\n'):
        if item.startswith('NM = '):
            # Name of the record: gets the record-level id "C<i>".
            heading = re.findall(r'NM = ([^"]+)', item)[0]
            map[heading] = "C" + str(i)
        elif item.startswith('SY = '):
            # Only the text before the first '|' is the term itself.
            # Raw strings avoid the invalid "\|" string escape.
            term_field = re.findall(r'[^|]+', item)[0]
            entry = re.findall(r'SY = ([^"]+)', term_field)[0]
            tuples.append((heading, entry, '代项'))
            tuples.append((entry, heading, '用项'))
            if entry not in map:
                idcount = idcount + 1
                map[entry] = map[heading] + "_" + str(idcount)
        elif item.startswith(('HM = ', 'PA = ')):
            if item.startswith('HM = '):
                # HM values have leading/trailing '*' characters stripped.
                relate = re.findall(r'HM = ([^"]+)', item)[0].strip("*")
            else:
                relate = re.findall(r'PA = ([^"]+)', item)[0]
            tuples.append((heading, relate, '相关'))
            if relate not in map:
                # The related word has no id yet: derive one from the heading.
                idcount = idcount + 1
                map[relate] = map[heading] + "_" + str(idcount)
# Build one [[from_id, type, to_id], ...] list per relation type.
relations_daixiang = []
relations_yongxiang = []
relations_xiangguan = []
relations_by_type = {
    "代项": relations_daixiang,
    "用项": relations_yongxiang,
    "相关": relations_xiangguan,
}
for left_word, right_word, relation in tuples:
    bucket = relations_by_type.get(relation)
    if bucket is not None:
        bucket.append([map[left_word], relation, map[right_word]])
logger.debug('relations[[from_id, type, to_id]...]已生成')
# Flatten map into words [[name, id], ...] and write cwords.csv.
words = [[name, identification] for name, identification in map.items()]
logger.debug('words[[name, id]...]已生成')
with open("cwords.csv", "w", newline='', encoding='utf-8') as words_file:
    words_writer = csv.writer(words_file)
    # Header row first, then one row per word.
    words_writer.writerows([["name", "id"], *words])
logger.debug('cwords.csv已生成')
# Write one CSV per relation type, each with the same three-column header.
relation_files = (
    ("crelations_daixiang.csv", relations_daixiang),
    ("crelations_yongxiang.csv", relations_yongxiang),
    ("crelations_xiangguan.csv", relations_xiangguan),
)
for filename, rows in relation_files:
    with open(filename, "w", newline='', encoding='utf-8') as relations_file:
        relations_writer = csv.writer(relations_file)
        relations_writer.writerow(["from_id", "type", "to_id"])
        relations_writer.writerows(rows)
logger.debug('crelations.csv已生成')
2.插入节点
// Create (or reuse) one :word node per row of dwords.csv.
// The CSV column "id" is stored in the node property "identification".
LOAD CSV WITH HEADERS FROM "file:///dwords.csv" AS row
MERGE (:word {name: row.name, identification: row.id})
3.插入关系
// One import statement per relation type; run each statement on its own.
// Every statement looks up both endpoint nodes by their "identification"
// property and merges a directed relationship from from_id to to_id.

// 代项
LOAD CSV WITH HEADERS FROM "file:///drelations_daixiang.csv" AS row
MATCH (src:word {identification: row.from_id})
MATCH (dst:word {identification: row.to_id})
MERGE (src)-[:代项]->(dst)

// 用项
LOAD CSV WITH HEADERS FROM "file:///drelations_yongxiang.csv" AS row
MATCH (src:word {identification: row.from_id})
MATCH (dst:word {identification: row.to_id})
MERGE (src)-[:用项]->(dst)

// 上位词
LOAD CSV WITH HEADERS FROM "file:///drelations_shangweici.csv" AS row
MATCH (src:word {identification: row.from_id})
MATCH (dst:word {identification: row.to_id})
MERGE (src)-[:上位词]->(dst)

// 下位词
LOAD CSV WITH HEADERS FROM "file:///drelations_xiaweici.csv" AS row
MATCH (src:word {identification: row.from_id})
MATCH (dst:word {identification: row.to_id})
MERGE (src)-[:下位词]->(dst)

// 兄弟
LOAD CSV WITH HEADERS FROM "file:///drelations_xiongdi.csv" AS row
MATCH (src:word {identification: row.from_id})
MATCH (dst:word {identification: row.to_id})
MERGE (src)-[:兄弟]->(dst)

// 相关
LOAD CSV WITH HEADERS FROM "file:///drelations_xiangguan.csv" AS row
MATCH (src:word {identification: row.from_id})
MATCH (dst:word {identification: row.to_id})
MERGE (src)-[:相关]->(dst)