Neo4j 在线增量批量插入三元组

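1. 用 Python 代码生成节点文件 dwords.csv / cwords.csv,以及关系文件 drelations_*.csv / crelations_*.csv(d 前缀:代项 daixiang、用项 yongxiang、上位词 shangweici、下位词 xiaweici、兄弟 xiongdi、相关 xiangguan;c 前缀:代项、用项、相关)。注意:每种关系单独生成一个文件、单独导入!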

# Build the triples (left_word, right_word, relation) and the word -> id map
# from d2019.txt.
# The regexes are raw strings compiled once, outside the loops; the previous
# non-raw '[^\|]+' contained the invalid escape sequence '\|'.
BEFORE_PIPE_RE = re.compile(r'[^|]+')
MH_RE = re.compile(r'MH = ([^"]+)')
ENTRY_RE = re.compile(r'ENTRY = ([^"]+)')
PRINT_ENTRY_RE = re.compile(r'PRINT ENTRY = ([^"]+)')
MN_RE = re.compile(r'MN = ([^"]+)')
FX_RE = re.compile(r'FX = ([^"]+)')

with open('d2019.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()  # not named `all`, which would shadow the builtin
records = raw_text.split('*NEWRECORD')
tuples = []  # (left_word, right_word, relation)
mapd = {}    # word -> id
# records[0] is the text before the first '*NEWRECORD' marker and is skipped.
# `i` numbers the records from 1 and stays bound after the loop, where the
# relation-building step keeps incrementing it to allocate further "D<n>" ids.
for i, record in enumerate(records[1:], start=1):
    idcount = 0  # per-record counter used as the "_<n>" suffix of entry ids
    for item in record.split('\n'):
        if item.startswith('MH = '):
            heading = MH_RE.findall(item)[0]
            mapd[heading] = "D" + str(i)
        if item.startswith('ENTRY = '):
            # Keep only the text before the first '|', then drop the field name.
            entry = BEFORE_PIPE_RE.findall(item)[0]
            entry = ENTRY_RE.findall(entry)[0]
            tuples.append((heading, entry, '代项'))
            tuples.append((entry, heading, '用项'))
            # Only words that do not have an id yet get one.
            if not mapd.get(entry, 0):
                idcount = idcount + 1
                mapd[entry] = mapd[heading] + "_" + str(idcount)
        if item.startswith('PRINT ENTRY = '):
            printentry = BEFORE_PIPE_RE.findall(item)[0]
            printentry = PRINT_ENTRY_RE.findall(printentry)[0]
            tuples.append((heading, printentry, '代项'))
            tuples.append((printentry, heading, '用项'))
            if not mapd.get(printentry, 0):
                idcount = idcount + 1
                mapd[printentry] = mapd[heading] + "_" + str(idcount)
        if item.startswith('MN = '):
            nid = MN_RE.findall(item)[0]
            # find_hypernym / find_hyponym / find_homoionym are defined
            # elsewhere; each is expected to return an iterable of words for
            # the given MN value (NOTE(review): confirm against their definitions).
            tuples.extend((heading, hypernym, '上位词') for hypernym in find_hypernym(nid))
            tuples.extend((heading, hyponym, '下位词') for hyponym in find_hyponym(nid))
            tuples.extend((heading, homoionym, '兄弟') for homoionym in find_homoionym(nid))
        if item.startswith('FX = '):
            relate = FX_RE.findall(item)[0]
            # No id is allocated for `relate` here; it must already be (or
            # later become) a key of mapd for the relation rows to be built.
            tuples.append((heading, relate,  '相关'))
logger.debug('tuples(left_word, right_word, relation)已生成')
logger.debug('mapd{word:id}已生成')

# Turn every triple into an id-based row [from_id, type, to_id], collected in
# one list per relation type (these lists are later written to the CSV files).
relations_daixiang = []
relations_yongxiang = []
relations_shangweici = []
relations_xiaweici = []
relations_xiongdi = []
relations_xiangguan = []

# relation label -> (destination list, allocate an id for an unseen right word?)
rows_by_relation = {
    "代项": (relations_daixiang, False),
    "用项": (relations_yongxiang, False),
    "上位词": (relations_shangweici, True),
    "下位词": (relations_xiaweici, True),
    "兄弟": (relations_xiongdi, True),
    "相关": (relations_xiangguan, False),
}

for left_word, right_word, relation in tuples:
    destination = rows_by_relation.get(relation)
    if destination is None:
        # Triples with any other label are ignored.
        continue
    rows, allocate_id = destination
    if allocate_id and not mapd.get(right_word, 0):
        # Continue the "D<n>" numbering from the last record index.
        i = i + 1
        mapd[right_word] = "D" + str(i)
    rows.append([mapd[left_word], relation, mapd[right_word]])
logger.debug('relations[[from_id, type, to_id]...]已生成')

# Flatten the word -> id mapping into [name, identification] rows and write
# them to dwords.csv.
words = [[name, identification] for name, identification in mapd.items()]
logger.debug('words[[name, identification]...]已生成')
with open("dwords.csv", "w", newline='', encoding='utf-8') as words_file:
    words_writer = csv.writer(words_file)
    words_writer.writerow(["name", "id"])  # header row goes first
    words_writer.writerows(words)  # then all data rows in one call
logger.debug('dwords.csv已生成')

# Write one CSV file per relation type; every file shares the same header.
relation_files = (
    ("drelations_daixiang.csv", relations_daixiang),
    ("drelations_yongxiang.csv", relations_yongxiang),
    ("drelations_shangweici.csv", relations_shangweici),
    ("drelations_xiaweici.csv", relations_xiaweici),
    ("drelations_xiongdi.csv", relations_xiongdi),
    ("drelations_xiangguan.csv", relations_xiangguan),
)
for filename, rows in relation_files:
    with open(filename, "w", newline='', encoding='utf-8') as relation_file:
        relation_writer = csv.writer(relation_file)
        relation_writer.writerow(["from_id", "type", "to_id"])
        relation_writer.writerows(rows)
logger.debug('drelations.csv已生成')


# Build triples from c2019.txt, then export cwords.csv and crelations_*.csv.
# Changes from the previous version of this script:
#   * the word -> id dict is called `mapc` (it used to be `map`, shadowing the
#     builtin), mirroring `mapd` in the d2019 script;
#   * the file text is no longer bound to `all` (also a builtin);
#   * regexes are raw strings compiled once (the old '[^\|]+' contained the
#     invalid escape sequence '\|');
#   * the repeated CSV-writing blocks are a single loop.
UP_TO_PIPE_RE = re.compile(r'[^|]+')
NM_RE = re.compile(r'NM = ([^"]+)')
SY_RE = re.compile(r'SY = ([^"]+)')
HM_RE = re.compile(r'HM = ([^"]+)')
PA_RE = re.compile(r'PA = ([^"]+)')

with open('c2019.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()
records = raw_text.split('*NEWRECORD')
tuples = []  # (left_word, right_word, relation)
mapc = {}    # word -> id
# records[0] is the text before the first '*NEWRECORD' marker and is skipped.
for i, record in enumerate(records[1:], start=1):
    idcount = 0  # per-record counter used as the "_<n>" suffix of derived ids
    for item in record.split('\n'):
        if item.startswith('NM = '):
            heading = NM_RE.findall(item)[0]
            mapc[heading] = "C" + str(i)
        if item.startswith('SY = '):
            # Keep only the text before the first '|', then drop the field name.
            entry = UP_TO_PIPE_RE.findall(item)[0]
            entry = SY_RE.findall(entry)[0]
            tuples.append((heading, entry, '代项'))
            tuples.append((entry, heading, '用项'))
            # Only words that do not have an id yet get one.
            if not mapc.get(entry, 0):
                idcount = idcount + 1
                mapc[entry] = mapc[heading] + "_" + str(idcount)
        if item.startswith('HM = '):
            # Leading/trailing '*' characters are stripped from the HM value.
            relate = HM_RE.findall(item)[0].strip("*")
            tuples.append((heading, relate,  '相关'))
            if not mapc.get(relate, 0):
                idcount = idcount + 1
                mapc[relate] = mapc[heading] + "_" + str(idcount)
        if item.startswith('PA = '):
            relate = PA_RE.findall(item)[0]
            tuples.append((heading, relate,  '相关'))
            if not mapc.get(relate, 0):
                idcount = idcount + 1
                mapc[relate] = mapc[heading] + "_" + str(idcount)

# Turn every triple into an id-based row [from_id, type, to_id], collected in
# one list per relation type.
relations_daixiang = []
relations_yongxiang = []
relations_xiangguan = []
rows_by_relation = {
    "代项": relations_daixiang,
    "用项": relations_yongxiang,
    "相关": relations_xiangguan,
}
for left_word, right_word, relation in tuples:
    rows = rows_by_relation.get(relation)
    if rows is not None:
        rows.append([mapc[left_word], relation, mapc[right_word]])
logger.debug('relations[[from_id, type, to_id]...]已生成')

# Flatten the word -> id mapping into [name, id] rows and write cwords.csv.
words = [[name, identification] for name, identification in mapc.items()]
logger.debug('words[[name, id]...]已生成')
with open("cwords.csv", "w", newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["name", "id"])  # header row goes first
    writer.writerows(words)          # then all data rows in one call
logger.debug('cwords.csv已生成')

# Write one CSV file per relation type; every file shares the same header.
relation_files = (
    ("crelations_daixiang.csv", relations_daixiang),
    ("crelations_yongxiang.csv", relations_yongxiang),
    ("crelations_xiangguan.csv", relations_xiangguan),
)
for filename, rows in relation_files:
    with open(filename, "w", newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["from_id", "type", "to_id"])
        writer.writerows(rows)
logger.debug('crelations.csv已生成')

2.插入节点

// For each row of dwords.csv, MERGE a :word node whose `name` and
// `identification` properties come from the CSV columns `name` and `id`.
// MERGE creates the node only when no node with both values exists yet.
LOAD CSV WITH HEADERS FROM "file:///dwords.csv" AS line
MERGE (w:word{name:line.name,identification:line.id})

3.插入关系

// One statement per relation file. Each statement ends with ';' so the six
// can be run one after another as separate statements instead of being parsed
// as a single, invalid query.
// Each row links the :word node whose `identification` equals from_id to the
// one whose `identification` equals to_id; rows where either node is missing
// match nothing and create no relationship.
LOAD CSV WITH HEADERS FROM "file:///drelations_daixiang.csv" AS line
match (a:word{identification:line.from_id}),(b:word{identification:line.to_id})
merge (a)-[r:代项]->(b);

LOAD CSV WITH HEADERS FROM "file:///drelations_yongxiang.csv" AS line
match (a:word{identification:line.from_id}),(b:word{identification:line.to_id})
merge (a)-[r:用项]->(b);

LOAD CSV WITH HEADERS FROM "file:///drelations_shangweici.csv" AS line
match (a:word{identification:line.from_id}),(b:word{identification:line.to_id})
merge (a)-[r:上位词]->(b);

LOAD CSV WITH HEADERS FROM "file:///drelations_xiaweici.csv" AS line
match (a:word{identification:line.from_id}),(b:word{identification:line.to_id})
merge (a)-[r:下位词]->(b);

LOAD CSV WITH HEADERS FROM "file:///drelations_xiongdi.csv" AS line
match (a:word{identification:line.from_id}),(b:word{identification:line.to_id})
merge (a)-[r:兄弟]->(b);

LOAD CSV WITH HEADERS FROM "file:///drelations_xiangguan.csv" AS line
match (a:word{identification:line.from_id}),(b:word{identification:line.to_id})
merge (a)-[r:相关]->(b);

你可能感兴趣的:(搜索)