简单记录一下。
最近要做一些知识图谱融合方面的研究工作,需要自己构建一些能实际落地应用的知识图谱,用来验证知识图谱融合方法的有效性。本文程序完成的工作是:读取MySQL中的题库数据并导入Neo4j,同时对节点和关系进行去重,避免重复创建节点和关系。如有错误或可优化的地方,请指正。
在运行本篇博客的程序之前,笔者已经从某在线题库网站爬取了约1.8万条高中历史题库数据。数据下载:高中历史题库-18000条-机器学习文档类资源-CSDN下载
# coding:utf-8
from py2neo import Graph, Node, Relationship, NodeMatcher, RelationshipMatcher
import web
import hashlib
# Connect to Neo4j
url = "http://localhost:7474"
username = "neo4j"
password = "test"
graph = Graph(url, auth=(username, password))
# Matchers bound to the graph; the node/relationship helpers in this file use
# them to look up existing entries before creating new ones.
node_matcher = NodeMatcher(graph)
relation_matcher = RelationshipMatcher(graph)
print("neo4j info: {}".format(str(graph)))
# MD5 hash object pre-seeded with the bytes b'KG'.
hl = hashlib.md5(b'KG')
# Connect to the MySQL database (schema 'learn_kg') that query_data reads from
db = web.database(
dbn='mysql',
user='root',
pw='xxxxxxx',
db='learn_kg',
)
# Compute the MD5 hex digest of a string
def get_md5_string(input_str: str) -> str:
    """Return the MD5 hex digest of ``b'KG'`` followed by ``input_str``.

    A fresh hash object is created on every call. Feeding every input into
    one long-lived hash object would make each digest depend on all inputs
    hashed before it, so the same text would never map to the same key and
    could not be used to de-duplicate nodes.

    Args:
        input_str: Text to hash; it is encoded as UTF-8 before hashing.

    Returns:
        The 32-character hexadecimal digest.
    """
    digest = hashlib.md5(b'KG')
    digest.update(input_str.encode(encoding='utf-8'))
    return digest.hexdigest()
# Fetch the question rows from the database
def query_data():
    """Return every row of the ``t_questions_yixuela`` table as a list.

    The iterable produced by ``db.query`` is materialized with ``list``
    before being handed back to the caller.
    """
    return list(db.query("select * from t_questions_yixuela"))
# Build the graph from the question rows
def handle_data():
    """Import every question row into Neo4j as a chain of linked nodes.

    First links the fixed ``Period`` root to the ``subject`` node. Then, for
    each row returned by ``query_data``, links
    ``subject -> point -> point -> point -> question -> answer``.
    Existence checks for nodes and relationships are done by ``add_to_neo``.

    The ``point`` column is split on ``@@`` and the first three parts are
    used as the knowledge-point levels. Rows whose ``point`` value is empty
    or has fewer than three parts are reported and skipped, so one malformed
    row does not abort the whole import with an ``AttributeError`` or
    ``IndexError``.
    """
    root_node = {'label': 'Period', 'name': '初中'}
    history_node = {'label': 'subject', 'name': '初中历史'}
    add_to_neo(root_node, history_node, '子学科')

    for res in query_data():
        question = res['question_info']
        answer = res['answer']
        points = res['point']

        levels = points.split("@@") if points else []
        if len(levels) < 3:
            print(f'skip row, expected 3 point levels but got: {points!r}')
            continue
        # Only the first three levels are used; any further parts are ignored.
        first_point, second_point, third_point = levels[:3]

        first_node = {'label': 'point', 'name': first_point}
        second_node = {'label': 'point', 'name': second_point}
        third_node = {'label': 'point', 'name': third_point}
        add_to_neo(history_node, first_node, '子知识点')
        add_to_neo(first_node, second_node, '子知识点')
        add_to_neo(second_node, third_node, '子知识点')
        print(f'{first_point}-> {second_point}-> {third_point}')

        # Insert the question: the node is keyed by the MD5 of its text and
        # the full text is stored in the 'info' property.
        question_md5 = get_md5_string(question)
        add_to_neo(third_node,
                   {'label': 'question', 'name': question_md5, 'info': question}, '题目')

        # Insert the answer.
        # NOTE(review): the answer is stored under the 'point' label, so an
        # answer whose text equals a knowledge-point name resolves to that
        # same node — confirm this is intended.
        add_to_neo({'label': 'question', 'name': question_md5}, {'label': 'point', 'name': answer}, '答案')
# Add a relationship to Neo4j
def add_relation_to_neo(start_node_item, end_node_item, relation_str):
    """Create a ``relation_str`` relationship between two existing nodes.

    Both endpoints are looked up by their ``'label'`` and ``'name'`` entries.
    Nothing is created when either endpoint is not found, or when a
    relationship of this type is already matched for the pair.
    """
    source = node_matcher.match(start_node_item['label'], name=start_node_item['name']).first()
    target = node_matcher.match(end_node_item['label'], name=end_node_item['name']).first()
    if source is None or target is None:
        return

    existing = relation_matcher.match([source, target], r_type=relation_str).first()
    if existing is not None:
        return

    graph.create(Relationship(source, relation_str, target))
    print(f"create relations.{relation_str}, start_node:{str(source)}, end_node:{str(target)}")
# Add two nodes and the relationship between them to Neo4j
def add_to_neo(start_node, end_node, relation):
    """Add both endpoint nodes, then the relationship from start to end.

    ``start_node`` and ``end_node`` are dicts passed unchanged to
    ``add_node_to_neo`` and ``add_relation_to_neo``; ``relation`` is the
    relationship type string.
    """
    for endpoint in (start_node, end_node):
        add_node_to_neo(endpoint)
    add_relation_to_neo(start_node, end_node, relation)
# Add a node to Neo4j
def add_node_to_neo(node_items):
    """Create the node described by ``node_items`` unless one is matched.

    ``node_items`` must contain ``'label'`` and ``'name'``. If a node with
    that label and ``name`` is found, nothing is done; otherwise a node is
    created with every entry except ``'label'`` as a property.
    """
    label = node_items['label']
    existing = node_matcher.match(label, name=node_items['name']).first()
    if existing is not None:
        return

    properties = dict(node_items)
    del properties['label']
    graph.create(Node(label, **properties))
# Run the import only when this file is executed as a script
if __name__ == "__main__":
    handle_data()