Python 将 CSV 数据导入 Neo4j

参考链接:https://github.com/SongX64/movie_recommend_knowleagegraph

import pandas as pd
from neo4j import GraphDatabase

# Connect to the database: create the Neo4j driver over the Bolt protocol.
# NOTE(review): "neo4j"/"neo4j" looks like the stock default login; confirm
# the credentials match the target server.
uri = "bolt://localhost:7687"
driver = GraphDatabase.driver(uri, auth=("neo4j", "neo4j"))

# Tuning parameters read by queries().
# Maximum number of most-similar users (nearest neighbours) to draw
# recommendations from.
k = 10
# Minimum number of movies two users must both have rated before a
# similarity score between them is stored.
moives_common = 3
# Minimum number of similar users that must have rated a movie for it to be
# recommended.
usesrs_common = 2
# Similarity threshold: only scores strictly above this value are stored.
threshold_sim = 0.9


def load_data():
    """Empty the Neo4j graph and rebuild it from the exported CSV files.

    All relationships and nodes are deleted first. The CSV files are then
    imported in a fixed order through ``LOAD CSV``: movies, gradings
    (``User``-``RATED``->``Movie``), genres, keywords and productors. A
    progress message is printed before each import statement is sent.
    """
    # Statements that wipe the existing graph (relationships, then nodes).
    reset_statements = (
        """MATCH ()-[r]->() DELETE r""",
        """MATCH (n) DETACH DELETE n""",
    )

    # (progress message, Cypher import statement) pairs, in execution order.
    import_steps = [
        ("Loading movies ...", """
            LOAD CSV WITH HEADERS FROM "file:///out_movies.csv" AS csv
            CREATE (:Movie {title: csv.title})
        """),
        ("Loading gradings ... ", """
            LOAD CSV WITH HEADERS FROM "file:///out_grade.csv" AS csv
            MERGE(m:Movie {title: csv.title})
            MERGE(u:User {id: toInteger(csv.user_id)})
            CREATE (u)-[:RATED {grading: toInteger(csv.grade)}]->(m)
        """),
        ("Loading genre ...", """
            LOAD CSV WITH HEADERS FROM "file:///out_genre.csv" AS csv
            MERGE (m:Movie {title: csv.title})
            MERGE (g:Genre {genre: csv.genre})
            CREATE (m)-[:HAS_GENRE]->(g)
        """),
        ("Loading keywords ...", """
            LOAD CSV WITH HEADERS FROM "file:///out_keyword.csv" AS csv
            MERGE(m:Movie {title: csv.title})
            MERGE(k:Keyword {keyword: csv.keyword})
            CREATE (m)-[:HAS_KEYWORD]->(k)
        """),
        ("Loading productors ...", """
            LOAD CSV WITH HEADERS FROM "file:///out_productor.csv" AS csv
            MERGE(m:Movie {title: csv.title})
            MERGE(p:Productor {name: csv.productor})
            CREATE (m)-[:HAS_PRODUCTOR]->(p)
        """),
    ]

    with driver.session() as session:
        for statement in reset_statements:
            session.run(statement)

        for progress_message, cypher in import_steps:
            print(progress_message)
            session.run(cypher)


def queries():
    """Interactively recommend movies via user-based collaborative filtering.

    Loops forever. Each round:

    1. reads a user id, the number of movies to recommend, and an optional
       genre filter from stdin;
    2. prints the movies that user has already rated, best grade first;
    3. deletes every SIMILARITY relationship and recomputes the cosine
       similarity between the chosen user and every user sharing at least
       ``moives_common`` rated movies, storing scores above
       ``threshold_sim``;
    4. prints unseen movies rated by the ``k`` most similar users, keeping
       only movies rated by at least ``usesrs_common`` of them and ranking
       by similarity-weighted average grade.

    All values are sent to Neo4j as query parameters rather than being
    formatted into the Cypher text, so genre names containing quotes cannot
    break the query.

    Raises:
        ValueError: if the user id, the movie count, or a 0/1 answer typed
            at a prompt is not an integer (``int()`` is applied directly).
    """
    while True:
        userid = int(input("请输入要为哪位用户推荐电影,输入ID即可:"))
        top_n = int(input("为该用户推荐的电影个数:"))

        # Optional genre filter; an empty list means "no filtering".
        genre = []
        if int(input("是否筛选喜欢的类型?输入0或1: ")):
            with driver.session() as session:
                # Fetch every known genre so the user can choose by index.
                q = session.run("MATCH (g:Genre) RETURN g.genre AS genre")
                available = [r["genre"] for r in q]

            # Show the genres as an indexed table.
            df = pd.DataFrame(available, columns=["genre"])
            print()
            print(df)

            inp = input("请输入喜欢的类型,例如 1 2 3 : ")
            try:
                # split() without arguments tolerates repeated/leading
                # spaces; empty input simply yields no filter.
                genre = [available[int(x)] for x in inp.split()]
            except (ValueError, IndexError):
                # Report only genuine failures (non-numeric token or index
                # out of range) and continue without a genre filter.
                print("Error")
                genre = []

        with driver.session() as session:
            # Movies already rated by the chosen user, highest grade first.
            q = session.run(
                """
                MATCH (u1:User {id: $userid})-[r:RATED]-(m:Movie)
                RETURN m.title AS title, r.grading AS grade
                ORDER BY grade DESC
                """,
                {"userid": userid},
            )
            print()
            print("你评分过的电影如下所示: ")

            rated = [[r["title"], r["grade"]] for r in q]

            if not rated:
                print("没有结果推荐")
            else:
                df = pd.DataFrame(rated, columns=["title", "grade"])
                print()
                print(df.to_string(index=True))
            print("---------------------------------------------------------------------------------------------------")

            # Drop all previously stored similarity relationships.
            session.run(
                """
                MATCH (u1:User)-[s:SIMILARITY]-(u2:User)
                DELETE s
                """
            )

            # Recompute similarity for the chosen user. Two users are linked
            # through a movie both rated (u1 -RATED- movie -RATED- u2); the
            # score is the cosine similarity of their grades over the shared
            # movies: sum(g1 * g2) / (sqrt(sum(g1^2)) * sqrt(sum(g2^2))).
            session.run(
                """
                MATCH (u1:User {id: $userid})-[r1:RATED]-(m:Movie)-[r2:RATED]-(u2:User)
                WITH
                    u1, u2,
                    COUNT(m) AS movies_common,
                    SUM(r1.grading * r2.grading)/(SQRT( SUM(r1.grading^2) ) * SQRT( SUM(r2.grading^2) )) as sim
                WHERE movies_common >= $min_movies AND sim > $threshold
                MERGE (u1)-[s:SIMILARITY]-(u2)
                SET s.sim = sim
                """,
                {
                    "userid": userid,
                    "min_movies": moives_common,
                    "threshold": threshold_sim,
                },
            )

            # Recommend movies the chosen user has not rated. When $genres
            # is empty the genre condition is skipped; otherwise a movie
            # must carry at least one of the selected genres.
            q = session.run(
                """
                MATCH (u1:User {id: $userid})-[s:SIMILARITY]-(u2:User)
                WITH u1,u2,s
                ORDER BY s.sim DESC LIMIT $k
                MATCH (m:Movie)-[r:RATED]-(u2)
                OPTIONAL MATCH (g:Genre)--(m)
                WITH u1,u2,s,m,r, COLLECT(DISTINCT g.genre) AS gen
                WHERE NOT((m)-[:RATED]-(u1))
                    AND (SIZE($genres) = 0 OR ANY(x IN $genres WHERE x IN gen))
                WITH
                    m.title AS title,
                    SUM(r.grading * s.sim)/SUM(s.sim) AS grade,
                    COUNT(u2) AS num,
                    gen
                WHERE num >= $min_users
                RETURN title,grade,num,gen
                ORDER BY grade DESC, num DESC
                LIMIT $top_n
                """,
                {
                    "userid": userid,
                    "k": k,
                    "genres": genre,
                    "min_users": usesrs_common,
                    "top_n": top_n,
                },
            )

            print("推荐的电影:")
            recommended = [
                [r["title"], r["grade"], r["num"], r["gen"]] for r in q
            ]
            if not recommended:
                print("无推荐")
                print()
                continue
            df = pd.DataFrame(
                recommended,
                columns=["title", "avg grade", "num recommenders", "genre"],
            )
            print()
            print(df.to_string(index=True))
            print("---------------------------------------------------------------------------------------------------")


if __name__ == "__main__":
    # Ask whether to rebuild the knowledge graph first: 0 skips the reload,
    # any other integer triggers it.
    if int(input("是否需要重新加载知识图谱? 输入0或1: ")) != 0:
        load_data()
    # Enter the interactive recommendation loop.
    queries()

是否需要重新加载知识图谱? 输入0或1:  1


Loading movies ...
Loading gradings ... 
Loading genre ...
Loading keywords ...
Loading productors ...


请输入要为哪位用户推荐电影,输入ID即可: 944
为该用户推荐的电影个数: 10
是否筛选喜欢的类型?输入0或1:  1



              genre
0            Action
1         Adventure
2           Fantasy
3   Science Fiction
4             Crime
5         Animation
6            Family
7             Drama
8           Romance
9          Thriller
10           Comedy
11          Mystery
12              War
13          History
14          Western
15           Horror
16            Music
17      Documentary
18          Foreign
19         TV Movie


请输入喜欢的类型,例如 1 2 3 :  3


Error

你评分过的电影如下所示: 
没有结果推荐
---------------------------------------------------------------------------------------------------
推荐的电影:
无推荐

你可能感兴趣的:(Neo4J,python,neo4j,开发语言)