pyspark dataframe 读写MySQL

1、定义MySQL的配置

# MySQL connection settings shared by the Spark JDBC reader/writer and the
# SQLAlchemy engine below.
# NOTE(review): "com.mysql.jdbc.Driver" is the legacy Connector/J 5.x class
# name; Connector/J 8+ renamed it to "com.mysql.cj.jdbc.Driver" — confirm
# which connector jar is on the Spark classpath.
self.db_config = {
    "driver": "com.mysql.jdbc.Driver",
    "user": "poctest",
    "password": "123",
    "port": "3306",
    "host": "0.0.0.0",
    "database": "db",
}
# Bug fix: the original hard-coded "jdbc:mysql://{host}:{port}/db" and never
# formatted it, so the JDBC driver received the literal text "{host}:{port}"
# and could not connect. Build the URL from the values above instead.
self.db_config["url"] = "jdbc:mysql://{host}:{port}/{database}".format(**self.db_config)
self.sql_engine = create_engine(
    "mysql+pymysql://{user}:{pwd}@{host}:{port}/{database}".format(
        user=self.db_config["user"],
        pwd=self.db_config["password"],
        host=self.db_config["host"],
        port=self.db_config["port"],
        database=self.db_config["database"],
    )
)

2、pyspark 读取MySQL表

def load_table_myspark(sparkSession, comm, table_name):
    """Load a MySQL table into a Spark DataFrame over JDBC.

    :param sparkSession: active SparkSession used to issue the read
    :param comm: common config module/object exposing ``db_config``
    :param table_name: table to read; either a plain table name or a
        derived table such as ``(select * from t) as t``
    :return: Spark DataFrame on success, ``None`` if the read fails
    """
    cfg = comm.db_config
    result = None
    try:
        reader = sparkSession.read.format('jdbc')
        result = reader.options(
            url=cfg['url'],
            driver=cfg['driver'],
            dbtable=table_name,
            user=cfg['user'],
            password=cfg['password'],
        ).load()
    except Exception as err:
        # Best-effort: report the failure and fall through, returning None.
        print("-----数据加载失败,错误异常信息:", err)
    return result

3、pyspark 写MySQL表

def save_table_myspark(sparkSession, comm, table_name, df, mode="append"):
    """Write a Spark DataFrame to a MySQL table over JDBC.

    :param sparkSession: active SparkSession (not used directly; kept for
        interface symmetry with ``load_table_myspark``)
    :param comm: common config module/object exposing ``db_config``
    :param table_name: target MySQL table name
    :param df: Spark DataFrame to persist
    :param mode: Spark save mode, defaults to "append"
    :return: True if the write succeeded, False otherwise
    """
    db_config = comm.db_config
    flag = False
    try:
        df.write.mode(mode) \
            .format("jdbc") \
            .option("url", db_config['url']) \
            .option("driver", db_config['driver']) \
            .option("dbtable", table_name) \
            .option("user", db_config['user']) \
            .option("password", db_config['password']) \
            .save()
        flag = True
    except Exception as e:
        # Bug fix: the original copy-pasted the load-failure message
        # ("数据加载失败" = load failed); report a write failure instead.
        print("-----数据写入失败,错误异常信息:", e)
    return flag

相关主题:python、spark