八、自动生成Flink数据同步任务(mssql-flink-mysql)

本文只做数据抽取。业务数据库中的表很多,逐个手动编写建表语句很麻烦,因此用脚本自动生成。

Python 代码(尚未覆盖并测试所有可能的情况):

# -*- coding: utf-8 -*-

from pyflink.table import EnvironmentSettings, TableEnvironment
import pymssql
import pandas as pd

# Build the settings for a streaming-mode table environment on the Blink
# planner, then create the TableEnvironment from them.
env_settings = (
    EnvironmentSettings.new_instance()
    .in_streaming_mode()
    .use_blink_planner()
    .build()
)
t_env = TableEnvironment.create(env_settings)
# Optional: uncomment to set the default parallelism of the job.
#t_env.get_config().get_configuration().set_string("parallelism.default", "3")

# Database connection (SQL Server) used to read table and column metadata.
conn = pymssql.connect(
    host='121.xxx.xx.xx',
    port=1433,
    user='cdcreader',
    password='123456',
    database='abc_car',
)
  
# Kafka connection settings: the shared WITH-clause prefix of every source
# table. The string deliberately stops inside the 'topic' value; the table name
# and the closing "')" are appended where the DDL is assembled.
source_common_conf = "\n".join([
    " with (",
    "'connector' = 'kafka',",
    "'properties.bootstrap.servers' = 'n101:9092,n102:9092,n103:9092',",
    "'properties.group.id' = 'flink-001',",
    "'scan.startup.mode' = 'timestamp',",
    "'scan.startup.timestamp-millis' = '1647360000000',",
    "'format' = 'debezium-json',",
    "'debezium-json.schema-include' = 'true',",
    "'topic' = 'abc_car.dbo.",
])

# MySQL connection settings: the shared WITH-clause prefix of every JDBC sink
# table. The string deliberately stops inside the 'table-name' value; the table
# name and the closing "')" are appended where the DDL is assembled.
sink_common_conf = "\n".join([
    " with (",
    "'connector' = 'jdbc',",
    "'url' = 'jdbc:mysql://192.168.xx.xxx:3306/ods',",
    "'username' = 'user_t',",
    "'password' = '123456',",
    "'table-name' = '",
])

# Type mapping for the source side: SQL Server type -> Flink SQL type of the
# value as it arrives through Debezium ('format' = 'debezium-json').
# Date/datetime columns are declared as integers here and converted in the
# sync query.
# NOTE(review): the 'datetimeoffset', 'datetime2' and 'time' entries should be
# confirmed against the Debezium SQL Server connector type table.
mssql_dbz_fsql = {
    # integer and boolean types
    'bigint': 'bigint',
    'int': 'int',
    'tinyint': 'tinyint',
    'smallint': 'smallint',
    'bit': 'boolean',
    # exact and approximate numerics
    'decimal': 'decimal',
    'money': 'decimal',
    'smallmoney': 'decimal',
    'numeric': 'numeric',
    'float': 'float',
    'real': 'float',
    # temporal types (integer-encoded on the source side)
    'date': 'int',
    'datetimeoffset': 'bigint',
    'datetime2': 'bigint',
    'datetime': 'bigint',
    'smalldatetime': 'bigint',
    'time': 'time',
    # character types
    'char': 'char',
    'nchar': 'varchar',
    'varchar': 'varchar',
    'nvarchar': 'varchar',
    'text': 'string',
    'ntext': 'string',
    'xml': 'string',
    # binary types
    'binary': 'binary',
    'varbinary': 'varbinary',
}

# Type mapping for the sink side: SQL Server type -> Flink SQL type used in the
# sink table DDL. Unlike the source-side mapping, temporal columns use real
# date/timestamp types here.
mssql_fsql = {
    # integer and boolean types
    'bigint': 'bigint',
    'int': 'int',
    'tinyint': 'tinyint',
    'smallint': 'smallint',
    'bit': 'boolean',
    # exact and approximate numerics
    'decimal': 'decimal',
    'money': 'decimal',
    'smallmoney': 'decimal',
    'numeric': 'numeric',
    'float': 'float',
    'real': 'float',
    # temporal types
    'date': 'date',
    'datetimeoffset': 'timestamp_ltz',
    'datetime2': 'timestamp',
    'datetime': 'timestamp',
    'smalldatetime': 'timestamp',
    'time': 'time',
    # character types
    'char': 'char',
    'nchar': 'varchar',
    'varchar': 'varchar',
    'nvarchar': 'varchar',
    'text': 'string',
    'ntext': 'string',
    'xml': 'string',
    # binary types
    'binary': 'binary',
    'varbinary': 'varbinary',
}

# Schema to scan (already wrapped in single quotes so it can be spliced into
# the SQL text) and the names of the tables to generate sync jobs for.
schema = "'dbo'"
tb_include = [
    'abc_t1',
    'abc_t2',
    'abc_t3',
    'abc_t4',
    'abc_t5',
    'abc_t6',
]

# List every user table (type 'U') of the schema with its object_id.
sql_1 = (
    "select object_id,name from sys.objects where type ='U' and schema_id="
    "(select schema_id from sys.schemas where name=" + schema + ")"
)

df = pd.DataFrame(pd.read_sql(sql_1, conn))
# Keep only the rows whose table name is in the include list.
tables = df.loc[df['name'].isin(tb_include)]

# For every selected table: read its column and primary-key metadata from
# SQL Server, then build three Flink SQL statements:
#   * source_ddl - Kafka/Debezium source table,
#   * sink_ddl   - JDBC (MySQL) sink table,
#   * sync_ddl   - SELECT that converts source values into sink types.
for _, tb_row in tables.iterrows():

    object_id = tb_row['object_id']
    tb_name = tb_row['name']

    # Only tables carrying the 'abc' prefix are synchronized.
    if not tb_name.startswith('abc'):
        continue

    # Column metadata: name, type name, byte length, precision, scale, collation.
    sql_2 = """select
    a.name as col,
    b.name as datatype,
    a.max_length,
    a.precision,
    a.scale,
    a.collation_name
    from sys.columns a join sys.types b on b.user_type_id=a.user_type_id
    where a.object_id=""" + str(object_id)

    tb_info = pd.read_sql(sql_2, conn)

    # Names of the columns that make up the primary key (empty if there is none).
    sql_3 = """select
    c.name as key_col
    from sys.indexes a
    JOIN sys.index_columns b on b.object_id=a.object_id and b.index_id=a.index_id
    JOIN sys.columns c on c.object_id=a.object_id and c.column_id=b.column_id
    WHERE a.is_primary_key=1 and a.object_id=""" + str(object_id)

    # A plain list (not a numpy array) so the emptiness test below is reliable.
    tb_keys = pd.read_sql(sql_3, conn)['key_col'].tolist()

    sink_cols = []
    source_cols = []
    sync_cols = []

    for _, i_row in tb_info.iterrows():

        col_name = i_row['col']
        datatype = i_row['datatype']

        try:
            source_type = mssql_dbz_fsql[datatype]
            sink_type = mssql_fsql[datatype]
        except KeyError:
            # Same exception type as before, but say which column is affected.
            raise KeyError(
                'no Flink type mapping for SQL Server type ' + repr(datatype)
                + ' (column ' + tb_name + '.' + col_name + ')'
            ) from None

        # max_length is -1 for "(max)" columns; fall back to 8000.
        max_length = int(i_row['max_length'])
        if max_length == -1:
            max_length = 8000

        col = '`' + col_name + '`'

        # For n-prefixed character types under a Chinese collation the length
        # is halved. The test is on the DATA TYPE; the original tested the
        # first letter of the column name.
        # NOTE(review): presumably this converts bytes to characters - confirm
        # whether the collation condition is really needed.
        collation = i_row['collation_name']
        if isinstance(collation, str) and collation[:7].lower() == 'chinese' \
                and datatype.startswith('n'):
            max_length = max_length // 2

        if i_row['precision'] == 0 and i_row['scale'] == 0 and sink_type != 'string':
            # Length-parameterized types (char/varchar/binary/varbinary).
            # 'string' is excluded because it takes no length argument.
            type_args = '(' + str(max_length) + ')'
        elif sink_type in ('decimal', 'numeric'):
            type_args = '(' + str(int(i_row['precision'])) + ',' \
                + str(int(i_row['scale'])) + ')'
        else:
            type_args = ''

        sink_cols.append(col + ' ' + sink_type + type_args)
        source_cols.append(col + ' ' + source_type + type_args)

        # Convert the integer-encoded temporal values of the source into the
        # sink's temporal types. The decision is made on the column's data type.
        if datatype in ('datetime', 'datetime2', 'datetimeoffset', 'smalldatetime'):
            # NOTE(review): the fixed 57600000 ms (16 h) shift looks like a
            # time-zone correction - confirm it matches the deployment.
            sync_col = 'to_timestamp_ltz(' + col + '-57600000,3) ' + col
        elif datatype == 'date':
            sync_col = 'timestampadd(day,' + col + ",to_date('1970-01-01')) " + col
        else:
            sync_col = col

        sync_cols.append(sync_col)

    # Optional primary-key clause; the closing ')' of the column list is added
    # when the DDL is assembled so tables without a key are also well-formed.
    if tb_keys:
        key_conf = '\n,primary key (' \
            + ','.join('`' + key + '`' for key in tb_keys) + ') not enforced'
    else:
        key_conf = ''

    # Assemble the SQL statements.
    source_tb = 'source_' + tb_name
    sink_tb = 'sink_' + tb_name

    source_ddl = 'create table ' + source_tb + '(' + '\n,'.join(source_cols) + ')' + \
        source_common_conf + tb_name + "')"
    sink_ddl = 'create table ' + sink_tb + '(' + '\n,'.join(sink_cols) + key_conf + ')' + \
        sink_common_conf + tb_name + "')"
    sync_ddl = 'select ' + '\n,'.join(sync_cols) + ' from ' + source_tb

    print(sink_ddl)

    # Uncomment to register the tables and submit the sync to Flink.
    #t_env.execute_sql(source_ddl)
    #t_env.execute_sql(sink_ddl)
    #t_env.sql_query(sync_ddl).insert_into(sink_tb)
    
#t_env.execute('sync_abc_car')

# Close the SQL Server connection. The method has to be called; a bare
# `conn.close` only references it and leaves the connection open.
conn.close()

  

你可能感兴趣的:(大数据,python,flink)