Only the data is extracted. The business database has many tables, and creating the Flink tables by hand is tedious, so the script below reads the SQL Server system catalogs and generates the DDL automatically.
Python code (not every possible case has been tested yet):
# -*- coding: utf-8 -*-
from pyflink.table import EnvironmentSettings, TableEnvironment
import pymssql
import pandas as pd
env_settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
t_env = TableEnvironment.create(env_settings)
#t_env.get_config().get_configuration().set_string("parallelism.default", "3")
# Database connection (SQL Server)
conn = pymssql.connect(host='121.xxx.xx.xx',
                       port=1433,
                       user='cdcreader',
                       password='123456',
                       database='abc_car')
# Kafka source connector options (the topic name is completed per table below)
source_common_conf = """ with (
'connector' = 'kafka',
'properties.bootstrap.servers' = 'n101:9092,n102:9092,n103:9092',
'properties.group.id' = 'flink-001',
'scan.startup.mode' = 'timestamp',
'scan.startup.timestamp-millis' = '1647360000000',
'format' = 'debezium-json',
'debezium-json.schema-include' = 'true',
'topic' = 'abc_car.dbo."""
# MySQL JDBC sink connector options (the table name is completed per table below)
sink_common_conf = """ with (
'connector' = 'jdbc',
'url' = 'jdbc:mysql://192.168.xx.xxx:3306/ods',
'username' = 'user_t',
'password' = '123456',
'table-name' = '"""
# Type mapping: SQL Server -> Flink SQL source columns as emitted by Debezium ('format' = 'debezium-json')
mssql_dbz_fsql = {
'bigint':'bigint',
'int':'int',
'tinyint':'tinyint',
'smallint':'smallint',
'bit':'boolean',
'decimal':'decimal',
'money':'decimal',
'smallmoney':'decimal',
'numeric':'numeric',
'float':'float',
'real':'float',
'date':'int',
'datetimeoffset':'bigint',
'datetime2':'bigint',
'datetime':'bigint',
'smalldatetime':'bigint',
'time':'time',
'char':'char',
'nchar':'varchar',
'varchar':'varchar',
'nvarchar':'varchar',
'text':'string',
'ntext':'string',
'xml':'string',
'binary':'binary',
'varbinary':'varbinary'
}
# Type mapping: SQL Server -> Flink SQL sink columns
mssql_fsql = {
'bigint':'bigint',
'int':'int',
'tinyint':'tinyint',
'smallint':'smallint',
'bit':'boolean',
'decimal':'decimal',
'money':'decimal',
'smallmoney':'decimal',
'numeric':'numeric',
'float':'float',
'real':'float',
'date':'date',
'datetimeoffset':'timestamp_ltz',
'datetime2':'timestamp',
'datetime':'timestamp',
'smalldatetime':'timestamp',
'time':'time',
'char':'char',
'nchar':'varchar',
'varchar':'varchar',
'nvarchar':'varchar',
'text':'string',
'ntext':'string',
'xml':'string',
'binary':'binary',
'varbinary':'varbinary'
}
# Schema and whitelist of tables to sync
schema = "'dbo'"
tb_include = ['abc_t1','abc_t2','abc_t3','abc_t4','abc_t5','abc_t6']
# List the user tables in that schema, then keep only the whitelisted ones
sql_1 = """select object_id,name from sys.objects where type ='U' and schema_id=""" + \
    '(select schema_id from sys.schemas where name=' + schema + ')'
df = pd.read_sql(sql_1, conn)
tables = df[df['name'].isin(tb_include)]
for index, tb_row in tables.iterrows():
    object_id = tb_row.object_id
    tb_name = tb_row['name']
    if tb_name[:3] != 'abc':
        continue
    # Column metadata: name, data type, max length, precision, scale, collation
    sql_2 = """select
    a.name as col,
    b.name as datatype,
    a.max_length,
    a.precision,
    a.scale,
    a.collation_name
    from sys.columns a join sys.types b on b.user_type_id=a.user_type_id
    where a.object_id=""" + str(object_id)
    tb_info = pd.read_sql(sql_2, conn)
    # Primary key columns of the table
    sql_3 = """select
    c.name as key_col
    from sys.indexes a
    JOIN sys.index_columns b on b.object_id=a.object_id and b.index_id=a.index_id
    JOIN sys.columns c on c.object_id=a.object_id and c.column_id=b.column_id
    WHERE a.is_primary_key=1 and a.object_id=""" + str(object_id)
    tb_keys = pd.read_sql(sql_3, conn).values.ravel()
    sink_cols = []
    source_cols = []
    sync_cols = []
    for index, i_row in tb_info.iterrows():
        sourceType = mssql_dbz_fsql[i_row.datatype]
        sinkType = mssql_fsql[i_row.datatype]
        # max_length = -1 means (n)varchar(max)/varbinary(max); cap it for the DDL
        if i_row.max_length == -1:
            i_row.max_length = 8000
        col = '`' + i_row.col + '`'
        # For n-prefixed (national) character types, sys.columns reports the length in bytes
        # (2 per character), so halve it to get the character length
        if pd.notna(i_row.collation_name) and i_row.collation_name[:7].lower() == 'chinese' \
                and i_row.datatype[0] == 'n':
            i_row.max_length = int(i_row.max_length / 2)
        if i_row.precision == 0 and i_row.scale == 0:
            sink_col = col + ' ' + sinkType + '(' + str(i_row.max_length) + ')'
            source_col = col + ' ' + sourceType + '(' + str(i_row.max_length) + ')'
        elif sinkType in ['decimal','numeric','money','smallmoney']:
            sink_col = col + ' ' + sinkType + '(' + str(i_row.precision) + ',' + str(i_row.scale) + ')'
            source_col = col + ' ' + sourceType + '(' + str(i_row.precision) + ',' + str(i_row.scale) + ')'
        else:
            sink_col = col + ' ' + sinkType
            source_col = col + ' ' + sourceType
        sink_cols.append(sink_col)
        source_cols.append(source_col)
        # Debezium JSON delivers datetime columns as epoch milliseconds and date columns as
        # epoch days, so convert them back in the insert-select (check the datatype, not the column name)
        if i_row.datatype in ['datetime','datetime2','datetimeoffset','smalldatetime']:
            # fixed -57600000 ms (16 h) offset adjustment before converting to a timestamp
            sync_col = 'to_timestamp_ltz(' + col + '-57600000,3) ' + col
        elif i_row.datatype == 'date':
            sync_col = 'timestampadd(day,' + col + ",to_date('1970-01-01')) " + col
        else:
            sync_col = col
        sync_cols.append(sync_col)
    if len(tb_keys) == 0:
        # no primary key: just close the column list
        key_conf = ')'
    else:
        key_conf = '\n,primary key (' + ','.join(tb_keys) + ') not enforced)'
    # Generate the DDL and the insert-select statement
    source_tb = 'source_' + tb_name
    sink_tb = 'sink_' + tb_name
    source_ddl = 'create table ' + source_tb + '(' + '\n,'.join(source_cols) + ')' + \
        source_common_conf + tb_name + "')"
    sink_ddl = 'create table ' + sink_tb + '(' + '\n,'.join(sink_cols) + key_conf + \
        sink_common_conf + tb_name + "')"
    sync_ddl = 'select ' + '\n,'.join(sync_cols) + ' from ' + source_tb
    print(sink_ddl)
    #t_env.execute_sql(source_ddl)
    #t_env.execute_sql(sink_ddl)
    #t_env.sql_query(sync_ddl).insert_into(sink_tb)
#t_env.execute('sync_abc_car')
conn.close()
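For a rough sense of the output: assuming a hypothetical table abc_t1 with columns id bigint (primary key), name varchar(50) and updated datetime (invented here purely for illustration), the printed sink DDL would look roughly like this:

create table sink_abc_t1(`id` bigint
,`name` varchar(50)
,`updated` timestamp
,primary key (id) not enforced) with (
'connector' = 'jdbc',
'url' = 'jdbc:mysql://192.168.xx.xxx:3306/ods',
'username' = 'user_t',
'password' = '123456',
'table-name' = 'abc_t1')

The matching source table uses the Debezium-side types (`updated` becomes bigint), and the insert-select rewrites that column as to_timestamp_ltz(`updated`-57600000,3) `updated`, turning the epoch milliseconds back into a timestamp before the rows are written to MySQL.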