DataWorks OpenAPI documentation
This is where to find the API operations for projects, databases, and tables. Note that the pip packages required here are installed as part of DataHub (declared in metadata-ingestion/setup.py); add them following its existing format:
dataworks_common = {
"alibabacloud_dataworks-public20200518>=4.3.13, <5.0.0",
"alibabacloud_tea_openapi>=0.3.1, <1.0.0",
"alibabacloud_tea_console>=0.0.1, <1.0.0",
"alibabacloud_tea_util>=0.3.5, <1.0.0"
}
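A sketch of how this set might then be hooked into the plugins mapping in metadata-ingestion/setup.py (the mapping itself already exists in that file; only the "dataworks" entry is new, and neighboring entries are elided):

plugins: Dict[str, Set[str]] = {
    # ... existing sources ...
    "dataworks": dataworks_common,
}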
#############
At this point only the Python dependencies are in place.
Next, still in setup.py, register the source under entry_points:
# This maps the source name to the module path of the source class (explained further below)
"dataworks = datahub.ingestion.source.dataworks:DataWorksSource",
Official guide: steps for building a new data source
Pay attention to ConfigModel and to /datahub/metadata-ingestion/src/datahub/ingestion/source/file.py.
Note that the LookerAPIConfig shown in that guide is really telling you that subclassing ConfigModel is how you define the parameters used when writing the source config, whether in the UI or in a yml recipe.
So my config only needs a few basic parameters for connecting to DataWorks.
The first three parameters are required; the remaining ones are generic, and you can refer to other sources' configs for their meaning (see the recipe sketch after the class below).
class DataworksSourceConfig(ConfigModel):
access_key_id: str = Field(description="dataworks access_key_id")
access_key_secret: str = Field(description="dataworks access_key_secret")
endpoint: str = Field(
default="dataworks.cn-shanghai.aliyuncs.com",
description="dataworks endpoint",
)
database_pattern: AllowDenyPattern = Field(
default=AllowDenyPattern.allow_all(),
description="regex patterns for project to filter in ingestion.",
)
platform_instance: Optional[str] = Field(
default=None,
description="The instance of the platform that all assets produced by this recipe belong to",
)
env: str = Field(
default=FabricTypeClass.PROD,
description="The environment that all assets produced by this connector belong to",
)
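For example, database_pattern takes allow/deny regex lists in the recipe; a minimal sketch (the pattern values here are hypothetical):

database_pattern:
  allow:
    - '^prod_.*'
  deny:
    - '^tmp_.*'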
For a reference implementation, don't rely only on file.py; mongodb.py is a more accurate guide to the overall structure.
Complete code:
import re
from dataclasses import dataclass, field
from typing import Any, Iterable, List, Optional, Type

from alibabacloud_dataworks_public20200518.client import Client as dataworks_public20200518Client
from alibabacloud_dataworks_public20200518 import models as dataworks_public_20200518_models
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_tea_util import models as util_models
from alibabacloud_tea_util.models import RuntimeOptions
from pydantic import Field

from datahub.configuration.common import AllowDenyPattern, ConfigModel
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.decorators import (
    SupportStatus,
    config_class,
    platform_name,
    support_status,
)
from datahub.ingestion.api.source import Source, SourceReport
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
    BooleanTypeClass,
    DateTypeClass,
    NullTypeClass,
    NumberTypeClass,
    RecordTypeClass,
    SchemaField,
    SchemaFieldDataType,
    SchemalessClass,
    SchemaMetadata,
    StringTypeClass,
    TimeTypeClass,
)
from datahub.metadata.schema_classes import (
    AuditStampClass,
    DatasetPropertiesClass,
    FabricTypeClass,
    InstitutionalMemoryClass,
    InstitutionalMemoryMetadataClass,
)
from datahub.utilities.urns.dataset_urn import DatasetUrn
class DataworksSourceConfig(ConfigModel):
access_key_id: str = Field(description="dataworks access_key_id")
access_key_secret: str = Field(description="dataworks access_key_secret")
endpoint: str = Field(
default="dataworks.cn-hangzhou.aliyuncs.com",
description="dataworks endpoint",
)
database_pattern: AllowDenyPattern = Field(
default=AllowDenyPattern.allow_all(),
description="regex patterns for project to filter in ingestion.",
)
platform_instance: Optional[str] = Field(
default=None,
description="The instance of the platform that all assets produced by this recipe belong to",
)
env: str = Field(
default=FabricTypeClass.PROD,
description="The environment that all assets produced by this connector belong to",
)
@dataclass
class DataworksSourceReport(SourceReport):
filtered: List[str] = field(default_factory=list)
def report_dropped(self, name: str) -> None:
self.filtered.append(name)
# The following functions are thin wrappers around the DataWorks API request objects and calls
def get_project_request():
    '''Return a request object for listing projects.'''
return dataworks_public_20200518_models.ListProjectsRequest()
def get_table_theme_info(openapi_client, table_guid, runtime):
    '''
    :param openapi_client: the DataWorks OpenAPI client
    :param table_guid: the table's GUID
    :param runtime: runtime options for the call
    :return: list of column dicts
    '''
table_theme_request = dataworks_public_20200518_models.GetMetaTableColumnRequest(table_guid=table_guid)
table_theme_response = openapi_client.get_meta_table_column_with_options(table_theme_request, runtime)
table_columns_dict = table_theme_response.body.to_map()['Data']['ColumnList']
return table_columns_dict
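# For reference (shape inferred from how the result is consumed in get_workunits
# below; the values shown are hypothetical), each element of the returned
# ColumnList is a dict like:
#   {'ColumnName': 'user_id', 'ColumnType': 'bigint', 'Comment': 'user id',
#    'IsPartitionColumn': False}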
# Get the table list
def get_tables_list():
    '''Not recommended as a way to fetch the tables under a project'''
return dataworks_public_20200518_models.ListTableThemeRequest()
# Fetch the table's wiki from DataWorks; it becomes the dataset description shown in the DataHub UI
def get_table_wiki_document(openapi_client, table_guid, runtime):
table_theme_request = dataworks_public_20200518_models.GetMetaTableIntroWikiRequest(table_guid=table_guid)
return openapi_client.get_meta_table_intro_wiki_with_options(table_theme_request, runtime).body.data
def get_metadb_tables_list():
return dataworks_public_20200518_models.GetMetaDBTableListRequest()
# Mapping from native column types to DataHub schema type classes
_field_type_mapping = {
'int': NumberTypeClass,
'decimal': NumberTypeClass,
'largeint': NumberTypeClass,
'float': NumberTypeClass,
'smallint': NumberTypeClass,
'tinyint': NumberTypeClass,
    'double': NumberTypeClass,
'bigint': NumberTypeClass,
'boolean': BooleanTypeClass,
'binary': StringTypeClass,
'string': StringTypeClass,
'varchar': StringTypeClass,
'char': StringTypeClass,
'array': NullTypeClass,
'map': NullTypeClass,
'struct': NullTypeClass,
'uniontype': NullTypeClass,
'date': DateTypeClass,
'timestamp': TimeTypeClass,
'datetime': TimeTypeClass,
'json': RecordTypeClass,
'nulltype': NullTypeClass,
}
# Resolve a column's native type to a DataHub SchemaFieldDataType
def get_column_type(
sql_report: DataworksSourceReport, dataset_name: str, column_type: Any
) -> SchemaFieldDataType:
    TypeClass: Optional[Type] = _field_type_mapping.get(column_type)
if TypeClass is None:
sql_report.report_warning(
dataset_name, f"unable to map type {column_type!r} to metadata schema"
)
TypeClass = NullTypeClass
return SchemaFieldDataType(type=TypeClass())
def make_dataset_urn_with_platform_instance(
platform: str, name: str, platform_instance: Optional[str], env: str = FabricTypeClass.PROD
) -> str:
return str(
DatasetUrn.create_from_ids(
platform_id=platform,
table_name=name,
env=env,
platform_instance=platform_instance,
)
)
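# For illustration (the names below are hypothetical), a call such as
#   make_dataset_urn_with_platform_instance("dataworks", "mydb.mytable", "dataworks112", "PROD")
# should produce a urn of the form
#   urn:li:dataset:(urn:li:dataPlatform:dataworks,dataworks112.mydb.mytable,PROD)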
@platform_name("dataworks")
@config_class(DataworksSourceConfig)
@support_status(SupportStatus.UNKNOWN)
@dataclass
class DataWorksSource(Source):
dataworks_client: dataworks_public20200518Client
dataworks_runtime: RuntimeOptions
config: DataworksSourceConfig
report: DataworksSourceReport
    # Initialize the client connection in the constructor
def __init__(self, ctx: PipelineContext, config: DataworksSourceConfig):
super().__init__(ctx)
self.config = config
self.report = DataworksSourceReport()
self.conf = open_api_models.Config()
self.conf.access_key_id = config.access_key_id
self.conf.access_key_secret = config.access_key_secret
self.conf.endpoint = config.endpoint
self.platform='dataworks'
self.dataworks_client = dataworks_public20200518Client(self.conf)
self.dataworks_runtime = util_models.RuntimeOptions()
        self.dataworks_runtime.connect_timeout = 360000  # milliseconds
        self.dataworks_runtime.read_timeout = 360000  # the original set connect_timeout twice; read_timeout was presumably intended
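    # DataHub's ingestion pipeline instantiates the source through create(), passing
    # the dict parsed from the recipe's source.config section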
@classmethod
def create(cls, config_dict: dict, ctx: PipelineContext) -> "DataWorksSource":
config = DataworksSourceConfig.parse_obj(config_dict)
return cls(ctx, config)
    # This is the crucial part: wrapping the fetched data into the classes DataHub expects
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
platform = "dataworks"
project_request = get_project_request()
        # Fetch the list of projects
database_names_list = self.dataworks_client.list_projects_with_options(project_request, self.dataworks_runtime)
database_names_list_map = database_names_list.body.to_map()
database_names_list_page=database_names_list_map['PageResult']
project_databases = database_names_list_page['ProjectList']
page_num=database_names_list_page['PageNumber']
page_size=database_names_list_page['PageSize']
        # Note: the DataWorks API is paginated and each call returns a single page, so we loop until
        # all pages are read (print the response yourself to inspect it); the databases and tables
        # below are fetched the same way.
while database_names_list_page['TotalCount']-page_num*page_size>0:
page_num+=1
project_request.page_number=page_num
database_names_list2 = self.dataworks_client.list_projects_with_options(project_request, self.dataworks_runtime)
project_databases.extend(database_names_list2.body.to_map()['PageResult']['ProjectList'])
metadb_tables_list = get_metadb_tables_list()
for database_name in project_databases:
            # The project name doubles as the database name
project_name = database_name['ProjectIdentifier']
metadb_tables_list.app_guid = 'odps.{0}'.format(project_name)
metadb_tables_list.page_number=1
table_init_properties=self.dataworks_client.get_meta_dbtable_list_with_options(metadb_tables_list,self.dataworks_runtime).body
table_init_properties_list=table_init_properties.data.table_entity_list
data=table_init_properties.to_map()['Data']
datapage_num=data['PageNumber']
datapage_size=data['PageSize']
while data['TotalCount']-datapage_num*datapage_size>0:
datapage_num+=1
metadb_tables_list.page_number=datapage_num
                table_info_new_properties = self.dataworks_client.get_meta_dbtable_list_with_options(metadb_tables_list, self.dataworks_runtime).body.data.table_entity_list
                table_init_properties_list.extend(table_info_new_properties)
            # Each entry holds one table's properties as a dict; the list covers
            # every table in the database, identified as odps.<database>.<table>
for table_info in table_init_properties_list:
                # Each table's table_guid
table_guid=table_info.table_guid
                # Strip the leading 'odps.' so the guid becomes database.table
database_table_name=table_guid[table_guid.index('.')+1:]
table_name=database_table_name[database_table_name.index('.')+1:]
dataset_name=f"{database_table_name}"
                # Fetch the table's details; the response carries no table name, just a list of column dicts
try:
table_colums_list_dict=get_table_theme_info(self.dataworks_client, table_info.table_guid, self.dataworks_runtime)
                    # Key step: build the dataset URN
dataset_urn = make_dataset_urn_with_platform_instance(
self.platform,
dataset_name,
self.config.platform_instance,
self.config.env,
)
dataset_snapshot = DatasetSnapshot(
urn=dataset_urn,
aspects=[],
)
                    # Fetch the table's wiki document to use as the dataset description
table_wiki = get_table_wiki_document(self.dataworks_client,table_guid,self.dataworks_runtime)
table_wiki_content = table_wiki.content if table_wiki is not None else None
dataset_properties = DatasetPropertiesClass(
tags=[],
customProperties={},
description = table_wiki_content
)
dataset_snapshot.aspects.append(dataset_properties)
canonical_schema: List[SchemaField] = []
                    # Walk the columns and map each native type
                    for table_column_dict in table_colums_list_dict:
                        column_name = table_column_dict['ColumnName']
                        column_type = table_column_dict['ColumnType'].lower()
                        col_type = str(re.search(r'^\w+', column_type).group(0))
                        column_comment = table_column_dict['Comment']
                        is_partitioning_key = table_column_dict['IsPartitionColumn']
                        schema_field = SchemaField(
                            fieldPath=column_name,
                            nativeDataType=col_type,
                            type=get_column_type(self.report, dataset_name, col_type),
                            description=column_comment,
                            isPartitioningKey=is_partitioning_key,
                            nullable=False,
                            recursive=False,
                        )
                        canonical_schema.append(schema_field)
table_schema_metadata = SchemaMetadata(
schemaName=table_name,
platform=f"urn:li:dataPlatform:{platform}",
version=0,
hash="",
platformSchema=SchemalessClass(),
fields=canonical_schema,
)
dataset_snapshot.aspects.append(table_schema_metadata)
                    # Add a documentation link (the URL here is a placeholder; point it at your own wiki)
                    institutionalMemoryMetadataClass = InstitutionalMemoryMetadataClass(
                        url="https://www.baidu.com",
description = "dataworks",
createStamp = AuditStampClass(time=0, actor="urn:li:corpuser:datahub")
)
institutionalMemory = InstitutionalMemoryClass(
elements = [institutionalMemoryMetadataClass]
)
dataset_snapshot.aspects.append(institutionalMemory)
mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
wu = MetadataWorkUnit(id=dataset_name, mce=mce)
self.report.report_workunit(wu)
yield wu
                except Exception as e:
                    # Record the failure in the report instead of crashing the whole run
                    self.report.report_warning(dataset_name, repr(e))
                    continue
def get_report(self) -> DataworksSourceReport:
return self.report
Note that the **"dataworks = datahub.ingestion.source.dataworks:DataWorksSource"** entry written in step 1.2
points at this file's location and the source class inside it; the name dataworks is what goes in the type field of your recipe yml.
# Remember to fill in the first three parameters: access_key_id, access_key_secret, endpoint
source:
type: dataworks
config:
access_key_id: ''
access_key_secret: ''
    endpoint: 'dataworks.cn-hangzhou.aliyuncs.com'
platform_instance: 'dataworks112'
sink:
type: datahub-rest
config:
server: 'http://datahub-gms:8080'
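With the recipe saved (the file name here is arbitrary), run it through the standard DataHub CLI:

datahub ingest -c dataworks_recipe.yml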