DataHub Source Development for MaxCompute (DataWorks)

Developing a DataHub ingestion source

1. Base environment and getting the APIs from the Alibaba docs

DataWorks OpenAPI documentation
This is where you find the API operations for projects, databases, and tables. Note that the few pip packages required here have to be installed into DataHub's metadata-ingestion environment.
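Before touching DataHub itself, it can be worth confirming that the SDK and your credentials work with a direct call. A minimal sketch using only the calls the source below relies on; the keys and endpoint are placeholders you must fill in:

# Standalone smoke test of the DataWorks OpenAPI SDK; keys/endpoint are placeholders.
from alibabacloud_dataworks_public20200518.client import Client as DataworksClient
from alibabacloud_dataworks_public20200518 import models as dataworks_models
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_tea_util import models as util_models

conf = open_api_models.Config()
conf.access_key_id = '<your-access-key-id>'          # placeholder
conf.access_key_secret = '<your-access-key-secret>'  # placeholder
conf.endpoint = 'dataworks.cn-hangzhou.aliyuncs.com'

client = DataworksClient(conf)
runtime = util_models.RuntimeOptions()
# Fetch one page of projects; the real source loops over all pages.
resp = client.list_projects_with_options(dataworks_models.ListProjectsRequest(), runtime)
print(resp.body.to_map()['PageResult']['ProjectList'])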

1.1 Modifying setup.py

Add the new dependency set following the file's existing format:

dataworks_common = {
    "alibabacloud_dataworks-public20200518>=4.3.13, <5.0.0",
    "alibabacloud_tea_openapi>=0.3.1, <1.0.0",
    "alibabacloud_tea_console>=0.0.1, <1.0.0",
    "alibabacloud_tea_util>=0.3.5, <1.0.0"
}



At this point only the Python dependency declaration is in place.
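The set above still has to be referenced from the plugin table further down in setup.py so that the dataworks extra actually installs these packages. A minimal sketch following the file's existing conventions (surrounding entries elided):

# Sketch: register the dependency set under the plugin name; datahub's setup.py
# keeps a plugins dict mapping extra-name -> set of requirement strings.
plugins: Dict[str, Set[str]] = {
    # ... existing plugins ...
    "dataworks": dataworks_common,
}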

1.2 Registering the source file

This is also in setup.py, under entry_points:

# this maps the source type name to the module path and class, explained below
"dataworks = datahub.ingestion.source.dataworks:DataWorksSource",

2. Writing the new DataHub source

For the steps to build a new source, see the official guide.

Note the ConfigModel class here, together with /datahub/metadata-ingestion/src/datahub/ingestion/source/file.py.
Also note LookerAPIConfig in the guide: subclassing ConfigModel is what defines the parameters users fill in when writing the source recipe, whether through the UI or a YAML file.
So this config only needs a few basic parameters for connecting to DataWorks.
The first three parameters are required; the remaining ones are generic, borrowed from other sources' config definitions.

class DataworksSourceConfig(ConfigModel):
    access_key_id: str = Field(description="dataworks access_key_id")
    access_key_secret: str = Field(description="dataworks access_key_secret")
    endpoint: str = Field(
        default="dataworks.cn-shanghai.aliyuncs.com",
        description="dataworks endpoint",
    )
    database_pattern: AllowDenyPattern = Field(
        default=AllowDenyPattern.allow_all(),
        description="regex patterns for project to filter in ingestion.",
    )
    platform_instance: Optional[str] = Field(
        default=None,
        description="The instance of the platform that all assets produced by this recipe belong to",
    )
    env: str = Field(
        default=FabricTypeClass.PROD,
        description="The environment that all assets produced by this connector belong to",
    )

For structure, don't rely only on file.py; mongodb.py is a more accurate reference.
Full code:

import re
from dataclasses import dataclass, field

from alibabacloud_dataworks_public20200518.client import Client as dataworks_public20200518Client
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_dataworks_public20200518 import models as dataworks_public_20200518_models
from alibabacloud_tea_util import models as util_models
from alibabacloud_tea_util.models import RuntimeOptions

from datahub.configuration.common import (
  AllowDenyPattern,
  ConfigModel,
)
from datahub.ingestion.api.common import PipelineContext

from datahub.ingestion.api.decorators import (
  SupportStatus,
  config_class,
  platform_name,
  support_status,
)
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
  ArrayTypeClass,
  BooleanTypeClass,
  BytesTypeClass,
  DateTypeClass,
  EnumTypeClass,
  NullTypeClass,
  NumberTypeClass,
  RecordTypeClass,
  SchemaField,
  SchemaFieldDataType,
  SchemaMetadata,
  SchemalessClass,
  StringTypeClass,
  TimeTypeClass,
  UnionTypeClass,
)

from pydantic import Field, root_validator

from datahub.ingestion.api.source import SourceReport, Source
from datahub.ingestion.api.workunit import MetadataWorkUnit, UsageStatsWorkUnit
from typing import (
  Any,
  Iterable,
  List,
  Optional,
  Type,
)
from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
from datahub.metadata.schema_classes import (
  FabricTypeClass,
  DatasetPropertiesClass,
  InstitutionalMemoryClass,
  InstitutionalMemoryMetadataClass,
  AuditStampClass
)
from datahub.utilities.urns.dataset_urn import DatasetUrn

class DataworksSourceConfig(ConfigModel):
  access_key_id: str = Field(description="dataworks access_key_id")
  access_key_secret: str = Field(description="dataworks access_key_secret")
  endpoint: str = Field(
      default="dataworks.cn-hangzhou.aliyuncs.com",
      description="dataworks endpoint",
  )
  database_pattern: AllowDenyPattern = Field(
      default=AllowDenyPattern.allow_all(),
      description="regex patterns for project to filter in ingestion.",
  )
  platform_instance: Optional[str] = Field(
      default=None,
      description="The instance of the platform that all assets produced by this recipe belong to",
  )
  env: str = Field(
      default=FabricTypeClass.PROD,
      description="The environment that all assets produced by this connector belong to",
  )


@dataclass
class DataworksSourceReport(SourceReport):
  filtered: List[str] = field(default_factory=list)

  def report_dropped(self, name: str) -> None:
      self.filtered.append(name)

# The following functions are thin wrappers around the DataWorks OpenAPI
def get_project_request():
  '''Return the request object for listing projects'''
  return dataworks_public_20200518_models.ListProjectsRequest()

def get_table_theme_info(openapi_client, table_guid, runtime):
  '''
  :param openapi_client: the DataWorks OpenAPI client
  :param table_guid: table GUID in the form odps.<project>.<table>
  :param runtime: runtime options
  :return: the table's ColumnList (a list of column dicts)
  '''
  table_theme_request = dataworks_public_20200518_models.GetMetaTableColumnRequest(table_guid=table_guid)

  table_theme_response = openapi_client.get_meta_table_column_with_options(table_theme_request, runtime)
  table_columns_dict = table_theme_response.body.to_map()['Data']['ColumnList']
  return table_columns_dict
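
# NOTE: GetMetaTableColumn also takes PageNum/PageSize in the DataWorks docs, so
# very wide tables may need the same paging loop used in get_workunits below;
# only the first page is read here.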
# get the table list
def get_tables_list():
  '''Not recommended for getting the tables under a project'''
  return dataworks_public_20200518_models.ListTableThemeRequest()

# fetch the table's wiki from DataWorks; it becomes the description shown in the DataHub UI
def get_table_wiki_document(openapi_client, table_guid, runtime):
  table_theme_request = dataworks_public_20200518_models.GetMetaTableIntroWikiRequest(table_guid=table_guid)
  return openapi_client.get_meta_table_intro_wiki_with_options(table_theme_request, runtime).body.data

# request object for listing the tables under a MaxCompute project (paged)
def get_metadb_tables_list():
  return dataworks_public_20200518_models.GetMetaDBTableListRequest()

# maps native column types to DataHub type classes
_field_type_mapping = {
  'int': NumberTypeClass,
  'decimal': NumberTypeClass,
  'largeint': NumberTypeClass,
  'float': NumberTypeClass,
  'smallint': NumberTypeClass,
  'tinyint': NumberTypeClass,
  'double': NumberTypeClass,
  'bigint': NumberTypeClass,
  'boolean': BooleanTypeClass,
  'binary': StringTypeClass,
  'string': StringTypeClass,
  'varchar': StringTypeClass,
  'char': StringTypeClass,
  'array': NullTypeClass,
  'map': NullTypeClass,
  'struct': NullTypeClass,
  'uniontype': NullTypeClass,
  'date': DateTypeClass,
  'timestamp': TimeTypeClass,
  'datetime': TimeTypeClass,
  'json': RecordTypeClass,
  'nulltype': NullTypeClass,
}

# map a native column type to a DataHub SchemaFieldDataType
def get_column_type(
      sql_report: DataworksSourceReport, dataset_name: str, column_type: Any
) -> SchemaFieldDataType:
  TypeClass: Optional[Type] = _field_type_mapping.get(column_type)
  if TypeClass is None:
      sql_report.report_warning(
          dataset_name, f"unable to map type {column_type!r} to metadata schema"
      )
      TypeClass = NullTypeClass

  return SchemaFieldDataType(type=TypeClass())

def make_dataset_urn_with_platform_instance(
      platform: str, name: str, platform_instance: Optional[str], env: str = FabricTypeClass.PROD
) -> str:
  return str(
      DatasetUrn.create_from_ids(
          platform_id=platform,
          table_name=name,
          env=env,
          platform_instance=platform_instance,
      )
  )
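
# A sketch of what this helper yields, with hypothetical values:
#   make_dataset_urn_with_platform_instance("dataworks", "my_project.my_table", "dataworks112")
#   -> "urn:li:dataset:(urn:li:dataPlatform:dataworks,dataworks112.my_project.my_table,PROD)"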


@platform_name("dataworks")
@config_class(DataworksSourceConfig)
@support_status(SupportStatus.UNKNOWN)
@dataclass
class DataWorksSource(Source):
  dataworks_client: dataworks_public20200518Client
  dataworks_runtime: RuntimeOptions
  config: DataworksSourceConfig
  report: DataworksSourceReport

  # initialize the client connection
  def __init__(self, ctx: PipelineContext, config: DataworksSourceConfig):
      super().__init__(ctx)
      self.config = config
      self.report = DataworksSourceReport()
      self.conf = open_api_models.Config()
      self.conf.access_key_id = config.access_key_id
      self.conf.access_key_secret = config.access_key_secret
      self.conf.endpoint = config.endpoint
      self.platform='dataworks'
      self.dataworks_client = dataworks_public20200518Client(self.conf)
      self.dataworks_runtime = util_models.RuntimeOptions()
      self.dataworks_runtime.connect_timeout=360000
      self.dataworks_runtime.read_timeout=360000

  @classmethod
  def create(cls, config_dict: dict, ctx: PipelineContext) -> "DataWorksSource":
      config = DataworksSourceConfig.parse_obj(config_dict)
      return cls(ctx, config)

# This is the core: wrap the fetched metadata into the classes DataHub expects
  def get_workunits(self) -> Iterable[MetadataWorkUnit]:
      platform = "dataworks"
      project_request = get_project_request()
      # fetch the project list
      database_names_list = self.dataworks_client.list_projects_with_options(project_request, self.dataworks_runtime)
      database_names_list_map = database_names_list.body.to_map()
      database_names_list_page=database_names_list_map['PageResult']
      project_databases = database_names_list_page['ProjectList']
      page_num=database_names_list_page['PageNumber']
      page_size=database_names_list_page['PageSize']
      # Note: print the response to inspect its shape. The DataWorks API is paginated and each call returns one page, so loop here to read all pages; the database and table calls below are paged the same way.
      while database_names_list_page['TotalCount']-page_num*page_size>0:
          page_num+=1
          project_request.page_number=page_num
          database_names_list2 = self.dataworks_client.list_projects_with_options(project_request, self.dataworks_runtime)
          project_databases.extend(database_names_list2.body.to_map()['PageResult']['ProjectList'])
      metadb_tables_list = get_metadb_tables_list()
      for database_name in project_databases:
          # the project name is also the database name
          project_name = database_name['ProjectIdentifier']
          metadb_tables_list.app_guid = 'odps.{0}'.format(project_name)
          metadb_tables_list.page_number=1
          table_init_properties=self.dataworks_client.get_meta_dbtable_list_with_options(metadb_tables_list,self.dataworks_runtime).body
          table_init_properties_list=table_init_properties.data.table_entity_list
          data=table_init_properties.to_map()['Data']
          datapage_num=data['PageNumber']
          datapage_size=data['PageSize']
          while data['TotalCount']-datapage_num*datapage_size>0:
              datapage_num+=1
              metadb_tables_list.page_number=datapage_num
              table_info_new_properties=self.dataworks_client.get_meta_dbtable_list_with_options(metadb_tables_list,self.dataworks_runtime).body.data.table_entity_list
              table_init_properties_list.extend(table_info_new_properties)
          # each table's properties come back as a dict
          # all table properties for one database, as a list; the guid format is odps.<db>.<table>
          for table_info in table_init_properties_list:
              # each table's table_guid
              table_guid=table_info.table_guid
              # strip the leading 'odps.' so the guid becomes database.table
              database_table_name=table_guid[table_guid.index('.')+1:]
              table_name=database_table_name[database_table_name.index('.')+1:]
              dataset_name=database_table_name
              # fetch the table's column details; the response has no table name, only a list of column dicts
              try:
                  table_colums_list_dict=get_table_theme_info(self.dataworks_client, table_info.table_guid, self.dataworks_runtime)
                  # the key step: build the dataset URN and snapshot
                  dataset_urn = make_dataset_urn_with_platform_instance(
                      self.platform,
                      dataset_name,
                      self.config.platform_instance,
                      self.config.env,
                  )
                  dataset_snapshot = DatasetSnapshot(
                      urn=dataset_urn,
                      aspects=[],
                  )
                  # fetch the table's wiki doc and use it as the dataset description
                  table_wiki = get_table_wiki_document(self.dataworks_client,table_guid,self.dataworks_runtime)
                  table_wiki_content = table_wiki.content if table_wiki is not None else None
                  dataset_properties = DatasetPropertiesClass(
                      tags=[],
                      customProperties={},
                      description = table_wiki_content
                  )
                  dataset_snapshot.aspects.append(dataset_properties)
                  canonical_schema: List[SchemaField] = []
                  # iterate over the columns and map their types
                  for table_colums_dict in table_colums_list_dict:
                      column_name=table_colums_dict['ColumnName']
                      column_type=table_colums_dict['ColumnType'].lower()
                      # strip type parameters, e.g. decimal(38,18) -> decimal
                      col_type = str(re.search(r'^\w+', column_type).group(0))
                      column_comment=table_colums_dict['Comment']
                      isPartitioningKey=table_colums_dict['IsPartitionColumn']
                      field = SchemaField(
                          fieldPath=column_name,
                          nativeDataType=col_type,
                          type=get_column_type(self.report,dataset_name,col_type),
                          description=column_comment,
                          isPartitioningKey=isPartitioningKey,
                          nullable=False,
                          recursive=False,
                      )
                      canonical_schema.append(field)
                  table_schema_metadata = SchemaMetadata(
                      schemaName=table_name,
                      platform=f"urn:li:dataPlatform:{platform}",
                      version=0,
                      hash="",
                      platformSchema=SchemalessClass(),
                      fields=canonical_schema,
                  )
                  dataset_snapshot.aspects.append(table_schema_metadata)

                  # add a link (placeholder URL; point it at your DataWorks console)
                  institutionalMemoryMetadataClass = InstitutionalMemoryMetadataClass(
                      url = "https://www.baidu.com",
                      description = "dataworks",
                      createStamp =  AuditStampClass(time=0, actor="urn:li:corpuser:datahub")
                  )
                  institutionalMemory = InstitutionalMemoryClass(
                      elements = [institutionalMemoryMetadataClass]
                  )
                  dataset_snapshot.aspects.append(institutionalMemory)

                  mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
                  wu = MetadataWorkUnit(id=dataset_name, mce=mce)
                  self.report.report_workunit(wu)
                  yield wu
              except Exception as e:
                  self.report.report_warning(dataset_name, repr(e))
                  continue


  def get_report(self) -> DataworksSourceReport:
      return self.report

Note that the **"dataworks = datahub.ingestion.source.dataworks:DataWorksSource"** entry written in 1.2
maps to this file's location and the source class inside it; the dataworks part is the type you put in the YAML recipe.

YAML example

# remember to fill in the first three parameters: access_key_id, access_key_secret, endpoint
source:
  type: dataworks
  config:
    access_key_id: ''
    access_key_secret: ''
    endpoint: 'dataworks.cn-hangzhou.aliyuncs.com'
    platform_instance: 'dataworks112'
sink:
  type: datahub-rest
  config:
    server: 'http://datahub-gms:8080'
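
With the recipe saved (e.g. as dataworks.yml), run it through the standard CLI: datahub ingest -c dataworks.yml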
