Use a Python script to generate DataX JSON job files (covering multiple tables in one pass), then use DataX to collect business data from MySQL into HDFS

Without further ado, here is the code. (It was copied from elsewhere and then modified, so it has some redundant parts, but that does not affect how it runs: it imports data from MySQL into HDFS.)

#coding=utf-8
import os
import sys
import getopt
import json

import pymysql
pymysql.install_as_MySQLdb()

# MySQL connection settings; adjust to your environment
mysql_host = "hadoop101"
mysql_port = "3306"
mysql_user = "root"
mysql_passwd = "123456"

# HDFS NameNode settings; adjust to your environment
hdfs_nn_host = "hadoop101"
hdfs_nn_port = "8020"

# Target directory for the generated job files; adjust as needed
output_path = "/opt/anson/datax/job/import"

def get_connection():
    return pymysql.connect(host=mysql_host, port=int(mysql_port), user=mysql_user, passwd=mysql_passwd)


def get_mysql_meta(database, table):
    connection = get_connection()
    cursor = connection.cursor()
    sql= "SELECT COLUMN_NAME,DATA_TYPE from information_schema.COLUMNS WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s ORDER BY ORDINAL_POSITION"
    cursor.execute(sql, [database, table])
    fetchall = cursor.fetchall()
    cursor.close()
    connection.close()
    return fetchall


def get_mysql_columns(database, table):
    return [x[0] for x in get_mysql_meta(database, table)]


def get_hive_columns(database, table):
    def type_mapping(mysql_type):
        mappings = {
            "bigint": "bigint",
            "int": "bigint",
            "smallint": "bigint",
            "tinyint": "bigint",
            "decimal": "string",
            "double": "double",
            "float": "float",
            "binary": "string",
            "char": "string",
            "varchar": "string",
            "datetime": "string",
            "time": "string",
            "timestamp": "string",
            "date": "string",
            "text": "string"
        }
        # fall back to string for MySQL types not listed above (e.g. mediumtext),
        # so a single unmapped type does not abort generation with a KeyError
        return mappings.get(mysql_type, "string")

    meta = get_mysql_meta(database, table)
    return [{"name": x[0], "type": type_mapping(x[1].lower())} for x in meta]


def generate_json(source_database, source_table):
    job = {
        "job": {
            "setting": {
                "speed": {
                    "channel": 3
                },
                "errorLimit": {
                    "record": 0,
                    "percentage": 0.02
                }
            },
            "content": [{
                "reader": {
                    "name": "mysqlreader",
                    "parameter": {
                        "username": mysql_user,
                        "password": mysql_passwd,
                        "column": get_mysql_columns(source_database, source_table),
                        "splitPk": "",
                        "connection": [{
                            "table": [source_table],
                            "jdbcUrl": ["jdbc:mysql://" + mysql_host + ":" + mysql_port + "/" + source_database + "?useSSL=false"]
                        }]
                    }
                },
                "writer": {
                    "name": "hdfswriter",
                    "parameter": {
                        "defaultFS": "hdfs://" + hdfs_nn_host + ":" + hdfs_nn_port,
                        "fileType": "text",
                        "path": "${targetdir}",
                        "fileName": source_table,
                        "column": get_hive_columns(source_database, source_table),
                        "writeMode": "append",
                        "fieldDelimiter": "\t",
                        "compress": "gzip"
                    }
                }
            }]
        }
    }
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    with open(os.path.join(output_path, ".".join([source_database, source_table, "json"])), "w") as f:
        json.dump(job, f)


def get_tablename(source_database):
    connection = get_connection()
    cursor = connection.cursor()
    sql = "SELECT distinct TABLE_NAME from information_schema.COLUMNS WHERE TABLE_SCHEMA=%s"
    cursor.execute(sql, [source_database])
    fetchall = cursor.fetchall()
    cursor.close()
    connection.close()
    return fetchall

def main(args):
    # Specify the database; DataX JSON files are generated for all of its tables
    source_database = "offline"

    # To generate a single table per run, uncomment the line below and the single-table
    # generate_json() call further down, and comment out the loop at the bottom
    #source_table = ""

    options, arguments = getopt.getopt(args, 'd:t:', ['sourcedb=', 'sourcetbl='])
    for opt_name, opt_value in options:
        if opt_name in ('-d', '--sourcedb'):
            source_database = opt_value
        if opt_name in ('-t', '--sourcetbl'):
            source_table = opt_value

    # generate_json(source_database, source_table)

    # Generate DataX JSON job files for every table in the given database in one pass
    tables = get_tablename(source_database)
    for row in tables:
        generate_json(source_database, row[0])


if __name__ == '__main__':
    main(sys.argv[1:])
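
Assuming the script is saved as gen_import_config.py (a filename chosen here for illustration), a run looks roughly like this:

python gen_import_config.py -d offline               # job files for every table in the offline database
python gen_import_config.py -d offline -t sku_info   # single-table mode, after the edits described in main()
ls /opt/anson/datax/job/import                       # expect one offline.<table>.json per table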

Next, a shell script drives DataX to collect the data from MySQL into HDFS (this is a demo from an offline data-warehouse project):

#!/bin/bash
DATAX_HOME=/opt/bdp/datax
# If a date is passed as the second argument, use it; otherwise default to yesterday
if [ -n "$2" ]; then
    do_date=$2
else
    do_date=$(date -d "-1 day" +%F)
fi
# Prepare the target path: create it if it does not exist, empty it if it does, so the sync job can be re-run safely
handle_targetdir(){
  if ! hadoop fs -test -e "$1"; then
    echo "Path $1 does not exist, creating it..."
    hadoop fs -mkdir -p "$1"
  else
    echo "Path $1 already exists"
    fs_count=$(hadoop fs -count "$1")
    content_size=$(echo "$fs_count" | awk '{print $3}')
    if [[ $content_size -eq 0 ]]; then
      echo "Path $1 is empty"
    else
      echo "Path $1 is not empty, clearing it..."
      hadoop fs -rm -r -f "$1"/*
    fi
  fi
}

import_data(){
  datax_config=$1
  target_dir=$2
  handle_targetdir $target_dir
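  # datax.py's -p option injects -D style variables into the job at run time;
  # here it fills the ${targetdir} placeholder in the hdfswriter path above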
  python2 $DATAX_HOME/bin/datax.py -p" -Dtargetdir=$target_dir" $datax_config
}

case $1 in
"order_info")
  import_data /opt/bdp/datax/job/import/offline.order_info.json /offline_data_datax/db/order_info/$do_date
  ;;
"base_category1")
  import_data /opt/bdp/datax/job/import/offline.base_category1.json /offline_data_datax/db/base_category1/$do_date
  ;;
"base_category2")
  import_data /opt/bdp/datax/job/import/offline.base_category2.json /offline_data_datax/db/base_category2/$do_date
  ;;
"base_category3")
  import_data /opt/bdp/datax/job/import/offline.base_category3.json /offline_data_datax/db/base_category3/$do_date
  ;;
"order_detail")
  import_data /opt/bdp/datax/job/import/offline.order_detail.json /offline_data_datax/db/order_detail/$do_date
  ;;
"sku_info")
  import_data /opt/bdp/datax/job/import/offline.sku_info.json /offline_data_datax/db/sku_info/$do_date
  ;;
"user_info")
  import_data /opt/bdp/datax/job/import/offline.user_info.json /offline_data_datax/db/user_info/$do_date
  ;;
"payment_info")
  import_data /opt/bdp/datax/job/import/offline.payment_info.json /offline_data_datax/db/payment_info/$do_date
  ;;
"base_province")
  import_data /opt/bdp/datax/job/import/offline.base_province.json /offline_data_datax/db/base_province/$do_date
  ;;
"base_region")
  import_data /opt/bdp/datax/job/import/offline.base_region.json /offline_data_datax/db/base_region/$do_date
  ;;
"base_trademark")
  import_data /opt/bdp/datax/job/import/offline.base_trademark.json /offline_data_datax/db/base_trademark/$do_date
  ;;
"activity_info")
  import_data /opt/bdp/datax/job/import/offline.activity_info.json /offline_data_datax/db/activity_info/$do_date
  ;;
"activity_order")
  import_data /opt/bdp/datax/job/import/offline.activity_order.json /offline_data_datax/db/activity_order/$do_date
  ;;
"cart_info")
  import_data /opt/bdp/datax/job/import/offline.cart_info.json /offline_data_datax/db/cart_info/$do_date
  ;;
"comment_info")
  import_data /opt/bdp/datax/job/import/offline.comment_info.json /offline_data_datax/db/comment_info/$do_date
  ;;
"coupon_info")
  import_data /opt/bdp/datax/job/import/offline.coupon_info.json /offline_data_datax/db/coupon_info/$do_date
  ;;
"coupon_use")
  import_data /opt/bdp/datax/job/import/offline.coupon_use.json /offline_data_datax/db/coupon_use/$do_date
  ;;
"favor_info")
  import_data /opt/bdp/datax/job/import/offline.favor_info.json /offline_data_datax/db/favor_info/$do_date
  ;;
"order_refund_info")
offline_data_datax
  ;;
"order_status_log")
  import_data /opt/bdp/datax/job/import/offline.order_status_log.json /offline_data_datax/db/order_status_log/$do_date
  ;;
"spu_info")
  import_data /opt/bdp/datax/job/import/offline.spu_info.json /offline_data_datax/db/spu_info/$do_date
  ;;
"activity_rule")
  import_data /opt/bdp/datax/job/import/offline.activity_rule.json /offline_data_datax/db/activity_rule/$do_date
  ;;
"base_dic")
  import_data /opt/bdp/datax/job/import/offline.base_dic.json /offline_data_datax/db/base_dic/$do_date
  ;;
"all")
  import_data /opt/bdp/datax/job/import/offline.order_info.json /offline_data_datax/db/order_info/$do_date
  import_data /opt/bdp/datax/job/import/offline.base_category1.json /offline_data_datax/db/base_category1/$do_date
  import_data /opt/bdp/datax/job/import/offline.base_category2.json /offline_data_datax/db/base_category2/$do_date
  import_data /opt/bdp/datax/job/import/offline.base_category3.json /offline_data_datax/db/base_category3/$do_date
  import_data /opt/bdp/datax/job/import/offline.order_detail.json /offline_data_datax/db/order_detail/$do_date
  import_data /opt/bdp/datax/job/import/offline.sku_info.json /offline_data_datax/db/sku_info/$do_date
  import_data /opt/bdp/datax/job/import/offline.user_info.json /offline_data_datax/db/user_info/$do_date
  import_data /opt/bdp/datax/job/import/offline.payment_info.json /offline_data_datax/db/payment_info/$do_date
  import_data /opt/bdp/datax/job/import/offline.base_province.json /offline_data_datax/db/base_province/$do_date
  import_data /opt/bdp/datax/job/import/offline.base_region.json /offline_data_datax/db/base_region/$do_date
  import_data /opt/bdp/datax/job/import/offline.base_trademark.json /offline_data_datax/db/base_trademark/$do_date
  import_data /opt/bdp/datax/job/import/offline.activity_info.json /offline_data_datax/db/activity_info/$do_date
  import_data /opt/bdp/datax/job/import/offline.activity_order.json /offline_data_datax/db/activity_order/$do_date
  import_data /opt/bdp/datax/job/import/offline.cart_info.json /offline_data_datax/db/cart_info/$do_date
  import_data /opt/bdp/datax/job/import/offline.comment_info.json /offline_data_datax/db/comment_info/$do_date
  import_data /opt/bdp/datax/job/import/offline.coupon_info.json /offline_data_datax/db/coupon_info/$do_date
  import_data /opt/bdp/datax/job/import/offline.coupon_use.json /offline_data_datax/db/coupon_use/$do_date
  import_data /opt/bdp/datax/job/import/offline.favor_info.json /offline_data_datax/db/favor_info/$do_date
  import_data /opt/bdp/datax/job/import/offline.order_refund_info.json /offline_data_datax/db/order_refund_info/$do_date
  import_data /opt/bdp/datax/job/import/offline.order_status_log.json /offline_data_datax/db/order_status_log/$do_date
  import_data /opt/bdp/datax/job/import/offline.spu_info.json /offline_data_datax/db/spu_info/$do_date
  import_data /opt/bdp/datax/job/import/offline.activity_rule.json /offline_data_datax/db/activity_rule/$do_date
  import_data /opt/bdp/datax/job/import/offline.base_dic.json /offline_data_datax/db/base_dic/$do_date
  ;;
esac
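
Assuming the script above is saved as mysql_to_hdfs.sh (again, a name chosen here), typical invocations would be:

bash mysql_to_hdfs.sh order_info 2023-06-14    # one table, explicit business date
bash mysql_to_hdfs.sh all                      # every table, defaulting to yesterday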

Of course, we could also use Sqoop instead. The shell script is below; note that the first branch (the initial load) includes the static base_province and base_region tables, while the daily all branch skips them:

#! /bin/bash

APP=offline
sqoop=/opt/anson/sqoop/bin/sqoop

if [ -n "$2" ] ;then
    do_date=$2
else
    do_date=`date -d '-1 day' +%F`
fi

import_data(){
$sqoop import \
--connect jdbc:mysql://hadoop101:3306/$APP \
--username root \
--password 123456 \
--target-dir /offline_data/$APP/db/$1/$do_date \
--delete-target-dir \
--query "$2 and  \$CONDITIONS" \
--num-mappers 1 \
--fields-terminated-by '\t' \
--compress \
--compression-codec lzop \
--null-string '\\N' \
--null-non-string '\\N'

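# lzop output is not splittable on its own; building an LZO index afterwards lets MapReduce jobs split the files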
hadoop jar /opt/bdp/hadoop-3.1.3/share/hadoop/common/hadoop-lzo-0.4.20.jar com.hadoop.compression.lzo.DistributedLzoIndexer /offline_data/$APP/db/$1/$do_date
}

import_order_info(){
  import_data order_info "select
                            id, 
                            final_total_amount, 
                            order_status, 
                            user_id, 
                            out_trade_no, 
                            create_time, 
                            operate_time,
                            province_id,
                            benefit_reduce_amount,
                            original_total_amount,
                            feight_fee      
                        from order_info
                        where (date_format(create_time,'%Y-%m-%d')='$do_date' 
                        or date_format(operate_time,'%Y-%m-%d')='$do_date')"
}

import_coupon_use(){
  import_data coupon_use "select
                          id,
                          coupon_id,
                          user_id,
                          order_id,
                          coupon_status,
                          get_time,
                          using_time,
                          used_time
                        from coupon_use
                        where (date_format(get_time,'%Y-%m-%d')='$do_date'
                        or date_format(using_time,'%Y-%m-%d')='$do_date'
                        or date_format(used_time,'%Y-%m-%d')='$do_date')"
}

import_order_status_log(){
  import_data order_status_log "select
                                  id,
                                  order_id,
                                  order_status,
                                  operate_time
                                from order_status_log
                                where date_format(operate_time,'%Y-%m-%d')='$do_date'"
}

import_activity_order(){
  import_data activity_order "select
                                id,
                                activity_id,
                                order_id,
                                create_time
                              from activity_order
                              where date_format(create_time,'%Y-%m-%d')='$do_date'"
}

import_user_info(){
  import_data "user_info" "select 
                            id,
                            name,
                            birthday,
                            gender,
                            email,
                            user_level, 
                            create_time,
                            operate_time
                          from user_info 
                          where (DATE_FORMAT(create_time,'%Y-%m-%d')='$do_date' 
                          or DATE_FORMAT(operate_time,'%Y-%m-%d')='$do_date')"
}

import_order_detail(){
  import_data order_detail "select 
                              od.id,
                              order_id, 
                              user_id, 
                              sku_id,
                              sku_name,
                              order_price,
                              sku_num, 
                              od.create_time,
                              source_type,
                              source_id  
                            from order_detail od
                            join order_info oi
                            on od.order_id=oi.id
                            where DATE_FORMAT(od.create_time,'%Y-%m-%d')='$do_date'"
}

import_payment_info(){
  import_data "payment_info"  "select 
                                id,  
                                out_trade_no, 
                                order_id, 
                                user_id, 
                                alipay_trade_no, 
                                total_amount,  
                                subject, 
                                payment_type, 
                                payment_time 
                              from payment_info 
                              where DATE_FORMAT(payment_time,'%Y-%m-%d')='$do_date'"
}

import_comment_info(){
  import_data comment_info "select
                              id,
                              user_id,
                              sku_id,
                              spu_id,
                              order_id,
                              appraise,
                              comment_txt,
                              create_time
                            from comment_info
                            where date_format(create_time,'%Y-%m-%d')='$do_date'"
}

import_order_refund_info(){
  import_data order_refund_info "select
                                id,
                                user_id,
                                order_id,
                                sku_id,
                                refund_type,
                                refund_num,
                                refund_amount,
                                refund_reason_type,
                                create_time
                              from order_refund_info
                              where date_format(create_time,'%Y-%m-%d')='$do_date'"
}

import_sku_info(){
  import_data sku_info "select 
                          id,
                          spu_id,
                          price,
                          sku_name,
                          sku_desc,
                          weight,
                          tm_id,
                          category3_id,
                          create_time
                        from sku_info where 1=1"
}

import_base_category1(){
  import_data "base_category1" "select 
                                  id,
                                  name 
                                from base_category1 where 1=1"
}

import_base_category2(){
  import_data "base_category2" "select
                                  id,
                                  name,
                                  category1_id 
                                from base_category2 where 1=1"
}

import_base_category3(){
  import_data "base_category3" "select
                                  id,
                                  name,
                                  category2_id
                                from base_category3 where 1=1"
}

import_base_province(){
  import_data base_province "select
                              id,
                              name,
                              region_id,
                              area_code,
                              iso_code
                            from base_province
                            where 1=1"
}

import_base_region(){
  import_data base_region "select
                              id,
                              region_name
                            from base_region
                            where 1=1"
}

import_base_trademark(){
  import_data base_trademark "select
                                tm_id,
                                tm_name
                              from base_trademark
                              where 1=1"
}

import_spu_info(){
  import_data spu_info "select
                            id,
                            spu_name,
                            category3_id,
                            tm_id
                          from spu_info
                          where 1=1"
}

import_favor_info(){
  import_data favor_info "select
                          id,
                          user_id,
                          sku_id,
                          spu_id,
                          is_cancel,
                          create_time,
                          cancel_time
                        from favor_info
                        where 1=1"
}

import_cart_info(){
  import_data cart_info "select
                        id,
                        user_id,
                        sku_id,
                        cart_price,
                        sku_num,
                        sku_name,
                        create_time,
                        operate_time,
                        is_ordered,
                        order_time,
                        source_type,
                        source_id
                      from cart_info
                      where 1=1"
}

import_coupon_info(){
  import_data coupon_info "select
                          id,
                          coupon_name,
                          coupon_type,
                          condition_amount,
                          condition_num,
                          activity_id,
                          benefit_amount,
                          benefit_discount,
                          create_time,
                          range_type,
                          spu_id,
                          tm_id,
                          category3_id,
                          limit_num,
                          operate_time,
                          expire_time
                        from coupon_info
                        where 1=1"
}

import_activity_info(){
  import_data activity_info "select
                              id,
                              activity_name,
                              activity_type,
                              start_time,
                              end_time,
                              create_time
                            from activity_info
                            where 1=1"
}

import_activity_rule(){
    import_data activity_rule "select
                                    id,
                                    activity_id,
                                    condition_amount,
                                    condition_num,
                                    benefit_amount,
                                    benefit_discount,
                                    benefit_level
                                from activity_rule
                                where 1=1"
}

import_base_dic(){
    import_data base_dic "select
                            dic_code,
                            dic_name,
                            parent_code,
                            create_time,
                            operate_time
                          from base_dic
                          where 1=1"
}

case $1 in
"order_info")
  import_order_info
  ;;
"base_category1")
  import_base_category1
  ;;
"base_category2")
  import_base_category2
  ;;
"base_category3")
  import_base_category3
  ;;
"order_detail")
  import_order_detail
  ;;
"sku_info")
  import_sku_info
  ;;
"user_info")
  import_user_info
  ;;
"payment_info")
  import_payment_info
  ;;
"base_province")
  import_base_province
  ;;
"base_region")
  import_base_region
  ;;
"base_trademark")
  import_base_trademark
  ;;
"activity_info")
  import_activity_info
  ;;
"activity_order")
  import_activity_order
  ;;
"cart_info")
  import_cart_info
  ;;
"comment_info")
  import_comment_info
  ;;
"coupon_info")
  import_coupon_info
  ;;
"coupon_use")
  import_coupon_use
  ;;
"favor_info")
  import_favor_info
  ;;
"order_refund_info")
  import_order_refund_info
  ;;
"order_status_log")
  import_order_status_log
  ;;
"spu_info")
  import_spu_info
  ;;
"activity_rule")
  import_activity_rule
  ;;
"base_dic")
  import_base_dic
  ;;
"first")
  import_base_category1
  import_base_category2
  import_base_category3
  import_order_info
  import_order_detail
  import_sku_info
  import_user_info
  import_payment_info
  import_base_province
  import_base_region
  import_base_trademark
  import_activity_info
  import_activity_order
  import_cart_info
  import_comment_info
  import_coupon_use
  import_coupon_info
  import_favor_info
  import_order_refund_info
  import_order_status_log
  import_spu_info
  import_activity_rule
  import_base_dic
  ;;
"all")
  import_base_category1
  import_base_category2
  import_base_category3
  import_order_info
  import_order_detail
  import_sku_info
  import_user_info
  import_payment_info
  import_base_trademark
  import_activity_info
  import_activity_order
  import_cart_info
  import_comment_info
  import_coupon_use
  import_coupon_info
  import_favor_info
  import_order_refund_info
  import_order_status_log
  import_spu_info
  import_activity_rule
  import_base_dic
  ;;
esac
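
Assuming this version is saved as mysql_to_hdfs_sqoop.sh (another illustrative name), usage mirrors the DataX script:

bash mysql_to_hdfs_sqoop.sh first 2023-06-14       # initial load, including base_province and base_region
bash mysql_to_hdfs_sqoop.sh all                    # daily load of the changing tables
bash mysql_to_hdfs_sqoop.sh order_info             # a single table, defaulting to yesterday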


And that is essentially how the business data gets collected into HDFS.
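
A quick sanity check after a run is to list the day's target directory and eyeball the file sizes, for example:

hadoop fs -ls /offline_data_datax/db/order_info/2023-06-14        # DataX output
hadoop fs -du -h /offline_data/offline/db/order_info/2023-06-14   # Sqoop output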
