Without further ado, here is the code (it was copied from elsewhere and then modified, so it contains some redundancy, but that does not affect how it runs). It generates the DataX job configurations used to import data from MySQL into HDFS:
#coding=utf-8
import os
import sys
import getopt
import json
import pymysql
pymysql.install_as_MySQLdb()
# MySQL connection settings; adjust to your environment
mysql_host = "hadoop101"
mysql_port = "3306"
mysql_user = "root"
mysql_passwd = "123456"
# HDFS NameNode settings; adjust to your environment
hdfs_nn_host = "hadoop101"
hdfs_nn_port = "8020"
# Target directory for the generated config files; adjust as needed
output_path = "/opt/anson/datax/job/import"
def get_connection():
    return pymysql.connect(host=mysql_host, port=int(mysql_port), user=mysql_user, passwd=mysql_passwd)

def get_mysql_meta(database, table):
    connection = get_connection()
    cursor = connection.cursor()
    sql = "SELECT COLUMN_NAME,DATA_TYPE from information_schema.COLUMNS WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s ORDER BY ORDINAL_POSITION"
    cursor.execute(sql, [database, table])
    fetchall = cursor.fetchall()
    cursor.close()
    connection.close()
    return fetchall
def get_mysql_columns(database, table):
    return list(map(lambda x: x[0], get_mysql_meta(database, table)))
def get_hive_columns(database, table):
    def type_mapping(mysql_type):
        mappings = {
            "bigint": "bigint",
            "int": "bigint",
            "smallint": "bigint",
            "tinyint": "bigint",
            "decimal": "string",
            "double": "double",
            "float": "float",
            "binary": "string",
            "char": "string",
            "varchar": "string",
            "datetime": "string",
            "time": "string",
            "timestamp": "string",
            "date": "string",
            "text": "string"
        }
        return mappings[mysql_type]
    meta = get_mysql_meta(database, table)
    return list(map(lambda x: {"name": x[0], "type": type_mapping(x[1].lower())}, meta))
def generate_json(source_database, source_table):
    job = {
        "job": {
            "setting": {
                "speed": {
                    "channel": 3
                },
                "errorLimit": {
                    "record": 0,
                    "percentage": 0.02
                }
            },
            "content": [{
                "reader": {
                    "name": "mysqlreader",
                    "parameter": {
                        "username": mysql_user,
                        "password": mysql_passwd,
                        "column": get_mysql_columns(source_database, source_table),
                        "splitPk": "",
                        "connection": [{
                            "table": [source_table],
                            "jdbcUrl": ["jdbc:mysql://" + mysql_host + ":" + mysql_port + "/" + source_database + "?useSSL=false"]
                        }]
                    }
                },
                "writer": {
                    "name": "hdfswriter",
                    "parameter": {
                        "defaultFS": "hdfs://" + hdfs_nn_host + ":" + hdfs_nn_port,
                        "fileType": "text",
                        "path": "${targetdir}",
                        "fileName": source_table,
                        "column": get_hive_columns(source_database, source_table),
                        "writeMode": "append",
                        "fieldDelimiter": "\t",
                        "compress": "gzip"
                    }
                }
            }]
        }
    }
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    with open(os.path.join(output_path, ".".join([source_database, source_table, "json"])), "w") as f:
        json.dump(job, f)
def get_tablename(source_database):
    connection = get_connection()
    cursor = connection.cursor()
    sql = "SELECT distinct TABLE_NAME from information_schema.COLUMNS WHERE TABLE_SCHEMA=%s"
    cursor.execute(sql, [source_database])
    fetchall = cursor.fetchall()
    cursor.close()
    connection.close()
    return fetchall
def main(args):
    # Default database; DataX JSON configs are generated for all of its tables
    source_database = "offline"
    # To generate the config for a single table only, uncomment the line below and the
    # generate_json(source_database, source_table) call further down, and comment out
    # the get_tablename loop at the end of this function
    #source_table = ""
    options, arguments = getopt.getopt(args, '-d:-t:', ['sourcedb=', 'sourcetbl='])
    for opt_name, opt_value in options:
        if opt_name in ('-d', '--sourcedb'):
            source_database = opt_value
        if opt_name in ('-t', '--sourcetbl'):
            source_table = opt_value
    # generate_json(source_database, source_table)
    # Generate the DataX JSON config for every table of the chosen database in one go
    res = get_tablename(source_database)
    for res1 in res:
        generate_json(source_database, res1[0])

if __name__ == '__main__':
    main(sys.argv[1:])
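Running the script (for example python gen_import_config.py -d offline, where the file name is just an example) writes one config per table into output_path, named <database>.<table>.json. Note that type_mapping raises a KeyError for any MySQL type not listed in the dictionary (for example mediumtext or json), so extend the mapping if your schema uses other types. Below is a minimal, optional sanity-check sketch for one generated file, assuming a table named user_info; it confirms the ${targetdir} placeholder that the shell script further down fills in through DataX's -p" -Dtargetdir=..." parameter:

# Optional sanity check (a sketch; offline.user_info.json is just an example file name):
# load one generated config and confirm the writer still points at the ${targetdir}
# placeholder, which datax.py later replaces via -p" -Dtargetdir=...".
import json
import os

output_path = "/opt/anson/datax/job/import"  # same directory as in the generator above
cfg_file = os.path.join(output_path, "offline.user_info.json")

with open(cfg_file) as f:
    cfg = json.load(f)

writer = cfg["job"]["content"][0]["writer"]["parameter"]
assert writer["path"] == "${targetdir}"
print(writer["fileName"], [c["name"] + ":" + c["type"] for c in writer["column"]])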
Next comes the shell script that uses DataX to collect the data from MySQL into HDFS (the following is a demo from an offline data warehouse project):
#!/bin/bash
DATAX_HOME=/opt/bdp/datax

# If a date is passed in, do_date takes that date; otherwise it defaults to the previous day
if [ -n "$2" ] ;then
    do_date=$2
else
    do_date=`date -d "-1 day" +%F`
fi

# Handle the target path: create it if it does not exist; if it exists, empty it,
# so that the sync job can be re-run safely
handle_targetdir(){
    hadoop fs -test -e $1
    if [[ $? -eq 1 ]]; then
        echo "Path $1 does not exist, creating it......"
        hadoop fs -mkdir -p $1
    else
        echo "Path $1 already exists"
        fs_count=$(hadoop fs -count $1)
        content_size=$(echo $fs_count | awk '{print $3}')
        if [[ $content_size -eq 0 ]]; then
            echo "Path $1 is empty"
        else
            echo "Path $1 is not empty, emptying it......"
            hadoop fs -rm -r -f $1/*
        fi
    fi
}

import_data(){
    datax_config=$1
    target_dir=$2
    handle_targetdir $target_dir
    python2 $DATAX_HOME/bin/datax.py -p" -Dtargetdir=$target_dir" $datax_config
}

case $1 in
"order_info")
    import_data /opt/bdp/datax/job/import/offline.order_info.json /offline_data_datax/db/order_info/$do_date ;;
"base_category1")
    import_data /opt/bdp/datax/job/import/offline.base_category1.json /offline_data_datax/db/base_category1/$do_date ;;
"base_category2")
    import_data /opt/bdp/datax/job/import/offline.base_category2.json /offline_data_datax/db/base_category2/$do_date ;;
"base_category3")
    import_data /opt/bdp/datax/job/import/offline.base_category3.json /offline_data_datax/db/base_category3/$do_date ;;
"order_detail")
    import_data /opt/bdp/datax/job/import/offline.order_detail.json /offline_data_datax/db/order_detail/$do_date ;;
"sku_info")
    import_data /opt/bdp/datax/job/import/offline.sku_info.json /offline_data_datax/db/sku_info/$do_date ;;
"user_info")
    import_data /opt/bdp/datax/job/import/offline.user_info.json /offline_data_datax/db/user_info/$do_date ;;
"payment_info")
    import_data /opt/bdp/datax/job/import/offline.payment_info.json /offline_data_datax/db/payment_info/$do_date ;;
"base_province")
    import_data /opt/bdp/datax/job/import/offline.base_province.json /offline_data_datax/db/base_province/$do_date ;;
"base_region")
    import_data /opt/bdp/datax/job/import/offline.base_region.json /offline_data_datax/db/base_region/$do_date ;;
"base_trademark")
    import_data /opt/bdp/datax/job/import/offline.base_trademark.json /offline_data_datax/db/base_trademark/$do_date ;;
"activity_info")
    import_data /opt/bdp/datax/job/import/offline.activity_info.json /offline_data_datax/db/activity_info/$do_date ;;
"activity_order")
    import_data /opt/bdp/datax/job/import/offline.activity_order.json /offline_data_datax/db/activity_order/$do_date ;;
"cart_info")
    import_data /opt/bdp/datax/job/import/offline.cart_info.json /offline_data_datax/db/cart_info/$do_date ;;
"comment_info")
    import_data /opt/bdp/datax/job/import/offline.comment_info.json /offline_data_datax/db/comment_info/$do_date ;;
"coupon_info")
    import_data /opt/bdp/datax/job/import/offline.coupon_info.json /offline_data_datax/db/coupon_info/$do_date ;;
"coupon_use")
    import_data /opt/bdp/datax/job/import/offline.coupon_use.json /offline_data_datax/db/coupon_use/$do_date ;;
"favor_info")
    import_data /opt/bdp/datax/job/import/offline.favor_info.json /offline_data_datax/db/favor_info/$do_date ;;
"order_refund_info")
    import_data /opt/bdp/datax/job/import/offline.order_refund_info.json /offline_data_datax/db/order_refund_info/$do_date ;;
"order_status_log")
    import_data /opt/bdp/datax/job/import/offline.order_status_log.json /offline_data_datax/db/order_status_log/$do_date ;;
"spu_info")
    import_data /opt/bdp/datax/job/import/offline.spu_info.json /offline_data_datax/db/spu_info/$do_date ;;
"activity_rule")
    import_data /opt/bdp/datax/job/import/offline.activity_rule.json /offline_data_datax/db/activity_rule/$do_date ;;
"base_dic")
    import_data /opt/bdp/datax/job/import/offline.base_dic.json /offline_data_datax/db/base_dic/$do_date ;;
"all")
    import_data /opt/bdp/datax/job/import/offline.order_info.json /offline_data_datax/db/order_info/$do_date
    import_data /opt/bdp/datax/job/import/offline.base_category1.json /offline_data_datax/db/base_category1/$do_date
    import_data /opt/bdp/datax/job/import/offline.base_category2.json /offline_data_datax/db/base_category2/$do_date
    import_data /opt/bdp/datax/job/import/offline.base_category3.json /offline_data_datax/db/base_category3/$do_date
    import_data /opt/bdp/datax/job/import/offline.order_detail.json /offline_data_datax/db/order_detail/$do_date
    import_data /opt/bdp/datax/job/import/offline.sku_info.json /offline_data_datax/db/sku_info/$do_date
    import_data /opt/bdp/datax/job/import/offline.user_info.json /offline_data_datax/db/user_info/$do_date
    import_data /opt/bdp/datax/job/import/offline.payment_info.json /offline_data_datax/db/payment_info/$do_date
    import_data /opt/bdp/datax/job/import/offline.base_province.json /offline_data_datax/db/base_province/$do_date
    import_data /opt/bdp/datax/job/import/offline.base_region.json /offline_data_datax/db/base_region/$do_date
    import_data /opt/bdp/datax/job/import/offline.base_trademark.json /offline_data_datax/db/base_trademark/$do_date
    import_data /opt/bdp/datax/job/import/offline.activity_info.json /offline_data_datax/db/activity_info/$do_date
    import_data /opt/bdp/datax/job/import/offline.activity_order.json /offline_data_datax/db/activity_order/$do_date
    import_data /opt/bdp/datax/job/import/offline.cart_info.json /offline_data_datax/db/cart_info/$do_date
    import_data /opt/bdp/datax/job/import/offline.comment_info.json /offline_data_datax/db/comment_info/$do_date
    import_data /opt/bdp/datax/job/import/offline.coupon_info.json /offline_data_datax/db/coupon_info/$do_date
    import_data /opt/bdp/datax/job/import/offline.coupon_use.json /offline_data_datax/db/coupon_use/$do_date
    import_data /opt/bdp/datax/job/import/offline.favor_info.json /offline_data_datax/db/favor_info/$do_date
    import_data /opt/bdp/datax/job/import/offline.order_refund_info.json /offline_data_datax/db/order_refund_info/$do_date
    import_data /opt/bdp/datax/job/import/offline.order_status_log.json /offline_data_datax/db/order_status_log/$do_date
    import_data /opt/bdp/datax/job/import/offline.spu_info.json /offline_data_datax/db/spu_info/$do_date
    import_data /opt/bdp/datax/job/import/offline.activity_rule.json /offline_data_datax/db/activity_rule/$do_date
    import_data /opt/bdp/datax/job/import/offline.base_dic.json /offline_data_datax/db/base_dic/$do_date ;;
esac
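The script takes a table name (or all) as its first argument and an optional date as its second; with no date it falls back to yesterday. If you would rather not maintain the long case statement, here is a rough Python sketch of the same "all" behaviour (not part of the original project), assuming the configs generated above; the handle_targetdir create/empty step is left out:

# A rough Python alternative to the "all" branch above (a sketch): loop over the
# generated configs and launch datax.py for each, passing the dated target
# directory through -p" -Dtargetdir=...".
# Note: the handle_targetdir create/empty step from the shell script is omitted here.
import datetime
import os
import subprocess

DATAX_HOME = "/opt/bdp/datax"
JOB_DIR = "/opt/bdp/datax/job/import"
do_date = (datetime.date.today() - datetime.timedelta(days=1)).isoformat()  # default: yesterday

for cfg in sorted(os.listdir(JOB_DIR)):
    if not cfg.endswith(".json"):
        continue
    table = cfg.split(".")[1]  # configs are named <database>.<table>.json
    target_dir = "/offline_data_datax/db/%s/%s" % (table, do_date)
    subprocess.check_call([
        "python2", os.path.join(DATAX_HOME, "bin", "datax.py"),
        "-p", " -Dtargetdir=%s" % target_dir,
        os.path.join(JOB_DIR, cfg),
    ])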
Of course, we can also use Sqoop instead; the shell script is as follows:
#! /bin/bash
APP=offline
sqoop=/opt/anson/sqoop/bin/sqoop

if [ -n "$2" ] ;then
    do_date=$2
else
    do_date=`date -d '-1 day' +%F`
fi

import_data(){
$sqoop import \
--connect jdbc:mysql://hadoop101:3306/$APP \
--username root \
--password 123456 \
--target-dir /offline_data/$APP/db/$1/$do_date \
--delete-target-dir \
--query "$2 and \$CONDITIONS" \
--num-mappers 1 \
--fields-terminated-by '\t' \
--compress \
--compression-codec lzop \
--null-string '\\N' \
--null-non-string '\\N'

hadoop jar /opt/bdp/hadoop-3.1.3/share/hadoop/common/hadoop-lzo-0.4.20.jar com.hadoop.compression.lzo.DistributedLzoIndexer /offline_data/$APP/db/$1/$do_date
}

import_order_info(){
  import_data order_info "select id, final_total_amount, order_status, user_id, out_trade_no, create_time, operate_time, province_id, benefit_reduce_amount, original_total_amount, feight_fee from order_info where (date_format(create_time,'%Y-%m-%d')='$do_date' or date_format(operate_time,'%Y-%m-%d')='$do_date')"
}

import_coupon_use(){
  import_data coupon_use "select id, coupon_id, user_id, order_id, coupon_status, get_time, using_time, used_time from coupon_use where (date_format(get_time,'%Y-%m-%d')='$do_date' or date_format(using_time,'%Y-%m-%d')='$do_date' or date_format(used_time,'%Y-%m-%d')='$do_date')"
}

import_order_status_log(){
  import_data order_status_log "select id, order_id, order_status, operate_time from order_status_log where date_format(operate_time,'%Y-%m-%d')='$do_date'"
}

import_activity_order(){
  import_data activity_order "select id, activity_id, order_id, create_time from activity_order where date_format(create_time,'%Y-%m-%d')='$do_date'"
}

import_user_info(){
  import_data "user_info" "select id, name, birthday, gender, email, user_level, create_time, operate_time from user_info where (DATE_FORMAT(create_time,'%Y-%m-%d')='$do_date' or DATE_FORMAT(operate_time,'%Y-%m-%d')='$do_date')"
}

import_order_detail(){
  import_data order_detail "select od.id, order_id, user_id, sku_id, sku_name, order_price, sku_num, od.create_time, source_type, source_id from order_detail od join order_info oi on od.order_id=oi.id where DATE_FORMAT(od.create_time,'%Y-%m-%d')='$do_date'"
}

import_payment_info(){
  import_data "payment_info" "select id, out_trade_no, order_id, user_id, alipay_trade_no, total_amount, subject, payment_type, payment_time from payment_info where DATE_FORMAT(payment_time,'%Y-%m-%d')='$do_date'"
}

import_comment_info(){
  import_data comment_info "select id, user_id, sku_id, spu_id, order_id, appraise, comment_txt, create_time from comment_info where date_format(create_time,'%Y-%m-%d')='$do_date'"
}

import_order_refund_info(){
  import_data order_refund_info "select id, user_id, order_id, sku_id, refund_type, refund_num, refund_amount, refund_reason_type, create_time from order_refund_info where date_format(create_time,'%Y-%m-%d')='$do_date'"
}

import_sku_info(){
  import_data sku_info "select id, spu_id, price, sku_name, sku_desc, weight, tm_id, category3_id, create_time from sku_info where 1=1"
}

import_base_category1(){
  import_data "base_category1" "select id, name from base_category1 where 1=1"
}

import_base_category2(){
  import_data "base_category2" "select id, name, category1_id from base_category2 where 1=1"
}

import_base_category3(){
  import_data "base_category3" "select id, name, category2_id from base_category3 where 1=1"
}

import_base_province(){
  import_data base_province "select id, name, region_id, area_code, iso_code from base_province where 1=1"
}

import_base_region(){
  import_data base_region "select id, region_name from base_region where 1=1"
}

import_base_trademark(){
  import_data base_trademark "select tm_id, tm_name from base_trademark where 1=1"
}

import_spu_info(){
  import_data spu_info "select id, spu_name, category3_id, tm_id from spu_info where 1=1"
}

import_favor_info(){
  import_data favor_info "select id, user_id, sku_id, spu_id, is_cancel, create_time, cancel_time from favor_info where 1=1"
}

import_cart_info(){
  import_data cart_info "select id, user_id, sku_id, cart_price, sku_num, sku_name, create_time, operate_time, is_ordered, order_time, source_type, source_id from cart_info where 1=1"
}

import_coupon_info(){
  import_data coupon_info "select id, coupon_name, coupon_type, condition_amount, condition_num, activity_id, benefit_amount, benefit_discount, create_time, range_type, spu_id, tm_id, category3_id, limit_num, operate_time, expire_time from coupon_info where 1=1"
}

import_activity_info(){
  import_data activity_info "select id, activity_name, activity_type, start_time, end_time, create_time from activity_info where 1=1"
}

import_activity_rule(){
  import_data activity_rule "select id, activity_id, condition_amount, condition_num, benefit_amount, benefit_discount, benefit_level from activity_rule where 1=1"
}

import_base_dic(){
  import_data base_dic "select dic_code, dic_name, parent_code, create_time, operate_time from base_dic where 1=1"
}

case $1 in
"order_info") import_order_info ;;
"base_category1") import_base_category1 ;;
"base_category2") import_base_category2 ;;
"base_category3") import_base_category3 ;;
"order_detail") import_order_detail ;;
"sku_info") import_sku_info ;;
"user_info") import_user_info ;;
"payment_info") import_payment_info ;;
"base_province") import_base_province ;;
"base_region") import_base_region ;;
"base_trademark") import_base_trademark ;;
"activity_info") import_activity_info ;;
"activity_order") import_activity_order ;;
"cart_info") import_cart_info ;;
"comment_info") import_comment_info ;;
"coupon_info") import_coupon_info ;;
"coupon_use") import_coupon_use ;;
"favor_info") import_favor_info ;;
"order_refund_info") import_order_refund_info ;;
"order_status_log") import_order_status_log ;;
"spu_info") import_spu_info ;;
"activity_rule") import_activity_rule ;;
"base_dic") import_base_dic ;;
"first")
    import_base_category1
    import_base_category2
    import_base_category3
    import_order_info
    import_order_detail
    import_sku_info
    import_user_info
    import_payment_info
    import_base_province
    import_base_region
    import_base_trademark
    import_activity_info
    import_activity_order
    import_cart_info
    import_comment_info
    import_coupon_use
    import_coupon_info
    import_favor_info
    import_order_refund_info
    import_order_status_log
    import_spu_info
    import_activity_rule
    import_base_dic ;;
"all")
    import_base_category1
    import_base_category2
    import_base_category3
    import_order_info
    import_order_detail
    import_sku_info
    import_user_info
    import_payment_info
    import_base_trademark
    import_activity_info
    import_activity_order
    import_cart_info
    import_comment_info
    import_coupon_use
    import_coupon_info
    import_favor_info
    import_order_refund_info
    import_order_status_log
    import_spu_info
    import_activity_rule
    import_base_dic ;;
esac
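Two details worth calling out: fact tables (order_info, order_detail, payment_info, and so on) are filtered on $do_date so only one day of data is pulled, while dimension tables use where 1=1 and are re-imported in full on every run; and the $CONDITIONS token must appear in every free-form --query, because Sqoop substitutes its own split predicate there. The "first" branch differs from "all" only in that it also loads base_province and base_region, which are typically loaded just once since they rarely change. For comparison, here is a rough sketch of driving one of these imports from Python, reusing the same connection settings and flags as the script (the DistributedLzoIndexer step is left out):

# A sketch of one Sqoop invocation driven from Python instead of bash; the paths,
# credentials, and flags mirror the script above, and the DistributedLzoIndexer
# step is omitted.
import datetime
import subprocess

SQOOP = "/opt/anson/sqoop/bin/sqoop"
APP = "offline"
do_date = (datetime.date.today() - datetime.timedelta(days=1)).isoformat()

def import_table(table, query):
    # $CONDITIONS is mandatory in a free-form --query; Sqoop replaces it with its
    # own split predicate at run time.
    subprocess.check_call([
        SQOOP, "import",
        "--connect", "jdbc:mysql://hadoop101:3306/%s" % APP,
        "--username", "root",
        "--password", "123456",
        "--target-dir", "/offline_data/%s/db/%s/%s" % (APP, table, do_date),
        "--delete-target-dir",
        "--query", query + " and $CONDITIONS",
        "--num-mappers", "1",
        "--fields-terminated-by", r"\t",   # same literal \t the shell passes
        "--compress",
        "--compression-codec", "lzop",
        "--null-string", r"\\N",
        "--null-non-string", r"\\N",
    ])

# full pull of a dimension table, exactly like import_base_region above
import_table("base_region", "select id, region_name from base_region where 1=1")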
And that is basically how the business data gets collected into HDFS.