1. While pulling data from MongoDB, I found that MongoDB stores collection after collection with no fixed schema — some documents carry a few dozen fields, others only a handful. If the DataX JSON is configured naively against such data, fields get matched to the wrong columns and you see dirty or misaligned reads. Below are the solutions I used; please bear with any shortcomings.
For collections whose fields are relatively fixed, I take the MongoDB -> DataX -> Hive route.
JSON configuration:
{
"job": {
"setting": {
"speed": {
"channel": 2
}
},
"content": [{
"reader": {
"name": "mongodbreader",
"parameter": {
"address": ["地址"],
"userName": "用户名",
"userPassword": "密码",
"dbName": "库名",
"collectionName": "集合名",
"query":"{created:{ $gte: ISODate('$byday}T16:00:00.000Z'), $lte: ISODate('$yesterday}T16:00:00.000Z') }}",
"column": [{
"index":0,
"name": "_id",
"type": "string"
}, {
"index":1,
"name": "owner",
"type": "string"
}, {
"index":2,
"name": "contributor",
"type": "string"
}, {
"index":3,
"name": "type",
"type": "string"
}, {
"index":4,
"name": "amount",
"type": "int"
}, {
"index":5,
"name": "divided",
"type": "double"
}, {
"index":6,
"name": "orderId",
"type": "string"
}, {
"index":7,
"name": "orderPrice",
"type": "int"
}, {
"index":8,
"name": "created",
"type": "date"
}, {
"index":9,
"name": "updated",
"type": "date"
}]
}
},
"writer": {
"name": "hdfswriter",
"parameter": {
"defaultFS": "hdfs://ip",
"fileType": "text",
"path": "/user/hive/warehouse/aries.db/ods_goldsystem_mdaccountitems/accounting_day=$dt",
"fileName": "zhenghaoReader",
"column": [{
"index":0,
"name": "id",
"type": "string"
}, {
"index":1,
"name": "owner",
"type": "string"
}, {
"index":2,
"name": "contributor",
"type": "string"
}, {
"index":3,
"name": "type",
"type": "string"
}, {
"index":4,
"name": "amount",
"type": "int"
}, {
"index":5,
"name": "divided",
"type": "double"
}, {
"index":6,
"name": "orderId",
"type": "string"
}, {
"index":7,
"name": "orderPrice",
"type": "int"
}, {
"index":8,
"name": "created",
"type": "string"
}, {
"index":9,
"name": "updated",
"type": "string"
}],
"writeMode": "append",
"fieldDelimiter": "\t"
}
}
}]
}
}
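The hdfswriter above only drops tab-delimited text files under the warehouse path, so a Hive table with a matching layout must already exist. A minimal DDL sketch (my reconstruction, not from the original job: the database, table, and partition names are read off the writer path, the columns off the writer's column list):

CREATE TABLE IF NOT EXISTS aries.ods_goldsystem_mdaccountitems (
    id           string,
    owner        string,
    contributor  string,
    `type`       string,   -- backticked in case it clashes with a keyword
    amount       int,
    divided      double,
    orderId      string,
    orderPrice   int,
    created      string,
    updated      string
)
PARTITIONED BY (accounting_day string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'  -- matches fieldDelimiter "\t"
STORED AS TEXTFILE;                             -- matches fileType "text"

Keep in mind that DataX only writes the files; the daily partition still has to be registered in the metastore afterwards (ALTER TABLE ... ADD PARTITION or MSCK REPAIR TABLE).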
Note:
1. index is what drives the matching between reader and writer columns; the two lists must correspond one to one.
2. "query" is the MongoDB query. MongoDB stores timestamps as ISODate in UTC, so to cover a full day of Beijing time (UTC+8) the boundaries are written as 16:00:00 UTC, which is 00:00:00 local time of the following day.
Python script: driving DataX from a Python script that passes the parameters in not only solves the parameter-format problem, it also speeds up the daily pulls.
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import sys
import subprocess
import datetime

now = datetime.datetime.now()

def getYesterday():
    # yesterday: the day whose data is being loaded
    return now - datetime.timedelta(days=1)

def getByday():
    # the day before yesterday: lower bound of the ISODate window
    return now - datetime.timedelta(days=2)

def buildStartCommand(data):
    byday = getByday().strftime('%Y-%m-%d')
    yesterday = getYesterday().strftime('%Y-%m-%d')
    dt = getYesterday().strftime('%Y%m%d')
    # feed byday/yesterday/dt into the job json via DataX's -p "-Dkey=value" substitution
    shCommand = 'python /data/datax/bin/datax.py -p "-Dbyday=%s -Dyesterday=%s -Ddt=%s" /data/datax/zhenghao/%s.json'
    finalCommand = shCommand % (byday, yesterday, dt, data)
    print(finalCommand)
    return finalCommand

def printCopyright():
    print('''
DataYJ , From Yuanju !
Copyright (C) 2010-2017, YuanJu Group. All Rights Reserved.
''')
    sys.stdout.flush()

if __name__ == "__main__":
    printCopyright()
    arr = ["table1", "table2", "table3"]
    rc = 0
    for data in arr:
        startCommand = buildStartCommand(data)
        child_process = subprocess.Popen(startCommand, shell=True)
        child_process.communicate()
        # remember a failure but keep loading the remaining tables
        if child_process.returncode != 0:
            rc = child_process.returncode
    sys.exit(rc)
This is the one-click launcher I start; three tables are configured in it.
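For daily runs the wrapper can simply go into crontab; the script and log paths below are made up for illustration:

# hypothetical schedule: 01:00 every day, after the previous day's data has settled
0 1 * * * python /data/datax/zhenghao/pull_mongo.py >> /data/datax/zhenghao/pull_mongo.log 2>&1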
2. Sometimes a MongoDB collection has many fields and no fixed schema. For those, writing through HBase solves the problem: pulling such a collection straight into Hive produces rows where a missing field lets the next field's value shift forward into its place.
JSON configuration for the HBase writer:
{
"job": {
"setting": {
"speed": {
"channel": 2
}
},
"content": [{
"reader": {
"name": "mongodbreader",
"parameter": {
"address": ["ip"],
"userName": "用户名",
"userPassword": "密码",
"dbName": "库名",
"collectionName": "集合名",
"query":"{ $or: [ {created:{ $gte: ISODate('$byday}T16:00:00Z') }}, { bindRewardUpdated : $dt } ] }",
"column": [{
"index":0,
"name": "_id",
"type": "string"
}, {
"index":1,
"name": "deviceId",
"type": "string"
}, {
"index":2,
"name": "promoter",
"type": "string"
}, {
"index":3,
"name": "user",
"type": "string"
}, {
"index":4,
"name": "way",
"type": "string"
}, {
"index":5,
"name": "bindMethod",
"type": "string"
}, {
"index":6,
"name": "bindRewardGold",
"type": "long"
}, {
"index":7,
"name": "created",
"type": "date"
}, {
"index":8,
"name": "bindRewardUpdated",
"type": "long"
}]
}
},
"writer": {
"name": "hbase11xwriter",
"parameter": {
"hbaseConfig": {
"hbase.zookeeper.property.clientPort":"2181",
"hbase.zookeeper.quorum":"dw08.com,dw09.com,dw02.com,dw03.com,dw04.com"
},
"table": "ARIES:userpromoters",
"mode": "normal",
"encoding": "utf-8",
"nullMode":"empty",
"rowkeyColumn": [
{
"index":-2,
"type":"string",
"value":"|"
},
{
"index":0,
"type":"string"
}
],
"column": [{
"index":0,
"name": "attr:id",
"type": "string"
}, {
"index":1,
"name": "attr:deviceId",
"type": "string"
}, {
"index":2,
"name": "attr:promoter",
"type": "string"
}, {
"index":3,
"name": "attr:user",
"type": "string"
}, {
"index":4,
"name": "attr:way",
"type": "string"
}, {
"index":5,
"name": "attr:bindMethod",
"type": "string"
}, {
"index":6,
"name": "attr:bindRewardGold",
"type": "string"
}, {
"index":7,
"name": "attr:created",
"type": "string"
}, {
"index":8,
"name": "attr:bindRewardUpdated",
"type": "string"
}]
}
}
}]
}
}
Note the rowkeyColumn block:
"rowkeyColumn": [
{
"index":-2,
"type":"string",
"value":"|"
},
{
"index":0,
"type":"string"
}
]
The -2 here comes from a small change we made to the DataX source: it injects a random number, so the rowkey takes the form randomNumber|index0, which gives every row an effectively unique identifier. If you want to shape the rowkey differently, see the official hbase11xwriter documentation for the index: -1 (constant value) configuration.
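The patched writer source is not included in this post, but as a rough Python illustration of what the resulting keys look like (the prefix logic here is my guess from the description above, not the actual patch):

import random

def build_rowkey(mongo_id):
    # "randomNumber|index0": the random prefix makes each key unique and,
    # as a side effect, spreads writes across regions instead of hot-spotting.
    return str(random.randint(0, 99999)) + "|" + mongo_id

print(build_rowkey("5936d3c06ff47f7c58f063ad"))
# e.g. 48213|5936d3c06ff47f7c58f063ad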
Once the Hive table mapped onto this HBase table has been created, the data can be queried from Hive and the job is done.
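For reference, that association table might look something like this (a sketch only — the HBase table and column names come from the writer config above, everything else is my assumption):

CREATE EXTERNAL TABLE aries.userpromoters (
    rowkey             string,
    id                 string,
    deviceId           string,
    promoter           string,
    user_              string,  -- "user" can clash with a reserved word in newer Hive
    way                string,
    bindMethod         string,
    bindRewardGold     string,
    created            string,
    bindRewardUpdated  string
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
    "hbase.columns.mapping" = ":key,attr:id,attr:deviceId,attr:promoter,attr:user,attr:way,attr:bindMethod,attr:bindRewardGold,attr:created,attr:bindRewardUpdated"
)
TBLPROPERTIES ("hbase.table.name" = "ARIES:userpromoters");

The mapping is positional, so the Hive column names do not have to match the HBase qualifiers; only the order and count matter.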