下载源码
svn checkout http://code.taobao.org/svn/datax/trunk
环境
root@datanode158:~# java -version
java version "1.7.0_45"
root@datanode158:~# python -V
Python 2.7.3
root@datanode158:~# ant -version
Apache Ant(TM) version 1.8.2 compiled on December 3 2011
root@datanode158:~# g++ --version
g++ (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3
root@datanode158:~# rpm --version
RPM version 4.9.1.1
root@datanode158:~# dos2unix -V
dos2unix 5.3.1 (2011-08-09)
With native language support.
LOCALEDIR: /usr/share/locale
步骤:
1、进入datax的rpm目录:/datax/rpm
在root下运行:rpmbuild -ba t_dp_datax_engine.spec
会出现一堆FileNotFound的问题
RPM build errors:
File not found: /root/rpmbuild/BUILDROOT/t_dp_datax_engine-1.0.0-1.x86_64/home/taobao/datax/bin
File not found: /root/rpmbuild/BUILDROOT/t_dp_datax_engine-1.0.0-1.x86_64/home/taobao/datax/conf
File not found: /root/rpmbuild/BUILDROOT/t_dp_datax_engine-1.0.0-1.x86_64/home/taobao/datax/engine
File not found: /root/rpmbuild/BUILDROOT/t_dp_datax_engine-1.0.0-1.x86_64/home/taobao/datax/common
File not found: /root/rpmbuild/BUILDROOT/t_dp_datax_engine-1.0.0-1.x86_64/home/taobao/datax/libs
File not found: /root/rpmbuild/BUILDROOT/t_dp_datax_engine-1.0.0-1.x86_64/home/taobao/datax/logs
File not found: /root/rpmbuild/BUILDROOT/t_dp_datax_engine-1.0.0-1.x86_64/home/taobao/datax/jobs
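开源也不好好修改下源码,搞到一头雾水=,=(原因:spec 中的 dataxpath 指向真实路径 /home/taobao/datax,%install 阶段没有把文件安装到 %{buildroot} 下,而 rpmbuild 打包时是在 %{buildroot} 中查找 %files 列出的文件,所以报 File not found。)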
修改t_dp_datax_engine.spec如下:
summary: engine provides core scheduler and data swap storage for DataX
Name: t_dp_datax_engine
Version: 1.0.0
Release: 1
Group: System
License: GPL
AutoReqProv: no
BuildArch: noarch
%define dataxpath /home/taobao/datax //改成%{buildroot}/home/taobao/datax
%define vdataxpath /home/taobao/datax //添加,其中vdataxpath下面要用
%description
DataX Engine provides core scheduler and data swap storage for DataX
%prep
cd ${OLDPWD}/../
export LANG=zh_CN.UTF-8
ant dist
%build
%install
dos2unix ${OLDPWD}/../release/datax.py
mkdir -p %{dataxpath}/bin
mkdir -p %{dataxpath}/conf
mkdir -p %{dataxpath}/engine
mkdir -p %{dataxpath}/common
mkdir -p %{dataxpath}/libs
mkdir -p %{dataxpath}/jobs
mkdir -p %{dataxpath}/logs
cp ${OLDPWD}/../jobs/sample/*.xml %{dataxpath}/jobs
cp ${OLDPWD}/../release/*.py %{dataxpath}/bin/
cp -r ${OLDPWD}/../conf/*.properties %{dataxpath}/conf
cp -r ${OLDPWD}/../conf/*.xml %{dataxpath}/conf
cp -r ${OLDPWD}/../build/engine/*.jar %{dataxpath}/engine
cp -r ${OLDPWD}/../build/common/*.jar %{dataxpath}/common
cp ${OLDPWD}/../c++/build/libcommon.so %{dataxpath}/common
cp -r ${OLDPWD}/../libs/commons-io-2.0.1.jar %{dataxpath}/libs
cp -r ${OLDPWD}/../libs/commons-lang-2.4.jar %{dataxpath}/libs
cp -r ${OLDPWD}/../libs/dom4j-2.0.0-ALPHA-2.jar %{dataxpath}/libs
cp -r ${OLDPWD}/../libs/jaxen-1.1-beta-6.jar %{dataxpath}/libs
cp -r ${OLDPWD}/../libs/junit-4.4.jar %{dataxpath}/libs
cp -r ${OLDPWD}/../libs/log4j-1.2.16.jar %{dataxpath}/libs
cp -r ${OLDPWD}/../libs/slf4j-api-1.4.3.jar %{dataxpath}/libs
cp -r ${OLDPWD}/../libs/slf4j-log4j12-1.4.3.jar %{dataxpath}/libs
%post
chmod -R 0777 %{dataxpath}/jobs //改成chmod -R 0777 %{vdataxpath}/jobs
chmod -R 0777 %{dataxpath}/logs //改成chmod -R 0777 %{vdataxpath}/logs
%files
%defattr(0755,root,root)
%{dataxpath}/bin // 改成%{vdataxpath}/bin
%{dataxpath}/conf //改成%{vdataxpath}/conf
%{dataxpath}/engine //改成%{vdataxpath}/engine
%{dataxpath}/common //改成%{vdataxpath}/common
%{dataxpath}/libs //改成%{vdataxpath}/libs
%attr(0777,root,root) %dir %{dataxpath}/logs //改成%attr(0777,root,root) %{vdataxpath}/logs
%attr(0777,root,root) %dir %{dataxpath}/jobs //改成 %attr(0777,root,root) %{vdataxpath}/jobs
%changelog
* Fri Aug 20 2010 meining
- Version 1.0.0
再次编译
Processing files: t_dp_datax_engine-1.0.0-1.noarch
Checking for unpackaged file(s): /usr/lib/rpm/check-files /root/rpmbuild/BUILDROOT/t_dp_datax_engine-1.0.0-1.x86_64
Wrote: /root/rpmbuild/SRPMS/t_dp_datax_engine-1.0.0-1.src.rpm
Wrote: /root/rpmbuild/RPMS/noarch/t_dp_datax_engine-1.0.0-1.noarch.rpm
Executing(%clean): /bin/sh -e /var/tmp/rpm-tmp.y3UwSl
+ umask 022
+ cd /root/rpmbuild/BUILD
+ /bin/rm -rf /root/rpmbuild/BUILDROOT/t_dp_datax_engine-1.0.0-1.x86_64
+ exit 0
进入:/root/rpmbuild/RPMS/noarch
发布
rpm -ivh t_dp_datax_engine-1.0.0-1.noarch.rpm
至此安装完成!
测试
root@datanode158:~/rpmbuild/RPMS/noarch# python /home/taobao/datax/bin/datax.py -e true
Taobao DataX V1.0
Data Source List :
0 mysql
1 sqlserver
2 http
3 fake
4 stream
5 oracle
6 hdfs
7 hbase
Please choose [0-7]: 2
Data Destination List :
0 stream
1 mysql
2 hdfs
3 oracle
4 hbase
Please choose [0-4]: 0
Generate /home/taobao/datax/jobs/httpreader_to_streamwriter_1396012010274.xml successfully .
配置/home/taobao/datax/jobs/httpreader_to_streamwriter_1396012010274.xml
httpreader
streamwriter
自动生成的 xml 文件中,有“?” 标识的 value 值,表示此处用户必须配置,其他地方的默认值用户可以根据自己需要作修改
执行
DataX 的运行命令如下:
/home/taobao/datax/bin/datax.py job.xml
其中 /home/taobao/datax/bin/datax.py 是 DataX 命令行的 python 封装,该执行脚本是整个 DataX 的程序入口,job.xml 是该 job 的配置文件。
按上述配置,该程序会将百度首页下载下来:
................百度页面html、js代码
2014-03-28 21:13:02,204 [main] INFO schedule.Engine - DataX Reader post work begins .
2014-03-28 21:13:02,204 [main] INFO schedule.Engine - DataX Reader post work ends .
2014-03-28 21:13:02,204 [main] INFO schedule.Engine - DataX Writers post work begins .
2014-03-28 21:13:02,205 [main] INFO schedule.Engine - DataX Writers post work ends .
2014-03-28 21:13:02,205 [main] INFO schedule.Engine - DataX job succeed .
2014-03-28 21:13:02,210 [main] INFO schedule.Engine -
DataX starts work at : 2014-03-28 21:13:00
DataX ends work at : 2014-03-28 21:13:02
Total time costs : 2s
Average byte speed : 26KB/s
Average line speed : 1L/s
Total transferred records : 1
Total discarded records : 0