root# apt-get install heartbeat cluster-glue cluster-agents pacemaker
2. 配置heartbeat,在master1和master2上进行如下修改:
root# vi /etc/ha.d/ha.cf修改如下:
# enable pacemaker, without stonith crm yes # log where ? logfacility local0 # warning of soon be dead warntime 10 # declare a host (the other node) dead after: deadtime 20 # dead time on boot (could take some time until net is up) initdead 120 # time between heartbeats keepalive 2 # the nodes node master1 node master2 # heartbeats, over dedicated replication interface! ucast eth0 master1 # ignored by master1 (owner of ip) ucast eth0 master2 # ignored by master2 (owner of ip) # ping the name server to assure we are online ping ns
3.创建一个认证密钥(authkeys )文件。在master1和master2上切换到root用户执行如下脚本:
root# ( echo -ne "auth 1\n1 sha1 "; \ dd if=/dev/urandom bs=512 count=1 | openssl md5 ) \ > /etc/ha.d/authkeys root# chmod 0600 /etc/ha.d/authkeys
root# vi /root/.bashrc export JAVA_HOME=/usr/local/jdk1.6 export HADOOP_HOME=/usr/local/hadoop/current export OCF_ROOT=/usr/lib/ocf执行如下命令使上面的改变生效:
root# source /root/.bashrc
2. 创建一个标准开发集群框架(Open Clustering Framework ,OCF)资源代理文件并命名为namenode。并添加如下内容到其中:
root# vi namenode #!/bin/sh : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/resource.d/heartbeat} . ${OCF_FUNCTIONS_DIR}/.ocf-shellfuncs usage() { echo "Usage: $0 {start|stop|status|monitor|meta-data|validate-all}" }
3. 添加一个meta_data()函数到上面的namenode文件中。meta_data()函数功能转储资源代理的元数据为标准输出。每个资源代理必须有一组XML元数据描述自己的目的和
root# vi namenode meta_data() {cat <<END <?xml version="1.0"?> <!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> <resource-agent name="namenode"> <version>0.1</version> <longdesc lang="en"> This is a resource agent for NameNode. It manages HDFS namenode daemon. </longdesc> <shortdesc lang="en">Manage namenode daemon.</shortdesc> <parameters></parameters> <actions> <action name="start" timeout="120" /> <action name="stop" timeout="120" /> <action name="status" depth="0" timeout="120" interval="120" /> <action name="monitor" depth="0" timeout="120" interval="120" /> <action name="meta-data" timeout="10" /> <action name="validate-all" timeout="5" /> </actions> </resource-agent> END }4.添加 namenode_start()函数。这个函数用于通过Pacemake来在服务器上开启namenode守护线程。在namenode_start()函数中,我们首先检查是否NameNode会在服务器上已经启动,如果它没有启动,我们调用的hadoop-daemon.sh从Hadoop的用户来启动它:
root# vi namenode namenode_start() { # if namenode is already started on this server, bail out early namenode_status if [ $? -eq 0 ]; then ocf_log info "namenode is already running on this server, skip" return $OCF_SUCCESS fi # start namenode on this server ocf_log info "Starting namenode daemon..." su - hadoop -c "${HADOOP_HOME}/bin/hadoop-daemon.sh start namenode" if [ $? -ne 0 ]; then ocf_log err "Can not start namenode daemon." return $OCF_ERR_GENERIC; fi sleep 1 return $OCF_SUCCESS }
5.添加namenode_stop()函数。此功能用于通过Pacemaker在服务器上停止NameNode的守护进程。在namenode_stop()函数中,我们 首先检查的NameNode是否已经停止在服务器上,如果它正在运行,我们从Hadoop的用户调用的hadoop-daemon.sh停止它:
root# vi namenode namenode_stop () { # if namenode is not started on this server, bail out early namenode_status if [ $? -ne 0 ]; then ocf_log info "namenode is not running on this server, skip" return $OCF_SUCCESS fi # stop namenode on this server ocf_log info "Stopping namenode daemon..." su - hadoop -c "${HADOOP_HOME}/bin/hadoop-daemon.sh stop namenode" if [ $? -ne 0 ]; then ocf_log err "Can not stop namenode daemon." return $OCF_ERR_GENERIC; fi sleep 1 return $OCF_SUCCESS }
6.添加namenode_status()函数。此功能用于通过Pacemaker 监测服务器上的NameNode的守护程序的状态。在namenode_status()
root# vi namenode namenode_status () { ocf_log info "monitor namenode" su - hadoop -c "${JAVA_HOME}/bin/jps" | egrep -q "NameNode" rc=$? # grep will return true if namenode is running on this machine if [ $rc -eq 0 ]; then ocf_log info "Namenode is running" return $OCF_SUCCESS else ocf_log info "Namenode is not running" return $OCF_NOT_RUNNING fi }
7.添加namenode_validateAll()函数用来确保在我们运行其他功能之前,我们设置的环境变量是否正确:
root# vi namenode namenode_validateAll () { if [ -z "$JAVA_HOME" ]; then ocf_log err "JAVA_HOME not set." exit $OCF_ERR_INSTALLED fi if [ -z "$HADOOP_HOME" ]; then ocf_log err "HADOOP_HOME not set." exit $OCF_ERR_INSTALLED fi # Any subject is OK return $OCF_SUCCESS }8. 添加下面的主程序。在这里,我们将简单地调用以前的函数
root# vi namenode # See how we were called. if [ $# -ne 1 ]; then usage exit $OCF_ERR_GENERIC fi namenode_validateAll case $1 in meta-data) meta_data exit $OCF_SUCCESS;; usage) usage exit $OCF_SUCCESS;; *);; esac case $1 in status|monitor) namenode_status;; start) namenode_start;; stop) namenode_stop;; validate-all);; *)usage exit $OCF_ERR_UNIMPLEMENTED;; esac exit $?9. 改变namenode文件的权限并在master1和master2上测试它:
root# chmod 0755 namenode root# ocf-tester -v -n namenode-test /full/path/of/namenode
10. 确定所有的测试在执行下一步之前通过,否则高可用性集群将可能出现未知的问题。
root# mkdir ${OCF_ROOT}/resource.d/hac root# cp namenode ${OCF_ROOT}/resource.d/hac root# chmod 0755 ${OCF_ROOT}/resource.d/hac/namenode
我们准备用Heartbeat和Pacemaker来配置高度可用的NameNode。我们将设立一个VIP地址并且在配置Hadoop和HBase时使用这个VIP地址作为他们的 主节点。NameNode会在VIP被分配的时候在活动主节点上启动。如果活动 主已经崩溃,Heartbeat和Pacemaker会检测到它,并分配VIP地址的 备用主节点,然后启动NameNode。
1. 在master1和master2尚开启Heartbeat :root# /etc/init.d/heartbeat start2. 改变默认的crm配置。所有资源相关的命令仅仅在master1和master2上执行一次:
root# crm configure property stonith-enabled=false root# crm configure property default-resource-stickiness=1
3. 使用我们的VIP地址增加一个VIP资源:
root# crm configure primitive VIP ocf:heartbeat:IPaddr params ip="10.174.14.10" op monitor interval="10s"4. 确定在你做出改变之后将下面的hadoop配置同步到所有的主节点、客户端和从节点上:
hadoop$ vi $HADOOP_HOME/conf/core-site.xml <property> <name>fs.default.name</name> <value>hdfs://master:8020</value> </property>5. 确定在你做出改变之后将下面的HBase 配置同步到所有的主节点、客户端和从节点上:
hadoop$ vi $HBASE_HOME/conf/hbase-site.xml <property> <name>hbase.rootdir</name> <value>hdfs://master:8020/hbase</value> </property>6. 为了配置Hadoop以便写元数据到本地磁盘和NFS中,做出如下改变并将 配置同步到所有的主节点、客户端和从节点上:
hadoop$ vi $HADOOP_HOME/conf/hdfs-site.xml <property> <name>dfs.name.dir</name> <value>/usr/local/hadoop/var/dfs/name,/mnt/nfs/hadoop/dfs/name</value> </property>
7. 添加我们在第5步中为Pacemaker创建的namenode资源代理,我们将使用NAMENODE作为它的资源名:
root# crm configure primitive NAMENODE ocf:hac:namenode op monitor interval="120s" timeout="120s" op start timeout="120s" op stop timeout="120s" meta resource-stickiness="1"8.配置VIP资源和NAMENODE资源作为一个资源组:
root# crm configure group VIP-AND-NAMENODE VIP NAMENODE9.配置VIP资源和NAMENODE资源 的托管:
root# crm configure colocation VIP-WITH-NAMENODE inf: VIP NAMENODE10. 配置VIP资源和NameNode的资源的资源顺序:
root# crm configure order IP-BEFORE-NAMENODE inf: VIP NAMENODE11. 通过使用crm_mon命令验证以前的Heartbeat和资源配置的。如果一切配置正确,你应该会看到
root@master1 hac$ crm_mon -1r ============ Last updated: Tue Nov 22 22:39:11 2011 Stack: Heartbeat Current DC: master2 (7fd92a93-e071-4fcb-993f-9a84e6c7846f) - partition with quorum Version: 1.0.9-74392a28b7f31d7ddc86689598bd23114f58978b 2 Nodes configured, 1 expected votes 1 Resources configured. ============ Online: [ master1 master2 ] Full list of resources: Resource Group: VIP-AND-NAMENODE VIP (ocf::heartbeat:IPaddr): Started master1 NAMENODE (ocf::hac:namenode): Started master112.确保VIP和NameNode的资源在同一台服务器上开启。
hadoop@master$ for i in 1 2 3 do ssh slave$i "$HADOOP_HOME/bin/hadoop-daemon.sh start datanode" sleep 1 done2. 开启你的HBase集群的master,就是那些与VIP地址相关联的活动主服务器:
hadoop@master$ $HBASE_HOME/bin/start-hbase.sh3.开启master2上的从HMaster:
hadoop@master2$ $HBASE_HOME/bin/hbase-daemon.sh start master转载请注明出处: http://blog.csdn.net/iAm333