# Launch a single-map, single-reduce Hadoop Streaming job named "Experiment".
# The mapper expands each input line with `xargs cat`; the reducer is a
# pass-through (`cat`), so the job simply concatenates the referenced files.
$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/hadoop-streaming.jar \
  -D mapred.map.tasks=1 \
  -D mapred.reduce.tasks=1 \
  -D mapred.job.name="Experiment" \
  -input "input.txt" \
  -output "out" \
  -mapper "xargs cat" \
  -reducer "cat"
// Hadoop Streaming mapper.
// Reads whitespace-separated "<docid> <dup>" pairs from stdin and emits
// "<dup>\t<docid>" so the shuffle phase groups records by dup signature.
#include <iostream>
#include <string>

using namespace std;

int main(int argc, char *argv[]) {
    // Decouple from C stdio and untie cin/cout: a large speedup for bulk
    // streaming I/O. Safe here because we never mix in C stdio calls.
    ios_base::sync_with_stdio(false);
    cin.tie(NULL);

    string docid, dup;
    while (cin >> docid >> dup) {
        // '\n' instead of endl: endl forces a flush on every record,
        // which is needlessly slow in a streaming job.
        cout << dup << "\t" << docid << "\n";
    }
    return 0;
}
// Hadoop Streaming reducer.
// Input is "<dup-key> <docid>" pairs, sorted by key. For every distinct key
// it emits one "<docid>\tA" line (the last docid seen for that key), i.e. it
// keeps a single representative per dup signature. A key of "*" is emitted
// immediately without buffering.
#include <iostream>
#include <stdint.h>
#include <string>   // was missing in the original; `string` is used below

using namespace std;

int main(int argc, char *argv[]) {
    string key;
    uint64_t value;
    string last_key = "";     // key currently being buffered ("" = none yet)
    uint64_t last_value = 0;  // last docid seen for last_key

    while (cin >> key >> value) {
        if (key == "*") {
            // Special marker: pass through immediately, leave buffer alone.
            cout << value << "\t" << "A" << endl;
        } else if (last_key != "" && last_key != key) {
            // Key changed: flush the record buffered for the previous key,
            // then start buffering the new key.
            cout << last_value << "\t" << "A" << endl;
            last_key = key;
            last_value = value;
        } else {
            last_key = key;
            last_value = value;
        }
    }
    if (last_key != "") {
        // BUG FIX: the original printed `value` here, but after the final
        // failed extraction `value` is not guaranteed to be the record
        // buffered for `last_key`; `last_value` is.
        cout << last_value << "\t" << "A" << endl;
    }
    return 0;
}
# Assemble and launch the Hadoop Streaming dedup job, shipping the locally
# built uniq_map / uniq_reduce binaries to the cluster with -file.
# Usage: <script> <input-path> <output-path>
cmd="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/hadoop-streaming.jar"
cmd=$cmd" -jt host:port"
cmd=$cmd" -fs host:port"
cmd=$cmd" -input $1"
cmd=$cmd" -output $2"
cmd=$cmd" -file uniq_map"
cmd=$cmd" -file uniq_reduce"
cmd=$cmd" -mapper uniq_map"
cmd=$cmd" -reducer uniq_reduce"
# BUG FIX: the property name was misspelled "spaculative" in the original,
# so speculative execution was never actually disabled.
cmd=$cmd" -jobconf mapred.map.tasks.speculative.execution=false"
cmd=$cmd" -jobconf mapred.task.timeout=60000000"
cmd=$cmd" -jobconf mapred.job.tracker=local"
echo $cmd
$cmd >> ./stat.log
# (Original author's note, translated: doing this in a scripting language
# is actually easier than in C++.)
# Locate the core dump left by the crashed streaming task, save a copy,
# and open it in gdb driven by a canned command script.
core=`find . -name 'core*'`;
cp $core /home/admin/
gdb -quiet ./a.out -c $core -x ./pipes-default-gdb-commands.txt
# pipes-default-gdb-commands.txt lists the gdb commands to execute:
#   info threads
#   backtrace
#   quit
// Raise the core-file size limit so a crashing task can actually write a
// core dump for post-mortem debugging with gdb (many clusters default the
// core limit to 0, which suppresses dumps entirely).
// NOTE(review): 65535 is a very small cap for a core file; this looks like
// it was meant as "nonzero/large enough" — confirm whether RLIM_INFINITY
// was intended instead.
struct rlimit limit;
limit.rlim_cur = 65535; // soft limit
limit.rlim_max = 65535; // hard limit (lowering it is irreversible without privilege)
if (setrlimit(RLIMIT_CORE, &limit) != 0) {
    // setrlimit returns non-zero on failure; report the reason and bail out.
    printf("setrlimit() failed with errno=%s\n", strerror(errno));
    exit(1);
}
然后在jobconf中,把要执行的script赋给变量"mapred.map.task.debug.script"或"mapred.reduce.task.debug.script"。这样当Hadoop Streaming执行过程发生core dump时,就可以通过JobTracker的GUI界面看到GDB的信息了。
在Hadoop上调试HadoopStreaming程序的方法详解 by 道凡