Hadoop Streaming: c++编写uniq程序

1. 描述
使用Hadoop streaming对输入数据的第一列进行uniq去重计算,采用c++编写map-reduce程序。

2. mapper程序
mapper.cpp

#include <cstdio>
#include <cstring>
#include <iostream>

using namespace std;

#define BUFFER_SIZE 102400
#define DELIM "\t"

/*
 * Mapper: read tab-separated lines from stdin and emit only the
 * first field of each line, one per output line.
 */
int main(int argc, char** argv)
{
        char buf[BUFFER_SIZE];
        char *strCur = NULL;

        // fgets reads at most n-1 chars, so pass the full buffer size.
        while(fgets(buf, BUFFER_SIZE, stdin))
        {
                // Strip the trailing newline kept by fgets, otherwise a
                // line with no tab would emit an extra blank line.
                buf[strcspn(buf, "\n")] = '\0';

                // Extract the first field; strtok returns NULL for an
                // empty line, which we skip.
                strCur = strtok(buf, DELIM);
                if (strCur != NULL)
                        cout << strCur << endl;
        }

        return 0;
}

3. reducer
reducer.cpp
#include <iostream>
#include <string>
#include <set>

using namespace std;

/*
 * Reducer: read whitespace-separated keys from stdin, deduplicate
 * them, and print the count of distinct keys.
 */
int main(int argc, char** argv)
{
        string key;

        // std::set stores each key once; its size is the uniq count.
        // (The element type <string> was missing and the original
        // snippet did not compile.)
        set<string> myset;

        while(cin >> key)
        {
                myset.insert(key);
        }

        cout << myset.size() << endl;

        return 0;
}

4. 测试数据
test.txt

a       15      1
a       15      1
a       15      1
a       15      1
b       20      1
c       15      1
c       15      1
d       16      1
a       16      1

5. 本地测试
$ g++ -o mapper mapper.cpp 
$ g++ -o reducer reducer.cpp 
$ cat test.txt | ./mapper | ./reducer 
4

6. Hadoop streaming 测试

$ hadoop fs -put test.txt /user/test.txt
$ hadoop streaming \
    -input /user/test.txt \
    -output /user/tmp_1324 \
    -mapper ./mapper -reducer ./reducer \
    -file mapper -file reducer \
    -jobconf mapred.reduce.tasks=1 \
    -jobconf mapred.job.name="uniq_test"

7. 查看结果

$  hadoop fs -cat /user/tmp_1324/part-00000
4

你可能感兴趣的:(hadoop,Hadoop,Streaming实战,hadoop,c++,buffer,测试,string,null)