1. Default MapReduce
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DefaultMapReduce extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf);
        job.setJarByClass(getClass());
        job.setNumReduceTasks(0);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new DefaultMapReduce(), args);
    }
}
Note:
No map method is defined here, yet the program still runs. This is because Hadoop's new API replaced the mapred classes (interface-based) with the mapreduce classes (abstract-class-based), and the new Mapper abstract class itself already provides a map method that can run as-is. Part of its source is shown below:
public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
        while (context.nextKeyValue()) {
            map(context.getCurrentKey(), context.getCurrentValue(), context);
        }
    } finally {
        cleanup(context);
    }
}
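For reference, the inherited map() called by run() is a simple identity function, which is why the pass-through job above works without defining one. A paraphrased sketch of that inherited method (the actual Hadoop source may differ slightly in formatting):

// Identity map(): write each input record to the output unchanged.
protected void map(KEYIN key, VALUEIN value, Context context)
        throws IOException, InterruptedException {
    context.write((KEYOUT) key, (VALUEOUT) value);
}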
2. SequenceFileWritDemo
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IOUtils;

public class SequenceFileWritDemo {
    private static final String[] DATA = {
        "one, two, buckle my shoe",
        "Three, four, shut the door"
    };

    public static void main(String[] args) throws IOException {
        String uri = args[0];
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path path = new Path(uri);
        IntWritable key = new IntWritable();
        Text value = new Text();
        SequenceFile.Writer writer = null;
        try {
            // Use the static Writer.Option factory methods (new API);
            // they must not be called on the still-null writer reference.
            writer = SequenceFile.createWriter(conf,
                    SequenceFile.Writer.file(path),
                    SequenceFile.Writer.keyClass(key.getClass()),
                    SequenceFile.Writer.valueClass(value.getClass()));
            for (int i = 0; i < 100; i++) {
                key.set(100 - i);
                value.set(DATA[i % DATA.length]);
                System.out.printf("[%s]\t%s\t%s\n", writer.getLength(), key, value);
                writer.append(key, value);
            }
        } finally {
            IOUtils.closeStream(writer);
        }
    }
}
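For completeness, here is a minimal read-side sketch (not part of the original post) that mirrors the writer above using the Hadoop 2.x SequenceFile.Reader.Option API; the class name SequenceFileReadDemo is an illustrative choice:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public class SequenceFileReadDemo {
    public static void main(String[] args) throws Exception {
        String uri = args[0];
        Configuration conf = new Configuration();
        Path path = new Path(uri);
        SequenceFile.Reader reader = null;
        try {
            // Reader.Option-based constructor, the read-side counterpart of Writer.file(path)
            reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path));
            Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
            Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
            long position = reader.getPosition();
            while (reader.next(key, value)) {
                System.out.printf("[%s]\t%s\t%s\n", position, key, value);
                position = reader.getPosition(); // start of the next record
            }
        } finally {
            IOUtils.closeStream(reader);
        }
    }
}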
3. Write & read a MapFile
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.MapFile.Writer;
import org.apache.hadoop.io.MapFile.Reader;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MyMapFile {
    static private final String[] DATA = {
        "this is the first",
        "this is the second",
        "this is the third",
        "this is the forth"
    };

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        String uri = args[0];
        IntWritable key = new IntWritable();
        Text val = new Text();
        MapFile.Writer writer = new MapFile.Writer(conf, new Path(uri),
                Writer.keyClass(key.getClass()), Writer.valueClass(val.getClass()));
        for (int i = 0; i < 10; i++) {
            key.set(i + 1);
            val.set(DATA[i % DATA.length]);
            writer.append(key, val);
        }
        writer.close();
        MapFile.Reader reader = new MapFile.Reader(new Path(uri), conf);
        while (reader.next(key, val)) {
            System.out.println(key + "\t" + val);
        }
        reader.close();
    }
}
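The reader loop above scans the MapFile sequentially with next(); the point of a MapFile, though, is random access through its index. A small sketch of a keyed lookup (the class name MyMapFileLookup is illustrative, not from the original post):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;

public class MyMapFileLookup {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        MapFile.Reader reader = new MapFile.Reader(new Path(args[0]), conf);
        IntWritable key = new IntWritable(Integer.parseInt(args[1]));
        Text val = new Text();
        // get() uses the in-memory index to seek to the key; it returns null if the key is absent
        Text found = (Text) reader.get(key, val);
        System.out.println(key + "\t" + (found == null ? "<not found>" : val));
        reader.close();
    }
}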
4. Configuration Printer
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.conf.Configured;

public class ConfigurationPrinter extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Configuration.addDefaultResource(args[0]);
        Configuration.addDefaultResource(args[1]);
        Configuration conf = getConf();
        for (Entry<String, String> entry : conf) {
            System.out.printf("%s=%s\n", entry.getKey(), entry.getValue());
        }
        return 0;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new ConfigurationPrinter(), args);
    }
}
5. Computing the maximum temperature (with the brand-new 2.2.0 API)
Deprecated: all constructors of the Job class; the new API obtains a Job instance through the static method Job.getInstance(conf), as the code below does.
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class getMaxTemperature extends Configured implements Tool {

    // The Mapper and Reducer must be static nested classes so the framework can instantiate them.
    static class MaxTemperatureMapper
            extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        public void map(LongWritable key, Text val, Context context)
                throws IOException, InterruptedException {
            String line = val.toString();
            String year = line.substring(15, 19);
            int temperature;
            if (!hasPlus(line)) {
                temperature = Integer.parseInt(line.substring(87, 92));
            } else {
                // parseInt does not accept a leading '+'
                temperature = Integer.parseInt(line.substring(88, 92));
            }
            String qual = line.substring(92, 93);
            if (matched(qual)) {   // emit only readings with a good quality code
                context.write(new Text(year), new IntWritable(temperature));
            }
        }

        private boolean hasPlus(String line) {
            return line.charAt(87) == '+';
        }

        private boolean matched(String qual) {
            return qual.matches("[01459]");
        }
    }

    static class MaxTemperatureReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        public void reduce(Text key, Iterable<IntWritable> vals, Context context)
                throws IOException, InterruptedException {
            int maxValue = Integer.MIN_VALUE;
            for (IntWritable value : vals) {
                maxValue = Math.max(maxValue, value.get());
            }
            context.write(key, new IntWritable(maxValue));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf);
        job.setJobName("helloRuby");
        job.setJarByClass(getClass());
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(MaxTemperatureMapper.class);
        job.setCombinerClass(MaxTemperatureReducer.class);
        job.setReducerClass(MaxTemperatureReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new getMaxTemperature(), args);
    }
}
6. File Read
import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class FileRead {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // InputStream is abstract and cannot be instantiated directly;
        // fs.open() returns a concrete FSDataInputStream.
        InputStream in = fs.open(new Path(args[0]));
        IOUtils.copyBytes(in, System.out, 4096, false);
        IOUtils.closeStream(in);
    }
}
Process analysis:
(1) open
When a client reads a file, it calls open() on a FileSystem object, where the FileSystem is an instance for HDFS.
In the program above, `FileSystem fs = FileSystem.get(conf)` obtains the FileSystem instance, and `fs.open(new Path(args[0]))` calls open() on it.
(2) get block locations
HDFS then calls the namenode over RPC (Remote Procedure Call) to obtain the locations of the blocks for the first few blocks of the file. For each block the namenode returns the addresses of the datanodes holding a copy, sorted by network topology.
With the locations in hand, DFS returns an FSDataInputStream to the client for reading. The FSDataInputStream in turn wraps a DFSInputStream, which manages the datanode and namenode I/O and also stores the addresses of the datanodes for those first blocks.
(3) read
The client calls read() on the FSDataInputStream. The DFSInputStream connects to the closest datanode holding the first block and keeps calling read() to stream data from that datanode to the client. When the end of the block is reached, DFSInputStream closes the connection to that datanode, finds the datanode holding the next block, and repeats.
(4) DFSInputStream
DFSInputStream reads each block in order from the closest datanode holding it, re-establishing a connection for each new block. It also keeps in touch with the namenode to retrieve the datanode locations for the next batch of blocks.
(5) FSDataInputStream
FSDataInputStream is the intermediary between the client and the datanodes; the client's read() calls go through FSDataInputStream and are delegated to DFSInputStream.
(6) Fault tolerance
Two main kinds of errors can occur while reading:
1. Communication between DFSInputStream and a datanode fails. DFSInputStream then tries the next closest datanode holding that block, and it remembers the failed datanode so it is not retried for later blocks.
2. DFSInputStream checksums the data received from a datanode and detects a corrupted block. It reports the corrupt block to the namenode before attempting to read a replica of that block from another datanode.
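As an illustrative aside (not part of the original analysis), the FSDataInputStream returned by open() also implements Seekable, so the client can seek back and re-read data, which a plain java.io.InputStream cannot do:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class FileSeekRead {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        FSDataInputStream in = null;
        try {
            in = fs.open(new Path(args[0]));
            IOUtils.copyBytes(in, System.out, 4096, false); // first pass over the file
            in.seek(0);                                      // go back to the beginning
            IOUtils.copyBytes(in, System.out, 4096, false); // read it again
        } finally {
            IOUtils.closeStream(in);
        }
    }
}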
7. File Write
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;

public class CreateDir {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String dst = args[0];
        FileSystem fs = FileSystem.get(conf);
        fs.create(new Path(dst));
    }
}
Process analysis:
(1) create
When a client writes a file, it calls create() on a FileSystem object, where the FileSystem is an instance for HDFS.
(2) create the new file in the filesystem's namespace
DFS calls the namenode via a Remote Procedure Call, and the namenode creates a new file in the filesystem namespace. Before creating it, the namenode runs a series of checks, including whether the client has permission to create the file and whether the file already exists; if a check fails, an IOException is thrown.
Once the file is created, DFS returns an FSDataOutputStream to the client for writing. The FSDataOutputStream wraps a DFSOutputStream, which talks to the namenode and the datanodes.
(3) the client writes data
As the client writes data into the file, DFSOutputStream splits the data into packets and appends them to an internal queue, the data queue, which DFSOutputStream maintains.
The data queue is consumed by the DataStreamer, whose job is to ask the namenode to allocate suitable datanodes for the block replicas; the namenode picks the datanodes that will store the data blocks.
The datanodes storing a block form a pipeline; with a replication level of 3, the pipeline consists of 3 datanodes.
The DataStreamer streams each packet to the first datanode in the pipeline, which stores it and forwards it to the second datanode; the second datanode likewise stores it and forwards it to the third.
(4) fault tolerance
From (3) we know that DFSOutputStream maintains a data queue. It also maintains an ack queue (of acknowledged packets): a packet is removed from the ack queue only once it has been acknowledged by every datanode in the pipeline.
If a datanode fails while data is being written, the following happens:
First, the pipeline is closed and all packets in the ack queue are moved back to the front of the data queue, so that datanodes downstream of the failed node do not miss any packets. The current block on the good datanodes is given a new identity, and the partial block on the failed datanode is deleted when that datanode recovers later.
The failed datanode is removed from the pipeline; the namenode notices this and allocates another datanode to form a new pipeline. Subsequent blocks are not affected.
If several datanodes fail while a block is being written, the write still succeeds as long as dfs.replication.min replicas (default 1) are written; the block is then replicated across the cluster asynchronously until it reaches the dfs.replication target (default 3).
(5) finishing the write
When the client finishes writing, it calls close() on the FSDataOutputStream. This flushes all remaining packets to the datanode pipeline and waits for the acknowledgements before signalling the namenode that the file is complete.
Because the DataStreamer has already asked the namenode for block locations for all the packets, the namenode already knows which blocks make up the file.
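A small illustrative sketch of the write path just described (the class name WriteAndFlush is hypothetical, not from the original post): write a few bytes, push the buffered packets to the pipeline with hflush(), then let close() wait for the final acknowledgements.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class WriteAndFlush {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        FSDataOutputStream out = fs.create(new Path(args[0]));
        out.write("hello hdfs\n".getBytes("UTF-8"));
        out.hflush(); // flush buffered packets to the datanode pipeline, making them visible to readers
        out.close();  // flush any remaining packets and wait for the pipeline acknowledgements
    }
}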
8. Using MRUnit
Commonly used classes
implements java.lang.annotation.Annotation:
org.junit.Test
org.junit.Rule
org.junit.Ignore
org.junit.ClassRule
org.junit.BeforeClass
org.junit.Before
org.junit.AfterClass
org.junit.After
Code
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.junit.Test;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;

public class MaxTemperatureMapperTest {
    @Test
    public void processesValidRecord() throws IOException {
        Text value = new Text(
            "0043011990999991950051518004+68750+023550FM-12+0382"
            // Year ^^^^
            + "99999V0203201N00261220001CN9999999N9-00111+99999999999");
            // Temperature ^^^^^
        new MapDriver<LongWritable, Text, Text, IntWritable>()
            .withMapper(new MaxTemperatureMapper())
            .withInput(new LongWritable(1), value)
            .withOutput(new Text("1950"), new IntWritable(-11))
            .runTest();
    }
}
Note some deprecated classes and methods:
It is understandable that org.apache.hadoop.mrunit.MapDriver<K1,V1,K2,V2> is deprecated: that class targets the old MapReduce API (org.apache.hadoop.mapred). For example, one of its methods is
MapDriver<K1,V1,K2,V2> withMapper(org.apache.hadoop.mapred.Mapper<K1,V1,K2,V2> m)
The new MapReduce API lives in org.apache.hadoop.mapreduce.*; the corresponding MRUnit MapDriver (and ReduceDriver) is org.apache.hadoop.mrunit.mapreduce.MapDriver<K1,V1,K2,V2>, whose methods take the new-API types instead, e.g. withMapper(org.apache.hadoop.mapreduce.Mapper<K1,V1,K2,V2> m) and withCounters(org.apache.hadoop.mapreduce.Counters ctrs).
T withInputValue(V1 val) in the MapDriverBase class is also deprecated in favour of T withInput(K1 key, V1 val); there are many more, not listed here.
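By analogy, a reducer can be tested with the new-API org.apache.hadoop.mrunit.mapreduce.ReduceDriver. A minimal sketch (not from the original post), assuming the MaxTemperatureReducer from section 5 is available as a top-level class:

import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Test;

public class MaxTemperatureReducerTest {
    @Test
    public void returnsMaximumIntegerInValues() throws IOException {
        new ReduceDriver<Text, IntWritable, Text, IntWritable>()
            .withReducer(new MaxTemperatureReducer())
            // the reducer receives a key and the list of its values
            .withInput(new Text("1950"),
                       Arrays.asList(new IntWritable(10), new IntWritable(5)))
            .withOutput(new Text("1950"), new IntWritable(10))
            .runTest();
    }
}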
Execution steps:
Note: you need to download and build MRUnit, set an MRUnit_HOME variable in /home/user/.bashrc, then edit $HADOOP_HOME/libexec/hadoop-config.sh to add $MRUnit_HOME/lib/*.jar to the classpath, source $HADOOP_HOME/libexec/hadoop-config.sh, and then run:
javac -d class/ MaxTemperatureMapper.java MaxTemperatureMapperTest.java
jar -cvf test.jar -C class ./
java -cp test.jar:$CLASSPATH org.junit.runner.JUnitCore MaxTemperatureMapperTest
9. Test Tool, ToolRunner, GenericOptionsParser, Configuration
Notes:
1. The run() method of Tool can be seen as the driver of a MapReduce program. We usually implement the Tool interface to set the job's launch properties, and then in main() call the static ToolRunner.run(new MainClass(), args), which indirectly invokes run().
2. It wraps the GenericOptionsParser class to parse the generic hadoop command line arguments (see the other post "hadoopFS-shell commands").
Code 1 (the resource added to the Configuration is a String):
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.fs.Path;

public class ConfigurationPrinter extends Configured implements Tool {
    static {
        Configuration.addDefaultResource("config.xml");
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        for (Entry<String, String> hash : conf) {
            System.out.printf("%s=%s\n", hash.getKey(), hash.getValue());
        }
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new ConfigurationPrinter(), args);
        System.exit(exitCode);
    }
}
Note: the Configuration class provides the static method addDefaultResource(String name). As in the code above, when the resource "config.xml" is added as a String, hadoop looks for that file on the classpath; when the resource is a Path, hadoop looks for it on the local filesystem: Configuration conf = new Configuration(); conf.addResource(new Path("config.xml"));
Execution steps:
# put the custom config file config.xml into hadoop's $HADOOP_CONF_DIR
% mv config.xml $HADOOP_HOME/etc/hadoop/
Suppose the resource we add looks like this:
<!--cat $HADOOP_HOME/etc/hadoop/config.xml-->
<configuration>
  <property>
    <name>color</name>
    <value>yellow</value>
  </property>
  <property>
    <name>size</name>
    <value>10</value>
  </property>
  <property>
    <name>weight</name>
    <value>heavy</value>
    <final>true</final>
  </property>
</configuration>
Run the code:
mkdir class
source $HADOOP_HOME/libexec/hadoop-config.sh
javac -d class ConfigurationPrinter.java
jar -cvf ConfigurationPrinter.jar -C class ./
export HADOOP_CLASSPATH=ConfigurationPrinter.jar:$CLASSPATH
# check whether the resource we just added is picked up;
# config.xml defines <name>color</name>, so run:
yarn ConfigurationPrinter | grep "color"
color=yellow
# so the code works as expected
Alternatively, specify the configuration on the command line; for example, the following also works:
% $YARN_HOME/bin/yarn ConfigurationPrinter --conf config.xml | grep color
color=yellow
Code 2 (the resource added to the Configuration is a Path):
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.fs.Path;

public class ConfigurationPrinter extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.addResource(new Path("config.xml"));
        for (Entry<String, String> hash : conf) {
            System.out.printf("%s=%s\n", hash.getKey(), hash.getValue());
        }
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new ConfigurationPrinter(), args);
        System.exit(exitCode);
    }
}
Since the resource added here is a Path, hadoop looks for config.xml on the local filesystem. There is no need to put config.xml under conf/; just point the code at its location on the local filesystem (e.g. new Path("../others/config.xml")).
Run steps:
% mkdir class
% source $HADOOP_HOME/libexec/hadoop-config.sh
% javac -d class ConfigurationPrinter.java
% jar -cvf ConfigurationPrinter.jar -C class ./
% export HADOOP_CLASSPATH=ConfigurationPrinter.jar:$CLASSPATH
# check whether the resource we just added is picked up;
# config.xml defines <name>color</name>, so run:
% yarn ConfigurationPrinter | grep "color"
color=yellow
# so the code works as expected
Note: GenericOptionsParser also supports setting individual properties. The supported generic options are:
-conf <configuration file>                      specify a configuration file
-D <property=value>                             use value for given property
-fs <local|namenode:port>                       specify a namenode
-jt <local|jobtracker:port>                     specify a job tracker
-files <comma separated list of files>          specify comma separated files to be copied to the map reduce cluster
-libjars <comma separated list of jars>         specify comma separated jar files to include in the classpath
-archives <comma separated list of archives>    specify comma separated archives to be unarchived on the compute machines
You can try:
% $YARN_HOME/bin/yarn ConfigurationPrinter -D myproperty=myvalue | grep myproperty
# output:
myproperty=myvalue
One more reminder:
ToolRunner can be used to run classes implementing the Tool interface. It works in conjunction with GenericOptionsParser to parse the generic hadoop command line arguments and modifies the Configuration of the Tool. The application-specific options are passed along without being modified.
In other words, ToolRunner and GenericOptionsParser together parse (and apply) the generic hadoop command line arguments. (What are generic hadoop command line arguments? For example: yarn command [genericOptions] [commandOptions].)
10. DFS API operations
Code 1. Reading data from a hadoop URL
Notes: for Java to read data from Hadoop's DFS, it must be able to recognise the hdfs URL scheme, so we hand HDFS's FsUrlStreamHandlerFactory to Java through URL.setURLStreamHandlerFactory.
Caveat: this approach has a drawback. setURLStreamHandlerFactory can be called at most once per JVM, so if a third-party component has already set a URLStreamHandlerFactory, a hadoop user cannot use this method to read data from hadoop.
import java.io.InputStream;
import java.net.URL;
import org.apache.hadoop.fs.FsUrlStreamHandlerFactory;
import org.apache.hadoop.io.IOUtils;

public class URLCat {
    static {
        URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory());
    }

    public static void main(String[] args) throws Exception {
        InputStream in = null;
        try {
            in = new URL(args[0]).openStream();
            IOUtils.copyBytes(in, System.out, 4096, false);
        } finally {
            IOUtils.closeStream(in);
        }
    }
}
Execution steps:
$ source $YARN_HOME/libexec/hadoop-config.sh
$ mkdir myclass
$ javac -cp $CLASSPATH URLCat.java -d myclass
$ jar -cvf urlcat.jar -C myclass ./
# assume we have a file bar.txt in hdfs: /user/grid/bar.txt
# then we need to run yarn with this command
$ yarn jar urlcat.jar URLCat hdfs:///user/grid/bar.txt
Code 2. Reading data using the HDFS API
Notes: using Hadoop's FileSystem API avoids the limitation described above, where setURLStreamHandlerFactory can only be called once per JVM.
import java.net.URI;
import java.io.InputStream;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;

public class URICat {
    public static void main(String[] args) throws Exception {
        String uri = args[0];
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        InputStream in = null;
        try {
            in = fs.open(new Path(uri));
            IOUtils.copyBytes(in, System.out, 4096, false);
        } finally {
            IOUtils.closeStream(in);
        }
    }
}
Execution steps:
$ source $YARN_HOME/libexec/hadoop-config.sh
$ mkdir myclass
$ javac -cp $CLASSPATH URICat.java -d myclass
$ jar -cvf uricat.jar -C myclass ./
$ yarn jar uricat.jar URICat /user/grid/bar.txt
Note 1: because we use the FileSystem API, the input file path may omit the full hdfs:// URI, as in the execution steps above.
Note 2: FileSystem is an abstract class, so you cannot obtain an instance with new FileSystem(); call its static get() method instead.
Note 3: note Java upcasting, which shows up in the inheritance relationships between the various Stream classes in the brief hints below.
Note 4: Configuration conf = new Configuration();
Configurations are driven by <name>/<value> pairs in XML resource files, with the following rules:
if a resource is named by a String, hadoop looks for a file of that name on the classpath;
if a resource is named by a Path, hadoop looks it up directly on the local filesystem, without checking the classpath;
if the user specifies nothing, the two default resources core-site.xml and core-default.xml are loaded;
users can add their own XML file for custom configurations: conf.addResource("my_configuration.xml");
Code 3. Writing data
Copy a file from the local filesystem to HDFS.
Version 1: FileCopy with the copyBytes() method
Brief hints:
1. The core is a single call that copies an InputStream to an OutputStream as raw bytes:
static void copyBytes(InputStream in, OutputStream out, int buffSize, boolean close)
2. We create a FileInputStream(localsrc) instance, wrap it in a BufferedInputStream, and upcast it to InputStream:
FileInputStream(String name)
3. Call FileSystem to produce the OutputStream:
FSDataOutputStream create(Path f, Progressable progress)
Code:
import java.net.URI;
import java.io.InputStream;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.OutputStream;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;

public class FileCopyWithProgress {
    public static void main(String[] args) throws Exception {
        String localsrc = args[0];
        String dst = args[1];
        InputStream in = new BufferedInputStream(new FileInputStream(localsrc));
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(dst), conf);
        OutputStream out = fs.create(new Path(dst),
                new Progressable() {
                    public void progress() { System.out.print("."); }
                });
        IOUtils.copyBytes(in, out, 4096, true);
    }
}
Execution steps:
% source $YARN_HOME/libexec/hadoop-config.sh
% javac -cp $CLASSPATH -d my_class FileCopyWithProgress.java
% jar -cvf filecopywithprogress.jar -C my_class/ .
# assume we have a local file foo.out in directory: /home/grid/foo.out,
# then we should run yarn like below
% yarn jar filecopywithprogress.jar FileCopyWithProgress \
    /home/grid/foo.out hdfs:///user/grid/copied_foo.out
# we can do a check for the copied file
% hadoop fs -ls -R /user/grid/
Note: from here on a different way of compiling and running the code is used.
Version 2: using FileSystem's copyFromLocalFile() method
The code is as follows:
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;

public class FileCopyFromLocal {
    public static void main(String[] args) throws Exception {
        String localSrc = args[0];
        String dst = args[1];
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        fs.copyFromLocalFile(new Path(localSrc), new Path(dst));
    }
}
Execution steps:
$ source $YARN_HOME/libexec/hadoop-config.sh
$ javac FileCopyFromLocal.java -d class/
$ jar -cvf filecopyfromlocal.jar -C class ./
$ export HADOOP_CLASSPATH=$CLASSPATH:filecopyfromlocal.jar
# suppose we have a file bar.txt in local disk,
# then we use the following command line to copy it to hdfs
$ yarn FileCopyFromLocal bar.txt hdfs:///user/grid/kissyou
# we can check the copied file on hdfs
$ hadoop fs -ls /user/grid/
-rw-r--r--   3 grid supergroup        899 2013-11-17 01:33 /user/grid/kissyou
Code 4. Creating directories and files
Create a directory: FileSystem.mkdirs()
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class CreateDir {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String dst = args[0];
        FileSystem fs = FileSystem.get(conf);
        fs.mkdirs(new Path(dst));
    }
}
Execution steps:
$ source $YARN_HOME/libexec/hadoop-config.sh
$ javac CreateDir.java -d class/
$ jar -cvf createdir.jar -C class ./
$ export HADOOP_CLASSPATH=$CLASSPATH:createdir.jar
$ yarn CreateDir hdfs:///user/grid/kissyou
# we can check the created directory on hdfs
$ hadoop fs -ls /user/grid/
drwxr-xr-x   - grid supergroup          0 2013-11-17 01:33 /user/grid/kissyou
Create a file: FileSystem.create()
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class CreateFile {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String dst = args[0];
        FileSystem fs = FileSystem.get(conf);
        fs.create(new Path(dst));
    }
}
Execution steps:
$ source $YARN_HOME/libexec/hadoop-config.sh
$ javac CreateFile.java -d class/
$ jar -cvf createfile.jar -C class ./
$ export HADOOP_CLASSPATH=$CLASSPATH:createfile.jar
$ yarn CreateFile hdfs:///user/grid/kissyou.txt
# we can check the created file on hdfs
$ hadoop fs -ls /user/grid/
-rw-r--r--   3 grid supergroup        899 2013-11-17 01:33 /user/grid/kissyou.txt
Three things to note:
1. You cannot create a file foo and a directory foo/ with the same name under the same path; doing so throws fs.FileAlreadyExistsException at runtime.
2. mkdirs() is called automatically during copy and write operations, so you rarely need to call it by hand to create directories.
3. The official API documentation describes mkdirs() as "Make the given file and all non-existent parents into directories", so directory creation in hadoop is recursive, equivalent to the Linux command:
% mkdir -p foo/bar/qzx
and likewise equivalent to the hdfs shell command:
% $YARN_HOME/bin/hadoop fs -mkdir -p hdfs:///foo/bar/qzx
Code 5. Testing a file and getting its FileStatus
Hint: some APIs are deprecated in hadoop 2.2; the code below uses only the new constructors and methods.
import java.net.URI;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;

public class TestFileStatus {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        FileStatus stat = fs.getFileStatus(new Path(args[0]));
        if (stat.isDirectory()) {
            System.out.println(stat.getPath().toUri().getPath() + " is a directory.");
        } else if (stat.isFile()) {
            System.out.println(stat.getPath().toUri().getPath() + " is a file.");
            System.out.println(stat.getPath().toUri().getPath() + " getBlockSize: " + stat.getBlockSize());
            System.out.println(stat.getPath().toUri().getPath() + " getLen(): " + stat.getLen());
            System.out.println(stat.getPath().toUri().getPath() + " getOwner(): " + stat.getOwner());
            System.out.println(stat.getPath().toUri().getPath() + " getGroup(): " + stat.getGroup());
            System.out.println(stat.getPath().toUri().getPath() + " getAccessTime(): " + stat.getAccessTime());
            System.out.println(stat.getPath().toUri().getPath() + " getModificationTime(): " + stat.getModificationTime());
            System.out.println(stat.getPath().toUri().getPath() + " getPermission(): " + stat.getPermission());
            System.out.println(stat.getPath().toUri().getPath() + " hashcode(): " + stat.hashCode());
            System.out.println(stat.getPath().toUri().getPath() + " getPath(): " + stat.getPath());
        }
    }
}
The small script below, which I wrote, makes it convenient to compile the code and build the jar:
#!/usr/bin/env sh
CWD=$(pwd)
export CLASSPATH=''
. $YARN_HOME/libexec/hadoop-config.sh
if [ -d class ]; then
    rm -rf class/*
else
    mkdir $CWD/class
fi
for f in $@
do
    srcs="$srcs $CWD/$f"
done
javac $srcs -d class
if [ $? -ne 0 ] ; then
    echo Error found when compiling the code!
    exit 1
fi
class=$(cat $1 | grep 'package' | sed -e "s/\(package\s\)\|\(;\)//g" \
    ).$(echo $1 | sed -r 's/(.*).java/echo \1/ge')
jarfile=$(echo $1 | sed -r 's/(.*)\.java/echo \L\1\.jar/ge')
jar -cvf $CWD/$jarfile -C $CWD/class . > /dev/null 2>&1
#echo jar -cvf $jarfile -C class .
echo -----------------CMD Lines-----------------------
echo source $YARN_HOME/libexec/hadoop-config.sh > sourceIt.sh
echo export HADOOP_CLASSPATH=$jarfile:'$CLASSPATH' >> sourceIt.sh
echo source $CWD/sourceIt.sh
echo yarn $class [command args]...
Execution steps:
Note: for simplicity, the script requires that the first argument is the source file containing the main class:
$ ./compack.sh args1 args2 args3...    # args1 contains the main class
$ chmod 500 compack.sh
$ ./compack.sh TestFileStatus.java
# then the script will remind you with the following message:
-----------------CMD Lines------------------
source sourceIt.sh
yarn TestFileStatus [command args]...
$ source sourceIt.sh
# suppose we have a file "part-m-00000" in hdfs, run yarn like below
$ yarn TestFileStatus /user/hive/warehouse/footbl/part-m-00000
Output:
# output
/user/hive/warehouse/footbl/part-m-00000 is a file.
/user/hive/warehouse/footbl/part-m-00000 getBlockSize: 134217728
/user/hive/warehouse/footbl/part-m-00000 getLen(): 1275
/user/hive/warehouse/footbl/part-m-00000 getOwner(): grid
/user/hive/warehouse/footbl/part-m-00000 getGroup(): supergroup
/user/hive/warehouse/footbl/part-m-00000 getAccessTime(): 1384675957784
/user/hive/warehouse/footbl/part-m-00000 getModificationTime(): 1384675958368
/user/hive/warehouse/footbl/part-m-00000 getPermission(): rw-r--r--
/user/hive/warehouse/footbl/part-m-00000 hashcode(): 1096001837
/user/hive/warehouse/footbl/part-m-00000 getPath(): hdfs://cluster1:9000/user/hive/warehouse/footbl/part-m-00000
Code6. Listing files & glob files
Listing files
import java.net.URI;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;

public class ListFiles {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path[] paths = new Path[args.length];
        for (int i = 0; i < args.length; i++) {
            paths[i] = new Path(args[i]);
        }
        FileStatus[] status = fs.listStatus(paths);
        Path[] pathList = FileUtil.stat2Paths(status);
        for (Path p : pathList) {
            System.out.println(p);
        }
    }
}
Execution steps:
% ./compack.sh ListFiles.java
% source sourceIt.sh
% yarn ListFiles /user/hive/warehouse/footbl /user/grid/
output:
hdfs://cluster1:9000/user/hive/warehouse/footbl/_SUCCESS
hdfs://cluster1:9000/user/hive/warehouse/footbl/part-m-00000
hdfs://cluster1:9000/user/grid/kiss
hdfs://cluster1:9000/user/grid/kissyou
hdfs://cluster1:9000/user/grid/missyou
Filter files
package org.apache.hadoop.MyCode;

import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.Path;

public class MyFilter implements PathFilter {
    private final String regex;

    public MyFilter(String regex) {
        this.regex = regex;
    }

    public boolean accept(Path path) {
        return path.toString().matches(regex);
    }
}
package org.apache.hadoop.MyCode;

import org.apache.hadoop.MyCode.MyFilter;
import java.net.URI;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.conf.Configuration;

public class ListStatusWithPattern {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        FileStatus[] status = fs.globStatus(new Path(args[0]), new MyFilter(args[1]));
        Path[] pathList = FileUtil.stat2Paths(status);
        for (Path p : pathList) {
            System.out.println(p);
        }
    }
}
Execution steps:
% source $YARN_HOME/libexec/hadoop-config.sh
% mkdir class
% javac ListStatusWithPattern.java MyFilter.java -d class
% jar -cvf liststatuswithpattern.jar -C class ./
% export HADOOP_CLASSPATH=liststatuswithpattern.jar:$CLASSPATH
# suppose we have four files in hdfs like below
% hadoop fs -ls /user/grid/
Found 4 items
drwxr-xr-x   - grid supergroup          0 2013-11-17 01:06 /user/grid/kiss
-rw-r--r--   3 grid supergroup          0 2013-11-17 06:05 /user/grid/kissyou
drwxr-xr-x   - grid supergroup          0 2013-11-17 19:33 /user/grid/miss
-rw-r--r--   3 grid supergroup        899 2013-11-17 01:33 /user/grid/missyou
# then we can run the command to filter the matched files
% yarn jar liststatuswithpattern.jar org.apache.hadoop.MyCode.ListStatusWithPattern "hdfs:///user/grid/*ss*" "^.*grid/[k].*$"
Or use the script given earlier to compile, package, and print the main yarn command line:
$ ./compack.sh ListStatusWithPattern.java MyFilter.java
# note: the script assumes the first source file passed in contains the main class
-----------------CMD Lines-----------------------
source /home/grid/hadoop-2.2.0-src/hadoop-dist/target/hadoop-2.2.0/task/DFSAPIProgramming/sourceIt.sh
yarn org.apache.hadoop.MyCode.MyFilter [command args]...
$ source /home/grid/hadoop-2.2.0-src/hadoop-dist/target/hadoop-2.2.0/task/DFSAPIProgramming/sourceIt.sh
output:
hdfs://cluster1:9000/user/grid/kiss
hdfs://cluster1:9000/user/grid/kissyou
11. Overriding the comparator
Key points:
Type comparison is very important in hadoop MapReduce; it is mainly used to compare keys.
Hadoop's RawComparator<T> interface extends Java's Comparator and is mainly used to compare serialized objects.
Hadoop's WritableComparator class is more complete: it provides two main kinds of comparison, one comparing objects directly and the other comparing serialized representations.
For example, comparing objects:
compare(new IntWritable(21), new IntWritable(998));
versus comparing serialized representations:
compare(serialize(new IntWritable(21)), serialize(new IntWritable(998)))
Hint: inheritance relationships
// 1. org.apache.hadoop.io
// Interface RawComparator<T>
public interface RawComparator<T> extends Comparator<T>
// method
int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2)

// 2. org.apache.hadoop.io
// Interface WritableComparable<T>
public interface WritableComparable<T> extends Writable, Comparable<T>
// methods inherited from interface org.apache.hadoop.io.Writable:
readFields, write

// 3. java.lang.Object
//      |__ org.apache.hadoop.io.WritableComparator
public class WritableComparator extends Object implements RawComparator
// methods
int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2)
int compare(Object a, Object b)
int compare(WritableComparable a, WritableComparable b)
static int compareBytes(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2)

// 4. java.util
// Interface Comparator<T>
public interface Comparator<T>
// methods
int compare(T o1, T o2)
boolean equals(Object obj)
Code
import java.io.DataOutputStream;
import java.io.ByteArrayOutputStream;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.RawComparator;

public class MyIntWritableComparactor {
    public static byte[] serialize(IntWritable writable) throws Exception {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DataOutputStream dataOut = new DataOutputStream(out);
        writable.write(dataOut);
        dataOut.close();
        return out.toByteArray();
    }

    @SuppressWarnings("unchecked")
    public static void main(String[] args) throws Exception {
        RawComparator<IntWritable> comparator = WritableComparator.get(IntWritable.class);
        IntWritable w1 = new IntWritable(13);
        IntWritable w2 = new IntWritable(12);
        System.out.println("w1: " + w1 + " w2: " + w2);
        System.out.println("w1 compare w2 : " + comparator.compare(w1, w2));
        byte[] b1 = serialize(w1);
        byte[] b2 = serialize(w2);
        System.out.println("b1.length: " + b1.length);
        System.out.println("b2.length: " + b2.length);
        System.out.println("b1.length compare b2.length: "
                + comparator.compare(b1, 0, b1.length, b2, 0, b2.length));
    }
}
Compile and run:
# note: I am using hadoop 2.2
% source $YARN_HOME/libexec/hadoop-config.sh
% mkdir myclass
% javac -d myclass MyIntWritableComparactor.java
% jar -cvf mycompare.jar -C myclass ./
% export HADOOP_CLASSPATH=$CLASSPATH:mycompare.jar
% yarn MyIntWritableComparactor
Output:
w1: 13 w2: 12
w1 compare w2 : 1
b1.length: 4
b2.length: 4
b1.length compare b2.length: 1
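The code above only retrieves the registered comparator; actually overriding a comparator usually means subclassing WritableComparator and comparing the serialized bytes directly. A minimal sketch (not from the original post) that sorts IntWritable keys in descending order; it could be plugged into a job with job.setSortComparatorClass(DescendingIntComparator.class):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableComparator;

public class DescendingIntComparator extends WritableComparator {
    public DescendingIntComparator() {
        super(IntWritable.class, true); // create key instances so object compare() also works
    }

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        // IntWritable serializes as a 4-byte big-endian int
        int a = readInt(b1, s1);
        int b = readInt(b2, s2);
        return (a == b) ? 0 : (a < b ? 1 : -1); // reversed natural order
    }
}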
12. Data compression
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.util.ReflectionUtils;

public class StreamCompressor {
    public static void main(String[] args) throws Exception {
        String codecClassname = args[0];
        Class<?> codecClass = Class.forName(codecClassname);
        Configuration conf = new Configuration();
        CompressionCodec codec =
                (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        CompressionOutputStream out = codec.createOutputStream(System.out);
        IOUtils.copyBytes(System.in, out, 4096, false);
        out.finish();
    }
}
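A decompression counterpart can infer the codec from the file extension using CompressionCodecFactory. The following is a minimal sketch (the class name FileDecompressor and the output-path convention are illustrative, not from the original post):

import java.io.InputStream;
import java.io.OutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class FileDecompressor {
    public static void main(String[] args) throws Exception {
        String uri = args[0];
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path inputPath = new Path(uri);
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(inputPath); // e.g. GzipCodec for *.gz
        if (codec == null) {
            System.err.println("No codec found for " + uri);
            System.exit(1);
        }
        // strip the codec's extension to build the output name, e.g. foo.gz -> foo
        String outputUri =
                CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
        InputStream in = null;
        OutputStream out = null;
        try {
            in = codec.createInputStream(fs.open(inputPath));
            out = fs.create(new Path(outputUri));
            IOUtils.copyBytes(in, out, 4096, false);
        } finally {
            IOUtils.closeStream(in);
            IOUtils.closeStream(out);
        }
    }
}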