Hadoop 编程

FileSystem


import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URI;


import org.apache.commons.compress.utils.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Before;
import org.junit.Test;


public class HdfsUtil {
FileSystem fs=null;
@Before
public void conf() throws Exception{
Configuration conf=new Configuration();
conf.set("fs.defaultFS", "hdfs://wuke01:9000/");
fs=FileSystem.get(new URI("hdfs://wuke01:9000/"), conf, "hadoop");
}
/**
* 原始下载文件
* @throws Exception
*/
@Test
public void download() throws Exception{
FSDataInputStream in=fs.open(new Path("hdfs://wukecomputer:9000/jdk-7u65-linux-i586.tar.gz"));
   FileOutputStream out=new FileOutputStream("D:/jdk.tar.gz");
   IOUtils.copy(in, out);
}
/**
* 封装下载文件
* @throws Exception
*/
@Test
public void download2() throws Exception{
fs.copyToLocalFile(new Path("hdfs://wukecomputer:9000/jdk-7u65-linux-i586.tar.gz"), new Path("D:/jdk2.txt"));
}
/**
* 上传文件
* @throws Exception
* @throws IOException
*/
@Test
public void upload() throws Exception, IOException{
File file=new File("D:/aa.txt");
FileOutputStream o = new FileOutputStream(file);  
     o.write("wuke\001dukun\001pang\001ding".getBytes("utf-8"));  
     o.close();  
FileInputStream in=new FileInputStream("D:/aa.txt");
FSDataOutputStream out=fs.create(new Path("hdfs://wuke01:9000/spark/up.txt"));
  IOUtils.copy(in, out);
}
/**
* 精简上传文件
*/
@Test
public void upload2() throws Exception{
fs.copyFromLocalFile(new Path("C:/IFRToolLog.txt"), new Path("hdfs://wukecomputer:9000/aa/up2.txt"));
}


}


MapReduce


Mapper:


import java.io.IOException;


import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;


// Word-count mapper.
// By default the framework passes the mapper one line of the input text at a
// time: the key is the offset of that line, the value is the line's content.
// Type arguments are <input key, input value, output key, output value>; they
// are required so that map(...) below actually overrides Mapper.map.
public class WCMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    /**
     * Called by the MapReduce framework once for every line it reads.
     * Splits the line on spaces and emits {@code (word, 1)} for each word.
     *
     * @param key     offset of the line in the input
     * @param value   content of the line
     * @param context sink for the emitted key/value pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        // Split the line into words.
        String[] words = StringUtils.split(line, " ");

        for (String word : words) {
            context.write(new Text(word), new LongWritable(1));
        }
    }
}


Reducer:


import java.io.IOException;


import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;


/**
 * Word-count reducer: sums the counts emitted for each word.
 *
 * <p>Type arguments are {@code <input key, input value, output key, output value>};
 * they are required so that {@code reduce} overrides {@code Reducer.reduce} and
 * the values can be iterated as {@link LongWritable}.
 */
public class WCReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    /**
     * After the map phase the framework buffers all key/value pairs and then
     * calls this method once per group of values sharing a key.
     *
     * @param text    the word (the group's key)
     * @param values  the counts emitted for that word
     * @param context sink for the emitted result
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text text, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long count = 0;
        // Walk the values and accumulate their sum.
        for (LongWritable value : values) {
            count += value.get();
        }
        // Emit the total for this word.
        context.write(text, new LongWritable(count));
    }
}


Runner:


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;




/**
 * Describes one specific job: which class acts as the mapper and which as the
 * reducer in the processing logic, and where the input data is read from and
 * the output written to.
 *
 * @author ke
 */
public class WCRunner {
    /**
     * Configures the word-count job, submits it, waits for it to finish and
     * terminates the JVM with status 0 on success or 1 on failure.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the job cannot be created or submitted, or if
     *                   waiting for it is interrupted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job wcJob = Job.getInstance(conf);

        // Tell the framework which jar holds the mapper and reducer classes.
        wcJob.setJarByClass(WCRunner.class);

        // Mapper and reducer implementations.
        wcJob.setMapperClass(WCMapper.class);
        wcJob.setReducerClass(WCReducer.class);

        // Reducer output types. (These settings cover both the reducer and the
        // mapper; when the mapper's output types match the reducer's, these two
        // lines alone are sufficient.)
        wcJob.setOutputKeyClass(Text.class);
        wcJob.setOutputValueClass(LongWritable.class);

        // Mapper output types.
        wcJob.setMapOutputKeyClass(Text.class);
        wcJob.setMapOutputValueClass(LongWritable.class);

        // Input and output data paths.
        FileInputFormat.setInputPaths(wcJob, new Path("/wc/input/"));
        FileOutputFormat.setOutputPath(wcJob, new Path("/wc/output2/"));

        // Submit the job to the cluster; `true` prints progress while waiting.
        boolean succeeded = wcJob.waitForCompletion(true);

        // Surface the job result as the process exit status so that callers
        // (shell scripts, schedulers) can detect a failed job.
        System.exit(succeeded ? 0 : 1);
    }
}

你可能感兴趣的:(hadoop)