eclipse远程调试mapreduce程序

编程调试环境:window eclipse
hadoop运行环境:linux(vmware)
hadoop版本:1.2.1

hadoop编程涉及两个方面,一是hdfs分布式文件存储程序、一是mapreduce分布式数据处理程序。
因为习惯在window下使用eclipse开发程序,但是hadoop的程序需要运行在hadoop环境中,所以希望能够在window下eclipse编程并能够直接在eclipse执行hadoop程序。

一、mapreduce程序
步骤一、搭建hadoop开发环境,导入hadoop依赖包
            hadoop加载的配置文件主要包括:core-default.xml/core-site.xml、hdfs-default.xml/hdfs-site.xml、mapred-default.xml/mapred-site.xml,其中*-site.xml文件为用户修改文件,默认情况下使用*-site.xml的配置信息覆盖default配置项,因此需要修改site文件。
步骤二、创建或修改配置文件并保存到source目录下,如src目录下
           在创建mapreduce程序时只需要修改mapred-site.xml文件即可。
          
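        <configuration>
            <property>
                <name>mapred.job.tracker</name>
                <value>8.8.8.7:9001</value>
                <description>jobtracker运行监听的地址端口,用于提交job任务</description>
            </property>
        </configuration>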
        
    

步骤三、修改hadoop运行程序的监听服务地址,如8.8.8.7:9001
         在hadoop运行环境中做上述相同的修改,启动hadoop后可以在linux上通过netstat -an|grep 9001命令查看监听端口。理论上监听0.0.0.0:9001应该也没有问题,但尚未测试。

步骤四、编写打包程序

       因为mapreduce需要运行在hadoop环境下,所以需要把mapreduce程序打成jar包并通过jobconf设置。这样在jobclient提交job任务时,会自动把jar文件上传到hadoop文件系统,由任务节点下载并解压缩,再通过反射的方式加载并执行其中的map和reduce类。

//本方式只实现了一级目录,没有递归打包子目录。

public static File createJar(Class clazz)
    {
        try
        {
            String pathName = clazz.getPackage().getName().replace(".", "/");
            final String fileName = clazz.getName().substring(clazz.getName().lastIndexOf(".")+1);
            final File file = File.createTempFile("MapreduceJar-"+fileName, ".jar",new File("."));
            Runtime.getRuntime().addShutdownHook(new Thread(){
                @Override
                public void run() {
                    file.delete();
                }
                
            });
            Manifest manifest = new Manifest();
            manifest.getMainAttributes().putValue("Manifest-Version", "1.0");
            manifest.getMainAttributes().putValue("Created-By", "ejtoo");
            JarOutputStream out = new JarOutputStream(new FileOutputStream(file), manifest);
            
            File dir = new File(clazz.getResource("").getFile());
            File[] names = dir.listFiles();
            for(File name : names)
            {
                JarEntry entry = new JarEntry(pathName+"/"+name.getName());
                out.putNextEntry(entry);
                FileInputStream in = new FileInputStream(name);
                byte[] buffer = new byte[1024];
                int n = in.read(buffer);
                while (n != -1) {
                    out.write(buffer, 0, n);
                    n = in.read(buffer);
                }
                in.close();
            }
            out.flush();
            out.close();
            
            return file;
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
            return null;
        }
    }

步骤五、编写mapreduce程序,在执行job前需要首先执行打包方法,另外需要实现自己的map和reduce类

package com.ejtoo;

import java.io.File;
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class Maptest extends Configured implements Tool {
    public static class Map extends Mapper {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            System.out.println("line===>" + line);
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class Reduce extends Reducer {
        public void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        System.out.println(conf.get("mapred.job.tracker"));
        System.out.println(conf.get("fs.default.name"));
        File jarFile =createJar(Maptest.class);
        ((JobConf) job.getConfiguration()).setJar(jarFile.toString());
        job.setJarByClass(Maptest.class);
        job.setJobName("wordcount");

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path("test/jjj.txt"));//指定在hadoop文件系统的文件
        Path path = new Path("output");
        FileSystem fSystem = (DistributedFileSystem)FileSystem.get(conf);
        fSystem.delete(path, true);
        FileOutputFormat.setOutputPath(job, path);
        boolean success = job.waitForCompletion(true);
        System.out.println(job.isComplete());
        System.out.println("JobID: " + job.getJobID());
        job.getJobClient().close();//如果不通过jobclient关闭rpc,服务端会报错readAndProcess threw exception java.io.IOException: Connection reset by peer。该方法未开放
        return success ? 0 : 1;
    }
    
    public static void main(String[] args) throws Exception {
        /**
         * TODO:调用一
         */
        int ret = ToolRunner.run(new Maptest(), args);
        System.exit(ret);
    }
}

步骤六、提交执行mapreduce程序,并在浏览器通过http://8.8.8.7:50030查看job执行结果


问题处理:如果服务端jobtracker日志中有如下异常,则表示本地程序执行完毕后没有正常关闭与hadoop中IPC Server的连接,Server从该连接读取数据时出现异常(连接已被对端重置)。

2013-12-26 23:43:37,821 INFO org.apache.hadoop.ipc.Server: IPC Server listener on 9001: readAndProcess threw exception java.io.IOException: Connection reset by peer. Count of bytes read: 0
java.io.IOException: Connection reset by peer
        at sun.nio.ch.FileDispatcher.read0(Native Method)
        at sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:21)
        at sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:233)
        at sun.nio.ch.IOUtil.read(IOUtil.java:206)
        at sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:236)
        at org.apache.hadoop.ipc.Server.channelRead(Server.java:1776)
        at org.apache.hadoop.ipc.Server.access$2700(Server.java:97)
        at org.apache.hadoop.ipc.Server$Connection.readAndProcess(Server.java:1134)
        at org.apache.hadoop.ipc.Server$Listener.doRead(Server.java:577)
        at org.apache.hadoop.ipc.Server$Listener$Reader.run(Server.java:384)
        at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
        at java.lang.Thread.run(Thread.java:619)


上述异常不影响功能,如果需要处理异常则需要使用job.getJobClient().close();方法,当前getJobClient方法未开放。



你可能感兴趣的:(hadoop)