package com.iminer.alg.review.movie.xinjian;

import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectOutputStream;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.Pair;

import com.iminer.alg.review.opinion.ClassifierModelPredict;
import com.iminer.tool.common.util.Tools;

public class predictMR extends Configured implements Tool {

    public static class MRTemplateNewMapper extends Mapper<Text, Text, IntWritable, Text> {

        /** Classifier model; deserialized in setup() from the cached model serialized by main(). */
        private ClassifierModelPredict cmp;

        @Override
        // Skip this step if there are no resources to initialize.
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            Configuration conf = context.getConfiguration();
            try {
                // Locate the serialized model ("predict.out") among the cached
                // files and deserialize it once per task.
                URI[] uriArr = DistributedCache.getCacheFiles(conf);
                for (URI uri : uriArr) {
                    if (uri.toString().contains("predict.out")) {
                        FileSystem fs = FileSystem.get(conf);
                        FSDataInputStream input = fs.open(new Path(uri.toString()));
                        try {
                            cmp = (ClassifierModelPredict) Tools.getObjectFromBytes(input2byte(input));
                            System.out.println("Model loaded successfully!");
                        } catch (Exception e) {
                            e.printStackTrace();
                        }
                        input.close();
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        @Override
        protected void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            // Predict a class label for the text and emit (label, original text).
            String content = value.toString();
            try {
                Pair<Integer, Double> pair = cmp.mygetPrediction(content, true);
                context.write(new IntWritable(pair.getFirst().intValue()), value);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        @Override
        // Skip this step if there are no resources to release.
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
        }
    }

    /**
     * Unused reducer template: run() does not set it, so Hadoop's default
     * identity reducer applies. Kept as a placeholder for future aggregation.
     */
    public static class MRTemplateNewReducer extends Reducer<Writable, Writable, Writable, Writable> {

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
        }

        @Override
        protected void reduce(Writable key, Iterable<Writable> values, Context context)
                throws IOException, InterruptedException {
            for (Writable value : values) {
                // No-op: placeholder for per-group processing.
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            super.cleanup(context);
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        // Set the job name and jar.
        job.setJobName("Data predict");
        job.setJarByClass(predictMR.class);

        // Map-only job: no reducer is set, so the identity reducer applies.
        job.setMapperClass(MRTemplateNewMapper.class);
        // job.setReducerClass(MRTemplateNewReducer.class);

        // Output key/value types.
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);

        // Input/output formats.
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Input/output paths. The input was previously hard-coded to
        // "/user/fanglei/task/tv_weibo_splitted/北京爱情故事-r-00000";
        // it now comes from args[0] so the job is reusable.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileSystem fs = FileSystem.get(getConf());
        Path outputPath = new Path(args[1]);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        // Submit the job and wait for completion.
        job.waitForCompletion(true);
        return job.isSuccessful() ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // Build the classifier locally and serialize it to "predict.out".
        ClassifierModelPredict cmp = new ClassifierModelPredict("lexAftRemove", "modelk2Final");
        FileOutputStream fos = new FileOutputStream("predict.out");
        ObjectOutputStream oos = new ObjectOutputStream(fos);
        oos.writeObject(cmp);
        oos.close();

        // Ship the serialized model to HDFS and register it with the distributed cache.
        Configuration conf = new Configuration();
        FileSystem localFS = FileSystem.get(conf);
        localFS.copyFromLocalFile(true, new Path("predict.out"),
                new Path("/user/lvxinjian/" + "predict.out"));
        DistributedCache.addCacheFile(new URI("/user/lvxinjian/" + "predict.out"), conf);

        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        int result = ToolRunner.run(conf, new predictMR(), otherArgs);
        System.exit(result);
    }

    /** Reads an input stream fully into a byte array. */
    public static final byte[] input2byte(InputStream inStream) throws IOException {
        ByteArrayOutputStream swapStream = new ByteArrayOutputStream();
        byte[] buff = new byte[4096];
        int rc;
        while ((rc = inStream.read(buff, 0, buff.length)) > 0) {
            swapStream.write(buff, 0, rc);
        }
        return swapStream.toByteArray();
    }
}
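
/*
 * Usage sketch (assumptions, not verified against this repo's build): main()
 * serializes the model and stages it in the distributed cache before handing
 * the remaining arguments to run(), so a typical launch looks like the
 * following, where the jar name and HDFS paths are illustrative only:
 *
 *   hadoop jar iminer-predict.jar com.iminer.alg.review.movie.xinjian.predictMR \
 *       /user/lvxinjian/reviews-seq /user/lvxinjian/predict-out
 *
 * args[0] is the SequenceFile input of <Text, Text> pairs and args[1] the
 * output directory (deleted first if it already exists).
 */

/*
 * A minimal, self-contained sketch of the serialization round trip the job
 * depends on, using only java.io: main() writes the model with
 * ObjectOutputStream, and the mapper rebuilds it from the raw cached bytes
 * (here via ObjectInputStream; Tools.getObjectFromBytes is an internal
 * utility assumed to do the equivalent). It assumes ClassifierModelPredict
 * implements Serializable; this class is illustrative and never invoked by
 * the job itself.
 */
class ModelRoundTripSketch {

    /** Serializes any Serializable object to a byte array. */
    static byte[] toBytes(Object obj) throws java.io.IOException {
        java.io.ByteArrayOutputStream bos = new java.io.ByteArrayOutputStream();
        java.io.ObjectOutputStream oos = new java.io.ObjectOutputStream(bos);
        oos.writeObject(obj);
        oos.close();
        return bos.toByteArray();
    }

    /** Rebuilds the object from the bytes produced by toBytes(). */
    static Object fromBytes(byte[] bytes) throws java.io.IOException, ClassNotFoundException {
        java.io.ObjectInputStream ois =
                new java.io.ObjectInputStream(new java.io.ByteArrayInputStream(bytes));
        Object obj = ois.readObject();
        ois.close();
        return obj;
    }
}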