Chinese Word Segmentation with HanLP: A MapReduce Implementation

Implementing Chinese word segmentation with MapReduce. The job is map-only: each input line carries a tab-separated label and sentence, and the mapper segments the sentence with HanLP, filters stop words through CoreStopWordDictionary, and writes the label back out with the space-separated tokens.
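The input file uses a tab between label and sentence, and TextOutputFormat writes key and value separated by a tab by default, so a record passes through like this (placeholders, not literal output):

input:   <label>\t<raw sentence>
output:  <label>\t<token1> <token2> <token3> ...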
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


import java.io.IOException;
import java.util.List;

public class SentenceSeg extends Configured implements Tool {

    public static class SegMapper extends Mapper<LongWritable, Text, IntWritable, Text> {

        // Create the segmenter once per task rather than once per record;
        // POS tagging is disabled since only the surface tokens are needed.
        private final Segment segment = HanLP.newSegment().enablePartOfSpeechTagging(false);
        // Reusable writables to avoid allocating per record.
        private final IntWritable outLabel = new IntWritable();
        private final Text textSentence = new Text();

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each input line is expected to be "label<TAB>sentence"; skip malformed lines.
            String[] strSplits = value.toString().split("\t");
            if (strSplits.length == 2) {
                int label = Integer.parseInt(strSplits[0]);
                String sentence = strSplits[1];

                // Segment, then remove stop words from the term list in place.
                List<Term> segWords = segment.seg(sentence);
                CoreStopWordDictionary.apply(segWords);

                // Join the surface forms with spaces. Building the string from Term.word
                // avoids the original regex over List.toString(), which stripped "/tag,"
                // pairs but left the last term's tag intact (no trailing comma to match).
                StringBuilder sb = new StringBuilder();
                for (Term term : segWords) {
                    if (sb.length() > 0) sb.append(' ');
                    sb.append(term.word);
                }

                outLabel.set(label);
                textSentence.set(sb.toString());
                context.write(outLabel, textSentence);
            }
        }
    }


    @Override
    public int run(String[] args) throws Exception {
        // Use the Configuration injected by ToolRunner (getConf()) rather than a fresh
        // one, so generic options such as -D key=value actually reach the job.
        Configuration conf = getConf();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: SentenceSeg <inputLoc> <outputLoc>");
            return 2;
        }

        Job job = Job.getInstance(conf);
        job.setJobName("credit_text_classify");
        job.setJarByClass(SentenceSeg.class);

        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        job.setMapperClass(SegMapper.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        // Map-only job: segmentation needs no shuffle or reduce phase.
        job.setNumReduceTasks(0);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);

        // Return the exit code instead of calling System.exit() here: main() already
        // passes ToolRunner's result to System.exit(), and exiting inside run() made
        // the original trailing "return 0" dead code.
        return job.waitForCompletion(true) ? 0 : 1;
    }


    public static void main(String[] args) throws Exception {

        int res = ToolRunner.run(new Configuration(), new SentenceSeg(), args);
        System.exit(res);

    }
}
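
Before submitting to a cluster, the mapper's core logic can be sanity-checked as plain Java. This is a minimal sketch: SegDemo and the sample sentence are illustrative, and which tokens survive depends on HanLP's dictionaries and stop-word list.

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;

import java.util.List;

public class SegDemo {
    public static void main(String[] args) {
        // Same pipeline as SegMapper: segment without POS tags, then drop stop words.
        Segment segment = HanLP.newSegment().enablePartOfSpeechTagging(false);
        List<Term> terms = segment.seg("商品和服务");  // arbitrary demo sentence
        CoreStopWordDictionary.apply(terms);
        StringBuilder sb = new StringBuilder();
        for (Term term : terms) {
            if (sb.length() > 0) sb.append(' ');
            sb.append(term.word);
        }
        // Prints the space-separated tokens that survive the stop-word filter.
        System.out.println(sb);
    }
}

To submit the job itself, HanLP's jar, its hanlp.properties, and the dictionary data it points to must be reachable from every task, e.g. bundled into a fat jar or shipped with -libjars (which GenericOptionsParser handles). Then something like `hadoop jar sentence-seg.jar SentenceSeg <inputLoc> <outputLoc>` launches it, where sentence-seg.jar is a placeholder for whatever your build produces.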
