hive UDF 提取文本中的地名

有时需要从文本字符串中提取地名。为了便于在 Hive 中使用,这里开发了一个 Hive UDF,其核心是借助 HanLP 这一中文文本处理神器的地名识别功能。完整代码如下:



import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;


/**
 * Hive UDF that extracts place names from a text string.
 *
 * <p>The input is segmented by a HanLP {@link Segment} with place recognition
 * enabled; every term whose nature tag is {@code ns} is kept. The result is the
 * kept words in their original order, each followed by a single space (so a
 * non-empty result ends with a space, and an input without any place name
 * yields an empty string).
 *
 * <p>The class name is intentionally left as-is because it is the name the
 * function is registered under.
 */
public class placeExtract extends UDF{
    /** Name of the nature tag that marks a term as a place name. */
    private static final String PLACE_NATURE = "ns";

    /**
     * Segmenter reused across {@link #evaluate(Text)} calls so that it is not
     * rebuilt for every input value. Created lazily on first use and kept
     * {@code transient} so it is never part of any serialized form of this object.
     */
    private transient Segment segment;

    /**
     * Extracts the place names contained in {@code inputStr}.
     *
     * @param inputStr the text to analyse; may be {@code null}
     * @return the place names, each followed by one space, or {@code null} when
     *         {@code inputStr} is {@code null}
     */
    public Text evaluate(Text inputStr){
        if(inputStr == null){
            return null;
        }
        if(segment == null){
            segment = newPlaceSegment();
        }
        Text result = new Text();
        result.set(extractPlaces(segment, inputStr.toString()));
        return result;
    }

    /** Builds a HanLP segmenter with place recognition switched on. */
    private static Segment newPlaceSegment(){
        return HanLP.newSegment().enablePlaceRecognize(true);
    }

    /**
     * Segments {@code text} and concatenates every place-name term, appending a
     * single space after each one.
     *
     * <p>The nature tag and the word are read from the {@link Term} fields
     * directly rather than parsed back out of {@code Term.toString()}: the string
     * form only carries the {@code /nature} suffix when HanLP is configured to
     * show it, and stripping {@code "/ns"} textually would also damage a word
     * that happens to contain that sequence.
     */
    private static String extractPlaces(Segment segment, String text){
        StringBuilder places = new StringBuilder();
        List<Term> termList = segment.seg(text);
        for(Term term : termList){
            if(term.nature != null && PLACE_NATURE.equals(term.nature.toString())){
                places.append(term.word).append(' ');
            }
        }
        return places.toString();
    }

    /**
     * Small manual smoke test: prints the extracted place names for a couple of
     * sample sentences, one line per sentence.
     */
    public static void main(String[] args){
        String[] testCase = new String[]{
                "北京朝阳区大妈",
                "山东青岛大虾"
        };
        Segment segment = newPlaceSegment();
        for(String sentence : testCase){
            System.out.println(extractPlaces(segment, sentence));
        }
    }

}


你可能感兴趣的:(hive,udf,文本分析)