Notes on Chinese word segmentation and word frequency counting in Hive

1. Write the UDF. The IK analyzer jar is used here for segmentation; the UDF returns the segmented tokens joined into a single space-separated string.

package hiveUDF;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IkParticiple extends UDF {
	public String evaluate(String input) {
		// Return null directly for empty input
		if (input == null || input.trim().length() == 0) {
			return null;
		}
		// Alternative: Jieba segmenter
		// JiebaSegmenter segmenter = new JiebaSegmenter();
		// output = segmenter.sentenceProcess(input).toString().replaceAll(", ", " ").toLowerCase();

		// Wrap the input in a Reader for IK; use UTF-8 explicitly so the result
		// does not depend on the platform default charset
		byte[] bt = input.getBytes(StandardCharsets.UTF_8);
		InputStream ip = new ByteArrayInputStream(bt);
		Reader read = new InputStreamReader(ip, StandardCharsets.UTF_8);
		// true = smart segmentation mode
		IKSegmenter iks = new IKSegmenter(read, true);
		Lexeme t;
		StringBuilder output = new StringBuilder();
		try {
			// Append each token in lower case, separated by spaces
			while ((t = iks.next()) != null) {
				output.append(t.getLexemeText().toLowerCase()).append(" ");
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return output.toString();
	}

	/*public static void main(String[] args) {
		System.out.println(new IkParticiple().evaluate("测试我的分词udf"));
	}*/
	// Output: 测试 我 的 分词 udf
}

2. Package the project with a build tool (e.g. Maven).
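
A minimal sketch of this step, assuming the project follows a standard Maven layout (the target/ path used in the next step suggests this); the IK analyzer jar must be available on the build classpath:

cd /home/hadoop/workspace/hiveUDF
mvn clean package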
3. In Hive, add the packaged UDF jar and the IK analyzer jar it depends on:

add jar /home/hadoop/workspace/hiveUDF/target/hiveUDF-0.0.1-SNAPSHOT.jar;
add jar /home/hadoop/IKAnalyzer2012_u6.jar;
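
The registered resources can be verified with Hive's list jars command, for example:

list jars;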

4. Create a temporary function for the segmenter:

create temporary function fenciTest as 'hiveUDF.IkParticiple';
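
A quick sanity check of the new function (the expected output matches the commented-out main() test in the UDF source):

select fenciTest('测试我的分词udf');
-- 测试 我 的 分词 udf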

5. Run a word-frequency count over the job_info column of the jobs table and output the results in descending order of frequency:
create table word_counts as
select word, count(1) as count
from (select explode(split(fenciTest(job_info), ' ')) as word from jobs) word
group by word
order by count desc;
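
Once the table is built, the most frequent words can be inspected, for example:

select * from word_counts order by count desc limit 20;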
