Using Four Chinese Word Segmentation Tools with Spark
HanLP
ansj
jieba
FudanNLP
ansj is recommended: it is fast and gives good results.
jieba and HanLP also work well.
For details, see:
ansj:https://github.com/NLPchina/ansj_seg
HanLP:https://github.com/hankcs/HanLP
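Before wiring ansj into Spark, it can be sanity-checked on its own. A minimal sketch using the same parse / toStringWithOutNature calls as the full program below:

import org.ansj.splitWord.analysis.ToAnalysis

object AnsjDemo {
  def main(args: Array[String]): Unit = {
    // Segment a sentence with ansj's standard analyzer; print space-separated terms without POS tags
    println(ToAnalysis.parse("欢迎使用ansj中文分词").toStringWithOutNature(" "))
  }
}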
My code is below. Besides the segmentation itself, it covers querying and inserting into MySQL from Scala, loading a custom dictionary, loading a stop-word dictionary, and a word count implemented with Spark RDDs.
package WordCloud
import java.sql.{Connection, DriverManager}
import java.util
import Mysql.ConvertToJson
import domain.tb_analyze_professional_skill
import org.ansj.library.DicLibrary
import scala.io.Source
import org.ansj.recognition.impl.StopRecognition
import org.ansj.splitWord.analysis.DicAnalysis
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkConf
/**
* Created by ljq on 19-2-23.
*/
object WordCloud {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Wordcloud").setMaster("local[4]")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val jdbcDF = spark.read.format("jdbc")
      .option("url", "jdbc:mysql://xx:3306/job_data?useSSL=false")
      .option("driver", "com.mysql.jdbc.Driver")
      .option("dbtable", "job_data")
      .option("user", "xx")
      .option("password", "xx")
      .load()
    val data = jdbcDF.rdd.map(x => x(11))
    // Load a custom dictionary: ExtendDic is a text file with one word per line
    val dicfile = raw"/home/zq/Desktop/ExtendDic"
    // Read the file line by line and add each word to ansj's user dictionary
    for (word <- Source.fromFile(dicfile).getLines) {
      DicLibrary.insert(DicLibrary.DEFAULT, word)
    }
    // Load a stop-word dictionary: StopWordDic is a text file with one word per line
    val stopworddicfile = raw"/home/zq/Desktop/StopWordDic"
    val filter = new StopRecognition()
    filter.insertStopNatures("w") // filter out punctuation (POS nature "w")
    filter.insertStopRegexes("^[0-9]*$", "\\s*") // filter out pure numbers and blank tokens
    for (word <- Source.fromFile(stopworddicfile).getLines) {
      filter.insertStopWords(word)
    }
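    // Segment each row with the user dictionary (DicAnalysis), apply the stop filter,
    // and keep the space-separated terms without POS tags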
    val splited = data.filter(_ != null)
      .map(x => DicAnalysis.parse(x.toString).recognition(filter).toStringWithOutNature(" "))
    val wordcloud = splited.cache()
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _, 1)
      .sortBy(_._2, false)
      .take(20)
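    // Collect the 20 most frequent terms into a Java list so Gson can serialize them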
    val list = new util.ArrayList[tb_analyze_professional_skill]()
    wordcloud.foreach(x => list.add(tb_analyze_professional_skill(x._1, x._2.toString)))
    val str = ConvertToJson.ToJson(list)
    insert(str)
    spark.stop()
  }
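  // Persist the aggregated JSON string into MySQL through plain JDBC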
  def insert(result: String): Unit = {
    val Driver = "com.mysql.jdbc.Driver"
    val url = "jdbc:mysql://xx:3306/job_data?useUnicode=true&characterEncoding=utf8" +
      "&useSSL=false"
    var conn: Connection = null
    var ps: java.sql.PreparedStatement = null
    val sql = "insert into tb_analyze_professional_skill(result) values(?)"
    try {
      Class.forName(Driver)
      conn = DriverManager.getConnection(url, "xx", "xx")
      ps = conn.prepareStatement(sql)
      ps.setString(1, result)
      ps.executeUpdate()
    } catch {
      case e: Exception => e.printStackTrace()
    } finally {
      // Close resources even if the insert fails, guarding against nulls
      if (ps != null) ps.close()
      if (conn != null) conn.close()
    }
  }
}
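The word count itself is the standard flatMap / map / reduceByKey RDD pattern. A minimal local sketch on hand-written sample strings (assuming the same SparkSession spark as in main):

val sample = spark.sparkContext.parallelize(Seq("hadoop spark hadoop", "spark scala spark"))
val top = sample.flatMap(_.split(" "))
  .map((_, 1))
  .reduceByKey(_ + _)
  .sortBy(_._2, ascending = false)
  .take(2)
top.foreach(println) // prints (spark,3) then (hadoop,2)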
package domain
/**
* Created by ljq on 19-2-23.
*/
case class job_info(id: Int, direction: Int, job_name: String, company_name: String,
                    job_site: String, job_salary: String, avr_salary: Double,
                    relase_date: String, education_level: String, work_exper: String,
                    company_welfare: String, job_resp: String, job_require: String,
                    company_type: String, company_people_num: String, company_business: String)
case class tb_analyze_professional_skill(name: String, value: String)
case class tb_analyze_job_requirements(name: String, value: String)
package Mysql;
import com.google.gson.Gson;
import domain.tb_analyze_professional_skill;
import java.util.ArrayList;
/**
* Created by ljq on 19-2-24.
*/
public class ConvertToJson {
    public static String ToJson(ArrayList<tb_analyze_professional_skill> list) {
        Gson gson = new Gson();
        String gsonStr = gson.toJson(list);
        return gsonStr;
    }
}
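As a quick illustration of the JSON this helper produces, here is a hypothetical driver (the names and counts are made up) reusing the case class above:

import java.util
import domain.tb_analyze_professional_skill
import Mysql.ConvertToJson

object ConvertToJsonDemo {
  def main(args: Array[String]): Unit = {
    val list = new util.ArrayList[tb_analyze_professional_skill]()
    list.add(tb_analyze_professional_skill("java", "320"))
    list.add(tb_analyze_professional_skill("spark", "210"))
    // Gson reflects over the case-class fields, so this should print:
    // [{"name":"java","value":"320"},{"name":"spark","value":"210"}]
    println(ConvertToJson.ToJson(list))
  }
}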