Spark + Ansj Chinese Word Segmentation in Scala

Four Chinese word segmentation tools that can be used with Spark:
HanLP
Ansj
jieba
FudanNLP

Ansj is recommended: it is fast and its segmentation quality is good.
jieba and HanLP also give good results.
For details see:
Ansj: https://github.com/NLPchina/ansj_seg
HanLP: https://github.com/hankcs/HanLP
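
For a quick feel for the Ansj API, here is a minimal standalone sketch (independent of Spark and of the program below): ToAnalysis.parse segments a sentence and toStringWithOutNature prints the words without their part-of-speech tags.

import org.ansj.splitWord.analysis.ToAnalysis

object AnsjDemo {
  def main(args: Array[String]): Unit = {
    // Segment a sentence and print the words separated by spaces, without POS tags
    val result = ToAnalysis.parse("欢迎使用ansj进行中文分词")
    println(result.toStringWithOutNature(" "))
  }
}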

My code is below (with a rough dependency sketch first). Besides the segmentation itself, it shows how to query and insert into MySQL from Scala, how to load a user dictionary and a stop-word dictionary, and how to do a word count with Spark RDDs.
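
The program relies on Spark SQL, ansj_seg (and its nlp-lang companion library), the MySQL JDBC driver, and Gson. A minimal build.sbt sketch follows; the version numbers are assumptions, so adjust them to match your cluster and Scala version.

// build.sbt (sketch; version numbers are assumptions)
libraryDependencies ++= Seq(
  "org.apache.spark"     %% "spark-sql"            % "2.3.0" % "provided",
  "org.ansj"             %  "ansj_seg"             % "5.1.6",
  "org.nlpcn"            %  "nlp-lang"             % "1.7.7",
  "mysql"                %  "mysql-connector-java" % "5.1.47",
  "com.google.code.gson" %  "gson"                 % "2.8.5"
)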

package WordCloud

import java.sql.{Connection, DriverManager}
import java.util

import Mysql.ConvertToJson
import domain.tb_analyze_professional_skill
import org.ansj.library.DicLibrary

import scala.io.Source
import org.ansj.recognition.impl.StopRecognition
import org.ansj.splitWord.analysis.{DicAnalysis, ToAnalysis}
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkConf

/**
  * Created by ljq on 19-2-23.
  */
object WordCloud {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Wordcloud").setMaster("local[4]")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    // Read the source table over JDBC (the xx placeholders stand for host and credentials)
    val jdbcDF = spark.read.format("jdbc").option("url", "jdbc:mysql://xx:3306/job_data?useSSL=false")
      .option("driver", "com.mysql.jdbc.Driver")
      .option("dbtable", "job_data")
      .option("user", "xx")
      .option("password", "xx").load()
    // Keep only column index 11, the free-text field that will be segmented
    val data = jdbcDF.rdd.map(x => x(11))

    // Load the user dictionary
    val dicfile = raw"/home/zq/Desktop/ExtendDic" // ExtendDic is a plain text file with one word per line
    // Read the file line by line and add each word to Ansj's user dictionary
    for (word <- Source.fromFile(dicfile).getLines) {
      DicLibrary.insert(DicLibrary.DEFAULT, word)
    }

    // Load the stop-word dictionary
    val stopworddicfile = raw"/home/zq/Desktop/StopWordDic" // StopWordDic is a plain text file with one stop word per line
    val filter = new StopRecognition()
    filter.insertStopNatures("w") // drop tokens tagged as punctuation ("w")
    filter.insertStopRegexes("^[0-9]*$", "\\s*") // drop pure numbers and empty/whitespace tokens
    for (word <- Source.fromFile(stopworddicfile).getLines) {
      filter.insertStopWords(word)
    }

    // Segment each non-null row with the user dictionary (DicAnalysis), apply the stop filter,
    // and return the words as a space-separated string
    val splited = data.filter(_ != null).map(x => DicAnalysis.parse(x.toString).recognition(filter).toStringWithOutNature(" ")) //.replaceAll("\\s*", "")

    // Word count with Spark RDDs: split on spaces, count occurrences, sort by frequency, keep the top 20
    val wordcloud = splited.cache().flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _, 1).sortBy(_._2, false).take(20)

    // Collect the top-20 (word, count) pairs into a Java list, serialize them to JSON, and store the result
    val list = new util.ArrayList[tb_analyze_professional_skill]()

    wordcloud.foreach(x => list.add(tb_analyze_professional_skill(x._1, x._2.toString)))

    val str = ConvertToJson.ToJson(list)

    insert(str)
    /*data.foreach(println)*/
    /*val jrdd = jdbcDF.collect()
    println(jrdd.toBuffer)*/
  }

  def insert(result: String): Unit = {
    val Driver = "com.mysql.jdbc.Driver"
    val url = "jdbc:mysql://xx:3306/job_data?useUnicode=true&characterEncoding=utf8" +
      "&useSSL=false"
    var conn: Connection = null
    var ps: java.sql.PreparedStatement = null
    val sql = "insert into tb_analyze_professional_skill(result) values(?)"
    try {
      Class.forName(Driver)
      conn = DriverManager.getConnection(url, "xx", "xx")
      ps = conn.prepareStatement(sql)
      ps.setString(1, result)
      ps.executeUpdate()
    } catch {
      case e: Exception => e.printStackTrace()
    } finally {
      // Release JDBC resources even if the insert fails: statement first, then connection
      if (ps != null) ps.close()
      if (conn != null) conn.close()
    }
  }
}

package domain

/**
  * Created by ljq on 19-2-23.
  */
case class job_info(id: Int, direction: Int, job_name: String, company_name: String, job_site: String, job_salary: String, avr_salary: Double, relase_date: String, education_level: String, work_exper: String, company_welfare: String, job_resp: String, job_require: String, company_type: String, company_people_num: String, company_business: String)

case class tb_analyze_professional_skill(name: String, value: String)

case class tb_analyze_job_requirements(name: String, value: String)

package Mysql;

import com.google.gson.Gson;
import domain.tb_analyze_professional_skill;

import java.util.ArrayList;

/**
 * Created by ljq on 19-2-24.
 */
public class ConvertToJson {
    public static String ToJson(ArrayList<tb_analyze_professional_skill> list) {
        Gson gson = new Gson();
        String gsonStr = gson.toJson(list);
        return gsonStr;
    }
}
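
As a rough illustration (the words and counts below are hypothetical), serializing a small list with ConvertToJson produces a JSON array of {name, value} objects; this is the string that insert writes into tb_analyze_professional_skill.

import java.util
import Mysql.ConvertToJson
import domain.tb_analyze_professional_skill

object ConvertToJsonDemo {
  def main(args: Array[String]): Unit = {
    val demo = new util.ArrayList[tb_analyze_professional_skill]()
    demo.add(tb_analyze_professional_skill("spark", "120"))  // hypothetical word and count
    demo.add(tb_analyze_professional_skill("hadoop", "95"))
    // Prints roughly: [{"name":"spark","value":"120"},{"name":"hadoop","value":"95"}]
    println(ConvertToJson.ToJson(demo))
  }
}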
