Combining Spark and HanLP for Word Segmentation

This example uses HanLP inside a Spark job to segment free-form address strings and extract the geographic place-name tokens.
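
The code below assumes Spark and HanLP are already on the classpath. With sbt that might look like the following (the portable HanLP artifact bundles the default dictionaries; the versions shown are illustrative):

    // build.sbt (illustrative versions)
    libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.8"
    libraryDependencies += "com.hankcs" % "hanlp" % "portable-1.7.8"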

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession
      .builder()
      .appName("AddressWordSplit")
      .master("local[*]")
      .getOrCreate()

    // Three sample rows: two genuine addresses and one junk input.
    val df = spark.createDataFrame(Seq(
      ("1", "湖北武汉市汉口北大道12345号"),
      ("2", "成都青羊区清江中路"),
      ("3", "地址是乱输入的")
    )).toDF("id", "address")

    import spark.implicits._
    // Segment each address via the wordSplit extension method (see the core
    // code below) and drop rows that produced no place-name tokens.
    val tmpDf = df.map { r =>
        val id = r.getAs[String]("id")
        val address = r.getAs[String]("address")
        val result = address.wordSplit().mkString("|")
        (id, address, result)
      }
      .filter(_._3.nonEmpty)
      .toDF("id", "address", "address_split")

    tmpDf.show()
    spark.stop()
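
To see where the "ns" tag comes from, it helps to run HanLP by itself and print each token with its part-of-speech ("nature") tag; a minimal sketch using HanLP's static segment API:

    import com.hankcs.hanlp.HanLP
    import scala.collection.JavaConverters._

    object NatureInspect {
      def main(args: Array[String]): Unit = {
        // HanLP.segment returns a java.util.List[Term]; each Term carries
        // the token text (word) and its part-of-speech tag (nature).
        HanLP.segment("湖北武汉市汉口北大道12345号").asScala
          .foreach(t => println(s"${t.word}\t${t.nature}"))
      }
    }

Place names come back tagged ns, which is exactly what the filter in the core code keeps.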

Core code:

    import scala.collection.JavaConverters._

    implicit class WordSplit(word: String) extends Serializable {

      /** Segments the string with HanLP and returns the distinct place-name
        * tokens (nature tag "ns"). With flag = true, the characters listed
        * in usenessWs are additionally stripped from every token. */
      def wordSplit(flag: Boolean = false): Seq[String] = {
        Option(word) match {
          case None => Seq.empty[String]
          case Some(s) =>
            // HanLP returns a java.util.List[Term]; convert it to a Scala collection
            val el = segments.seg(s.trim).asScala
            val result =
              if (el.isEmpty) Seq.empty[String]
              else {
                // keep only geographic nouns (nature tag "ns")
                el.filter(_.nature.name() == "ns")
                  .map(_.word.trim.replaceAll(" ", ""))
                  .filterNot(_.isEmpty)
                  .distinct
              }
            if (flag) result.map(_.replaceAll(usenessWs.mkString("[", " ", "]"), ""))
            else result
        }
      }
    }
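
The snippet references two values it does not define: segments (the HanLP segmenter) and usenessWs (characters to strip when flag = true). A plausible sketch, assuming both live in the same enclosing object as the implicit class and a HanLP 1.x API:

    import com.hankcs.hanlp.HanLP
    import com.hankcs.hanlp.seg.Segment

    // Hypothetical definitions, not shown in the original post.
    // A segmenter with place-name recognition switched on, so that tokens
    // such as 武汉市 come back tagged "ns":
    val segments: Segment = HanLP.newSegment().enablePlaceRecognize(true)

    // Illustrative characters to scrub when flag = true; mkString("[", " ", "]")
    // assembles them into a regex character class:
    val usenessWs: Seq[String] = Seq("省", "市", "区", "县")

Keeping segments in a top-level object rather than capturing it in the closure also sidesteps serialization problems if the Segment class is not serializable in the HanLP version used.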
