Spark中DataFrame去除NaN、null以及空字符串数据

去除含有 null 或 NaN 的行:

// Build the Spark configuration. The app name is the enclosing class's simple
// name with every '$' removed. No master URL is set here, so it is expected to
// be supplied at launch time (e.g. via spark-submit).
    val sparkConf: SparkConf = {
      val appName = this.getClass.getSimpleName.filterNot(_ == '$')
      new SparkConf().setAppName(appName)
    }

    // Create the SparkContext from that configuration.
    val sparkContext = new SparkContext(sparkConf)

    // Limit log output to warnings and above.
    sparkContext.setLogLevel("WARN")

    // Wrap the context in a SQLContext to get the DataFrame reader API.
    val spark: SQLContext = new SQLContext(sparkContext)

    // Load the source table from MySQL through the JDBC data source.
    val data: DataFrame = spark.read
      .format("jdbc")
      .options(Map(
        "url"      -> "jdbc:mysql://10.213.111.XXX:23306/buf_amr_all",
        "dbtable"  -> "pdwqy_pms_yx_sbxx",
        "user"     -> "qjjc",
        "password" -> "XXX"
      ))
      .load()

    // Drop every row in which any column holds null or NaN, then print the result.
    data.na.drop("any").show()

去掉空字符串(以名为 sentence 的字符串列为例):

// Build the Spark configuration. The app name is the enclosing class's simple
// name with every '$' removed. No master URL is set here, so it is expected to
// be supplied at launch time (e.g. via spark-submit).
    val sparkConf: SparkConf = {
      val appName = this.getClass.getSimpleName.filterNot(_ == '$')
      new SparkConf().setAppName(appName)
    }

    // Create the SparkContext from that configuration.
    val sparkContext = new SparkContext(sparkConf)

    // Limit log output to warnings and above.
    sparkContext.setLogLevel("WARN")

    // Wrap the context in a SQLContext to get the DataFrame reader API.
    val spark: SQLContext = new SQLContext(sparkContext)

    // Load the source table from MySQL through the JDBC data source.
    val data: DataFrame = spark.read
      .format("jdbc")
      .options(Map(
        "url"      -> "jdbc:mysql://10.213.111.XXX:23306/buf_amr_all",
        "dbtable"  -> "pdwqy_pms_yx_sbxx",
        "user"     -> "qjjc",
        "password" -> "XXX"
      ))
      .load()

    // Keep only rows whose `sentence` column is not the empty string, then print them.
    // Note: the SQL comparison evaluates to NULL for NULL values, so rows with a
    // NULL `sentence` are filtered out as well. Whitespace-only strings are kept.
    data.filter("sentence <> ''").show()

你可能感兴趣的:(Spark)