Spark Day 3

  • Data analysis case

    Count how many times each module is visited within a given time period or at a given point in time, and take the three most-visited modules as a reference.

    import java.net.URL

    import org.apache.spark.rdd.RDD
    import org.apache.spark.{SparkConf, SparkContext}

    // TODO Count how many times each module is visited within a given time period or point in time,
    // and take the three most-visited modules as a reference.
    // Sample input (timestamp \t url):
    // 20161123101523  http://java.learn.com/java/javaee.shtml
    // 20161123101523  http://java.learn.com/java/javaee.shtml
    // 20161123101523  http://ui.learn.com/ui/video.shtml
    // 20161123101523  http://bigdata.learn.com/bigdata/
    object Subject {
      def main(args: Array[String]): Unit = {
        val sparkConf = new SparkConf().setAppName("Master").setMaster("local[*]")
        val sc = new SparkContext(sparkConf)
        val data = sc.textFile("G://scala/数据/access.txt")
        // Data cleaning: extract the URL and map it to (url, 1)
        val urlClickOne: RDD[(String, Int)] = data.map(line => {
          val fields = line.split("\t")
          val url = fields(1)
          (url, 1)
        })
        // Sum the clicks per URL
        val sumUrl: RDD[(String, Int)] = urlClickOne.reduceByKey(_ + _)
        // Extract the subject (host) from each URL
        val subjectUrlCount: RDD[(String, String, Int)] = sumUrl.map(content => {
          val url = content._1
          val count = content._2
          val subject = new URL(url).getHost
          (subject, url, count)
        })
        // Group by subject, sort each group by count descending and take the top 3
        val result = subjectUrlCount.groupBy(_._1).mapValues(_.toList.sortBy(_._3).reverse.take(3))
        result.foreach(println)
        sc.stop()
      }
    }
    
    Run result:
    (ui.learn.com,List((ui.learn.com,http://ui.learn.com/ui/video.shtml,37),
    (ui.learn.com,http://ui.learn.com/ui/course.shtml,26),
    (ui.learn.com,http://ui.learn.com/ui/teacher.shtml,23)))
    
  • Custom partitioner

      import java.net.URL

      import org.apache.spark.rdd.RDD
      import org.apache.spark.{Partitioner, SparkConf, SparkContext}

      import scala.collection.mutable

      // TODO Partition the data by subject
      object Subject02 {
        def main(args: Array[String]): Unit = {
          val sparkConf = new SparkConf().setAppName("Master").setMaster("local[*]")
          val sc = new SparkContext(sparkConf)
          val access: RDD[String] = sc.textFile("E://access.txt", 1)
          // 2. Extract the dimensions we need: (subject, url)
          val data = access.map(line => {
            val fields = line.split("\t")
            val url = fields(1)
            val subject = new URL(url).getHost
            (subject, url)
          })

          // Use the custom partitioner: one partition per distinct subject
          val subjectPartitioner = new SubjectPartitioner(data.keys.distinct.collect())
          // Write each subject's data to its own partition file (output path to be filled in)
          data.groupBy(_._1).partitionBy(subjectPartitioner).saveAsTextFile("")
        }
      }

      class SubjectPartitioner(subjects: Array[String]) extends Partitioner {
        // Number of partitions = number of subjects
        override def numPartitions: Int = subjects.length
        // Mapping from key (the subject) to its partition index
        val map: mutable.Map[String, Int] = new mutable.HashMap
        var partition = 0
        // Assign each subject a partition index, one by one
        for (subject <- subjects) {
          map.put(subject, partition)
          partition += 1
        }
        // Partitioning rule: look up the partition index for the key
        override def getPartition(key: Any): Int = {
          map.getOrElse(key.toString, 0)
        }
      }
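
      To check that the partitioner behaves as expected, here is a minimal verification sketch (assuming the `data` RDD and the `SubjectPartitioner` defined above; the output shape is illustrative, not recorded output):

      // Sketch: print which subject landed in which partition
      val partitioned = data.groupBy(_._1).partitionBy(new SubjectPartitioner(data.keys.distinct.collect()))
      partitioned
        .mapPartitionsWithIndex((idx, iter) => iter.map { case (subject, urls) => (idx, subject, urls.size) })
        .collect()
        .foreach(println) // expect one subject per partition index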
      
  • Custom sorting

import org.apache.spark.{SparkConf, SparkContext}

case class Person(name: String, age: Int, height: Int)

object MySort {
  // Custom Ordering for Person: younger first; for the same age, the taller person first.
  // Pulled into scope in SortTest via import so sortBy can find it.
  implicit val personOrder: Ordering[Person] = new Ordering[Person] {
    override def compare(x: Person, y: Person): Int = {
      if (x.age != y.age) {
        x.age - y.age
      } else {
        y.height - x.height
      }
    }
  }
}

object SortTest {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("Master").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    val data = sc.parallelize(Array(("tom", 20, 169), ("jerry", 19, 173), ("jack", 21, 175), ("rose", 20, 170)), 1)
    // Bring the implicit Ordering[Person] into scope for sortBy
    import MySort.personOrder
    // ascending = false reverses the Ordering: oldest first, and for equal ages the shorter person first
    data.sortBy(person => Person(person._1, person._2, person._3), false).foreach(println)
    sc.stop()
  }
}

    // Run result:
    // (jack,21,175) (tom,20,169) (rose,20,170) (jerry,19,173)
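
As a simpler alternative (a sketch, not from the original notes), the same ordering can be expressed with a tuple key instead of a custom Ordering, assuming the `data` RDD from SortTest above:

// Sketch: (-age, height) sorts by age descending, then height ascending, matching the result above
data.sortBy(person => (-person._2, person._3), true).foreach(println)
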
  • Sharing a variable across tasks (accumulators)
Because the list is split across partitions, an ordinary driver-side variable cannot be shared with the tasks; use an accumulator to share the running total (see the sketch after the code below).

package com.qfedu.day13

import org.apache.spark.{SparkConf, SparkContext}

object AccumulatorTest {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("Master").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    val list = List(1, 2, 3, 4, 5, 6)
    val listRDD = sc.parallelize(list, 3)
    // A shared variable must be created through the SparkContext
    val accumulator = sc.accumulator(0)
    // An ordinary variable cannot be used to keep the count:
    // each task gets its own copy, so the driver-side value never changes
    var sum = 0
    listRDD.foreach(x => {
      accumulator += x
    })
    println(accumulator)
    sc.stop()
  }
}
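
A minimal sketch of the contrast (assuming the `sc` and `listRDD` from above; not part of the original notes): updates to an ordinary variable happen on copies of the closure shipped to the tasks, so they are not reliably reflected on the driver, while accumulator updates are merged back.

// Sketch: plain var vs. accumulator (assumes `sc` and `listRDD` from above)
var plainSum = 0
listRDD.foreach(x => plainSum += x)   // tasks update copies of the closure's variables
println(plainSum)                     // on a cluster this stays 0; not a reliable total

val acc = sc.accumulator(0)           // Spark 1.x accumulator API
listRDD.foreach(x => acc += x)        // task-side updates are merged back to the driver
println(acc.value)                    // 21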

  • SparkSQL
    • DataFrame

Project setup: libraryDependencies += "org.apache.spark" %% "spark-sql" % "1.6.3"

Building a DataFrame by reading data from a file:

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

case class Make(id: Int, makeName: String)

object DataFrameTest {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("Master").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)
    // Read the comma-separated file and map each line to a Make case class
    val data = sc.textFile("E://make.txt").map(_.split(","))
    val dataFrame = sqlContext.createDataFrame(data.map(make => Make(make(0).toInt, make(1))))
    dataFrame.show()
    sc.stop()
  }
}

Specifying the schema with StructType

Registering a DataFrame as a table

A DataFrame can be registered as a table with the registerTempTable method and then queried directly with SQL.

Because a DataFrame created from plain tuples gets default column names of the form _1, _2, _3, we build the rows ourselves and describe each column with a StructType so the table ends up with proper column names (id, makeName). The sketch below shows the default behaviour for comparison.
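
A minimal sketch of the default naming (assuming the same make.txt file and the Spark 1.6 `sqlContext`/`sc` from the examples; not from the original notes):

// Sketch: without an explicit schema, a tuple-based DataFrame gets columns _1, _2, ...
import sqlContext.implicits._
val tupleDF = sc.textFile("E://make.txt")
  .map(_.split(","))
  .map(fields => (fields(0).toInt, fields(1)))
  .toDF()                                      // columns are named _1 and _2
tupleDF.printSchema()
val namedDF = tupleDF.toDF("id", "makeName")   // rename the columns explicitly
namedDF.printSchema()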

package com.qfedu.day13

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}

object DataFrameTest {
  def main(args: Array[String]): Unit = {
    // 0. Initialization
    val sparkConf = new SparkConf().setAppName("SparkSQL").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    // 1. Build the SQLContext
    val sqlContext = new SQLContext(sc)
    val data: RDD[String] = sc.textFile("E://make.txt")
    // 2. Build the Row structure we need
    val dataRDD: RDD[Row] = data.map(line => {
      val id = line.split(",")(0).toInt
      val makeName = line.split(",")(1)
      // Row carries no compile-time type information
      Row(id, makeName)
    })
    val schema = StructType(
      // Describe each column's name and type as a list
      List(StructField("id", IntegerType), StructField("makeName", StringType))
    )
    // Build the DataFrame from the row RDD and the schema
    val dataFrame = sqlContext.createDataFrame(dataRDD, schema)
    // Print the schema metadata
    dataFrame.printSchema()
    // dataFrame.show()
    // Register as a temporary table
    dataFrame.registerTempTable("make")
    // Query the text data with SQL
    sqlContext.sql("select * from make where id < 10").show()
    sc.stop()
  }
}
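
The same query can also be written with the DataFrame API instead of SQL; a small sketch assuming the `dataFrame` built above:

// Sketch: equivalent queries using the DataFrame API (Spark 1.6)
dataFrame.filter(dataFrame("id") < 10).show()
// or with a SQL-style expression string:
dataFrame.filter("id < 10").show()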


  • Reading data from a relational database via JDBC and turning it into an RDD

Project setup: libraryDependencies += "mysql" % "mysql-connector-java" % "5.1.46"

useUnicode: whether to use the Unicode character set; if characterEncoding is set to gb2312 or gbk, this parameter must be set to true.

characterEncoding: specifies the character encoding when useUnicode is set to true, for example gb2312 or gbk (the example below uses utf8).

package com.qfedu.day13

import java.sql.DriverManager

import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{SparkConf, SparkContext}

object JdbcRDDTest {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("JdbcRDD").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    val jdbcUrl = "jdbc:mysql://localhost:3306/w01?useUnicode=true&characterEncoding=utf8"
    val user = "root"
    val password = "mysql"
    val conn = () => {
      Class.forName("com.mysql.jdbc.Driver").newInstance()
      DriverManager.getConnection(jdbcUrl, user, password)
    }
    val sql = "select * from make where id between ? and ?"
    val jdbcRDD: JdbcRDD[(Int, String)] = new JdbcRDD(sc, conn, sql, 0, 300, 3,
      res => {
        val id = res.getInt("id")
        val makeName = res.getString("makeName")
        (id, makeName)
      }
    )
    println(jdbcRDD.collect().toBuffer)
    sc.stop()
  }
}
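
A note on the JdbcRDD parameters (from the Spark API, not spelled out in the original notes): the SQL string must contain exactly two ? placeholders, which Spark fills with the lower and upper bound of each partition's key range; here the id range 0 to 300 is split across 3 partitions, and the final function maps each ResultSet row to an (id, makeName) tuple.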
