Spark Day 3

  • Data analysis case

    Count how many times each module is visited within a given time period or at a given point in time, and take the three most-visited modules as a reference.

    import java.net.URL

    import org.apache.spark.rdd.RDD
    import org.apache.spark.{SparkConf, SparkContext}

    // TODO Count how many times each module is visited within a given time period or point in time,
    // and take the three most-visited modules as a reference.
    // Sample input (timestamp \t url):
    // 20161123101523  http://java.learn.com/java/javaee.shtml
    // 20161123101523  http://java.learn.com/java/javaee.shtml
    // 20161123101523  http://ui.learn.com/ui/video.shtml
    // 20161123101523  http://bigdata.learn.com/bigdata/
    object Subject {
      def main(args: Array[String]): Unit = {
        val sparkConf = new SparkConf().setAppName("Master").setMaster("local[*]")
        val sc = new SparkContext(sparkConf)
        val data = sc.textFile("G://scala/数据/access.txt")
        // Data cleaning: extract the URL and map it to (url, 1)
        val urlClickOne: RDD[(String, Int)] = data.map(line => {
          val fields = line.split("\t")
          val url = fields(1)
          (url, 1)
        })
        // Sum the clicks per URL
        val sumUrl: RDD[(String, Int)] = urlClickOne.reduceByKey(_ + _)
        // Extract the subject (host) from each URL
        val subjectUrlCount: RDD[(String, String, Int)] = sumUrl.map(content => {
          val url = content._1
          val count = content._2
          val subject = new URL(url).getHost
          (subject, url, count)
        })
        // Group by subject, sort each group by count descending and take the top 3
        val result = subjectUrlCount.groupBy(_._1).mapValues(_.toList.sortBy(_._3).reverse.take(3))
        result.foreach(println)
        sc.stop()
      }
    }
    
    Run result:
    (ui.learn.com,List((ui.learn.com,http://ui.learn.com/ui/video.shtml,37),
    (ui.learn.com,http://ui.learn.com/ui/course.shtml,26),
    (ui.learn.com,http://ui.learn.com/ui/teacher.shtml,23)))
    
  • Custom partitioner

      import java.net.URL

      import org.apache.spark.rdd.RDD
      import org.apache.spark.{Partitioner, SparkConf, SparkContext}

      import scala.collection.mutable

      // TODO Partition the data by subject
      object Subject02 {
        def main(args: Array[String]): Unit = {
          val sparkConf = new SparkConf().setAppName("Master").setMaster("local[*]")
          val sc = new SparkContext(sparkConf)
          val access: RDD[String] = sc.textFile("E://access.txt", 1)
          // 2. Extract the dimensions we need: (subject, url)
          val data = access.map(line => {
            val fields = line.split("\t")
            val url = fields(1)
            val subject = new URL(url).getHost
            (subject, url)
          })

          // Use the custom partitioner: one partition per distinct subject
          val subjectPartitioner = new SubjectPartitioner(data.keys.distinct.collect())
          // Write each subject's data to its own partition file (output path to be filled in)
          data.groupBy(_._1).partitionBy(subjectPartitioner).saveAsTextFile("")
        }
      }

      class SubjectPartitioner(subjects: Array[String]) extends Partitioner {
        // Number of partitions = number of subjects
        override def numPartitions: Int = subjects.length
        // Mapping from key (the subject) to its partition index
        val map: mutable.Map[String, Int] = new mutable.HashMap
        var partition = 0
        // Assign each subject a partition index, one by one
        for (subject <- subjects) {
          map.put(subject, partition)
          partition += 1
        }
        // Partitioning rule: look up the partition index for the key
        override def getPartition(key: Any): Int = {
          map.getOrElse(key.toString, 0)
        }
      }
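
      To check that the partitioner behaves as expected, here is a minimal verification sketch (assuming the `data` RDD and the `SubjectPartitioner` defined above; the output shape is illustrative, not recorded output):

      // Sketch: print which subject landed in which partition
      val partitioned = data.groupBy(_._1).partitionBy(new SubjectPartitioner(data.keys.distinct.collect()))
      partitioned
        .mapPartitionsWithIndex((idx, iter) => iter.map { case (subject, urls) => (idx, subject, urls.size) })
        .collect()
        .foreach(println) // expect one subject per partition index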
      
  • Custom sorting

import org.apache.spark.{SparkConf, SparkContext}

case class Person(name: String, age: Int, height: Int)

object MySort {
  // Custom Ordering for Person: younger first; for the same age, the taller person first.
  // Pulled into scope in SortTest via import so sortBy can find it.
  implicit val personOrder: Ordering[Person] = new Ordering[Person] {
    override def compare(x: Person, y: Person): Int = {
      if (x.age != y.age) {
        x.age - y.age
      } else {
        y.height - x.height
      }
    }
  }
}

object SortTest {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("Master").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    val data = sc.parallelize(Array(("tom", 20, 169), ("jerry", 19, 173), ("jack", 21, 175), ("rose", 20, 170)), 1)
    // Bring the implicit Ordering[Person] into scope for sortBy
    import MySort.personOrder
    // ascending = false reverses the Ordering: oldest first, and for equal ages the shorter person first
    data.sortBy(person => Person(person._1, person._2, person._3), false).foreach(println)
    sc.stop()
  }
}

    // Run result:
    // (jack,21,175) (tom,20,169) (rose,20,170) (jerry,19,173)
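
As a simpler alternative (a sketch, not from the original notes), the same ordering can be expressed with a tuple key instead of a custom Ordering, assuming the `data` RDD from SortTest above:

// Sketch: (-age, height) sorts by age descending, then height ascending, matching the result above
data.sortBy(person => (-person._2, person._3), true).foreach(println)
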
  • Sharing a variable across tasks (accumulators)
Because the list is split across partitions, an ordinary driver-side variable cannot be shared with the tasks; use an accumulator to share the running total (see the sketch after the code below).

package com.qfedu.day13

import org.apache.spark.{SparkConf, SparkContext}

object AccumulatorTest {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("Master").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    val list = List(1, 2, 3, 4, 5, 6)
    val listRDD = sc.parallelize(list, 3)
    // A shared variable must be created through the SparkContext
    val accumulator = sc.accumulator(0)
    // An ordinary variable cannot be used to keep the count:
    // each task gets its own copy, so the driver-side value never changes
    var sum = 0
    listRDD.foreach(x => {
      accumulator += x
    })
    println(accumulator)
    sc.stop()
  }
}
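
A minimal sketch of the contrast (assuming the `sc` and `listRDD` from above; not part of the original notes): updates to an ordinary variable happen on copies of the closure shipped to the tasks, so they are not reliably reflected on the driver, while accumulator updates are merged back.

// Sketch: plain var vs. accumulator (assumes `sc` and `listRDD` from above)
var plainSum = 0
listRDD.foreach(x => plainSum += x)   // tasks update copies of the closure's variables
println(plainSum)                     // on a cluster this stays 0; not a reliable total

val acc = sc.accumulator(0)           // Spark 1.x accumulator API
listRDD.foreach(x => acc += x)        // task-side updates are merged back to the driver
println(acc.value)                    // 21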

  • SparkSQL
    • DataFrame

Project setup: libraryDependencies += "org.apache.spark" %% "spark-sql" % "1.6.3"

Building a DataFrame by reading data from a file:

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

case class Make(id: Int, makeName: String)

object DataFrameTest {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("Master").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)
    // Read the comma-separated file and map each line to a Make case class
    val data = sc.textFile("E://make.txt").map(_.split(","))
    val dataFrame = sqlContext.createDataFrame(data.map(make => Make(make(0).toInt, make(1))))
    dataFrame.show()
    sc.stop()
  }
}

Specifying the schema with StructType

Registering a DataFrame as a table

A DataFrame can be registered as a table with the registerTempTable method and then queried directly with SQL.

Because a DataFrame created from plain tuples gets default column names of the form _1, _2, _3, we build the rows ourselves and describe each column with a StructType so the table ends up with proper column names (id, makeName). The sketch below shows the default behaviour for comparison.
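
A minimal sketch of the default naming (assuming the same make.txt file and the Spark 1.6 `sqlContext`/`sc` from the examples; not from the original notes):

// Sketch: without an explicit schema, a tuple-based DataFrame gets columns _1, _2, ...
import sqlContext.implicits._
val tupleDF = sc.textFile("E://make.txt")
  .map(_.split(","))
  .map(fields => (fields(0).toInt, fields(1)))
  .toDF()                                      // columns are named _1 and _2
tupleDF.printSchema()
val namedDF = tupleDF.toDF("id", "makeName")   // rename the columns explicitly
namedDF.printSchema()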

package com.qfedu.day13

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}

object DataFrameTest {
  def main(args: Array[String]): Unit = {
    // 0. Initialization
    val sparkConf = new SparkConf().setAppName("SparkSQL").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    // 1. Build the SQLContext
    val sqlContext = new SQLContext(sc)
    val data: RDD[String] = sc.textFile("E://make.txt")
    // 2. Build the Row structure we need
    val dataRDD: RDD[Row] = data.map(line => {
      val id = line.split(",")(0).toInt
      val makeName = line.split(",")(1)
      // Row carries no compile-time type information
      Row(id, makeName)
    })
    val schema = StructType(
      // Describe each column's name and type as a list
      List(StructField("id", IntegerType), StructField("makeName", StringType))
    )
    // Build the DataFrame from the row RDD and the schema
    val dataFrame = sqlContext.createDataFrame(dataRDD, schema)
    // Print the schema metadata
    dataFrame.printSchema()
    // dataFrame.show()
    // Register as a temporary table
    dataFrame.registerTempTable("make")
    // Query the text data with SQL
    sqlContext.sql("select * from make where id < 10").show()
    sc.stop()
  }
}
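
The same query can also be written with the DataFrame API instead of SQL; a small sketch assuming the `dataFrame` built above:

// Sketch: equivalent queries using the DataFrame API (Spark 1.6)
dataFrame.filter(dataFrame("id") < 10).show()
// or with a SQL-style expression string:
dataFrame.filter("id < 10").show()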


  • Reading data from a relational database via JDBC and turning it into an RDD

Project setup: libraryDependencies += "mysql" % "mysql-connector-java" % "5.1.46"

useUnicode: whether to use the Unicode character set; if characterEncoding is set to gb2312 or gbk, this parameter must be set to true.

characterEncoding: specifies the character encoding when useUnicode is set to true, for example gb2312 or gbk (the example below uses utf8).

package com.qfedu.day13

import java.sql.DriverManager

import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{SparkConf, SparkContext}

object JdbcRDDTest {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("JdbcRDD").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    val jdbcUrl = "jdbc:mysql://localhost:3306/w01?useUnicode=true&characterEncoding=utf8"
    val user = "root"
    val password = "mysql"
    val conn = () => {
      Class.forName("com.mysql.jdbc.Driver").newInstance()
      DriverManager.getConnection(jdbcUrl, user, password)
    }
    val sql = "select * from make where id between ? and ?"
    val jdbcRDD: JdbcRDD[(Int, String)] = new JdbcRDD(sc, conn, sql, 0, 300, 3,
      res => {
        val id = res.getInt("id")
        val makeName = res.getString("makeName")
        (id, makeName)
      }
    )
    println(jdbcRDD.collect().toBuffer)
    sc.stop()
  }
}
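
A note on the JdbcRDD parameters (from the Spark API, not spelled out in the original notes): the SQL string must contain exactly two ? placeholders, which Spark fills with the lower and upper bound of each partition's key range; here the id range 0 to 300 is split across 3 partitions, and the final function maps each ResultSet row to an (id, makeName) tuple.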
