Data analysis case study
Count how many times each module is accessed within a given time window or at a given point in time, and take the three most visited URLs of each subject as a reference.
import java.net.URL

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

// TODO Count how many times each module is accessed within a given time window
//      and keep the three most visited URLs per subject as a reference.
// Sample input (timestamp \t url):
// 20161123101523	http://java.learn.com/java/javaee.shtml
// 20161123101523	http://java.learn.com/java/javaee.shtml
// 20161123101523	http://ui.learn.com/ui/video.shtml
// 20161123101523	http://bigdata.learn.com/bigdata/
object Subject {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("Master").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    val data = sc.textFile("G://scala/数据/access.txt")
    // Data cleaning: keep only the URL and pair it with a count of 1
    val urlClickOne: RDD[(String, Int)] = data.map(line => {
      val fields = line.split("\t")
      val url = fields(1)
      (url, 1)
    })
    // Sum the clicks per URL
    val sumUrl: RDD[(String, Int)] = urlClickOne.reduceByKey(_ + _)
    // Extract the subject (the host) from each URL
    val subjectUrlCount: RDD[(String, String, Int)] = sumUrl.map(content => {
      val url = content._1
      val count = content._2
      val subject = new URL(url).getHost
      (subject, url, count)
    })
    // Group by subject and keep the three most visited URLs of each subject
    val result = subjectUrlCount.groupBy(_._1).mapValues(_.toList.sortBy(_._3).reverse.take(3))
    result.foreach(println)
    sc.stop()
  }
}

Run result (one subject shown):
(ui.learn.com,List((ui.learn.com,http://ui.learn.com/ui/video.shtml,37), (ui.learn.com,http://ui.learn.com/ui/course.shtml,26), (ui.learn.com,http://ui.learn.com/ui/teacher.shtml,23)))
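As an aside, the groupBy above pulls every URL of a subject into a single task. A sketch of an equivalent top-3 that avoids this, reusing the sumUrl RDD from the code above, could use aggregateByKey:

// Alternative sketch (assumes the same sumUrl: RDD[(String, Int)] of (url, count)):
// keep only a running top-3 per subject instead of materializing the full list.
val topPerSubject = sumUrl
  .map { case (url, count) => (new URL(url).getHost, (url, count)) }
  .aggregateByKey(List.empty[(String, Int)])(
    (acc, v) => (v :: acc).sortBy(-_._2).take(3), // fold one (url, count) into a partial top-3
    (a, b) => (a ++ b).sortBy(-_._2).take(3)      // merge two partial top-3 lists
  )
topPerSubject.foreach(println)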
Custom partitioner
import java.net.URL

import org.apache.spark.rdd.RDD
import org.apache.spark.{Partitioner, SparkConf, SparkContext}

import scala.collection.mutable

// TODO Partition the data by subject
object Subject02 {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("Master").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    val access: RDD[String] = sc.textFile("E://access.txt", 1)
    // 2. Extract the dimensions we need: (subject, url)
    val data = access.map(line => {
      val fields = line.split("\t")
      val url = fields(1)
      val subject = new URL(url).getHost
      (subject, url)
    })
    // Use the custom partitioner: one partition per distinct subject
    val subjectPartitioner = new SubjectPartitioner(data.keys.distinct.collect())
    // The output path is left empty here; supply a real output directory before running
    data.groupBy(_._1).partitionBy(subjectPartitioner).saveAsTextFile("")
  }
}

class SubjectPartitioner(subjects: Array[String]) extends Partitioner {
  // Number of partitions: one per subject
  override def numPartitions: Int = subjects.length

  // Mapping from key (i.e. the subject) to its partition index
  val map: mutable.Map[String, Int] = mutable.HashMap[String, Int]()
  var partition = 0
  // Assign each subject a partition index, building a one-to-one mapping
  for (subject <- subjects) {
    map.put(subject, partition)
    partition += 1
  }

  // Partitioning rule: look up the partition index for the key, defaulting to 0
  override def getPartition(key: Any): Int = {
    map.getOrElse(key.toString, 0)
  }
}
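To verify how the records were spread, a small sketch (reusing the data and subjectPartitioner values from main above) can tag every group with its partition index via mapPartitionsWithIndex:

// Label every group with the index of the partition it was placed in
data.groupBy(_._1)
  .partitionBy(subjectPartitioner)
  .mapPartitionsWithIndex((index, iter) =>
    iter.map { case (subject, _) => s"partition $index -> $subject" })
  .collect()
  .foreach(println)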
Custom sorting
import org.apache.spark.{SparkConf, SparkContext}

case class Person(name: String, age: Int, height: Int)

object Mysort {
  // Order by ascending age first, then by descending height for the same age
  implicit val personOrder: Ordering[Person] = new Ordering[Person] {
    override def compare(x: Person, y: Person): Int = {
      if (x.age != y.age) {
        x.age - y.age
      } else {
        y.height - x.height
      }
    }
  }
}

object SortTest {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("Master").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    // Bring the implicit Ordering[Person] into scope for sortBy
    import Mysort.personOrder
    val data = sc.parallelize(Array(("tom", 20, 169), ("jerry", 19, 173), ("jack", 21, 175), ("rose", 20, 170)), 1)
    // ascending = false reverses personOrder, giving the run result shown below (oldest first)
    data.sortBy(person => Person(person._1, person._2, person._3), ascending = false).foreach(println)
    sc.stop()
  }
}
// Run result:
// (jack,21,175) (tom,20,169) (rose,20,170) (jerry,19,173)
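An equivalent, more compact ordering can be built with Ordering.by, which derives the comparison from a tuple key (a sketch; negating the height gives the descending tie-break):

// Same ordering as Mysort.personOrder: ascending age, then descending height
implicit val personOrderByKey: Ordering[Person] = Ordering.by((p: Person) => (p.age, -p.height))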
- Sharing a variable across tasks
Because the list is split across partitions, a plain driver-side variable cannot be shared with the tasks; use an accumulator to share and aggregate the value.
package com.qfedu.day13
import org.apache.spark.{SparkConf, SparkContext}
object AccumulatorTest {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("Master").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    val list = List(1, 2, 3, 4, 5, 6)
    val listRDD = sc.parallelize(list, 3)
    // A shared accumulator must be created through the SparkContext
    val accumulator = sc.accumulator(0)
    // An ordinary variable cannot be used to record the sum across tasks
    var sum = 0
    listRDD.foreach(x => {
      accumulator += x
    })
    // Read the accumulated value on the driver
    println(accumulator.value)
    sc.stop()
  }
}
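For contrast, a short sketch of why the plain var above is not enough (continuing the same main, before sc.stop()): the closure captures a copy of sum for each task, so the driver-side variable is not reliably updated, and on a real cluster it never is.

// Hypothetical continuation of main() above: updates made inside foreach land on
// per-task copies of the closure, so they are not guaranteed to reach the driver.
listRDD.foreach(x => sum += x)
println(sum) // not a reliable total; read accumulator.value instead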
- SparkSQL
-
- DataFrame
libraryDependencies += "org.apache.spark" %% "spark-sql" % "1.6.3"

- Building a DataFrame from a file
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

case class Make(id: Int, makeName: String)

object DataFrameTest {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("Master").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)
    // Each line of make.txt is "id,makeName"
    val data = sc.textFile("E://make.txt").map(_.split(","))
    // Build the DataFrame through the Make case class; column names come from its fields
    val dataFrame = sqlContext.createDataFrame(data.map(make => Make(make(0).toInt, make(1))))
    dataFrame.show()
    sc.stop()
  }
}
Specifying the schema with StructType
Registering a DataFrame as a table
A DataFrame can be registered as a table with registerTempTable, after which it can be queried directly with SQL.
Because a DataFrame built straight from tuples gets column names of the form _1, _2, _3, the columns need to be given meaningful names first (here via an explicit StructType schema).
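Before the schema-based example, a quick sketch of that default naming (assuming the same sc, a SQLContext named sqlContext, and the same E://make.txt file): building a DataFrame straight from an RDD of tuples yields columns named _1 and _2, which can then be renamed with toDF.

import sqlContext.implicits._

val tupleDF = sc.textFile("E://make.txt")
  .map(_.split(","))
  .map(fields => (fields(0).toInt, fields(1)))
  .toDF()
tupleDF.printSchema() // columns come out as _1, _2

val namedDF = tupleDF.toDF("id", "makeName") // rename to real column names
namedDF.printSchema()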
package com.qfedu.day13
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
object DataFrameTest {
  def main(args: Array[String]): Unit = {
    // 0. Initialization
    val sparkConf = new SparkConf().setAppName("SparkSQL").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    // 1. Build the SQLContext
    val sqlContext = new SQLContext(sc)
    val data: RDD[String] = sc.textFile("E://make.txt")
    // 2. Build the Row structure we need
    val dataRDD: RDD[Row] = data.map(line => {
      val id = line.split(",")(0).toInt
      val makeName = line.split(",")(1)
      // Row carries no type constraints of its own
      Row(id, makeName)
    })
    val schema = StructType(
      // Describe each column (name and type) as a collection of StructFields
      List(StructField("id", IntegerType), StructField("makeName", StringType))
    )
    // Build the DataFrame from the Row RDD plus the schema
    val dataFrame = sqlContext.createDataFrame(dataRDD, schema)
    // Print the schema (metadata)
    dataFrame.printSchema()
    // dataFrame.show()
    // Register as a temporary table
    dataFrame.registerTempTable("make")
    // Query the text data with SQL
    sqlContext.sql("select * from make where id < 10").show()
    sc.stop()
  }
}
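The same query can also be written with the DataFrame API instead of going through the temporary table, e.g. (using the dataFrame built above):

// Equivalent to the SQL query above, expressed with the DataFrame DSL
dataFrame.filter("id < 10").show()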
- Reading data from a relational database over JDBC and turning it into an RDD
Project setup:
libraryDependencies += "mysql" % "mysql-connector-java" % "5.1.46"
useUnicode: whether to use the Unicode character set; if characterEncoding is set to gb2312 or gbk, this parameter must be set to true.
characterEncoding: the character encoding used when useUnicode is true, for example gb2312 or gbk.
package com.qfedu.day13
import java.sql.DriverManager
import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{SparkConf, SparkContext}
object JdbcRDDTest {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("JdbcRDD").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    val jdbcUrl = "jdbc:mysql://localhost:3306/w01?useUnicode=true&characterEncoding=utf8"
    val user = "root"
    val password = "mysql"
    // Connection factory: invoked on the executors to open a JDBC connection
    val conn = () => {
      Class.forName("com.mysql.jdbc.Driver").newInstance()
      DriverManager.getConnection(jdbcUrl, user, password)
    }
    // The two "?" placeholders are bound to sub-ranges of [0, 300];
    // with 3 partitions, each task queries roughly one third of the id range
    val sql = "select * from make where id between ? and ?"
    val jdbcRDD: JdbcRDD[(Int, String)] = new JdbcRDD(sc, conn, sql, 0, 300, 3,
      res => {
        // Map each ResultSet row to an (id, makeName) tuple
        val id = res.getInt("id")
        val makeName = res.getString("makeName")
        (id, makeName)
      }
    )
    println(jdbcRDD.collect().toBuffer)
    sc.stop()
  }
}
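As a related sketch (not part of the original example), Spark SQL's built-in JDBC data source can read the same table straight into a DataFrame, which avoids hand-writing the row-mapping function. It assumes the same sc, jdbcUrl, user, and password as above.

import org.apache.spark.sql.SQLContext

// Read the make table as a DataFrame through the JDBC data source
val sqlContext = new SQLContext(sc)
val makeDF = sqlContext.read
  .format("jdbc")
  .option("url", jdbcUrl)
  .option("driver", "com.mysql.jdbc.Driver")
  .option("dbtable", "make")
  .option("user", user)
  .option("password", password)
  .load()
makeDF.filter("id < 10").show()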