Spark core 编程案例1

测试数据:

192.168.88.1 - - [30/Jul/2017:12:53:43 +0800] "GET /MyDemoWeb/ HTTP/1.1" 200 259
192.168.88.1 - - [30/Jul/2017:12:53:43 +0800] "GET /MyDemoWeb/head.jsp HTTP/1.1" 200 713
192.168.88.1 - - [30/Jul/2017:12:53:43 +0800] "GET /MyDemoWeb/body.jsp HTTP/1.1" 200 240
192.168.88.1 - - [30/Jul/2017:12:54:37 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:38 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:38 +0800] "GET /MyDemoWeb/java.jsp HTTP/1.1" 200 240
192.168.88.1 - - [30/Jul/2017:12:54:40 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:40 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:41 +0800] "GET /MyDemoWeb/mysql.jsp HTTP/1.1" 200 241
192.168.88.1 - - [30/Jul/2017:12:54:41 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:42 +0800] "GET /MyDemoWeb/web.jsp HTTP/1.1" 200 239
192.168.88.1 - - [30/Jul/2017:12:54:42 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:52 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:52 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:53 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:54 +0800] "GET /MyDemoWeb/mysql.jsp HTTP/1.1" 200 241
192.168.88.1 - - [30/Jul/2017:12:54:54 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:54 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:56 +0800] "GET /MyDemoWeb/web.jsp HTTP/1.1" 200 239
192.168.88.1 - - [30/Jul/2017:12:54:56 +0800] "GET /MyDemoWeb/java.jsp HTTP/1.1" 200 240
192.168.88.1 - - [30/Jul/2017:12:54:57 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:57 +0800] "GET /MyDemoWeb/java.jsp HTTP/1.1" 200 240
192.168.88.1 - - [30/Jul/2017:12:54:58 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:58 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:59 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:59 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:55:00 +0800] "GET /MyDemoWeb/mysql.jsp HTTP/1.1" 200 241
192.168.88.1 - - [30/Jul/2017:12:55:00 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:55:02 +0800] "GET /MyDemoWeb/web.jsp HTTP/1.1" 200 239
192.168.88.1 - - [30/Jul/2017:12:55:02 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242

案例一:分析tomcat的访问日志,求访问量最高的两个网页
        1、对每个jsp的访问量求和
        2、排序
        3、取前两条记录 

package demo

import org.apache.spark.{SparkConf, SparkContext}

/**
 * Case 1: finds the two most requested pages in a Tomcat access log.
 *
 * Every parsable log line is mapped to (pageName, 1), the counts are summed
 * per page, the pages are sorted by count in descending order and the first
 * two entries are printed.
 */
object MapPartitionsDemo{

  /**
   * Runs the job on a local Spark master and prints the top two pages.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf()
    conf.setAppName("MapPartitionsDemo").setMaster("local")
    val sc = new SparkContext(conf)

    try {
      // One (pageName, 1) pair per parsable line. Blank or malformed lines are
      // skipped; previously they made substring throw and failed the whole job.
      val rdd1 = sc.textFile("G:\\大数据第七期vip课程\\正式课\\0611-第三十二章节\\localhost_access_log.2017-07-30.txt")
        .flatMap(line => extractJspName(line).map(name => (name, 1)))

      // Sum the hits per page name.
      val rdd2 = rdd1.reduceByKey(_ + _)
      // Order by hit count, highest first.
      val rdd3 = rdd2.sortBy(_._2, ascending = false)

      // Print the two most visited pages. For the sample log the author saw
      // ArrayBuffer((oracle.jsp,9), (hadoop.jsp,9)); the relative order of
      // pages with equal counts is presumably not guaranteed.
      println(rdd3.take(2).toBuffer)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }

  /**
   * Extracts the page name from one access-log line such as
   * `192.168.88.1 - - [30/Jul/2017:12:53:43 +0800] "GET /MyDemoWeb/head.jsp HTTP/1.1" 200 713`.
   *
   * The request is the text between the first and the last double quote, the
   * path is the text between the first and the last space of the request, and
   * the page name is whatever follows the last '/' of the path. A request for
   * a directory (e.g. `/MyDemoWeb/`) therefore yields an empty name.
   *
   * @param line one raw line of the access log
   * @return the page name, or None when the line has no quoted request of the
   *         form `METHOD path PROTOCOL`
   */
  private def extractJspName(line: String): Option[String] = {
    val firstQuote = line.indexOf('"')
    val lastQuote = line.lastIndexOf('"')
    if (firstQuote < 0 || lastQuote <= firstQuote) None
    else {
      // e.g. GET /MyDemoWeb/head.jsp HTTP/1.1
      val request = line.substring(firstQuote + 1, lastQuote)
      val firstSpace = request.indexOf(' ')
      val lastSpace = request.lastIndexOf(' ')
      if (firstSpace < 0 || lastSpace <= firstSpace) None
      else {
        // e.g. /MyDemoWeb/head.jsp
        val path = request.substring(firstSpace + 1, lastSpace)
        Some(path.substring(path.lastIndexOf('/') + 1))
      }
    }
  }
}

案例二:分析tomcat的访问日志,根据网页的名字进行分区(类似MapReduce中的自定义分区)
        结果: 网页的名字    访问日志
               oracle.jsp    192.168.88.1 - - [30/Jul/2017:12:54:37 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
               oracle.jsp    192.168.88.1 - - [30/Jul/2017:12:54:53 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242

package demo

import java.util

import org.apache.spark.{Partitioner, SparkConf, SparkContext}

/**
 * Case 2: splits a Tomcat access log into one output partition per page name
 * (comparable to a custom partitioner in MapReduce).
 *
 * Each parsable log line becomes a (pageName, line) pair; the pairs are
 * repartitioned with [[MyPartitions]] and written out as text files.
 */
object MyTomcatPartitioerDemo {

  /**
   * Runs the job on a local Spark master and writes the partitioned log to disk.
   *
   * @param args command-line arguments (unused)
   */
  def main(args:Array[String]):Unit={
    // The article notes that this property is needed on Windows so that Hadoop can write output files.
    System.setProperty("hadoop.home.dir","G:\\hadoop\\hadoop-2.4.1\\hadoop-2.4.1")
    val conf = new SparkConf()
    conf.setAppName("MapPartitionsWithIndexDemo").setMaster("local")
    val sc = new SparkContext(conf)

    try {
      // One (pageName, originalLine) pair per parsable line. Blank or malformed
      // lines are skipped; previously they made substring throw and failed the job.
      val rdd1 = sc.textFile("G:\\localhost_access_log.2017-07-30.txt")
        .flatMap(line => extractJspName(line).map(name => (name, line)))

      // Collect every distinct page name to the driver; each one gets its own partition.
      val jspNames = rdd1.map(_._1).distinct().collect()
      // Build the partitioning rule from those names.
      val mypartitioner = new MyPartitions(jspNames)

      // partitionBy is usable here because rdd1 holds key/value pairs
      // (key = page name, value = log line).
      val result = rdd1.partitionBy(mypartitioner)

      // Write the partitions out. The article notes that the target directory
      // must not already exist.
      result.saveAsTextFile("G:\\partitions")
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }

  /**
   * Extracts the page name from one access-log line such as
   * `192.168.88.1 - - [30/Jul/2017:12:53:43 +0800] "GET /MyDemoWeb/head.jsp HTTP/1.1" 200 713`.
   *
   * The request is the text between the first and the last double quote, the
   * path is the text between the first and the last space of the request, and
   * the page name is whatever follows the last '/' of the path. A request for
   * a directory (e.g. `/MyDemoWeb/`) therefore yields an empty name.
   *
   * @param line one raw line of the access log
   * @return the page name, or None when the line has no quoted request of the
   *         form `METHOD path PROTOCOL`
   */
  private def extractJspName(line: String): Option[String] = {
    val firstQuote = line.indexOf('"')
    val lastQuote = line.lastIndexOf('"')
    if (firstQuote < 0 || lastQuote <= firstQuote) None
    else {
      // e.g. GET /MyDemoWeb/head.jsp HTTP/1.1
      val request = line.substring(firstQuote + 1, lastQuote)
      val firstSpace = request.indexOf(' ')
      val lastSpace = request.lastIndexOf(' ')
      if (firstSpace < 0 || lastSpace <= firstSpace) None
      else {
        // e.g. /MyDemoWeb/head.jsp
        val path = request.substring(firstSpace + 1, lastSpace)
        Some(path.substring(path.lastIndexOf('/') + 1))
      }
    }
  }
}

/**
 * Partitioner that gives every distinct page name its own partition.
 *
 * Partition ids are assigned in order of first appearance in `allJspName`,
 * starting at 0. Keys that are not in the table (and null keys) go to
 * partition 0.
 *
 * @param allJspName the page names to create partitions for; duplicates are
 *                   ignored so that ids stay dense in `0 until numPartitions`
 */
class MyPartitions(allJspName:Array[String]) extends Partitioner{

  // Lookup table: page name -> partition id.
  val partitionMap=  new util.HashMap[String,Int]()
  // Next free partition id; after construction it equals the number of distinct names.
  var partID = 0
  // Skip names that were already registered. Without this guard a duplicate
  // name advanced partID while the map size stayed the same, producing
  // partition ids >= numPartitions.
  for(name <- allJspName if !partitionMap.containsKey(name)){
    partitionMap.put(name,partID)
    partID += 1
  }

  /**
   * Number of partitions: one per distinct name, but never fewer than one,
   * because getPartition falls back to partition 0 for unknown keys.
   */
  override def numPartitions: Int = math.max(partitionMap.size(), 1)

  /**
   * Looks up the partition id for a key (a page name).
   *
   * @param key the record key; its string form is looked up in the table
   * @return the id registered for the key, or 0 for null or unknown keys
   */
  override def getPartition(key: Any): Int = key match {
    case null => 0
    case other => partitionMap.getOrDefault(other.toString,0)
  }

  /**
   * Two instances are equal when they map the same names to the same ids.
   * Presumably this lets Spark recognise an identical partitioning and avoid
   * a redundant shuffle — see the Partitioner documentation.
   */
  override def equals(other: Any): Boolean = other match {
    case that: MyPartitions => partitionMap == that.partitionMap
    case _ => false
  }

  /** Hash code consistent with equals: derived from the lookup table. */
  override def hashCode(): Int = partitionMap.hashCode()
}

输出结果: 

Spark core 编程案例1_第1张图片

注意:

1.输出目录(即 result.saveAsTextFile("G:\\partitions") 中指定的 G:\partitions)在运行前不能已经存在,否则会抛出 already exists 异常:

Exception in thread "main" org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory file:/G:/partitions already exists
    at org.apache.hadoop.mapred.FileOutputFormat.checkOutputSpecs(FileOutputFormat.java:131)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1191)

2.Windows环境需要设置  System.setProperty("hadoop.home.dir","G:\\hadoop\\hadoop-2.4.1\\hadoop-2.4.1") 

Spark core 编程案例1_第2张图片

否则可能提示:

Caused by: java.io.IOException: (null) entry in command string: null chmod 0644 G:\partitions1\_temporary\0\_temporary\attempt_20181023002946_0003_m_000000_3\part-00000
    at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:770)

案例三:把上面分析的结果保存到Oracle中(知识点:在哪里建立Connection?对于不可序列化的对象,如何处理?)

    1、create table result(jspname varchar2(20), count number);
    2、开发JDBC程序(Scala),包含Oracle的驱动 
    3、操作RDD的时候,尽量针对分区,避免序列化问题

package demo

import java.sql.{Connection, DriverManager, PreparedStatement}

import org.apache.spark.{SparkConf, SparkContext}

/**
 * Case 3: stores (pageName, count) pairs derived from a Tomcat access log in
 * the Oracle table `result`.
 *
 * The JDBC connection is not serializable, so it is created once per
 * partition inside `foreachPartition` rather than on the driver.
 */
object MyOracleDemo {

  /**
   * Runs the job on a local Spark master.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // The article notes that this property is needed on Windows so that Hadoop can write output files.
    System.setProperty("hadoop.home.dir","G:\\hadoop\\hadoop-2.4.1\\hadoop-2.4.1")
    val conf = new SparkConf()
    conf.setAppName("MapPartitionsWithIndexDemo").setMaster("local")
    val sc = new SparkContext(conf)

    try {
      // One (pageName, 1) pair per parsable line. Blank or malformed lines are
      // skipped; previously they made substring throw and failed the whole job.
      // NOTE(review): every log line is stored as its own (name, 1) row. If the
      // table is meant to hold one total per page, aggregate with reduceByKey
      // before saving — confirm the intent.
      val rdd1 = sc.textFile("G:\\localhost_access_log.2017-07-30.txt")
        .flatMap(line => extractJspName(line).map(name => (name, 1)))

      // Open one connection per partition and write that partition's records.
      rdd1.foreachPartition(saveToOracle)
    } finally {
      // The context was never stopped before; release it even when the job fails.
      sc.stop()
    }
  }

  /**
   * Inserts every (pageName, count) pair of one partition into the `result` table.
   *
   * Failures are printed and not rethrown, so a failed insert does not fail
   * the Spark task.
   *
   * @param iter the records of a single partition
   */
  def saveToOracle(iter:Iterator[(String,Int)]): Unit = {
    // Nothing to write: do not open a database connection for an empty partition.
    if (iter.hasNext) {
      var con:Connection = null
      var pst:PreparedStatement = null

      try{
        // Loading the class is enough to register the driver; the deprecated
        // newInstance() call is not needed.
        Class.forName("oracle.jdbc.driver.OracleDriver")
        con = DriverManager.getConnection("jdbc:oracle:thin:@192.168.163.134:1521:orcl","scott","tiger")
        pst = con.prepareStatement("insert into result values (?,?)")

        iter.foreach { case (jspName, count) =>
          pst.setString(1,jspName)
          pst.setInt(2,count)
          pst.executeUpdate()
        }

      }catch{
        case e1:Exception => e1.printStackTrace()
      }finally{
        // Close both resources independently so that a failure while closing
        // the statement cannot leak the connection.
        try{
          if(pst != null) pst.close()
        }finally{
          if(con != null) con.close()
        }
      }
    }
  }

  /**
   * Extracts the page name from one access-log line such as
   * `192.168.88.1 - - [30/Jul/2017:12:53:43 +0800] "GET /MyDemoWeb/head.jsp HTTP/1.1" 200 713`.
   *
   * The request is the text between the first and the last double quote, the
   * path is the text between the first and the last space of the request, and
   * the page name is whatever follows the last '/' of the path. A request for
   * a directory (e.g. `/MyDemoWeb/`) therefore yields an empty name.
   *
   * @param line one raw line of the access log
   * @return the page name, or None when the line has no quoted request of the
   *         form `METHOD path PROTOCOL`
   */
  private def extractJspName(line: String): Option[String] = {
    val firstQuote = line.indexOf('"')
    val lastQuote = line.lastIndexOf('"')
    if (firstQuote < 0 || lastQuote <= firstQuote) None
    else {
      // e.g. GET /MyDemoWeb/head.jsp HTTP/1.1
      val request = line.substring(firstQuote + 1, lastQuote)
      val firstSpace = request.indexOf(' ')
      val lastSpace = request.lastIndexOf(' ')
      if (firstSpace < 0 || lastSpace <= firstSpace) None
      else {
        // e.g. /MyDemoWeb/head.jsp
        val path = request.substring(firstSpace + 1, lastSpace)
        Some(path.substring(path.lastIndexOf('/') + 1))
      }
    }
  }
}

Spark core 编程案例1_第3张图片

package demo

import java.sql.DriverManager
import  org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Case 4: reads rows from the Oracle `emp` table through a JdbcRDD and prints
 * (name, salary) pairs.
 */
object MyJDBCRDDDemo {

  /**
   * Connection factory handed to JdbcRDD; it opens a new Oracle connection
   * each time it is invoked.
   */
  val connection: () => java.sql.Connection = () =>{
    // Loading the class is enough to register the driver; the deprecated
    // newInstance() call is not needed.
    Class.forName("oracle.jdbc.driver.OracleDriver")
    DriverManager.getConnection("jdbc:oracle:thin:@192.168.163.134:1521:orcl","scott","tiger")

  }

  /**
   * Runs the query on a local Spark master and prints the result.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkconf = new SparkConf()
    sparkconf.setAppName("MyJDBCRDDDemo").setMaster("local")
    val sc = new SparkContext(sparkconf)

    try {
      // Names and salaries of the employees in department 10 who earn more than 2000.
      // NOTE(review): JdbcRDD binds its lower and upper partition bounds to the
      // two '?' placeholders. Here they are repurposed as query parameters
      // (2000 -> sal, 10 -> deptno), which presumably only works because
      // numPartitions is 1 and the bounds reach the query unchanged. Do not
      // raise numPartitions without rewriting the query — see the JdbcRDD docs.
      val oracleRDD = new JdbcRDD(sc,connection,"select * from emp where sal > ? and deptno=?",2000,10,1,r=>{
        // Columns are read by position: 2 is used as the name, 6 as the salary.
        val ename=r.getString(2)
        val sal = r.getInt(6)
        (ename,sal)

      })

      val result = oracleRDD.collect()
      println(result.toBuffer)
    } finally {
      // Release the context even when the query fails.
      sc.stop()
    }
  }
}

 

你可能感兴趣的:(Spark)