Spark SQL reads data from Hive, processes it with SQL, and writes the result into a Hive partitioned table.

pom.xml:



<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>Hive2Hive</groupId>
    <artifactId>Hive2Hive</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <scala.version>2.11.8</scala.version>
        <spark.version>2.3.0</spark.version>
        <hadoop.version>2.7.5</hadoop.version>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.6</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.2.6</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.2.6</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-common</artifactId>
            <version>1.2.6</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-protocol</artifactId>
            <version>1.2.6</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>2.3.0</version>
        </dependency>
    </dependencies>

    <build>
        <pluginManagement>
            <plugins>
                <plugin>
                    <groupId>net.alchim31.maven</groupId>
                    <artifactId>scala-maven-plugin</artifactId>
                    <version>3.2.2</version>
                </plugin>
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.5.1</version>
                </plugin>
            </plugins>
        </pluginManagement>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <phase>process-resources</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>scala-test-compile</id>
                        <phase>process-test-resources</phase>
                        <goals>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <executions>
                    <execution>
                        <phase>compile</phase>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

Helper class that builds the ALTER TABLE ... ADD PARTITION statement:

package UDF.buildaddpartitionsql;

import java.util.ArrayList;
import java.util.List;

class SparkSqlExecute {
    // Builds: ALTER TABLE <table> ADD PARTITION (field1 = 'value1', field2 = 'value2', ...)
    public static String buildAddPartitionSql(String table, List<String> partitionFields,
                                               List<String> partitionValues) {
        String sql = "ALTER TABLE ";
        sql += table + " ADD PARTITION (";
        for (int i = 0; i < partitionFields.size(); i++) {
            sql += partitionFields.get(i) + " = ";
            sql += "'" + partitionValues.get(i) + "'";
            if (i == partitionFields.size() - 1) {
                sql += ")";
            } else {
                sql += ",";
            }
        }
        return sql;
    }
}

public class buildAddPartitionSql {

    public static String wrapBuildAddPartitionSql(String table, List<String> partitionFields,
                                                  List<String> partitionValues) {
        return SparkSqlExecute.buildAddPartitionSql(table, partitionFields, partitionValues);
    }

    public static void main(String[] args) {
        String table = "tableName";
        List<String> listFields = new ArrayList<>();
        List<String> listValues = new ArrayList<>();
        listFields.add("dd");
        listValues.add("2019-05-34");

        String s = wrapBuildAddPartitionSql(table, listFields, listValues);
        System.out.println(s);
    }
}
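With the sample arguments in `main`, the helper prints `ALTER TABLE tableName ADD PARTITION (dd = '2019-05-34')`. In the job below, the same string (with the real table name and date) is passed to `hiveContext.sql` so the partition exists before data is written into its directory.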

Two UDF functions:

(1) Convert an IP stored as a decimal integer into dotted-quad notation

package UDF.intToip


object Int2IP {
  // Converts an integer-encoded IP back to dotted-quad form.
  // The least-significant byte is emitted first, matching the encoding used in the source table.
  def int2IP(ipInt: Long): String = {
    val sb: StringBuilder = new StringBuilder
    sb.append(ipInt & 0xFF).append(".")
    sb.append((ipInt >> 8) & 0xFF).append(".")
    sb.append((ipInt >> 16) & 0xFF).append(".")
    sb.append((ipInt >> 24) & 0xFF)
    sb.toString
  }
}
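A quick sanity check of the conversion, using a value that appears in the output data at the end of this post (the names below are only for this illustration):

```scala
object Int2IPExample {
  def main(args: Array[String]): Unit = {
    // 3067074597 == 0xB6CFD825; the lowest byte comes first in the dotted form.
    println(UDF.intToip.Int2IP.int2IP(3067074597L)) // prints 37.216.207.182
  }
}
```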

(2) Convert the extracted hour values into a binary activity string

package UDF.timeToerjinzhi;

public class TimeToint {

    /**
     * Reset every slot of the int array to 0.
     */
    public static void initArray(int[] array) {
        if (null == array)
            return;
        for (int i = 0; i < array.length; i++) {
            array[i] = 0;
        }
    }

    /**
     * Convert a comma-separated string of hours into an int array.
     */
    public static int[] stringToArray(String s) {
        String[] split = s.split(",");
        if (split == null) {
            return null;
        }
        int[] arr = new int[split.length];
        for (int i = 0; i < split.length; i++) {
            arr[i] = Integer.parseInt(split[i]);
        }
        return arr;
    }


    /**
     * Mark each hour that occurs in the source array with a 1 in the standard array.
     */
    public static void arrayIsExist(int[] stdArray, int[] source) {
        if (null == stdArray || null == source) {
            return;
        }
        for (int i = 0; i < source.length; i++) {
            int time = source[i];
            stdArray[time] = 1;
        }
    }

    /**
     * Render the array as a bit string; only the first 24 slots (hours 0-23) are kept.
     */
    public static String printTimeArray(int[] stdArray) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < stdArray.length; i++) {
            sb.append(stdArray[i] == 1 ? "1" : "0");
            //sb.append("" + i + "-" + (stdArray[i] == 1 ? "1" : "0") + ",");
        }
        return sb.substring(0, sb.length() - 1);
    }

    public static String warpMethod(String s) {
        int[] timeArray = new int[25];
        initArray(timeArray);
        int[] source = stringToArray(s);
        arrayIsExist(timeArray, source);
        return printTimeArray(timeArray);
    }
}
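A minimal check of the hour-to-bitmap conversion (the example object is only for illustration): hours 14, 15 and 16 become 1-bits in a 24-character string, the format seen in the ACTIVITY column of the output data.

```scala
object TimeToIntExample {
  def main(args: Array[String]): Unit = {
    // Positions 14, 15 and 16 are marked; the remaining 21 positions stay 0.
    println(UDF.timeToerjinzhi.TimeToint.warpMethod("14,15,16"))
    // prints 000000000000001110000000
  }
}
```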

Main program (adds the partition, registers the UDFs, runs the aggregation SQL, and writes the result as a tab-delimited file into the partition's HDFS directory):

package HiveToHDFS

import java.util
import UDF.buildaddpartitionsql.buildAddPartitionSql
import UDF.intToip.Int2IP
import UDF.timeToerjinzhi.TimeToint
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.launcher.SparkLauncher
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.hive.HiveContext

object Hive2HDFS2 {
  // For one partition date: register the partition, register the UDFs,
  // run the aggregation query and write the result into the partition directory.
  def bathSql(time: String, hiveContext: HiveContext): Unit = {

    val path = "hdfs://hadoop01:9000/user/hive/warehouse/wa_zfd_tytt_hive.db/t_event_monitor_1001_trojan_patition/DD="
    val tablename = "wa_zfd_tytt_hive.t_event_monitor_1001_trojan_patition"
    val tablename = "wa_zfd_tytt_hive.t_event_monitor_1001_trojan_patition"
    val listFields = new util.ArrayList[String]
    val listValues = new util.ArrayList[String]
    listFields.add("dd")
    listValues.add(time)

    val sql_partiton = buildAddPartitionSql.wrapBuildAddPartitionSql(tablename, listFields, listValues)

    hiveContext.sql(sql_partiton)
    hiveContext.udf.register("String2Ip_c_src_ipv4",(c_src_ipv4:Long)=>{
      Int2IP.int2IP(c_src_ipv4)
    })

    hiveContext.udf.register("String2c_dest_ipv4",(c_dest_ipv4:Long)=>{
      Int2IP.int2IP(c_dest_ipv4)
    })

    hiveContext.udf.register("timeToFormal",(ACTIVITY: String)=>{
      TimeToint.warpMethod(ACTIVITY)
    })

    // NOTE: the query below filters on the hardcoded date '2017-07-26' (and keeps `limit 100`),
    // while the partition value and output path come from the `time` argument; align these when running other days.
    val sql =
      """
        |select
        |w.c_event_id as EVENT_ID,w.c_src_ipv4 as SRCIP ,String2Ip_c_src_ipv4(w.c_src_ipv4) as SRCIP_INT, w.c_dest_ipv4 as DSTIP, String2c_dest_ipv4(w.c_dest_ipv4) as DSTIP_INT, w.c_dest_port as PRIMARY_DSTPORT, w.cnt_c_dest_port as PRIMARY_DSTPORT_CNT, w.CC_FLAG as CC_FLAG,w.CNT as CNT,q.ACTIVITY as ACTIVITY
        |from
        |(select
        |i.c_event_id as c_event_id,i.c_src_ipv4 as c_src_ipv4 ,i.c_dest_ipv4 as c_dest_ipv4, i.c_dest_port as c_dest_port, i.cnt_c_dest_port as cnt_c_dest_port, i.CC_FLAG as CC_FLAG,h.CNT as CNT,i.c_time_part as c_time_part
        |from
        |(select g.c_event_id as c_event_id , g.c_src_ipv4 as c_src_ipv4, g.c_dest_ipv4 as c_dest_ipv4, g.c_dest_port as c_dest_port,  g.cnt_c_dest_port as cnt_c_dest_port, 1 as CC_FLAG ,g.c_time_part as  c_time_part
        |from
        |(select
        |split(e.event_src_dst, '-')[0] as c_event_id,
        |split(e.event_src_dst, '-')[1] as c_src_ipv4,
        |split(e.event_src_dst, '-')[2] as c_dest_ipv4,
        |split(e.event_src_dst, '-')[3] as c_dest_port,
        |cnt_c_dest_port,c_time_part
        |from
        |(select
        |d.event_src_dst as event_src_dst, d.cnt_c_dest_port as cnt_c_dest_port, c_time_part
        |from
        |(select
        |c.event_src_dst as event_src_dst, c.cnt_c_dest_port as cnt_c_dest_port, row_number() over(distribute by c.event_src_dst sort by c.cnt_c_dest_port desc) as rank ,c_time_part
        |from
        |(select
        |concat_ws('-', cast(b.c_event_id as string) , cast(b.c_src_ipv4 as string), cast(b.c_dest_ipv4 as string), cast(b.c_dest_port as string) ) as event_src_dst,  b.cnt_c_dest_port as cnt_c_dest_port ,c_time_part
        |from
        |(select
        |c_event_id, c_src_ipv4, c_dest_ipv4, c_dest_port, count(c_dest_port) as cnt_c_dest_port, c_time_part
        |from
        |wa_zfd_tytt_hive.t_event_monitor_1001_trojan
        |where c_time_part='2017-07-26'
        |group by c_event_id,  c_src_ipv4, c_dest_ipv4, c_dest_port, c_time_part) as  b
        |where c_time_part='2017-07-26'
        |) as c
        |where c_time_part='2017-07-26'
        |)  as d
        |where rank = 1 ) as e where c_time_part='2017-07-26') g
        |where c_time_part='2017-07-26'
        |)as i
        |join
        |(select
        |f.c_event_id as c_event_id , f.c_src_ipv4 as c_src_ipv4, f.c_dest_ipv4 as c_dest_ipv4, sum(f.cnt_c_dest_port ) as CNT
        |from
        |(select c_event_id, c_src_ipv4, c_dest_ipv4, c_dest_port, count(c_dest_port) as cnt_c_dest_port ,c_time_part
        |from wa_zfd_tytt_hive.t_event_monitor_1001_trojan
        |where c_time_part='2017-07-26'
        |group by c_event_id,  c_src_ipv4, c_dest_ipv4, c_dest_port,c_time_part) as f
        |where c_time_part='2017-07-26'
        |group by f.c_event_id , f.c_src_ipv4, f.c_dest_ipv4) as h on i.c_event_id=h.c_event_id and i.c_src_ipv4=h.c_src_ipv4 and i.c_dest_ipv4=h.c_dest_ipv4) as w
        |join
        |(select c_event_id,c_src_ipv4, c_dest_ipv4,c_time_part, timeToFormal(concat_ws(",",(collect_set(substring(c_time, -8, 2))))) as ACTIVITY
        |from wa_zfd_tytt_hive.t_event_monitor_1001_trojan
        |where c_time_part='2017-07-26'
        |group by c_event_id,  c_src_ipv4, c_dest_ipv4,c_time_part) as q
        |on
        |q.c_event_id=w.c_event_id and q.c_src_ipv4=w.c_src_ipv4 and q.c_dest_ipv4=w.c_dest_ipv4
        |where w.c_time_part='2017-07-26'
        |limit 100
      """.stripMargin
    val frame = hiveContext.sql(sql)

    // Write a single tab-delimited file directly into the partition directory
    // registered by the ALTER TABLE ... ADD PARTITION statement above.
    frame.repartition(1).write.option("delimiter", "\t").option("compression", "none").mode(SaveMode.Overwrite)
      .csv(path + s"${time}")

  }

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("UDF").setMaster("local")
    // The SparkLauncher below only carries the Kerberos principal/keytab settings;
    // it is never used to launch a separate application in this job.
    var launcher = new SparkLauncher()
    //    System.setProperty("java.security.krb5.conf", "/etc/dataload/camus/hdfs.keytab")
    //    logger.info("krb5 config set java.security.krb5.conf=" + System.getProperty("java.security.krb5.conf"));
    launcher = launcher.setConf("spark.yarn.principal", "hdfs").setConf("spark.yarn.keytab", "/etc/dataload/camus/hdfs.keytab")
    val sc = new SparkContext(conf)
    val hiveContext = new HiveContext(sc)

    // Each command-line argument is treated as one partition date (e.g. 2017-07-26).
    for (arg <- args) {
      bathSql(arg, hiveContext)
      println(arg)
    }

    sc.stop()
  }
}
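Since `HiveContext` is deprecated in Spark 2.x, here is a minimal sketch of the equivalent `SparkSession`-based setup; this is an illustration rather than part of the original job, and it assumes spark-hive is on the classpath and the UDF helpers above are available:

```scala
import org.apache.spark.sql.SparkSession

object Hive2HDFSSessionSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("UDF")
      .master("local")
      .enableHiveSupport()   // replaces new HiveContext(sc)
      .getOrCreate()

    // UDF registration and SQL execution work the same way as with HiveContext.
    spark.udf.register("String2Ip_c_src_ipv4", (ip: Long) => UDF.intToip.Int2IP.int2IP(ip))
    // spark.sql("...") and the DataFrame csv write are unchanged.

    spark.stop()
  }
}
```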

Raw input data (tab-separated, truncated):

1       175374339       26510   0       2017-07-26 21:56:45     f5bb2837b1944a249ee5587eb004515f        3414226114      ::      51404   0       0       3736770098      ::      8899    0   targetHost=123.187.31.64;payload=\01\00\00\00S\00\00\00\00\F4\01\00\002\00\00\00\E8\03\00\00_\17\00\00\00\00\00\00\01\00\00\00\01\00\00\00 \02\00\00\00\00\00\00\01\01\00\00\00 \00\00\00\02\00\00\00\04\00\00\03\00\00\00<\00\00\00\00\00\01\00\00\00123.187.31.64\00\C8\12  1       84      0       0       0       nan     2       320000  0       0       2       nan     0       NULLNULL     NULL    NULL    NULL    NULL    NULL    NULL    2017-07-26
1       175374357       26503   0       2017-07-26 21:56:45     2bcfbd57f5c9239c05cdc4580df7a982        31609973        ::      7862    0       0       2077124337      ::      29135   0   payload=Linux3.2.0-56-generic\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00B2.2.1bd\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\008 * 3392\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0031886MB\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\001000MB/s\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\000\00586\00\00\00\00\00\00\00\00\00\00\00100\00\00\00\00\00\00\00\00\00\00\00\00\00MK64\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00idle\00SYN) Target:180.169.7.42:80\00Status:Attacking......\00..\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00      1       82      0       0       0       nan     2       440000  0       0       0       \CC\DA\D1\B6\D4\C6\BC\C6\CB\E3\A3\A8\B1\B1\BE\A9\A3\A9\D3\D0\CF\DE\D4\F0\C8\CE\B9\AB\CB\BE  NULL     NULL    NULL    NULL    NULL    NULL    NULL    NULL    2017-07-26
1       175390795       26503   0       2017-07-26 21:56:44     17876ac106aebd5b4c5e07867b900296        1937650768      ::      53930   0       0       3736746632      ::      29135   0   payload=Linux2.6.32-358.el6.x86_64\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00B2.2.1bd\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\008 * 1995\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\005842MB\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\001000MB/s\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\000\00\00\00\00\00\00\00\00\00\00\00\00\00\00\001\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00MK64\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00idle\00SYN) Target:119.29.124.43:80 Status:Attacking......\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00    2       810000  0       0       0       nan     2       320000  0       0       2       nan     0       NULL    NULL    NULL    NULL    NULL    NULL    NULL    NULL    2017-07-26
30730   175636631       26503   0       2017-07-26 21:56:36     8B211D6FADEC3169B1D47951D2D496C8        2892523789      ::      58008   0       0       463163526       ::      29135   0   payload=Linux4.9.15-x86_64-linode81\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00B2.2.1bd\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\001 * 2299\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00988MB\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\001000MB/s\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\000\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00100\00\00\00\00\00\00\00\00\00\00\00\00\00MK64_SecurtDoor\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00dle\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00      1       1206    0       0       150     nan     2       35      0       0       2       nan     0   NULL     NULL    NULL    NULL    NULL    NULL    NULL    NULL    2017-07-26

Output data (columns follow the query's select list: EVENT_ID, SRCIP, SRCIP_INT, DSTIP, DSTIP_INT, PRIMARY_DSTPORT, PRIMARY_DSTPORT_CNT, CC_FLAG, CNT, ACTIVITY, plus the DD partition value):

25579   3067074597      37.216.207.182  3428803437      109.99.95.204   80      2       1       2       000000000000001000000000        2018-2-21
26503   2375119092      244.112.145.141 3079018291      51.23.134.183   29135   99      1       99      000000000000001111110000        2018-2-21
26510   1863042180      132.196.11.111  2734383054      206.95.251.162  79      1       1       1       000000000000000000100000        2018-2-21
26510   2008578983      167.123.184.119 1754647427      131.203.149.104 25000   1       1       1       000000000000000000100000        2018-2-21
26510   2026304360      104.243.198.120 2734383054      206.95.251.162  79      1       1       1       000000000000000000000100        2018-2-21
28747   1033949069      141.207.160.61  3322956083      51.73.16.198    3308    1       1       1       000000000000001000000000        2018-2-21
28747   1878256575      191.235.243.111 2102249246      30.199.77.125   7878    2       1       2       000000000000010000000000        2018-2-21
28747   3027529922      194.112.116.180 3322956083      51.73.16.198    3308    1       1       1       000000000000000000100000        2018-2-21
28747   707828200       232.153.48.42   3322956083      51.73.16.198    3308    1       1       1       000000000000001000000000        2018-2-21
29376   1358364226      66.254.246.80   3736764903      231.129.186.222 8888    4       1       4       000000000000001010100000        2018-2-21
29376   2970862811      219.196.19.177  3736764903      231.129.186.222 8888    2       1       2       000000000000010000000000        2018-2-21
33177   3707939957      117.172.2.221   401214785       65.13.234.23    8877    1       1       1       000000000000000010000000        2018-2-21
6505    3736460950      150.222.181.222 2997016939      107.217.162.178 80      1       1       1       000000000000001000000000        2018-2-21
21690   466554740       116.15.207.27   3322956083      51.73.16.198    3308    1       1       1       000000000000001000000000        2018-2-21
21690   987164138       234.237.214.58  3322956083      51.73.16.198    3308    1       1       1       000000000000000000100000        2018-2-21
22110   2016887277      237.65.55.120   3404670650      186.38.239.202  8000    1       1       1       000000000000001000000000        2018-2-21
22110   2054304230      230.49.114.122  3404670650      186.38.239.202  8000    1       1       1       000000000000010000000000        2018-2-21
22110   2054342199      55.198.114.122  3404670650      186.38.239.202  8000    1       1       1       000000000000010000000000        2018-2-21
22110   2071091192      248.87.114.123  794371949       109.39.89.47    1912    1       1       1       000000000000001000000000        2018-2-21
25579   30120285        93.153.203.1    3428803437      109.99.95.204   80      1       1       1       000000000000001000000000        2018-2-21
26510   2105872829      189.17.133.125  1944574493      29.218.231.115  25111   2       1       2       000000000000001000000000        2018-2-21
26510   980791341       45.176.117.58   1754647427      131.203.149.104 25000   3       1       3       000000000000011000010000        2018-2-21
28747   2071826334      158.143.125.123 846551373       77.89.117.50    8089    1       1       1       000000000000001000000000        2018-2-21
28747   2105512296      104.145.127.125 3322956083      51.73.16.198    3308    1       1       1       000000000000000100000000        2018-2-21
28747   466768629       245.82.210.27   3322956083      51.73.16.198    3308    2       1       2       000000000000001001000000        2018-2-21
28747   658934127       111.137.70.39   3322956083      51.73.16.198    3308    2       1       2       000000000000000000100100        2018-2-21

 
