pom.xml:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>Hive2Hive</groupId>
    <artifactId>Hive2Hive</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <scala.version>2.11.8</scala.version>
        <spark.version>2.3.0</spark.version>
        <hadoop.version>2.7.5</hadoop.version>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.2.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.2.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-common</artifactId>
            <version>1.2.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-protocol</artifactId>
            <version>1.2.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>2.3.0</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <phase>process-resources</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>scala-test-compile</id>
                        <phase>process-test-resources</phase>
                        <goals>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.5.1</version>
                <executions>
                    <execution>
                        <phase>compile</phase>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
Helper class for adding a partition (builds the ALTER TABLE ... ADD PARTITION statement):
package UDF.buildaddpartitionsql;

import java.util.ArrayList;
import java.util.List;

class SparkSqlExecute {
    public static String buildAddPartitionSql(String table, List<String> partitionFields, List<String> partitionValues) {
        String sql = "ALTER TABLE ";
        sql += table + " ADD PARTITION (";
        for (int i = 0; i < partitionFields.size(); i++) {
            sql += partitionFields.get(i) + " = ";
            sql += "'" + partitionValues.get(i) + "'";
            if (i == partitionFields.size() - 1) {
                sql += ")";
            } else {
                sql += ",";
            }
        }
        return sql;
    }
}

public class buildAddPartitionSql {
    public static String wrapBuildAddPartitionSql(String table, List<String> partitionFields, List<String> partitionValues) {
        return SparkSqlExecute.buildAddPartitionSql(table, partitionFields, partitionValues);
    }

    public static void main(String[] args) {
        String table = "tableName";
        List<String> listFields = new ArrayList<>();
        List<String> listValues = new ArrayList<>();
        listFields.add("dd");
        listValues.add("2019-05-34");
        String s = wrapBuildAddPartitionSql(table, listFields, listValues);
        System.out.println(s);
    }
}
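Running the main above prints the generated statement, e.g. ALTER TABLE tableName ADD PARTITION (dd = '2019-05-34'). The driver later calls the wrapper from Scala with java.util lists; a minimal sketch of that call (the demo object name is hypothetical, not part of the project):

import java.util
import UDF.buildaddpartitionsql.buildAddPartitionSql

object AddPartitionSqlDemo {
  def main(args: Array[String]): Unit = {
    val fields = new util.ArrayList[String]()
    val values = new util.ArrayList[String]()
    fields.add("dd")
    values.add("2017-07-26")
    // Builds: ALTER TABLE wa_zfd_tytt_hive.t_event_monitor_1001_trojan_patition ADD PARTITION (dd = '2017-07-26')
    println(buildAddPartitionSql.wrapBuildAddPartitionSql(
      "wa_zfd_tytt_hive.t_event_monitor_1001_trojan_patition", fields, values))
  }
}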
Two UDF functions:
(1) Convert a decimal (integer) IP to a dotted-quad IP string
package UDF.intToip

object Int2IP {
  // Convert an IP stored as a Long into dotted notation, one byte at a time,
  // starting with the lowest-order byte.
  def int2IP(ipInt: Long): String = {
    val sb: StringBuilder = new StringBuilder
    sb.append(ipInt & 0xFF).append(".")
    sb.append((ipInt >> 8) & 0xFF).append(".")
    sb.append((ipInt >> 16) & 0xFF).append(".")
    sb.append((ipInt >> 24) & 0xFF)
    sb.toString
  }
}
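Because the lowest-order byte is appended first, SRCIP 3067074597 appears as 37.216.207.182 in the output sample further down. A quick sanity check (hypothetical demo object, assuming the class is on the classpath):

object Int2IPDemo {
  def main(args: Array[String]): Unit = {
    println(UDF.intToip.Int2IP.int2IP(3067074597L)) // prints 37.216.207.182
  }
}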
(2) Convert the extracted hours into a 24-bit binary activity string
package UDF.timeToerjinzhi;

public class TimeToint {
    /**
     * Initialize the int array to all zeros.
     */
    public static void initArray(int[] array) {
        if (null == array)
            return;
        for (int i = 0; i < array.length; i++) {
            array[i] = 0;
        }
    }

    /**
     * Convert a comma-separated string of hours into an int array.
     */
    public static int[] stringToArray(String s) {
        String[] split = s.split(",");
        if (split == null) {
            return null;
        }
        int[] arr = new int[split.length];
        for (int i = 0; i < split.length; i++) {
            arr[i] = Integer.parseInt(split[i]);
        }
        return arr;
    }

    /**
     * Mark every hour that appears in source as 1 in stdArray.
     */
    public static void arrayIsExist(int[] stdArray, int[] source) {
        if (null == stdArray || null == source) {
            return;
        }
        for (int i = 0; i < source.length; i++) {
            int time = source[i];
            stdArray[time] = 1;
        }
    }

    /**
     * Render the array as a string of 0s and 1s.
     */
    public static String printTimeArray(int[] stdArray) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < stdArray.length; i++) {
            sb.append((stdArray[i] == 1 ? "1" : "0"));
            //sb.append("" + i + "-" + (stdArray[i] == 1 ? "1" : "0") + ",");
        }
        // The array has 25 slots; drop the last one so the result is one character per hour (0-23).
        String s = sb.substring(0, sb.length() - 1);
        return s;
    }

    public static String warpMethod(String s) {
        int[] timeArray = new int[25];
        initArray(timeArray);
        int[] source = stringToArray(s);
        arrayIsExist(timeArray, source);
        return printTimeArray(timeArray);
    }
}
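Each comma-separated hour sets one position in a 24-character 0/1 string; this is what the timeToFormal UDF returns for the ACTIVITY column. A minimal check (hypothetical demo object):

object TimeTointDemo {
  def main(args: Array[String]): Unit = {
    // Hours 9, 14 and 21 active -> positions 9, 14 and 21 set to 1
    println(UDF.timeToerjinzhi.TimeToint.warpMethod("9,14,21")) // prints 000000000100001000000100
  }
}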
Main method:
package HiveToHDFS

import java.util

import UDF.buildaddpartitionsql.buildAddPartitionSql
import UDF.intToip.Int2IP
import UDF.timeToerjinzhi.TimeToint
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.launcher.SparkLauncher
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.hive.HiveContext

object Hive2HDFS2 {

  // For one partition date: add the Hive partition, register the UDFs, run the
  // aggregation query and write the result to HDFS.
  def batchSql(time: String, hiveContext: HiveContext): Unit = {
    val path = "hdfs://hadoop01:9000/user/hive/warehouse/wa_zfd_tytt_hive.db/t_event_monitor_1001_trojan_patition/DD="
    val tablename = "wa_zfd_tytt_hive.t_event_monitor_1001_trojan_patition"
    val listFields = new util.ArrayList[String]
    val listValues = new util.ArrayList[String]
    listFields.add("dd")
    listValues.add(time)
    val sql_partition = buildAddPartitionSql.wrapBuildAddPartitionSql(tablename, listFields, listValues)
    hiveContext.sql(sql_partition)
    // Decimal-to-dotted-IP UDFs for the source and destination addresses.
    hiveContext.udf.register("String2Ip_c_src_ipv4", (c_src_ipv4: Long) => {
      Int2IP.int2IP(c_src_ipv4)
    })
    hiveContext.udf.register("String2c_dest_ipv4", (c_dest_ipv4: Long) => {
      Int2IP.int2IP(c_dest_ipv4)
    })
    // Hour list -> 24-bit activity string.
    hiveContext.udf.register("timeToFormal", (ACTIVITY: String) => {
      TimeToint.warpMethod(ACTIVITY)
    })
    val sql =
"""
|select
|w.c_event_id as EVENT_ID,w.c_src_ipv4 as SRCIP ,String2Ip_c_src_ipv4(w.c_src_ipv4) as SRCIP_INT, w.c_dest_ipv4 as DSTIP, String2c_dest_ipv4(w.c_dest_ipv4) as DSTIP_INT, w.c_dest_port as PRIMARY_DSTPORT, w.cnt_c_dest_port as PRIMARY_DSTPORT_CNT, w.CC_FLAG as CC_FLAG,w.CNT as CNT,q.ACTIVITY as ACTIVITY
|from
|(select
|i.c_event_id as c_event_id,i.c_src_ipv4 as c_src_ipv4 ,i.c_dest_ipv4 as c_dest_ipv4, i.c_dest_port as c_dest_port, i.cnt_c_dest_port as cnt_c_dest_port, i.CC_FLAG as CC_FLAG,h.CNT as CNT,i.c_time_part as c_time_part
|from
|(select g.c_event_id as c_event_id , g.c_src_ipv4 as c_src_ipv4, g.c_dest_ipv4 as c_dest_ipv4, g.c_dest_port as c_dest_port, g.cnt_c_dest_port as cnt_c_dest_port, 1 as CC_FLAG ,g.c_time_part as c_time_part
|from
|(select
|split(e.event_src_dst, '-')[0] as c_event_id,
|split(e.event_src_dst, '-')[1] as c_src_ipv4,
|split(e.event_src_dst, '-')[2] as c_dest_ipv4,
|split(e.event_src_dst, '-')[3] as c_dest_port,
|cnt_c_dest_port,c_time_part
|from
|(select
|d.event_src_dst as event_src_dst, d.cnt_c_dest_port as cnt_c_dest_port, c_time_part
|from
|(select
|c.event_src_dst as event_src_dst, c.cnt_c_dest_port as cnt_c_dest_port, row_number() over(distribute by c.event_src_dst sort by c.cnt_c_dest_port desc) as rank ,c_time_part
|from
|(select
|concat_ws('-', cast(b.c_event_id as string) , cast(b.c_src_ipv4 as string), cast(b.c_dest_ipv4 as string), cast(b.c_dest_port as string) ) as event_src_dst, b.cnt_c_dest_port as cnt_c_dest_port ,c_time_part
|from
|(select
|c_event_id, c_src_ipv4, c_dest_ipv4, c_dest_port, count(c_dest_port) as cnt_c_dest_port, c_time_part
|from
|wa_zfd_tytt_hive.t_event_monitor_1001_trojan
|where c_time_part='2017-07-26'
|group by c_event_id, c_src_ipv4, c_dest_ipv4, c_dest_port, c_time_part) as b
|where c_time_part='2017-07-26'
|) as c
|where c_time_part='2017-07-26'
|) as d
|where rank = 1 ) as e where c_time_part='2017-07-26') g
|where c_time_part='2017-07-26'
|)as i
|join
|(select
|f.c_event_id as c_event_id , f.c_src_ipv4 as c_src_ipv4, f.c_dest_ipv4 as c_dest_ipv4, sum(f.cnt_c_dest_port ) as CNT
|from
|(select c_event_id, c_src_ipv4, c_dest_ipv4, c_dest_port, count(c_dest_port) as cnt_c_dest_port ,c_time_part
|from wa_zfd_tytt_hive.t_event_monitor_1001_trojan
|where c_time_part='2017-07-26'
|group by c_event_id, c_src_ipv4, c_dest_ipv4, c_dest_port,c_time_part) as f
|where c_time_part='2017-07-26'
|group by f.c_event_id , f.c_src_ipv4, f.c_dest_ipv4) as h on i.c_event_id=h.c_event_id and i.c_src_ipv4=h.c_src_ipv4 and i.c_dest_ipv4=h.c_dest_ipv4) as w
|join
|(select c_event_id,c_src_ipv4, c_dest_ipv4,c_time_part, timeToFormal(concat_ws(",",(collect_set(substring(c_time, -8, 2))))) as ACTIVITY
|from wa_zfd_tytt_hive.t_event_monitor_1001_trojan
|where c_time_part='2017-07-26'
|group by c_event_id, c_src_ipv4, c_dest_ipv4,c_time_part) as q
|on
|q.c_event_id=w.c_event_id and q.c_src_ipv4=w.c_src_ipv4 and q.c_dest_ipv4=w.c_dest_ipv4
|where w.c_time_part='2017-07-26'
|limit 100
""".stripMargin
    val frame = hiveContext.sql(sql)
    frame.repartition(1).write.option("delimiter", "\t").option("compression", "none").mode(SaveMode.Overwrite)
      .csv(path + s"${time}")
  }

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("UDF").setMaster("local")
    var launcher = new SparkLauncher()
    // System.setProperty("java.security.krb5.conf", "/etc/dataload/camus/hdfs.keytab")
    // logger.info("krb5 config set java.security.krb5.conf=" + System.getProperty("java.security.krb5.conf"));
    launcher = launcher.setConf("spark.yarn.principal", "hdfs").setConf("spark.yarn.keytab", "/etc/dataload/camus/hdfs.keytab")
    val sc = new SparkContext(conf)
    val hiveContext = new HiveContext(sc)
    for (arg <- args) {
      batchSql(arg, hiveContext)
      println(arg)
    }
    sc.stop()
  }
}
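The driver takes the partition dates to process as program arguments; each argument triggers one batchSql call, so several dates can be backfilled in a single run. After mvn package, the shade plugin produces a fat jar (by Maven convention something like target/Hive2Hive-1.0-SNAPSHOT.jar), which can be launched with, for example, spark-submit --class HiveToHDFS.Hive2HDFS2 target/Hive2Hive-1.0-SNAPSHOT.jar 2017-07-26, adjusting the master, principal and keytab settings to the target cluster.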
Raw input data:
1 175374339 26510 0 2017-07-26 21:56:45 f5bb2837b1944a249ee5587eb004515f 3414226114 :: 51404 0 0 3736770098 :: 8899 0 targetHost=123.187.31.64;payload=\01\00\00\00S\00\00\00\00\F4\01\00\002\00\00\00\E8\03\00\00_\17\00\00\00\00\00\00\01\00\00\00\01\00\00\00 \02\00\00\00\00\00\00\01\01\00\00\00 \00\00\00\02\00\00\00\04\00\00\03\00\00\00<\00\00\00\00\00\01\00\00\00123.187.31.64\00\C8\12 1 84 0 0 0 nan 2 320000 0 0 2 nan 0 NULLNULL NULL NULL NULL NULL NULL NULL 2017-07-26
1 175374357 26503 0 2017-07-26 21:56:45 2bcfbd57f5c9239c05cdc4580df7a982 31609973 :: 7862 0 0 2077124337 :: 29135 0 payload=Linux3.2.0-56-generic\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00B2.2.1bd\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\008 * 3392\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0031886MB\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\001000MB/s\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\000\00586\00\00\00\00\00\00\00\00\00\00\00100\00\00\00\00\00\00\00\00\00\00\00\00\00MK64\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00idle\00SYN) Target:180.169.7.42:80\00Status:Attacking......\00..\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00 1 82 0 0 0 nan 2 440000 0 0 0 \CC\DA\D1\B6\D4\C6\BC\C6\CB\E3\A3\A8\B1\B1\BE\A9\A3\A9\D3\D0\CF\DE\D4\F0\C8\CE\B9\AB\CB\BE NULL NULL NULL NULL NULL NULL NULL NULL 2017-07-26
1 175390795 26503 0 2017-07-26 21:56:44 17876ac106aebd5b4c5e07867b900296 1937650768 :: 53930 0 0 3736746632 :: 29135 0 payload=Linux2.6.32-358.el6.x86_64\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00B2.2.1bd\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\008 * 1995\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\005842MB\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\001000MB/s\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\000\00\00\00\00\00\00\00\00\00\00\00\00\00\00\001\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00MK64\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00idle\00SYN) Target:119.29.124.43:80 Status:Attacking......\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00 2 810000 0 0 0 nan 2 320000 0 0 2 nan 0 NULL NULL NULL NULL NULL NULL NULL NULL 2017-07-26
30730 175636631 26503 0 2017-07-26 21:56:36 8B211D6FADEC3169B1D47951D2D496C8 2892523789 :: 58008 0 0 463163526 :: 29135 0 payload=Linux4.9.15-x86_64-linode81\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00B2.2.1bd\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\001 * 2299\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00988MB\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\001000MB/s\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\000\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00100\00\00\00\00\00\00\00\00\00\00\00\00\00MK64_SecurtDoor\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00dle\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00 1 1206 0 0 150 nan 2 35 0 0 2 nan 0 NULL NULL NULL NULL NULL NULL NULL NULL 2017-07-26
Output data:
25579 3067074597 37.216.207.182 3428803437 109.99.95.204 80 2 1 2 000000000000001000000000 2018-2-21
26503 2375119092 244.112.145.141 3079018291 51.23.134.183 29135 99 1 99 000000000000001111110000 2018-2-21
26510 1863042180 132.196.11.111 2734383054 206.95.251.162 79 1 1 1 000000000000000000100000 2018-2-21
26510 2008578983 167.123.184.119 1754647427 131.203.149.104 25000 1 1 1 000000000000000000100000 2018-2-21
26510 2026304360 104.243.198.120 2734383054 206.95.251.162 79 1 1 1 000000000000000000000100 2018-2-21
28747 1033949069 141.207.160.61 3322956083 51.73.16.198 3308 1 1 1 000000000000001000000000 2018-2-21
28747 1878256575 191.235.243.111 2102249246 30.199.77.125 7878 2 1 2 000000000000010000000000 2018-2-21
28747 3027529922 194.112.116.180 3322956083 51.73.16.198 3308 1 1 1 000000000000000000100000 2018-2-21
28747 707828200 232.153.48.42 3322956083 51.73.16.198 3308 1 1 1 000000000000001000000000 2018-2-21
29376 1358364226 66.254.246.80 3736764903 231.129.186.222 8888 4 1 4 000000000000001010100000 2018-2-21
29376 2970862811 219.196.19.177 3736764903 231.129.186.222 8888 2 1 2 000000000000010000000000 2018-2-21
33177 3707939957 117.172.2.221 401214785 65.13.234.23 8877 1 1 1 000000000000000010000000 2018-2-21
6505 3736460950 150.222.181.222 2997016939 107.217.162.178 80 1 1 1 000000000000001000000000 2018-2-21
21690 466554740 116.15.207.27 3322956083 51.73.16.198 3308 1 1 1 000000000000001000000000 2018-2-21
21690 987164138 234.237.214.58 3322956083 51.73.16.198 3308 1 1 1 000000000000000000100000 2018-2-21
22110 2016887277 237.65.55.120 3404670650 186.38.239.202 8000 1 1 1 000000000000001000000000 2018-2-21
22110 2054304230 230.49.114.122 3404670650 186.38.239.202 8000 1 1 1 000000000000010000000000 2018-2-21
22110 2054342199 55.198.114.122 3404670650 186.38.239.202 8000 1 1 1 000000000000010000000000 2018-2-21
22110 2071091192 248.87.114.123 794371949 109.39.89.47 1912 1 1 1 000000000000001000000000 2018-2-21
25579 30120285 93.153.203.1 3428803437 109.99.95.204 80 1 1 1 000000000000001000000000 2018-2-21
26510 2105872829 189.17.133.125 1944574493 29.218.231.115 25111 2 1 2 000000000000001000000000 2018-2-21
26510 980791341 45.176.117.58 1754647427 131.203.149.104 25000 3 1 3 000000000000011000010000 2018-2-21
28747 2071826334 158.143.125.123 846551373 77.89.117.50 8089 1 1 1 000000000000001000000000 2018-2-21
28747 2105512296 104.145.127.125 3322956083 51.73.16.198 3308 1 1 1 000000000000000100000000 2018-2-21
28747 466768629 245.82.210.27 3322956083 51.73.16.198 3308 2 1 2 000000000000001001000000 2018-2-21
28747 658934127 111.137.70.39 3322956083 51.73.16.198 3308 2 1 2 000000000000000000100100 2018-2-21