Spark SQL 清洗埋点数据(Java 版)

数据格式:

{"actionTimes":"2018-11-25","actions":"搜索","bb":"v1.0","fromType":"Chrome/73.0.3683.75","fromURL":"https://www.nyist.com/s?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"120.50.10.233","requestMethod":"GET","sessionId":"0a1c7b51-4434-4ea0-ada9-d45ba788541c","title":"关键340洞察力","user_id":"6252"}
{"actionTimes":"2018-5-20","actions":"下单","bb":"v1.0","fromType":"Mozilla/5.0 (Windows NT 6.1; Win64; x64)","fromURL":"https://tv.qq.com/channel/child?listpage=1&channel=children&itype=3","ip":"181.94.33.139","requestMethod":"POST","sessionId":"79e33d3f-77e3-4120-ae75-c650e94e22f3","title":"关键273洞察力","user_id":"6106"}
{"actionTimes":"2018-7-21","actions":"下单","bb":"v1.0","fromType":"Mozilla/5.0 (Windows NT 6.1; Win64; x64)","fromURL":"https://www.mail.com/int?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"181.94.33.139","requestMethod":"GET","sessionId":"6bb2b015-36e7-4b7e-98c9-bc83365ea2cc","title":"关键504洞察力","user_id":"6918"}
{"actionTimes":"2018-9-27","actions":"登录","bb":"v1.0","fromType":"Chrome/73.0.3683.75","fromURL":"https://www.phone.com/int?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"115.44.31.64","requestMethod":"POST","sessionId":"8fff502c-fa29-4ce3-811b-e4eb944ef62e","title":"关键473洞察力","user_id":"3190"}
{"actionTimes":"2018-7-26","actions":"搜索","bb":"v1.0","fromType":"Chrome/73.0.3683.75","fromURL":"https://www.baidu.com/s?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"207.12.85.193","requestMethod":"POST","sessionId":"0a28e4aa-8d4a-4335-abad-9ff51f08f7fe","title":"关键371洞察力","user_id":"4569"}
{"actionTimes":"2018-5-5","actions":"浏览评论","bb":"v1.0","fromType":"IE/537.36 (KHTML, like Gecko) ","fromURL":"https://www.nyist.com/s?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"10.7.87.67","requestMethod":"POST","sessionId":"65af3572-d48b-4e49-aaf4-97f322d6ac10","title":"关键117洞察力","user_id":"212"}
{"actionTimes":"2018-0-19","actions":"搜索","bb":"v1.0","fromType":"Mozilla/5.0 (Windows NT 6.1; Win64; x64)","fromURL":"https://www.phone.com/int?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"181.94.33.139","requestMethod":"POST","sessionId":"d8c6e3fc-d557-4c26-855b-3053ead690d6","title":"关键34洞察力","user_id":"2367"}
{"actionTimes":"2018-5-27","actions":"下单","bb":"v1.0","fromType":"Chrome/73.0.3683.75","fromURL":"https://www.mail.com/int?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"241.21.27.237","requestMethod":"GET","sessionId":"8214cc26-d22f-4670-9d63-95ce87814c9a","title":"关键266洞察力","user_id":"5698"}
{"actionTimes":"2018-11-0","actions":"浏览评论","bb":"v1.0","fromType":"Mozilla/5.0 (Windows NT 6.1; Win64; x64)","fromURL":"https://www.baidu.com/s?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"181.94.33.139","requestMethod":"POST","sessionId":"4571d37d-b025-4233-a892-3f2f6d66faf2","title":"关键877洞察力","user_id":"6415"}
{"actionTimes":"2018-4-18","actions":"下单","bb":"v1.0","fromType":"Mozilla/5.0 (Windows NT 6.1; Win64; x64)","fromURL":"https://tv.qq.com/channel/child?listpage=1&channel=children&itype=3","ip":"181.94.33.139","requestMethod":"POST","sessionId":"68b70177-ec1f-4ef3-9a94-2bf88be2524d","title":"关键847洞察力","user_id":"3807"}
{"actionTimes":"2018-11-30","actions":"搜索","bb":"v1.0","fromType":"Chrome/73.0.3683.75","fromURL":"https://tv.qq.com/channel/child?listpage=1&channel=children&itype=3","ip":"162.22.60.57","requestMethod":"GET","sessionId":"0df8178c-9b56-440b-8cae-09aed013a500","title":"关键365洞察力","user_id":"9359"}
{"actionTimes":"2018-1-12","actions":"浏览评论","bb":"v1.0","fromType":"IE/537.36 (KHTML, like Gecko) ","fromURL":"https://www.phone.com/int?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"181.94.33.139","requestMethod":"GET","sessionId":"a12ce0d6-2f6b-412e-a488-1c92b9c50859","title":"关键328洞察力","user_id":"6414"}
{"actionTimes":"2018-11-10","actions":"搜索","bb":"v1.0","fromType":"Safari/537.36","fromURL":"https://www.nyist.com/s?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"215.20.32.228","requestMethod":"GET","sessionId":"8f30423f-15db-4c77-8e49-36f980765d68","title":"关键213洞察力","user_id":"5614"}
{"actionTimes":"2018-6-27","actions":"登录","bb":"v1.0","fromType":"Safari/537.36","fromURL":"https://www.mail.com/int?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"56.76.93.252","requestMethod":"POST","sessionId":"20a3e71d-9f69-44e9-8a71-90b598108744","title":"关键960洞察力","user_id":"5282"}
{"actionTimes":"2018-0-13","actions":"搜索","bb":"v1.0","fromType":"IE/537.36 (KHTML, like Gecko) ","fromURL":"https://www.mail.com/int?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"132.57.13.172","requestMethod":"GET","sessionId":"2417850a-ffd7-4e88-ae20-31bf1f6f8ef2","title":"关键762洞察力","user_id":"3345"}
{"actionTimes":"2018-10-22","actions":"搜索","bb":"v1.0","fromType":"Mozilla/5.0 (Windows NT 6.1; Win64; x64)","fromURL":"https://tv.qq.com/channel/child?listpage=1&channel=children&itype=3","ip":"132.57.13.172","requestMethod":"POST","sessionId":"493fd1a3-3756-456e-ac3d-911cbfcdb726","title":"关键143洞察力","user_id":"3299"}
{"actionTimes":"2018-8-21","actions":"登录","bb":"v1.0","fromType":"IE/537.36 (KHTML, like Gecko) ","fromURL":"https://www.nyist.com/s?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"132.57.13.172","requestMethod":"GET","sessionId":"52fae759-f19b-4adc-b3b8-1844e604da47","title":"关键785洞察力","user_id":"6020"}
{"actionTimes":"2018-1-3","actions":"登录","bb":"v1.0","fromType":"Chrome/73.0.3683.75","fromURL":"https://www.mail.com/int?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"132.57.13.172","requestMethod":"GET","sessionId":"bfcc923a-1473-4de8-943d-257e2af047f6","title":"关键491洞察力","user_id":"8398"}
{"actionTimes":"2018-10-10","actions":"下单","bb":"v1.0","fromType":"Mozilla/5.0 (Windows NT 6.1; Win64; x64)","fromURL":"https://www.baidu.com/s?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"216.61.70.89","requestMethod":"GET","sessionId":"b2cbb5f7-e3ed-4be3-8191-e61501e3c65c","title":"关键26洞察力","user_id":"596"}
{"actionTimes":"2018-1-4","actions":"登录","bb":"v1.0","fromType":"Safari/537.36","fromURL":"https://www.mail.com/int?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"172.29.54.51","requestMethod":"GET","sessionId":"90d395d4-d7df-4123-ba75-2008aa79e72c","title":"关键635洞察力","user_id":"49"}
{"actionTimes":"2018-8-12","actions":"下单","bb":"v1.0","fromType":"Chrome/73.0.3683.75","fromURL":"https://tv.qq.com/channel/child?listpage=1&channel=children&itype=3","ip":"132.57.13.172","requestMethod":"POST","sessionId":"45eef856-30d0-478d-bb6c-d0c2b0753bf4","title":"关键486洞察力","user_id":"3262"}
{"actionTimes":"2018-2-10","actions":"登录","bb":"v1.0","fromType":"Safari/537.36","fromURL":"https://www.phone.com/int?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"132.57.13.172","requestMethod":"GET","sessionId":"3a4b00c2-eac5-4d2b-b979-dc188b8ad044","title":"关键111洞察力","user_id":"3602"}
{"actionTimes":"2018-6-27","actions":"登录","bb":"v1.0","fromType":"Chrome/73.0.3683.75","fromURL":"https://www.phone.com/int?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"251.77.50.71","requestMethod":"POST","sessionId":"e9d2e342-7977-4712-90ba-e60ab9db2ecc","title":"关键926洞察力","user_id":"6450"}
{"actionTimes":"2018-6-22","actions":"下单","bb":"v1.0","fromType":"Chrome/73.0.3683.75","fromURL":"https://www.mail.com/int?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"39.32.4.140","requestMethod":"GET","sessionId":"8ab35dc1-9152-4cc0-b426-719dee02e7b1","title":"关键273洞察力","user_id":"916"}
{"actionTimes":"2018-1-29","actions":"登录","bb":"v1.0","fromType":"IE/537.36 (KHTML, like Gecko) ","fromURL":"https://www.baidu.com/s?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"251.77.50.71","requestMethod":"POST","sessionId":"2e384529-a943-476e-8cef-cb28b32cf046","title":"关键18洞察力","user_id":"3420"}
{"actionTimes":"2018-4-8","actions":"搜索","bb":"v1.0","fromType":"IE/537.36 (KHTML, like Gecko) ","fromURL":"https://www.mail.com/int?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"251.77.50.71","requestMethod":"POST","sessionId":"150804fb-513a-4783-8b66-674b8cc31748","title":"关键276洞察力","user_id":"474"}
{"actionTimes":"2018-7-1","actions":"浏览商品","bb":"v1.0","fromType":"IE/537.36 (KHTML, like Gecko) ","fromURL":"https://www.nyist.com/s?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"251.77.50.71","requestMethod":"GET","sessionId":"4e5d1333-daee-4ff3-a4ef-511da7c44bfc","title":"关键604洞察力","user_id":"2571"}
{"actionTimes":"2018-8-9","actions":"下单","bb":"v1.0","fromType":"IE/537.36 (KHTML, like Gecko) ","fromURL":"https://tv.qq.com/channel/child?listpage=1&channel=children&itype=3","ip":"251.77.50.71","requestMethod":"POST","sessionId":"667a77e6-2dac-42b8-b45e-947b0d632a8f","title":"关键315洞察力","user_id":"6842"}
{"actionTimes":"2018-7-22","actions":"浏览评论","bb":"v1.0","fromType":"Chrome/73.0.3683.75","fromURL":"https://tv.qq.com/channel/child?listpage=1&channel=children&itype=3","ip":"251.77.50.71","requestMethod":"POST","sessionId":"ed9c961e-f562-4197-8f56-be1b6e561f14","title":"关键812洞察力","user_id":"5699"}
{"actionTimes":"2018-7-26","actions":"浏览商品","bb":"v1.0","fromType":"Safari/537.36","fromURL":"https://www.phone.com/int?wd=ip%E5%9C%B0%E5%9D%80&rsv_spt=1","ip":"251.77.50.71","requestMethod":"POST","sessionId":"f607bb41-46ea-4a26-b41d-37fd8dbef145","title":"关键388洞察力","user_id":"1237"}

 

清洗程序代码:(获取相关指标,并输出到 MySQL;示例代码中写入 MySQL 的部分已注释,默认仅在控制台打印结果)

    备注:好久没用 Spark SQL 了,功能还挺齐全的。

package com.nyist.hdl.controller;
import java.util.Properties;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import com.nyist.hdl.model.MemberFrom;

/**
 * Cleans tracking-event logs stored as line-delimited JSON and prints three
 * aggregates computed with Spark SQL:
 * <ol>
 *   <li>number of events per {@code actions} value;</li>
 *   <li>IP addresses that occur more than once, with their event count;</li>
 *   <li>number of events per traffic source, derived from {@code fromURL}.</li>
 * </ol>
 * The JDBC writes that would persist each result to MySQL are kept as
 * commented-out code; as written the program only prints to the console.
 *
 * @author zhangchenguang
 */
public class CleanJSON {

	/** Input file used when no path is supplied on the command line. */
	private static final String DEFAULT_INPUT_PATH =
			"file:///Users/zhangchenguang/eclispe_workspace/spark_pro/src/main/java/com/nyist/hdl/datas/data.txt";

	/**
	 * Ordered {keyword, label} pairs used to classify a referrer URL.
	 * Every rule is evaluated, so when several keywords occur in the same URL
	 * the LAST matching rule determines the label.
	 */
	private static final String[][] SOURCE_RULES = {
			{"baidu", "百度推广"},
			{"tv", "视频广告"},
			{"nyist", "直接访问"},
			{"mail", "邮件营销"},
			{"phone", "客服电话"},
	};

	/**
	 * Runs the three aggregations and prints each result.
	 *
	 * @param args optional; {@code args[0]} overrides the input path
	 *             (defaults to {@link #DEFAULT_INPUT_PATH})
	 */
	public static void main(String[] args) {
		String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;

		// Only consumed by the commented-out JDBC writes below.
		// NOTE(review): credentials are hard-coded; move them to configuration before real use.
		Properties connectionProperties = new Properties();
		connectionProperties.put("user", "root");
		connectionProperties.put("password", "123456");
		connectionProperties.put("driver","com.mysql.jdbc.Driver");

		SparkConf sc = new SparkConf();
		sc.setMaster("local");
		SparkSession spark = SparkSession.builder()
				.config(sc)
				.appName("CleanJSON")
				.getOrCreate();
		try {
			Dataset<Row> rowData = spark.read().json(inputPath);
			rowData.createOrReplaceTempView("data");

			// 1) Event count per action type.
			Dataset<Row> actionDf = spark.sql("SELECT actions,count(1) FROM data group by actions");
			actionDf.show();
//			actionDf.write()
//				.mode(SaveMode.Append)
//				.jdbc("jdbc:mysql://localhost:3306/t_spark", "t_action", connectionProperties);

			// 2) IPs that appear in more than one event.
			Dataset<Row> opLogDf = spark.sql("SELECT ip,count(1) FROM data group by ip having count(1)>1");
			opLogDf.show();
//			opLogDf.write()
//				.mode(SaveMode.Append)
//				.jdbc("jdbc:mysql://localhost:3306/t_spark", "t_oplog", connectionProperties);

			// 3) Event count per traffic source, classified from the referrer URL.
			Dataset<Row> fromUrlDf = rowData.select("fromURL");
			Dataset<MemberFrom> memberFromDs = fromUrlDf.map(new MapFunction<Row, MemberFrom>() {

				private static final long serialVersionUID = 1L;

				/** Converts one {@code fromURL} row into a {@link MemberFrom} with count 1. */
				@Override
				public MemberFrom call(Row value) throws Exception {
					// A record without fromURL yields a null column; treat it as "no source matched".
					String url = value.isNullAt(0) ? null : value.getString(0);
					MemberFrom memberFrom = new MemberFrom();
					memberFrom.setCount(1);
					String fromName = resolveFromName(url);
					if (fromName != null) {
						memberFrom.setFromName(fromName);
					}
					return memberFrom;
				}
			}, Encoders.bean(MemberFrom.class));

			// The bean-encoded Dataset can be registered directly; no RDD round trip is needed.
			memberFromDs.createOrReplaceTempView("memberFrom");
			Dataset<Row> memberFromResultDf =
					spark.sql("SELECT fromName,count(1) FROM memberFrom group by fromName");
			memberFromResultDf.show();
//			memberFromResultDf.write()
//				.mode(SaveMode.Append)
//				.jdbc("jdbc:mysql://localhost:3306/t_spark", "t_memberfrom", connectionProperties);
		} finally {
			// Release the local Spark context even when a query fails.
			spark.stop();
		}
	}

	/**
	 * Maps a referrer URL to its traffic-source label using {@link #SOURCE_RULES}.
	 *
	 * @param url referrer URL; may be {@code null}
	 * @return the label of the last rule whose keyword occurs in {@code url},
	 *         or {@code null} when {@code url} is {@code null} or no keyword matches
	 */
	static String resolveFromName(String url) {
		if (url == null) {
			return null;
		}
		String fromName = null;
		for (String[] rule : SOURCE_RULES) {
			if (url.indexOf(rule[0]) != -1) {
				fromName = rule[1];
			}
		}
		return fromName;
	}
}

 

你可能感兴趣的:(大数据,Spark)