spark任务shell运行_基于sparksql调用shell脚本运行SQL

[Author]: kwu

通过shell脚本调用spark-sql来运行SQL。spark-sql提供了与hive类似的 -e、-f、-i 选项。

1、定时调用脚本

#!/bin/sh

# Scheduled job: rebuild the DTYPE='01' partition of st.stock_realtime_analysis
# with spark-sql, then export that partition's files to MySQL with sqoop.

# Yesterday's date as yyyymmdd (GNU date syntax). The shell interpolates it
# into the SQL below as the `day` filter of the t2 subquery.
yesterday=$(date --date='1 days ago' +%Y%m%d)

# Query outline:
#   t1 - the 20 stock codes with the most rows in dms.tracklog_5min for
#        today's `day` value, plus the current date and time as strings.
#   t2 - per-stock row counts for yesterday.
#   LPcnt - percentage change of today's count against yesterday's, rounded to
#        2 decimals. It is NULL when a stock has no row in t2: dividing by the
#        NULL count yields NULL, which is what the former "divide by 0"
#        fallback produced, without relying on divide-by-zero semantics.
#
# NOTE(review): in t2 the original predicate "and substr(datetime, 12, 5)" had
# no comparison operator or right-hand side. It is reconstructed here as
# "<= current HH:mm", presumably limiting yesterday's rows to the part of the
# day that has already elapsed today - confirm the intended operator.
#
# All SQL string literals use plain ASCII single quotes; typographic quotes
# are not valid delimiters for the shell or for SQL.
#
# "|| exit 1" stops the script when spark-sql fails, so that sqoop does not
# export stale contents of the partition directory.
/opt/modules/spark/bin/spark-sql -i /opt/bin/spark_opt/init.sql --master spark://10.130.2.20:7077 --executor-memory 6g --total-executor-cores 45 --conf spark.ui.port=4075 -e "insert overwrite table st.stock_realtime_analysis PARTITION (DTYPE='01')
select t1.stockId as stockId,
       t1.url as url,
       t1.clickcnt as clickcnt,
       0,
       round((t1.clickcnt / t2.clickcntyesday - 1) * 100, 2) as LPcnt,
       '01' as type,
       t1.analysis_date as analysis_date,
       t1.analysis_time as analysis_time
  from (select stock_code stockId,
               concat('http://stockdata.stock.hexun.com/', stock_code, '.shtml') url,
               count(1) clickcnt,
               substr(from_unixtime(unix_timestamp(), 'yyyy-MM-dd HH:mm:ss'), 1, 10) analysis_date,
               substr(from_unixtime(unix_timestamp(), 'yyyy-MM-dd HH:mm:ss'), 12, 8) analysis_time
          from dms.tracklog_5min
         where stock_type = 'STOCK'
           and day = substr(from_unixtime(unix_timestamp(), 'yyyyMMdd'), 1, 8)
         group by stock_code
         order by clickcnt desc limit 20) t1
  left join (select stock_code stockId, count(1) clickcntyesday
               from dms.tracklog_5min a
              where stock_type = 'STOCK'
                and substr(datetime, 1, 10) = date_sub(from_unixtime(unix_timestamp(), 'yyyy-MM-dd HH:mm:ss'), 1)
                and substr(datetime, 12, 5) <= substr(from_unixtime(unix_timestamp(), 'yyyy-MM-dd HH:mm:ss'), 12, 5)
                and day = '${yesterday}'
              group by stock_code) t2
    on t1.stockId = t2.stockId;
" || exit 1

# Export the freshly written partition directory to the MySQL table.
# '\001' is passed to sqoop as the field delimiter of the exported files.
# NOTE(review): the password is given in plain text on the command line and is
# visible in the process list and in this script; consider sqoop's
# --password-file or -P option instead.
sqoop export --connect jdbc:mysql://10.130.2.245:3306/charts --username guojinlian --password Abcd1234 --table stock_realtime_analysis --fields-terminated-by '\001' --columns "stockid,url,clickcnt,splycnt,lpcnt,type" --export-dir /dw/st/stock_realtime_analysis/dtype=01;

init.sql内容为载入udf:

-- init.sql: passed to spark-sql with -i.
-- Adds the UDF jar and registers its classes as temporary functions.
-- Class names are quoted with plain ASCII single quotes; typographic quotes
-- are not valid SQL string delimiters.
add jar /opt/bin/UDF/hive-udf.jar;

-- stock
create temporary function udtf_stockidxfund as 'com.hexun.hive.udf.stock.UDTFStockIdxFund';
create temporary function udf_stockidxfund as 'com.hexun.hive.udf.stock.UDFStockIdxFund';

-- time
create temporary function udf_getbfhourstime as 'com.hexun.hive.udf.time.UDFGetBfHoursTime';
create temporary function udf_getbfhourstime2 as 'com.hexun.hive.udf.time.UDFGetBfHoursTime2';

-- common (hashing)
create temporary function udf_md5 as 'com.hexun.hive.udf.common.HashMD5UDF';
create temporary function udf_murhash as 'com.hexun.hive.udf.common.HashMurUDF';

-- url
create temporary function udf_url as 'com.hexun.hive.udf.url.UDFUrl';
create temporary function url_host as 'com.hexun.hive.udf.url.UDFHost';
create temporary function udf_ip as 'com.hexun.hive.udf.url.UDFIP';
create temporary function udf_site as 'com.hexun.hive.udf.url.UDFSite';
create temporary function udf_UrlDecode as 'com.hexun.hive.udf.url.UDFUrlDecode';
create temporary function udtf_url as 'com.hexun.hive.udf.url.UDTFUrl';
create temporary function udf_kw as 'com.hexun.hive.udf.url.UDFKW';
create temporary function udf_chdecode as 'com.hexun.hive.udf.url.UDFChDecode';

-- useragent
create temporary function udf_ua as 'com.hexun.hive.udf.useragent.UDFUA';
create temporary function udf_ssh as 'com.hexun.hive.udf.useragent.UDFSSH';
create temporary function udtf_ua as 'com.hexun.hive.udf.useragent.UDTFUA';

设置ui的port

--conf spark.ui.port=4075

默认为4040,会与其它正在运行的任务冲突,这里修改为4075

设定任务使用的内存与CPU资源

--executor-memory 6g --total-executor-cores 45

原来的语句是用 hive -e 运行的,改为 spark-sql 后速度大大加快了。

原来为15min,提升速度后为 45s.

你可能感兴趣的:(spark任务shell运行)