需求:因为需要使用hadoop能力,所以程序最后打包丢到spark所在服务器上运行,自己的web通过socket连接该jar,传递参数,执行数据搬迁任务,最后返回执行结果。
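<properties>
    <spark.version>2.1.2</spark.version>
    <!-- property name assumed: the original tag around 1.7 was lost in extraction -->
    <java.version>1.7</java.version>
    <hbase.version>1.2.0-cdh5.8.4</hbase.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>${spark.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>${spark.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-hive_2.11</artifactId>
        <version>${spark.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.47</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.4</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>com.google.code.gson</groupId>
        <artifactId>gson</artifactId>
        <version>2.2.4</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.elasticsearch</groupId>
        <artifactId>elasticsearch-spark-20_2.11</artifactId>
        <version>6.3.2</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>com.oracle</groupId>
        <artifactId>ojdbc6</artifactId>
        <version>11.2.0.3</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.zookeeper</groupId>
        <artifactId>zookeeper</artifactId>
        <version>3.4.13</version>
        <type>pom</type>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>${hbase.version}</version>
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-common</artifactId>
        <version>${hbase.version}</version>
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-server</artifactId>
        <version>${hbase.version}</version>
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
</dependencies>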
public class MyServer {
public static void main(String[] args) throws Exception
{
int port = 2018;
if(args != null && args.length>0){
port = Integer.parseInt(args[0]);
}
SparkSession spark = SparkSession.builder().appName("SparkUtil") // .master("local[*]")
.config("spark.sql.hive.verifyPartitionPath",true) //解决分区损坏问题
.config("spark.hadoop.yarn.timeline-service.enabled",false)//java.lang.NoClassDefFoundError: com/sun/jersey/api/client/config/ClientConfig
.config("spark.sql.broadcastTimeout",PropertiesUtil.getProperty("spark.sql.broadcastTimeout","3600"))
.config("spark.network.timeout",PropertiesUtil.getProperty("spark.network.timeout"))
.config("spark.sql.parquet.compression.codec",PropertiesUtil.getProperty("spark.sql.parquet.compression.codec","gzip"))
.config("es.index.auto.create", "false")//不自动创建,提前创建索引
.config("es.nodes",PropertiesUtil.getProperty("es.nodes"))
.config("es.port",PropertiesUtil.getProperty("es.port"))
.config("es.batch.size.bytes",PropertiesUtil.getProperty("es.batch.size.bytes"))
.config("es.batch.size.entries",PropertiesUtil.getProperty("es.batch.size.entries","100")) // 100 解决Could not write all entries
.config("es.batch.write.refresh",PropertiesUtil.getProperty("es.batch.write.refresh","false"))
.config("es.batch.write.retry.count",PropertiesUtil.getProperty("es.batch.write.retry.count","30")) // 3 解决Could not write all entries
.config("es.batch.write.retry.wait",PropertiesUtil.getProperty("es.batch.write.retry.wait","100")) // 10s 解决Could not write all entries
.config("es.http.timeout",PropertiesUtil.getProperty("es.http.timeout"))
.config("es.http.retries",PropertiesUtil.getProperty("es.http.retries"))
.config("es.action.heart.beat.lead",PropertiesUtil.getProperty("es.action.heart.beat.lead"))
.config("es.nodes.wan.only",PropertiesUtil.getProperty("es.nodes.wan.only","true"))
.config("es.nodes.data.only",PropertiesUtil.getProperty("es.nodes.data.only","true"))
.config("es.nodes.discovery",PropertiesUtil.getProperty("es.nodes.discovery","true"))
.config("es.input.use.sliced.partitions",PropertiesUtil.getProperty("es.input.use.sliced.partitions","50000"))
.config("es.input.max.docs.per.partition",PropertiesUtil.getProperty("es.input.max.docs.per.partition","100000"))
.config("es.net.http.header.Accept-Languag",PropertiesUtil.getProperty("es.net.http.header.Accept-Languag","gzip"))
.enableHiveSupport().getOrCreate();
ServerSocket serverSocket = new ServerSocket(port);//是一个能够接受其他通信实体请求的类
System.out.println("服务器正在等待客户端的连接请求----");
// 创建线程池
ThreadPoolExecutor executor = new ThreadPoolExecutor(50, 80, 200, MILLISECONDS,
new ArrayBlockingQueue(5));
//用一个while循环可以同时响应多个客户端的请求
while(true){
Socket socket = serverSocket.accept();//服务器监听对应端口的输入
MyServerThread thread = new MyServerThread(socket,spark);//创建线程
executor.execute(thread);
//thread.start();
}
}
}
public class MyServerThread implements Runnable, Serializable {
public static final String SUCCESS = "1";
public static final String FAIL = "0";
public static final String DS_HIVE = "hive";
public static final String DS_ORACLE = "oracle";
public static final String DS_ES = "es";
public static final String DS_HBASE = "hbase";
public static final String DS_MYSQL = "mysql"
private transient Socket socket;
private transient SparkSession spark;
public MyServerThread(Socket socket,SparkSession spark){
this.socket = socket;
this.spark = spark;
}
public void run() {
BufferedReader bufferedReader = null;
BufferedWriter bufferedWriter = null;
ObjectInputStream ois = null;
Map map = null;
HashMap paramsMap = null;
try{
ois = new ObjectInputStream(socket.getInputStream());
paramsMap = (HashMap) ois.readObject();
System.out.println("来自客户端的数据:"+paramsMap);
if(paramsMap != null) {
String method = PropertiesUtil.getStrValue(paramsMap, "method");
if ("doImportToEs".equals(method)) {
map = doImportToEs(paramsMap, spark);
}
if ("doImportToHBase".equals(method)) {
map = doImportToHBase(paramsMap, spark);
}
bufferedWriter = new BufferedWriter(new OutputStreamWriter(socket.getOutputStream()));
bufferedWriter.write(map.get("code") + ":" + map.get("msg") + "\n");
bufferedWriter.flush();
}
} catch(Exception e) {
e.printStackTrace();
} finally {
try {
if(ois != null){
ois.close();
}
if(bufferedReader != null){
bufferedReader.close();
}
if(bufferedWriter != null){
bufferedWriter.close();
}
if(socket != null){
socket.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
//数据保存
private Map doImportToEs(Map paramsMap, SparkSession spark){
long start = System.currentTimeMillis();
Map result = new HashMap();
long totalCount = 0L;
try {
String sql = PropertiesUtil.getStrValue(paramsMap,"sql");
String tableCode = PropertiesUtil.getStrValue(paramsMap,"table_code");
String index = PropertiesUtil.getStrValue(paramsMap,"index");
String srcTableType = PropertiesUtil.getStrValue(paramsMap,"src_table_type");
String tableType = PropertiesUtil.getStrValue(paramsMap,"table_type");
String mappingId = PropertiesUtil.getStrValue(paramsMap,"mapping_id");
//保存到ES
if (DS_HIVE.equalsIgnoreCase(srcTableType)) {
Dataset toTransDs = spark.sql(sql).persist();
totalCount = toTransDs.count();
if(StringUtils.isEmpty(mappingId)) {
JavaEsSparkSQL.saveToEs(toTransDs, index + "/" + index);
} else {
//索引主键,使用默认主键;更多配置可参考ConfigurationOptions
JavaEsSparkSQL.saveToEs(toTransDs, index + "/" + index,ImmutableMap.of("es.mapping.id", mappingId));
}
toTransDs.unpersist();//释放缓存
}
if(DS_ORACLE.equalsIgnoreCase(srcTableType) && DS_ES.equalsIgnoreCase(tableType)){
//连接oracle库
Map options = new HashMap();
options.put("driver",PropertiesUtil.getProperty("oracle.driver"));
options.put("url",PropertiesUtil.getStrValue(paramsMap,"url"));
options.put("user",PropertiesUtil.getStrValue(paramsMap,"user"));
options.put("password",PropertiesUtil.getStrValue(paramsMap,"password"));
options.put("dbtable",tableCode);
Dataset oraDs = spark.read().format("jdbc").options(options).load();
oraDs.createOrReplaceTempView(tableCode);
Dataset oraTable = spark.sql(sql);//String sqltext="(" + sql + ") t";
oraTable.show();
totalCount = oraTable.count();
if(StringUtils.isEmpty(mappingId)) {
JavaEsSparkSQL.saveToEs(oraTable, index + "/" + index);
} else {
//索引主键,使用默认主键;更多配置可参考ConfigurationOptions
JavaEsSparkSQL.saveToEs(oraTable, index + "/" + index,ImmutableMap.of("es.mapping.id", mappingId));
}
}
if(DS_MYSQL.equalsIgnoreCase(srcTableType) && DS_ES.equalsIgnoreCase(tableType)) {
Properties connectionProperties = new Properties();
connectionProperties.put("user", PropertiesUtil.getStrValue(paramsMap,"user"));
connectionProperties.put("password", PropertiesUtil.getStrValue(paramsMap,"password"));
String url = PropertiesUtil.getStrValue(paramsMap,"url");
Dataset mysqlTable = spark.read().jdbc(url, tableCode, connectionProperties);
mysqlTable.show();
totalCount = mysqlTable.count();
if(StringUtils.isEmpty(mappingId)) {
JavaEsSparkSQL.saveToEs(mysqlTable, index + "/" + index);
} else {
//索引主键,使用默认主键;更多配置可参考ConfigurationOptions
JavaEsSparkSQL.saveToEs(mysqlTable, index + "/" + index,ImmutableMap.of("es.mapping.id", mappingId));
}
}
result.put("code",totalCount);//SUCCESS
long end = System.currentTimeMillis();
result.put("msg","保存到ES执行完成,源数据量:"+totalCount+"条,耗时:"+(end-start)/1000+"s");//注意不要用英文:
System.out.println(result);
} catch (Exception e) {
result.put("code",FAIL);
result.put("msg",e.getMessage());
e.printStackTrace();
}
return result;
}
private Map doImportToHBase(Map paramsMap, SparkSession spark){
long start = System.currentTimeMillis();
Map result = new HashMap();
long totalCount = 0L;
long rowCount = 0L;
try {
String sql = PropertiesUtil.getStrValue(paramsMap,"sql");
String srcTableCode = PropertiesUtil.getStrValue(paramsMap,"src_table_code");
String tableCode = PropertiesUtil.getStrValue(paramsMap,"table_code");
String srcTableType = PropertiesUtil.getStrValue(paramsMap,"src_table_type");
String tableType = PropertiesUtil.getStrValue(paramsMap,"table_type");
String mappingId = PropertiesUtil.getStrValue(paramsMap,"mapping_id");
if(StringUtils.isEmpty(mappingId)) {
result.put("code",FAIL);
result.put("msg","Rowkey column is NULL!");
return result;
}
if (DS_HIVE.equalsIgnoreCase(srcTableType)) {
Dataset toTransDs = spark.sql(sql).persist();
totalCount = toTransDs.count();
rowCount = saveToHBase(toTransDs,tableCode,mappingId);//保存数据到hbase
toTransDs.unpersist();//释放缓存
}
if(DS_ORACLE.equalsIgnoreCase(srcTableType) && DS_HBASE.equalsIgnoreCase(tableType)){
//连接oracle库
Map options = new HashMap();
options.put("driver",PropertiesUtil.getProperty("oracle.driver"));
options.put("url",PropertiesUtil.getStrValue(paramsMap, "url"));
options.put("user",PropertiesUtil.getStrValue(paramsMap, "user"));
options.put("password",PropertiesUtil.getStrValue(paramsMap, "password"));
options.put("dbtable",srcTableCode);
Dataset oraDs = spark.read().format("jdbc").options(options).load();
oraDs.createOrReplaceTempView(srcTableCode);
Dataset oraTable = spark.sql(sql);//String sqltext="(" + sql + ") t";
oraTable.show();
totalCount = oraTable.count();
rowCount = saveToHBase(oraTable,tableCode,mappingId);//保存数据到hbase
}
if(DS_MYSQL.equalsIgnoreCase(srcTableType) && DS_HBASE.equalsIgnoreCase(tableType)){
//连接mysql库
Properties connectionProperties = new Properties();
connectionProperties.put("user", PropertiesUtil.getStrValue(paramsMap,"user"));
connectionProperties.put("password", PropertiesUtil.getStrValue(paramsMap,"password"));
String url = PropertiesUtil.getStrValue(paramsMap,"url");
Dataset mysqlTable = spark.read().jdbc(url, srcTableCode, connectionProperties);
mysqlTable.show();
totalCount = mysqlTable.count();
rowCount = saveToHBase(mysqlTable,tableCode,mappingId);//保存数据到hbase
}
long end = System.currentTimeMillis();
result.put("code",SUCCESS);
String msgs = "保存到HBase执行完成,源数据量:"+totalCount+"条,耗时:"+(end-start)/1000+"s";//注意不要用英文:
if(totalCount != rowCount){
result.put("code",FAIL);
msgs += ";保存到HBase数据量:" + rowCount + ",数据量不一致";
}
result.put("msg",msgs);
System.out.println(result);
} catch (Exception e) {
result.put("code",FAIL);
result.put("msg",e.getMessage());
e.printStackTrace();
}
return result;
}
private Long saveToHBase(Dataset resultDs, String tableCode, final String mappingId) throws Exception {
String tableName = tableCode.toLowerCase();//表名
final String columnFamily = "columnFamily";//列簇名
Configuration config = spark.sparkContext().hadoopConfiguration();
config.set("hbase.zookeeper.quorum", PropertiesUtil.getProperty("hbase.zookeeper.quorum"));
config.set("hbase.zookeeper.property.clientPort",PropertiesUtil.getProperty("hbase.zookeeper.property.clientPort"));
config.set("hbase.master", PropertiesUtil.getProperty("hbase.master"));
config.set("hbase.cluster.distributed", PropertiesUtil.getProperty("hbase.cluster.distributed","true"));
config.set("zookeeper.session.timeout", PropertiesUtil.getProperty("zookeeper.session.timeout","30000"));
config.set("hbase.hregion.majorcompaction", PropertiesUtil.getProperty("hbase.hregion.majorcompaction","0"));
config.set("hbase.regionserver.regionSplitLimit", PropertiesUtil.getProperty("hbase.regionserver.regionSplitLimit","1"));
config.set("dfs.client.socket-timeout", PropertiesUtil.getProperty("dfs.client.socket-timeout","60000"));
config.set("hbase.regionserver.handler.count", PropertiesUtil.getProperty("hbase.regionserver.handler.count","20"));
Job hbaseJob = Job.getInstance(config,"spark-hbase");
hbaseJob.setOutputFormatClass(org.apache.hadoop.hbase.mapreduce.TableOutputFormat.class);
hbaseJob.getConfiguration().set(TableInputFormat.INPUT_TABLE, tableName);
hbaseJob.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, tableName);
Connection connection = ConnectionFactory.createConnection(config);
Admin admin = connection.getAdmin();
//删除无需保留的数据
String regex = "^" + tableName + "$";
TableName[] tableNames = admin.listTableNames(regex);//查询所有表名
for (TableName tname : tableNames) {
admin.disableTable(tname); //禁用表
admin.deleteTable(tname); //删除表
}
//新建表
TableName htableName = TableName.valueOf(tableName);
HTableDescriptor hbaseTable = new HTableDescriptor(htableName);
hbaseTable.addFamily(new HColumnDescriptor(columnFamily));//列簇默认
admin.createTable(hbaseTable);
final String[] mids = mappingId.split(",");
JavaPairRDD hbasePuts = resultDs.javaRDD().mapToPair(new PairFunction() {
@Override
public Tuple2 call(Row row) throws Exception {
StructType schema = row.schema();
Iterator it = schema.iterator();
Iterator it1 = schema.iterator();
//获取rowkey值
int i = 0;
List values = new ArrayList<>();
while (it.hasNext()) {
StructField next = it.next();
for(String mid : mids){
if(mid.equalsIgnoreCase(next.name())){
values.add(row.getString(i));
}
}
i++;
}
Put put = new Put(Bytes.toBytes(StringUtils.join(values,":")));//连接多个字段
//设置数据
i = 0;
while (it1.hasNext()) {
StructField next = it1.next();
Object value = row.get(i++);
String v = (value == null) ? "" :String.valueOf(value);
put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes(next.name()), Bytes.toBytes(v));
}
return new Tuple2(new ImmutableBytesWritable(), put);
}
});
//保存数据
hbasePuts.saveAsNewAPIHadoopDataset(hbaseJob.getConfiguration());
//数据稽核
String coprocessorClassName = "org.apache.hadoop.hbase.coprocessor.AggregateImplementation";
admin.disableTable(htableName);
hbaseTable.addCoprocessor(coprocessorClassName);
admin.modifyTable(htableName, hbaseTable);
admin.enableTable(htableName);
AggregationClient ac = new AggregationClient(config);
Scan scan = new Scan();
scan.addFamily(Bytes.toBytes(columnFamily));
ColumnInterpreter longColumnInterpreter = new LongColumnInterpreter();
long rowCount = 0;
try {
rowCount = ac.rowCount(htableName,longColumnInterpreter,scan);
} catch (Throwable throwable) {
throwable.printStackTrace();
}
System.out.println("hbase rowCount:"+rowCount);
admin.close();
connection.close();
return rowCount;
}
}
/**
 * Static access to the settings in {@code config.properties} (loaded once from the
 * classpath) plus a lenient string lookup for request parameter maps.
 */
public class PropertiesUtil {
    private static final Properties prop = new Properties();

    static {
        // Load the configuration; when the file is missing or unreadable the table stays
        // empty and only the caller-supplied defaults apply.
        ClassLoader loader = Thread.currentThread().getContextClassLoader();
        if (loader == null) {
            loader = PropertiesUtil.class.getClassLoader();
        }
        try (InputStream inputStream = loader.getResourceAsStream("config.properties")) {
            if (inputStream == null) {
                System.err.println("config.properties not found on the classpath");
            } else {
                prop.load(inputStream);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * @param property setting name
     * @return the configured value, or {@code null} when the setting is absent
     */
    public static String getProperty(String property) {
        return prop.getProperty(property);
    }

    /**
     * @param property     setting name
     * @param defaultValue value to use when the setting is absent or empty
     * @return the configured value, or {@code defaultValue}
     */
    public static String getProperty(String property, String defaultValue) {
        String value = prop.getProperty(property);
        if (value == null || value.isEmpty()) {
            return defaultValue;
        }
        return value;
    }

    /**
     * Looks {@code key} up in {@code map}, first exactly and then ignoring case, and
     * returns the value's string form.
     *
     * @param map parameter map; may be {@code null}; keys that are not strings are ignored
     *            by the case-insensitive pass
     * @param key name to look up
     * @return the value as a string, or {@code ""} when the map is null/empty, the key is
     *         blank, no entry matches, or the matching value is {@code null}
     */
    public static String getStrValue(Map<?, ?> map, String key) {
        if (map == null || map.isEmpty() || isBlank(key)) {
            return "";
        }
        Object exact = map.get(key);
        if (exact != null) {
            return exact.toString();
        }
        for (Map.Entry<?, ?> entry : map.entrySet()) {
            Object name = entry.getKey();
            if (name instanceof String && ((String) name).equalsIgnoreCase(key)) {
                Object value = entry.getValue();
                return value == null ? "" : value.toString();
            }
        }
        return "";
    }

    /** True for {@code null}, the empty string, or a string made only of whitespace. */
    private static boolean isBlank(String text) {
        if (text == null) {
            return true;
        }
        for (int i = 0; i < text.length(); i++) {
            if (!Character.isWhitespace(text.charAt(i))) {
                return false;
            }
        }
        return true;
    }
}
# Spark settings
spark.sql.broadcastTimeout=3600
spark.network.timeout=1400s
spark.sql.parquet.compression.codec=snappy
################################################################
# Supported source types: [hive, oracle, mysql]
# Oracle settings, used when the source type is oracle
oracle.driver=oracle.jdbc.driver.OracleDriver
################################################################
# Supported target types: [es, hbase]
# Elasticsearch settings, used when the target type is es
es.nodes=hostes
es.port=9200
# 4000 works around "Could not write all entries"
es.batch.size.bytes=20000000
es.batch.size.entries=5000
es.batch.write.refresh=true
es.batch.write.retry.count=50
es.batch.write.retry.wait=500
es.http.timeout=5m
es.http.retries=50
es.action.heart.beat.lead=50
es.nodes.wan.only=true
es.nodes.data.only =false
es.nodes.discovery=false
# NOTE(review): elasticsearch-hadoop appears to document this key as a boolean - confirm 50000 is intended
es.input.use.sliced.partitions=50000
es.input.max.docs.per.partition=100000
# NOTE(review): key looks like a truncated "Accept-Language" header name - confirm the intended header
es.net.http.header.Accept-Languag=gzip
# HBase settings, used when the target type is hbase
hbase.zookeeper.quorum=hosthbase
hbase.zookeeper.property.clientPort=2181
hbase.master=hosthbase:60000
hbase.cluster.distributed=true
zookeeper.session.timeout=1200000
hbase.hregion.majorcompaction=0
hbase.regionserver.regionSplitLimit=150
dfs.client.socket-timeout=60000
hbase.regionserver.handler.count=50
/**
 * Minimal client for the migration server: connects over TCP, sends one serialized
 * parameter map and prints the single reply line.
 */
public class MyClient {
    private Socket socket;

    /**
     * Opens a connection to the server.
     *
     * @param address host name or IP address of the server
     * @param port    server port
     * @throws IOException if the host cannot be resolved or the connection fails;
     *                     the half-open socket is closed before the exception propagates
     */
    public MyClient (String address, int port) throws IOException {
        Socket candidate = new Socket();
        try {
            candidate.connect(new InetSocketAddress(InetAddress.getByName(address), port));
        } catch (IOException | RuntimeException e) {
            try {
                candidate.close();
            } catch (IOException ignored) {
                // the connect failure is the error worth reporting
            }
            throw e;
        }
        socket = candidate;
    }

    /** @return the connected socket */
    public Socket getSocket(){
        return socket;
    }

    /**
     * Sends a sample Hive-to-HBase request and prints the server's answer.
     *
     * @param args optional: {@code args[0]} server host (default {@code hostss}),
     *             {@code args[1]} server port (default 2018)
     */
    public static void main(String[] args){
        String host = (args != null && args.length > 0) ? args[0] : "hostss";
        int port = (args != null && args.length > 1) ? Integer.parseInt(args[1]) : 2018;

        // The server expects a serialized map of request parameters.
        Map<String, Object> paramMap = new HashMap<String, Object>();
        paramMap.put("method","doImportToHBase");
        paramMap.put("sql","select * from hive_test.big_data_test where p_day = 20181220");
        paramMap.put("table_code","big_data_test_hbase");
        paramMap.put("src_table_type","hive");
        paramMap.put("table_type","hbase");
        paramMap.put("mapping_id","pk_id,phone_no");

        // Resources are closed in reverse order: reader, object stream, then the socket.
        try (Socket socket = new MyClient(host, port).getSocket();
             ObjectOutputStream oos = new ObjectOutputStream(socket.getOutputStream());
             BufferedReader buffer = new BufferedReader(new InputStreamReader(socket.getInputStream()))) {
            oos.writeObject(paramMap);
            oos.flush();
            // The server answers with exactly one line once the job has finished.
            String line = buffer.readLine();
            System.out.println("来自服务端的数据:"+line);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
# Starts MyServer as a background Spark driver on YARN and appends its output to
# spark-es-hbase.log. stderr is redirected explicitly so stack traces and Spark logs
# land in the same file even when the command is not started from a terminal.
# NOTE(review): no application JAR (or port argument) follows the options below;
# spark-submit normally takes the JAR path as its first positional argument -
# confirm it was not lost when this command was copied.
nohup /home/hadoop/sparkForThrift/bin/spark-submit \
--class com.sd.data.spark.MyServer \
--driver-java-options -Xss10m \
--conf spark.executor.extraJavaOptions="-XX:+UseParNewGC -XX:+UseConcMarkSweepGC -XX:-CMSConcurrentMTEnabled -XX:CMSInitiatingOccupancyFraction=70 -XX:+CMSParallelRemarkEnabled" \
--driver-memory 50G \
--executor-memory 50G \
--executor-cores 5 \
--conf spark.default.parallelism=1000 \
--name spark-es-hbase \
--queue root.ses \
--master yarn \
>> spark-es-hbase.log 2>&1 &