<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.vincent</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <flink.version>1.8.2</flink.version>
        <java.version>1.8</java.version>
        <scala.binary.version>2.11</scala.binary.version>
        <hadoop.version>2.8.5</hadoop.version>
        <maven.compiler.source>${java.version}</maven.compiler.source>
        <maven.compiler.target>${java.version}</maven.compiler.target>
    </properties>

    <repositories>
        <repository>
            <id>apache.snapshots</id>
            <name>Apache Development Snapshot Repository</name>
            <url>https://repository.apache.org/content/repositories/snapshots/</url>
            <releases>
                <enabled>false</enabled>
            </releases>
            <snapshots>
                <enabled>true</enabled>
            </snapshots>
        </repository>
    </repositories>

    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>${flink.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-elasticsearch6_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-hadoop-compatibility_2.11</artifactId>
            <version>1.8.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.41</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.7</version>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.6</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>${java.version}</source>
                    <target>${java.version}</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.0.0</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <artifactSet>
                                <excludes>
                                    <exclude>org.apache.flink:force-shading</exclude>
                                    <exclude>com.google.code.findbugs:jsr305</exclude>
                                    <exclude>org.slf4j:*</exclude>
                                    <exclude>log4j:*</exclude>
                                </excludes>
                            </artifactSet>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <transformers>
                                <transformer
                                        implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>com.vincent.Test</mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
        <pluginManagement>
            <plugins>
                <plugin>
                    <groupId>org.eclipse.m2e</groupId>
                    <artifactId>lifecycle-mapping</artifactId>
                    <version>1.0.0</version>
                    <configuration>
                        <lifecycleMappingMetadata>
                            <pluginExecutions>
                                <pluginExecution>
                                    <pluginExecutionFilter>
                                        <groupId>org.apache.maven.plugins</groupId>
                                        <artifactId>maven-shade-plugin</artifactId>
                                        <versionRange>[3.0.0,)</versionRange>
                                        <goals>
                                            <goal>shade</goal>
                                        </goals>
                                    </pluginExecutionFilter>
                                    <action>
                                        <ignore/>
                                    </action>
                                </pluginExecution>
                                <pluginExecution>
                                    <pluginExecutionFilter>
                                        <groupId>org.apache.maven.plugins</groupId>
                                        <artifactId>maven-compiler-plugin</artifactId>
                                        <versionRange>[3.1,)</versionRange>
                                        <goals>
                                            <goal>testCompile</goal>
                                            <goal>compile</goal>
                                        </goals>
                                    </pluginExecutionFilter>
                                    <action>
                                        <ignore/>
                                    </action>
                                </pluginExecution>
                            </pluginExecutions>
                        </lifecycleMappingMetadata>
                    </configuration>
                </plugin>
            </plugins>
        </pluginManagement>
    </build>

    <profiles>
        <profile>
            <id>add-dependencies-for-IDEA</id>
            <activation>
                <property>
                    <name>idea.version</name>
                </property>
            </activation>
            <dependencies>
                <dependency>
                    <groupId>org.apache.flink</groupId>
                    <artifactId>flink-java</artifactId>
                    <version>${flink.version}</version>
                    <scope>compile</scope>
                </dependency>
                <dependency>
                    <groupId>org.apache.flink</groupId>
                    <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
                    <version>${flink.version}</version>
                    <scope>compile</scope>
                </dependency>
            </dependencies>
        </profile>
    </profiles>
</project>
package com.vincent;

import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.connectors.elasticsearch.ActionRequestFailureHandler;
import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkFunction;
import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer;
import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink;
import org.apache.flink.util.ExceptionUtils;
import org.apache.http.HttpHost;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.action.ActionRequest;
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;

import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.List;

public class ElasticSearchSinkUtil {

    /**
     * Parses a comma-separated "host:port,host:port" string into a list of HttpHost instances.
     */
    public static List<HttpHost> getEsAddresses(String hosts) {
        String[] hostList = hosts.split(",");
        List<HttpHost> addresses = new ArrayList<>();
        for (String host : hostList) {
            String[] ip_port = host.split(":");
            String ip = ip_port[0];
            String port = ip_port[1];
            addresses.add(new HttpHost(ip, Integer.parseInt(port)));
        }
        return addresses;
    }

    /**
     * Attaches an Elasticsearch sink with a simple failure handler to the given stream.
     */
    public static <T> void addSink(List<HttpHost> hosts, int bulkFlushMaxActions, int parallelism,
                                   SingleOutputStreamOperator<T> data, ElasticsearchSinkFunction<T> func) {
        ElasticsearchSink.Builder<T> esSinkBuilder = new ElasticsearchSink.Builder<>(hosts, func);
        esSinkBuilder.setBulkFlushMaxActions(bulkFlushMaxActions);
        esSinkBuilder.setFailureHandler(new ActionRequestFailureHandler() {
            @Override
            public void onFailure(ActionRequest actionRequest, Throwable throwable, int i, RequestIndexer requestIndexer) throws Throwable {
                String description = actionRequest.getDescription();
                System.out.println("----------");
                System.out.println(description);
                System.out.println("===========");
                if (ExceptionUtils.findThrowable(throwable, SocketTimeoutException.class).isPresent()) {
                    System.out.println("Socket timeout");
                } else if (ExceptionUtils.findThrowable(throwable, EsRejectedExecutionException.class).isPresent()) {
                    // Case 1: the ES bulk queue is full (rejected execution), so put the request back into the queue
                    System.out.println("ES bulk queue is full");
                    requestIndexer.add(actionRequest);
                } else if (ExceptionUtils.findThrowable(throwable, ElasticsearchParseException.class).isPresent()) {
                    System.out.println("Parse exception: " + description);
                } else if (ExceptionUtils.findThrowable(throwable, ElasticsearchException.class).isPresent()) {
                    System.out.println("Elasticsearch exception");
                }
            }
        });
        data.addSink(esSinkBuilder.build()).setParallelism(parallelism);
    }
}
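For reference, getEsAddresses only splits the comma-separated host list into HttpHost objects. A minimal sketch of how it behaves, using the host names from the sample configuration below (this snippet is an illustration, not part of the original project):

// Hypothetical quick check of getEsAddresses (illustration only)
List<HttpHost> hosts = ElasticSearchSinkUtil.getEsAddresses("swarm-manager:9200,swarm-worker1:9200");
System.out.println(hosts.get(0)); // prints "http://swarm-manager:9200" (HttpHost defaults to the http scheme)

The job itself lives in com.vincent.Test (the main class configured in the shade plugin): it reads the raw file, maps each line to a Tuple7, and writes the records to Elasticsearch.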
package com.vincent;

import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.api.java.tuple.Tuple7;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkFunction;
import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer;
import org.apache.http.HttpHost;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.Requests;
import org.elasticsearch.common.xcontent.*;

import java.io.IOException;
import java.util.List;

public class Test {
    public static void main(String[] args) throws Exception {
        // The path of the properties file is passed as the first program argument
        String propertiesPath = args[0];
        ParameterTool parameterTool = ParameterTool.fromPropertiesFile(propertiesPath);
        List<HttpHost> esAddresses = ElasticSearchSinkUtil.getEsAddresses(parameterTool.get("es.hosts"));
        int bulk_size = parameterTool.getInt("es.bulk.flushMaxAction");
        int sinkParallelism = parameterTool.getInt("es.sink.parallelism");
        String rawPath = parameterTool.get("rawPath");

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> dataStreamSource = env.readTextFile(rawPath);

        // Split each tab-separated line into a Tuple7 of string fields
        SingleOutputStreamOperator<Tuple7<String, String, String, String, String, String, String>> map = dataStreamSource.map(new MapFunction<String, Tuple7<String, String, String, String, String, String, String>>() {
            @Override
            public Tuple7<String, String, String, String, String, String, String> map(String s) throws Exception {
                String[] splits = s.split("\t");
                String field1 = splits[0];
                String field2 = splits[1];
                String field3 = splits[2];
                String field4 = splits[3];
                String field5 = splits[4];
                String field6 = splits[5];
                String field7 = splits[6];
                return new Tuple7<>(field1, field2, field3, field4, field5, field6, field7);
            }
        });

        ElasticSearchSinkUtil.addSink(esAddresses, bulk_size, sinkParallelism, map, new ElasticsearchSinkFunction<Tuple7<String, String, String, String, String, String, String>>() {
            @Override
            public void process(Tuple7<String, String, String, String, String, String, String> data, RuntimeContext runtimeContext, RequestIndexer requestIndexer) {
                IndexRequest indexRequest = null;
                try {
                    indexRequest = createIndexRequest(data);
                } catch (IOException e) {
                    e.printStackTrace();
                }
                requestIndexer.add(indexRequest);
            }

            public IndexRequest createIndexRequest(Tuple7<String, String, String, String, String, String, String> data) throws IOException {
                JSONObject jsonObject = new JSONObject();
                jsonObject.put("field1", data.f0);
                jsonObject.put("field2", data.f1);
                // field3 and field4 arrive as JSON strings, so index them as nested objects
                jsonObject.put("field3", JSONObject.parseObject(data.f2));
                jsonObject.put("field4", JSONObject.parseObject(data.f3));
                jsonObject.put("field5", data.f4);
                jsonObject.put("field6", data.f5);
                jsonObject.put("field7", data.f6);
                return Requests.indexRequest()
                        .index("my_index")
                        .type("type")
                        .source(jsonObject.toString(), XContentType.JSON);
            }
        });
        // map.setParallelism(1).print();
        env.execute("Test");
    }
}
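The map function above assumes every input line carries seven tab-separated fields, with the third and fourth fields holding JSON strings. A hypothetical line, made up here purely to illustrate the expected shape (the real 000000_0 data is not shown in this post):

// Hypothetical input line for illustration only
String sampleLine = "u001\t1570000000\t{\"k\":\"v\"}\t{\"id\":1}\t3\t0\t2019-10-01";
String[] parts = sampleLine.split("\t"); // 7 elements; parts[2] and parts[3] parse with JSONObject.parseObject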
The configuration file (flink-es.properties) can be adjusted flexibly as needed:
es.hosts=swarm-manager:9200,swarm-worker1:9200,swarm-worker2:9200
es.bulk.flushMaxAction=200
es.sink.parallelism=1
# hdfs: hdfs://swarm-manager:9001/text/000000_0, windows: E:/test/hello.txt
# rawPath=hdfs://swarm-manager:9001/text/000000_0
rawPath=E:/test/000000_0
Package the application with mvn package and copy the generated hadoop-hdfs-1.0-SNAPSHOT-shaded.jar to the server.
Start the cluster with ./flink-1.8.2/bin/start-cluster.sh.
Then submit the job with: flink run ./hadoop-hdfs-1.0-SNAPSHOT-shaded.jar ./flink-es.properties
Open http://<server IP>:8081 in a browser to monitor how the job is running.
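If the web UI is not reachable, the running jobs can also be listed from the command line with the standard Flink CLI (not covered in the original steps): ./flink-1.8.2/bin/flink list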