http://blog.csdn.net/fansy1990/article/details/48001013
Software versions:
Spark 1.4.1, Hadoop 2.6, Scala 2.10.5, MyEclipse 2014, IntelliJ IDEA 14, JDK 1.8, Tomcat 7
Machines:
Windows 7 (with JDK 1.8, MyEclipse 2014, IntelliJ IDEA 14, Tomcat 7);
CentOS 6.6 VM (Hadoop pseudo-distributed cluster, Spark standalone cluster, JDK 1.8);
CentOS 7 VM (Tomcat, JDK 1.8);
1. Scenarios:
1. A plain Java program on Windows calls Spark and runs a Spark job written in Scala, in two modes:
1> submit the job to a Spark cluster and run it in standalone mode;
2> submit the job to a YARN cluster in yarn-client mode;
2. A Java web application developed on Windows calls Spark and runs the Scala Spark job, in the same two modes as in 1.
3. The Java web application runs on Linux and calls Spark to run the Scala Spark job, again in the two modes of 1.
2. Implementation:
1. A simple Scala program that reads a log file from HDFS, keeps only the WARN and ERROR records, and writes the filtered records back to HDFS. The code is as follows:
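A minimal version of the program is shown below; the object name Scala_Test is assumed from the --class argument used in the submit code later on.

import org.apache.spark.{SparkConf, SparkContext}

/**
 * Created by Administrator on 2015/8/23.
 * Reads a log file from HDFS, keeps WARN/ERROR lines, writes them back to HDFS.
 */
object Scala_Test {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Scala filter")
    val sc = new SparkContext(conf)

    // args(0): input log file on HDFS, args(1): output directory on HDFS
    val lines = sc.textFile(args(0))
    val errorsRDD = lines.filter(line => line.contains("ERROR"))
    val warningsRDD = lines.filter(line => line.contains("WARN"))
    val badLinesRDD = errorsRDD.union(warningsRDD)

    badLinesRDD.saveAsTextFile(args(1))
  }
}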
Build it with IntelliJ IDEA into a jar for later use (the author names it spark_filter.jar here);
2. Java calls the Scala_Test class in spark_filter.jar, using Spark standalone mode. The Java code is as follows:
package test;

import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.spark.deploy.SparkSubmit;

public class SubmitScalaJobToSpark {

    public static void main(String[] args) {
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd-hh-mm-ss");
        String filename = dateFormat.format(new Date());
        // The classloader resource path ends with "classes/" (e.g. .../WEB-INF/classes/);
        // dropping the last 8 characters leaves the parent directory, so tmp + "lib/..."
        // points at WEB-INF/lib/.
        String tmp = Thread.currentThread().getContextClassLoader().getResource("").getPath();
        tmp = tmp.substring(0, tmp.length() - 8);
        String[] arg0 = new String[]{
                "--master", "spark://node101:7077",
                "--deploy-mode", "client",
                "--name", "test java submit job to spark",
                "--class", "Scala_Test",
                "--executor-memory", "1G",
                tmp + "lib/spark_filter.jar",                               // application jar
                "hdfs://node101:8020/user/root/log.txt",                    // input
                "hdfs://node101:8020/user/root/badLines_spark_" + filename  // output
        };

        SparkSubmit.main(arg0);
    }
}
Concretely: create a Java web project in MyEclipse and copy spark_filter.jar and spark-assembly-1.4.1-hadoop2.6.0.jar (found in the lib directory of the Spark distribution; it is large, so copying takes a while) into WebRoot/WEB-INF/lib. (Note: you can build the Java web project directly; to test the plain Java call, just run the Java code, and to test the web project, just start Tomcat.) The resulting WEB-INF layout is sketched below.
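Roughly, the relevant part of the project then looks like this (only the files that matter here; other MyEclipse-generated files are omitted):

WebRoot/
  WEB-INF/
    web.xml
    classes/        (classloader resource root referenced in the Java code above)
    lib/
      spark_filter.jar
      spark-assembly-1.4.1-hadoop2.6.0.jar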
Java calls the Scala_Test class in spark_filter.jar in YARN mode. In YARN mode it is not enough to simply change the master to "yarn-client" or "yarn-cluster": that works with spark-shell or spark-submit when HADOOP_CONF_DIR is configured, but here the Hadoop configuration cannot be read that way, so a different approach is taken and the job is submitted through yarn.Client. The Java code is as follows:
package test;

import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.hadoop.conf.Configuration;
import org.apache.spark.SparkConf;
import org.apache.spark.deploy.yarn.Client;
import org.apache.spark.deploy.yarn.ClientArguments;

public class SubmitScalaJobToYarn {

    public static void main(String[] args) {
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd-hh-mm-ss");
        String filename = dateFormat.format(new Date());
        // Same trick as above: strip "classes/" to get the WEB-INF/ directory.
        String tmp = Thread.currentThread().getContextClassLoader().getResource("").getPath();
        tmp = tmp.substring(0, tmp.length() - 8);
        String[] arg0 = new String[]{
                "--name", "test java submit job to yarn",
                "--class", "Scala_Test",
                "--executor-memory", "1G",
                "--jar", tmp + "lib/spark_filter.jar",
                "--arg", "hdfs://node101:8020/user/root/log.txt",
                "--arg", "hdfs://node101:8020/user/root/badLines_yarn_" + filename,
                "--addJars", "hdfs://node101:8020/user/root/servlet-api.jar",
                "--archives", "hdfs://node101:8020/user/root/servlet-api.jar"
        };

        // Hadoop/YARN settings that would normally come from HADOOP_CONF_DIR.
        Configuration conf = new Configuration();
        String os = System.getProperty("os.name");
        boolean cross_platform = false;
        if (os.contains("Windows")) {
            cross_platform = true;
        }
        conf.setBoolean("mapreduce.app-submission.cross-platform", cross_platform);
        conf.set("fs.defaultFS", "hdfs://node101:8020"); // the NameNode
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.address", "node101:8032");
        conf.set("yarn.resourcemanager.scheduler.address", "node101:8030");
        conf.set("mapreduce.jobhistory.address", "node101:10020");

        System.setProperty("SPARK_YARN_MODE", "true");

        SparkConf sparkConf = new SparkConf();
        ClientArguments cArgs = new ClientArguments(arg0, sparkConf);

        new Client(cArgs, conf, sparkConf).run();
    }
}
3. Java web test of submitting the job in both modes. The simplest possible approach is used: servlets configured directly in web.xml, which looks as follows:
<?xml version="1.0" encoding="UTF-8"?>
<web-app version="3.0"
    xmlns="http://java.sun.com/xml/ns/javaee"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://java.sun.com/xml/ns/javaee http://java.sun.com/xml/ns/javaee/web-app_3_0.xsd">
  <servlet>
    <description>This is the description of my J2EE component</description>
    <display-name>This is the display name of my J2EE component</display-name>
    <servlet-name>SparkServlet</servlet-name>
    <servlet-class>servlet.SparkServlet</servlet-class>
  </servlet>
  <servlet>
    <description>This is the description of my J2EE component</description>
    <display-name>This is the display name of my J2EE component</display-name>
    <servlet-name>YarnServlet</servlet-name>
    <servlet-class>servlet.YarnServlet</servlet-class>
  </servlet>

  <servlet-mapping>
    <servlet-name>SparkServlet</servlet-name>
    <url-pattern>/servlet/SparkServlet</url-pattern>
  </servlet-mapping>
  <servlet-mapping>
    <servlet-name>YarnServlet</servlet-name>
    <url-pattern>/servlet/YarnServlet</url-pattern>
  </servlet-mapping>

</web-app>
SparkServlet is as follows:
package servlet;

import java.io.IOException;
import java.io.PrintWriter;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import test.SubmitScalaJobToSpark;

public class SparkServlet extends HttpServlet {

    /**
     * Constructor of the object.
     */
    public SparkServlet() {
        super();
    }

    /**
     * Destruction of the servlet.
     */
    public void destroy() {
        super.destroy();
    }

    /**
     * The doGet method of the servlet; simply delegates to doPost.
     */
    public void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        this.doPost(request, response);
    }

    /**
     * The doPost method of the servlet: submits the Spark job and returns a
     * minimal HTML page.
     */
    public void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        System.out.println("Starting the SubmitScalaJobToSpark call......");
        SubmitScalaJobToSpark.main(null);
        System.out.println("Finished the SubmitScalaJobToSpark call!");

        response.setContentType("text/html");
        PrintWriter out = response.getWriter();
        out.println("<html>");
        out.println("  <head><title>A Servlet</title></head>");
        out.println("  <body>");
        out.print("    This is ");
        out.print(this.getClass());
        out.println(", using the POST method");
        out.println("  </body>");
        out.println("</html>");
        out.flush();
        out.close();
    }

    /**
     * Initialization of the servlet.
     */
    public void init() throws ServletException {
        // Put your code here
    }

}
The servlet merely calls the Java submission class; SparkServlet and YarnServlet differ only in which class they call, as sketched below.
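A minimal sketch of YarnServlet, assuming it mirrors SparkServlet and only swaps in the YARN submission class:

package servlet;

import java.io.IOException;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import test.SubmitScalaJobToYarn;

public class YarnServlet extends HttpServlet {

    public void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        this.doPost(request, response);
    }

    public void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        // Only this call differs from SparkServlet.
        SubmitScalaJobToYarn.main(null);

        response.setContentType("text/html");
        response.getWriter().println("Submitted the job to YARN via " + this.getClass());
    }
}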
For the web test, first test directly from MyEclipse, then copy the project's WebRoot to the CentOS 7 machine, start Tomcat there, and test again.
3. Summary and problems
1. Test results:
1> Submitting the job to Spark and to YARN directly from Java code to filter the log file runs successfully. The relevant information shows up in the YARN and Spark monitoring UIs.
The output files can also be seen in HDFS.
2> For Java web submission to Spark and YARN, the javax.servlet folder must first be deleted from spark-assembly-1.4.1-hadoop2.6.0.jar, because it conflicts with Tomcat's servlet-api.jar (one possible command is sketched after this list).
a. Starting Tomcat on Windows and on Linux and submitting the job to Spark standalone runs successfully;
b. Starting Tomcat on Windows and on Linux and submitting the job to YARN fails;
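One way to delete those entries, assuming the zip utility is available (a sketch, not necessarily the author's exact command, since jars are ordinary zip archives):

zip -d spark-assembly-1.4.1-hadoop2.6.0.jar "javax/servlet/*"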
2. Problems encountered:
1> Submitting the job to YARN from the Java web application fails, with the following key log lines:
15/08/25 11:35:48 ERROR yarn.ApplicationMaster: User class threw exception: java.lang.NoClassDefFoundError: javax/servlet/http/HttpServletResponse
java.lang.NoClassDefFoundError: javax/servlet/http/HttpServletResponse
This is because the javax.servlet packages were deleted from the assembly jar to avoid the conflict with Tomcat.
The log also shows:
15/08/26 12:39:27 INFO Client: Setting up container launch context for our AM
15/08/26 12:39:27 INFO Client: Preparing resources for our AM container
15/08/26 12:39:27 INFO Client: Uploading resource file:/D:/workspase_scala/SparkWebTest/WebRoot/WEB-INF/lib/spark-assembly-1.4.1-hadoop2.6.0.jar -> hdfs://node101:8020/user/Administrator/.sparkStaging/application_1440464833795_0012/spark-assembly-1.4.1-hadoop2.6.0.jar
15/08/26 12:39:32 INFO Client: Uploading resource file:/D:/workspase_scala/SparkWebTest/WebRoot/WEB-INF/lib/spark_filter.jar -> hdfs://node101:8020/user/Administrator/.sparkStaging/application_1440464833795_0012/spark_filter.jar
15/08/26 12:39:33 INFO Client: Uploading resource file:/C:/Users/Administrator/AppData/Local/Temp/spark-46820caf-06e0-4c51-a479-3bb35666573f/__hadoop_conf__5465819424276830228.zip -> hdfs://node101:8020/user/Administrator/.sparkStaging/application_1440464833795_0012/__hadoop_conf__5465819424276830228.zip
15/08/26 12:39:33 INFO Client: Source and destination file systems are the same. Not copying hdfs://node101:8020/user/root/servlet-api.jar
15/08/26 12:39:33 WARN Client: Resource hdfs://node101:8020/user/root/servlet-api.jar added multiple times to distributed cache.
During environment initialization two jars are uploaded: spark-assembly-1.4.1-hadoop2.6.0.jar and our own jar. The uploaded spark-assembly-1.4.1-hadoop2.6.0.jar no longer contains the javax.servlet folder, hence the error. Calling directly from Java (before javax.servlet was deleted) produces the same upload log and works fine, which confirms that deleting the package folder is indeed the cause. So how can this be fixed?
Upload servlet-api.jar to HDFS and pass extra options when submitting through yarn.Client. Two options look relevant, --addJars and --archives; after adding both, the log does show the jar being distributed as a shared file for the job, yet the Java web submission to YARN still fails with the same class-not-found error, so this approach does not work either. (The deployment section of http://blog.csdn.NET/fansy1990/article/details/52289826 describes a way to solve this problem.)
Submitting jobs to YARN through yarn.Client follows http://blog.sequenceiq.com/blog/2014/08/22/spark-submit-in-java/ .