一、开发环境:
操作系统:win8 64位
IDE:IntelliJ IDEA
JDK:1.7
scala:scala-2.11.7(注意:Spark 1.4.1预编译包基于Scala 2.10,所以下面pom中依赖的是_2.10的构件和scala-library 2.10.4,项目实际以pom中的版本为准)
spark:linux上spark集群版本:1.4.1,本地依赖spark的jar直接拷贝linux上$SPARK_HOME/lib/spark-assembly-1.4.1-hadoop2.4.0.jar
maven关键依赖:
<dependency> <groupId>org.apache.spark</groupId> <artifactId>spark-core_2.10</artifactId> <version>1.4.1</version> </dependency> <dependency> <groupId>org.scala-lang</groupId> <artifactId>scala-library</artifactId> <version>2.10.4</version> </dependency> <!-- 进行LDA 会使用到以下jar,否则可不引入 --> <dependency> <groupId>com.github.scopt</groupId> <artifactId>scopt_2.10</artifactId> <version>3.2.0</version> </dependency> <dependency> <groupId>com.google.guava</groupId> <artifactId>guava</artifactId> <version>19.0</version> </dependency>
二、环境配置:
1、在windows下配置环境变量:SCALA_HOME,然后引入到Path中
2、确保linux上spark集群的Master的spark-env.sh中SPARK_MASTER_IP的值本机能ping通,一般有两种配置:
1)直接配置IP为10.x.x.x,确保能ping通此IP即可,Master即为:spark://10.x.x.x:7077
2)配置的为linux机器名称如Master1.Hadoop,则需要在windows的hosts文件中将其配置进来,Master为spark://Master1.Hadoop:7077
三、提交流程
1、将spark任务类打jar包,生成d://....//spark-demo.jar,此处打的jar包放在linux使用spark-submit命令行调用也能执行
2、执行任务类或采用SparkSubmit.main(args)提交
四、代码实测:
1、spark 圆周率,此程序无须提交数据文件
新建项目,引入以上pom依赖,且将spark-assembly-1.4.1-hadoop2.4.0.jar手动加入项目中,新建类MyPI.java
package com.alleyz.spark;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;

import java.util.ArrayList;
import java.util.List;

/**
 * Created by Mr on 2016/1/5.
 *
 * <p>Estimates Pi with a Monte Carlo simulation that is submitted from the local
 * JVM to the standalone Spark master configured below. Each element of the
 * parallelized data set triggers one random sample in the square [-1, 1) x [-1, 1);
 * the fraction of samples that fall inside the unit circle approximates Pi / 4.
 */
public class MyPI {

    /**
     * Builds the Spark context, runs the sampling job and prints the estimate.
     *
     * @param atrs command line arguments (unused)
     */
    public static void main(String[] atrs) {
        SparkConf conf = new SparkConf();
        conf.setAppName("alleyz-lad").setMaster("spark://Master1.Hadoop:7077");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        try {
            // Ship the jar that contains this class so executors can load the anonymous functions.
            jsc.addJar("G:\\workspace\\idea\\spark-demo\\target\\artifacts\\spark_demo_jar\\spark-demo.jar");

            int slices = 20;
            int n = 100000 * slices;

            // The element values are irrelevant; the list only fixes how many samples are drawn.
            List<Integer> l = new ArrayList<Integer>(n);
            for (int i = 0; i < n; i++) {
                l.add(i);
            }
            JavaRDD<Integer> dataSet = jsc.parallelize(l, slices);

            int count = dataSet.map(new Function<Integer, Integer>() {
                @Override
                public Integer call(Integer integer) {
                    double x = Math.random() * 2 - 1;
                    double y = Math.random() * 2 - 1;
                    // 1 when the sample lies inside the unit circle, 0 otherwise.
                    return (x * x + y * y < 1) ? 1 : 0;
                }
            }).reduce(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer integer, Integer integer2) {
                    return integer + integer2;
                }
            });

            System.out.println("Pi is roughly " + 4.0 * count / n);
        } finally {
            // Always stop the context, even when the job fails, so the application is shut down cleanly.
            jsc.stop();
        }
    }
}
先打jar包,然后点击右键运行,浏览器打开 http://Master1.Hadoop:8080 则可以看到正在执行的任务,稍后控制台会输出具体的结果,程序结束;
2、LDA
需要引入pom文件依赖,准备数据,我自己的如下(词组成的一个不规则矩阵):
...... 系统 三毛 查询 查询 核实 卡已经 目前欠 块八毛 核实 核实 账号 开通 业务 一个来电显示 包五 流量包 流量包 核实 非常抱歉 可以包一个 五十 流量包 现在办理 生效 确定办理 需要办理 办理 操作办理 电话 停机 还有一 毛三 办理一个 停机保 功能 手机 导致 手机 ....
新建java类:
package com.alleyz.spark;

import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.mllib.feature.HashingTF;
import org.apache.spark.mllib.feature.IDF;
import org.apache.spark.mllib.feature.IDFModel;
import scala.Tuple2;
import org.apache.spark.api.java.*;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.clustering.DistributedLDAModel;
import org.apache.spark.mllib.clustering.LDA;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.SparkConf;

import java.io.*;
import java.util.*;

/**
 * Created by Mr on 2016/1/4.
 *
 * <p>Runs LDA topic modelling on a local text file (one document per line,
 * tokens separated by a single space) against the standalone Spark master
 * configured below, then prints the topics and writes them to a local file.
 */
public class SparkLda {

    /**
     * Reads the documents, builds TF-IDF vectors, trains a 10-topic LDA model
     * and reports the top terms of every topic.
     *
     * @param args command line arguments (unused)
     */
    public static void main(String[] args) {
        final HashingTF hashingTF = new HashingTF(20000);

        // Load the data before touching the cluster so a missing or unreadable
        // file aborts the run instead of training on an empty data set.
        String path = "D:\\spark\\sample_lda_data.txt";
        List<String> list;
        try {
            list = readLines(path);
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }
        if (list.isEmpty()) {
            System.err.println("No documents found in " + path);
            return;
        }

        SparkConf conf = new SparkConf();
        conf.setAppName("alleyz-lda").setMaster("spark://Master1.Hadoop:7077");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // Ship the jar that contains this class so executors can load the anonymous functions.
            sc.addJar("G:\\workspace\\idea\\spark-demo\\target\\artifacts\\spark_demo_jar\\spark-demo.jar");

            // Raw data: one document per element.
            JavaRDD<String> data = sc.parallelize(list);

            // LDA needs unique, non-negative document ids. String.hashCode() can be
            // negative and repeats for identical lines, so use the element index instead.
            JavaPairRDD<Long, List<String>> javaRdd = JavaPairRDD.fromJavaRDD(
                    data.zipWithIndex().map(new Function<Tuple2<String, Long>, Tuple2<Long, List<String>>>() {
                        @Override
                        public Tuple2<Long, List<String>> call(Tuple2<String, Long> lineWithIndex) throws Exception {
                            return new Tuple2<Long, List<String>>(
                                    lineWithIndex._2(), Arrays.asList(lineWithIndex._1().split(" ")));
                        }
                    }));

            // Term-frequency vector per document.
            JavaPairRDD<Long, Vector> tfData = javaRdd.mapValues(new Function<List<String>, Vector>() {
                @Override
                public Vector call(List<String> strings) throws Exception {
                    return hashingTF.transform(strings);
                }
            });

            // Distinct vocabulary, used to translate hashed term indices back to words.
            JavaRDD<String> tokens = javaRdd.values().flatMap(new FlatMapFunction<List<String>, String>() {
                @Override
                public Iterable<String> call(List<String> strings) throws Exception {
                    return strings;
                }
            }).distinct();

            // Several words may hash to the same index, hence a multimap.
            Multimap<Integer, String> mapping = Multimaps.index(tokens.collect(),
                    new com.google.common.base.Function<String, Integer>() {
                        public Integer apply(String t) {
                            return hashingTF.indexOf(t);
                        }
                    });

            final IDFModel idfModel = new IDF().fit(tfData.values());
            JavaPairRDD<Long, Vector> tIdfData = tfData.mapValues(new Function<Vector, Vector>() {
                @Override
                public Vector call(Vector vector) throws Exception {
                    return idfModel.transform(vector);
                }
            });

            DistributedLDAModel ldaModel =
                    (DistributedLDAModel) new LDA().setK(10).setMaxIterations(100).run(tIdfData);
            Tuple2<int[], double[]>[] d = ldaModel.describeTopics(100);

            try {
                writeTopics("D:\\spark\\lda-out", d, mapping);
            } catch (IOException e) {
                e.printStackTrace();
            }
        } finally {
            // Always stop the context, even when the job fails.
            sc.stop();
        }
    }

    /**
     * Reads every line of the given file into memory.
     * The platform default charset is used for decoding.
     * NOTE(review): consider passing an explicit charset matching the data file.
     */
    private static List<String> readLines(String path) throws IOException {
        List<String> lines = new ArrayList<>();
        try (BufferedReader reader =
                     new BufferedReader(new InputStreamReader(new FileInputStream(path)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        }
        return lines;
    }

    /**
     * Prints each topic's terms and weights to stdout and writes the same report to {@code out}.
     * Term indices with no known word in {@code mapping} are skipped.
     */
    private static void writeTopics(String out, Tuple2<int[], double[]>[] topics,
                                    Multimap<Integer, String> mapping) throws IOException {
        // FileOutputStream creates the file when it does not exist yet.
        try (BufferedWriter bw =
                     new BufferedWriter(new OutputStreamWriter(new FileOutputStream(out)))) {
            for (int i = 0; i < topics.length; i++) {
                int[] termIndices = topics[i]._1();
                double[] termWeights = topics[i]._2();

                System.out.println("Topic " + i + ":");
                bw.write("Topic " + i + ":\r\n");
                for (int j = 0; j < termIndices.length; j++) {
                    Collection<String> col = mapping.get(termIndices[j]);
                    if (col.isEmpty()) {
                        continue;
                    }
                    System.out.println("" + col + " " + termWeights[j]);
                    bw.write("" + col + " " + termWeights[j] + "\r\n");
                }
                bw.write("\r\n-----------------------------------------------");
            }
            bw.flush();
        }
    }
}
代码写完,build artifacts 打jar包,然后右键运行,就可以看到结果:
Topic 0: [密码] 0.009866712747027252 [设置] 0.009242580065070845 [手机] 0.007976218361447634 [电脑, 音量] 0.007185679520142035 [声音] 0.006859034109454243 [发给] 0.006323283466899422 [实用, 号码] 0.005875648054184159 [姓名] 0.00514309514424346 [身份证] 0.00442075176344923 [开通, 详细咨询] 0.0044181003876376 [信息] 0.0042643124381648205 [听到] 0.0042532126807515915 [操作] 0.004231734979013577 ..... -----------------------------------------------Topic 2: [宽带] 0.0193832008376721 [活动] 0.015378671944555595 [账号] 0.009303726013777125 [刷新一下] 0.009030500798620545 [刷新] 0.007823174929503695 [宽带账号] 0.007740455193261026 [实用, 号码] 0.0075097838150508505 [关机] 0.006752236405560853 [不能上网, 开户] 0.0064310542757245156 .....
总结一下:网上大多都是在yarn上提交,这个是不依赖hadoop的,最后那个LDA是借鉴git:https://github.com/igorekpotworek/NLP ,这是将lambda表达式改为java1.7的方法,其次修改了读取文件的策略,如有问题,请各位路过的大神及时指正,不胜感激