标签(空格分隔): Spark
学好spark的路径:
*,http://spark.apache.org/
*,spark源码:https://github.com/apache/spark
*,https://databricks.com/blog
[hadoop001@xingyunfei001 app]$ chmod u+x scala-2.10.4.tgz
[hadoop001@xingyunfei001 app]$ tar -zxf scala-2.10.4.tgz -C /opt/app
export SCALA_HOME=/opt/app/scala-2.10.4
export PATH=$PATH:$SCALA_HOME/bin
[hadoop001@xingyunfei001 app]$ scala -version
[hadoop001@xingyunfei001 scala-2.10.4]$ source /etc/profile
[hadoop001@xingyunfei001 app]$ chmod u+x spark-1.3.0-bin-2.5.tar.gz
[hadoop001@xingyunfei001 app]$ tar -zxf spark-1.3.0-bin-2.5.tar.gz -C /opt/app
JAVA_HOME=/opt/app/jdk1.7.0_67
SCALA_HOME=/opt/app/scala-2.10.4
HADOOP_CONF_DIR=/opt/app/hadoop_2.5.0_cdh
bin/spark-shell
var rdd=sc.textFile("/opt/datas/beifeng.log")
rdd.count //显示总条数
rdd.first //显示第一条数据
rdd.take(2) //获取头2条数据
rdd.filter(x=>x.contains("yarn")).collect
rdd.filter(_.contains("yarn")).collect
rdd.cache //将数据放到内存中
rdd.count
rdd.flatMap(x=>x.split(" ")).map(x=>(x,1)).reduceByKey((x,y)=>(x+y)).collect
SparkContext:
1,application申请资源
2,读取数据,创建rdd
SPARK_MASTER_IP=xingyunfei001.com.cn
SPARK_MASTER_PORT=7077
SPARK_MASTER_WEBUI_PORT=8080
SPARK_WORKER_CORES=2
SPARK_WORKER_MEMORY=2g
SPARK_WORKER_PORT=7078
SPARK_WORKER_WEBUI_PORT=8081
SPARK_WORKER_INSTANCES=1
# A Spark Worker will be started on each of the machines listed below.
xingyunfei001.com.cn
[hadoop001@xingyunfei001 spark-1.3.0-bin-2.5.0]$ sbin/start-master.sh
[hadoop001@xingyunfei001 spark-1.3.0-bin-2.5.0]$ sbin/start-slaves.sh
bin/spark-shell --master spark://xingyunfei001.com.cn:7077
var rdd=sc.textFile("/opt/datas/input.txt")
val wordcount=rdd.flatMap(x=>x.split(" ")).map(x=>(x,1)).reduceByKey((x,y)=>(x+y)).collect
sc.stop
[hadoop001@xingyunfei001 spark-1.3.0-bin-2.5.0]$ bin/spark-shell --master local[2] //本地模式启动2个线程
[hadoop001@xingyunfei001 spark-1.3.0-bin-2.5.0]$ bin/spark-shell --master "local[*]" //根据本地配置自动设置线程数目