--- 这个是一个朋友整理的。比较详细有保存价值。
-- 参考: http://spark.apache.org/docs/latest/building-spark.html
--       http://spark.apache.org/docs/latest/sql-programming-guide.html#overview
-- 主要包括:
-- 1. Spark基于hadoop 2.4.1编译;
-- 2. Spark集群安装;
-- 3. Spark访问HDFS里的文件;
-- 4. Spark访问Hive表;
-- 5. Spark访问MySQL数据库表;
-- 留待读者解决:
-- Spark访问Oracle、MS SQL Server
---------------------------------------------------------------------------------------------------
-- ############################################################################################# --
-- 一、编译Spark 1.3.0
-- 前期准备:
-- 从官网下载spark-1.3.0.tgz包到/opt/software下,并解压:
cd /opt/software/
tar -xvf spark-1.3.0.tgz
-- 第一步. 修改 spark-1.3.0目录下的pom.xml文件,匹配自己的软件版本,我的修改如下:
cd /opt/software/spark-1.3.0
vi pom.xml -- 修改如下相关软件的版本
<java.version>1.7</java.version>
<hadoop.version>2.4.1</hadoop.version>
<protobuf.version>2.5.0</protobuf.version>
<hbase.version>0.98.9-hadoop2</hbase.version>
<zookeeper.version>3.4.6</zookeeper.version>
<derby.version>10.11.1.1</derby.version>
-- 注意:如果要支持Scala 2.11,请运行以下脚本:-- 该版本太新了,目前还有些组件不支持,所以不建议修改
sh dev/change-version-to-2.11.sh
---------------------------------------------------------------------------------------------------
-- 第二步. 尝试用maven编译Spark (maven安装略)
export MAVEN_OPTS="-Xmx4g -XX:MaxPermSize=1024M -XX:ReservedCodeCacheSize=1024m"
nohup mvn -Pyarn -Phadoop-2.4 -Dhadoop.version=2.4.1 -Phive -Phive-thriftserver -DskipTests clean package -Dtar &
-- mvn编译成功后,打印类似如下:
[WARNING] sourceDirectory is not specified or does not exist value=/opt/software/spark-1.3.0/external/kafka-assembly/src/main/scala
Saving to outputFile=/opt/software/spark-1.3.0/external/kafka-assembly/scalastyle-output.xml
Processed 0 file(s)
Found 0 errors
Found 0 warnings
Found 0 infos
Finished in 0 ms
[INFO] ------------------------------------------------------------------------
[INFO] Reactor Summary:
[INFO]
[INFO] Spark Project Parent POM ........................... SUCCESS [ 19.631 s]
[INFO] Spark Project Networking ........................... SUCCESS [ 35.222 s]
[INFO] Spark Project Shuffle Streaming Service ............ SUCCESS [ 22.597 s]
[INFO] Spark Project Core ................................. SUCCESS [11:54 min]
[INFO] Spark Project Bagel ................................ SUCCESS [01:05 min]
[INFO] Spark Project GraphX ............................... SUCCESS [03:21 min]
[INFO] Spark Project Streaming ............................ SUCCESS [05:03 min]
[INFO] Spark Project Catalyst ............................. SUCCESS [05:57 min]
[INFO] Spark Project SQL .................................. SUCCESS [07:25 min]
[INFO] Spark Project ML Library ........................... SUCCESS [07:53 min]
[INFO] Spark Project Tools ................................ SUCCESS [ 44.746 s]
[INFO] Spark Project Hive ................................. SUCCESS [05:12 min]
[INFO] Spark Project REPL ................................. SUCCESS [02:38 min]
[INFO] Spark Project YARN ................................. SUCCESS [03:01 min]
[INFO] Spark Project Hive Thrift Server ................... SUCCESS [02:50 min]
[INFO] Spark Project Assembly ............................. SUCCESS [03:53 min]
[INFO] Spark Project External Twitter ..................... SUCCESS [01:11 min]
[INFO] Spark Project External Flume Sink .................. SUCCESS [02:43 min]
[INFO] Spark Project External Flume ....................... SUCCESS [01:45 min]
[INFO] Spark Project External MQTT ........................ SUCCESS [03:24 min]
[INFO] Spark Project External ZeroMQ ...................... SUCCESS [01:09 min]
[INFO] Spark Project External Kafka ....................... SUCCESS [02:01 min]
[INFO] Spark Project Examples ............................. SUCCESS [08:49 min]
[INFO] Spark Project YARN Shuffle Service ................. SUCCESS [ 15.687 s]
[INFO] Spark Project External Kafka Assembly .............. SUCCESS [ 55.975 s]
[INFO] ------------------------------------------------------------------------
[INFO] BUILD SUCCESS
[INFO] ------------------------------------------------------------------------
[INFO] Total time: 01:24 h
[INFO] Finished at: 2015-03-20T20:56:16+08:00
[INFO] Final Memory: 105M/1751M
[INFO] ------------------------------------------------------------------------
------------------------------
-- mvn编译错误1.
[INFO] Reactor Summary:
[INFO]
[INFO] Spark Project Parent POM ........................... SUCCESS [ 21.537 s]
[INFO] Spark Project Networking ........................... SUCCESS [ 31.171 s]
[INFO] Spark Project Shuffle Streaming Service ............ SUCCESS [ 16.630 s]
[INFO] Spark Project Core ................................. SUCCESS [11:43 min]
[INFO] Spark Project Bagel ................................ SUCCESS [01:13 min]
[INFO] Spark Project GraphX ............................... SUCCESS [03:45 min]
[INFO] Spark Project Streaming ............................ SUCCESS [06:08 min]
[INFO] Spark Project Catalyst ............................. SUCCESS [05:24 min]
[INFO] Spark Project SQL .................................. SUCCESS [07:18 min]
[INFO] Spark Project ML Library ........................... FAILURE [35:18 min]
[INFO] Spark Project Tools ................................ SKIPPED
[INFO] Spark Project Hive ................................. SKIPPED
[INFO] Spark Project REPL ................................. SKIPPED
[INFO] Spark Project YARN ................................. SKIPPED
[INFO] Spark Project Hive Thrift Server ................... SKIPPED
[INFO] Spark Project Assembly ............................. SKIPPED
[INFO] Spark Project External Twitter ..................... SKIPPED
[INFO] Spark Project External Flume Sink .................. SKIPPED
[INFO] Spark Project External Flume ....................... SKIPPED
[INFO] Spark Project External MQTT ........................ SKIPPED
[INFO] Spark Project External ZeroMQ ...................... SKIPPED
[INFO] Spark Project External Kafka ....................... SKIPPED
[INFO] Spark Project Examples ............................. SKIPPED
[INFO] Spark Project YARN Shuffle Service ................. SKIPPED
[INFO] Spark Project External Kafka Assembly .............. SKIPPED
[INFO] ------------------------------------------------------------------------
[INFO] BUILD FAILURE
[INFO] ------------------------------------------------------------------------
[INFO] Total time: 01:12 h
[INFO] Finished at: 2015-03-20T19:05:36+08:00
[INFO] Final Memory: 83M/1376M
[INFO] ------------------------------------------------------------------------
[ERROR] Failed to execute goal on project spark-mllib_2.10: Could not resolve dependencies for project org.apache.spark:spark-mllib_2.10:jar:1.3.0: Could not transfer artifact org.spire-math:spire_2.10:jar:0.7.4 from/to central (https://repo1.maven.org/maven2): GET request of: org/spire-math/spire_2.10/0.7.4/spire_2.10-0.7.4.jar from central failed: Read timed out -> [Help 1]
[ERROR]
[ERROR] To see the full stack trace of the errors, re-run Maven with the -e switch.
[ERROR] Re-run Maven using the -X switch to enable full debug logging.
[ERROR]
[ERROR] For more information about the errors and possible solutions, please read the following articles:
[ERROR] [Help 1] http://cwiki.apache.org/confluence/display/MAVEN/DependencyResolutionException
[ERROR]
[ERROR] After correcting the problems, you can resume the build with the command
[ERROR] mvn <goals> -rf :spark-mllib_2.10
-- mvn编译错误1解决:
-- 直接根据如下URL下载spire_2.10-0.7.4.jar 包到/root/.m2/repository/org/spire-math/spire_2.10/0.7.4/
-- 因为我是用root用户编译的,所以对应的是/root,如果你用其他用户的话,可能目录不一样哦
http://search.maven.org/#browse%7C1724544790
---------------------------------------------------------------------------------------------------
-- 第三步. 第二步操作测试mvn编译成功后,可以用如下命令生成安装包:(当然第二步也可以不要,直接执行“第三步”生成安装包,但安全起见,还是先执行第二步测试一下)
-- 注意:在执行之前,先看一下java和javac的版本是否一致。
cd /opt/software/spark-1.3.0
export MAVEN_OPTS="-Xmx4g -XX:MaxPermSize=1024M -XX:ReservedCodeCacheSize=1024m"
nohup ./make-distribution.sh --tgz --skip-java-test -Pyarn -Phadoop-2.4 -Dyarn.version=2.4.1 -Dhadoop.version=2.4.1 -Phive -Phive-thriftserver
-- 第三步操作成功后,将在 /opt/software/spark-1.3.0目录下生成 spark-1.3.0-bin-2.4.1.tgz 安装包
---------------------------------------------------------------------------------------------------
-- ############################################################################################# --
-- 二、安装Spark 1.3.0
-- 安装前,我的hadoop集群是安装在hadoop用户下:
----------------------------------------------------------------
| IP | 主机名 | 角 色 |
----------------------------------------------------------------
| 192.168.117.193 | funshion-hadoop193 | NameNode,SparkMaster |
----------------------------------------------------------------
| 192.168.117.194 | funshion-hadoop194 | DataNode,SparkSlave |
----------------------------------------------------------------
| 192.168.117.195 | funshion-hadoop195 | DataNode,SparkSlave |
----------------------------------------------------------------
| 192.168.117.196 | funshion-hadoop196 | DataNode,SparkSlave |
----------------------------------------------------------------
---------------------------------------------------------------------------------------------------
-- 第一步 将 spark-1.3.0-bin-2.4.1.tgz 解压到/usr/local/,并创建相关软链接。
-- (下面操作分别在funshion-hadoop193、funshion-hadoop194、funshion-hadoop195、funshion-hadoop196四个节点以root用户执行)
tar -xvf /opt/software/spark-1.3.0/spark-1.3.0-bin-2.4.1.tgz -C /opt/software/spark-1.3.0
mv /opt/software/spark-1.3.0/spark-1.3.0-bin-2.4.1 /usr/local/
cd /usr/local
chown -R hadoop.hadoop ./spark-1.3.0-bin-2.4.1
rm -rf spark
ln -s spark-1.3.0-bin-2.4.1 spark
---------------------------------------------------------------------------------------------------
-- 第二步 配置Spark
cd /usr/local/spark/conf
-- 2.1 编辑 slaves 文件:
[hadoop@funshion-hadoop193 conf]$ vi slaves
# A Spark Worker will be started on each of the machines listed below.
funshion-hadoop194
funshion-hadoop195
funshion-hadoop196
-- 2.2 编辑 spark-env.sh 文件:
-- 拷贝spark-env.sh.template为spark-env.sh,并编辑 spark-env.sh (注意:最后两行是用来支持LZO压缩的)
[hadoop@funshion-hadoop193 conf]$ vi spark-env.sh
# spark-env.sh: environment settings for this Spark installation.
# Java and Scala installation directories.
export JAVA_HOME=/usr/java/latest
export SCALA_HOME=/usr/local/scala
# Host that runs the Spark master, and the memory setting for each worker.
export SPARK_MASTER_IP=funshion-hadoop193
export SPARK_WORKER_MEMORY=2g
# Hadoop client configuration directory.
export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop
# The last two lines add LZO compression support (LZO library directory and the hadoop-lzo jar).
# NOTE: Spark 1.3.0 logs a warning that SPARK_CLASSPATH is deprecated and suggests
# --driver-class-path / spark.executor.extraClassPath instead.
export SPARK_LIBRARY_PATH=$SPARK_LIBRARY_PATH:/usr/local/spark/lib:/usr/local/hadoop/lzo/lib
export SPARK_CLASSPATH=$SPARK_CLASSPATH:/usr/local/hadoop/lib/hadoop-lzo-0.4.20-SNAPSHOT.jar
-- 2.3 编辑 spark-defaults.conf 文件:
cd /usr/local/spark/conf/
cp spark-defaults.conf.template spark-defaults.conf
[hadoop@funshion-hadoop193 conf]$ vi spark-defaults.conf -- 添加如下记录行:
spark.master spark://funshion-hadoop193:7077
spark.yarn.jar hdfs://funshion-hadoop193:8020/home/lib/spark.yarn.jar
spark.eventLog.enabled true
spark.eventLog.dir hdfs://funshion-hadoop193:8020/spark_log
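-- 注意上面的两个HDFS路径需要事先准备好:
-- spark.eventLog.dir 是一个目录,直接创建即可;
-- spark.yarn.jar 必须指向一个jar文件(而不是目录),所以只创建它的父目录,再把Spark assembly包上传为该文件:
hdfs dfs -mkdir -p /home/lib
hdfs dfs -put /usr/local/spark/lib/spark-assembly-1.3.0-hadoop2.4.1.jar /home/lib/spark.yarn.jar
hdfs dfs -mkdir /spark_log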
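-- 注意:上面2.1~2.3操作完成后,记得将conf目录同步到其他节点,例如(在funshion-hadoop193上以hadoop用户执行):
for h in funshion-hadoop194 funshion-hadoop195 funshion-hadoop196; do scp /usr/local/spark/conf/* hadoop@$h:/usr/local/spark/conf/; done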
-- 2.4 添加环境变量(我hadoop用户下的 ~/.bash_profile 文件全部内容如下)
-- (添加环境变量分别在funshion-hadoop193、funshion-hadoop194、funshion-hadoop195、funshion-hadoop196四个节点以hadoop用户执行)
---------------------------
[hadoop@funshion-hadoop193 spark]$ vi ~/.bash_profile
# .bash_profile
# Login environment for the hadoop user: Java, Hadoop, Scala, Spark and Hive
# locations plus the search paths derived from them.

# Get the aliases and functions
if [ -f ~/.bashrc ]; then
  . ~/.bashrc
fi

# User specific environment and startup programs
export JAVA_HOME=/usr/java/latest
export PATH="$PATH:$HOME/bin:$JAVA_HOME/bin:/usr/local/bin"

# Installation roots.
export HADOOP_INSTALL=/usr/local/hadoop
export HADOOP_HOME=/usr/local/hadoop
export HADOOP_DEV_HOME=/usr/local/hadoop
export HADOOP_PREFIX=/usr/local/hadoop
export SCALA_HOME=/usr/local/scala
export PATH="$PATH:$SCALA_HOME/bin"
export SPARK_HOME=/usr/local/spark
export PATH="$PATH:$SPARK_HOME/bin"
export LD_LIBRARY_PATH="$HADOOP_HOME/lib/native"
export HIVE_HOME=/usr/local/hive
# export HBASE_HOME=/usr/local/hbase
# export ZK_HOME=/usr/local/zookeeper

export PATH="$PATH:$HADOOP_DEV_HOME/bin"
export PATH="$PATH:$HADOOP_DEV_HOME/sbin"
export PATH="$PATH:$HIVE_HOME/bin"
# export PATH=$PATH:$HBASE_HOME/bin
# export PATH=$PATH:$ZK_HOME/bin

# Per-component Hadoop home directories; all point at the single install.
export HADOOP_MAPRED_HOME="${HADOOP_DEV_HOME}"
export HADOOP_COMMON_HOME="${HADOOP_DEV_HOME}"
export HADOOP_HDFS_HOME="${HADOOP_DEV_HOME}"
export YARN_HOME="${HADOOP_DEV_HOME}"
export HADOOP_YARN_HOME="${HADOOP_DEV_HOME}"

# Configuration directories.
export HADOOP_CLIENT_CONF_DIR="${HADOOP_DEV_HOME}/etc/hadoop"
export HADOOP_CONF_DIR="${HADOOP_DEV_HOME}/etc/hadoop"
export HDFS_CONF_DIR="${HADOOP_DEV_HOME}/etc/hadoop"
export YARN_CONF_DIR="${HADOOP_DEV_HOME}/etc/hadoop"

# ${VAR:+...} is used below so that an empty starting value never produces a
# leading/trailing ':' (an empty path element means "current directory").
export CLASSPATH=".:$JAVA_HOME/lib${CLASSPATH:+:$CLASSPATH}"
# NOTE(review): "$JAVA_HOME/" (not "$JAVA_HOME/bin") is prepended here; kept as
# is to avoid changing which java binary is found first - confirm the intent.
export PATH="$JAVA_HOME/:$HADOOP_PREFIX/bin:$PATH"

# Native Path
export HADOOP_COMMON_LIB_NATIVE_DIR="${HADOOP_PREFIX}/lib/native"
export HADOOP_OPTS="-Djava.library.path=$HADOOP_PREFIX/lib/native"

# SET HADOOP_CLASSPATH
# Glob directly instead of parsing `ls`; the -e test skips the unexpanded
# pattern when the directory is missing or holds no jars.
for jar in "$HADOOP_HOME"/share/hadoop/common/lib/*jar; do
  [ -e "$jar" ] || continue
  HADOOP_CLASSPATH="${HADOOP_CLASSPATH:+$HADOOP_CLASSPATH:}$jar"
done

# SET HIVE_CLASSPATH
for jar in "$HIVE_HOME"/lib/*jar; do
  [ -e "$jar" ] || continue
  HIVE_CLASSPATH="${HIVE_CLASSPATH:+$HIVE_CLASSPATH:}$jar"
done

export HADOOP_CLASSPATH
export CLASSPATH="$CLASSPATH${HADOOP_CLASSPATH:+:$HADOOP_CLASSPATH}${HIVE_CLASSPATH:+:$HIVE_CLASSPATH}"

# SET JAVA_LIBRARY_PATH
for jar in "$JAVA_HOME"/lib/*jar; do
  [ -e "$jar" ] || continue
  JAVA_LIBRARY_PATH="${JAVA_LIBRARY_PATH:+$JAVA_LIBRARY_PATH:}$jar"
done
unset jar

export JAVA_LIBRARY_PATH="${JAVA_LIBRARY_PATH:+$JAVA_LIBRARY_PATH:}$HADOOP_PREFIX/lib/native"
# The Hadoop native directory is already in LD_LIBRARY_PATH (set above), so
# only /usr/lib64 is appended here.
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/lib64"
export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}/usr/local/hadoop/etc/hadoop"
export PATH="$PATH:$PYTHONPATH"
export EXINIT='set ts=4 sw=4'
---------------------------
---------------------------------------------------------------------------------------------------
-- 第三步 启动Spark集群:
[hadoop@funshion-hadoop193 sbin]$ cd /usr/local/spark/sbin
[hadoop@funshion-hadoop193 sbin]$ pwd
/usr/local/spark/sbin
[hadoop@funshion-hadoop193 sbin]$ ./start-all.sh
starting org.apache.spark.deploy.master.Master, logging to /usr/local/spark-1.3.0-bin-2.4.1/sbin/../logs/spark-hadoop-org.apache.spark.deploy.master.Master-1-funshion-hadoop193.out
funshion-hadoop194: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark-1.3.0-bin-2.4.1/sbin/../logs/spark-hadoop-org.apache.spark.deploy.worker.Worker-1-funshion-hadoop194.out
funshion-hadoop196: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark-1.3.0-bin-2.4.1/sbin/../logs/spark-hadoop-org.apache.spark.deploy.worker.Worker-1-funshion-hadoop196.out
funshion-hadoop195: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark-1.3.0-bin-2.4.1/sbin/../logs/spark-hadoop-org.apache.spark.deploy.worker.Worker-1-funshion-hadoop195.out
-- 要关闭Spark集群的话,执行如下命令:
[hadoop@funshion-hadoop193 sbin]$ ./stop-all.sh
funshion-hadoop194: stopping org.apache.spark.deploy.worker.Worker
funshion-hadoop195: stopping org.apache.spark.deploy.worker.Worker
funshion-hadoop196: stopping org.apache.spark.deploy.worker.Worker
stopping org.apache.spark.deploy.master.Master
---------------------------------------------------------------------------------------------------
-- ############################################################################################# --
-- 三、测试Spark 1.3.0
-- 3.1 测试SparkPi
cd /usr/local/spark
./bin/spark-submit --class org.apache.spark.examples.SparkPi \
--master spark://funshion-hadoop193:7077 \
--num-executors 3 \
--driver-memory 2g \
--executor-memory 1g \
--executor-cores 1 \
--queue root.hadoop \
lib/spark-examples*.jar \
10
-- 上面命令输出如下(我们看到有一行打印“Pi is roughly 3.141544”代表执行是有返回结果的,是OK的):
Spark assembly has been built with Hive, including Datanucleus jars on classpath
15/03/21 16:05:23 INFO SparkContext: Running Spark version 1.3.0
15/03/21 16:05:23 WARN SparkConf:
SPARK_CLASSPATH was detected (set to ':/usr/local/hadoop/lib/hadoop-lzo-0.4.20-SNAPSHOT.jar').
This is deprecated in Spark 1.0+.
Please instead use:
- ./spark-submit with --driver-class-path to augment the driver classpath
- spark.executor.extraClassPath to augment the executor classpath
15/03/21 16:05:23 WARN SparkConf: Setting 'spark.executor.extraClassPath' to ':/usr/local/hadoop/lib/hadoop-lzo-0.4.20-SNAPSHOT.jar' as a work-around.
15/03/21 16:05:23 WARN SparkConf: Setting 'spark.driver.extraClassPath' to ':/usr/local/hadoop/lib/hadoop-lzo-0.4.20-SNAPSHOT.jar' as a work-around.
15/03/21 16:05:25 INFO SecurityManager: Changing view acls to: hadoop
15/03/21 16:05:25 INFO SecurityManager: Changing modify acls to: hadoop
15/03/21 16:05:25 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(hadoop); users with modify permissions: Set(hadoop)
15/03/21 16:05:26 INFO Slf4jLogger: Slf4jLogger started
15/03/21 16:05:26 INFO Remoting: Starting remoting
15/03/21 16:05:27 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkDriver@funshion-hadoop193:54001]
15/03/21 16:05:27 INFO Utils: Successfully started service 'sparkDriver' on port 54001.
15/03/21 16:05:27 INFO SparkEnv: Registering MapOutputTracker
15/03/21 16:05:27 INFO SparkEnv: Registering BlockManagerMaster
15/03/21 16:05:27 INFO DiskBlockManager: Created local directory at /tmp/spark-3637a018-6da9-446b-9fe6-b4cd75d346c4/blockmgr-4bb0b6f0-a816-46af-b5e3-b8c3ffaa3c04
15/03/21 16:05:27 INFO MemoryStore: MemoryStore started with capacity 1060.3 MB
15/03/21 16:05:28 INFO HttpFileServer: HTTP File server directory is /tmp/spark-5a5dd05a-e1e1-4c68-8517-b7e63ebcaab3/httpd-6eed070f-8636-40c6-8461-4b61a31fb3a0
15/03/21 16:05:28 INFO HttpServer: Starting HTTP Server
15/03/21 16:05:28 INFO Server: jetty-8.y.z-SNAPSHOT
15/03/21 16:05:28 INFO AbstractConnector: Started SocketConnector@0.0.0.0:50159
15/03/21 16:05:28 INFO Utils: Successfully started service 'HTTP file server' on port 50159.
15/03/21 16:05:28 INFO SparkEnv: Registering OutputCommitCoordinator
15/03/21 16:05:29 INFO Server: jetty-8.y.z-SNAPSHOT
15/03/21 16:05:29 INFO AbstractConnector: Started SelectChannelConnector@0.0.0.0:4040
15/03/21 16:05:29 INFO Utils: Successfully started service 'SparkUI' on port 4040.
15/03/21 16:05:29 INFO SparkUI: Started SparkUI at http://funshion-hadoop193:4040
15/03/21 16:05:30 INFO SparkContext: Added JAR file:/usr/local/spark-1.3.0-bin-2.4.1/lib/spark-examples-1.3.0-hadoop2.4.1.jar at http://192.168.117.193:50159/jars/spark-examples-1.3.0-hadoop2.4.1.jar with timestamp 1426925130170
15/03/21 16:05:30 INFO AppClient$ClientActor: Connecting to master akka.tcp://sparkMaster@funshion-hadoop193:7077/user/Master...
15/03/21 16:05:31 INFO SparkDeploySchedulerBackend: Connected to Spark cluster with app ID app-20150321160531-0000
15/03/21 16:05:31 INFO AppClient$ClientActor: Executor added: app-20150321160531-0000/0 on worker-20150321160018-funshion-hadoop195-46031 (funshion-hadoop195:46031) with 2 cores
15/03/21 16:05:31 INFO SparkDeploySchedulerBackend: Granted executor ID app-20150321160531-0000/0 on hostPort funshion-hadoop195:46031 with 2 cores, 1024.0 MB RAM
15/03/21 16:05:31 INFO AppClient$ClientActor: Executor added: app-20150321160531-0000/1 on worker-20150321160019-funshion-hadoop196-53113 (funshion-hadoop196:53113) with 2 cores
15/03/21 16:05:31 INFO SparkDeploySchedulerBackend: Granted executor ID app-20150321160531-0000/1 on hostPort funshion-hadoop196:53113 with 2 cores, 1024.0 MB RAM
15/03/21 16:05:31 INFO AppClient$ClientActor: Executor added: app-20150321160531-0000/2 on worker-20150321160018-funshion-hadoop194-56515 (funshion-hadoop194:56515) with 2 cores
15/03/21 16:05:31 INFO SparkDeploySchedulerBackend: Granted executor ID app-20150321160531-0000/2 on hostPort funshion-hadoop194:56515 with 2 cores, 1024.0 MB RAM
15/03/21 16:05:32 INFO AppClient$ClientActor: Executor updated: app-20150321160531-0000/0 is now RUNNING
15/03/21 16:05:32 INFO AppClient$ClientActor: Executor updated: app-20150321160531-0000/0 is now LOADING
15/03/21 16:05:32 INFO AppClient$ClientActor: Executor updated: app-20150321160531-0000/1 is now RUNNING
15/03/21 16:05:32 INFO AppClient$ClientActor: Executor updated: app-20150321160531-0000/2 is now LOADING
15/03/21 16:05:32 INFO AppClient$ClientActor: Executor updated: app-20150321160531-0000/2 is now RUNNING
15/03/21 16:05:32 INFO AppClient$ClientActor: Executor updated: app-20150321160531-0000/1 is now LOADING
15/03/21 16:05:32 INFO NettyBlockTransferService: Server created on 33985
15/03/21 16:05:32 INFO BlockManagerMaster: Trying to register BlockManager
15/03/21 16:05:32 INFO BlockManagerMasterActor: Registering block manager funshion-hadoop193:33985 with 1060.3 MB RAM, BlockManagerId(<driver>, funshion-hadoop193, 33985)
15/03/21 16:05:32 INFO BlockManagerMaster: Registered BlockManager
15/03/21 16:05:35 INFO EventLoggingListener: Logging events to hdfs://funshion-hadoop193:8020/spark_log/app-20150321160531-0000
15/03/21 16:05:35 INFO SparkDeploySchedulerBackend: SchedulerBackend is ready for scheduling beginning after reached minRegisteredResourcesRatio: 0.0
15/03/21 16:05:36 INFO SparkContext: Starting job: reduce at SparkPi.scala:35
15/03/21 16:05:36 INFO DAGScheduler: Got job 0 (reduce at SparkPi.scala:35) with 10 output partitions (allowLocal=false)
15/03/21 16:05:36 INFO DAGScheduler: Final stage: Stage 0(reduce at SparkPi.scala:35)
15/03/21 16:05:36 INFO DAGScheduler: Parents of final stage: List()
15/03/21 16:05:36 INFO DAGScheduler: Missing parents: List()
15/03/21 16:05:36 INFO DAGScheduler: Submitting Stage 0 (MapPartitionsRDD[1] at map at SparkPi.scala:31), which has no missing parents
15/03/21 16:05:37 INFO MemoryStore: ensureFreeSpace(1848) called with curMem=0, maxMem=1111794647
15/03/21 16:05:37 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 1848.0 B, free 1060.3 MB)
15/03/21 16:05:37 INFO MemoryStore: ensureFreeSpace(1296) called with curMem=1848, maxMem=1111794647
15/03/21 16:05:37 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 1296.0 B, free 1060.3 MB)
15/03/21 16:05:37 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on funshion-hadoop193:33985 (size: 1296.0 B, free: 1060.3 MB)
15/03/21 16:05:37 INFO BlockManagerMaster: Updated info of block broadcast_0_piece0
15/03/21 16:05:37 INFO SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:839
15/03/21 16:05:37 INFO DAGScheduler: Submitting 10 missing tasks from Stage 0 (MapPartitionsRDD[1] at map at SparkPi.scala:31)
15/03/21 16:05:37 INFO TaskSchedulerImpl: Adding task set 0.0 with 10 tasks
15/03/21 16:05:39 INFO SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor@funshion-hadoop194:60276/user/Executor#622568548] with ID 2
15/03/21 16:05:39 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, funshion-hadoop194, PROCESS_LOCAL, 1340 bytes)
15/03/21 16:05:39 INFO TaskSetManager: Starting task 1.0 in stage 0.0 (TID 1, funshion-hadoop194, PROCESS_LOCAL, 1340 bytes)
15/03/21 16:05:39 INFO SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor@funshion-hadoop195:51291/user/Executor#1321504504] with ID 0
15/03/21 16:05:39 INFO TaskSetManager: Starting task 2.0 in stage 0.0 (TID 2, funshion-hadoop195, PROCESS_LOCAL, 1340 bytes)
15/03/21 16:05:39 INFO TaskSetManager: Starting task 3.0 in stage 0.0 (TID 3, funshion-hadoop195, PROCESS_LOCAL, 1340 bytes)
15/03/21 16:05:39 INFO SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor@funshion-hadoop196:42388/user/Executor#1779514149] with ID 1
15/03/21 16:05:39 INFO TaskSetManager: Starting task 4.0 in stage 0.0 (TID 4, funshion-hadoop196, PROCESS_LOCAL, 1340 bytes)
15/03/21 16:05:39 INFO TaskSetManager: Starting task 5.0 in stage 0.0 (TID 5, funshion-hadoop196, PROCESS_LOCAL, 1340 bytes)
15/03/21 16:05:40 INFO BlockManagerMasterActor: Registering block manager funshion-hadoop194:42041 with 530.3 MB RAM, BlockManagerId(2, funshion-hadoop194, 42041)
15/03/21 16:05:40 INFO BlockManagerMasterActor: Registering block manager funshion-hadoop195:47926 with 530.3 MB RAM, BlockManagerId(0, funshion-hadoop195, 47926)
15/03/21 16:05:40 INFO BlockManagerMasterActor: Registering block manager funshion-hadoop196:36975 with 530.3 MB RAM, BlockManagerId(1, funshion-hadoop196, 36975)
15/03/21 16:05:49 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on funshion-hadoop196:36975 (size: 1296.0 B, free: 530.3 MB)
15/03/21 16:05:49 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on funshion-hadoop194:42041 (size: 1296.0 B, free: 530.3 MB)
15/03/21 16:05:50 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on funshion-hadoop195:47926 (size: 1296.0 B, free: 530.3 MB)
15/03/21 16:05:50 INFO TaskSetManager: Starting task 6.0 in stage 0.0 (TID 6, funshion-hadoop196, PROCESS_LOCAL, 1340 bytes)
15/03/21 16:05:50 INFO TaskSetManager: Starting task 7.0 in stage 0.0 (TID 7, funshion-hadoop196, PROCESS_LOCAL, 1340 bytes)
15/03/21 16:05:50 INFO TaskSetManager: Finished task 4.0 in stage 0.0 (TID 4) in 10827 ms on funshion-hadoop196 (1/10)
15/03/21 16:05:50 INFO TaskSetManager: Finished task 5.0 in stage 0.0 (TID 5) in 10833 ms on funshion-hadoop196 (2/10)
15/03/21 16:05:50 INFO TaskSetManager: Starting task 8.0 in stage 0.0 (TID 8, funshion-hadoop196, PROCESS_LOCAL, 1340 bytes)
15/03/21 16:05:50 INFO TaskSetManager: Starting task 9.0 in stage 0.0 (TID 9, funshion-hadoop196, PROCESS_LOCAL, 1340 bytes)
15/03/21 16:05:50 INFO TaskSetManager: Finished task 7.0 in stage 0.0 (TID 7) in 119 ms on funshion-hadoop196 (3/10)
15/03/21 16:05:50 INFO TaskSetManager: Finished task 6.0 in stage 0.0 (TID 6) in 156 ms on funshion-hadoop196 (4/10)
15/03/21 16:05:50 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 11123 ms on funshion-hadoop194 (5/10)
15/03/21 16:05:50 INFO TaskSetManager: Finished task 8.0 in stage 0.0 (TID 8) in 70 ms on funshion-hadoop196 (6/10)
15/03/21 16:05:50 INFO TaskSetManager: Finished task 9.0 in stage 0.0 (TID 9) in 72 ms on funshion-hadoop196 (7/10)
15/03/21 16:05:50 INFO TaskSetManager: Finished task 1.0 in stage 0.0 (TID 1) in 11096 ms on funshion-hadoop194 (8/10)
15/03/21 16:05:51 INFO TaskSetManager: Finished task 2.0 in stage 0.0 (TID 2) in 11423 ms on funshion-hadoop195 (9/10)
15/03/21 16:05:51 INFO DAGScheduler: Stage 0 (reduce at SparkPi.scala:35) finished in 13.243 s
15/03/21 16:05:51 INFO TaskSetManager: Finished task 3.0 in stage 0.0 (TID 3) in 11447 ms on funshion-hadoop195 (10/10)
15/03/21 16:05:51 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
15/03/21 16:05:51 INFO DAGScheduler: Job 0 finished: reduce at SparkPi.scala:35, took 14.928861 s
Pi is roughly 3.141544
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/metrics/json,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/stages/stage/kill,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/static,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/executors/threadDump/json,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/executors/threadDump,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/executors/json,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/executors,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/environment/json,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/environment,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/storage/rdd/json,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/storage/rdd,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/storage/json,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/storage,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/stages/pool/json,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/stages/pool,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/stages/stage/json,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/stages/stage,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/stages/json,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/stages,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/jobs/job/json,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/jobs/job,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/jobs/json,null}
15/03/21 16:05:51 INFO ContextHandler: stopped o.s.j.s.ServletContextHandler{/jobs,null}
15/03/21 16:05:51 INFO SparkUI: Stopped Spark web UI at http://funshion-hadoop193:4040
15/03/21 16:05:51 INFO DAGScheduler: Stopping DAGScheduler
15/03/21 16:05:51 INFO SparkDeploySchedulerBackend: Shutting down all executors
15/03/21 16:05:51 INFO SparkDeploySchedulerBackend: Asking each executor to shut down
15/03/21 16:05:51 INFO OutputCommitCoordinator$OutputCommitCoordinatorActor: OutputCommitCoordinator stopped!
15/03/21 16:05:51 INFO MapOutputTrackerMasterActor: MapOutputTrackerActor stopped!
15/03/21 16:05:51 INFO MemoryStore: MemoryStore cleared
15/03/21 16:05:51 INFO BlockManager: BlockManager stopped
15/03/21 16:05:51 INFO BlockManagerMaster: BlockManagerMaster stopped
15/03/21 16:05:51 INFO RemoteActorRefProvider$RemotingTerminator: Shutting down remote daemon.
15/03/21 16:05:51 INFO RemoteActorRefProvider$RemotingTerminator: Remote daemon shut down; proceeding with flushing remote transports.
15/03/21 16:05:51 INFO SparkContext: Successfully stopped SparkContext
15/03/21 16:05:51 INFO RemoteActorRefProvider$RemotingTerminator: Remoting shut down.
---------------------------------------------------------------------------------------------------
-- 3.2 测试Spark shell(访问HDFS文件)
-- 3.2.1 cd 到 /usr/local/spark
[hadoop@funshion-hadoop193 spark]$ cd /usr/local/spark
-- 3.2.2 查看集群目录 /user/hadoop 是否存在(不存在,就创建一下)
[hadoop@funshion-hadoop193 spark]$ hdfs dfs -ls hdfs://funshion-hadoop193:8020/user/hadoop/
Found 3 items
drwx------ - hadoop supergroup 0 2015-03-18 08:00 hdfs://funshion-hadoop193:8020/user/hadoop/.Trash
drwxr-xr-x - hadoop supergroup 0 2015-03-21 15:05 hdfs://funshion-hadoop193:8020/user/hadoop/.sparkStaging
drwxr-xr-x - hadoop supergroup 0 2015-03-20 10:28 hdfs://funshion-hadoop193:8020/user/hadoop/hive
-- 3.2.3 将 /usr/local/spark/README.md 文件拷贝到集群的/user/hadoop目录下:
[hadoop@funshion-hadoop193 spark]$ hdfs dfs -copyFromLocal /usr/local/spark/README.md hdfs://funshion-hadoop193:8020/user/hadoop/
-- 3.2.4 检查上一步操作是否成功(我们看到/user/hadoop目录下已经有README.md文件了):
[hadoop@funshion-hadoop193 spark]$ hdfs dfs -ls hdfs://funshion-hadoop193:8020/user/hadoop/
Found 4 items
drwx------ - hadoop supergroup 0 2015-03-18 08:00 hdfs://funshion-hadoop193:8020/user/hadoop/.Trash
drwxr-xr-x - hadoop supergroup 0 2015-03-21 15:05 hdfs://funshion-hadoop193:8020/user/hadoop/.sparkStaging
-rw-r--r-- 3 hadoop supergroup 3629 2015-03-21 16:28 hdfs://funshion-hadoop193:8020/user/hadoop/README.md
drwxr-xr-x - hadoop supergroup 0 2015-03-20 10:28 hdfs://funshion-hadoop193:8020/user/hadoop/hive
-- 3.2.5 测试Spark shell
[hadoop@funshion-hadoop193 spark]$ pwd
/usr/local/spark
[hadoop@funshion-hadoop193 spark]$ ./bin/spark-shell --master spark://funshion-hadoop193:7077
Spark assembly has been built with Hive, including Datanucleus jars on classpath
15/03/21 16:32:33 INFO SecurityManager: Changing view acls to: hadoop
15/03/21 16:32:33 INFO SecurityManager: Changing modify acls to: hadoop
15/03/21 16:32:33 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(hadoop); users with modify permissions: Set(hadoop)
15/03/21 16:32:33 INFO HttpServer: Starting HTTP Server
15/03/21 16:32:33 INFO Server: jetty-8.y.z-SNAPSHOT
15/03/21 16:32:33 INFO AbstractConnector: Started SocketConnector@0.0.0.0:52784
15/03/21 16:32:33 INFO Utils: Successfully started service 'HTTP class server' on port 52784.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.3.0
      /_/
Using Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_75)
Type in expressions to have them evaluated.
Type :help for more information.
15/03/21 16:32:46 INFO SparkContext: Running Spark version 1.3.0
15/03/21 16:32:46 WARN SparkConf:
SPARK_CLASSPATH was detected (set to ':/usr/local/hadoop/lib/hadoop-lzo-0.4.20-SNAPSHOT.jar').
This is deprecated in Spark 1.0+.
Please instead use:
- ./spark-submit with --driver-class-path to augment the driver classpath
- spark.executor.extraClassPath to augment the executor classpath
15/03/21 16:32:46 WARN SparkConf: Setting 'spark.executor.extraClassPath' to ':/usr/local/hadoop/lib/hadoop-lzo-0.4.20-SNAPSHOT.jar' as a work-around.
15/03/21 16:32:46 WARN SparkConf: Setting 'spark.driver.extraClassPath' to ':/usr/local/hadoop/lib/hadoop-lzo-0.4.20-SNAPSHOT.jar' as a work-around.
15/03/21 16:32:47 INFO SecurityManager: Changing view acls to: hadoop
15/03/21 16:32:47 INFO SecurityManager: Changing modify acls to: hadoop
15/03/21 16:32:47 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(hadoop); users with modify permissions: Set(hadoop)
15/03/21 16:32:48 INFO Slf4jLogger: Slf4jLogger started
15/03/21 16:32:48 INFO Remoting: Starting remoting
15/03/21 16:32:48 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkDriver@funshion-hadoop193:56709]
15/03/21 16:32:48 INFO Utils: Successfully started service 'sparkDriver' on port 56709.
15/03/21 16:32:48 INFO SparkEnv: Registering MapOutputTracker
15/03/21 16:32:48 INFO SparkEnv: Registering BlockManagerMaster
15/03/21 16:32:48 INFO DiskBlockManager: Created local directory at /tmp/spark-fceb073a-5114-4ca1-aa3b-35cdc4905eec/blockmgr-ea7850bc-f902-4f9a-aef2-7228d35b2a2c
15/03/21 16:32:48 INFO MemoryStore: MemoryStore started with capacity 265.4 MB
15/03/21 16:32:49 INFO HttpFileServer: HTTP File server directory is /tmp/spark-0de1d6ac-b075-4cea-aefa-4e5b7fe492c6/httpd-84193574-041c-4bdc-abbb-6033aa484d92
15/03/21 16:32:49 INFO HttpServer: Starting HTTP Server
15/03/21 16:32:49 INFO Server: jetty-8.y.z-SNAPSHOT
15/03/21 16:32:49 INFO AbstractConnector: Started SocketConnector@0.0.0.0:37981
15/03/21 16:32:49 INFO Utils: Successfully started service 'HTTP file server' on port 37981.
15/03/21 16:32:49 INFO SparkEnv: Registering OutputCommitCoordinator
15/03/21 16:32:49 INFO Server: jetty-8.y.z-SNAPSHOT
15/03/21 16:32:49 INFO AbstractConnector: Started SelectChannelConnector@0.0.0.0:4040
15/03/21 16:32:49 INFO Utils: Successfully started service 'SparkUI' on port 4040.
15/03/21 16:32:49 INFO SparkUI: Started SparkUI at http://funshion-hadoop193:4040
15/03/21 16:32:50 INFO AppClient$ClientActor: Connecting to master akka.tcp://sparkMaster@funshion-hadoop193:7077/user/Master...
15/03/21 16:32:50 INFO SparkDeploySchedulerBackend: Connected to Spark cluster with app ID app-20150321163250-0002
15/03/21 16:32:50 INFO AppClient$ClientActor: Executor added: app-20150321163250-0002/0 on worker-20150321160018-funshion-hadoop195-46031 (funshion-hadoop195:46031) with 2 cores
15/03/21 16:32:50 INFO SparkDeploySchedulerBackend: Granted executor ID app-20150321163250-0002/0 on hostPort funshion-hadoop195:46031 with 2 cores, 512.0 MB RAM
15/03/21 16:32:50 INFO AppClient$ClientActor: Executor added: app-20150321163250-0002/1 on worker-20150321160019-funshion-hadoop196-53113 (funshion-hadoop196:53113) with 2 cores
15/03/21 16:32:50 INFO SparkDeploySchedulerBackend: Granted executor ID app-20150321163250-0002/1 on hostPort funshion-hadoop196:53113 with 2 cores, 512.0 MB RAM
15/03/21 16:32:50 INFO AppClient$ClientActor: Executor added: app-20150321163250-0002/2 on worker-20150321160018-funshion-hadoop194-56515 (funshion-hadoop194:56515) with 2 cores
15/03/21 16:32:50 INFO SparkDeploySchedulerBackend: Granted executor ID app-20150321163250-0002/2 on hostPort funshion-hadoop194:56515 with 2 cores, 512.0 MB RAM
15/03/21 16:32:50 INFO AppClient$ClientActor: Executor updated: app-20150321163250-0002/1 is now LOADING
15/03/21 16:32:50 INFO AppClient$ClientActor: Executor updated: app-20150321163250-0002/0 is now LOADING
15/03/21 16:32:50 INFO AppClient$ClientActor: Executor updated: app-20150321163250-0002/2 is now LOADING
15/03/21 16:32:50 INFO AppClient$ClientActor: Executor updated: app-20150321163250-0002/0 is now RUNNING
15/03/21 16:32:50 INFO AppClient$ClientActor: Executor updated: app-20150321163250-0002/1 is now RUNNING
15/03/21 16:32:50 INFO AppClient$ClientActor: Executor updated: app-20150321163250-0002/2 is now RUNNING
15/03/21 16:32:51 INFO NettyBlockTransferService: Server created on 49153
15/03/21 16:32:51 INFO BlockManagerMaster: Trying to register BlockManager
15/03/21 16:32:51 INFO BlockManagerMasterActor: Registering block manager funshion-hadoop193:49153 with 265.4 MB RAM, BlockManagerId(<driver>, funshion-hadoop193, 49153)
15/03/21 16:32:51 INFO BlockManagerMaster: Registered BlockManager
15/03/21 16:32:55 INFO EventLoggingListener: Logging events to hdfs://funshion-hadoop193:8020/spark_log/app-20150321163250-0002
15/03/21 16:32:55 INFO SparkDeploySchedulerBackend: SchedulerBackend is ready for scheduling beginning after reached minRegisteredResourcesRatio: 0.0
15/03/21 16:32:55 INFO SparkILoop: Created spark context..
Spark context available as sc.
15/03/21 16:32:57 INFO SparkILoop: Created sql context (with Hive support)..
SQL context available as sqlContext.
15/03/21 16:32:57 INFO SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor@funshion-hadoop194:52091/user/Executor#247904344] with ID 2
15/03/21 16:32:58 INFO BlockManagerMasterActor: Registering block manager funshion-hadoop194:43331 with 265.4 MB RAM, BlockManagerId(2, funshion-hadoop194, 43331)
15/03/21 16:32:58 INFO SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor@funshion-hadoop196:38636/user/Executor#-1065092827] with ID 1
15/03/21 16:32:58 INFO SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor@funshion-hadoop195:47721/user/Executor#700969315] with ID 0
15/03/21 16:32:58 INFO BlockManagerMasterActor: Registering block manager funshion-hadoop196:58778 with 265.4 MB RAM, BlockManagerId(1, funshion-hadoop196, 58778)
15/03/21 16:32:58 INFO BlockManagerMasterActor: Registering block manager funshion-hadoop195:40409 with 265.4 MB RAM, BlockManagerId(0, funshion-hadoop195, 40409)
scala> sc
res0: org.apache.spark.SparkContext = org.apache.spark.SparkContext@4b3734e9
scala> val file = sc.textFile("hdfs://funshion-hadoop193:8020/user/hadoop/README.md")
15/03/21 16:33:12 INFO MemoryStore: ensureFreeSpace(238253) called with curMem=0, maxMem=278302556
15/03/21 16:33:12 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 232.7 KB, free 265.2 MB)
15/03/21 16:33:12 INFO MemoryStore: ensureFreeSpace(33723) called with curMem=238253, maxMem=278302556
15/03/21 16:33:12 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 32.9 KB, free 265.2 MB)
15/03/21 16:33:13 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on funshion-hadoop193:49153 (size: 32.9 KB, free: 265.4 MB)
15/03/21 16:33:13 INFO BlockManagerMaster: Updated info of block broadcast_0_piece0
15/03/21 16:33:13 INFO SparkContext: Created broadcast 0 from textFile at <console>:21
file: org.apache.spark.rdd.RDD[String] = hdfs://funshion-hadoop193:8020/user/hadoop/README.md MapPartitionsRDD[1] at textFile at <console>:21
scala> val sparks = file.filter(line => line.contains("Spark"))
sparks: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[2] at filter at <console>:23
scala> sparks.count
15/03/21 16:33:45 INFO GPLNativeCodeLoader: Loaded native gpl library from the embedded binaries
15/03/21 16:33:45 INFO LzoCodec: Successfully loaded & initialized native-lzo library [hadoop-lzo rev e8c11c2be93b965abb548411379b203dabcbce79]
15/03/21 16:33:45 INFO FileInputFormat: Total input paths to process : 1
15/03/21 16:33:45 INFO SparkContext: Starting job: count at <console>:26
15/03/21 16:33:45 INFO DAGScheduler: Got job 0 (count at <console>:26) with 2 output partitions (allowLocal=false)
15/03/21 16:33:45 INFO DAGScheduler: Final stage: Stage 0(count at <console>:26)
15/03/21 16:33:45 INFO DAGScheduler: Parents of final stage: List()
15/03/21 16:33:45 INFO DAGScheduler: Missing parents: List()
15/03/21 16:33:45 INFO DAGScheduler: Submitting Stage 0 (MapPartitionsRDD[2] at filter at <console>:23), which has no missing parents
15/03/21 16:33:46 INFO MemoryStore: ensureFreeSpace(2880) called with curMem=271976, maxMem=278302556
15/03/21 16:33:46 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 2.8 KB, free 265.1 MB)
15/03/21 16:33:46 INFO MemoryStore: ensureFreeSpace(2067) called with curMem=274856, maxMem=278302556
15/03/21 16:33:46 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 2.0 KB, free 265.1 MB)
15/03/21 16:33:46 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on funshion-hadoop193:49153 (size: 2.0 KB, free: 265.4 MB)
15/03/21 16:33:46 INFO BlockManagerMaster: Updated info of block broadcast_1_piece0
15/03/21 16:33:46 INFO SparkContext: Created broadcast 1 from broadcast at DAGScheduler.scala:839
15/03/21 16:33:46 INFO DAGScheduler: Submitting 2 missing tasks from Stage 0 (MapPartitionsRDD[2] at filter at <console>:23)
15/03/21 16:33:46 INFO TaskSchedulerImpl: Adding task set 0.0 with 2 tasks
15/03/21 16:33:46 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, funshion-hadoop195, NODE_LOCAL, 1316 bytes)
15/03/21 16:33:46 INFO TaskSetManager: Starting task 1.0 in stage 0.0 (TID 1, funshion-hadoop194, NODE_LOCAL, 1316 bytes)
15/03/21 16:33:47 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on funshion-hadoop194:43331 (size: 2.0 KB, free: 265.4 MB)
15/03/21 16:33:47 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on funshion-hadoop195:40409 (size: 2.0 KB, free: 265.4 MB)
15/03/21 16:33:47 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on funshion-hadoop194:43331 (size: 32.9 KB, free: 265.4 MB)
15/03/21 16:33:47 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on funshion-hadoop195:40409 (size: 32.9 KB, free: 265.4 MB)
15/03/21 16:33:49 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 3675 ms on funshion-hadoop195 (1/2)
15/03/21 16:33:49 INFO DAGScheduler: Stage 0 (count at <console>:26) finished in 3.761 s
15/03/21 16:33:49 INFO TaskSetManager: Finished task 1.0 in stage 0.0 (TID 1) in 3724 ms on funshion-hadoop194 (2/2)
15/03/21 16:33:49 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
15/03/21 16:33:49 INFO DAGScheduler: Job 0 finished: count at <console>:26, took 4.055437 s
res1: Long = 19
---------------------------------------------------------------------------------------------------
-- 3.3 测试Spark SQL访问Hive表:
-- 3.3.1 将hive-site.xml配置文件拷贝到/usr/local/spark/conf/,并将MySQL JDBC驱动jar包拷贝到/usr/local/spark/lib/(拷贝后,建议重启一下spark集群):
cp $HIVE_HOME/conf/hive-site.xml /usr/local/spark/conf/
cp $HIVE_HOME/lib/mysql-connector-java-5.1.17-bin.jar /usr/local/spark/lib/
-- 注意1:我是在funshion-hadoop192、funshion-hadoop193两台服务器上都跑有Hive Metastore服务
-- (如果各hive客户端都配置访问两个metastore服务端口,且两个metastore服务访问两个mysql数据库A、B;
-- 且两台mysql数据库A、B是双向复制的话,Hive层面就是真正的“HA”(高可用)了。)
-- hive metastore服务启动命令类似如下:
cd $HIVE_HOME
nohup hive --service metastore -p 10000 &
-- 启动hive metastore服务后,可以通过如下命令查看10000端口是否在监听:
[hadoop@funshion-hadoop192 hive]$ netstat -anl |grep 10000
tcp 0 0 0.0.0.0:10000 0.0.0.0:* LISTEN
tcp 0 0 192.168.117.192:10000 192.168.117.193:38363 ESTABLISHED
-- 注意2:我的hive-site.xml 文件配置类似如下:
-------------------------------------------------
<property>
<name>hive.metastore.uris</name>
<value>thrift://funshion-hadoop192:10000,thrift://funshion-hadoop193:10000</value>
</property>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://192.168.117.193:3306/hive?createDatabaseIfNotExist=true&amp;useUnicode=true&amp;characterEncoding=UTF-8</value>
<description>JDBC connect string for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
<description>Driver class name for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>hive</value>
<description>username to use against metastore database</description>
</property>
<property>
<name>hadoop.security.credential.provider.path</name>
<value>jceks://hdfs@funshion-hadoop193:8020/user/hadoop/hive/conf/hive.jceks</value>
</property>
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/home/hadoop/hive/warehouse</value>
<description>location of default database for the warehouse</description>
</property>
-------------------------------------------------
[hadoop@funshion-hadoop193 spark]$ pwd
/usr/local/spark
[hadoop@funshion-hadoop193 spark]$ ./bin/spark-shell --master spark://funshion-hadoop193:7077
Spark assembly has been built with Hive, including Datanucleus jars on classpath
15/03/21 17:19:27 INFO SecurityManager: Changing view acls to: hadoop
15/03/21 17:19:27 INFO SecurityManager: Changing modify acls to: hadoop
15/03/21 17:19:27 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(hadoop); users with modify permissions: Set(hadoop)
15/03/21 17:19:27 INFO HttpServer: Starting HTTP Server
15/03/21 17:19:27 INFO Server: jetty-8.y.z-SNAPSHOT
15/03/21 17:19:27 INFO AbstractConnector: Started SocketConnector@0.0.0.0:40063
15/03/21 17:19:27 INFO Utils: Successfully started service 'HTTP class server' on port 40063.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.3.0
      /_/
Using Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_75)
Type in expressions to have them evaluated.
Type :help for more information.
15/03/21 17:19:42 INFO SparkContext: Running Spark version 1.3.0
15/03/21 17:19:42 WARN SparkConf:
SPARK_CLASSPATH was detected (set to ':/usr/local/hadoop/lib/hadoop-lzo-0.4.20-SNAPSHOT.jar').
This is deprecated in Spark 1.0+.
Please instead use:
- ./spark-submit with --driver-class-path to augment the driver classpath
- spark.executor.extraClassPath to augment the executor classpath
15/03/21 17:19:42 WARN SparkConf: Setting 'spark.executor.extraClassPath' to ':/usr/local/hadoop/lib/hadoop-lzo-0.4.20-SNAPSHOT.jar' as a work-around.
15/03/21 17:19:42 WARN SparkConf: Setting 'spark.driver.extraClassPath' to ':/usr/local/hadoop/lib/hadoop-lzo-0.4.20-SNAPSHOT.jar' as a work-around.
15/03/21 17:19:42 INFO SecurityManager: Changing view acls to: hadoop
15/03/21 17:19:42 INFO SecurityManager: Changing modify acls to: hadoop
15/03/21 17:19:42 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(hadoop); users with modify permissions: Set(hadoop)
15/03/21 17:19:44 INFO Slf4jLogger: Slf4jLogger started
15/03/21 17:19:44 INFO Remoting: Starting remoting
15/03/21 17:19:44 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkDriver@funshion-hadoop193:45440]
15/03/21 17:19:44 INFO Utils: Successfully started service 'sparkDriver' on port 45440.
15/03/21 17:19:44 INFO SparkEnv: Registering MapOutputTracker
15/03/21 17:19:45 INFO SparkEnv: Registering BlockManagerMaster
15/03/21 17:19:45 INFO DiskBlockManager: Created local directory at /tmp/spark-69b67fe8-574d-4476-b020-06740c36c98a/blockmgr-fec621cd-f28f-424c-8854-9d7e842b5212
15/03/21 17:19:45 INFO MemoryStore: MemoryStore started with capacity 265.4 MB
15/03/21 17:19:45 INFO HttpFileServer: HTTP File server directory is /tmp/spark-7459d9d7-3f68-4f89-b16b-5a20d44b7fba/httpd-61ca1574-6693-4b38-ad22-bee627890833
15/03/21 17:19:45 INFO HttpServer: Starting HTTP Server
15/03/21 17:19:45 INFO Server: jetty-8.y.z-SNAPSHOT
15/03/21 17:19:45 INFO AbstractConnector: Started SocketConnector@0.0.0.0:54196
15/03/21 17:19:45 INFO Utils: Successfully started service 'HTTP file server' on port 54196.
15/03/21 17:19:45 INFO SparkEnv: Registering OutputCommitCoordinator
15/03/21 17:19:46 INFO Server: jetty-8.y.z-SNAPSHOT
15/03/21 17:19:46 INFO AbstractConnector: Started SelectChannelConnector@0.0.0.0:4040
15/03/21 17:19:46 INFO Utils: Successfully started service 'SparkUI' on port 4040.
15/03/21 17:19:46 INFO SparkUI: Started SparkUI at http://funshion-hadoop193:4040
15/03/21 17:19:46 INFO AppClient$ClientActor: Connecting to master akka.tcp://sparkMaster@funshion-hadoop193:7077/user/Master...
15/03/21 17:19:47 INFO SparkDeploySchedulerBackend: Connected to Spark cluster with app ID app-20150321171947-0000
15/03/21 17:19:47 INFO AppClient$ClientActor: Executor added: app-20150321171947-0000/0 on worker-20150321171905-funshion-hadoop195-43185 (funshion-hadoop195:43185) with 2 cores
15/03/21 17:19:47 INFO SparkDeploySchedulerBackend: Granted executor ID app-20150321171947-0000/0 on hostPort funshion-hadoop195:43185 with 2 cores, 512.0 MB RAM
15/03/21 17:19:47 INFO AppClient$ClientActor: Executor added: app-20150321171947-0000/1 on worker-20150321171905-funshion-hadoop194-34245 (funshion-hadoop194:34245) with 2 cores
15/03/21 17:19:47 INFO SparkDeploySchedulerBackend: Granted executor ID app-20150321171947-0000/1 on hostPort funshion-hadoop194:34245 with 2 cores, 512.0 MB RAM
15/03/21 17:19:47 INFO AppClient$ClientActor: Executor added: app-20150321171947-0000/2 on worker-20150321171905-funshion-hadoop196-48202 (funshion-hadoop196:48202) with 2 cores
15/03/21 17:19:47 INFO SparkDeploySchedulerBackend: Granted executor ID app-20150321171947-0000/2 on hostPort funshion-hadoop196:48202 with 2 cores, 512.0 MB RAM
15/03/21 17:19:48 INFO AppClient$ClientActor: Executor updated: app-20150321171947-0000/0 is now RUNNING
15/03/21 17:19:48 INFO AppClient$ClientActor: Executor updated: app-20150321171947-0000/1 is now RUNNING
15/03/21 17:19:48 INFO AppClient$ClientActor: Executor updated: app-20150321171947-0000/2 is now LOADING
15/03/21 17:19:48 INFO AppClient$ClientActor: Executor updated: app-20150321171947-0000/2 is now RUNNING
15/03/21 17:19:48 INFO AppClient$ClientActor: Executor updated: app-20150321171947-0000/1 is now LOADING
15/03/21 17:19:48 INFO AppClient$ClientActor: Executor updated: app-20150321171947-0000/0 is now LOADING
15/03/21 17:19:48 INFO NettyBlockTransferService: Server created on 56884
15/03/21 17:19:48 INFO BlockManagerMaster: Trying to register BlockManager
15/03/21 17:19:48 INFO BlockManagerMasterActor: Registering block manager funshion-hadoop193:56884 with 265.4 MB RAM, BlockManagerId(<driver>, funshion-hadoop193, 56884)
15/03/21 17:19:48 INFO BlockManagerMaster: Registered BlockManager
15/03/21 17:19:51 INFO EventLoggingListener: Logging events to hdfs://funshion-hadoop193:8020/spark_log/app-20150321171947-0000
15/03/21 17:19:51 INFO SparkDeploySchedulerBackend: SchedulerBackend is ready for scheduling beginning after reached minRegisteredResourcesRatio: 0.0
15/03/21 17:19:51 INFO SparkILoop: Created spark context..
Spark context available as sc.
15/03/21 17:19:53 INFO SparkILoop: Created sql context (with Hive support)..
SQL context available as sqlContext.
15/03/21 17:19:55 INFO SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor@funshion-hadoop194:58559/user/Executor#-1666693618] with ID 1
15/03/21 17:19:56 INFO SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor@funshion-hadoop195:44023/user/Executor#2077708725] with ID 0
15/03/21 17:19:56 INFO SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor@funshion-hadoop196:55503/user/Executor#282621553] with ID 2
15/03/21 17:19:56 INFO BlockManagerMasterActor: Registering block manager funshion-hadoop194:41519 with 265.4 MB RAM, BlockManagerId(1, funshion-hadoop194, 41519)
15/03/21 17:19:56 INFO BlockManagerMasterActor: Registering block manager funshion-hadoop195:35169 with 265.4 MB RAM, BlockManagerId(0, funshion-hadoop195, 35169)
15/03/21 17:19:56 INFO BlockManagerMasterActor: Registering block manager funshion-hadoop196:40584 with 265.4 MB RAM, BlockManagerId(2, funshion-hadoop196, 40584)
scala> sc
res0: org.apache.spark.SparkContext = org.apache.spark.SparkContext@53077f45
scala> val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)
sqlContext: org.apache.spark.sql.hive.HiveContext = org.apache.spark.sql.hive.HiveContext@78624930
scala> sqlContext.sql("FROM web.pv2 SELECT time, ip, fck, mac, userid, fpc, version limit 10").collect().foreach(println)
15/03/21 17:20:24 INFO metastore: Trying to connect to metastore with URI thrift://funshion-hadoop192:10000
15/03/21 17:20:24 INFO metastore: Connected to metastore.
15/03/21 17:20:25 INFO SessionState: No Tez session required at this point. hive.execution.engine=mr.
15/03/21 17:20:26 INFO ParseDriver: Parsing command: FROM web.pv2 SELECT time, ip, fck, mac, userid, fpc, version limit 10
15/03/21 17:20:26 INFO ParseDriver: Parse Completed
15/03/21 17:20:30 INFO deprecation: mapred.map.tasks is deprecated. Instead, use mapreduce.job.maps
15/03/21 17:20:30 INFO MemoryStore: ensureFreeSpace(392934) called with curMem=0, maxMem=278302556
15/03/21 17:20:30 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 383.7 KB, free 265.0 MB)
15/03/21 17:20:31 INFO MemoryStore: ensureFreeSpace(70953) called with curMem=392934, maxMem=278302556
15/03/21 17:20:31 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 69.3 KB, free 265.0 MB)
15/03/21 17:20:31 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on funshion-hadoop193:56884 (size: 69.3 KB, free: 265.3 MB)
15/03/21 17:20:31 INFO BlockManagerMaster: Updated info of block broadcast_0_piece0
15/03/21 17:20:31 INFO SparkContext: Created broadcast 0 from broadcast at TableReader.scala:74
15/03/21 17:20:39 INFO GPLNativeCodeLoader: Loaded native gpl library from the embedded binaries
15/03/21 17:20:39 INFO LzoCodec: Successfully loaded & initialized native-lzo library [hadoop-lzo rev e8c11c2be93b965abb548411379b203dabcbce79]
15/03/21 17:20:39 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:40 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:40 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:40 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:40 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:40 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:41 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:41 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:41 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:41 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:41 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:41 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:41 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:41 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:42 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:42 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:42 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:42 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:42 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:42 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:42 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:42 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:42 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:42 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:42 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:42 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:42 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:43 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:43 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:43 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:43 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:43 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:43 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:43 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:43 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:43 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:43 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:43 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:43 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:44 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:44 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:44 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:44 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:44 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:44 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:44 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:44 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:44 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:44 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:44 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:44 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:44 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:44 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:45 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:45 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:45 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:45 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:45 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:45 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:45 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:45 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:45 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:45 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:45 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:46 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:46 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:46 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:46 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:46 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:46 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:46 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:46 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:46 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:46 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:46 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:47 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:47 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:47 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:47 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:47 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:47 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:47 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:47 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:47 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:47 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:47 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:47 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:48 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:48 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:48 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:48 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:48 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:48 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:48 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:48 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:48 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:48 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:48 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:49 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:49 INFO NetworkTopology: Adding a new node: /default-rack/192.168.117.194:50010
15/03/21 17:20:49 INFO NetworkTopology: Adding a new node: /default-rack/192.168.117.196:50010
15/03/21 17:20:49 INFO NetworkTopology: Adding a new node: /default-rack/192.168.117.195:50010
15/03/21 17:20:49 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:49 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:49 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:49 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:49 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:49 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:49 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:49 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:49 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:49 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:49 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:49 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:50 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:50 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:50 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:50 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:50 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:50 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:50 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:50 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:50 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:50 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:50 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:50 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:50 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:50 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:51 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:51 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:51 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:51 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:51 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:51 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:51 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:51 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:51 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:51 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:51 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:51 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:51 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:51 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:51 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:52 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:52 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:52 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:52 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:52 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:52 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:52 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:52 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:52 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:52 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:52 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:52 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:52 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:52 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:53 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:53 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:53 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:53 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:53 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:53 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:53 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:53 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:53 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:53 INFO NetworkTopology: Adding a new node: /default-rack/192.168.117.196:50010
15/03/21 17:20:53 INFO NetworkTopology: Adding a new node: /default-rack/192.168.117.195:50010
15/03/21 17:20:53 INFO NetworkTopology: Adding a new node: /default-rack/192.168.117.194:50010
15/03/21 17:20:53 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:53 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:53 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:53 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:53 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:53 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:53 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:54 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:54 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:54 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:54 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:54 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:54 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:54 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:54 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:54 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:54 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:54 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:54 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:54 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:54 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:54 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:55 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:55 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:55 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:55 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:55 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:55 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:55 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:55 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:55 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:55 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:55 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:55 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:55 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:56 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:56 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:56 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:56 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:56 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:56 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:56 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:56 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:56 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:56 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:56 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:56 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:56 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:56 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:56 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:56 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:56 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:57 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:57 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:57 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:57 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:57 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:57 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:57 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:57 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:57 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:57 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:57 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:57 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:20:57 INFO SparkContext: Starting job: runJob at SparkPlan.scala:121
15/03/21 17:20:57 INFO DAGScheduler: Got job 0 (runJob at SparkPlan.scala:121) with 1 output partitions (allowLocal=false)
15/03/21 17:20:57 INFO DAGScheduler: Final stage: Stage 0(runJob at SparkPlan.scala:121)
15/03/21 17:20:57 INFO DAGScheduler: Parents of final stage: List()
15/03/21 17:20:58 INFO DAGScheduler: Missing parents: List()
15/03/21 17:20:58 INFO DAGScheduler: Submitting Stage 0 (MapPartitionsRDD[682] at map at SparkPlan.scala:96), which has no missing parents
15/03/21 17:20:59 INFO MemoryStore: ensureFreeSpace(231264) called with curMem=463887, maxMem=278302556
15/03/21 17:20:59 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 225.8 KB, free 264.7 MB)
15/03/21 17:20:59 INFO MemoryStore: ensureFreeSpace(155760) called with curMem=695151, maxMem=278302556
15/03/21 17:20:59 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 152.1 KB, free 264.6 MB)
15/03/21 17:20:59 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on funshion-hadoop193:56884 (size: 152.1 KB, free: 265.2 MB)
15/03/21 17:20:59 INFO BlockManagerMaster: Updated info of block broadcast_1_piece0
15/03/21 17:20:59 INFO SparkContext: Created broadcast 1 from broadcast at DAGScheduler.scala:839
15/03/21 17:20:59 INFO DAGScheduler: Submitting 1 missing tasks from Stage 0 (MapPartitionsRDD[682] at map at SparkPlan.scala:96)
15/03/21 17:20:59 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks
15/03/21 17:20:59 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, funshion-hadoop196, NODE_LOCAL, 1476 bytes)
15/03/21 17:21:00 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on funshion-hadoop196:40584 (size: 152.1 KB, free: 265.3 MB)
15/03/21 17:21:02 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on funshion-hadoop196:40584 (size: 69.3 KB, free: 265.2 MB)
15/03/21 17:21:06 INFO DAGScheduler: Stage 0 (runJob at SparkPlan.scala:121) finished in 7.570 s
15/03/21 17:21:06 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 7549 ms on funshion-hadoop196 (1/1)
15/03/21 17:21:06 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
15/03/21 17:21:06 INFO DAGScheduler: Job 0 finished: runJob at SparkPlan.scala:121, took 9.200024 s
[1425700800,106.39.223.13,142566222966d10,,0,,]
[1425700800,171.126.92.234,1419652425640v4,001E90B48B29,0,uoc_0_,3.0.3.45]
[1425700800,115.48.155.99,142278045504f40,48D22446F0FD,0,uoc_0_,3.0.3.45]
[1425700800,42.84.215.124,1425297728b6ec5,8C89A57242F8,0,uoc_0_,3.0.1.30]
[1425700800,27.36.219.185,142570079711a1a,,0,,]
[1425700800,42.63.106.214,142570079690d2b,,0,,]
[1425700800,119.177.15.114,14241507820428d,00245404264E,0,uoc_0_,3.0.1.30]
[1425700800,42.63.106.214,1425700796594da,,0,,]
[1425700800,180.149.143.146,1425700800d0502,,0,,]
[1425700800,111.201.153.164,1378541151a3eea,E0B9A51A05E0,0,oin_0_,3.0.3.45]
scala> 15/03/21 17:25:04 INFO BlockManager: Removing broadcast 1
15/03/21 17:25:04 INFO BlockManager: Removing block broadcast_1_piece0
15/03/21 17:25:04 INFO MemoryStore: Block broadcast_1_piece0 of size 155760 dropped from memory (free 277607405)
15/03/21 17:25:04 INFO BlockManagerInfo: Removed broadcast_1_piece0 on funshion-hadoop193:56884 in memory (size: 152.1 KB, free: 265.3 MB)
15/03/21 17:25:04 INFO BlockManagerMaster: Updated info of block broadcast_1_piece0
15/03/21 17:25:04 INFO BlockManager: Removing block broadcast_1
15/03/21 17:25:04 INFO MemoryStore: Block broadcast_1 of size 231264 dropped from memory (free 277838669)
15/03/21 17:25:04 INFO BlockManagerInfo: Removed broadcast_1_piece0 on funshion-hadoop196:40584 in memory (size: 152.1 KB, free: 265.3 MB)
15/03/21 17:25:05 INFO ContextCleaner: Cleaned broadcast 1
scala> sqlContext.sql("FROM web.pv2 SELECT count(*) WHERE year='2015' and month='03' and day='09' and hour='10'").collect().foreach(println)
15/03/21 17:33:53 INFO ParseDriver: Parsing command: FROM web.pv2 SELECT count(*) WHERE year='2015' and month='03' and day='09' and hour='10'
15/03/21 17:33:53 INFO ParseDriver: Parse Completed
15/03/21 17:33:54 INFO MemoryStore: ensureFreeSpace(387646) called with curMem=463887, maxMem=278302556
15/03/21 17:33:54 INFO MemoryStore: Block broadcast_2 stored as values in memory (estimated size 378.6 KB, free 264.6 MB)
15/03/21 17:33:55 INFO MemoryStore: ensureFreeSpace(70619) called with curMem=851533, maxMem=278302556
15/03/21 17:33:55 INFO MemoryStore: Block broadcast_2_piece0 stored as bytes in memory (estimated size 69.0 KB, free 264.5 MB)
15/03/21 17:33:55 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on funshion-hadoop193:56884 (size: 69.0 KB, free: 265.3 MB)
15/03/21 17:33:55 INFO BlockManagerMaster: Updated info of block broadcast_2_piece0
15/03/21 17:33:55 INFO SparkContext: Created broadcast 2 from broadcast at TableReader.scala:74
15/03/21 17:33:56 INFO SparkContext: Starting job: collect at SparkPlan.scala:83
15/03/21 17:33:57 INFO FileInputFormat: Total input paths to process : 4
15/03/21 17:33:57 INFO DAGScheduler: Registering RDD 688 (mapPartitions at Exchange.scala:100)
15/03/21 17:33:57 INFO DAGScheduler: Got job 1 (collect at SparkPlan.scala:83) with 1 output partitions (allowLocal=false)
15/03/21 17:33:57 INFO DAGScheduler: Final stage: Stage 2(collect at SparkPlan.scala:83)
15/03/21 17:33:57 INFO DAGScheduler: Parents of final stage: List(Stage 1)
15/03/21 17:33:57 INFO DAGScheduler: Missing parents: List(Stage 1)
15/03/21 17:33:57 INFO DAGScheduler: Submitting Stage 1 (MapPartitionsRDD[688] at mapPartitions at Exchange.scala:100), which has no missing parents
15/03/21 17:33:57 INFO MemoryStore: ensureFreeSpace(202320) called with curMem=922152, maxMem=278302556
15/03/21 17:33:57 INFO MemoryStore: Block broadcast_3 stored as values in memory (estimated size 197.6 KB, free 264.3 MB)
15/03/21 17:33:57 INFO MemoryStore: ensureFreeSpace(129167) called with curMem=1124472, maxMem=278302556
15/03/21 17:33:57 INFO MemoryStore: Block broadcast_3_piece0 stored as bytes in memory (estimated size 126.1 KB, free 264.2 MB)
15/03/21 17:33:57 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on funshion-hadoop193:56884 (size: 126.1 KB, free: 265.2 MB)
15/03/21 17:33:57 INFO BlockManagerMaster: Updated info of block broadcast_3_piece0
15/03/21 17:33:57 INFO SparkContext: Created broadcast 3 from broadcast at DAGScheduler.scala:839
15/03/21 17:33:57 INFO DAGScheduler: Submitting 3 missing tasks from Stage 1 (MapPartitionsRDD[688] at mapPartitions at Exchange.scala:100)
15/03/21 17:33:57 INFO TaskSchedulerImpl: Adding task set 1.0 with 3 tasks
15/03/21 17:33:57 INFO TaskSetManager: Starting task 0.0 in stage 1.0 (TID 1, funshion-hadoop196, NODE_LOCAL, 1465 bytes)
15/03/21 17:33:57 INFO TaskSetManager: Starting task 1.0 in stage 1.0 (TID 2, funshion-hadoop194, NODE_LOCAL, 1466 bytes)
15/03/21 17:33:57 INFO TaskSetManager: Starting task 2.0 in stage 1.0 (TID 3, funshion-hadoop195, NODE_LOCAL, 1466 bytes)
15/03/21 17:33:57 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on funshion-hadoop196:40584 (size: 126.1 KB, free: 265.2 MB)
15/03/21 17:33:58 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on funshion-hadoop196:40584 (size: 69.0 KB, free: 265.2 MB)
15/03/21 17:33:58 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on funshion-hadoop195:35169 (size: 126.1 KB, free: 265.3 MB)
15/03/21 17:33:58 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on funshion-hadoop194:41519 (size: 126.1 KB, free: 265.3 MB)
15/03/21 17:34:01 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on funshion-hadoop195:35169 (size: 69.0 KB, free: 265.2 MB)
15/03/21 17:34:01 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on funshion-hadoop194:41519 (size: 69.0 KB, free: 265.2 MB)
15/03/21 17:34:02 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 1) in 4500 ms on funshion-hadoop196 (1/3)
15/03/21 17:34:05 INFO TaskSetManager: Finished task 2.0 in stage 1.0 (TID 3) in 8102 ms on funshion-hadoop195 (2/3)
15/03/21 17:34:08 INFO DAGScheduler: Stage 1 (mapPartitions at Exchange.scala:100) finished in 10.438 s
15/03/21 17:34:08 INFO DAGScheduler: looking for newly runnable stages
15/03/21 17:34:08 INFO DAGScheduler: running: Set()
15/03/21 17:34:08 INFO TaskSetManager: Finished task 1.0 in stage 1.0 (TID 2) in 10440 ms on funshion-hadoop194 (3/3)
15/03/21 17:34:08 INFO TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool
15/03/21 17:34:08 INFO DAGScheduler: waiting: Set(Stage 2)
15/03/21 17:34:08 INFO DAGScheduler: failed: Set()
15/03/21 17:34:08 INFO DAGScheduler: Missing parents for Stage 2: List()
15/03/21 17:34:08 INFO DAGScheduler: Submitting Stage 2 (MapPartitionsRDD[692] at map at SparkPlan.scala:83), which is now runnable
15/03/21 17:34:08 INFO MemoryStore: ensureFreeSpace(200192) called with curMem=1253639, maxMem=278302556
15/03/21 17:34:08 INFO MemoryStore: Block broadcast_4 stored as values in memory (estimated size 195.5 KB, free 264.0 MB)
15/03/21 17:34:08 INFO MemoryStore: ensureFreeSpace(127644) called with curMem=1453831, maxMem=278302556
15/03/21 17:34:08 INFO MemoryStore: Block broadcast_4_piece0 stored as bytes in memory (estimated size 124.7 KB, free 263.9 MB)
15/03/21 17:34:08 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on funshion-hadoop193:56884 (size: 124.7 KB, free: 265.0 MB)
15/03/21 17:34:08 INFO BlockManagerMaster: Updated info of block broadcast_4_piece0
15/03/21 17:34:08 INFO SparkContext: Created broadcast 4 from broadcast at DAGScheduler.scala:839
15/03/21 17:34:08 INFO DAGScheduler: Submitting 1 missing tasks from Stage 2 (MapPartitionsRDD[692] at map at SparkPlan.scala:83)
15/03/21 17:34:08 INFO TaskSchedulerImpl: Adding task set 2.0 with 1 tasks
15/03/21 17:34:08 INFO TaskSetManager: Starting task 0.0 in stage 2.0 (TID 4, funshion-hadoop194, PROCESS_LOCAL, 1056 bytes)
15/03/21 17:34:08 INFO BlockManager: Removing broadcast 3
15/03/21 17:34:08 INFO BlockManager: Removing block broadcast_3_piece0
15/03/21 17:34:08 INFO MemoryStore: Block broadcast_3_piece0 of size 129167 dropped from memory (free 276850248)
15/03/21 17:34:08 INFO BlockManagerInfo: Removed broadcast_3_piece0 on funshion-hadoop193:56884 in memory (size: 126.1 KB, free: 265.2 MB)
15/03/21 17:34:08 INFO BlockManagerMaster: Updated info of block broadcast_3_piece0
15/03/21 17:34:08 INFO BlockManager: Removing block broadcast_3
15/03/21 17:34:08 INFO MemoryStore: Block broadcast_3 of size 202320 dropped from memory (free 277052568)
15/03/21 17:34:08 INFO BlockManagerInfo: Removed broadcast_3_piece0 on funshion-hadoop196:40584 in memory (size: 126.1 KB, free: 265.3 MB)
15/03/21 17:34:08 INFO BlockManagerInfo: Removed broadcast_3_piece0 on funshion-hadoop195:35169 in memory (size: 126.1 KB, free: 265.3 MB)
15/03/21 17:34:08 INFO BlockManagerInfo: Removed broadcast_3_piece0 on funshion-hadoop194:41519 in memory (size: 126.1 KB, free: 265.3 MB)
15/03/21 17:34:08 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on funshion-hadoop194:41519 (size: 124.7 KB, free: 265.2 MB)
15/03/21 17:34:08 INFO ContextCleaner: Cleaned broadcast 3
15/03/21 17:34:08 INFO MapOutputTrackerMasterActor: Asked to send map output locations for shuffle 0 to sparkExecutor@funshion-hadoop194:58559
15/03/21 17:34:08 INFO MapOutputTrackerMaster: Size of output statuses for shuffle 0 is 176 bytes
15/03/21 17:34:09 INFO DAGScheduler: Stage 2 (collect at SparkPlan.scala:83) finished in 1.120 s
15/03/21 17:34:09 INFO DAGScheduler: Job 1 finished: collect at SparkPlan.scala:83, took 12.743659 s
15/03/21 17:34:09 INFO TaskSetManager: Finished task 0.0 in stage 2.0 (TID 4) in 1055 ms on funshion-hadoop194 (1/1)
15/03/21 17:34:09 INFO TaskSchedulerImpl: Removed TaskSet 2.0, whose tasks have all completed, from pool
[1302875]
-- 然后用hive验证两个查询:
[hadoop@funshion-hadoop193 lib]$ hive
Logging initialized using configuration in file:/usr/local/apache-hive-1.0.0-bin/conf/hive-log4j.properties
hive> use web;
OK
Time taken: 1.194 seconds
hive> FROM web.pv2 SELECT time, ip, fck, mac, userid, fpc, version limit 10;
OK
1425139200	42.236.234.126	1405150429lj8hn	AC220B7F6748	0	uoc_0_	3.0.1.30
1425139200	218.29.215.246	1425139395a9cef		0		
1425139200	58.243.98.165	14251391979c831		0		
1425139200	123.125.71.50	142513920049edd		0		
1425139200	125.44.54.118	137856542564zl4	20CF30E648AB	0	uoc_0_	3.0.1.30
1425139200	122.139.44.143	1425139262d0717		0		
1425139200	221.215.146.34	1414606324dx62z	DFBE2ED3B408	0	uoc_0_	3.0.3.36
1425139200	42.237.191.77	14251392436991e		0		
1425139200	123.119.227.3	1425139201c570b		0		
1425139200	42.237.191.77	14251392436991e		0		
Time taken: 4.856 seconds, Fetched: 10 row(s)
hive> FROM web.pv2 SELECT count(*) WHERE year='2015' and month='03' and day='09' and hour='10';
Query ID = hadoop_20150321173737_db387447-29af-4199-80c8-85aa01070f67
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1426913373071_0013, Tracking URL = http://funshion-hadoop193:8088/proxy/application_1426913373071_0013/
Kill Command = /usr/local/hadoop/bin/hadoop job -kill job_1426913373071_0013
Hadoop job information for Stage-1: number of mappers: 3; number of reducers: 1
2015-03-21 17:37:50,761 Stage-1 map = 0%, reduce = 0%
2015-03-21 17:38:04,307 Stage-1 map = 33%, reduce = 0%, Cumulative CPU 4.39 sec
2015-03-21 17:38:07,589 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 19.98 sec
2015-03-21 17:38:21,527 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 23.52 sec
MapReduce Total cumulative CPU time: 23 seconds 520 msec
Ended Job = job_1426913373071_0013
MapReduce Jobs Launched:
Stage-Stage-1: Map: 3 Reduce: 1 Cumulative CPU: 23.52 sec HDFS Read: 229940119 HDFS Write: 8 SUCCESS
Total MapReduce CPU Time Spent: 23 seconds 520 msec
OK
1302875
Time taken: 53.738 seconds, Fetched: 1 row(s)
hive>
-- 第一个查询:由于查询中没有ORDER BY,各引擎执行limit时读取数据的顺序可能不同,所以两边返回的10行不一样完全可以理解(不算错误)。
-- 第二个查询,返回的结果均是 1302875 ,结果完全一致,到此该步结束。
---------------------------------------------------------------------------------------------------
-- 3.4 测试Spark SQL访问HDFS的Json文件:
-- 参考:http://spark.apache.org/docs/latest/sql-programming-guide.html#json-datasets
-- 的“JSON Datasets”章节
[hadoop@funshion-hadoop193 spark]$ hdfs dfs -copyFromLocal /usr/local/spark/examples/src/main/resources/people.json hdfs://funshion-hadoop193:8020/user/hadoop/
[hadoop@funshion-hadoop193 spark]$ hdfs dfs -ls /user/hadoop
Found 5 items
drwx------ - hadoop supergroup 0 2015-03-22 08:00 /user/hadoop/.Trash
drwxr-xr-x - hadoop supergroup 0 2015-03-21 15:05 /user/hadoop/.sparkStaging
-rw-r--r-- 3 hadoop supergroup 3629 2015-03-21 16:28 /user/hadoop/README.md
drwxr-xr-x - hadoop supergroup 0 2015-03-20 10:28 /user/hadoop/hive
-rw-r--r-- 3 hadoop supergroup 73 2015-03-22 14:10 /user/hadoop/people.json
[hadoop@funshion-hadoop193 spark]$ pwd
/usr/local/spark
[hadoop@funshion-hadoop193 spark]$ hdfs dfs -cat hdfs://funshion-hadoop193:8020/user/hadoop/people.json
{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}
-- 如上两步所示,我们成功将people.json文件上传到hadoop集群的/user/hadoop目录下,接下来对该HDFS文件操作:
[hadoop@funshion-hadoop193 spark]$ pwd
/usr/local/spark
[hadoop@funshion-hadoop193 spark]$ ./bin/spark-shell --master spark://funshion-hadoop193:7077
Spark assembly has been built with Hive, including Datanucleus jars on classpath
15/03/22 14:13:08 INFO SecurityManager: Changing view acls to: hadoop
15/03/22 14:13:08 INFO SecurityManager: Changing modify acls to: hadoop
15/03/22 14:13:08 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(hadoop); users with modify permissions: Set(hadoop)
15/03/22 14:13:08 INFO HttpServer: Starting HTTP Server
15/03/22 14:13:08 INFO Server: jetty-8.y.z-SNAPSHOT
15/03/22 14:13:08 INFO AbstractConnector: Started SocketConnector@0.0.0.0:39459
15/03/22 14:13:09 INFO Utils: Successfully started service 'HTTP class server' on port 39459.
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 1.3.0
/_/
Using Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_75)
Type in expressions to have them evaluated.
Type :help for more information.
15/03/22 14:13:23 INFO SparkContext: Running Spark version 1.3.0
15/03/22 14:13:23 WARN SparkConf:
SPARK_CLASSPATH was detected (set to ':/usr/local/hadoop/lib/hadoop-lzo-0.4.20-SNAPSHOT.jar').
This is deprecated in Spark 1.0+.
Please instead use:
- ./spark-submit with --driver-class-path to augment the driver classpath
- spark.executor.extraClassPath to augment the executor classpath
15/03/22 14:13:23 WARN SparkConf: Setting 'spark.executor.extraClassPath' to ':/usr/local/hadoop/lib/hadoop-lzo-0.4.20-SNAPSHOT.jar' as a work-around.
15/03/22 14:13:23 WARN SparkConf: Setting 'spark.driver.extraClassPath' to ':/usr/local/hadoop/lib/hadoop-lzo-0.4.20-SNAPSHOT.jar' as a work-around.
15/03/22 14:13:23 INFO SecurityManager: Changing view acls to: hadoop
15/03/22 14:13:23 INFO SecurityManager: Changing modify acls to: hadoop
15/03/22 14:13:23 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(hadoop); users with modify permissions: Set(hadoop)
15/03/22 14:13:25 INFO Slf4jLogger: Slf4jLogger started
15/03/22 14:13:25 INFO Remoting: Starting remoting
15/03/22 14:13:25 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkDriver@funshion-hadoop193:56107]
15/03/22 14:13:25 INFO Utils: Successfully started service 'sparkDriver' on port 56107.
15/03/22 14:13:25 INFO SparkEnv: Registering MapOutputTracker
15/03/22 14:13:25 INFO SparkEnv: Registering BlockManagerMaster
15/03/22 14:13:25 INFO DiskBlockManager: Created local directory at /tmp/spark-be4233be-4ef0-4251-940b-08c620766731/blockmgr-ae7a8197-0325-4e20-84ec-11391e93fe05
15/03/22 14:13:25 INFO MemoryStore: MemoryStore started with capacity 265.4 MB
15/03/22 14:13:26 INFO HttpFileServer: HTTP File server directory is /tmp/spark-a44278e3-078f-4587-a0bc-88c152936d7b/httpd-994d47ce-ca02-466d-a8fb-3196d45bcf49
15/03/22 14:13:26 INFO HttpServer: Starting HTTP Server
15/03/22 14:13:26 INFO Server: jetty-8.y.z-SNAPSHOT
15/03/22 14:13:26 INFO AbstractConnector: Started SocketConnector@0.0.0.0:57374
15/03/22 14:13:26 INFO Utils: Successfully started service 'HTTP file server' on port 57374.
15/03/22 14:13:26 INFO SparkEnv: Registering OutputCommitCoordinator
15/03/22 14:13:27 INFO Server: jetty-8.y.z-SNAPSHOT
15/03/22 14:13:27 INFO AbstractConnector: Started SelectChannelConnector@0.0.0.0:4040
15/03/22 14:13:27 INFO Utils: Successfully started service 'SparkUI' on port 4040.
15/03/22 14:13:27 INFO SparkUI: Started SparkUI at http://funshion-hadoop193:4040
15/03/22 14:13:28 INFO AppClient$ClientActor: Connecting to master akka.tcp://sparkMaster@funshion-hadoop193:7077/user/Master...
15/03/22 14:13:29 INFO SparkDeploySchedulerBackend: Connected to Spark cluster with app ID app-20150322141329-0003
15/03/22 14:13:29 INFO AppClient$ClientActor: Executor added: app-20150322141329-0003/0 on worker-20150321171905-funshion-hadoop195-43185 (funshion-hadoop195:43185) with 2 cores
15/03/22 14:13:29 INFO SparkDeploySchedulerBackend: Granted executor ID app-20150322141329-0003/0 on hostPort funshion-hadoop195:43185 with 2 cores, 512.0 MB RAM
15/03/22 14:13:29 INFO AppClient$ClientActor: Executor added: app-20150322141329-0003/1 on worker-20150321171905-funshion-hadoop194-34245 (funshion-hadoop194:34245) with 2 cores
15/03/22 14:13:29 INFO SparkDeploySchedulerBackend: Granted executor ID app-20150322141329-0003/1 on hostPort funshion-hadoop194:34245 with 2 cores, 512.0 MB RAM
15/03/22 14:13:29 INFO AppClient$ClientActor: Executor added: app-20150322141329-0003/2 on worker-20150321171905-funshion-hadoop196-48202 (funshion-hadoop196:48202) with 2 cores
15/03/22 14:13:29 INFO SparkDeploySchedulerBackend: Granted executor ID app-20150322141329-0003/2 on hostPort funshion-hadoop196:48202 with 2 cores, 512.0 MB RAM
15/03/22 14:13:29 INFO AppClient$ClientActor: Executor updated: app-20150322141329-0003/0 is now LOADING
15/03/22 14:13:29 INFO AppClient$ClientActor: Executor updated: app-20150322141329-0003/2 is now LOADING
15/03/22 14:13:29 INFO AppClient$ClientActor: Executor updated: app-20150322141329-0003/1 is now LOADING
15/03/22 14:13:29 INFO AppClient$ClientActor: Executor updated: app-20150322141329-0003/0 is now RUNNING
15/03/22 14:13:29 INFO AppClient$ClientActor: Executor updated: app-20150322141329-0003/1 is now RUNNING
15/03/22 14:13:29 INFO AppClient$ClientActor: Executor updated: app-20150322141329-0003/2 is now RUNNING
15/03/22 14:13:29 INFO NettyBlockTransferService: Server created on 53560
15/03/22 14:13:29 INFO BlockManagerMaster: Trying to register BlockManager
15/03/22 14:13:29 INFO BlockManagerMasterActor: Registering block manager funshion-hadoop193:53560 with 265.4 MB RAM, BlockManagerId(<driver>, funshion-hadoop193, 53560)
15/03/22 14:13:29 INFO BlockManagerMaster: Registered BlockManager
15/03/22 14:13:32 INFO EventLoggingListener: Logging events to hdfs://funshion-hadoop193:8020/spark_log/app-20150322141329-0003
15/03/22 14:13:32 INFO SparkDeploySchedulerBackend: SchedulerBackend is ready for scheduling beginning after reached minRegisteredResourcesRatio: 0.0
15/03/22 14:13:32 INFO SparkILoop: Created spark context..
Spark context available as sc.
15/03/22 14:13:34 INFO SparkILoop: Created sql context (with Hive support)..
SQL context available as sqlContext.
15/03/22 14:13:36 INFO SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor@funshion-hadoop195:33241/user/Executor#161045615] with ID 0
15/03/22 14:13:37 INFO SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor@funshion-hadoop196:42295/user/Executor#-915975088] with ID 2
15/03/22 14:13:37 INFO SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor@funshion-hadoop194:39398/user/Executor#495772963] with ID 1
15/03/22 14:13:37 INFO BlockManagerMasterActor: Registering block manager funshion-hadoop195:43851 with 265.4 MB RAM, BlockManagerId(0, funshion-hadoop195, 43851)
15/03/22 14:13:37 INFO BlockManagerMasterActor: Registering block manager funshion-hadoop196:60072 with 265.4 MB RAM, BlockManagerId(2, funshion-hadoop196, 60072)
15/03/22 14:13:37 INFO BlockManagerMasterActor: Registering block manager funshion-hadoop194:32982 with 265.4 MB RAM, BlockManagerId(1, funshion-hadoop194, 32982)
scala> sc
res0: org.apache.spark.SparkContext = org.apache.spark.SparkContext@3e45d316
scala> val sqlContext = new org.apache.spark.sql.SQLContext(sc)
sqlContext: org.apache.spark.sql.SQLContext = org.apache.spark.sql.SQLContext@15b7d9b8
scala> val df = sqlContext.jsonFile("hdfs://funshion-hadoop193:8020/user/hadoop/people.json")
15/03/22 14:15:09 INFO MemoryStore: ensureFreeSpace(238253) called with curMem=0, maxMem=278302556
15/03/22 14:15:09 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 232.7 KB, free 265.2 MB)
15/03/22 14:15:09 INFO MemoryStore: ensureFreeSpace(33723) called with curMem=238253, maxMem=278302556
15/03/22 14:15:09 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 32.9 KB, free 265.2 MB)
15/03/22 14:15:09 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on funshion-hadoop193:53560 (size: 32.9 KB, free: 265.4 MB)
15/03/22 14:15:09 INFO BlockManagerMaster: Updated info of block broadcast_0_piece0
15/03/22 14:15:09 INFO SparkContext: Created broadcast 0 from textFile at JSONRelation.scala:98
15/03/22 14:15:10 INFO GPLNativeCodeLoader: Loaded native gpl library from the embedded binaries
15/03/22 14:15:10 INFO LzoCodec: Successfully loaded & initialized native-lzo library [hadoop-lzo rev e8c11c2be93b965abb548411379b203dabcbce79]
15/03/22 14:15:10 INFO FileInputFormat: Total input paths to process : 1
15/03/22 14:15:10 INFO SparkContext: Starting job: reduce at JsonRDD.scala:51
15/03/22 14:15:10 INFO DAGScheduler: Got job 0 (reduce at JsonRDD.scala:51) with 2 output partitions (allowLocal=false)
15/03/22 14:15:10 INFO DAGScheduler: Final stage: Stage 0(reduce at JsonRDD.scala:51)
15/03/22 14:15:10 INFO DAGScheduler: Parents of final stage: List()
15/03/22 14:15:10 INFO DAGScheduler: Missing parents: List()
15/03/22 14:15:10 INFO DAGScheduler: Submitting Stage 0 (MapPartitionsRDD[3] at map at JsonRDD.scala:51), which has no missing parents
15/03/22 14:15:10 INFO MemoryStore: ensureFreeSpace(3216) called with curMem=271976, maxMem=278302556
15/03/22 14:15:10 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 3.1 KB, free 265.1 MB)
15/03/22 14:15:10 INFO MemoryStore: ensureFreeSpace(2285) called with curMem=275192, maxMem=278302556
15/03/22 14:15:10 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 2.2 KB, free 265.1 MB)
15/03/22 14:15:10 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on funshion-hadoop193:53560 (size: 2.2 KB, free: 265.4 MB)
15/03/22 14:15:10 INFO BlockManagerMaster: Updated info of block broadcast_1_piece0
15/03/22 14:15:10 INFO SparkContext: Created broadcast 1 from broadcast at DAGScheduler.scala:839
15/03/22 14:15:10 INFO DAGScheduler: Submitting 2 missing tasks from Stage 0 (MapPartitionsRDD[3] at map at JsonRDD.scala:51)
15/03/22 14:15:10 INFO TaskSchedulerImpl: Adding task set 0.0 with 2 tasks
15/03/22 14:15:10 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, funshion-hadoop195, NODE_LOCAL, 1318 bytes)
15/03/22 14:15:10 INFO TaskSetManager: Starting task 1.0 in stage 0.0 (TID 1, funshion-hadoop196, NODE_LOCAL, 1318 bytes)
15/03/22 14:15:11 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on funshion-hadoop196:60072 (size: 2.2 KB, free: 265.4 MB)
15/03/22 14:15:11 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on funshion-hadoop195:43851 (size: 2.2 KB, free: 265.4 MB)
15/03/22 14:15:12 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on funshion-hadoop196:60072 (size: 32.9 KB, free: 265.4 MB)
15/03/22 14:15:12 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on funshion-hadoop195:43851 (size: 32.9 KB, free: 265.4 MB)
15/03/22 14:15:16 INFO TaskSetManager: Finished task 1.0 in stage 0.0 (TID 1) in 5406 ms on funshion-hadoop196 (1/2)
15/03/22 14:15:16 INFO DAGScheduler: Stage 0 (reduce at JsonRDD.scala:51) finished in 5.749 s
15/03/22 14:15:16 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 5738 ms on funshion-hadoop195 (2/2)
15/03/22 14:15:16 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
15/03/22 14:15:16 INFO DAGScheduler: Job 0 finished: reduce at JsonRDD.scala:51, took 6.122394 s
df: org.apache.spark.sql.DataFrame = [age: bigint, name: string]
scala> df.show()
15/03/22 14:15:43 INFO MemoryStore: ensureFreeSpace(238325) called with curMem=277477, maxMem=278302556
15/03/22 14:15:43 INFO MemoryStore: Block broadcast_2 stored as values in memory (estimated size 232.7 KB, free 264.9 MB)
15/03/22 14:15:43 INFO MemoryStore: ensureFreeSpace(33723) called with curMem=515802, maxMem=278302556
15/03/22 14:15:43 INFO MemoryStore: Block broadcast_2_piece0 stored as bytes in memory (estimated size 32.9 KB, free 264.9 MB)
15/03/22 14:15:43 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on funshion-hadoop193:53560 (size: 32.9 KB, free: 265.3 MB)
15/03/22 14:15:43 INFO BlockManagerMaster: Updated info of block broadcast_2_piece0
15/03/22 14:15:43 INFO SparkContext: Created broadcast 2 from textFile at JSONRelation.scala:98
15/03/22 14:15:43 INFO FileInputFormat: Total input paths to process : 1
15/03/22 14:15:43 INFO SparkContext: Starting job: runJob at SparkPlan.scala:121
15/03/22 14:15:43 INFO DAGScheduler: Got job 1 (runJob at SparkPlan.scala:121) with 1 output partitions (allowLocal=false)
15/03/22 14:15:43 INFO DAGScheduler: Final stage: Stage 1(runJob at SparkPlan.scala:121)
15/03/22 14:15:43 INFO DAGScheduler: Parents of final stage: List()
15/03/22 14:15:43 INFO DAGScheduler: Missing parents: List()
15/03/22 14:15:43 INFO DAGScheduler: Submitting Stage 1 (MapPartitionsRDD[8] at map at SparkPlan.scala:96), which has no missing parents
15/03/22 14:15:43 INFO MemoryStore: ensureFreeSpace(4064) called with curMem=549525, maxMem=278302556
15/03/22 14:15:43 INFO MemoryStore: Block broadcast_3 stored as values in memory (estimated size 4.0 KB, free 264.9 MB)
15/03/22 14:15:43 INFO MemoryStore: ensureFreeSpace(2796) called with curMem=553589, maxMem=278302556
15/03/22 14:15:43 INFO MemoryStore: Block broadcast_3_piece0 stored as bytes in memory (estimated size 2.7 KB, free 264.9 MB)
15/03/22 14:15:43 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on funshion-hadoop193:53560 (size: 2.7 KB, free: 265.3 MB)
15/03/22 14:15:43 INFO BlockManagerMaster: Updated info of block broadcast_3_piece0
15/03/22 14:15:43 INFO SparkContext: Created broadcast 3 from broadcast at DAGScheduler.scala:839
15/03/22 14:15:43 INFO DAGScheduler: Submitting 1 missing tasks from Stage 1 (MapPartitionsRDD[8] at map at SparkPlan.scala:96)
15/03/22 14:15:43 INFO TaskSchedulerImpl: Adding task set 1.0 with 1 tasks
15/03/22 14:15:43 INFO TaskSetManager: Starting task 0.0 in stage 1.0 (TID 2, funshion-hadoop194, NODE_LOCAL, 1318 bytes)
15/03/22 14:15:44 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on funshion-hadoop194:32982 (size: 2.7 KB, free: 265.4 MB)
15/03/22 14:15:46 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on funshion-hadoop194:32982 (size: 32.9 KB, free: 265.4 MB)
15/03/22 14:15:49 INFO DAGScheduler: Stage 1 (runJob at SparkPlan.scala:121) finished in 5.280 s
15/03/22 14:15:49 INFO DAGScheduler: Job 1 finished: runJob at SparkPlan.scala:121, took 5.329337 s
15/03/22 14:15:49 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 2) in 5277 ms on funshion-hadoop194 (1/1)
15/03/22 14:15:49 INFO TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool
15/03/22 14:15:49 INFO SparkContext: Starting job: runJob at SparkPlan.scala:121
15/03/22 14:15:49 INFO DAGScheduler: Got job 2 (runJob at SparkPlan.scala:121) with 1 output partitions (allowLocal=false)
15/03/22 14:15:49 INFO DAGScheduler: Final stage: Stage 2(runJob at SparkPlan.scala:121)
15/03/22 14:15:49 INFO DAGScheduler: Parents of final stage: List()
15/03/22 14:15:49 INFO DAGScheduler: Missing parents: List()
15/03/22 14:15:49 INFO DAGScheduler: Submitting Stage 2 (MapPartitionsRDD[8] at map at SparkPlan.scala:96), which has no missing parents
15/03/22 14:15:49 INFO MemoryStore: ensureFreeSpace(4064) called with curMem=556385, maxMem=278302556
15/03/22 14:15:49 INFO MemoryStore: Block broadcast_4 stored as values in memory (estimated size 4.0 KB, free 264.9 MB)
15/03/22 14:15:49 INFO MemoryStore: ensureFreeSpace(2796) called with curMem=560449, maxMem=278302556
15/03/22 14:15:49 INFO MemoryStore: Block broadcast_4_piece0 stored as bytes in memory (estimated size 2.7 KB, free 264.9 MB)
15/03/22 14:15:49 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on funshion-hadoop193:53560 (size: 2.7 KB, free: 265.3 MB)
15/03/22 14:15:49 INFO BlockManagerMaster: Updated info of block broadcast_4_piece0
15/03/22 14:15:49 INFO SparkContext: Created broadcast 4 from broadcast at DAGScheduler.scala:839
15/03/22 14:15:49 INFO DAGScheduler: Submitting 1 missing tasks from Stage 2 (MapPartitionsRDD[8] at map at SparkPlan.scala:96)
15/03/22 14:15:49 INFO TaskSchedulerImpl: Adding task set 2.0 with 1 tasks
15/03/22 14:15:49 INFO TaskSetManager: Starting task 0.0 in stage 2.0 (TID 3, funshion-hadoop194, NODE_LOCAL, 1318 bytes)
15/03/22 14:15:49 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on funshion-hadoop194:32982 (size: 2.7 KB, free: 265.4 MB)
15/03/22 14:15:49 INFO DAGScheduler: Stage 2 (runJob at SparkPlan.scala:121) finished in 0.166 s
15/03/22 14:15:49 INFO DAGScheduler: Job 2 finished: runJob at SparkPlan.scala:121, took 0.204439 s
15/03/22 14:15:49 INFO TaskSetManager: Finished task 0.0 in stage 2.0 (TID 3) in 168 ms on funshion-hadoop194 (1/1)
15/03/22 14:15:49 INFO TaskSchedulerImpl: Removed TaskSet 2.0, whose tasks have all completed, from pool
age name
null Michael
30 Andy
19 Justin
scala> df.printSchema()
root
|-- age: long (nullable = true)
|-- name: string (nullable = true)
scala> df.select("name").show()
15/03/22 14:22:10 INFO MemoryStore: ensureFreeSpace(238325) called with curMem=1126281, maxMem=278302556
15/03/22 14:22:10 INFO MemoryStore: Block broadcast_10 stored as values in memory (estimated size 232.7 KB, free 264.1 MB)
15/03/22 14:22:10 INFO MemoryStore: ensureFreeSpace(33723) called with curMem=1364606, maxMem=278302556
15/03/22 14:22:10 INFO MemoryStore: Block broadcast_10_piece0 stored as bytes in memory (estimated size 32.9 KB, free 264.1 MB)
15/03/22 14:22:10 INFO BlockManagerInfo: Added broadcast_10_piece0 in memory on funshion-hadoop193:53560 (size: 32.9 KB, free: 265.2 MB)
15/03/22 14:22:10 INFO BlockManagerMaster: Updated info of block broadcast_10_piece0
15/03/22 14:22:10 INFO SparkContext: Created broadcast 10 from textFile at JSONRelation.scala:98
15/03/22 14:22:10 INFO FileInputFormat: Total input paths to process : 1
15/03/22 14:22:10 INFO SparkContext: Starting job: runJob at SparkPlan.scala:121
15/03/22 14:22:10 INFO DAGScheduler: Got job 6 (runJob at SparkPlan.scala:121) with 1 output partitions (allowLocal=false)
15/03/22 14:22:10 INFO DAGScheduler: Final stage: Stage 6(runJob at SparkPlan.scala:121)
15/03/22 14:22:10 INFO DAGScheduler: Parents of final stage: List()
15/03/22 14:22:10 INFO DAGScheduler: Missing parents: List()
15/03/22 14:22:10 INFO DAGScheduler: Submitting Stage 6 (MapPartitionsRDD[23] at map at SparkPlan.scala:96), which has no missing parents
15/03/22 14:22:10 INFO MemoryStore: ensureFreeSpace(5064) called with curMem=1398329, maxMem=278302556
15/03/22 14:22:10 INFO MemoryStore: Block broadcast_11 stored as values in memory (estimated size 4.9 KB, free 264.1 MB)
15/03/22 14:22:10 INFO MemoryStore: ensureFreeSpace(3457) called with curMem=1403393, maxMem=278302556
15/03/22 14:22:10 INFO MemoryStore: Block broadcast_11_piece0 stored as bytes in memory (estimated size 3.4 KB, free 264.1 MB)
15/03/22 14:22:10 INFO BlockManagerInfo: Added broadcast_11_piece0 in memory on funshion-hadoop193:53560 (size: 3.4 KB, free: 265.2 MB)
15/03/22 14:22:10 INFO BlockManagerMaster: Updated info of block broadcast_11_piece0
15/03/22 14:22:10 INFO SparkContext: Created broadcast 11 from broadcast at DAGScheduler.scala:839
15/03/22 14:22:10 INFO DAGScheduler: Submitting 1 missing tasks from Stage 6 (MapPartitionsRDD[23] at map at SparkPlan.scala:96)
15/03/22 14:22:10 INFO TaskSchedulerImpl: Adding task set 6.0 with 1 tasks
15/03/22 14:22:10 INFO TaskSetManager: Starting task 0.0 in stage 6.0 (TID 8, funshion-hadoop194, NODE_LOCAL, 1318 bytes)
15/03/22 14:22:10 INFO BlockManagerInfo: Added broadcast_11_piece0 in memory on funshion-hadoop194:32982 (size: 3.4 KB, free: 265.3 MB)
15/03/22 14:22:10 INFO BlockManagerInfo: Added broadcast_10_piece0 in memory on funshion-hadoop194:32982 (size: 32.9 KB, free: 265.3 MB)
15/03/22 14:22:10 INFO DAGScheduler: Stage 6 (runJob at SparkPlan.scala:121) finished in 0.457 s
15/03/22 14:22:10 INFO DAGScheduler: Job 6 finished: runJob at SparkPlan.scala:121, took 0.495140 s
15/03/22 14:22:10 INFO TaskSetManager: Finished task 0.0 in stage 6.0 (TID 8) in 454 ms on funshion-hadoop194 (1/1)
15/03/22 14:22:10 INFO TaskSchedulerImpl: Removed TaskSet 6.0, whose tasks have all completed, from pool
15/03/22 14:22:10 INFO SparkContext: Starting job: runJob at SparkPlan.scala:121
15/03/22 14:22:10 INFO DAGScheduler: Got job 7 (runJob at SparkPlan.scala:121) with 1 output partitions (allowLocal=false)
15/03/22 14:22:10 INFO DAGScheduler: Final stage: Stage 7(runJob at SparkPlan.scala:121)
15/03/22 14:22:10 INFO DAGScheduler: Parents of final stage: List()
15/03/22 14:22:10 INFO DAGScheduler: Missing parents: List()
15/03/22 14:22:10 INFO DAGScheduler: Submitting Stage 7 (MapPartitionsRDD[23] at map at SparkPlan.scala:96), which has no missing parents
15/03/22 14:22:10 INFO MemoryStore: ensureFreeSpace(5064) called with curMem=1406850, maxMem=278302556
15/03/22 14:22:10 INFO MemoryStore: Block broadcast_12 stored as values in memory (estimated size 4.9 KB, free 264.1 MB)
15/03/22 14:22:10 INFO MemoryStore: ensureFreeSpace(3457) called with curMem=1411914, maxMem=278302556
15/03/22 14:22:10 INFO MemoryStore: Block broadcast_12_piece0 stored as bytes in memory (estimated size 3.4 KB, free 264.1 MB)
15/03/22 14:22:10 INFO BlockManagerInfo: Added broadcast_12_piece0 in memory on funshion-hadoop193:53560 (size: 3.4 KB, free: 265.2 MB)
15/03/22 14:22:10 INFO BlockManagerMaster: Updated info of block broadcast_12_piece0
15/03/22 14:22:10 INFO SparkContext: Created broadcast 12 from broadcast at DAGScheduler.scala:839
15/03/22 14:22:10 INFO DAGScheduler: Submitting 1 missing tasks from Stage 7 (MapPartitionsRDD[23] at map at SparkPlan.scala:96)
15/03/22 14:22:10 INFO TaskSchedulerImpl: Adding task set 7.0 with 1 tasks
15/03/22 14:22:10 INFO TaskSetManager: Starting task 0.0 in stage 7.0 (TID 9, funshion-hadoop195, NODE_LOCAL, 1318 bytes)
15/03/22 14:22:10 INFO BlockManagerInfo: Added broadcast_12_piece0 in memory on funshion-hadoop195:43851 (size: 3.4 KB, free: 265.3 MB)
15/03/22 14:22:11 INFO BlockManagerInfo: Added broadcast_10_piece0 in memory on funshion-hadoop195:43851 (size: 32.9 KB, free: 265.3 MB)
15/03/22 14:22:11 INFO DAGScheduler: Stage 7 (runJob at SparkPlan.scala:121) finished in 0.419 s
15/03/22 14:22:11 INFO DAGScheduler: Job 7 finished: runJob at SparkPlan.scala:121, took 0.473975 s
15/03/22 14:22:11 INFO TaskSetManager: Finished task 0.0 in stage 7.0 (TID 9) in 423 ms on funshion-hadoop195 (1/1)
15/03/22 14:22:11 INFO TaskSchedulerImpl: Removed TaskSet 7.0, whose tasks have all completed, from pool
name
Michael
Andy
Justin
-- 注意:由于当前的操作系统用户是hadoop用户,而HDFS也有用户“HOME目录”(家目录)的概念(在HDFS里,hadoop用户的HOME目录就是/user/hadoop),
-- 相对路径会基于该HOME目录解析,所以我们也可以直接使用相对路径:
val df = sqlContext.jsonFile("people.json")
df.show()
---------------------------------------------------------------------------------------------------
-- 3.5 进一步测试Spark SQL访问HDFS的Json文件
-- 参考:http://spark.apache.org/docs/latest/sql-programming-guide.html#running-sql-queries-programmatically
-- 的“JSON Datasets”章节
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
val path = "/user/hadoop/people.json"
val people = sqlContext.jsonFile(path)
people.printSchema()
people.registerTempTable("people")
val teenagers = sqlContext.sql("SELECT name, age+1 as agePlusOne FROM people WHERE age >= 13 AND age <= 19")
teenagers.show()
people.filter(people("age") > 21).show()
// Alternatively, a DataFrame can be created for a JSON dataset represented by
// an RDD[String] storing one JSON object per string.
val anotherPeopleRDD = sc.parallelize(
"""{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}""" :: Nil)
val anotherPeople = sqlContext.jsonRDD(anotherPeopleRDD)
anotherPeople.show()
----------------------
[hadoop@funshion-hadoop193 spark]$ hdfs dfs -copyFromLocal examples/src/main/resources/people.txt hdfs://funshion-hadoop193:8020/user/hadoop/
[hadoop@funshion-hadoop193 spark]$ hdfs dfs -cat /user/hadoop/people.txt
Michael, 29
Andy, 30
Justin, 19
[hadoop@funshion-hadoop193 spark]$ pwd
/usr/local/spark
[hadoop@funshion-hadoop193 spark]$ ./bin/spark-shell --master spark://funshion-hadoop193:7077
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
case class Person(name: String, age: Int)
val people = sc.textFile("people.txt").map(_.split(",")).map(p => Person(p(0), p(1).trim.toInt)).toDF()
people.registerTempTable("people")
val teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
-- 注意:上面代码第四行(val people = ... 那一行)中的 people.txt 是相对路径,指向HDFS当前用户“HOME”目录(/user/hadoop)下的文件。
-- 第四行也可以替换为(使用绝对路径):
val people = sc.textFile("/user/hadoop/people.txt").map(_.split(",")).map(p => Person(p(0), p(1).trim.toInt)).toDF()
-- 也可以替换为:
val people = sc.textFile("hdfs://funshion-hadoop193:8020/user/hadoop/people.txt").map(_.split(",")).map(p => Person(p(0), p(1).trim.toInt)).toDF()
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
case class Person(name: String, age: Int)
val people = sc.textFile("/user/hadoop/people.txt").map(_.split(",")).map(p => Person(p(0), p(1).trim.toInt)).toDF()
people.registerTempTable("people")
val teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
case class Person(name: String, age: Int)
val people = sc.textFile("hdfs://funshion-hadoop193:8020/user/hadoop/people.txt").map(_.split(",")).map(p => Person(p(0), p(1).trim.toInt)).toDF()
people.registerTempTable("people")
val teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
---------------------------------------------------------------------------------------------------
-- ############################################################################################# --
-- 4 测试Spark SQL访问关系数据库:
-- 参考:http://spark.apache.org/docs/latest/sql-programming-guide.html#running-sql-queries-programmatically
-- 的“JDBC To Other Databases”章节
---------------------------------------------------------------------------------------------------
-- 4.1 访问MySQL数据库:
[hadoop@funshion-hadoop193 lib]$ pwd
/usr/local/spark/lib
[hadoop@funshion-hadoop193 lib]$ cd ..
[hadoop@funshion-hadoop193 spark]$ SPARK_CLASSPATH=/usr/local/spark/lib/mysql-connector-java-5.1.17-bin.jar ./bin/spark-shell --master spark://funshion-hadoop193:7077
Spark assembly has been built with Hive, including Datanucleus jars on classpath
15/03/22 18:14:04 INFO SecurityManager: Changing view acls to: hadoop
15/03/22 18:14:04 INFO SecurityManager: Changing modify acls to: hadoop
15/03/22 18:14:04 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(hadoop); users with modify permissions: Set(hadoop)
15/03/22 18:14:04 INFO HttpServer: Starting HTTP Server
15/03/22 18:14:05 INFO Server: jetty-8.y.z-SNAPSHOT
15/03/22 18:14:05 INFO AbstractConnector: Started SocketConnector@0.0.0.0:46026
15/03/22 18:14:05 INFO Utils: Successfully started service 'HTTP class server' on port 46026.
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 1.3.0
/_/
Using Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_75)
Type in expressions to have them evaluated.
Type :help for more information.
15/03/22 18:14:19 INFO SparkContext: Running Spark version 1.3.0
15/03/22 18:14:19 WARN SparkConf:
SPARK_CLASSPATH was detected (set to '/usr/local/spark/lib/mysql-connector-java-5.1.17-bin.jar:/usr/local/hadoop/lib/hadoop-lzo-0.4.20-SNAPSHOT.jar').
This is deprecated in Spark 1.0+.
Please instead use:
- ./spark-submit with --driver-class-path to augment the driver classpath
- spark.executor.extraClassPath to augment the executor classpath
15/03/22 18:14:19 WARN SparkConf: Setting 'spark.executor.extraClassPath' to '/usr/local/spark/lib/mysql-connector-java-5.1.17-bin.jar:/usr/local/hadoop/lib/hadoop-lzo-0.4.20-SNAPSHOT.jar' as a work-around.
15/03/22 18:14:19 WARN SparkConf: Setting 'spark.driver.extraClassPath' to '/usr/local/spark/lib/mysql-connector-java-5.1.17-bin.jar:/usr/local/hadoop/lib/hadoop-lzo-0.4.20-SNAPSHOT.jar' as a work-around.
15/03/22 18:14:19 INFO SecurityManager: Changing view acls to: hadoop
15/03/22 18:14:19 INFO SecurityManager: Changing modify acls to: hadoop
15/03/22 18:14:19 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(hadoop); users with modify permissions: Set(hadoop)
15/03/22 18:14:20 INFO Slf4jLogger: Slf4jLogger started
15/03/22 18:14:21 INFO Remoting: Starting remoting
15/03/22 18:14:21 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkDriver@funshion-hadoop193:57998]
15/03/22 18:14:21 INFO Utils: Successfully started service 'sparkDriver' on port 57998.
15/03/22 18:14:21 INFO SparkEnv: Registering MapOutputTracker
15/03/22 18:14:21 INFO SparkEnv: Registering BlockManagerMaster
15/03/22 18:14:21 INFO DiskBlockManager: Created local directory at /tmp/spark-e447ed57-292d-4f55-ab79-4e848c1c0622/blockmgr-be884f27-0aff-4a1f-80b8-e55deb2bcbf7
15/03/22 18:14:21 INFO MemoryStore: MemoryStore started with capacity 265.4 MB
15/03/22 18:14:22 INFO HttpFileServer: HTTP File server directory is /tmp/spark-daaa5cde-f622-4da8-b17f-9990c96eb4d8/httpd-8825b3d7-5e1c-4786-9a54-ea6bbcf21f7e
15/03/22 18:14:22 INFO HttpServer: Starting HTTP Server
15/03/22 18:14:22 INFO Server: jetty-8.y.z-SNAPSHOT
15/03/22 18:14:22 INFO AbstractConnector: Started SocketConnector@0.0.0.0:55062
15/03/22 18:14:22 INFO Utils: Successfully started service 'HTTP file server' on port 55062.
15/03/22 18:14:22 INFO SparkEnv: Registering OutputCommitCoordinator
15/03/22 18:14:22 INFO Server: jetty-8.y.z-SNAPSHOT
15/03/22 18:14:22 INFO AbstractConnector: Started SelectChannelConnector@0.0.0.0:4040
15/03/22 18:14:22 INFO Utils: Successfully started service 'SparkUI' on port 4040.
15/03/22 18:14:22 INFO SparkUI: Started SparkUI at http://funshion-hadoop193:4040
15/03/22 18:14:23 INFO AppClient$ClientActor: Connecting to master akka.tcp://sparkMaster@funshion-hadoop193:7077/user/Master...
15/03/22 18:14:24 INFO SparkDeploySchedulerBackend: Connected to Spark cluster with app ID app-20150322181424-0016
15/03/22 18:14:24 INFO AppClient$ClientActor: Executor added: app-20150322181424-0016/0 on worker-20150321171905-funshion-hadoop195-43185 (funshion-hadoop195:43185) with 2 cores
15/03/22 18:14:24 INFO SparkDeploySchedulerBackend: Granted executor ID app-20150322181424-0016/0 on hostPort funshion-hadoop195:43185 with 2 cores, 512.0 MB RAM
15/03/22 18:14:24 INFO AppClient$ClientActor: Executor added: app-20150322181424-0016/1 on worker-20150321171905-funshion-hadoop194-34245 (funshion-hadoop194:34245) with 2 cores
15/03/22 18:14:24 INFO SparkDeploySchedulerBackend: Granted executor ID app-20150322181424-0016/1 on hostPort funshion-hadoop194:34245 with 2 cores, 512.0 MB RAM
15/03/22 18:14:24 INFO AppClient$ClientActor: Executor added: app-20150322181424-0016/2 on worker-20150321171905-funshion-hadoop196-48202 (funshion-hadoop196:48202) with 2 cores
15/03/22 18:14:24 INFO SparkDeploySchedulerBackend: Granted executor ID app-20150322181424-0016/2 on hostPort funshion-hadoop196:48202 with 2 cores, 512.0 MB RAM
15/03/22 18:14:25 INFO AppClient$ClientActor: Executor updated: app-20150322181424-0016/0 is now LOADING
15/03/22 18:14:25 INFO AppClient$ClientActor: Executor updated: app-20150322181424-0016/2 is now LOADING
15/03/22 18:14:25 INFO AppClient$ClientActor: Executor updated: app-20150322181424-0016/1 is now LOADING
15/03/22 18:14:25 INFO AppClient$ClientActor: Executor updated: app-20150322181424-0016/0 is now RUNNING
15/03/22 18:14:25 INFO AppClient$ClientActor: Executor updated: app-20150322181424-0016/1 is now RUNNING
15/03/22 18:14:25 INFO AppClient$ClientActor: Executor updated: app-20150322181424-0016/2 is now RUNNING
15/03/22 18:14:25 INFO NettyBlockTransferService: Server created on 37710
15/03/22 18:14:25 INFO BlockManagerMaster: Trying to register BlockManager
15/03/22 18:14:25 INFO BlockManagerMasterActor: Registering block manager funshion-hadoop193:37710 with 265.4 MB RAM, BlockManagerId(<driver>, funshion-hadoop193, 37710)
15/03/22 18:14:25 INFO BlockManagerMaster: Registered BlockManager
15/03/22 18:14:28 INFO EventLoggingListener: Logging events to hdfs://funshion-hadoop193:8020/spark_log/app-20150322181424-0016
15/03/22 18:14:28 INFO SparkDeploySchedulerBackend: SchedulerBackend is ready for scheduling beginning after reached minRegisteredResourcesRatio: 0.0
15/03/22 18:14:29 INFO SparkILoop: Created spark context..
Spark context available as sc.
15/03/22 18:14:31 INFO SparkILoop: Created sql context (with Hive support)..
SQL context available as sqlContext.
15/03/22 18:14:32 INFO SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor@funshion-hadoop194:50050/user/Executor#1949469311] with ID 1
15/03/22 18:14:32 INFO SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor@funshion-hadoop195:41120/user/Executor#1115933355] with ID 0
15/03/22 18:14:32 INFO BlockManagerMasterActor: Registering block manager funshion-hadoop194:42298 with 265.4 MB RAM, BlockManagerId(1, funshion-hadoop194, 42298)
15/03/22 18:14:32 INFO SparkDeploySchedulerBackend: Registered executor: Actor[akka.tcp://sparkExecutor@funshion-hadoop196:57795/user/Executor#-985756403] with ID 2
15/03/22 18:14:32 INFO BlockManagerMasterActor: Registering block manager funshion-hadoop195:40586 with 265.4 MB RAM, BlockManagerId(0, funshion-hadoop195, 40586)
15/03/22 18:14:33 INFO BlockManagerMasterActor: Registering block manager funshion-hadoop196:35167 with 265.4 MB RAM, BlockManagerId(2, funshion-hadoop196, 35167)
scala> val jdbcDF = sqlContext.load("jdbc", Map(
| "url" -> "jdbc:mysql://192.168.117.193:3306/hive?user=hive&password=bee56915",
| "dbtable" -> "hive.TBLS",
| "driver" -> "com.mysql.jdbc.Driver"))
15/03/22 18:25:13 INFO metastore: Trying to connect to metastore with URI thrift://funshion-hadoop192:10000
15/03/22 18:25:14 INFO metastore: Connected to metastore.
15/03/22 18:25:14 INFO SessionState: No Tez session required at this point. hive.execution.engine=mr.
15/03/22 18:25:14 INFO SessionState: No Tez session required at this point. hive.execution.engine=mr.
jdbcDF: org.apache.spark.sql.DataFrame = [TBL_ID: bigint, CREATE_TIME: int, DB_ID: bigint, LAST_ACCESS_TIME: int, OWNER: string, RETENTION: int, SD_ID: bigint, TBL_NAME: string, TBL_TYPE: string, VIEW_EXPANDED_TEXT: string, VIEW_ORIGINAL_TEXT: string, LINK_TARGET_ID: bigint]
scala> jdbcDF.show()
15/03/22 18:25:25 INFO SparkContext: Starting job: runJob at SparkPlan.scala:121
15/03/22 18:25:25 INFO DAGScheduler: Got job 0 (runJob at SparkPlan.scala:121) with 1 output partitions (allowLocal=false)
15/03/22 18:25:25 INFO DAGScheduler: Final stage: Stage 0(runJob at SparkPlan.scala:121)
15/03/22 18:25:25 INFO DAGScheduler: Parents of final stage: List()
15/03/22 18:25:25 INFO DAGScheduler: Missing parents: List()
15/03/22 18:25:25 INFO DAGScheduler: Submitting Stage 0 (MapPartitionsRDD[1] at map at SparkPlan.scala:96), which has no missing parents
15/03/22 18:25:26 INFO MemoryStore: ensureFreeSpace(4632) called with curMem=0, maxMem=278302556
15/03/22 18:25:26 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 4.5 KB, free 265.4 MB)
15/03/22 18:25:26 INFO MemoryStore: ensureFreeSpace(2909) called with curMem=4632, maxMem=278302556
15/03/22 18:25:26 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 2.8 KB, free 265.4 MB)
15/03/22 18:25:26 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on funshion-hadoop193:37710 (size: 2.8 KB, free: 265.4 MB)
15/03/22 18:25:26 INFO BlockManagerMaster: Updated info of block broadcast_0_piece0
15/03/22 18:25:26 INFO SparkContext: Created broadcast 0 from broadcast at DAGScheduler.scala:839
15/03/22 18:25:26 INFO DAGScheduler: Submitting 1 missing tasks from Stage 0 (MapPartitionsRDD[1] at map at SparkPlan.scala:96)
15/03/22 18:25:26 INFO TaskSchedulerImpl: Adding task set 0.0 with 1 tasks
15/03/22 18:25:26 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, funshion-hadoop194, PROCESS_LOCAL, 1062 bytes)
15/03/22 18:25:27 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on funshion-hadoop194:42298 (size: 2.8 KB, free: 265.4 MB)
15/03/22 18:25:30 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 3863 ms on funshion-hadoop194 (1/1)
15/03/22 18:25:30 INFO DAGScheduler: Stage 0 (runJob at SparkPlan.scala:121) finished in 3.889 s
15/03/22 18:25:30 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
15/03/22 18:25:30 INFO DAGScheduler: Job 0 finished: runJob at SparkPlan.scala:121, took 5.087737 s
TBL_ID CREATE_TIME DB_ID LAST_ACCESS_TIME OWNER RETENTION SD_ID TBL_NAME TBL_TYPE VIEW_EXPANDED_TEXT VIEW_ORIGINAL_TEXT LINK_TARGET_ID
1 1426485587 2 0 hadoop 0 1 pv2 EXTERNAL_TABLE null null null
-- 核对:直接查询MySQL数据库,确认结果与上面Spark SQL的查询结果是否一致:
[hadoop@funshion-hadoop193 conf]$ mysql -uhive -pbee56915 -dhive
Warning: Using a password on the command line interface can be insecure.
mysql: unknown option '-d'
[hadoop@funshion-hadoop193 conf]$ mysql -uhive -pbee56915 --database=hive
Warning: Using a password on the command line interface can be insecure.
Reading table information for completion of table and column names
You can turn off this feature to get a quicker startup with -A
Welcome to the MySQL monitor. Commands end with ; or \g.
Your MySQL connection id is 263
Server version: 5.6.17 MySQL Community Server (GPL)
Copyright (c) 2000, 2014, Oracle and/or its affiliates. All rights reserved.
Oracle is a registered trademark of Oracle Corporation and/or its
affiliates. Other names may be trademarks of their respective
owners.
Type 'help;' or '\h' for help. Type '\c' to clear the current input statement.
mysql> select * from TBLS;
+--------+-------------+-------+------------------+--------+-----------+-------+----------+----------------+--------------------+--------------------+----------------+
| TBL_ID | CREATE_TIME | DB_ID | LAST_ACCESS_TIME | OWNER | RETENTION | SD_ID | TBL_NAME | TBL_TYPE | VIEW_EXPANDED_TEXT | VIEW_ORIGINAL_TEXT | LINK_TARGET_ID |
+--------+-------------+-------+------------------+--------+-----------+-------+----------+----------------+--------------------+--------------------+----------------+
| 1 | 1426485587 | 2 | 0 | hadoop | 0 | 1 | pv2 | EXTERNAL_TABLE | NULL | NULL | NULL |
+--------+-------------+-------+------------------+--------+-----------+-------+----------+----------------+--------------------+--------------------+----------------+
1 row in set (0.01 sec)
---------------------------------------------------------------------------------------------------
-- 访问Oracle、SQL Server数据库应该都差不多(换成对应的JDBC驱动jar包,并相应修改url、driver参数),留待各位去测试吧!
-- 谢谢!