See http://blog.csdn.net/bailu66/article/details/53863693
See https://www.cnblogs.com/K-artorias/p/7141479.html
See https://www.cnblogs.com/boshen-hzb/p/5889633.html
See http://blog.csdn.net/w12345_ww/article/details/51910030
See http://www.cnblogs.com/lonenysky/p/6775876.html
See https://www.linode.com/docs/databases/hadoop/how-to-install-and-set-up-hadoop-cluster/
wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/Anaconda2-4.2.0-Linux-x86_64.sh
bash Anaconda2-4.2.0-Linux-x86_64.sh
vi /etc/profile
Add the following entries:
export JAVA_HOME=/usr/local/jdk1.8.0_151
export PATH=$JAVA_HOME/bin:$PATH
export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
export HADOOP_HOME=/usr/local/hadoop-2.8.2
export PATH=$JAVA_HOME/bin:$HADOOP_HOME/bin:$PATH
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export HADOOP_OPTS="-Djava.library.path=${HADOOP_HOME}/lib/native/"
export CLASSPATH=$CLASSPATH:/usr/local/hadoop-2.8.2/lib/*:.
export HADOOP_CONF_DIR=/usr/local/hadoop-2.8.2/etc/hadoop
export LD_LIBRARY_PATH=/usr/local/hadoop-2.8.2/lib/native:$LD_LIBRARY_PATH
export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
export SPARK_HOME=/usr/local/spark-2.1.2
export PYSPARK_PYTHON=/root/anaconda2/bin/python
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
export SCALA_HOME=/usr/local/scala-2.11.12
export PATH=$PATH:$SCALA_HOME/bin
export HIVE_HOME=/usr/local/hadoop-2.8.2/hive
export PATH=$PATH:$HIVE_HOME/bin
export CLASSPATH=$CLASSPATH:/usr/local/hadoop-2.8.2/hive/lib/*:.
export PYTHONPATH=$SPARK_HOME/python/:$PYTHONPATH
export JAVA_LIBRARY_PATH=$HADOOP_HOME/lib/native:$JAVA_LIBRARY_PATH
source /etc/profile
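Once the packages listed below have been unpacked into these paths, a quick sanity check (a minimal sketch) is:
java -version          # should report 1.8.0_151
hadoop version         # should report Hadoop 2.8.2
scala -version         # should report Scala 2.11.12
echo $HADOOP_CONF_DIR  # should print /usr/local/hadoop-2.8.2/etc/hadoop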
Prepare the required files
1)jdk8u151
2)hive2.3.2 wget http://mirrors.shuosc.org/apache/hive/stable-2/apache-hive-2.3.2-bin.tar.gz
3)spark2.1.2 wget http://mirrors.shuosc.org/apache/spark/spark-2.1.2/spark-2.1.2-bin-hadoop2.7.tgz
4)mysql-connector-java-5.1.41-bin.jar
5)scala2.11.12 wget https://downloads.lightbend.com/scala/2.11.12/scala-2.11.12.tgz
6)hadoop2.8.2 wget http://mirrors.hust.edu.cn/apache/hadoop/common/hadoop-2.8.2/hadoop-2.8.2.tar.gz
Extract the archives
1)tar -C /usr/local/ -xzf jdk-8u151-linux-x64.tar.gz
2)tar -C /usr/local/ -xzf hadoop-2.8.2.tar.gz
3)tar zxvf spark-2.1.2-bin-hadoop2.7.tgz
mv spark-2.1.2-bin-hadoop2.7 /usr/local/spark-2.1.2
4)tar zxvf scala-2.11.12.tgz
mv scala-2.11.12 /usr/local/
5)tar zxvf apache-hive-2.3.2-bin.tar.gz
mv apache-hive-2.3.2-bin /usr/local/hadoop-2.8.2/hive
6)mv mysql-connector-java-5.1.41-bin.jar /usr/local/hadoop-2.8.2/hive/lib/
Install MySQL
sudo apt-get install mysql-server
sudo apt-get install mysql-client
sudo apt-get install libmysqlclient-dev
During installation you will be prompted to set a password for the MySQL root user; do not skip it. Then check whether the installation succeeded:
sudo netstat -tap | grep mysql
Log in to verify:
mysql -uroot -p
Create the Hive user, database, and grants in MySQL:
create user 'hivespark' identified by 'hivespark';
create database hivespark;
grant all on hivespark.* to hivespark@'%' identified by 'hivespark';
grant all on hivespark.* to hivespark@'localhost' identified by 'hivespark';
flush privileges;
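A minimal check that the new account and grants work:
mysql -uhivespark -phivespark -e "show databases;"   # should list the hivespark database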
Edit the Hadoop configuration files
See https://www.cnblogs.com/ggjucheng/archive/2012/04/17/2454590.html
cd /usr/local/hadoop-2.8.2/etc/hadoop
vim core-site.xml
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://master:9000</value>
  </property>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>file:///data/hadoop/data/tmp</value>
  </property>
</configuration>
vim hdfs-site.xml
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>2</value>
  </property>
  <property>
    <name>dfs.permissions</name>
    <value>false</value>
  </property>
  <property>
    <name>dfs.namenode.secondary.http-address</name>
    <value>master:50090</value>
  </property>
  <property>
    <name>dfs.namenode.http-address</name>
    <value>master:50070</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>file:///data/hadoop/data/datanode</value>
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>file:///data/hadoop/data/namenode</value>
  </property>
  <property>
    <name>dfs.namenode.edits.dir</name>
    <value>file:///data/hadoop/data/edits</value>
  </property>
  <property>
    <name>dfs.namenode.checkpoint.dir</name>
    <value>file:///data/hadoop/data/checkpoints</value>
  </property>
  <property>
    <name>dfs.namenode.checkpoint.edits.dir</name>
    <value>file:///data/hadoop/data/checkpoints/edits</value>
  </property>
</configuration>
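It is worth making sure the local file:/// directories referenced above exist on every node (Hadoop can usually create them itself, but pre-creating them avoids permission surprises). A minimal sketch that simply mirrors the values above:
mkdir -p /data/hadoop/data/tmp
mkdir -p /data/hadoop/data/namenode /data/hadoop/data/datanode
mkdir -p /data/hadoop/data/edits /data/hadoop/data/checkpoints/edits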
vi mapred-site.xml
<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>master:19888</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.address</name>
    <value>master:10020</value>
  </property>
  <property>
    <name>mapreduce.job.ubertask.enable</name>
    <value>false</value>
  </property>
  <property>
    <name>yarn.app.mapreduce.am.staging-dir</name>
    <value>hdfs://master:9000/tmp/hadoop-yarn/staging</value>
    <description>The staging dir used while submitting jobs.</description>
  </property>
  <property>
    <name>mapreduce.jobhistory.intermediate-done-dir</name>
    <value>${yarn.app.mapreduce.am.staging-dir}/history/done_intermediate</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.done-dir</name>
    <value>${yarn.app.mapreduce.am.staging-dir}/history/done</value>
  </property>
  <property>
    <name>mapreduce.map.memory.mb</name>
    <value>2048</value>
  </property>
  <property>
    <name>mapreduce.reduce.memory.mb</name>
    <value>4096</value>
  </property>
</configuration>
vi yarn-site.xml
<configuration>
  <property>
    <name>yarn.acl.enable</name>
    <value>0</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>master</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.address</name>
    <value>master:8088</value>
  </property>
  <property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.log-aggregation.retain-seconds</name>
    <value>86400</value>
  </property>
  <property>
    <name>yarn.nodemanager.resource.memory-mb</name>
    <value>2648</value>
  </property>
  <property>
    <name>yarn.scheduler.maximum-allocation-mb</name>
    <value>2048</value>
  </property>
  <property>
    <name>yarn.scheduler.minimum-allocation-mb</name>
    <value>2048</value>
  </property>
  <property>
    <name>yarn.app.mapreduce.am.resource.mb</name>
    <value>4096</value>
  </property>
  <property>
    <name>yarn.scheduler.increment-allocation-mb</name>
    <value>512</value>
  </property>
  <property>
    <name>yarn.nodemanager.vmem-check-enabled</name>
    <value>false</value>
  </property>
  <property>
    <name>yarn.nodemanager.pmem-check-enabled</name>
    <value>false</value>
  </property>
  <property>
    <name>yarn.nodemanager.remote-app-log-dir</name>
    <value>/var/log/hadoop-yarn/apps</value>
  </property>
</configuration>
vi slaves
slave2
slave1
vi hadoop-env.sh
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Set Hadoop-specific environment variables here.
# The only required environment variable is JAVA_HOME. All others are
# optional. When running a distributed configuration it is best to
# set JAVA_HOME in this file, so that it is correctly defined on
# remote nodes.
# The java implementation to use.
# Configure the JDK environment
export JAVA_HOME=/usr/local/jdk1.8.0_151
# The jsvc implementation to use. Jsvc is required to run secure datanodes
# that bind to privileged ports to provide authentication of data transfer
# protocol. Jsvc is not required if SASL is configured for authentication of
# data transfer protocol using non-privileged ports.
#export JSVC_HOME=${JSVC_HOME}
export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"}
# Extra Java CLASSPATH elements. Automatically insert capacity-scheduler.
for f in $HADOOP_HOME/contrib/capacity-scheduler/*.jar; do
if [ "$HADOOP_CLASSPATH" ]; then
export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$f
else
export HADOOP_CLASSPATH=$f
fi
done
# The maximum amount of heap to use, in MB. Default is 1000.
#export HADOOP_HEAPSIZE=
#export HADOOP_NAMENODE_INIT_HEAPSIZE=""
# Extra Java runtime options. Empty by default.
#export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true"
export HADOOP_OPTS="-Djava.library.path=${HADOOP_HOME}/lib/native"
# Command specific options appended to HADOOP_OPTS when specified
export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_NAMENODE_OPTS"
export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS"
export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS"
export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS"
export HADOOP_PORTMAP_OPTS="-Xmx512m $HADOOP_PORTMAP_OPTS"
# The following applies to multiple commands (fs, dfs, fsck, distcp etc)
export HADOOP_CLIENT_OPTS="-Xmx512m $HADOOP_CLIENT_OPTS"
#HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS"
# On secure datanodes, user to run the datanode as after dropping privileges.
# This **MUST** be uncommented to enable secure HDFS if using privileged ports
# to provide authentication of data transfer protocol. This **MUST NOT** be
# defined if SASL is configured for authentication of data transfer protocol
# using non-privileged ports.
export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER}
# Where log files are stored. $HADOOP_HOME/logs by default.
#export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER
# Where log files are stored in the secure data environment.
export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER}
###
# HDFS Mover specific parameters
###
# Specify the JVM options to be used when starting the HDFS Mover.
# These options will be appended to the options specified as HADOOP_OPTS
# and therefore may override any similar flags set in HADOOP_OPTS
#
# export HADOOP_MOVER_OPTS=""
###
# Advanced Users Only!
###
# The directory where pid files are stored. /tmp by default.
# NOTE: this should be set to a directory that can only be written to by
# the user that will run the hadoop daemons. Otherwise there is the
# potential for a symlink attack.
export HADOOP_PID_DIR=${HADOOP_PID_DIR}
export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR}
# A string representing this instance of hadoop. $USER by default.
export HADOOP_IDENT_STRING=$USER
scp -r /usr/local/hadoop-2.8.2/* root@slave2:/usr/local/hadoop-2.8.2/   (repeat for slave1)
Verify
hadoop namenode -format
start-dfs.sh
Open http://192.81.212.100:50070/dfshealth.html#tab-overview to check HDFS status
start-yarn.sh
yarn node -list
yarn application -list
Open http://192.81.212.100:8088/cluster to check YARN status
hdfs dfs -mkdir /books
wget -O alice.txt https://www.gutenberg.org/files/11/11-0.txt
wget -O holmes.txt https://www.gutenberg.org/ebooks/1661.txt.utf-8
wget -O frankenstein.txt https://www.gutenberg.org/ebooks/84.txt.utf-8
hdfs dfs -put alice.txt holmes.txt /books/
hdfs dfs -put frankenstein.txt /books/
hdfs dfs -ls /books
yarn jar /usr/local/hadoop-2.8.2/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.8.2.jar wordcount "/books/*" output
hdfs dfs -cat output/part-r-00000
stop-yarn.sh
stop-dfs.sh
start-all.sh    Start all Hadoop daemons: NameNode, SecondaryNameNode, DataNode, JobTracker, TaskTracker
stop-all.sh    Stop all Hadoop daemons: NameNode, SecondaryNameNode, DataNode, JobTracker, TaskTracker
start-dfs.sh    Start the HDFS daemons: NameNode, SecondaryNameNode, DataNode
stop-dfs.sh    Stop the HDFS daemons: NameNode, SecondaryNameNode, DataNode
hadoop-daemons.sh start namenode    Start only the NameNode daemon
hadoop-daemons.sh stop namenode    Stop only the NameNode daemon
hadoop-daemons.sh start datanode    Start only the DataNode daemon
hadoop-daemons.sh stop datanode    Stop only the DataNode daemon
hadoop-daemons.sh start secondarynamenode    Start only the SecondaryNameNode daemon
hadoop-daemons.sh stop secondarynamenode    Stop only the SecondaryNameNode daemon
start-mapred.sh    Start the MapReduce daemons JobTracker and TaskTracker
stop-mapred.sh    Stop the MapReduce daemons JobTracker and TaskTracker
hadoop-daemons.sh start jobtracker    Start only the JobTracker daemon
hadoop-daemons.sh stop jobtracker    Stop only the JobTracker daemon
hadoop-daemons.sh start tasktracker    Start only the TaskTracker daemon
hadoop-daemons.sh stop tasktracker    Stop only the TaskTracker daemon
(Note: JobTracker/TaskTracker and start-mapred.sh/stop-mapred.sh belong to Hadoop 1.x; on this Hadoop 2.8.2 cluster YARN's ResourceManager and NodeManager take their place and are managed by start-yarn.sh/stop-yarn.sh.)
Once everything has started successfully, open the web UI at http://localhost:50070 to view NameNode and DataNode information and browse files in HDFS.
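As an extra check, run jps on each node; for this configuration (assuming both start-dfs.sh and start-yarn.sh have been run) the master should list NameNode, SecondaryNameNode, and ResourceManager, while slave1 and slave2 should list DataNode and NodeManager:
jps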
cd /usr/local/spark-2.1.2/conf
mv spark-env.sh.template spark-env.sh
vi spark-env.sh
#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This file is sourced when running various Spark programs.
# Copy it as spark-env.sh and edit that to configure Spark for your site.
# Options read when launching programs locally with
# ./bin/run-example or ./bin/spark-submit
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program
# - SPARK_CLASSPATH, default classpath entries to append
# Options read by executors and drivers running inside the cluster
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
# - SPARK_CLASSPATH, default classpath entries to append
# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos
# Options read in YARN client mode
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - SPARK_EXECUTOR_INSTANCES, Number of executors to start (Default: 2)
# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1).
# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G)
# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G)
# Options for the daemons used in the standalone deploy mode
# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname
# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master
# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
# - SPARK_WORKER_CORES, to set the number of cores to use on this machine
# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker
# - SPARK_WORKER_INSTANCES, to set the number of worker processes per node
# - SPARK_WORKER_DIR, to set the working directory of worker processes
# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g).
# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y")
# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers
# Generic options for the daemons used in the standalone deploy mode
# - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf)
# - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs)
# - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp)
# - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER)
# - SPARK_NICENESS The scheduling priority for daemons. (Default: 0)
export JAVA_HOME=/usr/local/jdk1.8.0_151
# IP of the Spark master node
export SCALA_HOME=/usr/local/scala-2.11.12
export SPARK_HOME=/usr/local/spark-2.1.2
#export SPARK_MASTER_IP=master
# Port of the Spark master node
#export SPARK_MASTER_PORT=7077
#export SPARK_MASTER_WEBUI_PORT=18080
export HADOOP_CONF_DIR=/usr/local/hadoop-2.8.2/etc/hadoop
#export SPARK_LOCAL_IP=slave1
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/hadoop-2.8.2/lib/native
Edit spark-defaults.conf (copy it from spark-defaults.conf.template first):
spark.master yarn
spark.eventLog.enabled true
spark.eventLog.dir hdfs://master:9000/usr/spark/eventLogging
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.driver.memory 512m
spark.yarn.am.memory 2048m
spark.executor.memory 1536m
spark.yarn.jars hdfs://master:9000/spark-jars/*
spark.history.provider org.apache.spark.deploy.history.FsHistoryProvider
spark.history.fs.logDirectory hdfs://master:9000/usr/spark/eventLogging
spark.history.fs.update.interval 10s
spark.history.ui.port 18080
vi slaves
slave1
slave2
Upload the jars to HDFS
hdfs dfs -mkdir -p /usr/spark/eventLogging
Copy all jars under $SPARK_HOME/jars to hdfs://master:9000/spark-jars/
Download kryo-3.0.3.jar, asm-5.0.3.jar, minlog-1.3.0.jar, objenesis-2.1.jar, and reflectasm-1.10.1.jar
hdfs dfs -mkdir -p /spark-jars
hdfs dfs -put jars/* /spark-jars
Copy the configuration to all nodes (repeat the scp for each slave)
scp -r spark-2.1.2/* root@slave2:/usr/local/spark-2.1.2/
Verify
Prerequisite: start-dfs.sh and start-yarn.sh have already been run.
Use jps to confirm that the NameNode, DataNode, and other daemons have started.
$SPARK_HOME/sbin/start-history-server.sh
spark-submit --deploy-mode client \
--class org.apache.spark.examples.SparkPi \
$SPARK_HOME/examples/jars/spark-examples_2.11-2.1.2.jar 10
Open http://master:18080 to check Spark job history
yarn logs -applicationId application_xxxxx to view logs (output, errors, etc.)
yarn node -list to check the number of nodes
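As an additional check that PySpark also runs on YARN, the Pi example that ships with Spark can be submitted the same way (a minimal sketch using the paths configured above):
spark-submit --master yarn --deploy-mode client $SPARK_HOME/examples/src/main/python/pi.py 10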
cd /usr/local/hadoop-2.8.2/hive/conf
cp hive-env.sh.template hive-env.sh
Create directories in HDFS to hold Hive data, and grant them 777 permissions:
start-dfs.sh
hdfs dfs -mkdir -p /usr/hive/warehouse
hdfs dfs -mkdir -p /usr/hive/tmp
hdfs dfs -mkdir -p /usr/hive/log
hdfs dfs -mkdir -p /usr/hive/download
hdfs dfs -chmod -R 777 /usr/hive/warehouse
hdfs dfs -chmod -R 777 /usr/hive/tmp
hdfs dfs -chmod -R 777 /usr/hive/log
hdfs dfs -chmod -R 777 /usr/hive/download
Edit hive-env.sh:
export JAVA_HOME=/usr/local/jdk1.8.0_151
export HADOOP_HOME=/usr/local/hadoop-2.8.2
export HIVE_HOME=/usr/local/hadoop-2.8.2/hive
export HIVE_CONF_DIR=/usr/local/hadoop-2.8.2/hive/conf
export HIVE_AUX_JARS_PATH=/usr/local/hadoop-2.8.2/hive/lib
vi hive-site.xml and add the following:
<configuration>
  <property>
    <name>hive.metastore.uris</name>
    <value>thrift://master:9083</value>
  </property>
  <property>
    <name>hive.execution.engine</name>
    <value>spark</value>
  </property>
  <property>
    <name>hive.exec.scratchdir</name>
    <value>/usr/hive/tmp</value>
  </property>
  <property>
    <name>hive.metastore.warehouse.dir</name>
    <value>/usr/hive/warehouse</value>
  </property>
  <property>
    <name>hive.querylog.location</name>
    <value>/usr/hive/log</value>
  </property>
  <property>
    <name>hive.downloaded.resources.dir</name>
    <value>/usr/hive/download</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://master:3306/hivespark?createDatabaseIfNotExist=true&amp;characterEncoding=UTF-8&amp;useSSL=false</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>hivespark</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>hivespark</value>
  </property>
  <property>
    <name>hive.metastore.schema.verification</name>
    <value>false</value>
  </property>
</configuration>
cp /usr/local/spark-2.1.2/jars/spark-* /usr/local/hadoop-2.8.2/hive/lib/
cp /usr/local/spark-2.1.2/jars/scala-* /usr/local/hadoop-2.8.2/hive/lib/
cp /usr/local/hadoop-2.8.2/hive/conf/hive-site.xml /usr/local/spark-2.1.2/conf/
Copy to every machine (repeat for each slave)
scp -r hadoop-2.8.2/* root@slave2:/usr/local/hadoop-2.8.2/
scp -r spark-2.1.2/* root@slave2:/usr/local/spark-2.1.2/
scp -r hive/* root@slave2:/usr/local/hadoop-2.8.2/hive/
Initialize Hive. For Hive 2.0 and later, the initialization command (run on the server side) is shown below.
First edit /etc/mysql/mysql.conf.d/mysqld.cnf (the path may differ) and change
bind-address = 127.0.0.1 to
bind-address = x.x.x.x (your own IP)
/etc/init.d/mysql restart
schematool -initSchema -dbType mysql
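To confirm that the metastore schema was created, schematool can also report the stored schema version (a quick check, assuming the initialization above succeeded):
schematool -dbType mysql -info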
After a successful initialization you can run Hive; check that it works:
hive --service metastore &   (start the Hive metastore service on the server; HDFS must be running first)
hive
use default;
show tables;
create table test(key string);
select count(*) from test;
exit;
hdfs dfs -ls /usr/hive/warehouse
mysql -uhivespark -phivespark
use hivespark;
select TBL_NAME from TBLS;
pyspark
data=[1,2,3,4,5]
distData=sc.parallelize(data)
distData.first()
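A slightly larger sketch in the same pyspark shell, counting words in one of the books loaded into HDFS earlier (the hdfs://master:9000 prefix matches core-site.xml; the exact counts depend on the downloaded text):
lines = sc.textFile("hdfs://master:9000/books/alice.txt")
counts = (lines.flatMap(lambda l: l.split())
               .map(lambda w: (w, 1))
               .reduceByKey(lambda a, b: a + b))
counts.takeOrdered(5, key=lambda kv: -kv[1])   # five most frequent words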
./bin/spark-submit examples/src/main/python/pi.py
View Hive's data in HDFS through the web UI:
http://master:50070/explorer.html#/usr/hive/warehouse/
Use jps to check whether the DFS, YARN, and history server daemons are running
kill -9 PID(history-server)
kill -9 PID(RunJar)
stop-yarn.sh
stop-dfs.sh
start-dfs.sh
start-yarn.sh
bash /usr/local/spark-2.1.2/sbin/start-history-server.sh
hive --service metastore &   (start the Hive metastore)
Test:
pyspark
from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)
my_dataframe = sqlContext.sql("Select count(*) from test")
my_dataframe.show()
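In Spark 2.x the same check can also be written with SparkSession instead of the older HiveContext (a sketch; it assumes pyspark was started with the Hive configuration in place, as above):
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark.sql("select count(*) from test").show()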
https://spark.apache.org/docs/latest/rdd-programming-guide.html
https://spark.apache.org/docs/latest/sql-programming-guide.html
https://spark.apache.org/mllib/
See http://blog.csdn.net/strongyoung88/article/details/53743937
See http://blog.csdn.net/youngqj/article/details/47315167
Containers = minimum of (2*CORES, 1.8*DISKS, (Total available RAM) / MIN_CONTAINER_SIZE)
RAM-per-Container = maximum of (MIN_CONTAINER_SIZE, (Total Available RAM) / Containers)
Example
Cluster nodes have 12 CPU cores, 48 GB RAM, and 12 disks.
Reserved Memory = 6 GB reserved for system memory + (if HBase) 8 GB for HBase
Min Container size = 2 GB
If there is no HBase:
# of Containers = minimum of (2*12, 1.8*12, (48-6)/2) = minimum of (24, 21.6, 21) = 21
RAM-per-Container = maximum of (2, (48-6)/21) = maximum of (2, 2) = 2
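The same arithmetic as a small Python helper (a sketch of the formulas above; the function name and the optional HBase argument are illustrative):
def yarn_container_plan(cores, disks, total_ram_gb, reserved_gb, min_container_gb, hbase_gb=0):
    # Containers = min(2*CORES, 1.8*DISKS, available RAM / MIN_CONTAINER_SIZE)
    available = total_ram_gb - reserved_gb - hbase_gb
    containers = int(min(2 * cores, 1.8 * disks, available / min_container_gb))
    # RAM-per-Container = max(MIN_CONTAINER_SIZE, available RAM / Containers)
    ram_per_container = max(min_container_gb, available / containers)
    return containers, ram_per_container

print(yarn_container_plan(12, 12, 48, 6, 2))   # (21, 2.0) -- matches the example above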
Resources used by one Spark application:
Client mode:
cores = spark.yarn.am.cores + spark.executor.cores * spark.executor.instances
memory = spark.yarn.am.memory + spark.yarn.am.memoryOverhead + (spark.executor.memory + spark.yarn.executor.memoryOverhead) * spark.executor.instances + --driver-memory
Cluster mode:
cores = spark.driver.cores + spark.executor.cores * spark.executor.instances
memory = spark.driver.memory + spark.yarn.driver.memoryOverhead + (spark.executor.memory + spark.yarn.executor.memoryOverhead) * spark.executor.instances
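A sketch of the client-mode memory formula applied to the spark-defaults.conf values above (the 10% / 384 MB minimum memoryOverhead is the Spark 2.1 default; the executor count and driver memory here are assumptions for illustration only):
def client_mode_memory_mb(am_mb, executor_mb, executors, driver_mb,
                          overhead_fraction=0.10, min_overhead_mb=384):
    # memory = AM + AM overhead + executors * (executor memory + executor overhead) + driver memory
    am_overhead = max(min_overhead_mb, am_mb * overhead_fraction)
    exec_overhead = max(min_overhead_mb, executor_mb * overhead_fraction)
    return am_mb + am_overhead + executors * (executor_mb + exec_overhead) + driver_mb

print(client_mode_memory_mb(2048, 1536, 2, 512))   # 6784.0 MB in total for this example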
Runtime parameters:
1.num-executors
This parameter sets the total number of executor processes used to run the Spark job.
2.executor-memory
3.executor-cores
4.driver-memory
The driver memory usually does not need to be set, or about 1 GB is enough. The one thing to note is that if you use the collect operator to pull all of an RDD's data back to the driver for processing, the driver memory must be large enough, otherwise an OOM (out of memory) error occurs.
5.spark.default.parallelism
A good value for this parameter is 2-3 times num-executors * executor-cores; for example, if the executors have 300 CPU cores in total, setting about 1000 tasks is reasonable.
6.spark.storage.memoryFraction
This parameter sets the fraction of executor memory that can hold persisted RDD data; the default is 0.6, i.e. 60% of executor memory is available for persisted RDDs. Depending on the persistence strategy you choose, if memory is insufficient the data may not be persisted, or it may spill to disk.
7.spark.shuffle.memoryFraction
This parameter sets the fraction of executor memory a task can use for aggregation after it pulls the output of the previous stage's tasks during a shuffle; the default is 0.2.
8.total-executor-cores
Here is a sample spark-submit command (--total-executor-cores applies only to standalone mode, where the default is all available cores):
./bin/spark-submit \
--master spark://192.168.1.1:7077 \
--num-executors 100 \
--executor-memory 6G \
--executor-cores 4 \
--total-executor-cores 400 \
--driver-memory 1G \
--conf spark.default.parallelism=1000 \
--conf spark.storage.memoryFraction=0.5 \
--conf spark.shuffle.memoryFraction=0.3 \
<application-jar> [application-arguments]
1) no module named pyspark
export PYTHONPATH=$SPARK_HOME/python/:$PYTHONPATH
2)WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform… using builtin-java classes where applicable
Edit spark-env.sh and add the LD_LIBRARY_PATH environment variable:
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/hadoop-2.8.2/lib/native
3)WARN [Thread-378] 2015-06-11 13:41:39,712 ExternalLogger.java (line 73) SparkWorker: Your hostname, myhost1.somedomain.com resolves to a loopback address: 127.0.0.1; using 10.1.2.1 instead (on interface bond1)
WARN [Thread-378] 2015-06-11 13:41:39,714 ExternalLogger.java (line 73) SparkWorker: Set SPARK_LOCAL_IP if you need to bind to another address
export SPARK_LOCAL_IP=<the address you want Spark to bind to>
4) java.lang.UnsatisfiedLinkError: no hadoop in java.library.path
export HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_HOME}/lib/native
export HADOOP_OPTS="-Djava.library.path=${HADOOP_HOME}/lib/native/"
5)cannot access /usr/local/spark/lib/spark-assembly-*.jar: No such file or directory
See http://blog.csdn.net/Gpwner/article/details/73457108
In the hive launcher script under /usr/local/apache-hive-1.2.1-bin/bin, change
sparkAssemblyPath=`ls ${SPARK_HOME}/lib/spark-assembly-*.jar`   to
sparkAssemblyPath=`ls ${SPARK_HOME}/jars/*.jar`
6)Error: ERROR: relation “BUCKETING_COLS” already exists (state=42P07,code=0)
org.apache.hadoop.hive.metastore.HiveMetaException: Schema initialization FAILED! Metastore state would be inconsistent !!
* schemaTool failed *
The cause is that the Hive metastore database already exists in MySQL; drop it and rerun the initialization.
7) Duplicate SLF4J/log4j bindings when starting Hive
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/D:/software/slf4j-log4j12-1.7.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/D:/software/log4j-slf4j-impl-2.4.1.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
The key information is slf4j-log4j12-1.7.2.jar, log4j-slf4j-impl-2.4.1.jar, and "Class path contains multiple SLF4J bindings": the two jars both provide a binding, so one of them should be removed. After removing log4j-slf4j-impl-2.4.1.jar, startup is normal.
8)Could not create ServerSocket on address 0.0.0.0/0.0.0.0:9083
Run jps,
then kill the RunJar process and restart hive --service metastore.
9) hadoop namenode -format
Delete the DataNode data directory before each format:
rm -r /data/hadoop/
10) Call From /127.0.0.1 to :36682 failed on connection exception: java.net.ConnectException: ubuntu
On the master machine:
hostname master   (temporary change)
mv /etc/hostname /etc/hostname.bak   (for a permanent change)
vi /etc/hostname
master
hostname -i
On the slave machines:
hostname slave2   (temporary change)
mv /etc/hostname /etc/hostname.bak   (for a permanent change)
vi /etc/hostname
slave2
hostname -i
11) Exception in thread "main" org.apache.thrift.transport.TTransportException: Could not create ServerSocket on address 0.0.0.0/0.0.0.0:9083.
This happens because a Hive metastore is already running. Run jps
and kill the RunJar process with kill -9 PID.