apache-hive-3.1.3-bin.tar.gz
spark-3.0.0-bin-hadoop3.2.tgz
hadoop-3.1.3.tar.gz
Create the spark-history directory (set its permissions to 777) and the spark-jars directory on HDFS.
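A minimal sketch of that step, assuming the HDFS client is on the PATH and the cluster is already up:

hdfs dfs -mkdir /spark-history
hdfs dfs -chmod -R 777 /spark-history
hdfs dfs -mkdir /spark-jars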
Upload the Spark jars to HDFS:
hdfs dfs -D dfs.replication=1 -put ./* /spark-jars
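The ./* above is relative to the current directory, so the command is presumably run from Spark's jars directory (path taken from the install location used later in this guide):

cd /opt/spark-3.0.0-bin-hadoop3.2/jars
hdfs dfs -D dfs.replication=1 -put ./* /spark-jars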
core-site.xml:
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://hacluster</value>
  </property>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>file:///opt/hadoop-3.1.3/tmp</value>
  </property>
  <property>
    <name>io.file.buffer.size</name>
    <value>4096</value>
  </property>
  <property>
    <name>ha.zookeeper.quorum</name>
    <value>node15:2181,node16:2181,node17:2181,node18:2181</value>
  </property>
  <property>
    <name>hadoop.proxyuser.root.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.root.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.http.staticuser.user</name>
    <value>root</value>
  </property>
</configuration>
hdfs-site.xml:
<configuration>
  <property>
    <name>dfs.block.size</name>
    <value>134217728</value>
  </property>
  <property>
    <name>dfs.nameservices</name>
    <value>activeNode</value>
  </property>
  <property>
    <name>dfs.replication</name>
    <value>3</value>
  </property>
  <property>
    <name>dfs.name.dir</name>
    <value>file:///opt/hadoop-3.1.3/dfs/namenode_data</value>
  </property>
  <property>
    <name>dfs.data.dir</name>
    <value>file:///opt/hadoop-3.1.3/dfs/datanode_data</value>
  </property>
  <property>
    <name>dfs.webhdfs.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>dfs.datanode.max.transfer.threads</name>
    <value>4096</value>
  </property>
  <property>
    <name>dfs.nameservices</name>
    <value>hacluster</value>
  </property>
  <property>
    <name>dfs.ha.namenodes.hacluster</name>
    <value>nn1,nn2</value>
  </property>
  <property>
    <name>dfs.namenode.rpc-address.hacluster.nn1</name>
    <value>node15:9000</value>
  </property>
  <property>
    <name>dfs.namenode.servicerpc-address.hacluster.nn1</name>
    <value>node15:53310</value>
  </property>
  <property>
    <name>dfs.namenode.http-address.hacluster.nn1</name>
    <value>node15:50070</value>
  </property>
  <property>
    <name>dfs.namenode.rpc-address.hacluster.nn2</name>
    <value>node16:9000</value>
  </property>
  <property>
    <name>dfs.namenode.servicerpc-address.hacluster.nn2</name>
    <value>node16:53310</value>
  </property>
  <property>
    <name>dfs.namenode.http-address.hacluster.nn2</name>
    <value>node16:50070</value>
  </property>
  <property>
    <name>dfs.namenode.shared.edits.dir</name>
    <value>qjournal://node15:8485;node16:8485;node17:8485;node18:8485/hacluster</value>
  </property>
  <property>
    <name>dfs.journalnode.edits.dir</name>
    <value>/opt/hadoop-3.1.3/dfs/journalnode_data</value>
  </property>
  <property>
    <name>dfs.namenode.edits.dir</name>
    <value>/opt/hadoop-3.1.3/dfs/edits</value>
  </property>
  <property>
    <name>dfs.ha.automatic-failover.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>dfs.client.failover.proxy.provider.hacluster</name>
    <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
  </property>
  <property>
    <name>dfs.ha.fencing.methods</name>
    <value>sshfence</value>
  </property>
  <property>
    <name>dfs.ha.fencing.ssh.private-key-files</name>
    <value>/root/.ssh/id_rsa</value>
  </property>
  <property>
    <name>dfs.permissions.enabled</name>
    <value>false</value>
  </property>
</configuration>
mapred-site.xml:
<configuration>
  <property>
    <name>mapred.job.tracker</name>
    <value>node15:9001</value>
  </property>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.address</name>
    <value>node15:10020</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>node15:19888</value>
  </property>
  <property>
    <name>yarn.application.classpath</name>
    <value>/opt/hadoop-3.1.3/etc/hadoop:/opt/hadoop-3.1.3/share/hadoop/common/lib/*:/opt/hadoop-3.1.3/share/hadoop/common/*:/opt/hadoop-3.1.3/share/hadoop/hdfs:/opt/hadoop-3.1.3/share/hadoop/hdfs/lib/*:/opt/hadoop-3.1.3/share/hadoop/hdfs/*:/opt/hadoop-3.1.3/share/hadoop/mapreduce/lib/*:/opt/hadoop-3.1.3/share/hadoop/mapreduce/*:/opt/hadoop-3.1.3/share/hadoop/yarn:/opt/hadoop-3.1.3/share/hadoop/yarn/lib/*:/opt/hadoop-3.1.3/share/hadoop/yarn/*</value>
  </property>
  <property>
    <name>mapreduce.map.memory.mb</name>
    <value>1500</value>
    <description>Physical memory limit for each Map task</description>
  </property>
  <property>
    <name>mapreduce.reduce.memory.mb</name>
    <value>3000</value>
    <description>Physical memory limit for each Reduce task</description>
  </property>
  <property>
    <name>mapreduce.map.java.opts</name>
    <value>-Xmx1200m</value>
  </property>
  <property>
    <name>mapreduce.reduce.java.opts</name>
    <value>-Xmx2600m</value>
  </property>
</configuration>
node15
node16
node17
node18
node15
node16
node17
node18
yarn-site.xml:
<configuration>
  <property>
    <name>yarn.nodemanager.vmem-check-enabled</name>
    <value>false</value>
    <description>Whether virtual memory limits will be enforced for containers</description>
  </property>
  <property>
    <name>yarn.nodemanager.vmem-pmem-ratio</name>
    <value>4</value>
    <description>Ratio between virtual memory to physical memory when setting memory limits for containers</description>
  </property>
  <property>
    <name>yarn.resourcemanager.ha.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.resourcemanager.cluster-id</name>
    <value>hayarn</value>
  </property>
  <property>
    <name>yarn.resourcemanager.ha.rm-ids</name>
    <value>rm1,rm2</value>
  </property>
  <property>
    <name>yarn.resourcemanager.hostname.rm1</name>
    <value>node15</value>
  </property>
  <property>
    <name>yarn.resourcemanager.hostname.rm2</name>
    <value>node16</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.address.rm1</name>
    <value>node15:8088</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.address.rm2</name>
    <value>node16:8088</value>
  </property>
  <property>
    <name>yarn.resourcemanager.zk-address</name>
    <value>node15:2181,node16:2181,node17:2181</value>
  </property>
  <property>
    <name>yarn.resourcemanager.recovery.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.resourcemanager.store.class</name>
    <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
  </property>
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>node18</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.log-aggregation.retain-seconds</name>
    <value>604800</value>
  </property>
  <property>
    <name>yarn.log.server.url</name>
    <value>http://node15:19888/jobhistory/logs</value>
  </property>
</configuration>
hive-site.xml:
<configuration>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://node15:3306/metastore?useSSL=false</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>root</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>hadoop</value>
  </property>
  <property>
    <name>hive.metastore.warehouse.dir</name>
    <value>/user/hive/warehouse</value>
  </property>
  <property>
    <name>hive.metastore.schema.verification</name>
    <value>false</value>
  </property>
  <property>
    <name>hive.metastore.event.db.notification.api.auth</name>
    <value>false</value>
  </property>
  <property>
    <name>hive.server2.thrift.bind.host</name>
    <value>node15</value>
  </property>
  <property>
    <name>hive.server2.thrift.port</name>
    <value>10000</value>
  </property>
  <property>
    <name>spark.yarn.jars</name>
    <value>hdfs://node15:9000/spark-jars/*</value>
  </property>
  <property>
    <name>hive.execution.engine</name>
    <value>spark</value>
  </property>
  <property>
    <name>spark.home</name>
    <value>/opt/spark-3.0.0-bin-hadoop3.2/</value>
  </property>
</configuration>
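With the metastore pointed at MySQL as above, the metastore schema normally has to be initialized once before Hive is started. A minimal sketch, assuming the metastore database already exists in MySQL and Hive is installed under /opt/apache-hive-3.1.3-bin (the install path is an assumption):

/opt/apache-hive-3.1.3-bin/bin/schematool -dbType mysql -initSchema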
spark-defaults.conf:
spark.master yarn
spark.eventLog.enabled true
spark.eventLog.dir hdfs://node15:9000/spark-history
spark.executor.memory 600m
spark.driver.memory 600m
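The eventLog settings above only write to the spark-history directory; to browse those logs, the Spark history server also needs to read from it. A sketch, where the first line would be added to spark-defaults.conf and the second is run on the node that should serve the UI (script path matches the Spark install used in this guide):

spark.history.fs.logDirectory hdfs://node15:9000/spark-history
/opt/spark-3.0.0-bin-hadoop3.2/sbin/start-history-server.sh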
Symlink the needed files from Hadoop:
ln -s <source file> <new link name>
Symlink the needed file from Hive:
ln -s <source file> <new link name>
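Presumably these links put core-site.xml, hdfs-site.xml and hive-site.xml on Spark's classpath; a sketch assuming Spark's conf directory and the Hadoop/Hive install paths used elsewhere in this guide (the Hive path in particular is an assumption):

cd /opt/spark-3.0.0-bin-hadoop3.2/conf
ln -s /opt/hadoop-3.1.3/etc/hadoop/core-site.xml core-site.xml
ln -s /opt/hadoop-3.1.3/etc/hadoop/hdfs-site.xml hdfs-site.xml
ln -s /opt/apache-hive-3.1.3-bin/conf/hive-site.xml hive-site.xml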
node15
node16
node17
node18
Symlink the needed files from Hadoop:
ln -s <source file> <new link name>
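The guide does not say which files are linked at this step; a plausible reading is that the Hadoop configuration files are linked into Hive's conf directory as well. A sketch under that assumption (all paths assumed):

cd /opt/apache-hive-3.1.3-bin/conf
ln -s /opt/hadoop-3.1.3/etc/hadoop/core-site.xml core-site.xml
ln -s /opt/hadoop-3.1.3/etc/hadoop/hdfs-site.xml hdfs-site.xml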
spark-env.sh:
#!/usr/bin/env bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
export SCALA_HOME=/usr/share/scala
export JAVA_HOME=/usr/java/jdk1.8.0_241-amd64
export SPARK_HOME=/opt/spark-3.0.0-bin-hadoop3.2
export SPARK_MASTER_IP=192.168.206.215
export SPARK_MASTER_PORT=7077
export SPARK_MASTER_WEBUI_PORT=7080 # Spark's web UI port defaults to 8080; change it here to avoid a possible port conflict
export SPARK_WORKER_CORES=1
export SPARK_WORKER_INSTANCES=1
export SPARK_EXECUTOR_MEMORY=512M
export SPARK_WORKER_MEMORY=1G
export SPARK_DIST_CLASSPATH=$(/opt/hadoop-3.1.3/bin/hadoop classpath)
export HADOOP_CONF_DIR=/opt/hadoop-3.1.3/etc/hadoop
# This file is sourced when running various Spark programs.
# Copy it as spark-env.sh and edit that to configure Spark for your site.
# Options read when launching programs locally with
# ./bin/run-example or ./bin/spark-submit
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program
# Options read by executors and drivers running inside the cluster
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos
# Options read in YARN client/cluster mode
# - SPARK_CONF_DIR, Alternate conf dir. (Default: ${SPARK_HOME}/conf)
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - YARN_CONF_DIR, to point Spark towards YARN configuration files when you use YARN
# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1).
# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G)
# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G)
# Options for the daemons used in the standalone deploy mode
# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname
# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master
# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
# - SPARK_WORKER_CORES, to set the number of cores to use on this machine
# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker
# - SPARK_WORKER_DIR, to set the working directory of worker processes
# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g).
# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y")
# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
# - SPARK_DAEMON_CLASSPATH, to set the classpath for all daemons
# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers
# Options for launcher
# - SPARK_LAUNCHER_OPTS, to set config properties and Java options for the launcher (e.g. "-Dx=y")
# Generic options for the daemons used in the standalone deploy mode
# - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf)
# - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs)
# - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp)
# - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER)
# - SPARK_NICENESS The scheduling priority for daemons. (Default: 0)
# - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file.
# Options for native BLAS, like Intel MKL, OpenBLAS, and so on.
# You might get better performance to enable these options if using native BLAS (see SPARK-21305).
# - MKL_NUM_THREADS=1 Disable multi-threading of Intel MKL
# - OPENBLAS_NUM_THREADS=1 Disable multi-threading of OpenBLAS
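Once everything above is in place, a quick way to check that Hive really executes its queries on Spark is to open the Hive CLI and run a query that launches a job; the table name below is only an illustration:

hive
hive> set hive.execution.engine;
hive> create table test_spark(id int);
hive> insert into table test_spark values (1);
hive> select count(*) from test_spark;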