Hadoop is a distributed system infrastructure that primarily solves the storage and the analysis/computation of massive data sets.
Hostname | IP | NN | SNN | DN | RM | NM |
---|---|---|---|---|---|---|
bigdata01 | 192.168.1.101 | Y | | Y | | Y |
bigdata02 | 192.168.1.102 | | | Y | Y | Y |
bigdata03 | 192.168.1.103 | | Y | Y | | Y |
hosts
# Set the hostname
hostnamectl set-hostname bigdata01
# Configure name resolution inside the cluster
cat << EOF >> /etc/hosts
192.168.1.101 bigdata01
192.168.1.102 bigdata02
192.168.1.103 bigdata03
EOF
# Stop and disable the firewall
systemctl stop firewalld
systemctl disable firewalld
# Disable SELinux
setenforce 0
sed -i 's/^SELINUX=enforcing/SELINUX=disabled/' /etc/selinux/config
# Add a user
useradd omm
passwd omm
# Grant administrator privileges
# In /etc/sudoers, add the following below the line 'root ALL=(ALL) ALL'
omm ALL=(ALL) NOPASSWD: ALL
Run the following as the omm user:
#!/bin/bash
# Generate a key pair (if one does not exist) and push the public key to every node.
hosts="192.168.1.101 192.168.1.102 192.168.1.103"
passwd="abcd1234.."
sudo yum install -y sshpass
user=$(whoami)
# Answer "n" to the overwrite prompt so an existing key is kept.
echo n | ssh-keygen -t rsa -P "" -f ~/.ssh/id_rsa > /dev/null
for host in $hosts
do
echo "Spread SSH Pub Key to $host"
{
sshpass -p"$passwd" ssh-copy-id -i ~/.ssh/id_rsa.pub -p 22 -o "StrictHostKeyChecking=no" "$user@$host" &> /dev/null
}&
done
wait
Name the following script `xsync` and place it in a directory on `$PATH`, such as /bin.
#!/bin/bash
# Prerequisite on every node: sudo yum install -y rsync
# Sync a directory (first argument, defaults to the current directory) to the other nodes.
hosts="bigdata02 bigdata03"
src=$(cd "${1:-.}" && pwd)
p_dir=$(dirname "$src")
for host in $hosts
do
rsync -avz --delete "$src" "$host:$p_dir"
done
/opt
├── module
│ ├── hadoop -> hadoop-3.1.4/
│ ├── hadoop-3.1.4
│ ├── jdk -> jdk1.8.0_271/
│ └── jdk1.8.0_271
└── soft
├── hadoop-3.1.4.tar.gz
└── jdk-8u271-linux-x64.tar.gz
# Prepare directories and upload the packages
mkdir /opt/module
mkdir /opt/soft
# Extract
tar -zxf /opt/soft/jdk-8u271-linux-x64.tar.gz -C /opt/module
tar -zxf /opt/soft/hadoop-3.1.4.tar.gz -C /opt/module
# Symlinks
ln -s /opt/module/jdk1.8.0_271 /opt/module/jdk
ln -s /opt/module/hadoop-3.1.4 /opt/module/hadoop
# PATH entries (quote EOF so $PATH is expanded at login, not now)
cat << 'EOF' >> /etc/profile
# Java
export JAVA_HOME=/opt/module/jdk
export PATH=$PATH:$JAVA_HOME/bin
# Hadoop
export HADOOP_HOME=/opt/module/hadoop
export PATH=$PATH:$HADOOP_HOME/bin
# Hadoop sbin
export PATH=$PATH:$HADOOP_HOME/sbin
EOF
# Verify
[omm@bigdata01 ~]$ java -version
java version "1.8.0_271"
Java(TM) SE Runtime Environment (build 1.8.0_271-b09)
Java HotSpot(TM) 64-Bit Server VM (build 25.271-b09, mixed mode)
[omm@bigdata01 ~]$ h
h2ph hadoop-daemon.sh hardlink hdfs.cmd help hostid httpfs.sh
hadoop hadoop-daemons.sh hash hdsploader hexdump hostname hwclock
hadoop.cmd halt hdfs head history hostnamectl
[omm@bigdata01 ~]$ hadoop version
Hadoop 3.1.4
Source code repository https://github.com/apache/hadoop.git -r 1e877761e8dadd71effef30e592368f7fe66a61b
Compiled by gabota on 2020-07-21T08:05Z
Compiled with protoc 2.5.0
From source with checksum 38405c63945c88fdf7a6fe391494799b
This command was run using /opt/module/hadoop-3.1.4/share/hadoop/common/hadoop-common-3.1.4.jar
[omm@bigdata01 ~]$
Clone the VM ==> change the hostname ==> change the IP.
Hadoop configuration file directory:
/opt/module/hadoop/etc/hadoop
hadoop-env.sh
export JAVA_HOME=/opt/module/jdk
core-site.xml
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://bigdata01:8020</value>
    </property>
    <property>
        <name>hadoop.data.dir</name>
        <value>/opt/module/hadoop/data</value>
    </property>
    <property>
        <name>hadoop.proxyuser.omm.hosts</name>
        <value>*</value>
    </property>
    <property>
        <name>hadoop.proxyuser.omm.groups</name>
        <value>*</value>
    </property>
</configuration>
hdfs-site.xml
<configuration>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file://${hadoop.data.dir}/name</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file://${hadoop.data.dir}/data</value>
    </property>
    <property>
        <name>dfs.namenode.checkpoint.dir</name>
        <value>file://${hadoop.data.dir}/namesecondary</value>
    </property>
    <property>
        <name>dfs.client.datanode-restart.timeout</name>
        <value>30</value>
    </property>
    <property>
        <name>dfs.namenode.secondary.http-address</name>
        <value>bigdata03:9868</value>
    </property>
</configuration>
mapred-site.xml
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <property>
        <name>mapreduce.jobhistory.address</name>
        <value>bigdata01:10020</value>
    </property>
    <property>
        <name>mapreduce.jobhistory.webapp.address</name>
        <value>bigdata01:19888</value>
    </property>
</configuration>
yarn-site.xml
<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>bigdata02</value>
    </property>
    <property>
        <name>yarn.nodemanager.env-whitelist</name>
        <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
    </property>
    <property>
        <name>yarn.log-aggregation-enable</name>
        <value>true</value>
    </property>
    <property>
        <name>yarn.log.server.url</name>
        <value>http://bigdata01:19888/jobhistory/logs</value>
    </property>
    <property>
        <name>yarn.log-aggregation.retain-seconds</name>
        <value>604800</value>
    </property>
</configuration>
[omm@bigdata01 hadoop]$ xsync
sending incremental file list
hadoop/core-site.xml
hadoop/hadoop-env.sh
hadoop/hdfs-site.xml
hadoop/mapred-site.xml
hadoop/yarn-site.xml
sent 1,278 bytes received 305 bytes 3,166.00 bytes/sec
total size is 114,023 speedup is 72.03
sending incremental file list
hadoop/core-site.xml
hadoop/hadoop-env.sh
hadoop/hdfs-site.xml
hadoop/mapred-site.xml
hadoop/yarn-site.xml
sent 1,278 bytes received 305 bytes 3,166.00 bytes/sec
total size is 114,023 speedup is 72.03
[omm@bigdata01 hadoop]$
[omm@bigdata01 ~]$ hdfs namenode -format
[omm@bigdata01 hadoop]$ start-dfs.sh
Starting namenodes on [bigdata01]
Starting datanodes
bigdata03: WARNING: /opt/module/hadoop/logs does not exist. Creating.
bigdata02: WARNING: /opt/module/hadoop/logs does not exist. Creating.
Starting secondary namenodes [bigdata03]
[omm@bigdata01 hadoop]$
[omm@bigdata02 ~]$ start-yarn.sh
Starting resourcemanager
Starting nodemanagers
[omm@bigdata02 ~]$
[omm@bigdata01 ~]$ jps
11988 DataNode
12265 NodeManager
11834 NameNode
12380 Jps
[omm@bigdata01 ~]$
[omm@bigdata02 ~]$ jps
1940 NodeManager
1605 DataNode
1772 ResourceManager
2238 Jps
[omm@bigdata02 ~]$
[omm@bigdata03 sbin]$ jps
9425 SecondaryNameNode
9538 NodeManager
9653 Jps
9343 DataNode
[omm@bigdata03 sbin]$
NodeManager
http://bigdata01:8042/node
ResourceManager
http://bigdata02:8088/cluster
HDFS
http://bigdata01:9870/dfshealth.html#tab-overview
HistoryServer
[omm@bigdata01 spark]$ mapred --daemon start historyserver
[omm@bigdata01 spark]$ jps
8867 DataNode
8741 NameNode
9861 Jps
9099 NodeManager
9822 JobHistoryServer
[omm@bigdata01 spark]$ ss -tlunp | grep 9822
tcp LISTEN 0 128 192.168.1.101:19888 *:* users:(("java",pid=9822,fd=325))
tcp LISTEN 0 128 *:10033 *:* users:(("java",pid=9822,fd=314))
tcp LISTEN 0 128 192.168.1.101:10020 *:* users:(("java",pid=9822,fd=331))
[omm@bigdata01 spark]$
`hadoop fs ...` and `hdfs dfs ...` are equivalent commands.
Command format:
hadoop fs -command <src>... [dest]
[omm@bigdata01 ~]$ hadoop fs
Usage: hadoop fs [generic options]
[-appendToFile <localsrc> ... <dst>]
[-cat [-ignoreCrc] <src> ...]
[-checksum <src> ...]
[-chgrp [-R] GROUP PATH...]
[-chmod [-R] <MODE[,MODE]... | OCTALMODE> PATH...]
[-chown [-R] [OWNER][:[GROUP]] PATH...]
[-copyFromLocal [-f] [-p] [-l] [-d] [-t <thread count>] <localsrc> ... <dst>]
[-copyToLocal [-f] [-p] [-ignoreCrc] [-crc] <src> ... <localdst>]
[-count [-q] [-h] [-v] [-t [<storage type>]] [-u] [-x] [-e] <path> ...]
[-cp [-f] [-p | -p[topax]] [-d] <src> ... <dst>]
[-createSnapshot <snapshotDir> [<snapshotName>]]
[-deleteSnapshot <snapshotDir> <snapshotName>]
[-df [-h] [<path> ...]]
[-du [-s] [-h] [-v] [-x] <path> ...]
[-expunge [-immediate]]
[-find <path> ... <expression> ...]
[-get [-f] [-p] [-ignoreCrc] [-crc] <src> ... <localdst>]
[-getfacl [-R] <path>]
[-getfattr [-R] {-n name | -d} [-e en] <path>]
[-getmerge [-nl] [-skip-empty-file] <src> <localdst>]
[-head <file>]
[-help [cmd ...]]
[-ls [-C] [-d] [-h] [-q] [-R] [-t] [-S] [-r] [-u] [-e] [<path> ...]]
[-mkdir [-p] <path> ...]
[-moveFromLocal <localsrc> ... <dst>]
[-moveToLocal <src> <localdst>]
[-mv <src> ... <dst>]
[-put [-f] [-p] [-l] [-d] <localsrc> ... <dst>]
[-renameSnapshot <snapshotDir> <oldName> <newName>]
[-rm [-f] [-r|-R] [-skipTrash] [-safely] <src> ...]
[-rmdir [--ignore-fail-on-non-empty] <dir> ...]
[-setfacl [-R] [{-b|-k} {-m|-x <acl_spec>} <path>]|[--set <acl_spec> <path>]]
[-setfattr {-n name [-v value] | -x name} <path>]
[-setrep [-R] [-w] <rep> <path> ...]
[-stat [format] <path> ...]
[-tail [-f] [-s <sleep interval>] <file>]
[-test -[defsz] <path>]
[-text [-ignoreCrc] <src> ...]
[-touch [-a] [-m] [-t TIMESTAMP ] [-c] <path> ...]
[-touchz <path> ...]
[-truncate [-w] <length> <path> ...]
[-usage [cmd ...]]
Generic options supported are:
-conf <configuration file> specify an application configuration file
-D <property=value> define a value for a given property
-fs <file:///|hdfs://namenode:port> specify default filesystem URL to use, overrides 'fs.defaultFS' property from configurations.
-jt <local|resourcemanager:port> specify a ResourceManager
-files <file1,...> specify a comma-separated list of files to be copied to the map reduce cluster
-libjars <jar1,...> specify a comma-separated list of jar files to be included in the classpath
-archives <archive1,...> specify a comma-separated list of archives to be unarchived on the compute machines
The general command line syntax is:
command [genericOptions] [commandOptions]
[omm@bigdata01 ~]$
Data flow | Commands |
---|---|
Local => HDFS | put, copyFromLocal, moveFromLocal, appendToFile |
HDFS => HDFS | cp, mv, chown, chgrp, chmod, mkdir, du, df, cat, rm |
HDFS => Local | get, getmerge, copyToLocal |
[atguigu@hadoop102 hadoop-3.1.3]$ touch kongming.txt
[atguigu@hadoop102 hadoop-3.1.3]$ hadoop fs -moveFromLocal ./kongming.txt /sanguo/shuguo
[atguigu@hadoop102 hadoop-3.1.3]$ hadoop fs -copyFromLocal README.txt /
[atguigu@hadoop102 hadoop-3.1.3]$ touch liubei.txt
[atguigu@hadoop102 hadoop-3.1.3]$ vi liubei.txt
and enter:
san gu mao lu
[atguigu@hadoop102 hadoop-3.1.3]$ hadoop fs -appendToFile liubei.txt /sanguo/shuguo/kongming.txt
[atguigu@hadoop102 hadoop-3.1.3]$ hadoop fs -put ./zaiyiqi.txt /user/atguigu/test/
[atguigu@hadoop102 hadoop-3.1.3]$ hadoop fs -copyToLocal /sanguo/shuguo/kongming.txt ./
[atguigu@hadoop102 hadoop-3.1.3]$ hadoop fs -get /sanguo/shuguo/kongming.txt ./
[atguigu@hadoop102 hadoop-3.1.3]$ hadoop fs -getmerge /user/atguigu/test/* ./zaiyiqi.txt
[atguigu@hadoop102 hadoop-3.1.3]$ hadoop fs -ls /
[atguigu@hadoop102 hadoop-3.1.3]$ hadoop fs -mkdir -p /sanguo/shuguo
[atguigu@hadoop102 hadoop-3.1.3]$ hadoop fs -cat /sanguo/shuguo/kongming.txt
[atguigu@hadoop102 hadoop-3.1.3]$ hadoop fs -chmod 666 /sanguo/shuguo/kongming.txt
[atguigu@hadoop102 hadoop-3.1.3]$ hadoop fs -chown atguigu:atguigu /sanguo/shuguo/kongming.txt
[atguigu@hadoop102 hadoop-3.1.3]$ hadoop fs -cp /sanguo/shuguo/kongming.txt /zhuge.txt
[atguigu@hadoop102 hadoop-3.1.3]$ hadoop fs -mv /zhuge.txt /sanguo/shuguo/
[atguigu@hadoop102 hadoop-3.1.3]$ hadoop fs -tail /sanguo/shuguo/kongming.txt
[atguigu@hadoop102 hadoop-3.1.3]$ hadoop fs -rm /user/atguigu/test/jinlian2.txt
[atguigu@hadoop102 hadoop-3.1.3]$ hadoop fs -mkdir /test
[atguigu@hadoop102 hadoop-3.1.3]$ hadoop fs -rmdir /test
[atguigu@hadoop102 hadoop-3.1.3]$ hadoop fs -du -s -h /user/atguigu/test
2.7 K /user/atguigu/test
[atguigu@hadoop102 hadoop-3.1.3]$ hadoop fs -du -h /user/atguigu/test
1.3 K /user/atguigu/test/README.txt
15 /user/atguigu/test/jinlian.txt
1.4 K /user/atguigu/test/zaiyiqi.txt
The replication factor set here is only recorded in the NameNode's metadata; whether that many replicas actually exist depends on the number of DataNodes (replicas <= DataNodes).
[atguigu@hadoop102 hadoop-3.1.3]$ hadoop fs -setrep 10 /sanguo/shuguo/kongming.txt
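The same change can be made through the HDFS Java API. Below is a minimal sketch (reusing the bigdata01 address and omm user from the cluster built earlier in this guide; the path is just the example file) of FileSystem.setReplication, the call behind `hadoop fs -setrep`:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.net.URI;

public class SetRepExample {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(
                URI.create("hdfs://bigdata01:8020"), new Configuration(), "omm");
        // Only the NameNode metadata changes immediately; the number of
        // real replicas is still capped by the number of live DataNodes.
        fs.setReplication(new Path("/sanguo/shuguo/kongming.txt"), (short) 10);
        fs.close();
    }
}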
GitHub link: https://github.com/cdarlint/winutils
<properties>
    <hadoop.version>3.1.3</hadoop.version>
</properties>
<dependencyManagement>
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client-api</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client-runtime</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>
</dependencyManagement>
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client-api</artifactId>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client-runtime</artifactId>
    </dependency>
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <optional>true</optional>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-devtools</artifactId>
        <scope>runtime</scope>
        <optional>true</optional>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
        <scope>test</scope>
    </dependency>
</dependencies>
package com.simwor.bigdata.hdfs;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.net.URI;
public class HdfsTest {
private static FileSystem fileSystem;
@BeforeAll
public static void createConnection() throws IOException, InterruptedException {
fileSystem = FileSystem.get(URI.create("hdfs://bigdata01:8020"), new Configuration(), "omm");
}
@Test
public void testCopyFromLocalFile() throws IOException {
fileSystem.copyFromLocalFile(new Path("E:\\SOFT\\hadoop-3.1.4.tar.gz"), new Path("/"));
}
@Test
public void testGet() throws IOException {
fileSystem.copyToLocalFile(new Path("/hadoop-3.1.4.tar.gz"), new Path("C:\\Users\\m1553\\Desktop"));
}
@AfterAll
public static void closeConnection() throws IOException {
fileSystem.close();
}
}
[omm@bigdata01 ~]$ hadoop fs -ls /
Found 2 items
-rw-r--r-- 3 omm supergroup 147145 2021-01-17 11:39 /LICENSE.txt
-rw-r--r-- 3 omm supergroup 348326890 2021-01-17 13:53 /hadoop-3.1.4.tar.gz
[omm@bigdata01 ~]$
C:\Users\m1553\Desktop>dir
Directory of C:\Users\m1553\Desktop
2021/01/17 16:16 <DIR> .
2021/01/17 16:16 <DIR> ..
2021/01/17 16:16 2,721,312 .hadoop-3.1.4.tar.gz.crc
2021/01/17 16:16 348,326,890 hadoop-3.1.4.tar.gz
2 File(s)  351,048,202 bytes
2 Dir(s)  30,596,923,392 bytes free
C:\Users\m1553\Desktop>
During an HDFS write, the NameNode chooses the DataNode nearest to the client to receive the data.
Node distance: the sum of the distances from the two nodes to their nearest common ancestor. For example, two nodes in the same rack (/d1/r1/n1 and /d1/r1/n2) are at distance 2, while nodes in different racks of the same data center (/d1/r1/n1 and /d1/r2/n1) are at distance 4.
[omm@bigdata01 current]$ pwd
/opt/module/hadoop/data/name/current
[omm@bigdata01 current]$ ll
...
-rw-rw-r-- 1 omm omm 1048576 Feb 21 15:47 edits_0000000000000000068-0000000000000000068
-rw-rw-r-- 1 omm omm 1048576 Feb 22 09:50 edits_inprogress_0000000000000000069
...
-rw-rw-r-- 1 omm omm 398 Feb 22 09:50 fsimage_0000000000000000068
-rw-rw-r-- 1 omm omm 62 Feb 22 09:50 fsimage_0000000000000000068.md5
-rw-rw-r-- 1 omm omm 3 Feb 22 09:50 seen_txid
-rw-rw-r-- 1 omm omm 217 Feb 22 09:50 VERSION
[omm@bigdata01 current]$
While the cluster is in safe mode, important (write) operations cannot be executed. Once startup completes, the cluster leaves safe mode automatically.
bin/hdfs dfsadmin -safemode get    # query safe mode status
bin/hdfs dfsadmin -safemode enter  # enter safe mode
bin/hdfs dfsadmin -safemode leave  # leave safe mode
bin/hdfs dfsadmin -safemode wait   # wait for safe mode to end
Note that in hdfs-site.xml, dfs.namenode.heartbeat.recheck-interval is in milliseconds while dfs.heartbeat.interval is in seconds:
<property>
    <name>dfs.namenode.heartbeat.recheck-interval</name>
    <value>300000</value>
</property>
<property>
    <name>dfs.heartbeat.interval</name>
    <value>3</value>
</property>
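Safe mode can also be driven programmatically. A minimal sketch (cluster address and user assumed from the setup above) using DistributedFileSystem.setSafeMode, the call underlying `hdfs dfsadmin -safemode`:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;

import java.net.URI;

public class SafeModeExample {
    public static void main(String[] args) throws Exception {
        // hdfs:// URIs are served by DistributedFileSystem, so the cast is safe.
        DistributedFileSystem dfs = (DistributedFileSystem) FileSystem.get(
                URI.create("hdfs://bigdata01:8020"), new Configuration(), "omm");
        // SAFEMODE_GET only queries; SAFEMODE_ENTER/SAFEMODE_LEAVE switch the mode.
        boolean inSafeMode = dfs.setSafeMode(SafeModeAction.SAFEMODE_GET);
        System.out.println("in safe mode: " + inSafeMode);
        dfs.close();
    }
}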
[omm@bigdata01 ~]$ sudo rsync -avz /opt/module/hadoop-3.1.4 192.168.1.104:/opt/module
[omm@bigdata01 ~]$ sudo rsync -avz /opt/module/hadoop 192.168.1.104:/opt/module
[omm@bigdata01 ~]$ sudo rsync -avz /opt/module/jdk1.8.0_271 192.168.1.104:/opt/module
[omm@bigdata01 ~]$ sudo rsync -avz /opt/module/jdk 192.168.1.104:/opt/module
[omm@bigdata01 ~]$ sudo rsync -avz /etc/profile 192.168.1.104:/etc
[omm@bigdata04 ~]$ source /etc/profile
[omm@bigdata04 ~]$ sudo vi /etc/hosts
[omm@bigdata04 ~]$ cat /etc/hosts # configure this on all four machines
...
192.168.1.101 bigdata01
192.168.1.102 bigdata02
192.168.1.103 bigdata03
192.168.1.104 bigdata04
[omm@bigdata04 hadoop]$ pwd
/opt/module/hadoop
[omm@bigdata04 hadoop]$ rm -rf data logs
[omm@bigdata04 hadoop]$ hdfs --daemon start datanode
WARNING: /opt/module/hadoop/logs does not exist. Creating.
[omm@bigdata04 hadoop]$ yarn --daemon start nodemanager
http://bigdata01:9870/dfshealth.html#tab-datanode
http://bigdata02:8088/cluster/nodes
Adding a node does not require passwordless SSH between hosts, but cluster-wide operations such as start-dfs.sh/start-yarn.sh do. Passwordless SSH for bigdata04 has already been set up here.
[omm@bigdata01 ~]$ cd /opt/module/hadoop/etc/hadoop/
[omm@bigdata01 hadoop]$ vi hdfs-site.xml
[omm@bigdata01 hadoop]$ tail hdfs-site.xml
<property>
<name>dfs.hosts</name>
<value>/opt/module/hadoop/etc/hadoop/dfs.hosts</value>
</property>
<property>
<name>dfs.hosts.exclude</name>
<value>/opt/module/hadoop/etc/hadoop/dfs.hosts.exclude</value>
</property>
</configuration>
[omm@bigdata01 hadoop]$ touch dfs.hosts
[omm@bigdata01 hadoop]$ touch dfs.hosts.exclude
[omm@bigdata01 hadoop]$ cat << EOF > dfs.hosts
> bigdata01
> bigdata02
> bigdata03
> bigdata04
> EOF
[omm@bigdata01 hadoop]$ # sync the config files to the other cluster nodes
[omm@bigdata01 hadoop]$ stop-dfs.sh
[omm@bigdata01 hadoop]$ start-dfs.sh
[omm@bigdata01 hadoop]$ echo "bigdata04" > dfs.hosts.exclude
[omm@bigdata01 hadoop]$ hdfs dfsadmin -refreshNodes
[omm@bigdata02 hadoop]$ yarn rmadmin -refreshNodes
Decommissioning in progress
Decommissioned (watch how the data volume on bigdata01 changes)
[omm@bigdata04 ~]$ hdfs --daemon stop datanode
[omm@bigdata04 ~]$ yarn --daemon stop nodemanager
[omm@bigdata01 hadoop]$ start-balancer.sh
MapReduce is a programming framework for distributed computation; it is the core framework on which users build "Hadoop-based data analysis applications".
Its core function is to combine the user's business-logic code with built-in default components into a complete distributed program that runs concurrently on a Hadoop cluster.
A complete MapReduce program runs as three kinds of instance processes when executed in a distributed fashion.
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client-api</artifactId>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client-runtime</artifactId>
    </dependency>
</dependencies>
package com.simwor.bigdata.mapreduce;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.util.StringTokenizer;
//Mapper
// KEYIN == offset of the line
// VALUEIN == text of the line
// KEYOUT == word
// VALUEOUT == count of the word
public class WordCountMapper extends Mapper<LongWritable,Text,Text, IntWritable> {
private static final IntWritable one = new IntWritable(1);
private Text word = new Text();
/**
 * The framework feeds the file in line by line; this method turns each line into (word, 1) pairs.
 *
 * @param key the line offset
 * @param value the line text
 * @param context the task context
 * @throws IOException
 * @throws InterruptedException
 */
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while(itr.hasMoreTokens()) {
this.word.set(itr.nextToken());
context.write(this.word, this.one);
}
}
}
package com.simwor.bigdata.mapreduce;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
//Reducer
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();
/**
* The framework feeds in records grouped by equal KEYIN; this method accumulates the VALUEINs.
* @param key
* @param values
* @param context
* @throws IOException
* @throws InterruptedException
*/
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for(IntWritable value : values)
sum += value.get();
this.result.set(sum);
context.write(key, this.result);
}
}
Acts as the client of the YARN cluster: it submits our whole program to YARN, in the form of a Job object wrapping the MapReduce program's run parameters.
package com.simwor.bigdata.mapreduce;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class WordCountDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
//1. Get a Job instance
Job job = Job.getInstance();
//2. Set the jar
job.setJarByClass(WordCountDriver.class);
//3. Set the Mapper and Reducer
job.setMapperClass(WordCountMapper.class);
job.setReducerClass(WordCountReducer.class);
//4. Set the map-side and final output types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//5. Set the input and output paths
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
//6. Submit the job
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
}
[omm@bigdata01 hadoop]$ pwd
/opt/module/hadoop
[omm@bigdata01 hadoop]$ mkdir custom-jobs
[omm@bigdata01 hadoop]$ ll custom-jobs/
total 41436
-rw-rw-r-- 1 omm omm 42427660 Feb 22 18:15 mapreduce-1.0-SNAPSHOT.jar
[omm@bigdata01 hadoop]$ hdfs dfs -put ./README.txt /
[omm@bigdata01 hadoop]$ yarn jar custom-jobs/mapreduce-1.0-SNAPSHOT.jar com.simwor.bigdata.mapreduce.WordCountDriver /README.txt /wordcount
....
2021-02-22 18:31:55,092 INFO mapreduce.Job: map 0% reduce 0%
2021-02-22 18:31:59,143 INFO mapreduce.Job: map 100% reduce 0%
2021-02-22 18:32:04,271 INFO mapreduce.Job: map 100% reduce 100%
2021-02-22 18:32:04,281 INFO mapreduce.Job: Job job_1613981178835_0001 completed successfully
...
Java type | Hadoop Writable type |
---|---|
Boolean | BooleanWritable |
Byte | ByteWritable |
Int | IntWritable |
Float | FloatWritable |
Long | LongWritable |
Double | DoubleWritable |
String | Text |
Map | MapWritable |
Array | ArrayWritable |
package com.simwor.mapreduce.writable;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
@Data
@NoArgsConstructor
public class FlowBean implements Writable {
private long upFlow;
private long downFlow;
private long sumFlow;
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeLong(upFlow);
dataOutput.writeLong(downFlow);
dataOutput.writeLong(sumFlow);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
this.upFlow = dataInput.readLong();
this.downFlow = dataInput.readLong();
this.sumFlow = dataInput.readLong();
}
public void set(long upFlow, long downFlow) {
this.upFlow = upFlow;
this.downFlow = downFlow;
this.sumFlow = upFlow + downFlow;
}
}
package com.simwor.mapreduce.writable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class FlowMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
private Text phone = new Text();
private FlowBean flowBean = new FlowBean();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] fields = line.split("\t");
phone.set(fields[1]);
flowBean.set(Long.parseLong(fields[fields.length-3]),Long.parseLong(fields[fields.length-2]));
context.write(phone, flowBean);
}
}
package com.simwor.mapreduce.writable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class FlowReducer extends Reducer<Text,FlowBean,Text,FlowBean> {
private FlowBean flowBean = new FlowBean();
@Override
protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
long sumUpFlow = 0;
long sumDownFlow = 0;
for(FlowBean value : values) {
sumUpFlow += value.getUpFlow();
sumDownFlow += value.getDownFlow();
}
flowBean.set(sumUpFlow, sumDownFlow);
context.write(key, flowBean);
}
}
package com.simwor.mapreduce.writable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class FlowDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Job job = Job.getInstance(new Configuration());
job.setJarByClass(FlowDriver.class);
job.setMapperClass(FlowMapper.class);
job.setReducerClass(FlowReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
FileInputFormat.setInputPaths(job, new Path("D:\\input"));
FileOutputFormat.setOutputPath(job, new Path("D:\\output"));
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
}
13470253144 FlowBean(upFlow=180, downFlow=180, sumFlow=360)
13509468723 FlowBean(upFlow=7335, downFlow=110349, sumFlow=117684)
13560439638 FlowBean(upFlow=918, downFlow=4938, sumFlow=5856)
13568436656 FlowBean(upFlow=3597, downFlow=25635, sumFlow=29232)
13590439668 FlowBean(upFlow=1116, downFlow=954, sumFlow=2070)
13630577991 FlowBean(upFlow=6960, downFlow=690, sumFlow=7650)
13682846555 FlowBean(upFlow=1938, downFlow=2910, sumFlow=4848)
13729199489 FlowBean(upFlow=240, downFlow=0, sumFlow=240)
13736230513 FlowBean(upFlow=2481, downFlow=24681, sumFlow=27162)
13768778790 FlowBean(upFlow=120, downFlow=120, sumFlow=240)
13846544121 FlowBean(upFlow=264, downFlow=0, sumFlow=264)
13956435636 FlowBean(upFlow=132, downFlow=1512, sumFlow=1644)
13966251146 FlowBean(upFlow=240, downFlow=0, sumFlow=240)
13975057813 FlowBean(upFlow=11058, downFlow=48243, sumFlow=59301)
13992314666 FlowBean(upFlow=3008, downFlow=3720, sumFlow=6728)
15043685818 FlowBean(upFlow=3659, downFlow=3538, sumFlow=7197)
15910133277 FlowBean(upFlow=3156, downFlow=2936, sumFlow=6092)
15959002129 FlowBean(upFlow=1938, downFlow=180, sumFlow=2118)
18271575951 FlowBean(upFlow=1527, downFlow=2106, sumFlow=3633)
18390173782 FlowBean(upFlow=9531, downFlow=2412, sumFlow=11943)
84188413 FlowBean(upFlow=4116, downFlow=1432, sumFlow=5548)
The flow above is the complete MapReduce working process; the Shuffle phase proper only spans steps 7 through 16.
The size of the Shuffle buffer affects the execution efficiency of a MapReduce job: in principle, the larger the buffer, the fewer disk I/Os and the faster the job runs.
The buffer size is adjustable via the parameter mapreduce.task.io.sort.mb (formerly io.sort.mb), default 100 MB.
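A hedged sketch of raising that buffer for a single job (the value 200 is purely illustrative; the property names match the tuning tables later in this section):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import java.io.IOException;

public class ShuffleTuning {
    public static Job newTunedJob() throws IOException {
        Configuration conf = new Configuration();
        // A larger sort buffer means fewer spills to disk (per-job setting).
        conf.setInt("mapreduce.task.io.sort.mb", 200);             // default 100 (MB)
        conf.setFloat("mapreduce.map.sort.spill.percent", 0.80f);  // default 0.80
        return Job.getInstance(conf);
    }
}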
job.setNumReduceTasks(10);
Combiner
Every MR phase spills data to disk; if the data is locally combined before spilling, disk I/O and network I/O drop substantially.
OutputFormat is the base class for MapReduce output; every MapReduce output implementation implements the OutputFormat interface.
Type | Description |
---|---|
FileInputFormat | [default] splits input by file |
DBInputFormat | splits input from a database |
TextInputFormat | [default] splits by file and HDFS block, and turns each split into (LongWritable, Text) records as the Mapper's input |
CombineFileInputFormat | packs small files together, avoiding one MapTask per small file |
NLineInputFormat | splits by number of lines |
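A hedged sketch of the small-file row above: switching a job to CombineTextInputFormat, the text-oriented subclass of CombineFileInputFormat, so that many small files are packed into fewer splits (the 4 MB limit is an arbitrary example value):
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;

public class CombineInputExample {
    public static void configure(Job job) {
        job.setInputFormatClass(CombineTextInputFormat.class);
        // Small files are grouped together until a split reaches this size (bytes).
        CombineTextInputFormat.setMaxInputSplitSize(job, 4 * 1024 * 1024);
    }
}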
InputFormat
package com.simwor.mapreduce.inputformat;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import java.io.IOException;
// CustomInputFormat does two things:
// 1. Splitting - already provided by FileInputFormat
// 2. Mapping each file into KV pairs - what we implement here
public class CustomInputFormat extends FileInputFormat<Text, BytesWritable> {
@Override
public RecordReader<Text, BytesWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
return new CustomRecordReader();
}
}
package com.simwor.mapreduce.inputformat;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
// Customization: turn the whole file into a single KV pair, i.e. one file becomes one Mapper record
public class CustomRecordReader extends RecordReader<Text, BytesWritable> {
private boolean isRead = false;
private Text key = new Text();
private BytesWritable value = new BytesWritable();
private FileSplit fileSplit;
private FSDataInputStream inputStream;
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException {
//open the stream
fileSplit = (FileSplit) inputSplit;
FileSystem fileSystem = FileSystem.get(taskAttemptContext.getConfiguration());
inputStream = fileSystem.open(fileSplit.getPath());
}
/**
 * Read the next KV pair
 * @return whether one exists
 * @throws IOException
 */
@Override
public boolean nextKeyValue() throws IOException {
if(!isRead) {
key.set(fileSplit.getPath().toString());
byte[] buffer = new byte[(int) fileSplit.getLength()];
IOUtils.readFully(inputStream, buffer, 0, buffer.length); // read() may stop short; readFully reads the whole file
value.set(buffer, 0, buffer.length);
isRead = true;
return true;
}
return false;
}
@Override
public Text getCurrentKey() {
return key;
}
@Override
public BytesWritable getCurrentValue() {
return value;
}
@Override
public float getProgress() {
return isRead ? 1 : 0;
}
@Override
public void close() {
IOUtils.closeStream(inputStream);
}
}
package com.simwor.mapreduce.inputformat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import java.io.IOException;
// No Map or Reduce logic is needed: split by file, map each file to one KV pair, and stream it out.
public class CustomDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Job job = Job.getInstance(new Configuration());
job.setJarByClass(CustomDriver.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(BytesWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(BytesWritable.class);
job.setInputFormatClass(CustomInputFormat.class);
//write all results straight out as a SequenceFile
job.setOutputFormatClass(SequenceFileOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path("d:/input"));
FileOutputFormat.setOutputPath(job, new Path("d:/output"));
boolean b = job.waitForCompletion(true);
System.exit(b ? 0 : 1);
}
}
Partitioner
Deciding which partition each KV pair is sent to is this class's job.
job.setNumReduceTasks(5);
job.setPartitionerClass(MyPartitioner.class);
package com.atguigu.mr.partitioner;
import com.atguigu.mr.flow.FlowBean;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class MyPartitioner extends Partitioner<Text, FlowBean> {
/**
 * For each KV pair, return its partition number
 * @param text the phone number
 * @param flowBean the traffic record
 * @param numPartitions
 * @return
 */
@Override
public int getPartition(Text text, FlowBean flowBean, int numPartitions) {
//take the first three digits of the phone number
String phone_head = text.toString().substring(0, 3);
switch (phone_head) {
case "136":
return 0;
case "137":
return 1;
case "138":
return 2;
case "139":
return 3;
default:
return 4;
}
}
}
WritableComparable
The Shuffle phase sorts KV pairs by key, so to sort on some data you only need to wrap it in a bean that implements WritableComparable and use it as the Mapper output key (Mapper<KEYIN, VALUEIN, NeedCompareBean, ...>).
package com.atguigu.mr.compare;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
* Implements the WritableComparable interface
*/
public class FlowBean implements WritableComparable<FlowBean> {
private long upFlow;
private long downFlow;
private long sumFlow;
@Override
public String toString() {
return upFlow + "\t" + downFlow + "\t" + sumFlow;
}
public void set(long upFlow, long downFlow) {
this.upFlow = upFlow;
this.downFlow = downFlow;
this.sumFlow = upFlow + downFlow;
}
public long getUpFlow() {
return upFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public long getDownFlow() {
return downFlow;
}
public void setDownFlow(long downFlow) {
this.downFlow = downFlow;
}
public long getSumFlow() {
return sumFlow;
}
public void setSumFlow(long sumFlow) {
this.sumFlow = sumFlow;
}
/**
 * Write the object's fields out to where the framework designates
 * @param dataOutput the data sink
 * @throws IOException
 */
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeLong(upFlow);
dataOutput.writeLong(downFlow);
dataOutput.writeLong(sumFlow);
}
/**
 * Read data back from where the framework designates to populate the object
 * @param dataInput the data source
 * @throws IOException
 */
public void readFields(DataInput dataInput) throws IOException {
this.upFlow = dataInput.readLong();
this.downFlow = dataInput.readLong();
this.sumFlow = dataInput.readLong();
}
/**
 * Comparison method: order by total flow, descending
 * @param o
 * @return
 */
@Override
public int compareTo(FlowBean o) {
return Long.compare(o.sumFlow, this.sumFlow);
}
}
The data delivered to the Reducer arrives already ordered according to FlowBean's custom comparison method.
package com.atguigu.mr.compare;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class CompareMapper extends Mapper<LongWritable, Text, FlowBean, Text> {
private Text phone = new Text();
private FlowBean flow = new FlowBean();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//one line of data
String line = value.toString();
//split it
String[] fields = line.split("\t");
//populate
phone.set(fields[0]);
flow.setUpFlow(Long.parseLong(fields[1]));
flow.setDownFlow(Long.parseLong(fields[2]));
flow.setSumFlow(Long.parseLong(fields[3]));
//write out
context.write(flow, phone);
}
}
Combiner
job.setCombinerClass(WordcountCombiner.class);
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;

public class WordcountCombiner extends Reducer<Text, IntWritable, Text, IntWritable>{
@Override
protected void reduce(Text key, Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {
// 1 Sum up
int count = 0;
for(IntWritable v :values){
count += v.get();
}
// 2 Write out
context.write(key, new IntWritable(count));
}
}
GroupingComparator
Given the following order data, find the most expensive product in each order.
0000001 Pdt_01 222.8
0000002 Pdt_05 722.4
0000001 Pdt_02 33.8
0000003 Pdt_06 232.8
0000003 Pdt_02 33.8
0000002 Pdt_03 522.8
0000002 Pdt_04 122.4
0000001 Pdt_01 222.8
0000002 Pdt_05 722.4
0000003 Pdt_06 232.8
package com.atguigu.mr.grouping;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class OrderBean implements WritableComparable<OrderBean> {
private String orderId;
private String productId;
private double price;
@Override
public String toString() {
return orderId + "\t" + productId + "\t" + price;
}
public String getOrderId() {
return orderId;
}
public void setOrderId(String orderId) {
this.orderId = orderId;
}
public String getProductId() {
return productId;
}
public void setProductId(String productId) {
this.productId = productId;
}
public double getPrice() {
return price;
}
public void setPrice(double price) {
this.price = price;
}
/**
* Sort logic: first by order id; within the same order, by price descending
* @param o
* @return
*/
@Override
public int compareTo(OrderBean o) {
int compare = this.orderId.compareTo(o.orderId);
if (compare != 0) {
return compare;
} else {
return Double.compare(o.price, this.price);
}
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(orderId);
out.writeUTF(productId);
out.writeDouble(price);
}
@Override
public void readFields(DataInput in) throws IOException {
this.orderId = in.readUTF();
this.productId = in.readUTF();
this.price = in.readDouble();
}
}
package com.atguigu.mr.grouping;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
* Populates the OrderBean
*/
public class OrderMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable> {
private OrderBean order = new OrderBean();
/**
* Map populates the OrderBean
* @param key
* @param value
* @param context
* @throws IOException
* @throws InterruptedException
*/
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//split
String[] fields = value.toString().split("\t");
//populate
order.setOrderId(fields[0]);
order.setProductId(fields[1]);
order.setPrice(Double.parseDouble(fields[2]));
context.write(order, NullWritable.get());
}
}
package com.atguigu.mr.grouping;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
* Groups the data by order id
*/
public class OrderComparator extends WritableComparator {
protected OrderComparator() {
super(OrderBean.class, true);
}
/**
* Grouping comparison: records with the same order id fall into the same group
* @param a
* @param b
* @return
*/
@Override
public int compare(WritableComparable a, WritableComparable b) {
OrderBean oa = (OrderBean) a;
OrderBean ob = (OrderBean) b;
return oa.getOrderId().compareTo(ob.getOrderId());
}
}
package com.atguigu.mr.grouping;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.Iterator;
/**
* Take the highest price in each order (extension: take the top two prices)
*/
public class OrderReducer extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable> {
/**
* Take the top two prices of each order
* @param key the order info
* @param values nothing useful in them
* @param context
* @throws IOException
* @throws InterruptedException
*/
@Override
protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
Iterator<NullWritable> iterator = values.iterator();
for (int i = 0; i < 2; i++) {
if (iterator.hasNext()) {
// ****** when iterator.next() is called ******
// ****** the key object's value is reset to the next record in the group ******
NullWritable value = iterator.next();
context.write(key, value);
}
}
}
}
package com.atguigu.mr.grouping;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class OrderDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Job job = Job.getInstance(new Configuration());
job.setJarByClass(OrderDriver.class);
job.setMapperClass(OrderMapper.class);
job.setReducerClass(OrderReducer.class);
job.setMapOutputKeyClass(OrderBean.class);
job.setMapOutputValueClass(NullWritable.class);
//set the grouping comparator
job.setGroupingComparatorClass(OrderComparator.class);
job.setOutputKeyClass(OrderBean.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job, new Path("d:/input"));
FileOutputFormat.setOutputPath(job, new Path("d:/output"));
boolean b = job.waitForCompletion(true);
System.exit(b ? 0 : 1);
}
}
OutputFormat
package com.atguigu.mr.outputformat;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class MyOutputFormat extends FileOutputFormat<LongWritable, Text> {
/**
* Return a RecordWriter that processes the data
* @param job
* @return
* @throws IOException
* @throws InterruptedException
*/
@Override
public RecordWriter<LongWritable, Text> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
return new MyRecordWriter(job);
}
}
package com.atguigu.mr.outputformat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
* Writes records to one of two files depending on whether they contain "atguigu"
*/
public class MyRecordWriter extends RecordWriter<LongWritable, Text> {
FSDataOutputStream atguigu = null;
FSDataOutputStream other = null;
public MyRecordWriter(TaskAttemptContext job) throws IOException {
Configuration configuration = job.getConfiguration();
String outDir = configuration.get(FileOutputFormat.OUTDIR);
FileSystem fileSystem = FileSystem.get(configuration);
atguigu = fileSystem.create(new Path(outDir + "/atguigu.log"));
other = fileSystem.create(new Path(outDir + "/other.log"));
}
/**
* Receive a KV pair and write it to a different file depending on its value
* @param key the offset of the line read
* @param value the content of the line
* @throws IOException
* @throws InterruptedException
*/
@Override
public void write(LongWritable key, Text value) throws IOException, InterruptedException {
//get the line content
String line = value.toString() + "\n";
//check whether it contains "atguigu"
if (line.contains("atguigu")) {
//write to the atguigu file
atguigu.write(line.getBytes());
} else {
//write to the other file
other.write(line.getBytes());
}
}
/**
* Close resources
* @param context
* @throws IOException
* @throws InterruptedException
*/
@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
IOUtils.closeStream(atguigu);
IOUtils.closeStream(other);
}
}
package com.atguigu.mr.outputformat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class OutputDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Job job = Job.getInstance(new Configuration());
job.setJarByClass(OutputDriver.class);
job.setOutputFormatClass(MyOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path("d:/input"));
FileOutputFormat.setOutputPath(job, new Path("d:/output2"));
boolean b = job.waitForCompletion(true);
System.exit(b ? 0 : 1);
}
}
Hadoop currently ships three job schedulers: FIFO, the Capacity Scheduler, and the Fair Scheduler. Hadoop 3.1.3 uses the Capacity Scheduler by default.
<property>
    <description>The class to use as the resource scheduler.</description>
    <name>yarn.resourcemanager.scheduler.class</name>
    <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler</value>
</property>
<property>
    <name>yarn.scheduler.capacity.root.queues</name>
    <value>default,hive</value>
    <description>
      The queues at the this level (root is the root queue).
    </description>
</property>
<property>
    <name>yarn.scheduler.capacity.root.default.capacity</name>
    <value>40</value>
</property>
<property>
    <name>yarn.scheduler.capacity.root.hive.capacity</name>
    <value>60</value>
</property>
<property>
    <name>yarn.scheduler.capacity.root.hive.maximum-capacity</name>
    <value>100</value>
</property>
<property>
    <name>yarn.scheduler.capacity.root.hive.user-limit-factor</name>
    <value>1</value>
</property>
<property>
    <name>yarn.scheduler.capacity.root.hive.state</name>
    <value>RUNNING</value>
</property>
<property>
    <name>yarn.scheduler.capacity.root.hive.acl_submit_applications</name>
    <value>*</value>
</property>
<property>
    <name>yarn.scheduler.capacity.root.hive.acl_administer_queue</name>
    <value>*</value>
</property>
By default, jobs are submitted to the default queue. To submit to a different queue, declare it in the Driver.
public class WcDrvier {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration configuration = new Configuration();
configuration.set("mapreduce.job.queuename", "hive"); // old key: mapred.job.queue.name
//1. Get a Job instance
Job job = Job.getInstance(configuration);
//2. Set the classpath
job.setJarByClass(WcDrvier.class);
//3. Set the Mapper and Reducer
job.setMapperClass(WcMapper.class);
job.setReducerClass(WcReducer.class);
//4. Set the Mapper and Reducer output types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setCombinerClass(WcReducer.class);
//5. Set the input and output paths
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
//6. Submit the job
boolean b = job.waitForCompletion(true);
System.exit(b ? 0 : 1);
}
}
<property>
    <name>mapreduce.map.speculative</name>
    <value>true</value>
    <description>If true, then multiple instances of some map tasks may be executed in parallel.</description>
</property>
<property>
    <name>mapreduce.reduce.speculative</name>
    <value>true</value>
    <description>If true, then multiple instances of some reduce tasks may be executed in parallel.</description>
</property>
MapReduce tuning is considered from six angles: data input, the Map phase, the Reduce phase, I/O transfer, data skew, and common tuning parameters.
The following parameters take effect when configured in the user's own MR application (mapred-default.xml):
Parameter | Description |
---|---|
mapreduce.map.memory.mb | Resource limit for one MapTask (MB), default 1024. A MapTask that exceeds it is forcibly killed. |
mapreduce.reduce.memory.mb | Resource limit for one ReduceTask (MB), default 1024. A ReduceTask that exceeds it is forcibly killed. |
mapreduce.map.cpu.vcores | Maximum CPU cores per MapTask, default 1 |
mapreduce.reduce.cpu.vcores | Maximum CPU cores per ReduceTask, default 1 |
mapreduce.reduce.shuffle.parallelcopies | Parallelism of each Reduce fetching data from Maps, default 5 |
mapreduce.reduce.shuffle.merge.percent | Fraction of the buffer at which data starts spilling to disk, default 0.66 |
mapreduce.reduce.shuffle.input.buffer.percent | Fraction of Reduce memory used for the shuffle buffer, default 0.7 |
mapreduce.reduce.input.buffer.percent | Fraction of memory used to retain buffered map outputs during reduce, default 0.0 |
Parameter | Description |
---|---|
yarn.scheduler.minimum-allocation-mb | Minimum memory allocated to an application Container, default 1024 |
yarn.scheduler.maximum-allocation-mb | Maximum memory allocated to an application Container, default 8192 |
yarn.scheduler.minimum-allocation-vcores | Minimum CPU cores per Container request, default 1 |
yarn.scheduler.maximum-allocation-vcores | Maximum CPU cores per Container request, default 32 |
yarn.nodemanager.resource.memory-mb | Maximum physical memory available to Containers on a node, default 8192 |
Parameter | Description |
---|---|
mapreduce.task.io.sort.mb | Size of the circular shuffle buffer, default 100 MB |
mapreduce.map.sort.spill.percent | Spill threshold of the circular buffer, default 80% |
Parameter | Description |
---|---|
mapreduce.map.maxattempts | Maximum retries per Map Task; beyond this the task is considered failed. Default 4. |
mapreduce.reduce.maxattempts | Maximum retries per Reduce Task; beyond this the task is considered failed. Default 4. |
mapreduce.task.timeout | Task timeout, a parameter that often needs tuning. If a Task neither reads new input nor produces output for this long (milliseconds), it is considered blocked, possibly hung forever; to keep user programs from blocking indefinitely, this timeout is enforced. Default 600000. If processing each input record takes a long time (e.g. database access or pulling data over the network), raise this value; a typical error when it is too small is "AttemptID:attempt_14267829456721_123456_m_000224_0 Timed out after 300 secsContainer killed by the ApplicationMaster.". |
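A hedged sketch of applying a few of the per-application parameters above to one job (the values are illustrative, not recommendations):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import java.io.IOException;

public class TuningExample {
    public static Job newJob() throws IOException {
        Configuration conf = new Configuration();
        conf.setInt("mapreduce.map.memory.mb", 2048);     // default 1024
        conf.setInt("mapreduce.reduce.memory.mb", 2048);  // default 1024
        conf.setInt("mapreduce.map.cpu.vcores", 2);       // default 1
        conf.setInt("mapreduce.task.timeout", 1200000);   // ms, default 600000
        return Job.getInstance(conf);
    }
}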
The two NameNodes must be able to ssh to each other without a password.
Fencing: at any given moment only one NameNode serves clients.
Automatic failover requires two new components: ZooKeeper and the ZKFailoverController (ZKFC) process.
Every host that runs a NameNode also runs a ZKFC process, which is responsible for health monitoring, ZooKeeper session management, and ZooKeeper-based leader election.
官方文档:https://hadoop.apache.org/docs/r3.2.2/hadoop-project-dist/hadoop-hdfs/HDFSHighAvailabilityWithQJM.html
hadoop102 | hadoop103 | hadoop104 |
---|---|---|
NameNode | NameNode | NameNode |
ZKFC | ZKFC | ZKFC |
JournalNode | JournalNode | JournalNode |
DataNode | DataNode | DataNode |
ZK | ZK | ZK |
ResourceManager | | |
NodeManager | NodeManager | NodeManager |
[omm@bigdata01 ~]$ sudo mkdir /opt/ha
[omm@bigdata01 ~]$ sudo chown omm.wheel /opt/ha
[omm@bigdata01 ~]$ cp -r /opt/module/hadoop-3.1.4 /opt/ha/
[omm@bigdata01 ~]$ cd /opt/ha/
[omm@bigdata01 ha]$ ln -s hadoop-3.1.4 hadoop
[omm@bigdata01 ha]$ cd hadoop
[omm@bigdata01 hadoop]$ rm -rf data logs
core-site.xml
[omm@bigdata01 hadoop]$ cd etc/hadoop/
[omm@bigdata01 hadoop]$ vim core-site.xml
[omm@bigdata01 hadoop]$ cat core-site.xml
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://mycluster</value>
    </property>
    <property>
        <name>hadoop.data.dir</name>
        <value>/opt/ha/hadoop/data</value>
    </property>
    <property>
        <name>hadoop.proxyuser.omm.hosts</name>
        <value>*</value>
    </property>
    <property>
        <name>hadoop.proxyuser.omm.groups</name>
        <value>*</value>
    </property>
</configuration>
[omm@bigdata01 hadoop]$
hdfs-site.xml
[omm@bigdata01 hadoop]$ vim hdfs-site.xml
[omm@bigdata01 hadoop]$ cat hdfs-site.xml
<configuration>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file://${hadoop.data.dir}/name</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file://${hadoop.data.dir}/data</value>
    </property>
    <property>
        <name>dfs.nameservices</name>
        <value>mycluster</value>
    </property>
    <property>
        <name>dfs.ha.namenodes.mycluster</name>
        <value>nn1,nn2,nn3</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address.mycluster.nn1</name>
        <value>bigdata01:8020</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address.mycluster.nn2</name>
        <value>bigdata02:8020</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address.mycluster.nn3</name>
        <value>bigdata03:8020</value>
    </property>
    <property>
        <name>dfs.namenode.http-address.mycluster.nn1</name>
        <value>bigdata01:9870</value>
    </property>
    <property>
        <name>dfs.namenode.http-address.mycluster.nn2</name>
        <value>bigdata02:9870</value>
    </property>
    <property>
        <name>dfs.namenode.http-address.mycluster.nn3</name>
        <value>bigdata03:9870</value>
    </property>
    <property>
        <name>dfs.namenode.shared.edits.dir</name>
        <value>qjournal://bigdata01:8485;bigdata02:8485;bigdata03:8485/mycluster</value>
    </property>
    <property>
        <name>dfs.client.failover.proxy.provider.mycluster</name>
        <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
    </property>
    <property>
        <name>dfs.ha.fencing.methods</name>
        <value>sshfence</value>
    </property>
    <property>
        <name>dfs.ha.fencing.ssh.private-key-files</name>
        <value>/home/omm/.ssh/id_rsa</value>
    </property>
    <property>
        <name>dfs.journalnode.edits.dir</name>
        <value>${hadoop.data.dir}/jn</value>
    </property>
    <property>
        <name>dfs.ha.automatic-failover.enabled</name>
        <value>true</value>
    </property>
    <property>
        <name>ha.zookeeper.quorum</name>
        <value>bigdata01:2181,bigdata02:2181,bigdata03:2181</value>
    </property>
</configuration>
[omm@bigdata01 ~]$ cd /opt
[omm@bigdata01 opt]$ sudo xsync ha
[omm@bigdata01 ~]$ # do this on all three nodes
[omm@bigdata01 ~]$ sudo vi /etc/profile
[omm@bigdata01 ~]$ tail /etc/profile
# Java
export JAVA_HOME=/opt/module/jdk
export PATH=$PATH:$JAVA_HOME/bin
# Hadoop
export HADOOP_HOME=/opt/ha/hadoop
export PATH=$PATH:$HADOOP_HOME/bin
# Hadoop sbin
export PATH=$PATH:$HADOOP_HOME/sbin
[omm@bigdata01 ~]$ source /etc/profile
[omm@bigdata01 ~]$ which hdfs
/opt/ha/hadoop/bin/hdfs
[omm@bigdata01 ~]$ hdfs --workers --daemon start journalnode # the journal cluster that stores NN edit logs
bigdata01: WARNING: /opt/ha/hadoop-3.1.4/logs does not exist. Creating.
bigdata02: WARNING: /opt/ha/hadoop-3.1.4/logs does not exist. Creating.
bigdata03: WARNING: /opt/ha/hadoop-3.1.4/logs does not exist. Creating.
[omm@bigdata01 ~]$ hdfs namenode -format # format
[omm@bigdata01 ~]$ hdfs --daemon start namenode # start the primary NN
[omm@bigdata02 ~]$ hdfs namenode -bootstrapStandby # sync the standby NN
[omm@bigdata02 ~]$ hdfs --daemon start namenode
[omm@bigdata03 ~]$ hdfs namenode -bootstrapStandby # sync the standby NN
[omm@bigdata03 ~]$ hdfs --daemon start namenode
[omm@bigdata01 ~]$ hdfs zkfc -formatZK # format ZK
[omm@bigdata01 ~]$ start-dfs.sh
Expected state: one active and two standby.
[omm@bigdata03 ~]$ jps
1505 QuorumPeerMain
4118 Jps
3991 NodeManager
3560 NameNode
3740 JournalNode
3629 DataNode
3869 DFSZKFailoverController
[omm@bigdata03 ~]$ kill -9 3560
fuser: command not found
[omm@bigdata01 ~]$ yum install psmisc
Reference: https://hadoop.apache.org/docs/r3.2.2/hadoop-yarn/hadoop-yarn-site/ResourceManagerHA.html
hadoop102 | hadoop103 | hadoop104 |
---|---|---|
NameNode | NameNode | NameNode |
ZKFC | ZKFC | ZKFC |
JournalNode | JournalNode | JournalNode |
DataNode | DataNode | DataNode |
ZK | ZK | ZK |
ResourceManager | ResourceManager | |
NodeManager | NodeManager | NodeManager |
[omm@bigdata01 hadoop]$ pwd
/opt/ha/hadoop/etc/hadoop
[omm@bigdata01 hadoop]$ cat yarn-site.xml
<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>bigdata02</value>
    </property>
    <property>
        <name>yarn.nodemanager.env-whitelist</name>
        <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
    </property>
    <property>
        <name>yarn.log-aggregation-enable</name>
        <value>true</value>
    </property>
    <property>
        <name>yarn.log.server.url</name>
        <value>http://bigdata01:19888/jobhistory/logs</value>
    </property>
    <property>
        <name>yarn.log-aggregation.retain-seconds</name>
        <value>604800</value>
    </property>
    <property>
        <name>yarn.resourcemanager.ha.enabled</name>
        <value>true</value>
    </property>
    <property>
        <name>yarn.resourcemanager.cluster-id</name>
        <value>cluster-yarn1</value>
    </property>
    <property>
        <name>yarn.resourcemanager.ha.rm-ids</name>
        <value>rm1,rm2</value>
    </property>
    <property>
        <name>yarn.resourcemanager.hostname.rm1</name>
        <value>bigdata01</value>
    </property>
    <property>
        <name>yarn.resourcemanager.hostname.rm2</name>
        <value>bigdata02</value>
    </property>
    <property>
        <name>yarn.resourcemanager.zk-address</name>
        <value>bigdata01:2181,bigdata02:2181,bigdata03:2181</value>
    </property>
    <property>
        <name>yarn.resourcemanager.recovery.enabled</name>
        <value>true</value>
    </property>
    <property>
        <name>yarn.resourcemanager.store.class</name>
        <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
    </property>
</configuration>
[omm@bigdata01 hadoop]$ cd ..
[omm@bigdata01 etc]$ xsync hadoop
[omm@bigdata02 ~]$ start-yarn.sh
Starting resourcemanagers on [ bigdata01 bigdata02]
Starting nodemanagers
bigdata03: nodemanager is running as process 2067. Stop it first.
bigdata02: nodemanager is running as process 2118. Stop it first.
bigdata01: nodemanager is running as process 2579. Stop it first.
[omm@bigdata01 ~]$ yarn rmadmin -getServiceState rm1
active
[omm@bigdata01 ~]$ yarn rmadmin -getServiceState rm2
standby
[omm@bigdata01 ~]$
Different applications can use different NameNodes to manage their data: image services, crawler services, log-audit services.
In the Hadoop ecosystem, different frameworks use different NameNodes to manage their own namespaces (isolation).