sudo apt-get install openssh-server
sudo mount /dev/cdrom /mnt/
sudo apt-get install gcc make perl
sudo /mnt/VBoxLinuxAdditions.run
Fixing garbled Chinese characters
In /etc/default/locale, set:
LANG="en_US.UTF-8"
LANGUAGE="en_US:en"
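A quick way to confirm the change took effect (/etc/default/locale is only read at login, so log out and back in first):

locale
# LANG and LANGUAGE should now report en_US.UTF-8 and en_US:en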
Shared folders
Write a mount script, mount.sh:
#!/bin/sh
sudo mount -t vboxsf share ~/shared
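The mount point has to exist before the script will work; a minimal first-time setup, assuming the shared folder is named share in the VirtualBox settings (matching the mount command above):

mkdir -p ~/shared    # create the mount point once
chmod +x mount.sh
./mount.sh           # re-run after every reboot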
Once one VM is fully set up, clone it into three VMs, configure SSH logins for them in Xshell, and do the rest of the work from Xshell.
Next, configure Java and Hadoop. Java has to be configured on all three VMs; Hadoop only needs to be configured on the first machine and then copied to the other two.
export JAVA_HOME=/home/hadoop1/jdk1.8.0_73
export HADOOP_HOME=/home/hadoop1/hadoop-2.6.1
export PATH=$PATH:$JAVA_HOME/bin:$JAVA_HOME/jre/bin:${HADOOP_HOME}/bin
export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
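These exports go in /etc/profile (or ~/.bashrc); a quick sanity check after reloading it:

source /etc/profile
java -version      # should report java version "1.8.0_73"
hadoop version     # should report Hadoop 2.6.1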
Change the hostnames and configure address mapping
The hostname identifies a machine on the network. It is set in /etc/hostname; the address mapping is set in /etc/hosts.
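A sketch of the mapping (the IP addresses below are placeholders; substitute your VMs' actual addresses):

# /etc/hosts, identical on all three machines
192.168.56.101  hadoop1
192.168.56.102  hadoop2
192.168.56.103  hadoop3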
Configure passwordless SSH login
A convenient approach is to use the shared folder: run the commands below on all three machines, drop each machine's public key into the shared folder, and append all three keys to every machine's authorized_keys.
ssh-keygen -t rsa
cd ~/.ssh
cat id_rsa.pub >> authorized_keys
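A sketch of the shared-folder key exchange (file names are illustrative):

# on each machine, after the ssh-keygen above
cp ~/.ssh/id_rsa.pub ~/shared/$(hostname).pub
# once all three keys are in the shared folder, again on each machine:
cat ~/shared/hadoop1.pub ~/shared/hadoop2.pub ~/shared/hadoop3.pub >> ~/.ssh/authorized_keys
chmod 600 ~/.ssh/authorized_keys   # sshd ignores the file if its permissions are too open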
Next come Hadoop's configuration files, under etc/hadoop.
(1) hadoop-env.sh
export JAVA_HOME=/home/hadoop1/jdk1.8.0_73
# This line is mandatory; Hadoop will not pick up Java from /etc/profile alone
(2) yarn-site.xml
Configures YARN's management addresses and ports; done on the first VM.
<property>
  <name>yarn.nodemanager.aux-services</name>
  <value>mapreduce_shuffle</value>
</property>
<property>
  <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
  <value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
<property>
  <name>yarn.resourcemanager.address</name>
  <value>hadoop1:8032</value>
</property>
<property>
  <name>yarn.resourcemanager.scheduler.address</name>
  <value>hadoop1:8030</value>
</property>
<property>
  <name>yarn.resourcemanager.resource-tracker.address</name>
  <value>hadoop1:8035</value>
</property>
<property>
  <name>yarn.resourcemanager.admin.address</name>
  <value>hadoop1:8033</value>
</property>
<property>
  <name>yarn.resourcemanager.webapp.address</name>
  <value>hadoop1:8088</value>
</property>
(3) slaves
Configures the DataNode hosts. Write all three machines' hostnames into it:
hadoop1
hadoop2
hadoop3
(4) core-site.xml
Configures the default filesystem URI and the location of the tmp directory:
<property>
  <name>fs.defaultFS</name>
  <value>hdfs://hadoop1:9000</value>
</property>
<property>
  <name>hadoop.tmp.dir</name>
  <value>file:/home/hadoop1/hadoop-2.6.1/tmp</value>
</property>
(5) hdfs-site.xml
Configures the key HDFS parameters, including the name and data directories:
<configuration>
  <property>
    <name>dfs.namenode.secondary.http-address</name>
    <value>hadoop1:9001</value>
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>file:/home/hadoop1/hadoop-2.6.1/dfs/name</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>file:/home/hadoop1/hadoop-2.6.1/dfs/data</value>
  </property>
  <property>
    <name>dfs.replication</name>
    <value>2</value>
  </property>
  <property>
    <name>dfs.webhdfs.enabled</name>
    <value>true</value>
  </property>
</configuration>
(6) mapred-site.xml
Configures MapReduce: run it on YARN, and set the job history server addresses:
<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.address</name>
    <value>hadoop1:10020</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>hadoop1:19888</value>
  </property>
</configuration>
Then use the shared folder to copy the configured Hadoop directory to the other two machines.
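One way to do the copy (a sketch; a tarball preserves file modes, which a plain copy through the vboxsf mount may not):

# on hadoop1
tar czf ~/shared/hadoop-2.6.1.tar.gz -C ~ hadoop-2.6.1
# on hadoop2 and hadoop3
tar xzf ~/shared/hadoop-2.6.1.tar.gz -C ~

With Hadoop in place on all three machines, format the NameNode, start the cluster, and check the cluster report: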
./bin/hdfs namenode -format
./sbin/start-all.sh
./bin/hdfs dfsadmin -report
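To confirm the daemons are up, jps on each node should show roughly the following (hadoop1 also runs DataNode and NodeManager because it is listed in slaves):

jps
# hadoop1: NameNode, SecondaryNameNode, ResourceManager, DataNode, NodeManager, Jps
# hadoop2 / hadoop3: DataNode, NodeManager, Jps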
View HDFS status on port 50070 and MapReduce/YARN status on port 8088 (http://hadoop1:50070 and http://hadoop1:8088).
Note: do not format the NameNode repeatedly. Each format generates a new clusterID, and DataNodes whose data directories still carry the old clusterID will refuse to start, so their processes die.
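If the DataNodes have already died this way, one recovery sketch (this destroys all HDFS data; the paths follow the dfs.*.dir and hadoop.tmp.dir settings above):

./sbin/stop-all.sh
# on every node, wipe the stale metadata and data directories
rm -rf ~/hadoop-2.6.1/dfs/name ~/hadoop-2.6.1/dfs/data ~/hadoop-2.6.1/tmp
./bin/hdfs namenode -format
./sbin/start-all.sh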
At this point the distributed learning environment is ready.
Next, set up the development environment on the Windows host (IntelliJ IDEA with Maven). Configure the Maven mirrors in settings.xml:
<mirror>
  <id>alimaven</id>
  <name>aliyun maven</name>
  <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
  <mirrorOf>central</mirrorOf>
</mirror>
<mirror>
  <id>net-cn</id>
  <mirrorOf>central</mirrorOf>
  <name>Human Readable Name for this Mirror.</name>
  <url>http://maven.net.cn/content/groups/public/</url>
</mirror>
<mirror>
  <id>CN</id>
  <name>OSChina Central</name>
  <url>http://maven.oschina.net/content/groups/public/</url>
  <mirrorOf>central</mirrorOf>
</mirror>
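These entries belong inside the <mirrors> element of settings.xml (typically %USERPROFILE%\.m2\settings.xml on Windows, or conf/settings.xml under the Maven installation). To re-resolve everything through the new mirrors:

mvn -U dependency:resolve   # resolve all dependencies, forcing update checks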
The pom.xml:
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>
  <groupId>org.example</groupId>
  <artifactId>project</artifactId>
  <version>1.0-SNAPSHOT</version>
  <repositories>
    <repository>
      <id>apache</id>
      <url>http://maven.apache.org</url>
    </repository>
  </repositories>
  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.6.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.6.1</version>
    </dependency>
    <dependency>
      <groupId>commons-cli</groupId>
      <artifactId>commons-cli</artifactId>
      <version>1.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>2.6.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
      <version>2.6.1</version>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-examples</artifactId>
      <version>2.6.1</version>
    </dependency>
    <dependency>
      <groupId>org.projectlombok</groupId>
      <artifactId>lombok</artifactId>
      <version>1.16.6</version>
    </dependency>
    <dependency>
      <groupId>org.apache.zookeeper</groupId>
      <artifactId>zookeeper</artifactId>
      <version>3.4.8</version>
    </dependency>
  </dependencies>
</project>
In IDEA, run install, build, and reimport until the red errors disappear.
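The command-line equivalent of IDEA's buttons, if you prefer a terminal (plain Maven, nothing project-specific):

mvn clean install   # compile, run tests, package, and install the jar into the local repository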
Put a log4j.properties under resources:
log4j.rootLogger=DEBUG, stdout
# Console output...
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%5p [%t] - %m%n
The WordCount example, submitted to the cluster straight from IDEA:

import java.io.IOException;
import java.net.URI;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

    // Mapper: split each line into tokens and emit (word, 1)
    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    // Reducer (also used as the combiner): sum the counts for each word
    public static class IntSumReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        // Identify as a user the cluster accepts
        System.setProperty("HADOOP_USER_NAME", "warden");
        Configuration conf = new Configuration();
        conf.set("mapreduce.framework.name", "yarn");                // run on the YARN cluster, not in local mode
        conf.set("mapreduce.app-submission.cross-platform", "true"); // submitting from Windows
        conf.set("fs.default.name", "hdfs://hadoop1:9000/");         // HDFS address (deprecated alias of fs.defaultFS)
        conf.set("yarn.resourcemanager.hostname", "hadoop1");        // YARN ResourceManager address
        Job job = Job.getInstance(conf, "word count");
        // Submit the packaged jar; without it the job never leaves ACCEPTED
        job.setJar("C:\\Users\\warden\\IdeaProjects\\test\\target\\test-1.0-SNAPSHOT.jar");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Delete the output directory if it already exists
        Path path = new Path(args[1]);
        FileSystem fileSystem = FileSystem.get(new URI("hdfs://hadoop1:9000"), conf, "hadoop1");
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true);
        }
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
All of these conf settings are necessary. HADOOP_USER_NAME establishes the user identity, and thus which HDFS directories the job uses; mapreduce.framework.name selects the YARN cluster (remove it and the job runs in local mode instead); the cross-platform flag is needed when submitting from Windows; and the last two settings give the HDFS and YARN addresses. The YARN ResourceManager hostname must be set, otherwise the job sits in ACCEPTED and never reaches RUNNING. Likewise, you must submit the packaged jar file; without it the job also never runs. To kill a job, use:
hadoop job -list
hadoop job -kill <job_id>
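hadoop job is the older mapred-era interface (deprecated but still present in 2.6.1); the YARN-native equivalents are:

yarn application -list
yarn application -kill <application_id>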
Overall, once these pitfalls have been stepped through, Hadoop's execution model becomes much clearer.