$ yum -y install java-1.7.0-openjdk*
$ ls -lrt /usr/bin/java
# lrwxrwxrwx 1 root root 22 Apr 29 13:47 /usr/bin/java -> /etc/alternatives/java
$ ls -lrt /etc/alternatives/java
#lrwxrwxrwx 1 root root 76 Apr 29 13:47 /etc/alternatives/java -> /usr/lib/jvm/java-1.7.0-openjdk-1.7.0.221-2.6.18.0.el7_6.x86_64/jre/bin/java
$ echo 'export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk-1.7.0.221-2.6.18.0.el7_6.x86_64/' >> /etc/bashrc
$ echo 'export JRE_HOME=$JAVA_HOME/jre' >> /etc/bashrc
$ echo 'export CLASSPATH=$JAVA_HOME/lib:$JRE_HOME/lib:$CLASSPATH' >> /etc/bashrc
$ echo 'export PATH=$JAVA_HOME/bin:$JRE_HOME/bin:$PATH' >> /etc/bashrc
$ ssh-keygen -t rsa
$ cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
$ chmod 600 ~/.ssh/authorized_keys
$ mkdir ~/download
$ wget -P ~/download/ http://mirror.bit.edu.cn/apache/hadoop/common/hadoop-2.6.5/hadoop-2.6.5.tar.gz
$ tar zxf ~/download/hadoop-2.6.5.tar.gz -C /opt/
$ echo 'export HADOOP_HOME=/opt/hadoop-2.6.5' >> /etc/bashrc
$ echo 'export STREAM=$HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-*.jar' >> /etc/bashrc
$ source /etc/bashrc
$ cd $HADOOP_HOME
$ vi $HADOOP_HOME/etc/hadoop/core-site.xml
修改内容为
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://localhost:9000</value>
  </property>
</configuration>
$ vi $HADOOP_HOME/etc/hadoop/hdfs-site.xml
修改内容为
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
</configuration>
$ $HADOOP_HOME/bin/hdfs namenode -format
$ $HADOOP_HOME/sbin/start-dfs.sh
# 查看端口是否启动
$ netstat -ntpl|grep 9000
$ vi $HADOOP_HOME/etc/hadoop/mapred-site.xml
修改内容为
<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
</configuration>
$ vi $HADOOP_HOME/etc/hadoop/yarn-site.xml
修改内容为
<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
</configuration>
$ $HADOOP_HOME/sbin/start-yarn.sh
# 查看端口是否启动
$ netstat -ntpl|grep 8088
$ echo 'alias hadoop=$HADOOP_HOME/bin/hadoop' >> /etc/bashrc
$ echo 'alias hdfs=$HADOOP_HOME/bin/hdfs' >> /etc/bashrc
$ echo 'export HADOOP_CLASSPATH=$JAVA_HOME/lib/tools.jar' >> /etc/bashrc
$ source /etc/bashrc
$ yum install epel-release
$ yum install python36
$ echo 'alias python=python3' >> /etc/bashrc
$ source /etc/bashrc
#!/usr/bin/env python3
"""MapReduce streaming mapper: emit one "<word> 1" line per lowercase
alphabetic token read from stdin.

The original paste lost all indentation (it was not valid Python as shown);
this version restores the structure and factors tokenizing into a testable
helper.
"""
import sys
import re

# Pre-compiled once instead of re-parsed per word; [a-z]+ is equivalent
# to the original [a-z][a-z]* pattern.
_WORD_RE = re.compile(r"[a-z]+")


def map_line(line):
    """Return the lowercase alphabetic tokens contained in one input line.

    Whitespace splits the line first; digits and punctuation then act as
    separators, e.g. "Hello, it's 2024" -> ["hello", "it", "s"].
    """
    tokens = []
    for chunk in line.strip().split():
        tokens.extend(_WORD_RE.findall(chunk.lower()))
    return tokens


def main():
    """Stream stdin through map_line, printing "<word> 1" for each token."""
    for line in sys.stdin:
        for token in map_line(line):
            print(token, 1)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""MapReduce streaming reducer: sum counts for runs of identical words.

Input lines must be "<word> <count>" and sorted by word (as produced by
`sort` locally or by the Hadoop shuffle). The original paste lost all
indentation; it also printed a spurious "None 0" line on empty input,
because the final `curr_w == word` check is True when both are None —
fixed here with an explicit `is not None` guard.
"""
import sys


def reduce_pairs(lines):
    """Aggregate sorted "<word> <count>" lines into (word, total) tuples.

    Returns [] for empty input.
    """
    results = []
    curr_word, curr_count = None, 0
    for line in lines:
        word, count = line.strip().split()
        if curr_word == word:
            curr_count += int(count)
        else:
            if curr_word is not None:
                # Word changed: flush the finished run.
                results.append((curr_word, curr_count))
            curr_word, curr_count = word, int(count)
    if curr_word is not None:
        # Flush the last run; skipped entirely when there was no input.
        results.append((curr_word, curr_count))
    return results


def main():
    for word, total in reduce_pairs(sys.stdin):
        print(word, total)


if __name__ == "__main__":
    main()
$ chmod +x mapper.py
$ chmod +x reducer.py
$ cat p1.txt | ./mapper.py | sort | ./reducer.py | more
# a 11
# absorb 1
# according 1
# activated 1
# active 1
# activities 1
# added 1
# adopted 1
# after 5
# airport 3
# --More--
$ hdfs dfs -mkdir -p /user/`whoami`/input
$ hdfs dfs -put ~/p*.txt /user/`whoami`/input
# run.sh — submit the streaming word-count job.
# -files ships mapper.py/reducer.py into each task's working directory,
# so -mapper/-reducer refer to the local shipped copies.
# NOTE(review): `whoami` backticks and the unquoted p*.txt glob are expanded
# by the invoking shell if matching local files exist, otherwise passed
# through for HDFS-side globbing — kept as-is to preserve that behavior.
$HADOOP_HOME/bin/hadoop jar $STREAM \
-files ./mapper.py,./reducer.py \
-mapper ./mapper.py \
-reducer ./reducer.py \
-input /user/`whoami`/input/p*.txt \
-output /user/`whoami`/output
$ chmod +x run.sh
$ ./run.sh
# 显示进度
$ hdfs dfs -cat output/part-00000