Name | Version | Notes |
---|---|---|
Host OS | Win7 (64-bit) | |
VMware | 12 | |
VM image | CentOS-6.5-x86_64-minimal.iso | Download site (other versions): http://vault.centos.org/ |
JDK | jdk-8u65-linux-x64.tar.gz | Linux build |
Hadoop | hadoop-2.6.0-cdh5.7.0.tar.gz | Linux build |
Software | Install path |
---|---|
JDK | /software/jdk/jdk8 |
Hadoop | /software/hadoop/hadoop |
/etc/profile

```bash
# jdk
JAVA_HOME=/software/jdk/jdk8
PATH=$JAVA_HOME/bin:$PATH
CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
export JAVA_HOME PATH CLASSPATH

# hadoop
export HADOOP_HOME=/software/hadoop/hadoop
export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH
```
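After saving `/etc/profile`, apply it in the current shell with `source /etc/profile`; `java -version` and `hadoop version` should then both work from any directory.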
Hadoop's own environment script (presumably `/software/hadoop/hadoop/etc/hadoop/hadoop-env.sh`) must also point at the JDK, since daemons started over SSH do not read `/etc/profile`:

```bash
...
export JAVA_HOME=/software/jdk/jdk8
...
```
/software/hadoop/hadoop/etc/hadoop/core-site.xml

```xml
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://centos01:8020</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/software/hadoop/tmp</value>
    </property>
</configuration>
```
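`fs.defaultFS` is the URI that both the daemons and client code use to reach HDFS, so the hostname `centos01` must resolve on every node (typically via an `/etc/hosts` entry); `hadoop.tmp.dir` is the base directory under which the HDFS data configured below is kept.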
/software/hadoop/hadoop/etc/hadoop/hdfs-site.xml

```xml
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:/software/hadoop/tmp/dfs/name</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:/software/hadoop/tmp/dfs/data</value>
    </property>
</configuration>
```
The worker list (presumably `/software/hadoop/hadoop/etc/hadoop/slaves`) names the single node:

```
centos01
```
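Before the first start, format the NameNode with `hdfs namenode -format`, then bring HDFS up with `start-dfs.sh`; `jps` should show the NameNode, DataNode, and SecondaryNameNode processes.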
/software/hadoop/hadoop/etc/hadoop/mapred-site.xml

```xml
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <property>
        <name>mapreduce.jobhistory.address</name>
        <value>centos01:10020</value>
        <description>MapReduce JobHistory Server IPC host:port</description>
    </property>
    <property>
        <name>mapreduce.jobhistory.webapp.address</name>
        <value>centos01:19888</value>
        <description>MapReduce JobHistory Server Web UI host:port</description>
    </property>
    <property>
        <name>mapreduce.jobhistory.done-dir</name>
        <value>/history/done</value>
    </property>
    <property>
        <name>mapreduce.jobhistory.intermediate-done-dir</name>
        <value>/history/done_intermediate</value>
    </property>
</configuration>
```
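The JobHistory server is not started by the regular start scripts; launch it separately with `mr-jobhistory-daemon.sh start historyserver`, then browse its UI at centos01:19888.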
/software/hadoop/hadoop/etc/hadoop/yarn-site.xml

```xml
<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.log-aggregation-enable</name>
        <value>true</value>
    </property>
</configuration>
```
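Start YARN with `start-yarn.sh`. With `yarn.log-aggregation-enable` set to `true`, the container logs of a finished job can be fetched with `yarn logs -applicationId <application id>`.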
Sample access-log records to be analyzed:

```
182.106.215.93 - - [10/Nov/2016:00:01:02 +0800] "POST /socket.io/1/ HTTP/1.1" 200 94 "chat.mukewang.com" "-" - "android-websockets-2.0" "-" 10.100.15.239:80 200 0.004 0.004
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
183.162.52.7 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/userdynamic HTTP/1.1" 200 19501 "www.imooc.com" "-" cid=0&timestamp=1478707261847&uid=2871142&touid=2871142&page=1&secrect=a6e8e14701ffe9f6063934780d9e2e6d&token=3837a5bf27ea718fe18bda6c53fbbc14 "mukewang/5.0.0 (Android 5.1.1; Xiaomi Redmi 3 Build/LMY47V),Network 2G/3G" "-" 10.100.136.65:80 200 0.195 0.195
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
114.248.161.26 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/getcourseintro HTTP/1.1" 200 2510 "www.imooc.com" "-" cid=283&secrect=86b720f312c2b25da3b20e59e7c89780&timestamp=1478707261951&token=4c144b3f4314178b9527d1e91ecc0fac&uid=3372975 "mukewang/5.0.2 (iPhone; iOS 8.4.1; Scale/2.00)" "-" 10.100.136.65:80 200 0.007 0.008
120.52.94.105 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/getmediainfo_ver2 HTTP/1.1" 200 633 "www.imooc.com" "-" cid=608&secrect=e25994750eb2bbc7ade1a36708b999a5&timestamp=1478707261945&token=9bbdba949aec02735e59e0868b538e19&uid=4203162 "mukewang/5.0.2 (iPhone; iOS 10.0.1; Scale/3.00)" "-" 10.100.136.65:80 200 0.049 0.049
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
112.10.136.45 - - [10/Nov/2016:00:01:02 +0800] "POST /socket.io/1/ HTTP/1.1" 200 94 "chat.mukewang.com" "-" - "android-websockets-2.0" "-" 10.100.15.239:80 200 0.006 0.006
```
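The field the job cares about, the user-agent string, sits between the 7th and 8th double quotes of each record: for example, `android-websockets-2.0` in the first line and `mukewang/5.0.0 (Android 5.1.1; Xiaomi Redmi 3 Build/LMY47V),Network 2G/3G` in the third. This is exactly the position the mapper below extracts.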
Maven configuration file (`pom.xml`):
```xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.peng</groupId>
    <artifactId>hdfstest</artifactId>
    <version>1.0-SNAPSHOT</version>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
    <repositories>
        <repository>
            <id>repo</id>
            <url>http://repo1.maven.org/maven2/</url>
        </repository>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/content/repositories/releases/</url>
        </repository>
    </repositories>
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.6.0-cdh5.7.0</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.10</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.mortbay.jetty</groupId>
            <artifactId>jetty</artifactId>
            <version>6.1.26</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.25</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>
```
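The `cloudera` repository entry matters here: the CDH build of `hadoop-client` (`2.6.0-cdh5.7.0`) is published to Cloudera's repository, not to Maven Central.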
Main program: `LogTest.java`

```java
package com.peng.logtest;

import com.peng.utils.position.PositionUtils;
import com.peng.utils.useragent.UserAgent;
import com.peng.utils.useragent.UserAgentParser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class LogTest {

    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.exit(1);
        }
        // Create the job configuration
        Configuration configuration = new Configuration();
        // Heap size for map tasks
        configuration.set("mapreduce.admin.map.child.java.opts", "-Xmx1024m");
        configuration.set("mapred.map.child.java.opts", "-Xmx1024m");
        // Heap size for reduce tasks
        configuration.set("mapreduce.admin.reduce.child.java.opts", "-Xmx1024m");
        configuration.set("mapred.reduce.child.java.opts", "-Xmx1024m");
        // Delete the output path if it already exists
        FileSystem fileSystem = FileSystem.get(configuration);
        Path outFilePath = new Path(args[1]);
        if (fileSystem.exists(outFilePath)) {
            fileSystem.delete(outFilePath, true);
        }
        // Create the job
        Job job = Job.getInstance(configuration, "logtest");
        // The class that carries the job's code
        job.setJarByClass(LogTest.class);
        // Input path for the job
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Map settings
        job.setMapperClass(LogTest.MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // Reduce settings
        job.setReducerClass(LogTest.MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // Output path for the job
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Reads the input file and emits (browser, 1) for every record
     * whose user-agent can be parsed.
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        UserAgentParser userAgentParser = null;
        LongWritable one = null;

        // Initialize the parser and the reusable counter value once per task
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            if (userAgentParser == null) {
                userAgentParser = new UserAgentParser();
            }
            if (one == null) {
                one = new LongWritable(1);
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // One log record per line
            String line = value.toString();
            // --------- extract the browser from the user-agent field --------- start ---------
            // The user-agent sits between the 7th and 8th double quotes
            int start_index = PositionUtils.getCharacterPosition(line, "\"", 7);
            int end_index = PositionUtils.getCharacterPosition(line, "\"", 8);
            if (end_index != 0) {
                String userAgentData = line.substring(start_index + 1, end_index);
                if (userAgentData != null && userAgentData.length() > 0) {
                    UserAgent userAgent = userAgentParser.parse(userAgentData);
                    if (userAgent != null && userAgent.getBrowser() != null) {
                        context.write(new Text(userAgent.getBrowser()), one);
                    }
                }
            }
            // --------- extract the browser from the user-agent field ---------- end ----------
        }
    }

    /**
     * Sums the per-browser counts.
     */
    public static class MyReduce extends Reducer<Text, LongWritable, Text, LongWritable> {

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            // Total number of times this browser appeared
            long sum = 0;
            for (LongWritable value : values) {
                sum += value.get();
            }
            // Emit the aggregated result
            context.write(key, new LongWritable(sum));
        }
    }
}
```
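The helpers `PositionUtils`, `UserAgent`, and `UserAgentParser` are project-local classes that the listing imports but does not show. Below is a minimal sketch of `PositionUtils.getCharacterPosition`, assuming it returns the index of the n-th occurrence of a marker and 0 when there are fewer than n occurrences, which is what the `end_index != 0` guard in the mapper relies on; treat it as a hypothetical reconstruction, not the original code:

```java
package com.peng.utils.position;

// Hypothetical reconstruction of the helper used by LogTest's mapper.
public class PositionUtils {

    /**
     * Returns the index of the n-th occurrence of `marker` in `line`,
     * or 0 if `line` contains fewer than n occurrences.
     */
    public static int getCharacterPosition(String line, String marker, int n) {
        int index = -1;
        for (int i = 0; i < n; i++) {
            index = line.indexOf(marker, index + 1);
            if (index == -1) {
                return 0; // matches the mapper's end_index != 0 guard
            }
        }
        return index;
    }
}
```

To run the job, package it with `mvn clean package` and submit the resulting jar (`target/hdfstest-1.0-SNAPSHOT.jar`, per the pom above) with `hadoop jar`, passing the input and output paths as the two arguments, e.g. `hadoop jar hdfstest-1.0-SNAPSHOT.jar com.peng.logtest.LogTest /input/access.log /output/browserstat` (the HDFS paths here are illustrative).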