First, the idea behind the k-means algorithm (this is my own understanding):
1. Randomly pick a few points as the initial cluster centers.
2. Assign every point to the center it is closest to; this partitions the points into clusters.
3. Recompute each cluster's center from the points assigned to it.
Repeat steps 2 and 3 until the centers stop moving, or for a fixed number of iterations (a minimal single-machine sketch of these steps follows below).
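To make the three steps concrete, here is a minimal single-machine sketch in Java. The three initial centers are the same ones hard-coded in kmeansInit below; the sample points and the class name KmeansSketch are made up for illustration only.

import java.util.Arrays;

public class KmeansSketch {
    public static void main(String[] args) {
        double[][] points  = {{467,41},{470,45},{500,334},{505,330},{724,169},{720,175}}; // made-up data
        double[][] centers = {{467,41},{500,334},{724,169}};   // step 1: initial centers
        int[] label = new int[points.length];
        for (int iter = 0; iter < 20; iter++) {
            // step 2: assign every point to its nearest center (squared Euclidean distance)
            for (int p = 0; p < points.length; p++) {
                double best = Double.MAX_VALUE;
                for (int c = 0; c < centers.length; c++) {
                    double d = Math.pow(points[p][0] - centers[c][0], 2)
                             + Math.pow(points[p][1] - centers[c][1], 2);
                    if (d < best) { best = d; label[p] = c; }
                }
            }
            // step 3: recompute each center as the mean of the points assigned to it
            double[][] sum = new double[centers.length][2];
            int[] cnt = new int[centers.length];
            for (int p = 0; p < points.length; p++) {
                sum[label[p]][0] += points[p][0];
                sum[label[p]][1] += points[p][1];
                cnt[label[p]]++;
            }
            for (int c = 0; c < centers.length; c++) {
                if (cnt[c] > 0) {
                    centers[c][0] = sum[c][0] / cnt[c];
                    centers[c][1] = sum[c][1] / cnt[c];
                }
            }
        }
        System.out.println(Arrays.deepToString(centers));
    }
}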
I split the Hadoop MapReduce code into four parts. It is a bit verbose, but fairly easy to follow.
Part 1: kmeansInit
This part hard-codes the coordinates of the three initial centers; the map assigns each point to its nearest center, and the reduce recomputes the center of each cluster.
Input to map:
for each line, the key is the default byte offset and the value is the point on that line.
Input to reduce:
the key is the cluster index and the values are the coordinates of the points in that cluster.
Output of reduce:
the key is the new center and the value is the list of points belonging to that cluster (a small made-up example of this data flow is shown below).
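For illustration, a tiny made-up walk-through of that flow; the coordinates are invented, only the format matches the code:

map input:      0    467,41          (key = byte offset, value = one point per line)
                7    469,43
                14   500,334
map output:     0    467,41          (key = index of the nearest center)
                0    469,43
                1    500,334
reduce input:   0    [467,41, 469,43]
reduce output:  468.0,42.0   467,41#469,43#   (key = new center, value = points joined by '#')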
Part 2: kmeansIter
This part differs little from Part 1; it just adds a setup method, which reads the centers produced by the previous iteration from a file and stores them in an array for later use. map and reduce are the same as before. setup is described in more detail below.
setup: reads the file produced by the previous iteration through the distributed cache.
Each line of that file has the form "new center coordinates \t list of points", so setup splits each line on "\t", takes the part before the tab (the new center's coordinates), and stores it in the array; map then uses those centers (an example line is shown below).
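For example, one line of the previous job's output might look like this (made-up numbers):

468.0,42.0	467,41#469,43#

Here line.split("\t")[0] is "468.0,42.0", and splitting that on "," gives the x and y of the center that setup stores into the k array.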
Part 3: kmeansViewer
This part again differs little from the previous ones. Since all we want now is each point's cluster assignment, the averaging reduce is no longer needed: setup and map are the same as before, and the reduce simply writes its input out.
reduce:
The input still has the same form (cluster index, point coordinates).
The output key is the map output key and the output value is the map output value; the reduce just unrolls the values grouped under each key so that every point appears on its own line.
-_-|| (I later realized the reduce is not needed here at all; the default one would do.)
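That observation is correct: if setReducerClass is not called, the job falls back to the base org.apache.hadoop.mapreduce.Reducer, whose default reduce() writes every (key, value) pair through unchanged. So in kmeansViewer1 one could simply drop PRIterReducer, for example:

// instead of: job2.setReducerClass(PRIterReducer.class);
// either omit the call entirely, or set the identity base class explicitly:
job2.setReducerClass(org.apache.hadoop.mapreduce.Reducer.class);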
Final part: kmeansDriver
This is the scheduling part: the whole program runs kmeansInit once, kmeansIter a number of times, and kmeansViewer once (the output directories it produces are listed below).
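With times = 20 in the driver code, the output root (args[1]) ends up holding a chain of directories, roughly:

Data0              output of kmeansInit
Data1 ... Data20   output of kmeansIter; each iteration reads the previous Data directory's part-r-00000 via the distributed cache and writes the next one
FinalRank          output of kmeansViewer1: the cluster index of every point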
The code for the four parts follows:
Part 1: kmeansInit
//package org.apache.hadoop.examples;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
//import GraphBuilder.GraphBuilderMapper;
//import GraphBuilder.GraphBuilderReducer;
import java.io.IOException;
import java.net.URI;
import java.util.StringTokenizer;
/**
* MapReduce job: k-means initialization
* */
public class kmeansInit {
/**
*
* Map: reads the input points
* */
public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
//private final IntWritable one = new IntWritable(1);
private Text word = new Text();
//String dd=new String();
double[][] k={{467,41},{500,334},{724,169}}; // hard-coded initial centers
int[] kind=new int[20000]; // cluster index assigned to each input point
int count=0; // number of points seen by this mapper
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//parse one point ("x,y") per line
String line = value.toString();
double vertice_x=Double.valueOf(line.split(",")[0]);
double vertice_y=Double.valueOf(line.split(",")[1]);
double distance=10000000.0;
for(int i=0;i<3;i++)// find the nearest of the three centers
{
double temp=Math.pow((k[i][0]-vertice_x),2)+Math.pow((k[i][1]-vertice_y),2);
if(temp<distance)
{
distance=temp;
kind[count]=i; // remember the nearest center so far
}
}
// emit: key = cluster index, value = the point itself
context.write(new Text(String.valueOf(kind[count])), new Text(line));
count++;
}
}
public static class MyReducer extends
Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
//context.write(key, values);
double dis_x=0.0;
double dis_y=0.0;
int count=0;
String temp="";
for(Text value:values)
{
dis_x+=Double.valueOf(value.toString().split(",")[0]);
dis_y+=Double.valueOf(value.toString().split(",")[1]);
temp+=value.toString()+"#";
count++;
}
context.write(new Text(String.valueOf(dis_x/(count))+","+String.valueOf(dis_y/(count))),new Text(temp));
}
}
/**
* Driver
* */
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
final FileSystem filesystem = FileSystem.get(new URI(args[0]),conf);
final Path outPath = new Path(args[1]);
if(filesystem.exists(outPath)){
filesystem.delete(outPath, true);
}
Job job1 = new Job(conf, "kmeansInit");
job1.setJarByClass(kmeansInit.class);
job1.setOutputKeyClass(Text.class);
job1.setOutputValueClass(Text.class);
job1.setMapperClass(MyMapper.class);
job1.setReducerClass(MyReducer.class);
FileInputFormat.addInputPath(job1, new Path(args[0]));
FileOutputFormat.setOutputPath(job1, new Path(args[1]));
job1.waitForCompletion(true);
}
}
Part 2: kmeansIter
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class kmeansIter {
private static final double damping = 0.85; // unused; leftover from the PageRank code this class was adapted from
public static class PRIterMapper extends
Mapper<LongWritable, Text, Text, Text> {
int count=0;
int[] kind=new int[20000];
Text word=new Text();
private Path[] localFiles;
static double[][] k={{0.0,0.0},{0.0,0.0},{0.0,0.0}};
//private String pathfile = "hdfs://localhost:9000/user/hadoop/output1/part-r-00000";
//String [] points=new String[20];
public void setup(Context context) throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
//URI[] localCacheFiles = context.getCacheFiles();
localFiles = DistributedCache.getLocalCacheFiles(conf); // local copies of the cached file holding the previous iteration's centers
//FileSystem fs =FileSystem.get(context.getConfiguration());
// FSDataInputStream in = fs.open(new Path(pathfile));
for (int i = 0; i < localFiles.length; i++) {
String line;
// BufferedReader br = new BufferedReader(new InputStreamReader(in));
// BufferedReader br =
// new BufferedReader(new FileReader(localCacheFiles[0].getPath()));
BufferedReader br =
new BufferedReader(new FileReader(localFiles[i].toString()));
while ((line = br.readLine()) != null) {
//StringTokenizer itr = new StringTokenizer(line);
//while (itr.hasMoreTokens()) {
//String temp=itr.nextToken();
double point_x=Double.valueOf(line.split("\t")[0].split(",")[0]);
double point_y=Double.valueOf(line.split("\t")[0].split(",")[1]);
k[count][0]=point_x;
k[count][1]=point_y;
count++;
//System.out.println("hh");
//}
}
}
count=0; // reset: map() reuses count to index kind[] per input record
}
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
//parse one point ("x,y") per line
String line = value.toString();
double vertice_x=Double.valueOf(line.split(",")[0]);
double vertice_y=Double.valueOf(line.split(",")[1]);
double distance=1000000000.0;
for(int i=0;i<3;i++)// find the nearest of the three centers
{
double temp=Math.pow((k[i][0]-vertice_x),2)+Math.pow((k[i][1]-vertice_y),2);
if(temp<distance)
{
distance=temp;
kind[count]=i;
}
}
// emit: key = cluster index, value = the point itself
context.write(new Text(String.valueOf(kind[count])), new Text(line));
count++;
}
}
public static class PRIterReducer extends
Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
double dis_x=0.0;
double dis_y=0.0;
int count=0;
String temp="";
for(Text value:values)
{
dis_x+=Double.valueOf(value.toString().split(",")[0]);
dis_y+=Double.valueOf(value.toString().split(",")[1]);
temp+=value.toString()+"#";
count++;
}
context.write(new Text(String.valueOf(dis_x/(count))+","+String.valueOf(dis_y/(count))),new Text(temp));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
DistributedCache.addCacheFile(new URI(
args[2]), conf);// make the previous iteration's output (args[2]) available to every mapper
final FileSystem filesystem = FileSystem.get(new URI(args[0]),conf);
final Path outPath = new Path(args[1]);
if(filesystem.exists(outPath)){
filesystem.delete(outPath, true);
}
Job job2 = new Job(conf, "kmeansIter");
//job2.addCacheFile(new Path("hdfs://localhost:9000/input").toUri());
job2.setJarByClass(kmeansIter.class);
job2.setOutputKeyClass(Text.class);
job2.setOutputValueClass(Text.class);
job2.setMapperClass(PRIterMapper.class);
//job2.setCombinerClass(myCombine.class);
job2.setReducerClass(PRIterReducer.class);
FileInputFormat.addInputPath(job2, new Path(args[0]));
FileOutputFormat.setOutputPath(job2, new Path(args[1]));
job2.waitForCompletion(true);
}
}
Part 3: kmeansViewer (the class in the code below is named kmeansViewer1)
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class kmeansViewer1 {
private static final double damping = 0.85; // unused; leftover from the PageRank code this class was adapted from
static double[][] k={{0.0,0.0},{0.0,0.0},{0.0,0.0}};
public static class PRIterMapper extends
Mapper<LongWritable, Text, Text, Text> {
int count=0;
int[] kind=new int[20000];
Text word=new Text();
private Path[] localFiles;
//private String pathfile = "hdfs://localhost:9000/user/hadoop/output1/part-r-00000";
//String [] points=new String[20];
public void setup(Context context) throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
//URI[] localCacheFiles = context.getCacheFiles();
localFiles = DistributedCache.getLocalCacheFiles(conf); // local copies of the cached file holding the previous iteration's centers
//FileSystem fs =FileSystem.get(context.getConfiguration());
// FSDataInputStream in = fs.open(new Path(pathfile));
for (int i = 0; i < localFiles.length; i++) {
String line;
// BufferedReader br = new BufferedReader(new InputStreamReader(in));
// BufferedReader br =
// new BufferedReader(new FileReader(localCacheFiles[0].getPath()));
BufferedReader br =
new BufferedReader(new FileReader(localFiles[i].toString()));
while ((line = br.readLine()) != null) {
//StringTokenizer itr = new StringTokenizer(line);
//while (itr.hasMoreTokens()) {
//String temp=itr.nextToken();
double point_x=Double.valueOf(line.split("\t")[0].split(",")[0]);
double point_y=Double.valueOf(line.split("\t")[0].split(",")[1]);
k[count][0]=point_x;
k[count][1]=point_y;
count++;
//System.out.println("hh");
//}
}
}
count=0; // reset: map() reuses count to index kind[] per input record
}
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
//parse one point ("x,y") per line
String line = value.toString();
double vertice_x=Double.valueOf(line.split(",")[0]);
double vertice_y=Double.valueOf(line.split(",")[1]);
double distance=100000000.0;
for(int i=0;i<3;i++)// find the nearest of the three centers
{
double temp=Math.pow((k[i][0]-vertice_x),2)+Math.pow((k[i][1]-vertice_y),2);
if(temp<distance)
{
distance=temp;
kind[count]=i;
}
}
// emit: key = cluster index, value = the point itself
context.write(new Text(String.valueOf(kind[count])), new Text(line));
count++;
}
}
public static class PRIterReducer extends
Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
for(Text value:values)
{
context.write(key, value);
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
DistributedCache.addCacheFile(new URI(
args[2]), conf);// make the previous iteration's output (args[2]) available to every mapper
final FileSystem filesystem = FileSystem.get(new URI(args[0]),conf);
final Path outPath = new Path(args[1]);
if(filesystem.exists(outPath)){
filesystem.delete(outPath, true);
}
Job job2 = new Job(conf, "kmeansViewer1");
//job2.addCacheFile(new Path("hdfs://localhost:9000/input").toUri());
job2.setJarByClass(kmeansViewer1.class);
job2.setOutputKeyClass(Text.class);
job2.setOutputValueClass(Text.class);
job2.setMapperClass(PRIterMapper.class);
//job2.setCombinerClass(myCombine.class);
job2.setReducerClass(PRIterReducer.class);
FileInputFormat.addInputPath(job2, new Path(args[0]));
FileOutputFormat.setOutputPath(job2, new Path(args[1]));
job2.waitForCompletion(true);
}
}
Part 4: kmeansDriver
public class kmeansDriver {
private static int times = 20; // number of k-means iterations
public static void main(String[] args) throws Exception {
String[] forGB = { "", args[1] + "/Data0" };
forGB[0] = args[0];
kmeansInit.main(forGB);
String[] forItr = { "", "","" };
for (int i = 0; i < times; i++) {
forItr[0] = args[0];
forItr[1] = args[1] + "/Data" + String.valueOf(i + 1);
forItr[2]=args[1]+"/Data"+i+"/part-r-00000";
kmeansIter.main(forItr);
}
String[] forRV = { args[0],args[1] + "/FinalRank",args[1] + "/Data" + times+"/part-r-00000" };
kmeansViewer1.main(forRV);
}
}