KMapper.java
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.URI;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class KMapper extends Mapper<LongWritable, Text, Text, Text> {

    private String[] center;

    // Read the updated center coordinates from 3.txt and store them in center[].
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        String centerlist = "hdfs://localhost:9000/home/administrator/hadoop/kmeans/input2/3.txt"; // center file
        Configuration conf1 = new Configuration();
        conf1.set("hadoop.job.ugi", "hadoop-user,hadoop-user");
        FileSystem fs = FileSystem.get(URI.create(centerlist), conf1);
        FSDataInputStream in = null;
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try {
            in = fs.open(new Path(centerlist));
            IOUtils.copyBytes(in, out, 100, false);
            center = out.toString().split(" ");
        } finally {
            IOUtils.closeStream(in);
        }
    }

    // The data points to be clustered come from 2.txt, one whitespace-separated token per point.
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString()); // split the line on whitespace and handle one point at a time
        while (itr.hasMoreTokens()) { // hasMoreTokens() returns true while there are tokens left
            // Fetch the next point, e.g. one of (2,3) (10,30) (34,40) (1,1).
            String outValue = itr.nextToken();
            String[] list = outValue.replace("(", "").replace(")", "").split(",");

            // Initialize min with the distance from this point to the first center.
            String[] c = center[0].replace("(", "").replace(")", "").split(",");
            float min = 0;
            int pos = 0;
            for (int i = 0; i < list.length; i++) {
                System.out.println(i + " list: " + list[i]);
                System.out.println(i + " c: " + c[i]);
                // Squared Euclidean distance; the square root is omitted since it does not change the ordering.
                min += (float) Math.pow(Float.parseFloat(list[i]) - Float.parseFloat(c[i]), 2);
            }

            // Compare against every center and keep the closest one.
            for (int i = 0; i < center.length; i++) {
                String[] centerStrings = center[i].replace("(", "").replace(")", "").split(",");
                float distance = 0;
                for (int j = 0; j < list.length; j++) {
                    distance += (float) Math.pow(Float.parseFloat(list[j]) - Float.parseFloat(centerStrings[j]), 2);
                }
                if (min > distance) {
                    min = distance;
                    pos = i;
                }
            }

            // Emit <nearest center, point>.
            context.write(new Text(center[pos]), new Text(outValue));
            System.out.println("Mapper output: " + center[pos] + " " + outValue);
        }
    }
}
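To make the assignment step easier to follow outside of Hadoop, here is a minimal standalone sketch of what map() does for a single point. The class name NearestCenterDemo and the sample values are illustrative only; the comparison uses the same squared Euclidean distance as the mapper.

public class NearestCenterDemo {
    public static void main(String[] args) {
        String[] center = {"(2,3)", "(10,30)"}; // illustrative centers, same "(x,y)" format the mapper expects
        String point = "(1,1)";                 // illustrative data point

        String[] p = point.replace("(", "").replace(")", "").split(",");
        float min = Float.MAX_VALUE;
        int pos = 0;
        for (int i = 0; i < center.length; i++) {
            String[] c = center[i].replace("(", "").replace(")", "").split(",");
            float distance = 0;
            for (int j = 0; j < p.length; j++) {
                float d = Float.parseFloat(p[j]) - Float.parseFloat(c[j]);
                distance += d * d; // squared Euclidean distance; no square root needed for a comparison
            }
            if (distance < min) {
                min = distance;
                pos = i;
            }
        }
        System.out.println(point + " -> nearest center " + center[pos]); // prints (1,1) -> nearest center (2,3)
    }
}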
KReducer.java

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class KReducer extends Reducer<Text, Text, Text, Text> {

    // Input: <center, points assigned to that center>; compute the new center of each group as the coordinate-wise mean.
    @Override
    public void reduce(Text key, Iterable<Text> value, Context context) throws IOException, InterruptedException {
        String outVal = "";
        int count = 0;
        String center = "";
        System.out.println("Reduce key: " + key.toString());

        int length = key.toString().replace("(", "").replace(")", "").replace(":", "").split(",").length;
        float[] ave = new float[length]; // one running sum per dimension
        for (int i = 0; i < length; i++) {
            ave[i] = 0;
        }

        for (Text val : value) {
            System.out.println("val: " + val.toString());
            outVal += val.toString() + " ";
            String[] tmp = val.toString().replace("(", "").replace(")", "").split(",");
            for (int i = 0; i < tmp.length; i++) {
                ave[i] += Float.parseFloat(tmp[i]);
            }
            count++;
        }
        System.out.println("count: " + count);
        System.out.println("outVal: " + outVal);
        // For 2-D points, ave[0] holds the sum of the X coordinates and ave[1] the sum of the Y coordinates.

        // Divide each sum by the number of points and format the new center as "(x,y,...)".
        for (int i = 0; i < length; i++) {
            ave[i] = ave[i] / count;
            if (i == 0) {
                center += "(" + ave[i] + ",";
            } else if (i == length - 1) {
                center += ave[i] + ")";
            } else {
                center += ave[i] + ",";
            }
        }

        System.out.println("Writing to part file: " + key + " " + outVal + " " + center);
        context.write(key, new Text(outVal + center));
    }
}
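For reference, a tiny standalone sketch of the averaging step the reducer performs, using the same sample points that appear in the comments ((10,30) and (34,40), which average to (22,35)). The class name NewCenterDemo is made up for illustration.

public class NewCenterDemo {
    public static void main(String[] args) {
        String[] assigned = {"(10,30)", "(34,40)"}; // illustrative points assigned to one center
        int dims = 2;
        float[] sum = new float[dims];
        for (String s : assigned) {
            String[] coords = s.replace("(", "").replace(")", "").split(",");
            for (int i = 0; i < dims; i++) {
                sum[i] += Float.parseFloat(coords[i]); // accumulate each coordinate
            }
        }
        StringBuilder center = new StringBuilder("(");
        for (int i = 0; i < dims; i++) {
            center.append(sum[i] / assigned.length);  // coordinate-wise mean
            center.append(i == dims - 1 ? ")" : ",");
        }
        System.out.println("new center: " + center);  // prints (22.0,35.0)
    }
}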
NewCenter.java

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class NewCenter {

    int k = 2;                       // number of clusters
    float shold = Integer.MIN_VALUE; // largest squared shift between an old center and its new center
    String[] line;
    String newcenter = "";

    public float run(String[] args) throws IOException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("hadoop.job.ugi", "hadoop,hadoop");
        FileSystem fs = FileSystem.get(URI.create(args[2] + "/part-r-00000"), conf);
        FSDataInputStream in = null;
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try {
            in = fs.open(new Path(args[2] + "/part-r-00000"));
            IOUtils.copyBytes(in, out, 50, false);
            line = out.toString().split("\n");
        } finally {
            IOUtils.closeStream(in);
        }

        System.out.println("Previous MapReduce result, line 1: " + line[0]);
        System.out.println("Line 2: " + line[1]);

        for (int i = 0; i < k; i++) {
            // The framework writes key and value separated by a tab ('\t'); replace it with a space so the whole line splits on spaces.
            String[] l = line[i].replace("\t", " ").split(" ");
            // Old center of this line, e.g. (10,30) or (2,3), split into its coordinates.
            String[] startCenter = l[0].replace("(", "").replace(")", "").split(",");
            // New center computed by the reducer, e.g. (22,35) or (1.5,2.0), is the last field of the line.
            String[] finalCenter = l[l.length - 1].replace("(", "").replace(")", "").split(",");

            // Squared Euclidean distance between the old and the new center.
            float tmp = 0;
            for (int j = 0; j < startCenter.length; j++) {
                tmp += Math.pow(Float.parseFloat(startCenter[j]) - Float.parseFloat(finalCenter[j]), 2);
            }

            newcenter = newcenter + l[l.length - 1].replace("\t", "") + " ";
            if (shold <= tmp) {
                shold = tmp;
            }
            System.out.println("Center " + i + " squared shift: " + tmp);
        }
        System.out.println("New centers: " + newcenter);

        // Write the new centers back so the next Mapper iteration can read them.
        OutputStream out2 = fs.create(new Path(args[1] + "/center/3.txt"));
        IOUtils.copyBytes(new ByteArrayInputStream(newcenter.getBytes()), out2, 4096, true);

        return shold;
    }
}
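The listing does not include the driver that ties the pieces together, so below is a minimal sketch of one, assuming this argument layout: args[0] = the data points (2.txt), args[1] = the base directory whose center/3.txt holds the current centers, args[2] = the per-iteration job output directory. The class name KMeansDriver and the convergence threshold are assumptions, not part of the original code. Note also that KMapper reads its centers from a hard-coded HDFS path while NewCenter writes the updated centers to args[1] + "/center/3.txt"; the two paths must refer to the same file for the iteration to make progress.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class KMeansDriver {
    public static void main(String[] args) throws Exception {
        final float threshold = 0.001f;   // assumed convergence threshold on the squared center shift
        float shift = Float.MAX_VALUE;

        while (shift > threshold) {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf, "kmeans");
            job.setJarByClass(KMeansDriver.class);
            job.setMapperClass(KMapper.class);
            job.setReducerClass(KReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);

            // Remove the previous iteration's output so the job can write part-r-00000 again.
            FileSystem fs = FileSystem.get(conf);
            fs.delete(new Path(args[2]), true);

            FileInputFormat.addInputPath(job, new Path(args[0]));   // data points (2.txt)
            FileOutputFormat.setOutputPath(job, new Path(args[2])); // per-iteration output
            if (!job.waitForCompletion(true)) {
                System.exit(1);
            }

            // Compare old and new centers and rewrite the center file for the next pass.
            shift = new NewCenter().run(args);
            System.out.println("Max squared center shift: " + shift);
        }
    }
}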