Reference book: Mahout in Action. If you want a copy, ask me ~ (I have both the Chinese and English editions.)
Running a K-Means example with Mahout on the Eclipse platform.
The code is as follows:
package kmeans;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
import org.apache.mahout.clustering.kmeans.Kluster;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

public class testkmeans {

    public static final double[][] points = {
        {2, 4}, {4, 2}, {6, 2}, {5, 3}, {5, 5}, {7, 5},
        {5, 15}, {6, 17}, {4, 14}, {5, 13}, {9, 15}, {3, 14}, {7, 13},
        {20, 16}, {19, 15}, {17, 15}, {16, 14}, {14, 18}, {22, 10}, {17, 17}, {16, 13}, {18, 14}, {17, 13},
        {22, 26}, {24, 23}, {25, 25}, {26, 22}, {26, 26}, {26, 28}, {28, 18}, {28, 28}};

    // Write the points (already converted to Vector) into a SequenceFile
    public static void writePointsToFile(List<Vector> points,
                                         String fileName,
                                         FileSystem fs,
                                         Configuration conf) throws IOException {
        Path path = new Path(fileName);
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf,
            path, LongWritable.class, VectorWritable.class);
        long recNum = 0;
        VectorWritable vec = new VectorWritable();
        for (Vector point : points) {
            vec.set(point);
            writer.append(new LongWritable(recNum++), vec);
        }
        writer.close();
    }

    // Convert the points in the raw data set into Mahout Vectors
    public static List<Vector> getPoints(double[][] raw) {
        List<Vector> points = new ArrayList<Vector>();
        for (int i = 0; i < raw.length; i++) {
            double[] fr = raw[i];
            // Create a RandomAccessSparseVector and copy the point's data into it
            Vector vec = new RandomAccessSparseVector(fr.length);
            vec.assign(fr);
            points.add(vec);
        }
        return points;
    }

    public static void main(String args[]) throws Exception {
        int k = 3;
        List<Vector> vectors = getPoints(points);
        File testData = new File("clustering/testdata");
        if (!testData.exists()) {
            testData.mkdir();
        }
        testData = new File("clustering/testdata/points");
        if (!testData.exists()) {
            testData.mkdir();
        }
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        writePointsToFile(vectors, "clustering/testdata/points/file1", fs, conf);

        // Write the k initial cluster centers (here simply the first k points)
        Path path = new Path("clustering/testdata/clusters/part-00000");
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, Text.class, Kluster.class);
        for (int i = 0; i < k; i++) {
            Vector vec = vectors.get(i);
            // Euclidean distance is used to measure the distance between two points
            Kluster cluster = new Kluster(vec, i, new EuclideanDistanceMeasure());
            writer.append(new Text(cluster.getIdentifier()), cluster);
        }
        writer.close();

        KMeansDriver.run(conf,
            new Path("clustering/testdata/points"),   // input path
            new Path("clustering/testdata/clusters"), // initial clusters path
            new Path("clustering/output"),            // output path
            0.001, // convergenceDelta: if every new cluster center is within this
                   // distance of the previous center, iteration stops; otherwise it
                   // continues. Optional parameter; the default is 0.5
            10,    // maxIterations
            true,  // runClustering
            0,     // clusterClassificationThreshold
            true); // runSequential

        // Read back the clustered points and print which cluster each belongs to
        SequenceFile.Reader reader = new SequenceFile.Reader(fs,
            new Path("clustering/output/" + Cluster.CLUSTERED_POINTS_DIR + "/part-m-0"), conf);
        IntWritable key = new IntWritable();
        WeightedPropertyVectorWritable value = new WeightedPropertyVectorWritable();
        while (reader.next(key, value)) {
            System.out.println(value.toString() + " belongs to cluster " + key.toString());
        }
        reader.close();
    }
}
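If you also want to print the final cluster centers, the helper below is a minimal sketch of one way to do it. It assumes Mahout 0.9's output layout, where the last iteration is renamed to a clusters-*-final directory containing IntWritable/ClusterWritable pairs, and it needs two extra imports (org.apache.hadoop.fs.FileStatus and org.apache.mahout.clustering.iterator.ClusterWritable); verify both assumptions against your own run:

// Minimal sketch: print the final cluster centers. Assumptions to verify:
// Mahout 0.9 renames the last iteration's directory to clusters-*-final and
// stores IntWritable -> ClusterWritable pairs in its part files.
// Extra imports needed: org.apache.hadoop.fs.FileStatus,
//                       org.apache.mahout.clustering.iterator.ClusterWritable
public static void printFinalCenters(FileSystem fs, Configuration conf) throws IOException {
    FileStatus[] parts = fs.globStatus(
        new Path("clustering/output/clusters-*-final/part-*"));
    for (FileStatus part : parts) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, part.getPath(), conf);
        IntWritable clusterId = new IntWritable();
        ClusterWritable clusterWritable = new ClusterWritable();
        while (reader.next(clusterId, clusterWritable)) {
            // ClusterWritable wraps a Cluster; getCenter() returns its center vector
            System.out.println("cluster " + clusterId.get()
                + " center: " + clusterWritable.getValue().getCenter());
        }
        reader.close();
    }
}

Call printFinalCenters(fs, conf) as the last line of main().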
Before doing this, remember to configure Maven first and download the required jars from the Aliyun mirror.
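If downloads from Maven Central are slow, the usual trick is to point Maven at the Aliyun mirror in ~/.m2/settings.xml (or in the user settings Eclipse is configured to use); the mirror id below is arbitrary:

<!-- settings.xml: fetch Central artifacts through the Aliyun mirror -->
<mirrors>
  <mirror>
    <id>aliyunmaven</id>
    <mirrorOf>central</mirrorOf>
    <name>Aliyun public repository</name>
    <url>https://maven.aliyun.com/repository/public</url>
  </mirror>
</mirrors>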
Here is my pom.xml:
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>k-means</groupId>
  <artifactId>kmeans</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <url>http://maven.apache.org</url>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>
  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>dom4j</groupId>
      <artifactId>dom4j</artifactId>
      <version>1.6.1</version>
    </dependency>
    <dependency>
      <groupId>commons-lang</groupId>
      <artifactId>commons-lang</artifactId>
      <version>2.6</version>
      <type>pom</type>
    </dependency>
    <dependency>
      <groupId>org.apache.mahout</groupId>
      <artifactId>mahout-math</artifactId>
      <version>0.9</version>
    </dependency>
    <dependency>
      <groupId>commons-logging</groupId>
      <artifactId>commons-logging</artifactId>
      <version>1.2</version>
    </dependency>
    <dependency>
      <groupId>com.google.collections</groupId>
      <artifactId>google-collections</artifactId>
      <version>1.0</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>1.7.6</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
      <version>1.7.24</version>
      <classifier>sources</classifier>
      <type>java-source</type>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-nop</artifactId>
      <version>1.6.0</version>
      <classifier>sources</classifier>
      <type>java-source</type>
    </dependency>
    <dependency>
      <groupId>org.apache.mahout</groupId>
      <artifactId>mahout-core</artifactId>
      <version>0.9</version>
    </dependency>
    <dependency>
      <groupId>org.apache.mahout</groupId>
      <artifactId>mahout-integration</artifactId>
      <version>0.9</version>
    </dependency>
    <dependency>
      <groupId>me.prettyprint</groupId>
      <artifactId>hector-core</artifactId>
      <version>1.0-3</version>
    </dependency>
  </dependencies>
</project>
These dependencies can be added directly to pom.xml.
step1 Download a data set from the web, or create a small one yourself. I created a text file test01.txt under my test directory; a sample of the format is shown below.
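The InputDriver used in step 5 expects plain text with one point per line and coordinates separated by spaces, so a tiny hypothetical test01.txt could look like this (two obvious groups, so the clustering result is easy to eyeball):

1.0 1.0
2.0 1.0
1.0 2.0
10.0 10.0
11.0 9.0
10.5 11.0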
step2 Create a new directory on HDFS. (All of the commands below are run from the Hadoop installation directory, and the cluster must be started before uploading anything.)
bin/hdfs dfs -mkdir -p /user/test
step3 Upload the test file test01.txt to HDFS:
bin/hdfs dfs -put test/test01.txt /user/test/
step4 Optionally, verify the upload with ls to check that test01.txt is under the test directory, then view its contents with cat:
bin/hdfs dfs -ls /user/test/
bin/hdfs dfs -cat /user/test/test01.txt
step5 Convert the data into Mahout Vector format. InputDriver (shipped in mahout-integration) reads the whitespace-delimited text and writes a SequenceFile of VectorWritable values:
mahout org.apache.mahout.clustering.conversion.InputDriver -i /user/test/test01.txt -o /user/test/text-vector
step6 Run the K-Means clustering job. With -k 4, Mahout samples four random input points as the initial centers and writes them to the -c path (a fuller variant of the command follows):
mahout kmeans -i /user/test/text-vector -o /user/test/kmeans -c /user/test/initialcenter --maxIter 10 -k 4
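The same run with the implicit options spelled out; the values simply mirror the Java example above, and if your Mahout version differs, check mahout kmeans --help for the exact flag names:

mahout kmeans \
  -i /user/test/text-vector \
  -o /user/test/kmeans \
  -c /user/test/initialcenter \
  -k 4 -x 10 -cd 0.001 \
  -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
  -ow -cl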
step7 Inspect the results. vectordump prints the converted vectors as plain text:
mahout vectordump -i /user/test/text-vector/part-m-00000
or, for any SequenceFile output:
mahout seqdumper -i /user/test/text-vector/part-m-00000
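The two commands above only dump the input vectors. To see the clusters themselves, Mahout also ships a clusterdump tool; a sketch of its use against the output above (treat the clusters-*-final path as an assumption about where the final iteration lands):

mahout clusterdump \
  -i /user/test/kmeans/clusters-*-final \
  -p /user/test/kmeans/clusteredPoints \
  -o result.txt

Note that the clusteredPoints directory is only produced when the job is run with -cl.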
References:
1. Mahout in Action
2. Blog: https://blog.csdn.net/u012948976/article/details/50263343