Download the Reuters data from http://www.daviddlewis.com/resources/testcollections/reuters21578/
References: http://www.shellsec.com/tech/63646.html and http://blog.chinaunix.net/uid-20761674-id-3535501.html
The jobs that were run:
The directory structure of the generated vector files looks like this:
Running this in Eclipse requires the following Lucene jars:
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-benchmark</artifactId>
    <version>4.6.1</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers-common</artifactId>
    <version>4.6.1</version>
</dependency>
Extract the Reuters data
// Unpack the Reuters SGML archive into individual plain-text files (one article per file)
// using the ExtractReuters utility from lucene-benchmark.
public static void extractReuters() {
    File inputFolder = new File("datafile/reuters");
    File outputFolder = new File("datafile/reuters-extracted");
    ExtractReuters extractor = new ExtractReuters(inputFolder, outputFolder);
    extractor.extract();
}
Convert the extracted Reuters data into SequenceFiles
public static void transformToSequenceFile() {
    Configuration config = BasicConfig.config();
    HdfsUtils hdfs = new HdfsUtils(BasicConfig.HDFS, config);
    // Local Mahout and Lucene jars that are shipped to the Hadoop classpath via the distributed cache.
    String[] mahoutJars = {
        "/home/training/git/socialrecommendation/datafile/mahout-math-1.0-SNAPSHOT.jar",
        "/home/training/git/socialrecommendation/datafile/lucene-analyzers-common-4.6.1.jar",
        "/home/training/git/socialrecommendation/datafile/mahout-integration-1.0-SNAPSHOT.jar",
        "/home/training/git/socialrecommendation/datafile/mahout-mrlegacy-1.0-SNAPSHOT.jar",
        "/home/training/git/socialrecommendation/datafile/mahout-mrlegacy-1.0-SNAPSHOT-job.jar",
    };
    try {
        hdfs.addJarToDistributedCache(Arrays.asList(mahoutJars), config);
    } catch (IOException e1) {
        e1.printStackTrace();
    }
    // Arguments for running on the cluster:
    String[] args = {"-c", "UTF-8", "-i", BasicConfig.HDFS + "/user/hdfs/userCF/reutersExtracted", "-o",
        BasicConfig.HDFS + "/user/hdfs/userCF/reutersSeqfiles"};
    // Arguments for running locally:
    /*String[] args = {"-c", "UTF-8", "-i", "datafile/reuters-extracted/", "-o",
        "datafile/reuters-seqfiles"};*/
    try {
        /*SequenceFilesFromDirectory.main(args);*/ // local run
        SequenceFilesFromDirectory job = new SequenceFilesFromDirectory();
        job.main(args, config);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
To make this job run on the Hadoop cluster, change line 64 of SequenceFilesFromDirectory.java:
public static void main(String[] args) throws Exception {
    ToolRunner.run(new SequenceFilesFromDirectory(), args);
}
to:
private static Configuration conf;

public static void main(String[] args, Configuration config) throws Exception {
    conf = config;
    ToolRunner.run(new SequenceFilesFromDirectory(), args);
}
Change line 84: HadoopUtil.delete(getConf(), output);
to: HadoopUtil.delete(conf, output);
Change line 89: //runSequential(getConf(), getInputPath(), output, options);
to: runSequential(conf, getInputPath(), output, options);
Change line 153:
Job job = prepareJob(input, output, MultipleTextFileInputFormat.class,
    SequenceFilesFromDirectoryMapper.class, Text.class, Text.class,
    SequenceFileOutputFormat.class, "SequenceFilesFromDirectory");
to:
Job job = prepareJob(input, output, MultipleTextFileInputFormat.class,
    SequenceFilesFromDirectoryMapper.class, Text.class, Text.class,
    SequenceFileOutputFormat.class, "SequenceFilesFromDirectory", conf);
These changes are made mainly so that the Configuration settings actually reach the job; the unmodified entry point appears to work only for local runs.
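As a possible alternative to patching main() at all, ToolRunner also has an overload that accepts a Configuration, so a small driver could in principle hand the YARN settings to the unmodified tool. This is only a sketch under that assumption (the driver class name is hypothetical, and whether the rest of this Mahout version honours getConf() everywhere is not verified here):

package com.hp.recommendation.util;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.text.SequenceFilesFromDirectory;

// Hypothetical driver: pass the cluster Configuration through ToolRunner
// instead of storing it in a static field inside the Mahout class.
public class SeqDirDriver {
    public static void main(String[] cliArgs) throws Exception {
        Configuration config = BasicConfig.config(); // same YARN settings as shown below
        int exitCode = ToolRunner.run(config, new SequenceFilesFromDirectory(), cliArgs);
        System.exit(exitCode);
    }
}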
In addition, add the following method to AbstractJob.java:
protected Job prepareJob(Path inputPath,
                         Path outputPath,
                         Class<? extends InputFormat> inputFormat,
                         Class<? extends Mapper> mapper,
                         Class<? extends Writable> mapperKey,
                         Class<? extends Writable> mapperValue,
                         Class<? extends OutputFormat> outputFormat,
                         String jobname, Configuration conf) throws IOException {
    Job job = HadoopUtil.prepareJob(inputPath, outputPath,
        inputFormat, mapper, mapperKey, mapperValue, outputFormat, conf);
    String name =
        jobname != null ? jobname : HadoopUtil.getCustomJobName(getClass().getSimpleName(), job, mapper, Reducer.class);
    job.setJobName(name);
    return job;
}
Vectorize the SequenceFiles
public static void transformToVector(Long l) {
    Configuration config = BasicConfig.config();
    HdfsUtils hdfs = new HdfsUtils(BasicConfig.HDFS, config);
    // Local Mahout and Lucene jars shipped to the Hadoop classpath via the distributed cache.
    String[] mahoutJars = {
        "/home/training/git/socialrecommendation/datafile/mahout-math-1.0-SNAPSHOT.jar",
        "/home/training/git/socialrecommendation/datafile/lucene-analyzers-common-4.6.1.jar",
        "/home/training/git/socialrecommendation/datafile/mahout-integration-1.0-SNAPSHOT.jar",
        "/home/training/git/socialrecommendation/datafile/mahout-mrlegacy-1.0-SNAPSHOT.jar",
        "/home/training/git/socialrecommendation/datafile/mahout-mrlegacy-1.0-SNAPSHOT-job.jar",
    };
    try {
        hdfs.addJarToDistributedCache(Arrays.asList(mahoutJars), config);
    } catch (IOException e1) {
        e1.printStackTrace();
    }
    // seq2sparse options: -a analyzer class, -chunk dictionary chunk size in MB, -md minimum
    // document frequency, -x maximum document frequency (%), -wt weighting scheme,
    // -ml minimum log-likelihood ratio for n-grams, -ng maximum n-gram size,
    // -seq create sequential-access sparse vectors.
    String[] args = {"-a", "org.apache.lucene.analysis.core.WhitespaceAnalyzer",
        "-chunk", "200", "-o", BasicConfig.HDFS + "/user/hdfs/userCF/" + l + "/reutersVectorsBigram",
        "-i", BasicConfig.HDFS + "/user/hdfs/userCF/reutersSeqfiles/", "-md", "3",
        "-x", "90", "-wt", "tfidf", "-ml", "50", "-ng", "2",
        "-seq"};
    try {
        SparseVectorsFromSequenceFiles job = new SparseVectorsFromSequenceFiles();
        job.main(args, config);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
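For completeness, here is a sketch of how the three steps above could be chained from a single driver. The enclosing class name ReutersPreparation for the three static methods, and using a timestamp as the run id l, are assumptions:

// Hypothetical driver chaining the three steps above; assumes the static methods
// shown earlier live in a class called ReutersPreparation.
public class ReutersVectorizationDriver {
    public static void main(String[] args) {
        long runId = System.currentTimeMillis();       // run id used in transformToVector's output path
        ReutersPreparation.extractReuters();           // SGML archives -> one text file per article (local)
        ReutersPreparation.transformToSequenceFile();  // text files -> SequenceFile on HDFS
        ReutersPreparation.transformToVector(runId);   // SequenceFile -> TF-IDF bigram vectors
    }
}

Note that with the cluster-side arguments used above, the extracted files also have to be uploaded to /user/hdfs/userCF/reutersExtracted (for example with HdfsUtils.copyFile) between the first and second step.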
In the class SparseVectorsFromSequenceFiles, change line 54:
public static void main(String[] args) throws Exception {
    ToolRunner.run(new SparseVectorsFromSequenceFiles(), args);
}
to:
private static Configuration conf;

public static void main(String[] args, Configuration config) throws Exception {
    conf = config;
    ToolRunner.run(new SparseVectorsFromSequenceFiles(), args);
}
Remove line 253: Configuration conf = getConf(); (so that the static conf field set in main is used instead).
Again, these changes are made so that the Configuration settings reach the job; the unmodified entry point appears to work only for local runs.
In the HighDFWordsPruner class, change line 82:
DistributedCache.setCacheFiles(new URI[]{dictionaryFilePath.toUri()}, conf);
to:
DistributedCache.addCacheFileAsFirstOne(dictionaryFilePath.toUri(), conf);
and add the following method to the DistributedCache class:
/**
 * Add a file to be localized to the conf, placing it at the front of the
 * cache-file list. Intended to be used by user code.
 * @param uri The uri of the cache to be localized
 * @param conf Configuration to add the cache to
 * @deprecated Use {@link Job#addCacheFile(URI)} instead
 */
@Deprecated
public static void addCacheFileAsFirstOne(URI uri, Configuration conf) {
    String files = conf.get(MRJobConfig.CACHE_FILES);
    conf.set(MRJobConfig.CACHE_FILES,
        files == null ? uri.toString() : uri.toString() + "," + files);
}
The set-based call wipes out the jar paths that were previously registered for the classpath, so an add-style method is used instead. The new entry also has to go in first, because the code that later reads the dictionary from the cache takes only the first path; putting it anywhere else causes an error.
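A small standalone check makes the difference visible. This is only illustrative (the paths are made up, and it assumes the patched DistributedCache above is on the classpath):

package com.hp.recommendation.util;

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;

// Illustrative check of the cache-file list before and after the patched call.
public class CacheFileOrderCheck {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // A jar that was already registered for the job's classpath (illustrative path).
        conf.set(MRJobConfig.CACHE_FILES,
            "hdfs:///user/hadoop/lib/mahout/mahout-math-1.0-SNAPSHOT.jar");

        URI dictionary = new Path("/user/hdfs/userCF/reutersVectorsBigram/dictionary.file-0").toUri();

        // The original setCacheFiles(new URI[]{dictionary}, conf) would replace the whole
        // list with just the dictionary, losing the jar entry above.
        DistributedCache.addCacheFileAsFirstOne(dictionary, conf);

        // Prints the dictionary first, followed by the previously registered jar.
        System.out.println(conf.get(MRJobConfig.CACHE_FILES));
    }
}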
For reference, this is the YARN configuration used above:
package com.hp.recommendation.util;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.conf.YarnConfiguration;

public class BasicConfig {
    public static final String HDFS = "hdfs://c0004649.itcs.hp.com:9000";
    public static final String YARN_RESOURCE = "c0004650.itcs.hp.com";

    public static Configuration config() {
        Configuration conf = new YarnConfiguration();
        conf.set("fs.defaultFS", BasicConfig.HDFS);
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.address", BasicConfig.YARN_RESOURCE + ":8032");
        conf.set("yarn.resourcemanager.scheduler.address", BasicConfig.YARN_RESOURCE + ":8030");
        // Third-party jars could also be put on the classpath like this, but then the jars have to
        // be installed on every node of the Hadoop cluster, which is cumbersome:
        //conf.set("mapreduce.application.classpath", "$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,/opt/mount/learn/mahout-1.0-lib/*");
        return conf;
    }
}
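A quick way to verify these settings before submitting anything is to open a FileSystem from the configuration and list the HDFS root. This is only a sketch; the class name is hypothetical:

package com.hp.recommendation.util;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical smoke test: if fs.defaultFS is right, this lists the HDFS root directory.
public class BasicConfigCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = BasicConfig.config();
        FileSystem fs = FileSystem.get(conf);
        for (FileStatus status : fs.listStatus(new Path("/"))) {
            System.out.println(status.getPath());
        }
        fs.close();
    }
}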
And this is the helper class used to work with files on HDFS:
package com.hp.recommendation.util;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import com.hp.recommendation.model.WriteDataToHDFSModel;
/**
*
* @author shijie
* ref:http://blog.fens.me/hadoop-mahout-mapreduce-itemcf/
*/
public class HdfsUtils {
private static final String HDFS = "hdfs://c0004649.itcs.hp.com:9000";
public HdfsUtils(Configuration conf) {
this(HDFS, conf);
}
public HdfsUtils(String hdfs, Configuration conf) {
this.hdfsPath = hdfs;
this.conf = conf;
}
private String hdfsPath;
private Configuration conf;
// Connectivity check: lists the HDFS root directory using the hard-coded HDFS address.
public static void getConf() {
Configuration conf = new Configuration();
conf.set("fs.defaultFS", HDFS);
FileSystem hdfs;
try {
hdfs = FileSystem.get(conf);
FileStatus[] fs = hdfs.listStatus(new Path("/"));
for (int i = 0; i < fs.length; i++) {
System.out.println(fs[i].toString());
}
} catch (IOException e) {
e.printStackTrace();
}
}
public void mkdirs(String folder) throws IOException {
Path path = new Path(folder);
FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
if (!fs.exists(path)) {
fs.mkdirs(path);
System.out.println("Create: " + folder);
}
fs.close();
}
public void rmr(String folder) throws IOException {
Path path = new Path(folder);
FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
fs.deleteOnExit(path);
System.out.println("Delete: " + folder);
fs.close();
}
public void ls(String folder) throws IOException {
Path path = new Path(folder);
FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
FileStatus[] list = fs.listStatus(path);
System.out.println("ls: " + folder);
System.out.println("==========================================================");
for (FileStatus f : list) {
System.out.printf("name: %s, folder: %s, size: %d\n", f.getPath(), f.isDir(), f.getLen());
}
System.out.println("==========================================================");
fs.close();
}
public void createFile(String file, String content) throws IOException {
FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
byte[] buff = content.getBytes();
FSDataOutputStream os = null;
try {
os = fs.create(new Path(file));
os.write(buff, 0, buff.length);
System.out.println("Create: " + file);
} finally {
if (os != null)
os.close();
}
fs.close();
}
public void copyFile(String local, String remote) throws IOException {
FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
fs.copyFromLocalFile(new Path(local), new Path(remote));
System.out.println("copy from: " + local + " to " + remote);
fs.close();
}
public void download(String remote, String local) throws IOException {
Path path = new Path(remote);
FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
fs.copyToLocalFile(path, new Path(local));
System.out.println("download: from" + remote + " to " + local);
fs.close();
}
public void cat(String remoteFile) throws IOException {
Path path = new Path(remoteFile);
FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
FSDataInputStream fsdis = null;
System.out.println("cat: " + remoteFile);
try {
fsdis =fs.open(path);
IOUtils.copyBytes(fsdis, System.out, 4096, false);
} finally {
IOUtils.closeStream(fsdis);
fs.close();
}
}
public String getFile(String remoteFile) throws IOException {
Path path = new Path(remoteFile);
FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
FSDataInputStream fsdis = null;
System.out.println("cat: " + remoteFile);
BufferedInputStream buffer = null;
ByteArrayOutputStream outStream = new ByteArrayOutputStream();
String str = null;
try {
fsdis =fs.open(path);
buffer = new BufferedInputStream(fsdis);
int BUFFER_SIZE = 4096;
byte[] data = new byte[BUFFER_SIZE];
int count = -1;
while((count = buffer.read(data, 0, BUFFER_SIZE)) != -1){
outStream.write(data, 0, count);
}
data = null;
str = new String(outStream.toByteArray());
} finally {
if (buffer != null) {
buffer.close();
}
outStream.close();
fs.close();
}
return str;
}
public void writeFileToHDFS(String source, String dest) throws IOException {
FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
// Get the filename out of the file path
String filename = source.substring(source.lastIndexOf('/') + 1, source.length());
// Create the destination path including the filename.
if (dest.charAt(dest.length() - 1) != '/') {
dest = dest + "/" + filename;
} else {
dest = dest + filename;
}
// Check if the file already exists
Path path = new Path(dest);
if (fs.exists(path)) {
System.out.println("File " + dest + " already exists");
return;
}
// Create a new file and write data to it.
FSDataOutputStream out = fs.create(path);
InputStream in = new BufferedInputStream(new FileInputStream(
new File(source)));
byte[] b = new byte[1024];
int numBytes = 0;
while ((numBytes = in.read(b)) > 0) {
out.write(b, 0, numBytes);
}
// Close all the file descripters
in.close();
out.close();
fs.close();
}
public WriteDataToHDFSModel getFileSystemAndFSDataOutputStream(String file) throws IOException {
FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
// Check if the file already exists
Path path = new Path(file);
WriteDataToHDFSModel model = new WriteDataToHDFSModel();
if (fs.exists(path)) {
System.out.println("File " + file + " already exists");
FSDataOutputStream out = fs.append(path);
model.setOut(out);
}else{
// Create a new file and write data to it.
FSDataOutputStream out = fs.create(path);
model.setOut(out);
}
model.setFs(fs);
return model;
}
public void writeStringToFile(String content, String file) throws IOException {
FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
// Check if the file already exists
Path path = new Path(file);
if (fs.exists(path)) {
System.out.println("File " + file + " already exists");
FSDataOutputStream out = fs.append(path);
BufferedOutputStream buffer = new BufferedOutputStream(out);
buffer.write(content.getBytes());
buffer.flush();
// Close all the file descripters
buffer.close();
out.close();
}else{
// Create a new file and write data to it.
FSDataOutputStream out = fs.create(path);
BufferedOutputStream buffer = new BufferedOutputStream(out);
buffer.write(content.getBytes());
buffer.flush();
// Close all the file descripters
buffer.close();
out.close();
}
fs.close();
}
public static void addJarToDistributedCache(Class classToAdd,Configuration conf) throws IOException {
// Retrieve jar file for class2Add
String jar = classToAdd.getProtectionDomain().getCodeSource().getLocation().getPath();
System.out.println("jar=" + jar);
File jarFile = new File(jar);
// Declare new HDFS location
Path hdfsJar = new Path("/user/hadoop/lib/mahout/" + jarFile.getName());
// Mount HDFS
FileSystem hdfs = FileSystem.get(conf);
// Copy (override) jar file to HDFS
hdfs.copyFromLocalFile(false, true, new Path(jar), hdfsJar);
// Add jar to distributed classPath
DistributedCache.addFileToClassPath(hdfsJar, conf);
}
//add jars to classpath by jar path
public static void addJarToDistributedCache(List<String> jarPaths,Configuration conf) throws IOException {
// Mount HDFS
FileSystem hdfs = FileSystem.get(conf);
for (String jar : jarPaths) {
File jarFile = new File(jar);
// Declare new HDFS location
Path hdfsJar = new Path("/user/hadoop/lib/mahout/" + jarFile.getName());
// Copy (override) jar file to HDFS
if (!hdfs.exists(hdfsJar)) {
hdfs.copyFromLocalFile(false, true, new Path(jar), hdfsJar);
}
// Add jar to distributed classPath
DistributedCache.addFileToClassPath(hdfsJar, conf);
}
}
}
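For reference, a short sketch of how HdfsUtils fits into the workflow above (the paths are the ones used in this post; the wrapper class name is hypothetical):

package com.hp.recommendation.util;

import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;

// Hypothetical example tying the pieces together: upload the extracted Reuters
// files to HDFS and ship one of the Mahout jars to the job classpath.
public class HdfsUtilsExample {
    public static void main(String[] args) throws Exception {
        Configuration config = BasicConfig.config();
        HdfsUtils hdfs = new HdfsUtils(BasicConfig.HDFS, config);

        // Upload the locally extracted articles so the cluster-side seqdirectory job can read them.
        hdfs.mkdirs("/user/hdfs/userCF");
        hdfs.copyFile("datafile/reuters-extracted", "/user/hdfs/userCF/reutersExtracted");
        hdfs.ls("/user/hdfs/userCF/reutersExtracted");

        // Register a local jar on the distributed classpath, as the jobs above do.
        HdfsUtils.addJarToDistributedCache(
            Arrays.asList("/home/training/git/socialrecommendation/datafile/mahout-math-1.0-SNAPSHOT.jar"),
            config);
    }
}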