Implementing a Word Co-occurrence Matrix with the MapReduce Pairs Algorithm

Word co-occurrence matrices have many uses, such as personalized recommender systems and item-based collaborative filtering.

What is a co-occurrence matrix?

For example, take the text: I am a good boy good boy

With a small window, the pair counts form a matrix like this:

            I    am   a    good  boy
    I            1
    am                1
    a                      1
    good                         2
    boy                    2

In short, a co-occurrence matrix records how many times two words appear together within a document.

How can we implement this with MapReduce?

1. Use the pairs algorithm: choose a window size, then pair the first element of the window with each of the following elements in turn, emitting key-value pairs such as ((I, am), 1). A small local sketch of this step follows the list.

2. Override FileInputFormat so that each file is read as a whole and never split; the key is the file name and the value is the file content as bytes.

3. Define a custom key type in which the pair (word1, word2) is a single key. It must implement WritableComparable and override equals (tests whether two pairs are equal), hashCode (ensures the same pair, regardless of word order, is not sent to different Reducers), compareTo (orders two keys), and readFields()/write() for serialization.
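Here is a minimal standalone sketch of step 1 (plain Java, no Hadoop; the window size of 3 is chosen only for illustration). It shows which pairs a sliding window emits, including the flush of the queue at the end of a document, mirroring what the real mapper does below:

import java.util.Iterator;
import java.util.LinkedList;
import java.util.Queue;

// Local sketch of the pairs algorithm, not part of the MapReduce job.
public class PairsSketch {
    public static void main(String[] args) {
        String[] tokens = "I am a good boy good boy".split(" ");
        int windowSize = 3; // illustrative window size
        Queue<String> window = new LinkedList<String>();
        for (String token : tokens) {
            window.add(token);
            if (window.size() >= windowSize) {
                emitPairs(window);
                window.remove(); // drop the head, slide the window forward
            }
        }
        while (window.size() > 1) { // flush the tail of the document
            emitPairs(window);
            window.remove();
        }
    }

    // Pair the head of the window with every other element in it.
    private static void emitPairs(Queue<String> window) {
        Iterator<String> it = window.iterator();
        String head = it.next();
        while (it.hasNext()) {
            System.out.println("((" + head + ", " + it.next() + "), 1)");
        }
    }
}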

Here is the complete code:

package WordConCurrence;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * Custom key type: an unordered pair of words. equals() and hashCode()
 * ignore word order, so (a, b) and (b, a) are the same key and reach
 * the same Reducer.
 */
public class WordPair implements WritableComparable<WordPair> {
    private String wordA;
    private String wordB;

    public WordPair() {
        // No-arg constructor required by Hadoop for deserialization.
    }

    public WordPair(String wordA, String wordB) {
        this.wordA = wordA;
        this.wordB = wordB;
    }

    public String getWordA() {
        return this.wordA;
    }

    public String getWordB() {
        return this.wordB;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(wordA);
        out.writeUTF(wordB);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        wordA = in.readUTF();
        wordB = in.readUTF();
    }

    @Override
    public String toString() {
        return wordA + "," + wordB;
    }

    @Override
    public int compareTo(WordPair o) {
        if (this.equals(o))
            return 0;
        else
            return (wordA + wordB).compareTo(o.getWordA() + o.getWordB());
    }

    @Override
    public boolean equals(Object o) {
        // Unordered pair: word order does not matter.
        if (!(o instanceof WordPair))
            return false;
        WordPair w = (WordPair) o;
        if ((this.wordA.equals(w.wordA) && this.wordB.equals(w.wordB))
                || (this.wordB.equals(w.wordA) && this.wordA.equals(w.wordB)))
            return true;
        return false;
    }

    @Override
    public int hashCode() {
        // Addition is commutative, so the hash is order-independent.
        return (wordA.hashCode() + wordB.hashCode()) * 17;
    }
}
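As a quick local sanity check (a minimal sketch, not part of the job), a pair should compare equal in either word order and hash to the same value:

// Verify WordPair's unordered semantics.
WordPair p1 = new WordPair("good", "boy");
WordPair p2 = new WordPair("boy", "good");
System.out.println(p1.equals(p2));                  // true
System.out.println(p1.hashCode() == p2.hashCode()); // true
System.out.println(p1.compareTo(p2));               // 0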

package WordConCurrence;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/*
 * Custom FileInputFormat: each file is read as a whole, without
 * splitting, into a single map task.
 */
public class WholeFileInputFormat extends FileInputFormat<Text, BytesWritable> {
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        // Never split: the whole file is one record.
        return false;
    }

    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(
            InputSplit split, TaskAttemptContext context) throws IOException,
            InterruptedException {
        return new SingleFileNameReader((FileSplit) split,
                context.getConfiguration());
    }
}


package WordConCurrence;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/*
 * RecordReader that produces exactly one key-value pair per file:
 * key = file name, value = file content as bytes.
 */
public class SingleFileNameReader extends RecordReader<Text, BytesWritable> {

    private FileSplit fileSplit;
    @SuppressWarnings("unused")
    private Configuration conf;
    private boolean processed = false;
    private Text key = null;
    private BytesWritable value = null;
    private FSDataInputStream fis = null;

    public SingleFileNameReader(FileSplit fileSplit, Configuration conf) {
        this.fileSplit = fileSplit;
        this.conf = conf;
    }

    @Override
    public void close() throws IOException {
        // The stream is closed in nextKeyValue(); nothing to do here.
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return processed ? 1.0f : 0.0f;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException,
            InterruptedException {
        return value;
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (key == null) {
            key = new Text();
        }
        if (value == null) {
            value = new BytesWritable();
        }
        if (!processed) {
            byte[] content = new byte[(int) fileSplit.getLength()];
            Path file = fileSplit.getPath();
            key.set(file.getName());
            try {
                IOUtils.readFully(fis, content, 0, content.length);
                value.set(new BytesWritable(content));
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                IOUtils.closeStream(fis);
            }
            processed = true;
            return true; // true: a key-value pair was produced by this call
        }
        return false; // false: the single record has already been consumed
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        fileSplit = (FileSplit) split;
        Configuration job = context.getConfiguration();
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(job);
        fis = fs.open(file);
    }
}

package WordConCurrence;

import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Queue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

/**
 * Counts how often two English words co-occur within a given window
 * across a set of documents, using the pairs algorithm: choose a window
 * size, keep the words in a queue, and pair the first word of the queue
 * with each of the following words. Sample output lines look like:
 *
 *   we,should 2
 *   we,the 2
 *   which,is 1
 *   who,meal 1
 */
public class WordConcurrnce {
    private static int MAX_WINDOW = 20; // maximum co-occurrence window size
    private static String wordRegex = "([a-zA-Z]{1,})"; // matches simple words made of letters only
    private static Pattern wordPattern = Pattern.compile(wordRegex);
    private static IntWritable one = new IntWritable(1);

    public static class WordConcurrenceMapper extends
            Mapper<Text, BytesWritable, WordPair, IntWritable> {
        private int windowSize;
        private Queue<String> windowQueue = new LinkedList<String>();

        @Override
        protected void setup(Context context) throws IOException,
                InterruptedException {
            // Window size comes from the job configuration, capped at MAX_WINDOW.
            windowSize = Math.min(context.getConfiguration()
                    .getInt("window", 2), MAX_WINDOW);
        }

        /**
         * The input key is the document's file name; the value is the
         * document content as bytes.
         */
        @Override
        public void map(Text docName, BytesWritable docContent, Context context)
                throws IOException, InterruptedException {
            Matcher matcher = wordPattern.matcher(new String(docContent
                    .getBytes(), "UTF-8"));
            while (matcher.find()) {
                windowQueue.add(matcher.group());
                if (windowQueue.size() >= windowSize) {
                    // For the queue [q1, q2, q3, ..., qn], emit
                    // [(q1,q2),1], [(q1,q3),1], ..., [(q1,qn),1].
                    Iterator<String> it = windowQueue.iterator();
                    String w1 = it.next();
                    while (it.hasNext()) {
                        String next = it.next();
                        context.write(new WordPair(w1, next), one);
                    }
                    windowQueue.remove(); // slide the window forward
                }
            }
            // Flush the words remaining at the end of the document.
            while (windowQueue.size() > 1) {
                Iterator<String> it = windowQueue.iterator();
                String w1 = it.next();
                while (it.hasNext()) {
                    context.write(new WordPair(w1, it.next()), one);
                }
                windowQueue.remove();
            }
        }
    }


    public static class WordConcurrenceReducer extends
            Reducer<WordPair, IntWritable, WordPair, IntWritable> {
        @Override
        public void reduce(WordPair wordPair, Iterable<IntWritable> frequence,
                Context context) throws IOException, InterruptedException {
            // Sum the counts for this word pair.
            int sum = 0;
            for (IntWritable val : frequence) {
                sum += val.get();
            }
            context.write(wordPair, new IntWritable(sum));
        }
    }


    public static void main(String[] args) throws IOException,
            InterruptedException, ClassNotFoundException {
        Job wordConcurrenceJob = new Job();
        wordConcurrenceJob.setJobName("wordConcurrenceJob");
        wordConcurrenceJob.setJarByClass(WordConcurrnce.class);
        // args[2] is the window size, passed to the mappers via the configuration.
        wordConcurrenceJob.getConfiguration().setInt("window",
                Integer.parseInt(args[2]));

        wordConcurrenceJob.setMapperClass(WordConcurrenceMapper.class);
        wordConcurrenceJob.setMapOutputKeyClass(WordPair.class);
        wordConcurrenceJob.setMapOutputValueClass(IntWritable.class);

        wordConcurrenceJob.setReducerClass(WordConcurrenceReducer.class);
        wordConcurrenceJob.setOutputKeyClass(WordPair.class);
        wordConcurrenceJob.setOutputValueClass(IntWritable.class);

        wordConcurrenceJob.setInputFormatClass(WholeFileInputFormat.class);
        wordConcurrenceJob.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(wordConcurrenceJob, new Path(args[0]));
        FileOutputFormat.setOutputPath(wordConcurrenceJob, new Path(args[1]));

        wordConcurrenceJob.waitForCompletion(true);
        System.out.println("finished!");
    }
}
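A well-known optimization for the pairs approach (not in the original job, but safe here because summing counts is associative and commutative) is to reuse the reducer as a combiner, which aggregates counts locally and shrinks shuffle traffic:

// Optional local aggregation before the shuffle; add this line in main().
wordConcurrenceJob.setCombinerClass(WordConcurrenceReducer.class);

The job could then be run along these lines (jar name and paths are hypothetical), where the last argument is the window size:

hadoop jar wordconcurrence.jar WordConCurrence.WordConcurrnce /input/docs /output/pairs 3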
