An aside: please check out http://code.google.com/p/redpoll
If someone could lend me around 10 ordinary machines for testing, that would be great; the school lab won't give me that many. I've already written a paper and the code is all ready. The only thing missing is the data. Really frustrating.
Progress has been slow and I took a few detours, but things are finally sorted out. Once I started on document clustering, I realized I need to be able to locate a document's content from its id, and that id had better be an int or a long, whereas the docno that Sohu News provides is a 32-byte GUID. If you ignore this, Hadoop will inevitably produce enormous intermediate files when computing term frequencies:
term1 docId1:tf, docId2:tf, ....
term2 docId1:tf, docId2:tf, ....
...
For simplicity's sake I considered a database: select content from table where documentid = x gives you exactly this. But the MySQL-Hadoop combination is still in its infancy, and I hadn't done any sharding, so the mappers wouldn't be reading data in a distributed fashion. I fiddled with hbase for a few days and wrote some code, only to find that after storing 170,000 records it wouldn't take any more, for reasons unknown. On top of that, I found this bigtable clone is still just getting started and has plenty of instability.
No way around it; I'd have to do it myself. Locating a document is actually simple: as long as you know its byte offset in the file, you can just seek to it. In Java a long takes 8 bytes, and hdfs generally favors files of 64m or more, so cat-ing the entire 3.4G+ Sogou corpus (http://www.sogou.com/labs/dl/cs.html) into a single file and using offsets as document IDs is quite reasonable. If you want the mapper to accept a key-value pair other than <LongWritable, Text>, you have to define a custom InputFormat. I wrote a SogouInputFormat for the Sogou corpus, along with the corresponding RecordReader and Writable implementations. And then the school network acted up, so I can't svn commit to googlecode.
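Just to make the idea concrete, fetching a document later should look something like this. A minimal sketch: the FetchDocument class name and HDFS path are made up, and it assumes SogouDocument has a no-arg constructor.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// A minimal sketch of offset-based lookup; the HDFS path is hypothetical.
public class FetchDocument {
  public static SogouDocument fetch(Configuration conf, long docOffset)
      throws IOException {
    FileSystem fs = FileSystem.get(conf);
    FSDataInputStream in = fs.open(new Path("/redpoll/sogou-all.txt"));
    in.seek(docOffset); // the long offset doubles as the document ID
    SogouCorpusReader reader = new SogouCorpusReader(in, conf);
    SogouDocument doc = new SogouDocument();
    reader.nextDoc(doc); // parse the single document starting here
    reader.close();
    return doc;
  }
}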
The Sogou corpus is stored as xml-like text files. Processing data at this scale demands speed, so DOM is out of the question. I first tried parsing with SAX, but that ran into problems because some of the markup is malformed. So I spent two evenings hand-writing two state machines for parsing; now it reads correctly and is fairly fast. On a single machine the corpus reads at an average of 51m/s, which means one machine can get through the 3.4G Sogou corpus in a little over a minute. Better yet, this approach can run on the mapreduce model.
Next up: word segmentation, tf, df, and computing tf-idf to get the VSM.
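For the record, the weighting itself is just the usual formula; a quick sketch, not committed code:

// Standard tf-idf weighting (a sketch):
// tf = occurrences of the term in this document
// df = number of documents containing the term
// n  = total number of documents in the corpus
static double tfIdf(int tf, int df, long n) {
  return tf * Math.log((double) n / df);
}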
Here are some code snippets:
package redpoll.examples;

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

/**
 * Input format for the sogou corpus.
 * @author Jeremy Chow([email protected])
 */
public class SogouInputFormat extends
    FileInputFormat<LongWritable, DocumentWritable> implements JobConfigurable {

  private CompressionCodecFactory compressionCodecs = null;

  public void configure(JobConf conf) {
    compressionCodecs = new CompressionCodecFactory(conf);
  }

  protected boolean isSplitable(FileSystem fs, Path file) {
    return compressionCodecs.getCodec(file) == null;
  }

  public RecordReader<LongWritable, DocumentWritable> getRecordReader(
      InputSplit genericSplit, JobConf job, Reporter reporter)
      throws IOException {
    reporter.setStatus(genericSplit.toString());
    return new SogouRecordReader(job, (FileSplit) genericSplit);
  }
}
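Wiring this into a job would look roughly like the sketch below (old mapred API; the SogouJobSketch class name and input path are hypothetical, and mapper/reducer setup is omitted):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

// Rough wiring of the InputFormat above into a job.
public class SogouJobSketch {
  public static JobConf createJob() {
    JobConf job = new JobConf(SogouInputFormat.class);
    job.setInputFormat(SogouInputFormat.class);
    FileInputFormat.setInputPaths(job, new Path("/redpoll/sogou-all.txt"));
    // each map() call then receives a <LongWritable offset, DocumentWritable doc> pair
    return job;
  }
}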
package redpoll.examples;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.WritableComparator;

/**
 * A class that provides a sogou document reader from an input stream.
 * @author Jeremy Chow([email protected])
 */
public class SogouCorpusReader {
  private static final int DEFAULT_BUFFER_SIZE = 64 * 1024;
  private int bufferSize = DEFAULT_BUFFER_SIZE;
  /* input stream which we will get documents from */
  private InputStream in;
  /* a buffer that stores temporary bytes read from the input stream */
  private byte[] buffer;
  /* the number of bytes of real data in the buffer */
  private int bufferLength = 0;
  /* the current position in the buffer */
  private int bufferPosn = 0;
  /* the position of the buffer within the input stream */
  private long bufferCurrentPosn = 0;
  /* the position of the current document within the input stream */
  private long currentDocPosn = 0;

  /* xml-like mark tags used in the sogou corpus */
  private byte[] docTag;
  private byte[] urlTag;
  private byte[] docnoTag;
  private byte[] titleTag;
  private byte[] contentTag;

  /* parser status */
  enum STATUS {
    PREPARE, START_ELEMENT, END_ELEMENT, TEXT
  };

  /* identifies the node currently being parsed */
  enum NODE {
    NULL, DOC, URL, DOC_NO, TITLE, CONTENT, FAILED, SUCCEED
  };

  private STATUS currentStatus;
  private NODE currentNode;

  public SogouCorpusReader(InputStream in) throws IOException {
    this(in, DEFAULT_BUFFER_SIZE);
  }

  public SogouCorpusReader(InputStream in, int bufferSize) throws IOException {
    this(in, bufferSize, "doc", "url", "docno", "contenttitle", "content");
  }

  public SogouCorpusReader(InputStream in, int bufferSize, String doc,
      String url, String docno, String title, String content)
      throws IOException {
    this.in = in;
    this.bufferSize = bufferSize;
    this.buffer = new byte[this.bufferSize];
    docTag = doc.getBytes("UTF-8");
    urlTag = url.getBytes("UTF-8");
    docnoTag = docno.getBytes("UTF-8");
    titleTag = title.getBytes("UTF-8");
    contentTag = content.getBytes("UTF-8");
  }

  public SogouCorpusReader(InputStream in, Configuration conf)
      throws IOException {
    this(in, conf.getInt("redpoll.sogou.doc.buffersize", DEFAULT_BUFFER_SIZE),
        conf.get("redpoll.sogou.doc", "doc"),
        conf.get("redpoll.sogou.doc.url", "url"),
        conf.get("redpoll.sogou.doc.docno", "docno"),
        conf.get("redpoll.sogou.doc.contenttitle", "contenttitle"),
        conf.get("redpoll.sogou.doc.content", "content"));
  }

  /**
   * Gets the next document from the sogou text file.
   * @param doc a {@link redpoll.examples.SogouDocument} instance to fill.
   * @return the position of this document, or -1 if it reached EOF.
   * @throws IOException
   */
  public long nextDoc(SogouDocument doc) throws IOException {
    currentStatus = STATUS.PREPARE;
    currentNode = NODE.NULL;
    try {
      while (currentNode != NODE.SUCCEED) {
        adjustBuffer();
        if (currentStatus == STATUS.PREPARE) {
          if (buffer[bufferPosn] == '<')
            currentStatus = STATUS.START_ELEMENT;
        } else if (currentStatus == STATUS.START_ELEMENT) {
          if (buffer[bufferPosn] == '/') { // e.g. </node>
            currentStatus = STATUS.END_ELEMENT;
          } else {
            int start = bufferPosn;
            byte[] name = null;
            while (buffer[bufferPosn] != '>' && buffer[bufferPosn] != '\n') {
              bufferPosn++;
              if (bufferPosn >= bufferLength) {
                name = append(name, buffer, start, bufferLength - start);
                start = 0;
              }
              adjustBuffer();
            }
            // an element that ends with '\n' is considered malformed
            if (buffer[bufferPosn] == '\n') {
              failed();
            } else if (buffer[bufferPosn] == '>') {
              name = append(name, buffer, start, bufferPosn - start);
              if (name != null)
                startElement(name);
              ignoreWhite();
              currentStatus = STATUS.TEXT;
            }
          }
        } else if (currentStatus == STATUS.TEXT) {
          int start = bufferPosn;
          byte[] text = null;
          while (buffer[bufferPosn] != '<' && buffer[bufferPosn] != '\n') {
            bufferPosn++;
            if (bufferPosn >= bufferLength) {
              // accumulate text that spans several buffer refills
              text = append(text, buffer, start, bufferLength - start);
              start = 0;
            }
            adjustBuffer();
          }
          if (buffer[bufferPosn] == '<') {
            text = append(text, buffer, start, bufferPosn - start);
            if (text != null)
              characters(text, doc);
            currentStatus = STATUS.START_ELEMENT;
          } else if (buffer[bufferPosn] == '\n') {
            failed();
          }
        } else if (currentStatus == STATUS.END_ELEMENT) {
          int start = bufferPosn;
          byte[] name = null;
          while (buffer[bufferPosn] != '>' && buffer[bufferPosn] != '\n') {
            bufferPosn++;
            if (bufferPosn >= bufferLength) {
              name = append(name, buffer, start, bufferLength - start);
              start = 0;
            }
            adjustBuffer();
          }
          if (buffer[bufferPosn] == '>') {
            name = append(name, buffer, start, bufferPosn - start);
            if (name != null)
              endElement(name);
            ignoreWhite();
            currentStatus = STATUS.PREPARE;
          } else if (buffer[bufferPosn] == '\n') {
            failed();
          }
        }
        bufferPosn++;
      }
    } catch (EOFException eofe) {
      return -1;
    }
    return currentDocPosn;
  }

  /**
   * Closes the underlying stream.
   * @throws IOException
   */
  public void close() throws IOException {
    in.close();
  }

  /* appends len bytes of buf, starting at offset, to dest; this way a token
   * spanning several buffer refills is accumulated rather than overwritten */
  private static byte[] append(byte[] dest, byte[] buf, int offset, int len) {
    if (len <= 0)
      return dest;
    if (dest == null) {
      byte[] result = new byte[len];
      System.arraycopy(buf, offset, result, 0, len);
      return result;
    }
    byte[] result = new byte[dest.length + len];
    System.arraycopy(dest, 0, result, 0, dest.length);
    System.arraycopy(buf, offset, result, dest.length, len);
    return result;
  }

  private void ignoreWhite() throws IOException, EOFException {
    do {
      bufferPosn++;
      adjustBuffer();
    } while (buffer[bufferPosn] == '\n' || buffer[bufferPosn] == '\r'
        || buffer[bufferPosn] == '\t' || buffer[bufferPosn] == ' ');
    bufferPosn--;
  }

  private void adjustBuffer() throws IOException, EOFException {
    if (bufferPosn >= bufferLength) {
      bufferCurrentPosn += bufferLength;
      bufferPosn = 0;
      bufferLength = in.read(buffer);
      if (bufferLength <= 0)
        throw new EOFException();
    }
  }

  private void startElement(byte[] name) {
    if ((currentNode == NODE.NULL || currentNode == NODE.FAILED)
        && equals(docTag, name)) {
      currentDocPosn = bufferCurrentPosn + bufferPosn - docTag.length - 1;
      currentNode = NODE.DOC;
    } else if (currentNode == NODE.DOC && equals(urlTag, name)) {
      currentNode = NODE.URL;
    } else if (currentNode == NODE.URL && equals(docnoTag, name)) {
      currentNode = NODE.DOC_NO;
    } else if (currentNode == NODE.DOC_NO && equals(titleTag, name)) {
      currentNode = NODE.TITLE;
    } else if (currentNode == NODE.TITLE && equals(contentTag, name)) {
      currentNode = NODE.CONTENT;
    } else {
      currentNode = NODE.FAILED;
    }
  }

  private void endElement(byte[] name) {
    if (currentNode == NODE.CONTENT && equals(contentTag, name)) {
      currentNode = NODE.SUCCEED;
    }
  }

  private void characters(byte[] text, SogouDocument doc) {
    if (currentNode == NODE.URL) {
      doc.setPathBytes(text);
    } else if (currentNode == NODE.DOC_NO) {
      doc.setIdBytes(text);
    } else if (currentNode == NODE.TITLE) {
      doc.setTitleBytes(text);
    } else if (currentNode == NODE.CONTENT) {
      doc.setContentBytes(text);
    }
  }

  private void failed() {
    currentNode = NODE.FAILED;
  }

  private boolean equals(final byte[] left, final byte[] right) {
    if (left == right)
      return true;
    if (left == null || right == null || left.length != right.length)
      return false;
    return WritableComparator.compareBytes(left, 0, left.length, right, 0,
        right.length) == 0;
  }
}
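The reader also runs on its own outside mapreduce, which is how I measured the 51m/s figure. A throwaway local driver might look like this (the ReadAll class name is made up, the file path comes from the command line, and it again assumes a no-arg SogouDocument constructor):

import java.io.FileInputStream;
import java.io.InputStream;

// Quick local driver: counts documents in a local copy of the corpus.
public class ReadAll {
  public static void main(String[] args) throws Exception {
    InputStream in = new FileInputStream(args[0]);
    SogouCorpusReader reader = new SogouCorpusReader(in);
    SogouDocument doc = new SogouDocument();
    long posn, count = 0;
    while ((posn = reader.nextDoc(doc)) >= 0) {
      count++; // posn is the byte offset that will serve as the document ID
    }
    reader.close();
    System.out.println(count + " documents");
  }
}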