转:http://gbfd2012.iteye.com/blog/732042
htmlparser在提取网站内容时,有时会出现乱码或者是编码不能转换的问题。这是htmlparser的一个小bug,因为htmlparser作为一个开源软件已经很长时间没有更新了。
例如会抛出如下异常:org.htmlparser.util.EncodingChangeException: character mismatch (new: 中 [0x4e2d] != old: [0xd6?]) for encoding change from ISO-8859-1 to GB2312 at character offset 23,或者会出现页面的乱码问题。
为了彻底避免上述问题,我们可以改下htmlparser的源码的两个类。
需要修改的是 org.htmlparser.lexer 包中的 Page 类和 InputStreamSource 类。另外我们还要用 CodepageDetectorProxy(根据二进制流来分析网页编码)来提前解析网页编码。
htmlparser中设置编码一般为
Parser parser=new Parser(url);
parser.setEncoding("bianma");
但存在漏洞。
htmlparser编码的分析过程:htmlparser会把服务器返回的文件头信息中的编码与网页meta标签中的编码进行对比;如果服务器返回的文件头编码为空,则默认采用ISO-8859-1编码,再用它与meta标签的charset里的编码对比。
改进的思路:利用CodepageDetectorProxy.jar对网页进行编码分析,获得网页的编码格式。将htmlparser的服务器返回的默认编码设置为CodepageDetectorProxy解析的编码。这样的编码和meta标签的编码总能保持一致了。代码如下:
整个修改过程如下:
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.ParsingDetector;
import java.net.MalformedURLException;
import java.net.URL;
import org.htmlparser.lexer.Page;

/**
 * Detects the character encoding of a web page up front and publishes it as
 * the default charset used by {@link Page}.
 */
public class WebEncoding {

    /** Charset reported when the page is detected as UTF-8. */
    private static final String UTF8 = "utf-8";

    /** Charset reported for every other outcome, including a failed detection. */
    private static final String FALLBACK = "gb2312";

    /**
     * Detects the encoding of the page at {@code path} and stores the result
     * in {@code Page.GaoBinDEFAULT_CHARSET}.
     *
     * <p>Only two outcomes are produced: {@code "utf-8"} when the detector
     * reports UTF-8, and {@code "gb2312"} otherwise. A malformed URL or a
     * failed detection is reported on stderr and also yields
     * {@code "gb2312"}; previously these cases ended in a
     * {@code NullPointerException}.
     *
     * @param path the URL of the page to analyse
     * @return the charset name now stored as the {@code Page} default
     */
    public String AnalyEnconding(String path) {
        URL url = null;
        try {
            url = new URL(path);
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }

        CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
        detector.add(new ParsingDetector(false));

        java.nio.charset.Charset charset = null;
        if (null != url) { // never hand a null URL to the detector
            try {
                charset = detector.detectCodepage(url);
            } catch (Exception ex) {
                ex.printStackTrace();
            }
        }

        // charset is null when the URL was malformed or detection failed.
        if (null != charset && UTF8.equalsIgnoreCase(charset.name())) {
            Page.GaoBinDEFAULT_CHARSET = UTF8;
        } else {
            Page.GaoBinDEFAULT_CHARSET = FALLBACK;
        }
        return Page.getGaoBinDEFAULT_CHARSET();
    }
}
package org.htmlparser.lexer;
import java.io.*; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.net.*; import java.util.zip.*; import org.htmlparser.http.ConnectionManager; import org.htmlparser.util.ParserException; // Referenced classes of package org.htmlparser.lexer: // InputStreamSource, PageIndex, StringSource, Cursor, // Stream, Source public class Page implements Serializable { public Page() { this(""); } public Page(URLConnection connection) throws ParserException { if(null == connection) { throw new IllegalArgumentException("connection cannot be null"); } else { setConnection(connection); mBaseUrl = null; return; } } public Page(InputStream stream, String charset) throws UnsupportedEncodingException { if(null == stream) throw new IllegalArgumentException("stream cannot be null"); if(null == charset) charset = "ISO-8859-1"; mSource = new InputStreamSource(stream, charset); mIndex = new PageIndex(this); mConnection = null; mUrl = null; mBaseUrl = null; } public Page(String text, String charset) { if(null == text) throw new IllegalArgumentException("text cannot be null"); if(null == charset) charset = "ISO-8859-1"; mSource = new StringSource(text, charset); mIndex = new PageIndex(this); mConnection = null; mUrl = null; mBaseUrl = null; } public Page(String text) { this(text, null); } public Page(Source source) { if(null == source) { throw new IllegalArgumentException("source cannot be null"); } else { mSource = source; mIndex = new PageIndex(this); mConnection = null; mUrl = null; mBaseUrl = null; return; } } public static ConnectionManager getConnectionManager() { return mConnectionManager; } public static void setConnectionManager(ConnectionManager manager) { mConnectionManager = manager; } public String getCharset(String content) { String CHARSET_STRING = "charset"; String ret; if(null == mSource) ret = "ISO-8859-1"; else ret = mSource.getEncoding(); if(null != content) { int index = content.indexOf("charset"); if(index != -1) { content 
= content.substring(index + "charset".length()).trim(); if(content.startsWith("=")) { content = content.substring(1).trim(); index = content.indexOf(";"); if(index != -1) content = content.substring(0, index); if(content.startsWith("\"") && content.endsWith("\"") && 1 < content.length()) content = content.substring(1, content.length() - 1); if(content.startsWith("'") && content.endsWith("'") && 1 < content.length()) content = content.substring(1, content.length() - 1); ret = findCharset(content, ret); } } } return ret; } public static String findCharset(String name, String fallback) { String ret; try { Class cls = Class.forName("java.nio.charset.Charset"); Method method = cls.getMethod("forName", new Class[] { java.lang.String.class }); Object object = method.invoke(null, new Object[] { name }); method = cls.getMethod("name", new Class[0]); object = method.invoke(object, new Object[0]); ret = (String)object; } catch(ClassNotFoundException cnfe) { ret = name; } catch(NoSuchMethodException nsme) { ret = name; } catch(IllegalAccessException ia) { ret = name; } catch(InvocationTargetException ita) { ret = fallback; System.out.println("unable to determine cannonical charset name for " + name + " - using " + fallback); } return ret; } private void writeObject(ObjectOutputStream out) throws IOException { if(null != getConnection()) { out.writeBoolean(true); out.writeInt(mSource.offset()); String href = getUrl(); out.writeObject(href); setUrl(getConnection().getURL().toExternalForm()); Source source = getSource(); mSource = null; PageIndex index = mIndex; mIndex = null; out.defaultWriteObject(); mSource = source; mIndex = index; } else { out.writeBoolean(false); String href = getUrl(); out.writeObject(href); setUrl(null); out.defaultWriteObject(); setUrl(href); } } private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException { boolean fromurl = in.readBoolean(); if(fromurl) { int offset = in.readInt(); String href = (String)in.readObject(); 
in.defaultReadObject(); if(null != getUrl()) { URL url = new URL(getUrl()); try { setConnection(url.openConnection()); } catch(ParserException pe) { throw new IOException(pe.getMessage()); } } Cursor cursor = new Cursor(this, 0); for(int i = 0; i < offset; i++) try { getCharacter(cursor); } catch(ParserException pe) { throw new IOException(pe.getMessage()); } setUrl(href); } else { String href = (String)in.readObject(); in.defaultReadObject(); setUrl(href); } } public void reset() { getSource().reset(); mIndex = new PageIndex(this); } public void close() throws IOException { if(null != getSource()) getSource().destroy(); } protected void finalize() throws Throwable { close(); } public URLConnection getConnection() { return mConnection; } public void setConnection(URLConnection connection) throws ParserException { mConnection = connection; mConnection.setConnectTimeout(6000); mConnection.setReadTimeout(6000); try { getConnection().connect(); } catch(UnknownHostException uhe) { throw new ParserException("Connect to " + mConnection.getURL().toExternalForm() + " failed.", uhe); } catch(IOException ioe) { throw new ParserException("Exception connecting to " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe); } String type = getContentType(); String charset = getCharset(type); try { String contentEncoding = connection.getContentEncoding(); System.out.println("contentEncoding="+contentEncoding); Stream stream; if(null != contentEncoding && -1 != contentEncoding.indexOf("gzip")) stream = new Stream(new GZIPInputStream(getConnection().getInputStream())); else if(null != contentEncoding && -1 != contentEncoding.indexOf("deflate")) stream = new Stream(new InflaterInputStream(getConnection().getInputStream(), new Inflater(true))); else{ stream = new Stream(getConnection().getInputStream()); } try { /* * 时间:2010年8月6日 * 原因:当String charset = getCharset(type);返回来的是ISO-8859-1的时候,需要处理一下 */ if(charset.indexOf("ISO-8859-1")!=-1){ charset 
=getGaoBinDEFAULT_CHARSET() ; } mSource = new InputStreamSource(stream, charset); } catch(UnsupportedEncodingException uee) { charset = "ISO-8859-1"; mSource = new InputStreamSource(stream, charset); } } catch(IOException ioe) { throw new ParserException("Exception getting input stream from " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe); } mUrl = connection.getURL().toExternalForm(); mIndex = new PageIndex(this); } public String getUrl() { return mUrl; } public void setUrl(String url) { mUrl = url; } public String getBaseUrl() { return mBaseUrl; } public void setBaseUrl(String url) { mBaseUrl = url; } public Source getSource() { return mSource; } public String getContentType() { String ret = "text/html"; URLConnection connection = getConnection(); if(null != connection) { String content = connection.getHeaderField("Content-Type"); if(null != content) ret = content; } return ret; } public char getCharacter(Cursor cursor) throws ParserException { int i = cursor.getPosition(); int offset = mSource.offset(); char ret; if(offset == i) try { i = mSource.read(); if(-1 == i) { ret = '\uFFFF'; } else { ret = (char)i; cursor.advance(); } } catch(IOException ioe) { throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe); } else if(offset > i) { try { ret = mSource.getCharacter(i); } catch(IOException ioe) { throw new ParserException("can't read a character at position " + i, ioe); } cursor.advance(); } else { throw new ParserException("attempt to read future characters from source " + i + " > " + mSource.offset()); } if('\r' == ret) { ret = '\n'; if(mSource.offset() == cursor.getPosition()) try { i = mSource.read(); if(-1 != i) if('\n' == (char)i) cursor.advance(); else try { mSource.unread(); } catch(IOException ioe) { throw new ParserException("can't unread a character at position " + cursor.getPosition(), ioe); } } catch(IOException ioe) { throw new ParserException("problem reading a character at 
position " + cursor.getPosition(), ioe); } else try { if('\n' == mSource.getCharacter(cursor.getPosition())) cursor.advance(); } catch(IOException ioe) { throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe); } } if('\n' == ret) mIndex.add(cursor); return ret; } public void ungetCharacter(Cursor cursor) throws ParserException { cursor.retreat(); int i = cursor.getPosition(); try { char ch = mSource.getCharacter(i); if('\n' == ch && 0 != i) { ch = mSource.getCharacter(i - 1); if('\r' == ch) cursor.retreat(); } } catch(IOException ioe) { throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe); } } public String getEncoding() { return getSource().getEncoding(); } public void setEncoding(String character_set) throws ParserException { Page.GaoBinDEFAULT_CHARSET = character_set; getSource().setEncoding(character_set); } public URL constructUrl(String link, String base) throws MalformedURLException { return constructUrl(link, base, false); } public URL constructUrl(String link, String base, boolean strict) throws MalformedURLException { int index; URL url; if(!strict && '?' 
== link.charAt(0)) { if(-1 != (index = base.lastIndexOf('?'))) base = base.substring(0, index); url = new URL(base + link); } else { url = new URL(new URL(base), link); } String path = url.getFile(); boolean modified = false; boolean absolute = link.startsWith("/"); if(!absolute) do { if(!path.startsWith("/.")) break; if(path.startsWith("/../")) { path = path.substring(3); modified = true; continue; } if(!path.startsWith("/./") && !path.startsWith("/.")) break; path = path.substring(2); modified = true; } while(true); while(-1 != (index = path.indexOf("/\\"))) { path = path.substring(0, index + 1) + path.substring(index + 2); modified = true; } if(modified) url = new URL(url, path); return url; } public String getAbsoluteURL(String link) { return getAbsoluteURL(link, false); } public String getAbsoluteURL(String link, boolean strict) { String ret; if(null == link || "".equals(link)) ret = ""; else try { String base = getBaseUrl(); if(null == base) base = getUrl(); if(null == base) { ret = link; } else { URL url = constructUrl(link, base, strict); ret = url.toExternalForm(); } } catch(MalformedURLException murle) { ret = link; } return ret; } public int row(Cursor cursor) { return mIndex.row(cursor); } public int row(int position) { return mIndex.row(position); } public int column(Cursor cursor) { return mIndex.column(cursor); } public int column(int position) { return mIndex.column(position); } public String getText(int start, int end) throws IllegalArgumentException { String ret; try { ret = mSource.getString(start, end - start); } catch(IOException ioe) { throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage()); } return ret; } public void getText(StringBuffer buffer, int start, int end) throws IllegalArgumentException { if(mSource.offset() < start || mSource.offset() < end) throw new IllegalArgumentException("attempt to extract future characters from source" + start + "|" + end + " > " + 
mSource.offset()); int length; if(end < start) { length = end; end = start; start = length; } length = end - start; try { mSource.getCharacters(buffer, start, length); } catch(IOException ioe) { throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage()); } } public String getText() { return getText(0, mSource.offset()); } public void getText(StringBuffer buffer) { getText(buffer, 0, mSource.offset()); } public void getText(char array[], int offset, int start, int end) throws IllegalArgumentException { if(mSource.offset() < start || mSource.offset() < end) throw new IllegalArgumentException("attempt to extract future characters from source"); int length; if(end < start) { length = end; end = start; start = length; } length = end - start; try { mSource.getCharacters(array, offset, start, end); } catch(IOException ioe) { throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage()); } } public String getLine(Cursor cursor) { int line = row(cursor); int size = mIndex.size(); int start; int end; if(line < size) { start = mIndex.elementAt(line); if(++line <= size) end = mIndex.elementAt(line); else end = mSource.offset(); } else { start = mIndex.elementAt(line - 1); end = mSource.offset(); } return getText(start, end); } public String getLine(int position) { return getLine(new Cursor(this, position)); } public String toString() { String ret; if(mSource.offset() > 0) { StringBuffer buffer = new StringBuffer(43); int start = mSource.offset() - 40; if(0 > start) start = 0; else buffer.append("..."); getText(buffer, start, mSource.offset()); ret = buffer.toString(); } else { ret = super.toString(); } return ret; } public static final String DEFAULT_CHARSET = "ISO-8859-1"; public static String GaoBinDEFAULT_CHARSET; public static final String DEFAULT_CONTENT_TYPE = "text/html"; public static final char EOF = 65535; protected String 
mUrl; protected String mBaseUrl; protected Source mSource; protected PageIndex mIndex; protected transient URLConnection mConnection; protected static ConnectionManager mConnectionManager = new ConnectionManager(); public static String getGaoBinDEFAULT_CHARSET() { return GaoBinDEFAULT_CHARSET; } public static void setGaoBinDEFAULT_CHARSET(String gaoBinDEFAULT_CHARSET) { GaoBinDEFAULT_CHARSET = gaoBinDEFAULT_CHARSET; } }
package org.htmlparser.lexer;
import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.UnsupportedEncodingException; import org.htmlparser.util.EncodingChangeException; import org.htmlparser.util.ParserException; public class InputStreamSource extends Source { /** * An initial buffer size. * Has a default value of {16384}. */ public static int BUFFER_SIZE = 16384; /** * The stream of bytes. * Set to <code>null</code> when the source is closed. */ protected transient InputStream mStream; /** * The character set in use. */ protected String mEncoding; /** * The converter from bytes to characters. */ protected transient InputStreamReader mReader; /** * The characters read so far. */ protected char[] mBuffer; /** * The number of valid bytes in the buffer. */ protected int mLevel; /** * The offset of the next byte returned by read(). */ protected int mOffset; /** * The bookmark. */ protected int mMark; /** * Create a source of characters using the default character set. * @param stream The stream of bytes to use. * @exception UnsupportedEncodingException If the default character set * is unsupported. */ public InputStreamSource (InputStream stream) throws UnsupportedEncodingException { this (stream, null, BUFFER_SIZE); } public InputStreamSource (InputStream stream, String charset) throws UnsupportedEncodingException { this (stream, charset, BUFFER_SIZE); } /** * Create a source of characters. * @param stream The stream of bytes to use. * @param charset The character set used in encoding the stream. * @param size The initial character buffer size. * @exception UnsupportedEncodingException If the character set * is unsupported. 
*/ public InputStreamSource (InputStream stream, String charset, int size) throws UnsupportedEncodingException { if (null == stream) stream = new Stream (null); else // bug #1044707 mark()/reset() issues if (!stream.markSupported ()) // wrap the stream so we can reset stream = new Stream (stream); mStream = stream; if (null == charset) { mReader = new InputStreamReader (stream); mEncoding = mReader.getEncoding (); } else { mEncoding = charset; mReader = new InputStreamReader (stream, charset); } mBuffer = new char[size]; mLevel = 0; mOffset = 0; mMark = -1; } /** * Serialization support. * @param out Where to write this object. * @exception IOException If serialization has a problem. */ private void writeObject (ObjectOutputStream out) throws IOException { int offset; char[] buffer; if (null != mStream) { // remember the offset, drain the input stream, restore the offset offset = mOffset; buffer = new char[4096]; while (EOF != read (buffer)) ; mOffset = offset; } out.defaultWriteObject (); } /** * Deserialization support. * @param in Where to read this object from. * @exception IOException If deserialization has a problem. */ private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject (); if (null != mBuffer) // buffer is null when destroy's been called // pretend we're open, mStream goes null when exhausted mStream = new ByteArrayInputStream (new byte[0]); } /** * Get the input stream being used. * @return The current input stream. */ public InputStream getStream () { return (mStream); } /** * Get the encoding being used to convert characters. * @return The current encoding. */ public String getEncoding () { return (mEncoding); } /** * Begins reading from the source with the given character set. * If the current encoding is the same as the requested encoding, * this method is a no-op. 
Otherwise any subsequent characters read from * this page will have been decoded using the given character set.<p> * Some magic happens here to obtain this result if characters have already * been consumed from this source. * Since a Reader cannot be dynamically altered to use a different character * set, the underlying stream is reset, a new Source is constructed * and a comparison made of the characters read so far with the newly * read characters up to the current position. * If a difference is encountered, or some other problem occurs, * an exception is thrown. * @param character_set The character set to use to convert bytes into * characters. * @exception ParserException If a character mismatch occurs between * characters already provided and those that would have been returned * had the new character set been in effect from the beginning. An * exception is also thrown if the underlying stream won't put up with * these shenanigans. */ public void setEncoding (String character_set) throws ParserException { String encoding; InputStream stream; char[] buffer; int offset; char[] new_chars; encoding = getEncoding (); if(encoding!=null){ character_set=encoding; } if (!encoding.equalsIgnoreCase (character_set)) { stream = getStream (); try { buffer = mBuffer; offset = mOffset; stream.reset (); try { mEncoding = character_set; mReader = new InputStreamReader (stream, character_set); mBuffer = new char[mBuffer.length]; mLevel = 0; mOffset = 0; mMark = -1; if (0 != offset) { new_chars = new char[offset]; if (offset != read (new_chars)) throw new ParserException ("reset stream failed"); for (int i = 0; i < offset; i++) if (new_chars[i] != buffer[i]) throw new EncodingChangeException ("character mismatch (new: " + new_chars[i] + " [0x" + Integer.toString (new_chars[i], 16) + "] != old: " + " [0x" + Integer.toString (buffer[i], 16) + buffer[i] + "]) for encoding change from " + encoding + " to " + character_set + " at character offset " + i); } } catch (IOException ioe) { 
throw new ParserException (ioe.getMessage (), ioe); } } catch (IOException ioe) { // bug #1044707 mark()/reset() issues throw new ParserException ("Stream reset failed (" + ioe.getMessage () + "), try wrapping it with a org.htmlparser.lexer.Stream", ioe); } } } /** * Fetch more characters from the underlying reader. * Has no effect if the underlying reader has been drained. * @param min The minimum to read. * @exception IOException If the underlying reader read() throws one. */ protected void fill (int min) throws IOException { char[] buffer; int size; int read; if (null != mReader) // mReader goes null when it's been sucked dry { size = mBuffer.length - mLevel; // available space if (size < min) // oops, better get some buffer space { // unknown length... keep doubling size = mBuffer.length * 2; read = mLevel + min; if (size < read) // or satisfy min, whichever is greater size = read; else min = size - mLevel; // read the max buffer = new char[size]; } else { buffer = mBuffer; min = size; } // read into the end of the 'new' buffer read = mReader.read (buffer, mLevel, min); if (EOF == read) { mReader.close (); mReader = null; } else { if (mBuffer != buffer) { // copy the bytes previously read System.arraycopy (mBuffer, 0, buffer, 0, mLevel); mBuffer = buffer; } mLevel += read; } // todo, should repeat on read shorter than original min } } /** * Does nothing. * It's supposed to close the source, but use destroy() instead. * @exception IOException <em>not used</em> * @see #destroy */ public void close () throws IOException { } /** * Read a single character. * This method will block until a character is available, * an I/O error occurs, or the end of the stream is reached. * @return The character read, as an integer in the range 0 to 65535 * (<tt>0x00-0xffff</tt>), or {@link #EOF EOF} if the end of the stream has * been reached * @exception IOException If an I/O error occurs. 
*/ public int read () throws IOException { int ret; if (mLevel - mOffset < 1) { if (null == mStream) throw new IOException ("source is closed"); fill (1); if (mOffset >= mLevel) ret = EOF; else ret = mBuffer[mOffset++]; } else ret = mBuffer[mOffset++]; return (ret); } /** * Read characters into a portion of an array. This method will block * until some input is available, an I/O error occurs, or the end of the * stream is reached. * @param cbuf Destination buffer * @param off Offset at which to start storing characters * @param len Maximum number of characters to read * @return The number of characters read, or {@link #EOF EOF} if the end of * the stream has been reached * @exception IOException If an I/O error occurs. */ public int read (char[] cbuf, int off, int len) throws IOException { int ret; if (null == mStream) throw new IOException ("source is closed"); if ((null == cbuf) || (0 > off) || (0 > len)) throw new IOException ("illegal argument read (" + ((null == cbuf) ? "null" : "cbuf") + ", " + off + ", " + len + ")"); if (mLevel - mOffset < len) fill (len - (mLevel - mOffset)); // minimum to satisfy this request if (mOffset >= mLevel) ret = EOF; else { ret = Math.min (mLevel - mOffset, len); System.arraycopy (mBuffer, mOffset, cbuf, off, ret); mOffset += ret; } return (ret); } /** * Read characters into an array. * This method will block until some input is available, an I/O error occurs, * or the end of the stream is reached. * @param cbuf Destination buffer. * @return The number of characters read, or {@link #EOF EOF} if the end of * the stream has been reached. * @exception IOException If an I/O error occurs. */ public int read (char[] cbuf) throws IOException { return (read (cbuf, 0, cbuf.length)); } /** * Reset the source. * Repositions the read point to begin at zero. * @exception IllegalStateException If the source has been closed. 
*/ public void reset () throws IllegalStateException { if (null == mStream) throw new IllegalStateException ("source is closed"); if (-1 != mMark) mOffset = mMark; else mOffset = 0; } /** * Tell whether this source supports the mark() operation. * @return <code>true</code>. */ public boolean markSupported () { return (true); } /** * Mark the present position in the source. * Subsequent calls to {@link #reset()} * will attempt to reposition the source to this point. * @param readAheadLimit <em>Not used.</em> * @exception IOException If the source is closed. * */ public void mark (int readAheadLimit) throws IOException { if (null == mStream) throw new IOException ("source is closed"); mMark = mOffset; } /** * Tell whether this source is ready to be read. * @return <code>true</code> if the next read() is guaranteed not to block * for input, <code>false</code> otherwise. * Note that returning false does not guarantee that the next read will block. * @exception IOException If the source is closed. */ public boolean ready () throws IOException { if (null == mStream) throw new IOException ("source is closed"); return (mOffset < mLevel); } /** * Skip characters. * This method will block until some characters are available, * an I/O error occurs, or the end of the stream is reached. * <em>Note: n is treated as an int</em> * @param n The number of characters to skip. * @return The number of characters actually skipped * @exception IllegalArgumentException If <code>n</code> is negative. * @exception IOException If an I/O error occurs. 
*/ public long skip (long n) throws IOException, IllegalArgumentException { long ret; if (null == mStream) throw new IOException ("source is closed"); if (0 > n) throw new IllegalArgumentException ("cannot skip backwards"); else { if (mLevel - mOffset < n) fill ((int)(n - (mLevel - mOffset))); // minimum to satisfy this request if (mOffset >= mLevel) ret = EOF; else { ret = Math.min (mLevel - mOffset, n); mOffset += ret; } } return (ret); } /** * Undo the read of a single character. * @exception IOException If the source is closed or no characters have * been read. */ public void unread () throws IOException { if (null == mStream) throw new IOException ("source is closed"); if (0 < mOffset) mOffset--; else throw new IOException ("can't unread no characters"); } /** * Retrieve a character again. * @param offset The offset of the character. * @return The character at <code>offset</code>. * @exception IOException If the offset is beyond {@link #offset()} or the * source is closed. */ public char getCharacter (int offset) throws IOException { char ret; if (null == mStream) throw new IOException ("source is closed"); if (offset >= mBuffer.length) throw new IOException ("illegal read ahead"); else ret = mBuffer[offset]; return (ret); } /** * Retrieve characters again. * @param array The array of characters. * @param offset The starting position in the array where characters are to be placed. * @param start The starting position, zero based. * @param end The ending position * (exclusive, i.e. the character at the ending position is not included), * zero based. * @exception IOException If the start or end is beyond {@link #offset()} * or the source is closed. */ public void getCharacters (char[] array, int offset, int start, int end) throws IOException { if (null == mStream) throw new IOException ("source is closed"); System.arraycopy (mBuffer, start, array, offset, end - start); } /** * Retrieve a string. * @param offset The offset of the first character. 
* @param length The number of characters to retrieve. * @return A string containing the <code>length</code> characters at <code>offset</code>. * @exception IOException If the offset or (offset + length) is beyond * {@link #offset()} or the source is closed. */ public String getString (int offset, int length) throws IOException { String ret; if (null == mStream) throw new IOException ("source is closed"); if (offset + length > mBuffer.length) throw new IOException ("illegal read ahead"); else ret = new String (mBuffer, offset, length); return (ret); } /** * Append characters already read into a <code>StringBuffer</code>. * @param buffer The buffer to append to. * @param offset The offset of the first character. * @param length The number of characters to retrieve. * @exception IOException If the offset or (offset + length) is beyond * {@link #offset()} or the source is closed. */ public void getCharacters (StringBuffer buffer, int offset, int length) throws IOException { if (null == mStream) throw new IOException ("source is closed"); buffer.append (mBuffer, offset, length); } /** * Close the source. * Once a source has been closed, further {@link #read() read}, * {@link #ready ready}, {@link #mark mark}, {@link #reset reset}, * {@link #skip skip}, {@link #unread unread}, * {@link #getCharacter getCharacter} or {@link #getString getString} * invocations will throw an IOException. * Closing a previously-closed source, however, has no effect. * @exception IOException If an I/O error occurs */ public void destroy () throws IOException { mStream = null; if (null != mReader) mReader.close (); mReader = null; mBuffer = null; mLevel = 0; mOffset = 0; mMark = -1; } /** * Get the position (in characters). * @return The number of characters that have already been read, or * {@link #EOF EOF} if the source is closed. */ public int offset () { int ret; if (null == mStream) ret = EOF; else ret = mOffset; return (ret); } /** * Get the number of available characters. 
* @return The number of characters that can be read without blocking or * zero if the source is closed. */ public int available () { int ret; if (null == mStream) ret = 0; else ret = mLevel - mOffset; return (ret); } }