htmlparser在提取网站内容时,有时会出现乱码或者是编码不能转换的问题。这是htmlparser的一个小bug,因为htmlparser作为一个开源软件已经很长时间没有更新了。
例如会抛出如下异常:org.htmlparser.util.EncodingChangeException: character mismatch (new: 中 [0x4e2d] != old: [0xd6?]) for encoding change from ISO-8859-1 to GB2312 at character offset 23,或者会出现页面的乱码问题。
为了彻底避免上述问题,我们可以改下htmlparser的源码的两个类。
需要修改的是 org.htmlparser.lexer 包中的Page类和InputStreamSource类。另外我们还要用 CodepageDetectorProxy(根据二进制流来分析网页编码)来提前解析网页编码。
htmlparser中设置编码一般为
Parser parser=new Parser(url);
parser.setEncoding("bianma");
但存在漏洞。
htmlparser编码的分析过程:htmlparser会根据服务器返回的文件头信息与网页的meta标签中的编码进行对比,如果服务器返回的文件头编码为空,默认返回为ISO-8859-1的编码,它会与meta标签的charset里的编码对比。
改进的思路:利用CodepageDetectorProxy.jar对网页进行编码分析,获得网页的编码格式。将htmlparser的服务器返回的默认编码设置为CodepageDetectorProxy解析的编码。这样的编码和meta标签的编码总能保持一致了。代码如下:
整个修改过程如下:
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.ParsingDetector;
import java.net.MalformedURLException;
import java.net.URL;
import org.htmlparser.lexer.Page;
/**
 * Detects the character encoding of a web page ahead of parsing and stores it
 * in {@link Page#GaoBinDEFAULT_CHARSET}.
 */
public class WebEncoding {

    /** Charset name stored when the page is detected as UTF-8. */
    private static final String UTF8_NAME = "utf-8";

    /** Charset name stored for every other outcome, including failed detection. */
    private static final String FALLBACK_NAME = "gb2312";

    /**
     * Detects the encoding of the page at {@code path} and records it as the
     * default charset of {@link Page}.
     * <p>
     * Only two outcomes are distinguished: UTF-8 (matched case-insensitively)
     * and everything else, which is recorded as gb2312. If the URL is
     * malformed or detection fails, the stack trace is printed and the gb2312
     * fallback is used instead of failing with a NullPointerException.
     *
     * @param path the URL of the page to analyse
     * @return the charset name now stored in {@link Page#GaoBinDEFAULT_CHARSET}
     */
    public String AnalyEnconding(String path) {
        java.nio.charset.Charset charset = null;
        try {
            URL url = new URL(path);
            CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
            detector.add(new ParsingDetector(false));
            charset = detector.detectCodepage(url);
        } catch (MalformedURLException e) {
            // no usable URL: skip detection and fall through to the fallback
            e.printStackTrace();
        } catch (Exception ex) {
            // detection is best effort; the fallback below covers this case
            ex.printStackTrace();
        }
        if (charset != null && UTF8_NAME.equalsIgnoreCase(charset.name())) {
            Page.GaoBinDEFAULT_CHARSET = UTF8_NAME;
        } else {
            Page.GaoBinDEFAULT_CHARSET = FALLBACK_NAME;
        }
        return Page.getGaoBinDEFAULT_CHARSET();
    }
}
package org.htmlparser.lexer;
import java.io.*;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.*;
import java.util.zip.*;
import org.htmlparser.http.ConnectionManager;
import org.htmlparser.util.ParserException;
// Referenced classes of package org.htmlparser.lexer:
// InputStreamSource, PageIndex, StringSource, Cursor,
// Stream, Source
public class Page
implements Serializable
{
/**
 * Creates a page over an empty string; delegates to {@link #Page(String)}.
 */
public Page()
{
this("");
}
/**
 * Creates a page that reads its content from the given connection.
 *
 * @param connection the connection to read from; must not be null
 * @throws ParserException if {@link #setConnection(URLConnection)} fails
 * @throws IllegalArgumentException if {@code connection} is null
 */
public Page(URLConnection connection)
    throws ParserException
{
    if (connection == null)
        throw new IllegalArgumentException("connection cannot be null");
    setConnection(connection);
    mBaseUrl = null;
}
public Page(InputStream stream, String charset)
throws UnsupportedEncodingException
{
if(null == stream)
throw new IllegalArgumentException("stream cannot be null");
if(null == charset)
charset = "ISO-8859-1";
mSource = new InputStreamSource(stream, charset);
mIndex = new PageIndex(this);
mConnection = null;
mUrl = null;
mBaseUrl = null;
}
public Page(String text, String charset)
{
if(null == text)
throw new IllegalArgumentException("text cannot be null");
if(null == charset)
charset = "ISO-8859-1";
mSource = new StringSource(text, charset);
mIndex = new PageIndex(this);
mConnection = null;
mUrl = null;
mBaseUrl = null;
}
/**
 * Creates a page over in-memory text without naming a charset; delegates to
 * {@link #Page(String, String)} with a null charset.
 *
 * @param text the page content
 */
public Page(String text)
{
this(text, null);
}
/**
 * Creates a page that reads characters from an existing source.
 *
 * @param source the character source; must not be null
 * @throws IllegalArgumentException if {@code source} is null
 */
public Page(Source source)
{
    if (source == null)
        throw new IllegalArgumentException("source cannot be null");
    mSource = source;
    mIndex = new PageIndex(this);
    // a source-backed page has no connection and no URLs
    mConnection = null;
    mUrl = null;
    mBaseUrl = null;
}
/**
 * Returns the connection manager shared by all pages.
 *
 * @return the current static connection manager
 */
public static ConnectionManager getConnectionManager()
{
return mConnectionManager;
}
/**
 * Replaces the connection manager shared by all pages.
 *
 * @param manager the new static connection manager
 */
public static void setConnectionManager(ConnectionManager manager)
{
mConnectionManager = manager;
}
/**
 * Extracts the charset named in a Content-Type style string such as
 * {@code text/html; charset=utf-8}.
 * <p>
 * The {@code charset} parameter name is matched without regard to case, the
 * value ends at the next ';', surrounding whitespace is removed and one pair
 * of enclosing double or single quotes is stripped. The result is passed
 * through {@link #findCharset(String, String)}.
 *
 * @param content the content type text, may be null
 * @return the charset found in {@code content}; otherwise the encoding of the
 *         current source, or "ISO-8859-1" when there is no source
 */
public String getCharset(String content)
{
    final String key = "charset";
    final String ret = (null == mSource) ? "ISO-8859-1" : mSource.getEncoding();
    if (null == content)
        return ret;

    // parameter names are case-insensitive, so "Charset=" must match as well
    int index = -1;
    for (int i = 0; i + key.length() <= content.length(); i++)
        if (content.regionMatches(true, i, key, 0, key.length()))
        {
            index = i;
            break;
        }
    if (-1 == index)
        return ret;

    String value = content.substring(index + key.length()).trim();
    if (!value.startsWith("="))
        return ret;
    value = value.substring(1).trim();

    // the value ends at the next parameter separator; drop blanks before it
    int separator = value.indexOf(";");
    if (-1 != separator)
        value = value.substring(0, separator).trim();

    if (value.startsWith("\"") && value.endsWith("\"") && 1 < value.length())
        value = value.substring(1, value.length() - 1);
    if (value.startsWith("'") && value.endsWith("'") && 1 < value.length())
        value = value.substring(1, value.length() - 1);
    return findCharset(value, ret);
}
/**
 * Looks up the canonical name of a charset.
 * <p>
 * Calls {@link java.nio.charset.Charset#forName(String)} directly instead of
 * going through reflection.
 *
 * @param name the charset name or alias to resolve
 * @param fallback the value to return when {@code name} cannot be resolved
 * @return the canonical charset name, or {@code fallback} when the name is
 *         null, illegal or unsupported (a notice is printed to stdout)
 */
public static String findCharset(String name, String fallback)
{
    try
    {
        return java.nio.charset.Charset.forName(name).name();
    }
    catch (IllegalArgumentException unresolved)
    {
        // covers null, illegal and unsupported names alike
        System.out.println("unable to determine cannonical charset name for " + name + " - using " + fallback);
        return fallback;
    }
}
/**
 * Serialization hook.
 * <p>
 * A leading boolean records whether the page has a connection. With a
 * connection, the source offset and the current URL are written explicitly,
 * the URL field is replaced by the connection's URL, and the default fields
 * are written while {@code mSource} and {@code mIndex} are temporarily null.
 * Without a connection, the URL is written explicitly and is null only for
 * the duration of the default write.
 *
 * @param out the stream to write to
 * @throws IOException if writing fails
 */
private void writeObject(ObjectOutputStream out)
throws IOException
{
if(null != getConnection())
{
// marker read back by readObject: this page came from a connection
out.writeBoolean(true);
out.writeInt(mSource.offset());
String href = getUrl();
out.writeObject(href);
// the default fields carry the connection's URL
// NOTE(review): mUrl is not set back to href after the write in this branch
setUrl(getConnection().getURL().toExternalForm());
// null these out so defaultWriteObject does not serialize them
Source source = getSource();
mSource = null;
PageIndex index = mIndex;
mIndex = null;
out.defaultWriteObject();
mSource = source;
mIndex = index;
} else
{
out.writeBoolean(false);
String href = getUrl();
out.writeObject(href);
// the URL travels explicitly; keep it out of the default fields
setUrl(null);
out.defaultWriteObject();
setUrl(href);
}
}
/**
 * Deserialization hook, the counterpart of writeObject.
 * <p>
 * When the leading boolean is true, the saved offset and URL are read, the
 * default fields are restored, a connection is opened to the URL held in the
 * default fields (if any), and that many characters are read again through
 * {@link #getCharacter(Cursor)} before the saved URL is installed. Otherwise
 * only the URL and the default fields are restored.
 *
 * @param in the stream to read from
 * @throws IOException if reading fails, or a ParserException is raised while
 *         reconnecting or re-reading (only its message is carried over)
 * @throws ClassNotFoundException if a serialized class cannot be found
 */
private void readObject(ObjectInputStream in)
throws IOException, ClassNotFoundException
{
boolean fromurl = in.readBoolean();
if(fromurl)
{
int offset = in.readInt();
String href = (String)in.readObject();
in.defaultReadObject();
// after defaultReadObject the URL field holds what writeObject stored there
if(null != getUrl())
{
URL url = new URL(getUrl());
try
{
setConnection(url.openConnection());
}
catch(ParserException pe)
{
throw new IOException(pe.getMessage());
}
}
// replay the characters consumed before serialization
Cursor cursor = new Cursor(this, 0);
for(int i = 0; i < offset; i++)
try
{
getCharacter(cursor);
}
catch(ParserException pe)
{
throw new IOException(pe.getMessage());
}
setUrl(href);
} else
{
String href = (String)in.readObject();
in.defaultReadObject();
setUrl(href);
}
}
/**
 * Resets the underlying source and discards the line index by replacing it
 * with a fresh {@code PageIndex}.
 */
public void reset()
{
getSource().reset();
mIndex = new PageIndex(this);
}
/**
 * Destroys the underlying source, if there is one.
 *
 * @throws IOException if destroying the source fails
 */
public void close()
throws IOException
{
if(null != getSource())
getSource().destroy();
}
/**
 * Calls {@link #close()} when the page is finalized.
 *
 * @throws Throwable whatever {@code close()} throws
 */
protected void finalize()
throws Throwable
{
close();
}
/**
 * Returns the connection this page reads from.
 *
 * @return the connection, or null when the page was not built from one
 */
public URLConnection getConnection()
{
return mConnection;
}
/**
 * Connects to the given connection and makes its content the page source.
 * <p>
 * Connect and read timeouts are set to 6000 ms. The body is unwrapped when
 * the Content-Encoding header mentions gzip or deflate. The charset comes
 * from the Content-Type header via {@link #getCharset(String)}; when that
 * yields ISO-8859-1 and a replacement has been stored in
 * {@code GaoBinDEFAULT_CHARSET}, the replacement is used. If no replacement
 * has been stored the header charset is kept, so a null value is never passed
 * to {@code InputStreamSource}. An unsupported charset falls back to
 * ISO-8859-1.
 *
 * @param connection the connection to read the page from
 * @throws ParserException if connecting or opening the input stream fails
 */
public void setConnection(URLConnection connection)
    throws ParserException
{
    mConnection = connection;
    mConnection.setConnectTimeout(6000);
    mConnection.setReadTimeout(6000);
    try
    {
        getConnection().connect();
    }
    catch (UnknownHostException uhe)
    {
        throw new ParserException("Connect to " + mConnection.getURL().toExternalForm() + " failed.", uhe);
    }
    catch (IOException ioe)
    {
        throw new ParserException("Exception connecting to " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);
    }

    String type = getContentType();
    String charset = getCharset(type);
    try
    {
        String contentEncoding = connection.getContentEncoding();
        Stream stream;
        if (null != contentEncoding && -1 != contentEncoding.indexOf("gzip"))
            stream = new Stream(new GZIPInputStream(getConnection().getInputStream()));
        else if (null != contentEncoding && -1 != contentEncoding.indexOf("deflate"))
            stream = new Stream(new InflaterInputStream(getConnection().getInputStream(), new Inflater(true)));
        else
            stream = new Stream(getConnection().getInputStream());

        // Patch of 2010-08-06: ISO-8859-1 from the header is replaced by the
        // stored charset, but only when one has actually been stored.
        String replacement = getGaoBinDEFAULT_CHARSET();
        if (-1 != charset.indexOf("ISO-8859-1") && null != replacement)
            charset = replacement;

        try
        {
            mSource = new InputStreamSource(stream, charset);
        }
        catch (UnsupportedEncodingException uee)
        {
            charset = "ISO-8859-1";
            mSource = new InputStreamSource(stream, charset);
        }
    }
    catch (IOException ioe)
    {
        throw new ParserException("Exception getting input stream from " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);
    }
    mUrl = connection.getURL().toExternalForm();
    mIndex = new PageIndex(this);
}
/**
 * Returns the URL recorded for this page.
 *
 * @return the URL string, possibly null
 */
public String getUrl()
{
return mUrl;
}
/**
 * Records the URL of this page; no connection is opened.
 *
 * @param url the URL string to store
 */
public void setUrl(String url)
{
mUrl = url;
}
/**
 * Returns the base URL recorded for this page.
 *
 * @return the base URL string, possibly null
 */
public String getBaseUrl()
{
return mBaseUrl;
}
/**
 * Records the base URL; {@link #getAbsoluteURL(String, boolean)} prefers it
 * over the page URL.
 *
 * @param url the base URL string to store
 */
public void setBaseUrl(String url)
{
mBaseUrl = url;
}
/**
 * Returns the character source backing this page.
 *
 * @return the source, possibly null
 */
public Source getSource()
{
return mSource;
}
/**
 * Returns the Content-Type header of the connection.
 *
 * @return the header value, or "text/html" when there is no connection or
 *         the header is absent
 */
public String getContentType()
{
    URLConnection connection = getConnection();
    if (connection == null)
        return "text/html";
    String header = connection.getHeaderField("Content-Type");
    return (header != null) ? header : "text/html";
}
/**
 * Returns the character at the cursor and advances the cursor.
 * <p>
 * A character is read from the source when the cursor sits at the source
 * offset, and fetched from already-read data when the cursor is behind it.
 * A carriage return is returned as a line feed, and a line feed directly
 * following it is skipped. Each line feed returned is added to the line index.
 *
 * @param cursor the position to read at; advanced past what was consumed
 * @return the character read, or '\uFFFF' at end of input
 * @throws ParserException if the source fails, or the cursor is ahead of the
 *         source offset
 */
public char getCharacter(Cursor cursor)
throws ParserException
{
int i = cursor.getPosition();
int offset = mSource.offset();
char ret;
if(offset == i)
// cursor is at the read point: pull a new character from the source
try
{
i = mSource.read();
if(-1 == i)
{
// end of input: return the EOF marker without advancing the cursor
ret = '\uFFFF';
} else
{
ret = (char)i;
cursor.advance();
}
}
catch(IOException ioe)
{
throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);
}
else
if(offset > i)
{
// cursor is behind the read point: re-fetch a character already read
try
{
ret = mSource.getCharacter(i);
}
catch(IOException ioe)
{
throw new ParserException("can't read a character at position " + i, ioe);
}
cursor.advance();
} else
{
throw new ParserException("attempt to read future characters from source " + i + " > " + mSource.offset());
}
if('\r' == ret)
{
// report CR as LF and swallow an LF that immediately follows it
ret = '\n';
if(mSource.offset() == cursor.getPosition())
try
{
i = mSource.read();
if(-1 != i)
if('\n' == (char)i)
cursor.advance();
else
// not an LF: push the character back for the next read
try
{
mSource.unread();
}
catch(IOException ioe)
{
throw new ParserException("can't unread a character at position " + cursor.getPosition(), ioe);
}
}
catch(IOException ioe)
{
throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);
}
else
// the following character was already read: look at it in place
try
{
if('\n' == mSource.getCharacter(cursor.getPosition()))
cursor.advance();
}
catch(IOException ioe)
{
throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);
}
}
if('\n' == ret)
mIndex.add(cursor);
return ret;
}
/**
 * Moves the cursor back by one returned character.
 * <p>
 * When the character stepped over is a line feed preceded by a carriage
 * return, the cursor retreats once more, because
 * {@link #getCharacter(Cursor)} consumes that pair as one line feed.
 *
 * @param cursor the cursor to move back
 * @throws ParserException if the source cannot supply the characters
 */
public void ungetCharacter(Cursor cursor)
throws ParserException
{
cursor.retreat();
int i = cursor.getPosition();
try
{
char ch = mSource.getCharacter(i);
if('\n' == ch && 0 != i)
{
ch = mSource.getCharacter(i - 1);
if('\r' == ch)
cursor.retreat();
}
}
catch(IOException ioe)
{
throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);
}
}
/**
 * Returns the encoding reported by the underlying source.
 *
 * @return the source's encoding name
 */
public String getEncoding()
{
return getSource().getEncoding();
}
/**
 * Requests an encoding for this page.
 * <p>
 * The name is stored in the static {@code GaoBinDEFAULT_CHARSET} field, so
 * it is shared by every {@code Page}, and is then passed to the underlying
 * source's {@code setEncoding}.
 *
 * @param character_set the charset name to use
 * @throws ParserException if the source rejects the change
 */
public void setEncoding(String character_set)
throws ParserException
{
Page.GaoBinDEFAULT_CHARSET = character_set;
getSource().setEncoding(character_set);
}
/**
 * Builds a URL from a link and a base in non-strict mode; delegates to
 * {@link #constructUrl(String, String, boolean)} with {@code strict} false.
 *
 * @param link the possibly relative link
 * @param base the base URL
 * @return the resulting URL
 * @throws MalformedURLException if no valid URL can be formed
 */
public URL constructUrl(String link, String base)
throws MalformedURLException
{
return constructUrl(link, base, false);
}
/**
 * Builds a URL from a link and a base.
 * <p>
 * In non-strict mode a link that starts with '?' replaces the query of the
 * base; otherwise the link is resolved against the base by
 * {@link java.net.URL}. For links that do not start with '/', leading "/."
 * and "/.." segments left in the resulting path are stripped, and every
 * backslash that follows a slash is removed. An empty link is resolved
 * against the base instead of failing on {@code charAt(0)}.
 *
 * @param link the possibly relative link; must not be null
 * @param base the base URL
 * @param strict when true, a leading '?' gets no special treatment
 * @return the resulting URL
 * @throws MalformedURLException if no valid URL can be formed
 */
public URL constructUrl(String link, String base, boolean strict)
    throws MalformedURLException
{
    URL url;
    if (!strict && link.length() > 0 && '?' == link.charAt(0))
    {
        // query-only link: swap it in for the base's own query
        int query = base.lastIndexOf('?');
        if (-1 != query)
            base = base.substring(0, query);
        url = new URL(base + link);
    }
    else
    {
        url = new URL(new URL(base), link);
    }

    String path = url.getFile();
    boolean modified = false;
    if (!link.startsWith("/"))
        while (path.startsWith("/."))
        {
            // "/../" drops three characters, any other "/." prefix drops two
            if (path.startsWith("/../"))
                path = path.substring(3);
            else
                path = path.substring(2);
            modified = true;
        }

    int index;
    while (-1 != (index = path.indexOf("/\\")))
    {
        path = path.substring(0, index + 1) + path.substring(index + 2);
        modified = true;
    }
    if (modified)
        url = new URL(url, path);
    return url;
}
/**
 * Resolves a link in non-strict mode; delegates to
 * {@link #getAbsoluteURL(String, boolean)} with {@code strict} false.
 *
 * @param link the possibly relative link
 * @return the absolute URL, or the link itself when it cannot be resolved
 */
public String getAbsoluteURL(String link)
{
return getAbsoluteURL(link, false);
}
/**
 * Resolves a link against the base URL, or against the page URL when no
 * base URL is set.
 *
 * @param link the possibly relative link
 * @param strict passed through to {@link #constructUrl(String, String, boolean)}
 * @return "" for a null or empty link; the link itself when there is nothing
 *         to resolve against or the result is malformed; otherwise the
 *         absolute URL
 */
public String getAbsoluteURL(String link, boolean strict)
{
    if (link == null || "".equals(link))
        return "";
    String base = getBaseUrl();
    if (base == null)
        base = getUrl();
    if (base == null)
        return link;
    try
    {
        return constructUrl(link, base, strict).toExternalForm();
    }
    catch (MalformedURLException murle)
    {
        // unresolvable: hand the link back unchanged
        return link;
    }
}
/**
 * Returns the row of the cursor as reported by the line index.
 *
 * @param cursor the position to look up
 * @return the value of {@code mIndex.row(cursor)}
 */
public int row(Cursor cursor)
{
return mIndex.row(cursor);
}
/**
 * Returns the row of a character position as reported by the line index.
 *
 * @param position the character position to look up
 * @return the value of {@code mIndex.row(position)}
 */
public int row(int position)
{
return mIndex.row(position);
}
/**
 * Returns the column of the cursor as reported by the line index.
 *
 * @param cursor the position to look up
 * @return the value of {@code mIndex.column(cursor)}
 */
public int column(Cursor cursor)
{
return mIndex.column(cursor);
}
/**
 * Returns the column of a character position as reported by the line index.
 *
 * @param position the character position to look up
 * @return the value of {@code mIndex.column(position)}
 */
public int column(int position)
{
return mIndex.column(position);
}
/**
 * Returns the text between two character positions.
 *
 * @param start the position of the first character
 * @param end the position just past the last character
 * @return the characters from {@code start} up to {@code end}
 * @throws IllegalArgumentException if the source cannot supply the range
 */
public String getText(int start, int end)
    throws IllegalArgumentException
{
    try
    {
        return mSource.getString(start, end - start);
    }
    catch (IOException ioe)
    {
        throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());
    }
}
/**
 * Appends the text between two character positions to a buffer. The two
 * positions may be given in either order.
 *
 * @param buffer the buffer to append to
 * @param start one end of the range
 * @param end the other end of the range
 * @throws IllegalArgumentException if either position lies beyond the source
 *         offset, or the source cannot supply the range
 */
public void getText(StringBuffer buffer, int start, int end)
    throws IllegalArgumentException
{
    if (mSource.offset() < start || mSource.offset() < end)
        throw new IllegalArgumentException("attempt to extract future characters from source" + start + "|" + end + " > " + mSource.offset());
    // normalise so that from <= to
    final int from = Math.min(start, end);
    final int to = Math.max(start, end);
    try
    {
        mSource.getCharacters(buffer, from, to - from);
    }
    catch (IOException ioe)
    {
        throw new IllegalArgumentException("can't get the " + (to - from) + "characters at position " + from + " - " + ioe.getMessage());
    }
}
/**
 * Returns all text from position 0 up to the current source offset.
 *
 * @return the text read so far
 */
public String getText()
{
return getText(0, mSource.offset());
}
/**
 * Appends all text from position 0 up to the current source offset.
 *
 * @param buffer the buffer to append to
 */
public void getText(StringBuffer buffer)
{
getText(buffer, 0, mSource.offset());
}
/**
 * Copies the text between two character positions into an array. The two
 * positions may be given in either order.
 *
 * @param array the destination array
 * @param offset the index in {@code array} at which copying starts
 * @param start one end of the range
 * @param end the other end of the range
 * @throws IllegalArgumentException if either position lies beyond the source
 *         offset, or the source cannot supply the range
 */
public void getText(char array[], int offset, int start, int end)
    throws IllegalArgumentException
{
    if (mSource.offset() < start || mSource.offset() < end)
        throw new IllegalArgumentException("attempt to extract future characters from source");
    // normalise so that from <= to
    final int from = Math.min(start, end);
    final int to = Math.max(start, end);
    try
    {
        mSource.getCharacters(array, offset, from, to);
    }
    catch (IOException ioe)
    {
        throw new IllegalArgumentException("can't get the " + (to - from) + "characters at position " + from + " - " + ioe.getMessage());
    }
}
/**
 * Returns the text of the line containing the cursor, using the positions
 * stored in the line index.
 *
 * @param cursor a position within the wanted line
 * @return the text between the computed start and end positions
 */
public String getLine(Cursor cursor)
{
int line = row(cursor);
int size = mIndex.size();
int start;
int end;
if(line < size)
{
start = mIndex.elementAt(line);
// NOTE(review): line < size here, so ++line <= size always holds and the
// else branch is unreachable; when line becomes equal to size this calls
// elementAt(size) - confirm PageIndex tolerates that index.
if(++line <= size)
end = mIndex.elementAt(line);
else
end = mSource.offset();
} else
{
// at or past the last indexed row: take everything up to the source offset
start = mIndex.elementAt(line - 1);
end = mSource.offset();
}
return getText(start, end);
}
/**
 * Returns the text of the line containing a character position; delegates to
 * {@link #getLine(Cursor)} with a new cursor at that position.
 *
 * @param position a character position within the wanted line
 * @return the text of that line
 */
public String getLine(int position)
{
return getLine(new Cursor(this, position));
}
/**
 * Returns up to the last 40 characters read, prefixed with "..." when
 * earlier text was left out; falls back to {@code Object.toString()} when
 * the source offset is not positive.
 *
 * @return a short description of this page
 */
public String toString()
{
    if (mSource.offset() <= 0)
        return super.toString();
    StringBuffer buffer = new StringBuffer(43);
    int start = mSource.offset() - 40;
    if (start >= 0)
        buffer.append("...");
    else
        start = 0;
    getText(buffer, start, mSource.offset());
    return buffer.toString();
}
/** Name of the ISO-8859-1 charset. */
public static final String DEFAULT_CHARSET = "ISO-8859-1";
/**
 * Charset that setConnection substitutes when the header-derived charset
 * contains "ISO-8859-1". It is static and therefore shared by all pages,
 * is assigned by setEncoding and setGaoBinDEFAULT_CHARSET, and has no
 * initializer, so it is null until one of those runs.
 */
public static String GaoBinDEFAULT_CHARSET;
/** The "text/html" content type. */
public static final String DEFAULT_CONTENT_TYPE = "text/html";
/** The character value 65535 ('\uFFFF'). */
public static final char EOF = 65535;
/** URL of this page; may be null. */
protected String mUrl;
/** Base URL of this page; may be null. */
protected String mBaseUrl;
/** Character source backing this page. */
protected Source mSource;
/** Line index for this page. */
protected PageIndex mIndex;
/** Connection the page was read from; transient, so not serialized. */
protected transient URLConnection mConnection;
/** Connection manager shared by all pages. */
protected static ConnectionManager mConnectionManager = new ConnectionManager();
/**
 * Returns the shared replacement charset.
 *
 * @return the value of {@code GaoBinDEFAULT_CHARSET}; null if never assigned
 */
public static String getGaoBinDEFAULT_CHARSET() {
return GaoBinDEFAULT_CHARSET;
}
/**
 * Sets the shared replacement charset.
 *
 * @param gaoBinDEFAULT_CHARSET the charset name to store
 */
public static void setGaoBinDEFAULT_CHARSET(String gaoBinDEFAULT_CHARSET) {
GaoBinDEFAULT_CHARSET = gaoBinDEFAULT_CHARSET;
}
}
package org.htmlparser.lexer;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.UnsupportedEncodingException;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.ParserException;
public class InputStreamSource
extends
Source
{
/**
 * The initial size, in characters, of the buffer created by the
 * constructors that take no explicit size. Initialised to 16384.
 */
public static int BUFFER_SIZE = 16384;
/**
 * The stream of bytes.
 * Set to <code>null</code> when the source is closed.
 * Transient, so it is not serialized.
 */
protected transient InputStream mStream;
/**
 * The name of the character set in use.
 */
protected String mEncoding;
/**
 * The converter from bytes to characters.
 * Set to <code>null</code> by fill() once the reader reports end of input.
 */
protected transient InputStreamReader mReader;
/**
 * The characters read so far.
 */
protected char[] mBuffer;
/**
 * The number of valid characters in the buffer.
 */
protected int mLevel;
/**
 * The index in the buffer of the next character returned by read().
 */
protected int mOffset;
/**
 * The bookmark; -1 when no mark has been set.
 */
protected int mMark;
/**
 * Create a source of characters without naming a character set.
 * Delegates to the three-argument constructor with a null charset and
 * {@link #BUFFER_SIZE}.
 * @param stream The stream of bytes to use.
 * @exception UnsupportedEncodingException If the default character set
 * is unsupported.
 */
public InputStreamSource (InputStream stream)
throws
UnsupportedEncodingException
{
this (stream, null, BUFFER_SIZE);
}
/**
 * Create a source of characters with the initial buffer size
 * {@link #BUFFER_SIZE}.
 * @param stream The stream of bytes to use.
 * @param charset The character set used in encoding the stream.
 * @exception UnsupportedEncodingException If the character set
 * is unsupported.
 */
public InputStreamSource (InputStream stream, String charset)
throws
UnsupportedEncodingException
{
this (stream, charset, BUFFER_SIZE);
}
/**
 * Create a source of characters.
 * A null stream is replaced by a <code>Stream</code> built from null, and a
 * stream without mark support is wrapped in a <code>Stream</code>.
 * @param stream The stream of bytes to use.
 * @param charset The character set used in encoding the stream, or
 * <code>null</code> to let <code>InputStreamReader</code> choose.
 * @param size The initial character buffer size.
 * @exception UnsupportedEncodingException If the character set
 * is unsupported.
 */
public InputStreamSource (InputStream stream, String charset, int size)
    throws
        UnsupportedEncodingException
{
    InputStream input = stream;
    if (null == input)
        input = new Stream (null);
    else if (!input.markSupported ())
        // bug #1044707 mark()/reset() issues: wrap the stream so it can be reset
        input = new Stream (input);
    mStream = input;

    if (null != charset)
    {
        mEncoding = charset;
        mReader = new InputStreamReader (input, charset);
    }
    else
    {
        // no charset given: record whatever the reader picked
        mReader = new InputStreamReader (input);
        mEncoding = mReader.getEncoding ();
    }

    mBuffer = new char[size];
    mLevel = 0;
    mOffset = 0;
    mMark = -1;
}
/**
 * Serialization support.
 * While the source is open, the rest of the input is first read into the
 * character buffer, and the read position is then put back where it was.
 * @param out Where to write this object.
 * @exception IOException If serialization has a problem.
 */
private void writeObject (ObjectOutputStream out)
throws
IOException
{
int offset;
char[] buffer;
if (null != mStream)
{
// remember the offset, drain the input stream, restore the offset
offset = mOffset;
// scratch array: only the side effect of read() filling mBuffer matters
buffer = new char[4096];
while (EOF != read (buffer))
;
mOffset = offset;
}
out.defaultWriteObject ();
}
/**
 * Deserialization support.
 * If a buffer was restored, an empty byte stream is installed as the stream
 * so that the null checks on <code>mStream</code> treat the source as open.
 * @param in Where to read this object from.
 * @exception IOException If deserialization has a problem.
 * @exception ClassNotFoundException If a serialized class cannot be found.
 */
private void readObject (ObjectInputStream in)
throws
IOException,
ClassNotFoundException
{
in.defaultReadObject ();
if (null != mBuffer) // buffer is null when destroy's been called
// pretend we're open, mStream goes null when exhausted
mStream = new ByteArrayInputStream (new byte[0]);
}
/**
 * Get the input stream being used.
 * @return The current input stream, or <code>null</code> once the source
 * has been destroyed.
 */
public InputStream getStream ()
{
return (mStream);
}
/**
 * Get the encoding being used to convert bytes into characters.
 * @return The name of the current encoding.
 */
public String getEncoding ()
{
return (mEncoding);
}
/**
 * Requests that the source decode its bytes with the given character set.
 * <p>
 * In this patched version an encoding that is already established is never
 * replaced: when {@link #getEncoding()} is non-null the request is ignored,
 * so no <code>EncodingChangeException</code> can arise from a later charset
 * switch. The encoding is expected to be chosen correctly when the source is
 * constructed.<p>
 * Only when no encoding has been recorded is the request applied: the
 * underlying stream is reset, a new reader is created for the requested
 * character set, and the characters consumed so far are decoded again and
 * compared with those already handed out.
 * @param character_set The character set to use to convert bytes into
 * characters; <code>null</code> is ignored.
 * @exception ParserException If the stream cannot be reset or re-read, or
 * the newly decoded characters differ from those already provided.
 */
public void setEncoding (String character_set)
    throws
        ParserException
{
    final String encoding = getEncoding ();

    // deliberate: an established encoding always wins over the request
    if (null != encoding)
        return;
    // nothing to apply
    if (null == character_set)
        return;

    final InputStream stream = getStream ();
    final char[] buffer = mBuffer;
    final int offset = mOffset;
    try
    {
        stream.reset ();
    }
    catch (IOException ioe)
    { // bug #1044707 mark()/reset() issues
        throw new ParserException ("Stream reset failed ("
            + ioe.getMessage ()
            + "), try wrapping it with a org.htmlparser.lexer.Stream",
            ioe);
    }
    try
    {
        mEncoding = character_set;
        mReader = new InputStreamReader (stream, character_set);
        mBuffer = new char[mBuffer.length];
        mLevel = 0;
        mOffset = 0;
        mMark = -1;
        if (0 != offset)
        {
            // decode the consumed prefix again and verify it is unchanged
            char[] new_chars = new char[offset];
            if (offset != read (new_chars))
                throw new ParserException ("reset stream failed");
            for (int i = 0; i < offset; i++)
                if (new_chars[i] != buffer[i])
                    throw new EncodingChangeException ("character mismatch (new: "
                        + new_chars[i]
                        + " [0x"
                        + Integer.toString (new_chars[i], 16)
                        + "] != old: "
                        + " [0x"
                        + Integer.toString (buffer[i], 16)
                        + buffer[i]
                        + "]) for encoding change from "
                        + encoding
                        + " to "
                        + character_set
                        + " at character offset "
                        + i);
        }
    }
    catch (IOException ioe)
    {
        throw new ParserException (ioe.getMessage (), ioe);
    }
}
/**
 * Fetch more characters from the underlying reader.
 * Has no effect if the underlying reader has been drained.
 * When the free space in the buffer is smaller than <code>min</code>, a
 * larger array is allocated and takes the place of the buffer if the read
 * delivers characters. One read is issued per call.
 * @param min The minimum to read.
 * @exception IOException If the underlying reader read() throws one.
 */
protected void fill (int min)
throws
IOException
{
char[] buffer;
int size;
int read;
if (null != mReader) // mReader goes null when it's been sucked dry
{
size = mBuffer.length - mLevel; // available space
if (size < min) // oops, better get some buffer space
{
// unknown length... keep doubling
size = mBuffer.length * 2;
read = mLevel + min;
if (size < read) // or satisfy min, whichever is greater
size = read;
else
min = size - mLevel; // read the max
buffer = new char[size];
}
else
{
// enough room: ask for as much as fits in the existing buffer
buffer = mBuffer;
min = size;
}
// read into the end of the 'new' buffer
read = mReader.read (buffer, mLevel, min);
if (EOF == read)
{
// end of input: release the reader; mBuffer and mLevel stay as they are
mReader.close ();
mReader = null;
}
else
{
if (mBuffer != buffer)
{ // copy the characters previously read
System.arraycopy (mBuffer, 0, buffer, 0, mLevel);
mBuffer = buffer;
}
mLevel += read;
}
// todo, should repeat on read shorter than original min
}
}
/**
 * Does nothing.
 * It's supposed to close the source, but use destroy() instead.
 * @exception IOException <em>not used</em>
 * @see #destroy
 */
public void close () throws IOException
{
// intentionally empty: the source stays usable until destroy() is called
}
/**
 * Read a single character.
 * When no buffered character is left, more input is fetched with
 * <code>fill(1)</code> first.
 * @return The character read, as an integer in the range 0 to 65535
 * (<tt>0x00-0xffff</tt>), or {@link #EOF EOF} if the end of the stream has
 * been reached
 * @exception IOException If an I/O error occurs, or the buffer is exhausted
 * and the source is closed.
 */
public int read () throws IOException
{
    if (mLevel - mOffset < 1)
    {
        // buffer exhausted: try to fetch more
        if (null == mStream)
            throw new IOException ("source is closed");
        fill (1);
        if (mOffset >= mLevel)
            return (EOF);
    }
    return (mBuffer[mOffset++]);
}
/**
 * Read characters into a portion of an array.
 * When fewer than <code>len</code> characters are buffered, more input is
 * fetched with a single call to fill() first.
 * @param cbuf Destination buffer
 * @param off Offset at which to start storing characters
 * @param len Maximum number of characters to read
 * @return The number of characters read, or {@link #EOF EOF} if the end of
 * the stream has been reached
 * @exception IOException If the source is closed, an argument is illegal,
 * or an I/O error occurs.
 */
public int read (char[] cbuf, int off, int len) throws IOException
{
    if (null == mStream)
        throw new IOException ("source is closed");
    if ((null == cbuf) || (0 > off) || (0 > len))
        throw new IOException ("illegal argument read ("
            + ((null == cbuf) ? "null" : "cbuf")
            + ", " + off + ", " + len + ")");

    final int buffered = mLevel - mOffset;
    if (buffered < len)
        fill (len - buffered); // minimum to satisfy this request
    if (mOffset >= mLevel)
        return (EOF);

    final int count = Math.min (mLevel - mOffset, len);
    System.arraycopy (mBuffer, mOffset, cbuf, off, count);
    mOffset += count;
    return (count);
}
/**
 * Read characters into an array.
 * Delegates to {@link #read(char[], int, int)} for the whole array.
 * @param cbuf Destination buffer.
 * @return The number of characters read, or {@link #EOF EOF} if the end of
 * the stream has been reached.
 * @exception IOException If an I/O error occurs.
 */
public int read (char[] cbuf) throws IOException
{
return (read (cbuf, 0, cbuf.length));
}
/**
 * Reset the source.
 * Repositions the read point at the mark when one has been set, otherwise
 * at zero.
 * @exception IllegalStateException If the source has been closed.
 */
public void reset ()
    throws
        IllegalStateException
{
    if (null == mStream)
        throw new IllegalStateException ("source is closed");
    mOffset = (-1 != mMark) ? mMark : 0;
}
/**
 * Tell whether this source supports the mark() operation.
 * @return <code>true</code>, always.
 */
public boolean markSupported ()
{
return (true);
}
/**
 * Mark the present position in the source.
 * Subsequent calls to {@link #reset()} reposition the source to this point.
 * @param readAheadLimit <em>Not used.</em>
 * @exception IOException If the source is closed.
 */
public void mark (int readAheadLimit) throws IOException
{
if (null == mStream)
throw new IOException ("source is closed");
mMark = mOffset;
}
/**
 * Tell whether this source is ready to be read.
 * Reports whether unread characters remain in the buffer.
 * @return <code>true</code> if the next read() is guaranteed not to block
 * for input, <code>false</code> otherwise.
 * Note that returning false does not guarantee that the next read will block.
 * @exception IOException If the source is closed.
 */
public boolean ready () throws IOException
{
if (null == mStream)
throw new IOException ("source is closed");
return (mOffset < mLevel);
}
/**
 * Skip characters.
 * This method will block until some characters are available,
 * an I/O error occurs, or the end of the stream is reached.
 * <em>Note: n is treated as an int</em>
 * @param n The number of characters to skip.
 * @return The number of characters actually skipped, or {@link #EOF EOF}
 * when no characters are left.
 * @exception IllegalArgumentException If <code>n</code> is negative.
 * @exception IOException If an I/O error occurs.
 */
public long skip (long n)
throws
IOException,
IllegalArgumentException
{
long ret;
if (null == mStream)
throw new IOException ("source is closed");
if (0 > n)
throw new IllegalArgumentException ("cannot skip backwards");
else
{
if (mLevel - mOffset < n)
// the shortfall is narrowed to int here, hence the note above
fill ((int)(n - (mLevel - mOffset))); // minimum to satisfy this request
if (mOffset >= mLevel)
ret = EOF;
else
{
ret = Math.min (mLevel - mOffset, n);
// compound assignment narrows the long back to int
mOffset += ret;
}
}
return (ret);
}
/**
 * Undo the read of a single character by moving the read position back one.
 * @exception IOException If the source is closed or no characters have
 * been read.
 */
public void unread () throws IOException
{
if (null == mStream)
throw new IOException ("source is closed");
if (0 < mOffset)
mOffset--;
else
throw new IOException ("can't unread no characters");
}
/**
 * Retrieve a character again.
 * @param offset The offset of the character.
 * @return The character at <code>offset</code>.
 * @exception IOException If the offset is at or beyond the buffer capacity,
 * or the source is closed.
 */
public char getCharacter (int offset) throws IOException
{
char ret;
if (null == mStream)
throw new IOException ("source is closed");
// NOTE(review): the check is against the buffer capacity, not against
// mLevel or mOffset, so an offset past the data read so far returns
// whatever the array holds there - confirm callers never pass one.
if (offset >= mBuffer.length)
throw new IOException ("illegal read ahead");
else
ret = mBuffer[offset];
return (ret);
}
/**
 * Retrieve characters again.
 * Copies <code>end - start</code> characters out of the internal buffer.
 * @param array The array of characters.
 * @param offset The starting position in the array where characters are to be placed.
 * @param start The starting position, zero based.
 * @param end The ending position
 * (exclusive, i.e. the character at the ending position is not included),
 * zero based.
 * @exception IOException If the source is closed.
 */
public void getCharacters (char[] array, int offset, int start, int end) throws IOException
{
if (null == mStream)
throw new IOException ("source is closed");
// no explicit range check here; System.arraycopy rejects invalid ranges
System.arraycopy (mBuffer, start, array, offset, end - start);
}
/**
 * Retrieve a string.
 * @param offset The offset of the first character.
 * @param length The number of characters to retrieve.
 * @return A string containing the <code>length</code> characters at <code>offset</code>.
 * @exception IOException If (offset + length) is beyond the buffer capacity
 * or the source is closed.
 */
public String getString (int offset, int length) throws IOException
{
String ret;
if (null == mStream)
throw new IOException ("source is closed");
// NOTE(review): bounded by the buffer capacity rather than by the number
// of characters read (mLevel) - confirm callers stay within read data.
if (offset + length > mBuffer.length)
throw new IOException ("illegal read ahead");
else
ret = new String (mBuffer, offset, length);
return (ret);
}
/**
 * Append characters already read into a <code>StringBuffer</code>.
 * @param buffer The buffer to append to.
 * @param offset The offset of the first character.
 * @param length The number of characters to retrieve.
 * @exception IOException If the source is closed.
 */
public void getCharacters (StringBuffer buffer, int offset, int length) throws IOException
{
if (null == mStream)
throw new IOException ("source is closed");
// no explicit range check here; StringBuffer.append validates the range
buffer.append (mBuffer, offset, length);
}
/**
 * Close the source.
 * Closes the reader if one is still open, drops the stream, reader and
 * buffer references and resets the counters.
 * Once a source has been closed, further {@link #read() read},
 * {@link #ready ready}, {@link #mark mark}, {@link #reset reset},
 * {@link #skip skip}, {@link #unread unread},
 * {@link #getCharacter getCharacter} or {@link #getString getString}
 * invocations will throw an exception.
 * Closing a previously-closed source, however, has no effect.
 * @exception IOException If an I/O error occurs
 */
public void destroy () throws IOException
{
// a null mStream is what the other methods test to detect a closed source
mStream = null;
if (null != mReader)
mReader.close ();
mReader = null;
mBuffer = null;
mLevel = 0;
mOffset = 0;
mMark = -1;
}
/**
 * Get the position (in characters).
 * @return The number of characters that have already been read, or
 * {@link #EOF EOF} if the source is closed.
 */
public int offset ()
{
    return ((null == mStream) ? EOF : mOffset);
}
/**
 * Get the number of available characters.
 * @return The number of buffered characters not yet read, or zero if the
 * source is closed.
 */
public int available ()
{
    return ((null == mStream) ? 0 : mLevel - mOffset);
}
}