package Core; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.util.Hashtable; /** * @author fonter * http://fonter.iteye.com * 此类用于解析HTML,XML,TXT,XHTML,WML等文档,支持CDATA,支持Text Extractor */ public class HtmlInputStreamReader { private Reader reader; private boolean unresolved; private boolean processNsp = true; private boolean token; private boolean wasCR; private String encoding; private char[] srcBuf; private Hashtable entityMap; private boolean relaxed = true; private boolean degenerated; private String[] attributes = new String[16]; private int type; static final private String UNEXPECTED_EOF = "Unexpected EOF"; //static final private String ILLEGAL_TYPE = "Wrong event type"; public static final String NO_NAMESPACE = ""; public static final int START_DOCUMENT = 0; public static final int END_DOCUMENT = 1; public static final int START_TAG = 2; public static final int END_TAG = 3; public static final int TEXT = 4; public static final int CDSECT = 5; public static final int ENTITY_REF = 6; public static final int IGNORABLE_WHITESPACE = 7; public static final int PROCESSING_INSTRUCTION = 8; public static final int COMMENT = 9; public static final int DOCDECL = 10; public static final int LEGACY = 999; public static final int XML_DECL = 998; private String[] nspStack = new String[8]; private int[] nspCounts = new int[4]; private String version; private Boolean standalone; private char[] txtBuf = new char[128]; private int txtPos; private String error; private int srcLength; private int srcPos; private int srcCount; private int stackMismatch = 0; private String namespace; private String prefix; private String name; private String[] elementStack = new String[16]; private int line; private int column; private int[] peek = new int[2]; private int peekCount; private boolean isWhitespace; private int attributeCount; private int depth; public HtmlInputStreamReader() throws IOException{ srcBuf = new char[Runtime.getRuntime().freeMemory() >= 1048576 ? 8192 : 128]; } public void setInput(Reader reader) throws IOException { this.reader = reader; line = 1; column = 0; type = START_DOCUMENT; name = null; namespace = null; degenerated = false; attributeCount = -1; encoding = null; version = null; standalone = null; srcLength = 0; if (reader == null) return; srcPos = 0; srcCount = 0; peekCount = 0; depth = 0; entityMap = new Hashtable(); entityMap.put("amp", "&"); entityMap.put("apos", "'"); entityMap.put("gt", ">"); entityMap.put("lt", "<"); entityMap.put("quot", "\""); entityMap.put("copy", "\251"); entityMap.put("reg", "\256"); entityMap.put("yen", "\245"); } private final int peek(int pos) throws IOException { while (pos >= peekCount) { int nw; if (srcBuf.length <= 1) nw = reader.read(); else if (srcPos < srcCount) nw = srcBuf[srcPos++]; else { srcCount = reader.read(srcBuf, 0, srcBuf.length); if (srcCount <= 0) nw = -1; else nw = srcBuf[0]; srcPos = 1; } if (nw == '\r') { wasCR = true; peek[peekCount++] = '\n'; } else { if (nw == '\n') { if (!wasCR) peek[peekCount++] = '\n'; } else peek[peekCount++] = nw; wasCR = false; } } return peek[pos]; } private final int peekType() throws IOException { switch (peek(0)) { case -1 : return END_DOCUMENT; case '&' : return ENTITY_REF; case '<' : switch (peek(1)) { case '/' : return END_TAG; case '?' : case '!' : return LEGACY; default : return START_TAG; } default : return TEXT; } } private final void error(String desc){ exception(desc); } private final void exception(String desc){ System.out.println(desc); } public final void nextImpl() throws IOException{ if (reader == null) exception("No Input specified"); if (type == END_TAG) depth--; while (true) { attributeCount = -1; // degenerated needs to be handled before error because of possible // processor expectations(!) if (degenerated) { degenerated = false; type = END_TAG; return; } if (error != null) { for (int i = 0; i < error.length(); i++) push(error.charAt(i)); // text = error; error = null; type = COMMENT; return; } if (relaxed && (stackMismatch > 0 || (peek(0) == -1 && depth > 0))) { int sp = (depth - 1) << 2; type = END_TAG; namespace = elementStack[sp]; prefix = elementStack[sp + 1]; name = elementStack[sp + 2]; if (stackMismatch != 1) error = "missing end tag /" + name + " inserted"; if (stackMismatch > 0) stackMismatch--; return; } prefix = null; name = null; namespace = null; // text = null; type = peekType(); //System.out.println("Markup:"+type); switch (type) { case ENTITY_REF : pushEntity(); return; case START_TAG : parseStartTag(false); return; case END_TAG : parseEndTag(); return; case END_DOCUMENT : return; case TEXT : pushText('<', !token); if (depth == 0) { if (isWhitespace) type = IGNORABLE_WHITESPACE; // make exception switchable for instances.chg... !!!! // else // exception ("text '"+getText ()+"' not allowed outside root element"); } return; default : type = parseLegacy(token); if (type != XML_DECL) return; } } } // boolean isEND(){ // return isEOF; //} public String getInputEncoding() { return encoding; } public String getText() { return type < TEXT || (type == ENTITY_REF && unresolved) ? null : get(0); } //text Extractor public String getTextExtractor() { //String s = get(0).; StringBuffer sb = new StringBuffer(); return type < TEXT || (type == ENTITY_REF && unresolved) ? null : appendCollapseWhiteSpace(sb,get(0)).toString(); } private static final char[] WHITESPACE={' ','\n','\r','\t','\f','\u200B'}; public static final boolean isWhiteSpace(final char ch) { for (int i=0; i<WHITESPACE.length; i++) if (ch==WHITESPACE[i]) return true; return false; } static final StringBuffer appendCollapseWhiteSpace(StringBuffer sb, String text) { final int textLength=text.length(); int i=0; boolean firstWasWhiteSpace=false; while (true) { if (i>=textLength) return sb; if (!isWhiteSpace(text.charAt(i))) break; i++; } do { final char ch = text.charAt(i++); if (isWhiteSpace(ch)) { firstWasWhiteSpace =true; } else { if (firstWasWhiteSpace) { sb.append(' '); firstWasWhiteSpace =false; } sb.append(ch); } } while (i<textLength); return sb; } public int getEventType(){ return type; } private final void parseEndTag() throws IOException{ read(); // '<' read(); // '/' name = readName(); //System.out.println("EndTag:"+name); skip(); read('>'); int sp = (depth - 1) << 2; if (depth == 0) { error("element stack empty"); type = COMMENT; return; } if (!name.equals(elementStack[sp + 3])) { error("expected: /" + elementStack[sp + 3] + " read: " + name); // become case insensitive in relaxed mode int probe = sp; while (probe >= 0 && !name.toLowerCase().equals(elementStack[probe + 3].toLowerCase())) { stackMismatch++; probe -= 4; } if (probe < 0) { stackMismatch = 0; // text = "unexpected end tag ignored"; type = COMMENT; return; } } namespace = elementStack[sp]; prefix = elementStack[sp + 1]; name = elementStack[sp + 2]; } private final int parseLegacy(boolean push) throws IOException{ String req = ""; int term; int result; int prev = 0; read(); // < int c = read(); if (c == '?') { if ((peek(0) == 'x' || peek(0) == 'X') && (peek(1) == 'm' || peek(1) == 'M')) { if (push) { push(peek(0)); push(peek(1)); } read(); read(); if ((peek(0) == 'l' || peek(0) == 'L') && peek(1) <= ' ') { if (line != 1 || column > 4) error("PI must not start with xml"); parseStartTag(true); if (attributeCount < 1 || !"version".equals(attributes[2])) error("version expected"); version = attributes[3]; int pos = 1; if (pos < attributeCount && "encoding".equals(attributes[2 + 4])) { encoding = attributes[3 + 4]; pos++; } if (pos < attributeCount && "standalone".equals(attributes[4 * pos + 2])) { String st = attributes[3 + 4 * pos]; if ("yes".equals(st)) standalone = new Boolean(true); else if ("no".equals(st)) standalone = new Boolean(false); else error("illegal standalone value: " + st); pos++; } if (pos != attributeCount) error("illegal xmldecl"); isWhitespace = true; txtPos = 0; return XML_DECL; } } /* int c0 = read (); int c1 = read (); int */ term = '?'; result = PROCESSING_INSTRUCTION; } else if (c == '!') { if (peek(0) == '-') { result = COMMENT; req = "--"; term = '-'; } else if (peek(0) == '[') { result = CDSECT; req = "[CDATA["; term = ']'; push = true; } else { result = DOCDECL; req = "DOCTYPE"; term = -1; } } else { error("illegal: <" + c); return COMMENT; } for (int i = 0; i < req.length(); i++) read(req.charAt(i)); if (result == DOCDECL) parseDoctype(push); else { while (true) { c = read(); if (c == -1){ error(UNEXPECTED_EOF); return COMMENT; } if (push) push(c); if ((term == '?' || c == term) && peek(0) == term && peek(1) == '>') break; prev = c; } if (term == '-' && prev == '-') error("illegal comment delimiter: --->"); read(); read(); if (push && term != '?') txtPos--; } return result; } private final String readName() throws IOException{ int pos = txtPos; int c = peek(0); if ((c < 'a' || c > 'z') && (c < 'A' || c > 'Z') && c != '_' && c != ':' && c < 0x0c0 && !relaxed) error("name expected"); do { push(read()); c = peek(0); } while ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c == ':' || c == '.' || c >= 0x0b7); String result = get(pos); txtPos = pos; return result; } private final String get(int pos) { return new String(txtBuf, pos, txtPos - pos); } private final void skip() throws IOException { while (true) { int c = peek(0); if (c > ' ' || c == -1) break; read(); } } private final void parseDoctype(boolean push) throws IOException{ int nesting = 1; boolean quoted = false; // read(); while (true) { int i = read(); switch (i) { case -1 : error(UNEXPECTED_EOF); return; case '\'' : quoted = !quoted; break; case '<' : if (!quoted) nesting++; break; case '>' : if (!quoted) { if ((--nesting) == 0) return; } break; } if (push) push(i); } } private final void pushText(int delimiter, boolean resolveEntities) throws IOException{ int next = peek(0); int cbrCount = 0; while (next != -1 && next != delimiter) { // covers eof, '<', '"' if (delimiter == ' ') if (next <= ' ' || next == '>') break; if (next == '&') { if (!resolveEntities) break; pushEntity(); } else if (next == '\n' && type == START_TAG) { read(); push(' '); } else push(read()); if (next == '>' && cbrCount >= 2 && delimiter != ']') error("Illegal: ]]>"); if (next == ']') cbrCount++; else cbrCount = 0; next = peek(0); } } private final void pushEntity() throws IOException{ push(read()); // & int pos = txtPos; while (true) { int c = read(); if (c == ';') break; if (c < 128 && (c < '0' || c > '9') && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z') && c != '_' && c != '-' && c != '#') { if(!relaxed){ error("unterminated entity ref"); } //; ends with:"+(char)c); if (c != -1) push(c); return; } push(c); } String code = get(pos); txtPos = pos - 1; if (token && type == ENTITY_REF){ name = code; } if (code.charAt(0) == '#') { int c = (code.charAt(1) == 'x' ? Integer.parseInt(code.substring(2), 16) : Integer.parseInt(code.substring(1))); push(c); return; } String result = (String) entityMap.get(code); unresolved = result == null; if (unresolved) { if (!token) error("unresolved: &" + code + ";"); } else { for (int i = 0; i < result.length(); i++) push(result.charAt(i)); } } private final void parseStartTag(boolean xmldecl) throws IOException{ if (!xmldecl) read(); name = readName(); //System.out.println("StartTag:"+name); attributeCount = 0; while (true) { skip(); int c = peek(0); if (xmldecl) { if (c == '?') { read(); read('>'); return; } } else { if (c == '/') { degenerated = true; read(); skip(); read('>'); break; } if (c == '>' && !xmldecl) { read(); break; } } if (c == -1) { error(UNEXPECTED_EOF); //type = COMMENT; return; } String attrName = readName(); if (attrName.length() == 0) { error("attr name expected"); break; } int i = (attributeCount++) << 2; attributes = ensureCapacity(attributes, i + 4); attributes[i++] = ""; attributes[i++] = null; attributes[i++] = attrName; skip(); if (peek(0) != '=') { error("Attr.value missing f. "+attrName); attributes[i] = "1"; } else { read('='); skip(); int delimiter = peek(0); if (delimiter != '\'' && delimiter != '"') { error("attr value delimiter missing!"); delimiter = ' '; } else read(); int p = txtPos; pushText(delimiter, true); String skdkfk = get(p); attributes[i] = skdkfk; System.out.println("attributes:"+skdkfk); txtPos = p; if (delimiter != ' ') read(); // skip endquote } } int sp = depth++ << 2; elementStack = ensureCapacity(elementStack, sp + 4); elementStack[sp + 3] = name; if (depth >= nspCounts.length) { int[] bigger = new int[depth + 4]; System.arraycopy(nspCounts, 0, bigger, 0, nspCounts.length); nspCounts = bigger; } nspCounts[depth] = nspCounts[depth - 1]; /* if(!relaxed){ for (int i = attributeCount - 1; i > 0; i--) { for (int j = 0; j < i; j++) { if (getAttributeName(i).equals(getAttributeName(j))) exception("Duplicate Attribute: " + getAttributeName(i)); } } } */ if (processNsp) adjustNsp(); else namespace = ""; elementStack[sp] = namespace; elementStack[sp + 1] = prefix; elementStack[sp + 2] = name; } private final boolean adjustNsp(){ boolean any = false; for (int i = 0; i < attributeCount << 2; i += 4) { // * 4 - 4; i >= 0; i -= 4) { String attrName = attributes[i + 2]; int cut = attrName.indexOf(':'); String prefix; if (cut != -1) { prefix = attrName.substring(0, cut); attrName = attrName.substring(cut + 1); } else if (attrName.equals("xmlns")) { prefix = attrName; attrName = null; } else continue; if (!prefix.equals("xmlns")) { any = true; } else { int j = (nspCounts[depth]++) << 1; nspStack = ensureCapacity(nspStack, j + 2); nspStack[j] = attrName; nspStack[j + 1] = attributes[i + 3]; if (attrName != null && attributes[i + 3].equals("")) error("illegal empty namespace"); // prefixMap = new PrefixMap (prefixMap, attrName, attr.getValue ()); //System.out.println (prefixMap); System.arraycopy( attributes, i + 4, attributes, i, ((--attributeCount) << 2) - i); i -= 4; } } if (any) { for (int i = (attributeCount << 2) - 4; i >= 0; i -= 4) { String attrName = attributes[i + 2]; int cut = attrName.indexOf(':'); if (cut == 0 && !relaxed) throw new RuntimeException( "illegal attribute name: " + attrName + " at " + this); else if (cut != -1) { String attrPrefix = attrName.substring(0, cut); attrName = attrName.substring(cut + 1); String attrNs = getNamespace(attrPrefix); if (attrNs == null && !relaxed) throw new RuntimeException( "Undefined Prefix: " + attrPrefix + " in " + this); attributes[i] = attrNs; attributes[i + 1] = attrPrefix; attributes[i + 2] = attrName; /* if (!relaxed) { for (int j = (attributeCount << 2) - 4; j > i; j -= 4) if (attrName.equals(attributes[j + 2]) && attrNs.equals(attributes[j])) exception( "Duplicate Attribute: {" + attrNs + "}" + attrName); } */ } } } int cut = name.indexOf(':'); if (cut == 0) error("illegal tag name: " + name); if (cut != -1) { prefix = name.substring(0, cut); name = name.substring(cut + 1); } this.namespace = getNamespace(prefix); if (this.namespace == null) { if (prefix != null) error("undefined prefix: " + prefix); this.namespace = NO_NAMESPACE; } return any; } //获取命名空间 public String getNamespace(String prefix) { if ("xml".equals(prefix)) return "http://www.w3.org/XML/1998/namespace"; if ("xmlns".equals(prefix)) return "http://www.w3.org/2000/xmlns/"; for (int i = (getNamespaceCount(depth) << 1) - 2; i >= 0; i -= 2) { if (prefix == null) { if (nspStack[i] == null) return nspStack[i + 1]; } else if (prefix.equals(nspStack[i])) return nspStack[i + 1]; } return null; } public int getNamespaceCount(int depth) { if (depth > this.depth) throw new IndexOutOfBoundsException(); return nspCounts[depth]; } private final void read(char c)throws IOException{ int a = read(); if (a != c) error("expected: '" + c + "' actual: '" + ((char) a) + "'"); } private final int read() throws IOException { int result; if (peekCount == 0) result = peek(0); else { result = peek[0]; peek[0] = peek[1]; } // else { // result = peek[0]; // System.arraycopy (peek, 1, peek, 0, peekCount-1); // } peekCount--; column++; srcLength++; if (result == '\n') { line++; column = 1; } return result; } private final void push(int c) { isWhitespace &= c <= ' '; if (txtPos == txtBuf.length) { char[] bigger = new char[txtPos * 4 / 3 + 4]; System.arraycopy(txtBuf, 0, bigger, 0, txtPos); txtBuf = bigger; } txtBuf[txtPos++] = (char) c; } private final String[] ensureCapacity(String[] arr, int required) { if (arr.length >= required) return arr; String[] bigger = new String[required + 16]; System.arraycopy(arr, 0, bigger, 0, arr.length); return bigger; } //设置编码 public void setInput(InputStream is, String _enc) throws IOException { srcPos = 0; srcCount = 0; String enc = _enc; if (is == null) throw new IllegalArgumentException(); try { if (enc == null) { // read four bytes int chk = 0; while (srcCount < 4) { int i = is.read(); srcLength++; if (i == -1) break; chk = (chk << 8) | i; srcBuf[srcCount++] = (char) i; } System.out.println(chk); if (srcCount == 4) { switch (chk) { case 0x00000FEFF : enc = "UTF-32BE"; srcCount = 0; break; case 0x0FFFE0000 : enc = "UTF-32LE"; srcCount = 0; break; case 0x03c : enc = "UTF-32BE"; srcBuf[0] = '<'; srcCount = 1; break; case 0x03c000000 : enc = "UTF-32LE"; srcBuf[0] = '<'; srcCount = 1; break; case 0x0003c003f : enc = "UTF-16BE"; srcBuf[0] = '<'; srcBuf[1] = '?'; srcCount = 2; break; // 这是我加上去的---------------------------------- case 0x3c68746d: //System.out.println("ssdesdfdf"); enc = "gb2312"; srcBuf[0] = '<'; //srcBuf[1] = '?'; srcCount = 1; break; case 0xd0a3c3f: enc = "UTF-8"; srcBuf[0] = '<'; srcBuf[1] = '?'; srcCount = 2; break; //------------------------------------------- case 0x03c003f00 : enc = "UTF-16LE"; srcBuf[0] = '<'; srcBuf[1] = '!'; srcCount = 2; break; case 0xa0a3c21: enc = "UTF-8"; srcBuf[0] = '<'; srcBuf[1] = '!'; srcCount = 2; break; //case 0x03c21444f: //enc = "gb2312"; //srcBuf[0] = '<'; //srcBuf[1] = '!'; //srcCount = 2; //break; case 0x03c3f786d : while (true) { int i = is.read(); srcLength++; if (i == -1) break; srcBuf[srcCount++] = (char) i; if (i == '>') { String s = new String(srcBuf, 0, srcCount); int i0 = s.indexOf("encoding"); if (i0 != -1) { while (s.charAt(i0) != '"' && s.charAt(i0) != '\'') i0++; char deli = s.charAt(i0++); int i1 = s.indexOf(deli, i0); enc = s.substring(i0, i1); } if(enc == null) enc = "UTF-8"; break; } } default : if ((chk & 0x0ffff0000) == 0x0FEFF0000) { enc = "UTF-16BE"; srcBuf[0] = (char) ((srcBuf[2] << 8) | srcBuf[3]); srcCount = 1; } else if ((chk & 0x0ffff0000) == 0x0fffe0000) { enc = "UTF-16LE"; srcBuf[0] = (char) ((srcBuf[3] << 8) | srcBuf[2]); srcCount = 1; } else if ((chk & 0x0ffffff00) == 0x0EFBBBF00) { enc = "UTF-8"; srcBuf[0] = srcBuf[3]; srcCount = 1; } } } } System.out.println(enc); //if (enc == null) //enc = "gb2312"; int sc = srcCount; if (enc == null) setInput(new InputStreamReader(is)); else setInput(new InputStreamReader(is, enc)); encoding = _enc; srcCount = sc; } catch (Exception e) { throw new IOException(); } } public int next() throws IOException { txtPos = 0; isWhitespace = true; int minType = 9999; token = false; do { nextImpl(); if (type < minType) minType = type; // if (curr <= TEXT) type = curr; } while (minType > ENTITY_REF // ignorable || (minType >= TEXT && peekType() >= TEXT)); type = minType; if (type > TEXT) type = TEXT; return type; } public int getLength(){ return srcLength; } //获取标签名 public String getTagName(){ return name; } //获取标签属性 public String getAttributeValue(String namespace, String name) { for (int i = (attributeCount << 2) - 4; i >= 0; i -= 4) { if (attributes[i + 2].equals(name) && (namespace == null || attributes[i].equals(namespace))) return attributes[i + 3]; } return null; } }