// Parsing HTML, XML, TXT, XHTML, WML and similar documents on mobile phones

package Core;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Hashtable;

/**
 * Pull-style parser for HTML, XML, TXT, XHTML, WML and similar documents.
 * Supports CDATA sections and a whitespace-collapsing text extractor.
 *
 * <p>Usage: call one of the {@code setInput} methods, then call
 * {@link #next()} repeatedly until it returns {@link #END_DOCUMENT}. After
 * each call, {@link #getTagName()}, {@link #getAttributeValue(String, String)},
 * {@link #getText()} and {@link #getTextExtractor()} describe the current event.
 *
 * <p>The parser always runs in relaxed mode: most syntax problems are only
 * reported by printing a message to {@code System.out} (see
 * {@link #exception(String)}) and parsing continues.
 *
 * @author fonter
 * http://fonter.iteye.com
 */
public class HtmlInputStreamReader {
    
    static final private String UNEXPECTED_EOF = "Unexpected EOF";
    
    public static final String NO_NAMESPACE = "";
    
    // Event types. next() only ever reports values up to TEXT; the larger
    // values are used internally by nextImpl().
    public static final int START_DOCUMENT = 0;
    public static final int END_DOCUMENT = 1;
    public static final int START_TAG = 2;
    public static final int END_TAG = 3;
    public static final int TEXT = 4;
    public static final int CDSECT = 5;
    public static final int ENTITY_REF = 6;
    public static final int IGNORABLE_WHITESPACE = 7;
    public static final int PROCESSING_INSTRUCTION = 8;
    public static final int COMMENT = 9;
    public static final int DOCDECL = 10;
    /** Internal: markup starting with {@code <?} or {@code <!}. */
    public static final int LEGACY = 999;
    /** Internal: the {@code <?xml ...?>} declaration. */
    public static final int XML_DECL = 998;
    
    /** Characters treated as white space by the text extractor. */
    private static final char[] WHITESPACE={' ','\n','\r','\t','\f','\u200B'};
    
    // ---- input state ----
    private Reader reader;
    private String encoding;
    /** Read buffer; may be pre-filled by setInput(InputStream, String). */
    private char[] srcBuf;
    /** Next unread index in srcBuf. */
    private int srcPos;
    /** Number of valid chars in srcBuf. */
    private int srcCount;
    /** Number of characters consumed through read(). */
    private int srcLength;
    /** Look-ahead of up to two characters (line ends already normalised). */
    private int[] peek = new int[2];
    private int peekCount;
    /** True when the previous raw character was '\r'. */
    private boolean wasCR;
    private int line;
    private int column;
    
    // ---- parser configuration (fixed in this class) ----
    private boolean processNsp = true;
    private boolean relaxed = true;
    /** Set to false by next(); selects token-level behaviour where true. */
    private boolean token;
    
    // ---- current event ----
    private int type;
    private String namespace;
    private String prefix;
    private String name;
    /** True after a self-closing tag, so the matching END_TAG is reported next. */
    private boolean degenerated;
    /** True when the last named entity reference was not found in entityMap. */
    private boolean unresolved;
    private boolean isWhitespace;
    /** Pending message that nextImpl() reports as a COMMENT event. */
    private String error;
    /** Number of END_TAG events still to be synthesised for unclosed elements. */
    private int stackMismatch = 0;
    private String version;
    private Boolean standalone;
    
    // ---- text accumulation ----
    private char[] txtBuf = new char[128];
    private int txtPos;
    
    // ---- element / attribute / namespace storage ----
    private int depth;
    /** Four slots per open element: namespace, prefix, local name, raw name. */
    private String[] elementStack = new String[16];
    /** Four slots per attribute: namespace, prefix, name, value. */
    private String[] attributes = new String[16];
    private int attributeCount;
    /** Two slots per declaration: prefix (null for the default), URI. */
    private String[] nspStack = new String[8];
    /** nspCounts[d] = number of namespace declarations visible at depth d. */
    private int[] nspCounts = new int[4];
    /** Entity name to replacement text. */
    private Hashtable entityMap;
    
    /**
     * Creates a parser with an 8192-char read buffer when at least 1 MiB of
     * memory is free, otherwise a 128-char buffer.
     *
     * @throws IOException never thrown by this body; kept for caller compatibility
     */
    public HtmlInputStreamReader() throws IOException{
        srcBuf = new char[Runtime.getRuntime().freeMemory() >= 1048576 ? 8192 : 128];
    }
    
    
    /**
     * Sets the character source and resets the parser state.
     *
     * @param reader the source; when null only the event state is reset and
     *               a later {@link #nextImpl()} fails
     * @throws IOException never thrown by this body; kept for caller compatibility
     */
    public void setInput(Reader reader) throws IOException {
        this.reader = reader;
        line = 1;
        column = 0;
        type = START_DOCUMENT;
        name = null;
        namespace = null;
        degenerated = false;
        attributeCount = -1;
        encoding = null;
        version = null;
        standalone = null;
        srcLength = 0;
        // Clear state left over from a previous document so that a reused
        // parser does not emit stale errors or synthetic end tags.
        error = null;
        stackMismatch = 0;
        wasCR = false;
        unresolved = false;
        
        if (reader == null)
            return;
        
        srcPos = 0;
        srcCount = 0;
        peekCount = 0;
        depth = 0;
        
        entityMap = new Hashtable();
        entityMap.put("amp", "&");
        entityMap.put("apos", "'");
        entityMap.put("gt", ">");
        entityMap.put("lt", "<");
        entityMap.put("quot", "\"");
        entityMap.put("copy", "\251");
        entityMap.put("reg", "\256");
        entityMap.put("yen", "\245");
    }
    
    
    /**
     * Returns the character {@code pos} positions ahead without consuming it,
     * filling the look-ahead from srcBuf / the reader as needed.
     * '\r' is delivered as '\n', and a '\n' directly following a '\r' is dropped.
     *
     * @param pos 0 or 1
     * @return the character, or -1 at end of input
     */
    private final int peek(int pos) throws IOException {
        
        while (pos >= peekCount) {
            
            int nw;
            
            if (srcBuf.length <= 1)
                nw = reader.read();
            else if (srcPos < srcCount)
                nw = srcBuf[srcPos++];
            else {
                srcCount = reader.read(srcBuf, 0, srcBuf.length);
                if (srcCount <= 0)
                    nw = -1;
                else
                    nw = srcBuf[0];
                
                srcPos = 1;
            }
            
            if (nw == '\r') {
                wasCR = true;
                peek[peekCount++] = '\n';
            } else {
                if (nw == '\n') {
                    if (!wasCR)
                        peek[peekCount++] = '\n';
                } else
                    peek[peekCount++] = nw;
                
                wasCR = false;
            }
        }
        
        return peek[pos];
    }
    
    
    /** Classifies the upcoming event from the next one or two characters. */
    private final int peekType() throws IOException {
        switch (peek(0)) {
            case -1 :
                return END_DOCUMENT;
            case '&' :
                return ENTITY_REF;
            case '<' :
                switch (peek(1)) {
                    case '/' :
                        return END_TAG;
                    case '?' :
                    case '!' :
                        return LEGACY;
                    default :
                        return START_TAG;
                }
            default :
                return TEXT;
        }
    }
    
    
    /** Reports a recoverable problem; delegates to {@link #exception(String)}. */
    private final void error(String desc){
        exception(desc);
    }
    
    /** Reports a problem by printing it to System.out; parsing continues. */
    private final void exception(String desc){
        System.out.println(desc);
    }
    
    /**
     * Advances to the next low-level event and stores its kind in the event
     * type (see {@link #getEventType()}). Unlike {@link #next()}, this may
     * leave the type at values above TEXT (comments, CDATA, DOCTYPE, ...).
     *
     * @throws IllegalStateException if no input has been set
     * @throws IOException if reading the input fails
     */
    public final void nextImpl() throws IOException{
        
        if (reader == null) {
            exception("No Input specified");
            // Previously this fell through and died with a NullPointerException
            // inside peek(); fail with a meaningful exception instead.
            throw new IllegalStateException("No Input specified");
        }
        
        if (type == END_TAG)
            depth--;
        
        while (true) {
            attributeCount = -1;
            
            // degenerated needs to be handled before error because of possible
            // processor expectations(!)
            
            if (degenerated) {
                degenerated = false;
                type = END_TAG;
                return;
            }
            
            
            if (error != null) {
                for (int i = 0; i < error.length(); i++)
                    push(error.charAt(i));
                error = null;
                type = COMMENT;
                return;
            }
            
            
            // Relaxed mode: synthesise END_TAG events for elements that were
            // skipped over by a mismatched end tag or left open at EOF.
            if (relaxed
                    && (stackMismatch > 0 || (peek(0) == -1 && depth > 0))) {
                int sp = (depth - 1) << 2;
                type = END_TAG;
                namespace = elementStack[sp];
                prefix = elementStack[sp + 1];
                name = elementStack[sp + 2];
                if (stackMismatch != 1)
                    error = "missing end tag /" + name + " inserted";
                if (stackMismatch > 0)
                    stackMismatch--;
                return;
            }
            
            prefix = null;
            name = null;
            namespace = null;
            
            type = peekType();
            
            switch (type) {
                
                case ENTITY_REF :
                    pushEntity();
                    return;
                    
                case START_TAG :
                    parseStartTag(false);
                    return;
                    
                case END_TAG :
                    parseEndTag();
                    return;
                    
                case END_DOCUMENT :
                    return;
                    
                case TEXT :
                    pushText('<', !token);
                    if (depth == 0) {
                        if (isWhitespace)
                            type = IGNORABLE_WHITESPACE;
                    }
                    return;
                    
                default :
                    // LEGACY: the XML declaration is consumed silently,
                    // everything else is returned as its own event.
                    type = parseLegacy(token);
                    if (type != XML_DECL)
                        return;
            }
        }
    }
    
    /**
     * Returns the input encoding: the one passed to or detected by
     * {@link #setInput(InputStream, String)}, or the one named in the XML
     * declaration once that has been parsed; null if unknown.
     */
    public String getInputEncoding() {
        return encoding;
    }
    
    /**
     * Returns the text accumulated for the current event, or null for events
     * below TEXT (tags, document start/end) and unresolved entity references.
     */
    public String getText() {
        return type < TEXT
                || (type == ENTITY_REF && unresolved) ? null : get(0);
    }
    
    /**
     * Text extractor: like {@link #getText()} but with leading and trailing
     * white space removed and inner white-space runs collapsed to one space.
     */
    public String getTextExtractor() {
        if (type < TEXT || (type == ENTITY_REF && unresolved))
            return null;
        return appendCollapseWhiteSpace(new StringBuffer(), get(0)).toString();
    }
    
    /** Returns true if {@code ch} is one of the WHITESPACE characters. */
    public static final boolean isWhiteSpace(final char ch) {
        for (int i=0; i<WHITESPACE.length; i++)
            if (ch==WHITESPACE[i]) return true;
        return false;
    }
    
    /**
     * Appends {@code text} to {@code sb}, skipping leading and trailing white
     * space and replacing each inner white-space run by a single space.
     *
     * @return {@code sb}
     */
    static final StringBuffer appendCollapseWhiteSpace(StringBuffer sb, String text) {
        final int textLength=text.length();
        int i=0;
        boolean firstWasWhiteSpace=false;
        while (true) {
            if (i>=textLength) return sb;
            if (!isWhiteSpace(text.charAt(i))) break;
            i++;
        }
        do {
            final char ch = text.charAt(i++);
            if (isWhiteSpace(ch)) {
                firstWasWhiteSpace =true;
            } else {
                // The pending space is only emitted once more text follows,
                // which is what drops trailing white space.
                if (firstWasWhiteSpace) {
                    sb.append(' ');
                    firstWasWhiteSpace =false;
                }
                sb.append(ch);
            }
        } while (i<textLength);
        return sb;
    }
    
    /** Returns the type of the current event. */
    public int getEventType(){
        return type;
    }
    
    /**
     * Parses {@code </name>}. A tag that does not match the innermost open
     * element is matched case-insensitively against the enclosing elements;
     * the number of elements skipped is stored in stackMismatch so that
     * nextImpl() can synthesise their end tags. An end tag matching nothing
     * is reported as a COMMENT.
     */
    private final void parseEndTag()
    throws IOException{
        
        read(); // '<'
        read(); // '/'
        name = readName();
        skip();
        read('>');
        
        int sp = (depth - 1) << 2;
        
        if (depth == 0) {
            error("element stack empty");
            type = COMMENT;
            return;
        }
        
        if (!name.equals(elementStack[sp + 3])) {
            error("expected: /" + elementStack[sp + 3] + " read: " + name);
            
            // become case insensitive in relaxed mode
            
            int probe = sp;
            while (probe >= 0 && !name.toLowerCase().equals(elementStack[probe + 3].toLowerCase())) {
                stackMismatch++;
                probe -= 4;
            }
            
            if (probe < 0) {
                // unexpected end tag: ignore it
                stackMismatch = 0;
                type = COMMENT;
                return;
            }
        }
        
        namespace = elementStack[sp];
        prefix = elementStack[sp + 1];
        name = elementStack[sp + 2];
    }
    
    /**
     * Parses markup starting with {@code <?} or {@code <!}: the XML
     * declaration, processing instructions, comments, CDATA sections and
     * DOCTYPE declarations.
     *
     * @param push whether the content is appended to the text buffer
     *             (always done for CDATA)
     * @return XML_DECL, PROCESSING_INSTRUCTION, COMMENT, CDSECT or DOCDECL
     */
    private final int parseLegacy(boolean push)
    throws IOException{
        
        String req = "";
        int term;
        int result;
        int prev = 0;
        
        read(); // <
        int c = read();
        if (c == '?') {
            if ((peek(0) == 'x' || peek(0) == 'X')
            && (peek(1) == 'm' || peek(1) == 'M')) {
                
                if (push) {
                    push(peek(0));
                    push(peek(1));
                }
                read();
                read();
                
                if ((peek(0) == 'l' || peek(0) == 'L') && peek(1) <= ' ') {
                    
                    if (line != 1 || column > 4)
                        error("PI must not start with xml");
                    
                    // The declaration's pseudo-attributes are parsed like
                    // the attributes of a start tag.
                    parseStartTag(true);
                    
                    if (attributeCount < 1 || !"version".equals(attributes[2]))
                        error("version expected");
                    
                    version = attributes[3];
                    
                    int pos = 1;
                    
                    if (pos < attributeCount
                            && "encoding".equals(attributes[2 + 4])) {
                        encoding = attributes[3 + 4];
                        pos++;
                    }
                    
                    if (pos < attributeCount
                            && "standalone".equals(attributes[4 * pos + 2])) {
                        String st = attributes[3 + 4 * pos];
                        if ("yes".equals(st))
                            standalone = Boolean.TRUE;
                        else if ("no".equals(st))
                            standalone = Boolean.FALSE;
                        else
                            error("illegal standalone value: " + st);
                        pos++;
                    }
                    
                    if (pos != attributeCount)
                        error("illegal xmldecl");
                    
                    isWhitespace = true;
                    txtPos = 0;
                    
                    return XML_DECL;
                }
            }
            
            term = '?';
            result = PROCESSING_INSTRUCTION;
        } else if (c == '!') {
            if (peek(0) == '-') {
                result = COMMENT;
                req = "--";
                term = '-';
            } else if (peek(0) == '[') {
                result = CDSECT;
                req = "[CDATA[";
                term = ']';
                push = true;
            } else {
                result = DOCDECL;
                req = "DOCTYPE";
                term = -1;
            }
        } else {
            error("illegal: <" + c);
            return COMMENT;
        }
        
        // Consume the required introducer; a mismatch is only reported.
        for (int i = 0; i < req.length(); i++)
            read(req.charAt(i));
        
        if (result == DOCDECL)
            parseDoctype(push);
        else {
            while (true) {
                c = read();
                if (c == -1){
                    error(UNEXPECTED_EOF);
                    return COMMENT;
                }
                
                if (push)
                    push(c);
                
                // Terminators are "?>", "-->" and "]]>".
                if ((term == '?' || c == term)
                && peek(0) == term
                        && peek(1) == '>')
                    break;
                
                prev = c;
            }
            
            if (term == '-' && prev == '-')
                error("illegal comment delimiter: --->");
            
            read();
            read();
            
            // Drop the first terminator character, which was already pushed.
            if (push && term != '?')
                txtPos--;
            
        }
        return result;
    }
    
    /**
     * Reads a tag or attribute name. At least one character is always
     * consumed; the text buffer position is restored afterwards.
     */
    private final String readName()
    throws IOException{
        
        int pos = txtPos;
        int c = peek(0);
        if ((c < 'a' || c > 'z')
        && (c < 'A' || c > 'Z')
        && c != '_'
                && c != ':'
                && c < 0x0c0
                && !relaxed)
            error("name expected");
        
        do {
            push(read());
            c = peek(0);
        }
        while ((c >= 'a' && c <= 'z')
        || (c >= 'A' && c <= 'Z')
        || (c >= '0' && c <= '9')
        || c == '_'
                || c == '-'
                || c == ':'
                || c == '.'
                || c >= 0x0b7);
        
        String result = get(pos);
        txtPos = pos;
        return result;
    }
    
    /** Returns the text buffer content from {@code pos} to the current end. */
    private final String get(int pos) {
        return new String(txtBuf, pos, txtPos - pos);
    }
    
    /** Consumes characters up to the first one above ' ' or end of input. */
    private final void skip() throws IOException {
        
        while (true) {
            int c = peek(0);
            if (c > ' ' || c == -1)
                break;
            read();
        }
    }
    
    /**
     * Consumes a DOCTYPE declaration up to its closing '>', honouring nested
     * {@code <...>} groups and quoted literals.
     *
     * @param push whether the consumed characters go to the text buffer
     */
    private final void parseDoctype(boolean push)
    throws IOException{
        int nesting = 1;
        // Active quote character, or 0 outside a literal. Both ' and " open
        // a literal and only the same character closes it, so an apostrophe
        // inside "..." no longer hides the closing '>'.
        int quote = 0;
        
        while (true) {
            int i = read();
            switch (i) {
                
                case -1 :
                    error(UNEXPECTED_EOF);
                    return;
                    
                case '\'' :
                case '"' :
                    if (quote == 0)
                        quote = i;
                    else if (quote == i)
                        quote = 0;
                    break;
                    
                case '<' :
                    if (quote == 0)
                        nesting++;
                    break;
                    
                case '>' :
                    if (quote == 0) {
                        if ((--nesting) == 0)
                            return;
                    }
                    break;
            }
            if (push)
                push(i);
        }
    }
    
    /**
     * Appends text to the text buffer until {@code delimiter} or end of input.
     *
     * @param delimiter '<' for character data, a quote character for a quoted
     *                  attribute value, or ' ' for an unquoted attribute
     *                  value (which also ends at white space or '>')
     * @param resolveEntities whether '&' starts an entity to resolve; if
     *                  false the text ends in front of the '&'
     */
    private final void pushText(int delimiter, boolean resolveEntities)
    throws IOException{
        
        int next = peek(0);
        int cbrCount = 0;
        
        while (next != -1 && next != delimiter) { // covers eof, '<', '"'
            
            if (delimiter == ' ')
                if (next <= ' ' || next == '>')
                    break;
            
            if (next == '&') {
                if (!resolveEntities)
                    break;
                
                pushEntity();
            } else if (next == '\n' && type == START_TAG) {
                // line breaks inside attribute values become spaces
                read();
                push(' ');
            } else
                push(read());
            
            if (next == '>' && cbrCount >= 2 && delimiter != ']')
                error("Illegal: ]]>");
            
            if (next == ']')
                cbrCount++;
            else
                cbrCount = 0;
            
            next = peek(0);
        }
    }
    
    /**
     * Consumes an entity or character reference and appends its replacement
     * to the text buffer.
     *
     * <p>A reference that is not terminated by ';' is left in the text as
     * written. A terminated reference that cannot be resolved (unknown name,
     * empty name, malformed or out-of-range number) is reported and removed
     * from the text, and marks the event as unresolved.
     */
    private final void pushEntity()
    throws IOException{
        
        push(read()); // &
        
        
        int pos = txtPos;
        
        while (true) {
            int c = read();
            if (c == ';')
                break;
            if (c < 128
                    && (c < '0' || c > '9')
                    && (c < 'a' || c > 'z')
                    && (c < 'A' || c > 'Z')
                    && c != '_'
                    && c != '-'
                    && c != '#') {
                if(!relaxed){
                    error("unterminated entity ref");
                }
                if (c != -1)
                    push(c);
                return;
            }
            
            push(c);
        }
        
        String code = get(pos);
        txtPos = pos - 1; // also drops the '&'
        if (token && type == ENTITY_REF){
            name = code;
        }
        
        // The length check keeps "&;" from indexing into an empty string.
        if (code.length() > 0 && code.charAt(0) == '#') {
            int c = parseCharacterReference(code);
            unresolved = c < 0;
            if (unresolved) {
                if (!token)
                    error("unresolved: &" + code + ";");
            } else if (c > 0xFFFF) {
                // A code point beyond the BMP does not fit in one char:
                // store it as a surrogate pair instead of truncating it.
                int offset = c - 0x10000;
                push(0xD800 + (offset >> 10));
                push(0xDC00 + (offset & 0x3FF));
            } else {
                push(c);
            }
            return;
        }
        
        String result = (String) entityMap.get(code);
        
        unresolved = result == null;
        
        if (unresolved) {
            if (!token)
                error("unresolved: &" + code + ";");
        } else {
            for (int i = 0; i < result.length(); i++)
                push(result.charAt(i));
        }
    }
    
    /**
     * Decodes the body of a character reference.
     *
     * @param code the reference without '&' and ';', starting with '#',
     *             e.g. {@code "#65"}, {@code "#x41"} or {@code "#X41"}
     * @return the code point, or -1 if the number is missing, malformed or
     *         outside 0..0x10FFFF
     */
    private static int parseCharacterReference(String code) {
        try {
            boolean hex = code.length() > 1
                    && (code.charAt(1) == 'x' || code.charAt(1) == 'X');
            int value = hex
                    ? Integer.parseInt(code.substring(2), 16)
                    : Integer.parseInt(code.substring(1));
            return (value >= 0 && value <= 0x10FFFF) ? value : -1;
        } catch (NumberFormatException e) {
            return -1;
        }
    }
    
    
    /**
     * Parses a start tag with its attributes and pushes the element onto the
     * element stack. Also used for the pseudo-attributes of the XML
     * declaration.
     *
     * @param xmldecl true when parsing the XML declaration: the leading '<'
     *                has already been consumed, the construct ends at "?>"
     *                and nothing is pushed onto the element stack
     */
    private final void parseStartTag(boolean xmldecl)
    throws IOException{
        
        if (!xmldecl)
            read();
        name = readName();
        attributeCount = 0;
        
        while (true) {
            skip();
            
            int c = peek(0);
            
            if (xmldecl) {
                if (c == '?') {
                    read();
                    read('>');
                    return;
                }
            } else {
                if (c == '/') {
                    // self-closing tag: the END_TAG is reported by the
                    // next nextImpl() call
                    degenerated = true;
                    read();
                    skip();
                    read('>');
                    break;
                }
                
                if (c == '>' && !xmldecl) {
                    read();
                    break;
                }
            }
            
            if (c == -1) {
                error(UNEXPECTED_EOF);
                return;
            }
            
            String attrName = readName();
            
            if (attrName.length() == 0) {
                error("attr name expected");
                break;
            }
            
            int i = (attributeCount++) << 2;
            
            attributes = ensureCapacity(attributes, i + 4);
            
            attributes[i++] = "";
            attributes[i++] = null;
            attributes[i++] = attrName;
            
            skip();
            
            if (peek(0) != '=') {
                // value-less attribute, e.g. <input checked>
                error("Attr.value missing f. "+attrName);
                attributes[i] = "1";
            } else {
                read('=');
                skip();
                int delimiter = peek(0);
                
                if (delimiter != '\'' && delimiter != '"') {
                    // unquoted value: ends at white space or '>'
                    error("attr value delimiter missing!");
                    delimiter = ' ';
                } else
                    read();
                
                // The value is assembled in the text buffer and removed
                // from it again afterwards.
                int p = txtPos;
                pushText(delimiter, true);
                attributes[i] = get(p);
                txtPos = p;
                
                if (delimiter != ' ')
                    read(); // skip endquote
            }
        }
        
        int sp = depth++ << 2;
        
        elementStack = ensureCapacity(elementStack, sp + 4);
        elementStack[sp + 3] = name;
        
        if (depth >= nspCounts.length) {
            int[] bigger = new int[depth + 4];
            System.arraycopy(nspCounts, 0, bigger, 0, nspCounts.length);
            nspCounts = bigger;
        }
        
        nspCounts[depth] = nspCounts[depth - 1];
        
        if (processNsp)
            adjustNsp();
        else
            namespace = "";
        
        elementStack[sp] = namespace;
        elementStack[sp + 1] = prefix;
        elementStack[sp + 2] = name;
    }
    
    
    
    /**
     * Applies namespace processing to the current start tag: moves
     * {@code xmlns} / {@code xmlns:*} attributes onto the namespace stack,
     * then splits prefixed attribute names and the tag name and resolves
     * their prefixes.
     *
     * @return true if any attribute carries a prefix other than xmlns
     */
    private final boolean adjustNsp(){
        
        boolean any = false;
        
        for (int i = 0; i < attributeCount << 2; i += 4) {
            
            String attrName = attributes[i + 2];
            int cut = attrName.indexOf(':');
            String prefix;
            
            if (cut != -1) {
                prefix = attrName.substring(0, cut);
                attrName = attrName.substring(cut + 1);
            } else if (attrName.equals("xmlns")) {
                prefix = attrName;
                attrName = null;
            } else
                continue;
            
            if (!prefix.equals("xmlns")) {
                any = true;
            } else {
                int j = (nspCounts[depth]++) << 1;
                
                nspStack = ensureCapacity(nspStack, j + 2);
                nspStack[j] = attrName;
                nspStack[j + 1] = attributes[i + 3];
                
                if (attrName != null && attributes[i + 3].equals(""))
                    error("illegal empty namespace");
                
                // Remove the declaration from the attribute list and
                // re-examine the slot that moved into its place.
                System.arraycopy(
                        attributes,
                        i + 4,
                        attributes,
                        i,
                        ((--attributeCount) << 2) - i);
                
                i -= 4;
            }
        }
        
        if (any) {
            for (int i = (attributeCount << 2) - 4; i >= 0; i -= 4) {
                
                String attrName = attributes[i + 2];
                int cut = attrName.indexOf(':');
                
                if (cut == 0 && !relaxed)
                    throw new RuntimeException(
                            "illegal attribute name: " + attrName + " at " + this);
                
                else if (cut != -1) {
                    String attrPrefix = attrName.substring(0, cut);
                    
                    attrName = attrName.substring(cut + 1);
                    
                    String attrNs = getNamespace(attrPrefix);
                    
                    if (attrNs == null && !relaxed)
                        throw new RuntimeException(
                                "Undefined Prefix: " + attrPrefix + " in " + this);
                    
                    // In relaxed mode attrNs may be null here.
                    attributes[i] = attrNs;
                    attributes[i + 1] = attrPrefix;
                    attributes[i + 2] = attrName;
                }
            }
        }
        
        int cut = name.indexOf(':');
        
        if (cut == 0)
            error("illegal tag name: " + name);
        
        if (cut != -1) {
            prefix = name.substring(0, cut);
            name = name.substring(cut + 1);
        }
        
        this.namespace = getNamespace(prefix);
        
        if (this.namespace == null) {
            if (prefix != null)
                error("undefined prefix: " + prefix);
            this.namespace = NO_NAMESPACE;
        }
        
        return any;
    }
    
    /**
     * Returns the namespace URI bound to {@code prefix} at the current depth.
     *
     * @param prefix the prefix, or null for the default namespace
     * @return the URI, or null if the prefix is not declared
     */
    public String getNamespace(String prefix) {
        
        if ("xml".equals(prefix))
            return "http://www.w3.org/XML/1998/namespace";
        if ("xmlns".equals(prefix))
            return "http://www.w3.org/2000/xmlns/";
        
        // Search innermost declarations first.
        for (int i = (getNamespaceCount(depth) << 1) - 2; i >= 0; i -= 2) {
            if (prefix == null) {
                if (nspStack[i] == null)
                    return nspStack[i + 1];
            } else if (prefix.equals(nspStack[i]))
                return nspStack[i + 1];
        }
        return null;
    }
    
    /**
     * Returns the number of namespace declarations visible at {@code depth}.
     *
     * @throws IndexOutOfBoundsException if {@code depth} exceeds the current depth
     */
    public int getNamespaceCount(int depth) {
        if (depth > this.depth)
            throw new IndexOutOfBoundsException();
        return nspCounts[depth];
    }
    
    
    /** Consumes one character and reports an error if it is not {@code c}. */
    private final void read(char c)throws IOException{
        int a = read();
        if (a != c)
            error("expected: '" + c + "' actual: '" + ((char) a) + "'");
    }
    
    /**
     * Consumes and returns the next character (-1 at end of input), updating
     * the line, column and consumed-length counters.
     */
    private final int read() throws IOException {
        int result;
        
        if (peekCount == 0)
            result = peek(0);
        else {
            result = peek[0];
            peek[0] = peek[1];
        }
        peekCount--;
        
        column++;
        srcLength++;
        
        if (result == '\n') {
            
            line++;
            column = 1;
        }
        
        return result;
    }
    
    /** Appends {@code c} to the text buffer, growing the buffer if needed. */
    private final void push(int c) {
        
        isWhitespace &= c <= ' ';
        
        if (txtPos == txtBuf.length) {
            char[] bigger = new char[txtPos * 4 / 3 + 4];
            System.arraycopy(txtBuf, 0, bigger, 0, txtPos);
            txtBuf = bigger;
        }
        txtBuf[txtPos++] = (char) c;
    }
    
    /**
     * Returns {@code arr} if it has at least {@code required} slots,
     * otherwise a copy with {@code required + 16} slots.
     */
    private final String[] ensureCapacity(String[] arr, int required) {
        if (arr.length >= required)
            return arr;
        String[] bigger = new String[required + 16];
        System.arraycopy(arr, 0, bigger, 0, arr.length);
        return bigger;
    }
    
    /**
     * Sets a byte stream as input. When {@code _enc} is null the encoding is
     * guessed from the first four bytes (byte-order marks, the byte patterns
     * of {@code <} / {@code <?} in UTF-16/32, the {@code encoding} attribute
     * of an XML declaration, and a few document prefixes). If nothing
     * matches, the platform default charset is used.
     *
     * <p>Characters already decoded while sniffing stay in the read buffer
     * and are parsed before the rest of the stream.
     *
     * @param is the stream to parse
     * @param _enc the charset name, or null to auto-detect
     * @throws IllegalArgumentException if {@code is} is null
     * @throws IOException if reading fails or the encoding is not supported
     */
    public void setInput(InputStream is, String _enc) throws IOException {
        
        srcPos = 0;
        srcCount = 0;
        String enc = _enc;
        
        if (is == null)
            throw new IllegalArgumentException();
        
        try {
            
            if (enc == null) {
                // read four bytes
                
                int chk = 0;
                
                while (srcCount < 4) {
                    int i = is.read();
                    srcLength++;
                    if (i == -1)
                        break;
                    chk = (chk << 8) | i;
                    srcBuf[srcCount++] = (char) i;
                }
                
                if (srcCount == 4) {
                    switch (chk) {
                        case 0x0000FEFF : // UTF-32BE byte-order mark
                            enc = "UTF-32BE";
                            srcCount = 0;
                            break;
                            
                        case 0xFFFE0000 : // UTF-32LE byte-order mark
                            enc = "UTF-32LE";
                            srcCount = 0;
                            break;
                            
                        case 0x0000003c : // '<' in UTF-32BE
                            enc = "UTF-32BE";
                            srcBuf[0] = '<';
                            srcCount = 1;
                            break;
                            
                        case 0x3c000000 : // '<' in UTF-32LE
                            enc = "UTF-32LE";
                            srcBuf[0] = '<';
                            srcCount = 1;
                            break;
                            
                        case 0x003c003f : // "<?" in UTF-16BE
                            enc = "UTF-16BE";
                            srcBuf[0] = '<';
                            srcBuf[1] = '?';
                            srcCount = 2;
                            break;
                            
                        // ---- cases added by the author of this class ----
                        case 0x3c68746d : // "<htm": assumed to be gb2312
                            enc = "gb2312";
                            // The four bytes are ASCII and already sit in
                            // srcBuf as '<','h','t','m'. Keep all of them
                            // (srcCount stays 4); cutting the count to 1
                            // turned "<html>" into "<l>".
                            break;
                            
                        case 0x0d0a3c3f : // CR LF followed by "<?"
                            enc = "UTF-8";
                            srcBuf[0] = '<';
                            srcBuf[1] = '?';
                            srcCount = 2;
                            break;
                        // --------------------------------------------------
                            
                        case 0x3c003f00 : // "<?" in UTF-16LE
                            enc = "UTF-16LE";
                            srcBuf[0] = '<';
                            // These bytes encode '?'; storing '!' here made
                            // an XML declaration look like a DOCTYPE.
                            srcBuf[1] = '?';
                            srcCount = 2;
                            break;
                            
                        case 0x0a0a3c21 : // LF LF followed by "<!"
                            enc = "UTF-8";
                            srcBuf[0] = '<';
                            srcBuf[1] = '!';
                            srcCount = 2;
                            break;
                            
                        case 0x3c3f786d : // "<?xm": read on to the end of the
                                          // declaration to find encoding="..."
                            while (srcCount < srcBuf.length) {
                                
                                int i = is.read();
                                srcLength++;
                                if (i == -1)
                                    break;
                                srcBuf[srcCount++] = (char) i;
                                if (i == '>') {
                                    String s = new String(srcBuf, 0, srcCount);
                                    int i0 = s.indexOf("encoding");
                                    if (i0 != -1) {
                                        // Bounds are checked so a malformed
                                        // declaration cannot run off the
                                        // end of the string.
                                        while (i0 < s.length()
                                                && s.charAt(i0) != '"'
                                                && s.charAt(i0) != '\'')
                                            i0++;
                                        if (i0 < s.length()) {
                                            char deli = s.charAt(i0++);
                                            int i1 = s.indexOf(deli, i0);
                                            if (i1 != -1)
                                                enc = s.substring(i0, i1);
                                        }
                                    }
                                    if(enc == null)
                                        enc = "UTF-8";
                                    break;
                                }
                            }
                            break;
                            
                        default :
                            if ((chk & 0xffff0000) == 0xFEFF0000) {
                                // UTF-16BE BOM followed by one character
                                enc = "UTF-16BE";
                                srcBuf[0] =
                                        (char) ((srcBuf[2] << 8) | srcBuf[3]);
                                srcCount = 1;
                            } else if ((chk & 0xffff0000) == 0xfffe0000) {
                                // UTF-16LE BOM followed by one character
                                enc = "UTF-16LE";
                                srcBuf[0] =
                                        (char) ((srcBuf[3] << 8) | srcBuf[2]);
                                srcCount = 1;
                            } else if ((chk & 0xffffff00) == 0xEFBBBF00) {
                                // UTF-8 BOM followed by one byte
                                enc = "UTF-8";
                                srcBuf[0] = srcBuf[3];
                                srcCount = 1;
                            }
                    }
                }
            }
            
            // setInput(Reader) resets srcCount, so preserve the number of
            // characters already placed in srcBuf.
            int sc = srcCount;
            if (enc == null)
                setInput(new InputStreamReader(is));
            else
                setInput(new InputStreamReader(is, enc));
            // Report the encoding actually in use, including a detected one.
            encoding = enc;
            srcCount = sc;
        } catch (IOException e) {
            // Includes UnsupportedEncodingException; keep its message.
            throw e;
        } catch (Exception e) {
            throw new IOException("Invalid stream or encoding: " + e.toString());
        }
        
        
    }
    
    /**
     * Advances to the next high-level event. Comments, processing
     * instructions and DOCTYPE declarations are skipped; adjacent text,
     * entity references and CDATA sections are merged into one TEXT event.
     *
     * @return START_TAG, END_TAG, TEXT or END_DOCUMENT
     * @throws IOException if reading the input fails
     */
    public int next() throws IOException {
        
        txtPos = 0;
        isWhitespace = true;
        int minType = 9999;
        token = false;
        
        do {
            nextImpl();
            if (type < minType)
                minType = type;
        }
        while (minType > ENTITY_REF // ignorable
                || (minType >= TEXT && peekType() >= TEXT));
        
        type = minType;
        if (type > TEXT)
            type = TEXT;
        
        return type;
    }
    
    /** Returns the number of characters consumed from the input so far. */
    public int getLength(){
        return srcLength;
    }
    
    /** Returns the name of the current tag, without any namespace prefix. */
    public String getTagName(){
        return name;
    }
    
    /**
     * Returns the value of an attribute of the current start tag.
     *
     * @param namespace the attribute's namespace URI, or null to match any
     * @param name the attribute name without prefix
     * @return the value, or null if there is no such attribute
     */
    public String getAttributeValue(String namespace, String name) {
        
        for (int i = (attributeCount << 2) - 4; i >= 0; i -= 4) {
            // The stored namespace is null for an attribute whose prefix is
            // undeclared, so compare from the non-null argument.
            if (attributes[i + 2].equals(name)
            && (namespace == null || namespace.equals(attributes[i])))
                return attributes[i + 3];
        }
        
        return null;
    }
    
}

// You may also be interested in: (html, C++, c, xml, XHTML)