jQuery源码二周目#10 Sizzle 词法解析

注意：之后的讲解统一以 `div > p + .aaron[type="checkbox"], #id:first-child` 这个选择器为例。

词法解析

拿到这个选择器之后首先要做的事情是把它分解成一个个单元，单元结构如下

{
    value: '匹配到的字符串',
    type: '对应的Token类型',
    matches: '正则匹配到的一个结构'
}

上面的选择器拆分之后就是

[
  [
    { value: "div", type: "TAG", matches: ["div"] },
    { value: " > ", type: ">" },
    { value: "p", type: "TAG", matches: ["p"] },
    { value: " + ", type: "+" },
    { value: ".aaron", type: "CLASS", matches: ["aaron"] },
    { value: "[type=\"checkbox\"]", type: "ATTR", matches: ["type", "=", "checkbox"] }
  ],
  [
    { value: "#id", type: "ID", matches: ["id"] },
    { value: ":first-child", type: "CHILD", matches: Array }
  ]
]

这一个过程叫做词法解析

Sizzle.tokenize

Sizzle.tokenize()就是词法解析的接口，将css选择器传入就能得到上面的Token序列，接下来用代码去实现。

整体架构
首先搭建一个大体的框架，和jQuery整体架构一样，外部用一个即时函数包裹

(function(window, undefined) {

    // Placeholder for the selector engine entry point; the body is intentionally
    // empty at this stage of the walkthrough.
    function Sizzle() {

    }

    // Publish the engine on the object passed in as `window` so that code
    // outside this closure can reach it.
    window.Sizzle = Sizzle;
})(window);

主体
代码中的注释已经讲解得比较详细，剩下的就是在 debug 模式下一步步调试，观察代码是如何执行的。其实词法解析这一篇可以跳过：它的任务只是把 CSS 选择器解析成 Token 序列，即使不理解 Sizzle.tokenize 的源码也不影响后面的学习，只需要知道有“解析”这一步即可。至于具体怎么解析，能力强的话可以自己实现一个；像我这样的菜鸟就选择照搬 jQuery 源码了。

/**
 * Lexer: split a CSS selector string into groups of tokens.
 *
 * The result is an array of groups, one per comma-separated selector. Each group
 * is an array of tokens shaped `{ value, type, matches }`; combinator tokens
 * (">", "+", "~", " ") carry no `matches`.
 *
 * @param {string} selector Selector text to tokenize.
 * @param {boolean} [parseOnly] When truthy, nothing is thrown or cached; the number
 *     of characters left unparsed is returned instead (0 means fully consumed).
 * @returns {Array|number} A copy of the token groups, or the unparsed length when
 *     `parseOnly` is truthy.
 * @throws {Error} Through Sizzle.error when part of the selector cannot be parsed
 *     and `parseOnly` is falsy.
 */
Sizzle.tokenize = function (selector, parseOnly) {
    var matched,                        // last consumed text; false when an iteration consumed nothing
        match,                          // current regex match
        tokens,                         // token list of the group being filled
        type,                           // token type currently being tried
        soFar = selector,               // part of the selector not consumed yet
        groups = [],                    // finished token lists, one per comma-separated selector
        preFilters = Expr.preFilter,    // per-type normalizers for raw matches

        // tokenCache is a function object, so a bare selector such as "name" or
        // "length" would read the function's own properties instead of a cache
        // entry. A trailing space makes the key collision-free.
        cacheKey = selector + " ",
        cached = tokenCache[cacheKey];

    if (cached) {
        // Hand out a copy so callers cannot mutate the cached group list.
        return parseOnly ? 0 : cached.slice(0);
    }

    while (soFar) {

        // Start a new group on the first run and after every comma, e.g.
        // 'div > p + .aaron[type="checkbox"], #id:first-child' yields two groups.
        if (!matched || (match = rcomma.exec(soFar))) {
            if (match) {
                // Drop the comma, but keep a trailing comma in place so that it
                // is reported as unparsed input instead of silently accepted.
                soFar = soFar.slice(match[0].length) || soFar;
            }
            groups.push(tokens = []);
        }

        matched = false;

        // Combinators: >, +, ~ and the descendant combinator (whitespace).
        if (match = rcombinators.exec(soFar)) {
            matched = match.shift();
            tokens.push({
                value: matched,

                // Normalize a whitespace-only combinator to a single space.
                type: match[0].replace(rtrim, " ")
            });
            soFar = soFar.slice(matched.length);
        }

        // Simple selectors: try every type that has a filter.
        for (type in Expr.filter) {
            match = matchExpr[type].exec(soFar);

            // A preFilter may reject the match by returning a falsy value; in
            // that case the text is left for another type to claim.
            if (match && preFilters[type]) {
                match = preFilters[type](match);
            }

            if (match) {
                matched = match.shift();
                tokens.push({
                    value: matched,
                    type: type,
                    matches: match
                });
                soFar = soFar.slice(matched.length);
            }
        }

        // Nothing consumed in this pass: the selector is invalid from here on.
        if (!matched) {
            break;
        }
    }

    // Validity probe: report how much input is left over.
    if (parseOnly) {
        return soFar.length;
    }

    // Leftover input means a syntax error.
    if (soFar) {
        return Sizzle.error(selector);
    }

    // Cache the groups and return a copy.
    return tokenCache(cacheKey, groups).slice(0);
};

其余代码

如果只有上面的代码是运行不了的，因为上面代码所用到的一些变量和方法是未定义的，接下来把那些缺失的变量和方法添上

定义变量
这些变量几乎都是正则表达式，用来匹配不同类型的 Token（TAG、ID、CLASS、ATTR、CHILD、PSEUDO）。表达式本身不必深究，只需要明白每个表达式匹配哪种 Token 即可。我之前尝试弄懂这些表达式为什么要这样写，但到现在也没有完全弄懂；研究源码也不需要弄懂每一个细节，把整体思路理解了就行。

// Shared state for the lexer: the selector cache, the regex source fragments,
// the compiled regular expressions built from them, and the escape decoder.
var
    // Cache of tokenized selectors, created by createCache()
    tokenCache = createCache(),

    // Alternation of boolean attribute names; used below to build matchExpr.bool
    booleans = "checked|selected|async|autofocus|autoplay|controls|defer|disabled|hidden|" +
        "ismap|loop|multiple|open|readonly|required|scoped",


    // Regex source matching one whitespace character
    whitespace = "[\\x20\\t\\r\\n\\f]",

    // Regex source for an identifier; reused below for IDs, class names, tag names,
    // attribute names and pseudo names
    identifier = "(?:\\\\[\\da-fA-F]{1,6}" + whitespace +
        "?|\\\\[^\\r\\n\\f]|[\\w-]|[^\0-\\x7f])+",

    // Regex source for an attribute selector such as [type="phone"]
    attributes = "\\[" + whitespace + "*(" + identifier + ")(?:" + whitespace +

        // Operator (capture 2)
        "*([*^$|!~]?=)" + whitespace +

        // "Attribute values must be CSS identifiers [capture 5]
        // or strings [capture 3 or capture 4]"
        "*(?:'((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\"|(" + identifier + "))|)" +
        whitespace + "*\\]",

    // Regex source for a pseudo selector with an optional parenthesized argument
    pseudos = ":(" + identifier + ")(?:\\((" +

        // To reduce the number of selectors needing tokenize in the preFilter, prefer arguments:
        // 1. quoted (capture 3; capture 4 or capture 5)
        "('((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\")|" +

        // 2. simple (capture 6)
        "((?:\\\\.|[^\\\\()[\\]]|" + attributes + ")*)|" +

        // 3. anything else (capture 2)
        ".*" +
        ")\\)|)",

    // One or more whitespace characters, matched globally
    rwhitespace = new RegExp( whitespace + "+", "g" ),

    // Leading and non-escaped trailing whitespace, capturing some non-whitespace
    // characters preceding the latter; used for trimming
    rtrim = new RegExp( "^" + whitespace + "+|((?:^|[^\\\\])(?:\\\\.)*)" +
        whitespace + "+$", "g" ),

    // A comma with optional surrounding whitespace at the start of the input
    rcomma = new RegExp( "^" + whitespace + "*," + whitespace + "*" ),

    // A combinator at the start of the input: >, +, ~ or whitespace (captured)
    rcombinators = new RegExp( "^" + whitespace + "*([>+~]|" + whitespace + ")" + whitespace +
        "*" ),

    // Matches a whitespace character or ">" anywhere in the input
    rdescend = new RegExp( whitespace + "|>" ),

    // Unanchored pseudo matcher and whole-string identifier matcher
    rpseudo = new RegExp( pseudos ),
    ridentifier = new RegExp( "^" + identifier + "$" ),

    // Anchored matchers keyed by token type; Sizzle.tokenize runs
    // matchExpr[ type ].exec() for each type it tries
    matchExpr = {
        "ID": new RegExp( "^#(" + identifier + ")" ),
        "CLASS": new RegExp( "^\\.(" + identifier + ")" ),
        "TAG": new RegExp( "^(" + identifier + "|[*])" ),
        "ATTR": new RegExp( "^" + attributes ),
        "PSEUDO": new RegExp( "^" + pseudos ),
        "CHILD": new RegExp( "^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\(" +
            whitespace + "*(even|odd|(([+-]|)(\\d*)n|)" + whitespace + "*(?:([+-]|)" +
            whitespace + "*(\\d+)|))" + whitespace + "*\\)|)", "i" ),
        "bool": new RegExp( "^(?:" + booleans + ")$", "i" ),

        // For use in libraries implementing .is()
        // We use this for POS matching in `select`
        "needsContext": new RegExp( "^" + whitespace +
            "*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\(" + whitespace +
            "*((?:-\\d)?\\d*)" + whitespace + "*\\)|)(?=[^-]|$)", "i" )
    },

    // NOTE(review): the next few patterns are not referenced by the code shown in
    // this article; presumably they serve later parts of the engine.
    rhtml = /HTML$/i,
    rinputs = /^(?:input|select|textarea|button)$/i,
    rheader = /^h\d$/i,

    rnative = /^[^{]+\{\s*\[native \w/,

    // Easily-parseable/retrievable ID or TAG or CLASS selectors
    rquickExpr = /^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/,

    rsibling = /[+~]/,

    // CSS escapes
    // http://www.w3.org/TR/CSS21/syndata.html#escaped-characters
    runescape = new RegExp( "\\\\[\\da-fA-F]{1,6}" + whitespace + "?|\\\\([^\\r\\n\\f])", "g" ),

    // Replacement callback for runescape: `escape` is the whole matched escape
    // sequence, `nonHex` is the captured character when the escape is not hexadecimal.
    funescape = function( escape, nonHex ) {
        // Code point of a hex escape, offset by 0x10000 so that BMP values are negative
        var high = "0x" + escape.slice( 1 ) - 0x10000;

        return nonHex ?

            // Strip the backslash prefix from a non-hex escape sequence
            nonHex :

            // Replace a hexadecimal escape sequence with the encoded Unicode code point
            // Support: IE <=11+
            // For values outside the Basic Multilingual Plane (BMP), manually construct a
            // surrogate pair
            high < 0 ?
                String.fromCharCode( high + 0x10000 ) :
                String.fromCharCode( high >> 10 | 0xD800, high & 0x3FF | 0xDC00 );
    };

定义方法

// Selector definitions: `preFilter` normalizes raw regex matches produced by the
// lexer, `filter` builds the element-matching functions for each token type.
// Sizzle.tokenize only iterates the keys of `filter` and calls `preFilter`.
// NOTE(review): several filters reference names that are not defined in the code
// shown here (classCache, expando, dirruns, markFunction, indexOf, Sizzle.attr,
// Expr.pseudos, Expr.setFilters); they must be supplied elsewhere before the
// filters themselves can run.
var Expr = Sizzle.selectors = {

    // Maximum number of entries createCache() keeps per cache
    cacheLength: 50,

    // Normalizers applied to a raw match before it becomes a token
    preFilter: {

        // Unescapes the attribute name and value and returns
        // [ whole match, name, operator, value ]
        "ATTR": function( match ) {
            match[ 1 ] = match[ 1 ].replace( runescape, funescape );

            // Move the given value to match[3] whether quoted or unquoted
            match[ 3 ] = ( match[ 3 ] || match[ 4 ] ||
                match[ 5 ] || "" ).replace( runescape, funescape );

            // Pad the value so "~=" can be checked as a whole-word substring
            if ( match[ 2 ] === "~=" ) {
                match[ 3 ] = " " + match[ 3 ] + " ";
            }

            return match.slice( 0, 4 );
        },

        // Validates the argument of a structural pseudo and converts the
        // xn+y argument of nth-* into numbers stored in match[4] and match[5]
        "CHILD": function( match ) {

            /* matches from matchExpr["CHILD"]
                1 type (only|nth|...)
                2 what (child|of-type)
                3 argument (even|odd|\d*|\d*n([+-]\d+)?|...)
                4 xn-component of xn+y argument ([+-]?\d*n|)
                5 sign of xn-component
                6 x of xn-component
                7 sign of y-component
                8 y of y-component
            */
            match[ 1 ] = match[ 1 ].toLowerCase();

            if ( match[ 1 ].slice( 0, 3 ) === "nth" ) {

                // nth-* requires argument
                if ( !match[ 3 ] ) {
                    Sizzle.error( match[ 0 ] );
                }

                // numeric x and y parameters for Expr.filter.CHILD
                // remember that false/true cast respectively to 0/1
                match[ 4 ] = +( match[ 4 ] ?
                    match[ 5 ] + ( match[ 6 ] || 1 ) :
                    2 * ( match[ 3 ] === "even" || match[ 3 ] === "odd" ) );
                match[ 5 ] = +( ( match[ 7 ] + match[ 8 ] ) || match[ 3 ] === "odd" );

                // other types prohibit arguments
            } else if ( match[ 3 ] ) {
                Sizzle.error( match[ 0 ] );
            }

            return match;
        },

        // Returns [ whole match, pseudo name, argument ], or null when the text
        // is really a CHILD selector and must not be claimed as a PSEUDO
        "PSEUDO": function( match ) {
            var excess,
                unquoted = !match[ 6 ] && match[ 2 ];

            if ( matchExpr[ "CHILD" ].test( match[ 0 ] ) ) {
                return null;
            }

            // Accept quoted arguments as-is
            if ( match[ 3 ] ) {
                match[ 2 ] = match[ 4 ] || match[ 5 ] || "";

                // Strip excess characters from unquoted arguments
            } else if ( unquoted && rpseudo.test( unquoted ) &&

                // Get excess from tokenize (recursively).
                // The lexer is only published as Sizzle.tokenize in this file; a
                // bare `tokenize` identifier is never declared and would throw
                // a ReferenceError here.
                ( excess = Sizzle.tokenize( unquoted, true ) ) &&

                // advance to the next closing parenthesis
                ( excess = unquoted.indexOf( ")", unquoted.length - excess ) - unquoted.length ) ) {

                // excess is a negative index
                match[ 0 ] = match[ 0 ].slice( 0, excess );
                match[ 2 ] = unquoted.slice( 0, excess );
            }

            // Return only captures needed by the pseudo filter method (type and argument)
            return match.slice( 0, 3 );
        }
    },

    // Factories that turn a token's `matches` into a function( elem ) predicate
    filter: {

        // Matches every element for "*", otherwise compares lower-cased node names
        "TAG": function( nodeNameSelector ) {
            var nodeName = nodeNameSelector.replace( runescape, funescape ).toLowerCase();
            return nodeNameSelector === "*" ?
                function() {
                    return true;
                } :
                function( elem ) {
                    return elem.nodeName && elem.nodeName.toLowerCase() === nodeName;
                };
        },

        // Tests the element's class string against a whole-word pattern.
        // NOTE(review): the lookup key is className + " " while the store key is
        // className; with the createCache() shown in this article (which stores
        // under the key as given) the lookup would never hit - confirm how
        // classCache is created.
        "CLASS": function( className ) {
            var pattern = classCache[ className + " " ];

            return pattern ||
                ( pattern = new RegExp( "(^|" + whitespace +
                    ")" + className + "(" + whitespace + "|$)" ) ) && classCache(
                    className, function( elem ) {
                        return pattern.test(
                            typeof elem.className === "string" && elem.className ||
                            typeof elem.getAttribute !== "undefined" &&
                            elem.getAttribute( "class" ) ||
                            ""
                        );
                    } );
        },

        // Compares the attribute value read through Sizzle.attr against `check`
        // according to `operator`
        "ATTR": function( name, operator, check ) {
            return function( elem ) {
                var result = Sizzle.attr( elem, name );

                // Missing attribute: only "!=" can match
                if ( result == null ) {
                    return operator === "!=";
                }

                // Bare [name]: presence is enough
                if ( !operator ) {
                    return true;
                }

                result += "";

                /* eslint-disable max-len */

                return operator === "=" ? result === check :
                    operator === "!=" ? result !== check :
                        operator === "^=" ? check && result.indexOf( check ) === 0 :
                            operator === "*=" ? check && result.indexOf( check ) > -1 :
                                operator === "$=" ? check && result.slice( -check.length ) === check :
                                    operator === "~=" ? ( " " + result.replace( rwhitespace, " " ) + " " ).indexOf( check ) > -1 :
                                        operator === "|=" ? result === check || result.slice( 0, check.length + 1 ) === check + "-" :
                                            false;
                /* eslint-enable max-len */

            };
        },

        // Structural pseudos; `first` and `last` are the numeric x and y produced
        // by preFilter.CHILD
        "CHILD": function( type, what, _argument, first, last ) {
            var simple = type.slice( 0, 3 ) !== "nth",
                forward = type.slice( -4 ) !== "last",
                ofType = what === "of-type";

            return first === 1 && last === 0 ?

                // Shortcut for :nth-*(n)
                function( elem ) {
                    return !!elem.parentNode;
                } :

                function( elem, _context, xml ) {
                    var cache, uniqueCache, outerCache, node, nodeIndex, start,
                        dir = simple !== forward ? "nextSibling" : "previousSibling",
                        parent = elem.parentNode,
                        name = ofType && elem.nodeName.toLowerCase(),
                        useCache = !xml && !ofType,
                        diff = false;

                    if ( parent ) {

                        // :(first|last|only)-(child|of-type)
                        if ( simple ) {
                            while ( dir ) {
                                node = elem;
                                while ( ( node = node[ dir ] ) ) {
                                    if ( ofType ?
                                        node.nodeName.toLowerCase() === name :
                                        node.nodeType === 1 ) {

                                        return false;
                                    }
                                }

                                // Reverse direction for :only-* (if we haven't yet done so)
                                start = dir = type === "only" && !start && "nextSibling";
                            }
                            return true;
                        }

                        start = [ forward ? parent.firstChild : parent.lastChild ];

                        // non-xml :nth-child(...) stores cache data on `parent`
                        if ( forward && useCache ) {

                            // Seek `elem` from a previously-cached index

                            // ...in a gzip-friendly way
                            node = parent;
                            outerCache = node[ expando ] || ( node[ expando ] = {} );

                            // Support: IE <9 only
                            // Defend against cloned attroperties (jQuery gh-1709)
                            uniqueCache = outerCache[ node.uniqueID ] ||
                                ( outerCache[ node.uniqueID ] = {} );

                            cache = uniqueCache[ type ] || [];
                            nodeIndex = cache[ 0 ] === dirruns && cache[ 1 ];
                            diff = nodeIndex && cache[ 2 ];
                            node = nodeIndex && parent.childNodes[ nodeIndex ];

                            while ( ( node = ++nodeIndex && node && node[ dir ] ||

                                // Fallback to seeking `elem` from the start
                                ( diff = nodeIndex = 0 ) || start.pop() ) ) {

                                // When found, cache indexes on `parent` and break
                                if ( node.nodeType === 1 && ++diff && node === elem ) {
                                    uniqueCache[ type ] = [ dirruns, nodeIndex, diff ];
                                    break;
                                }
                            }

                        } else {

                            // Use previously-cached element index if available
                            if ( useCache ) {

                                // ...in a gzip-friendly way
                                node = elem;
                                outerCache = node[ expando ] || ( node[ expando ] = {} );

                                // Support: IE <9 only
                                // Defend against cloned attroperties (jQuery gh-1709)
                                uniqueCache = outerCache[ node.uniqueID ] ||
                                    ( outerCache[ node.uniqueID ] = {} );

                                cache = uniqueCache[ type ] || [];
                                nodeIndex = cache[ 0 ] === dirruns && cache[ 1 ];
                                diff = nodeIndex;
                            }

                            // xml :nth-child(...)
                            // or :nth-last-child(...) or :nth(-last)?-of-type(...)
                            if ( diff === false ) {

                                // Use the same loop as above to seek `elem` from the start
                                while ( ( node = ++nodeIndex && node && node[ dir ] ||
                                    ( diff = nodeIndex = 0 ) || start.pop() ) ) {

                                    if ( ( ofType ?
                                        node.nodeName.toLowerCase() === name :
                                        node.nodeType === 1 ) &&
                                        ++diff ) {

                                        // Cache the index of each encountered element
                                        if ( useCache ) {
                                            outerCache = node[ expando ] ||
                                                ( node[ expando ] = {} );

                                            // Support: IE <9 only
                                            // Defend against cloned attroperties (jQuery gh-1709)
                                            uniqueCache = outerCache[ node.uniqueID ] ||
                                                ( outerCache[ node.uniqueID ] = {} );

                                            uniqueCache[ type ] = [ dirruns, diff ];
                                        }

                                        if ( node === elem ) {
                                            break;
                                        }
                                    }
                                }
                            }
                        }

                        // Incorporate the offset, then check against cycle size
                        diff -= last;
                        return diff === first || ( diff % first === 0 && diff / first >= 0 );
                    }
                };
        },

        // Resolves a named pseudo to its filter function
        "PSEUDO": function( pseudo, argument ) {

            // pseudo-class names are case-insensitive
            // http://www.w3.org/TR/selectors/#pseudo-classes
            // Prioritize by case sensitivity in case custom pseudos are added with uppercase letters
            // Remember that setFilters inherits from pseudos
            var args,
                fn = Expr.pseudos[ pseudo ] || Expr.setFilters[ pseudo.toLowerCase() ] ||
                    Sizzle.error( "unsupported pseudo: " + pseudo );

            // The user may use createPseudo to indicate that
            // arguments are needed to create the filter function
            // just as Sizzle does
            if ( fn[ expando ] ) {
                return fn( argument );
            }

            // But maintain support for old signatures
            if ( fn.length > 1 ) {
                args = [ pseudo, pseudo, "", argument ];
                return Expr.setFilters.hasOwnProperty( pseudo.toLowerCase() ) ?
                    markFunction( function( seed, matches ) {
                        var idx,
                            matched = fn( seed, argument ),
                            i = matched.length;
                        while ( i-- ) {
                            idx = indexOf( seed, matched[ i ] );
                            seed[ idx ] = !( matches[ idx ] = matched[ i ] );
                        }
                    } ) :
                    function( elem ) {
                        return fn( elem, 0, args );
                    };
            }

            return fn;
        },

        // Compares the element's "id" attribute with the unescaped id
        "ID": function( id ) {
            var attrId = id.replace( runescape, funescape );
            return function( elem ) {
                return elem.getAttribute( "id" ) === attrId;
            };

        }
    },
};

// Error reporting: always throws, tagging the offending expression.
Sizzle.error = function( msg ) {
    var text = "Syntax error, unrecognized expression: " + msg;

    throw new Error( text );
};

// Bounded key/value cache used for tokenized selectors.
// Returns a function( key, value ) that stores `value` as a property of itself
// under `key` and returns it; once more than Expr.cacheLength keys have been
// stored, the oldest key is removed.
function createCache() {
    var order = [];

    function cache(key, value) {
        var count = order.push(key);

        // Over capacity: evict the entry that was inserted first
        if (count > Expr.cacheLength) {
            delete cache[order.shift()];
        }

        cache[key] = value;
        return value;
    }

    return cache;
}

代码下载

传送门

jQuery源码二周目#10 Sizzle 词法解析

词法解析

Sizzle.tokenize

其余代码

代码下载

你可能感兴趣的:(jQuery源码二周目#10 Sizzle 词法解析)