xss的htmlspecialchars绕过

找啊找找啊找,在 PHP 源码中找到了 htmlspecialchars 的实现:

/*
 * htmlspecialchars(): forwards the caller's arguments untouched to
 * php_html_entities() with the "all" switch turned off.
 */
PHP_FUNCTION(htmlspecialchars)
{
	const int encode_all_entities = 0; /* off for this entry point */

	php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, encode_all_entities);
}


原来htmlspecialchars是调用php_html_entities这个函数的,顺带看了一下htmlentities

/*
 * htmlentities(): forwards the caller's arguments untouched to
 * php_html_entities() with the "all" switch turned on.
 */
PHP_FUNCTION(htmlentities)
{
	const int encode_all_entities = 1; /* on for this entry point */

	php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, encode_all_entities);
}

也是调用 php_html_entities,两者的区别只在第二个参数 all:htmlspecialchars 传 0,htmlentities 传 1。

然后我们跳转到php_html_entities

/*
 * Common body of the two userland entry points above.
 *
 * Reads the call arguments with the format "s|ls!b" (a string, then
 * optionally a long, a string that is initialised to NULL here, and a
 * boolean), hands them to php_escape_html_entities_ex() and stores the
 * escaped string as the return value.
 *
 *   all - forwarded unchanged to php_escape_html_entities_ex().
 *
 * Nothing is returned to the PHP caller when argument parsing fails.
 */
static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
{
	/* input string and its length */
	char *input;
	int input_len;

	/* optional arguments, pre-loaded with their defaults */
	long quote_flags = ENT_COMPAT;
	char *charset_hint = NULL;
	int charset_hint_len = 0;
	zend_bool encode_existing = 1;

	/* result of the escaping routine */
	char *escaped;
	size_t escaped_len;

	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls!b",
			&input, &input_len,
			&quote_flags,
			&charset_hint, &charset_hint_len,
			&encode_existing) == FAILURE) {
		return;
	}

	escaped = php_escape_html_entities_ex(input, input_len, &escaped_len, all,
			(int) quote_flags, charset_hint, encode_existing TSRMLS_CC);

	/* NOTE(review): the trailing 0 presumably means "do not duplicate the
	 * buffer", i.e. ownership of `escaped` moves to the return value;
	 * confirm against the RETVAL_STRINGL definition. */
	RETVAL_STRINGL(escaped, (int)escaped_len, 0);
}

zend_parse_parameters 只是用来获取参数的,可见 php_html_entities 还不是最终的处理函数;真正的转义工作交给了 php_escape_html_entities_ex,继续跟进这个函数。

哈哈,原来这才是最终的处理函数。

/*
 * Escape HTML special characters / entities in a byte string.
 *
 * Parameters:
 *   old           - input bytes.
 *   oldlen        - number of input bytes.
 *   newlen        - out: length of the returned string, not counting the
 *                   terminating NUL.
 *   all           - non-zero: look every character up in the full named
 *                   entity table (find_entity_for_char); zero: only the
 *                   basic table (find_entity_for_char_basic).
 *   flags         - ENT_* bit flags: quote handling, document type and the
 *                   policy for invalid / disallowed input.
 *   hint_charset  - charset hint handed to determine_charset().
 *   double_encode - non-zero: every '&' in the input is encoded; zero: an
 *                   '&' that begins a valid numeric or named entity is
 *                   copied through unchanged, any other '&' is encoded.
 *
 * Returns a NUL-terminated buffer allocated with emalloc()/safe_erealloc().
 * If an invalid multibyte sequence is met and neither ENT_HTML_IGNORE_ERRORS
 * nor ENT_HTML_SUBSTITUTE_ERRORS is set, the partial result is freed,
 * *newlen is set to 0 and STR_EMPTY_ALLOC() is returned instead.
 */
PHPAPI char *php_escape_html_entities_ex(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset, zend_bool double_encode TSRMLS_DC)
{
	size_t cursor, maxlen, len;
	char *replaced;
	enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC);
	int doctype = flags & ENT_HTML_DOC_TYPE_MASK;
	entity_table_opt entity_table;
	const enc_to_uni *to_uni_table = NULL;
	const entity_ht *inv_map = NULL; /* used for !double_encode */
	/* only used if flags includes ENT_HTML_SUBSTITUTE_ERRORS or ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS */
	const unsigned char *replacement = NULL;
	size_t replacement_len = 0;

	if (all) { /* replace with all named entities */
		if (CHARSET_PARTIAL_SUPPORT(charset)) {
			php_error_docref0(NULL TSRMLS_CC, E_STRICT, "Only basic entities "
				"substitution is supported for multi-byte encodings other than UTF-8; "
				"functionality is equivalent to htmlspecialchars");
		}
		LIMIT_ALL(all, doctype, charset);
	}
	entity_table = determine_entity_table(all, doctype);
	if (all && !CHARSET_UNICODE_COMPAT(charset)) {
		to_uni_table = enc_to_uni_index[charset];
	}

	if (!double_encode) {
		/* first arg is 1 because we want to identify valid named entities
		 * even if we are only encoding the basic ones */
		inv_map = unescape_inverse_map(1, flags);
	}

	if (flags & (ENT_HTML_SUBSTITUTE_ERRORS | ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS)) {
		if (charset == cs_utf_8) {
			/* U+FFFD REPLACEMENT CHARACTER, UTF-8 encoded */
			replacement = (const unsigned char*)"\xEF\xBF\xBD";
			replacement_len = sizeof("\xEF\xBF\xBD") - 1;
		} else {
			/* Other charsets cannot carry the raw character, so the
			 * pure-ASCII numeric character reference is emitted instead. */
			replacement = (const unsigned char*)"&#xFFFD;";
			replacement_len = sizeof("&#xFFFD;") - 1;
		}
	}

	/* initial estimate */
	if (oldlen < 64) {
		maxlen = 128;
	} else {
		maxlen = 2 * oldlen;
		if (maxlen < oldlen) { /* 2 * oldlen wrapped around */
			zend_error_noreturn(E_ERROR, "Input string is too long");
			return NULL;
		}
	}

	replaced = emalloc(maxlen + 1); /* adding 1 is safe: maxlen is even */
	len = 0;
	cursor = 0;
	while (cursor < oldlen) {
		const unsigned char *mbsequence = NULL;
		size_t mbseqlen					= 0,
		       cursor_before			= cursor;
		int status						= SUCCESS;
		unsigned int this_char			= get_next_char(charset, old, oldlen, &cursor, &status);

		/* guarantee we have at least 40 bytes to write.
		 * In HTML5, entities may take up to 33 bytes */
		if (len > maxlen - 40) { /* maxlen can never be smaller than 128 */
			replaced = safe_erealloc(replaced, maxlen , 1, 128 + 1);
			maxlen += 128;
		}

		if (status == FAILURE) {
			/* invalid MB sequence */
			if (flags & ENT_HTML_IGNORE_ERRORS) {
				continue;
			} else if (flags & ENT_HTML_SUBSTITUTE_ERRORS) {
				memcpy(&replaced[len], replacement, replacement_len);
				len += replacement_len;
				continue;
			} else {
				/* no error-handling flag: discard everything produced so far */
				efree(replaced);
				*newlen = 0;
				return STR_EMPTY_ALLOC();
			}
		} else { /* SUCCESS */
			/* remember the raw bytes of this character for pass-through */
			mbsequence = &old[cursor_before];
			mbseqlen = cursor - cursor_before;
		}

		if (this_char != '&') { /* no entity on this position */
			const unsigned char *rep	= NULL;
			size_t				rep_len	= 0;

			/* quotes are copied verbatim unless the matching flag asks
			 * for them to be encoded */
			if (((this_char == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
					(this_char == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE))))
				goto pass_char_through;

			if (all) { /* false that CHARSET_PARTIAL_SUPPORT(charset) */
				if (to_uni_table != NULL) {
					/* !CHARSET_UNICODE_COMPAT therefore not UTF-8; since UTF-8
					 * is the only multibyte encoding with !CHARSET_PARTIAL_SUPPORT,
					 * we're using a single byte encoding */
					map_to_unicode(this_char, to_uni_table, &this_char);
					if (this_char == 0xFFFF) /* no mapping; pass through */
						goto pass_char_through;
				}
				/* the cursor may advance */
				find_entity_for_char(this_char, charset, entity_table.ms_table, &rep,
					&rep_len, old, oldlen, &cursor);
			} else {
				find_entity_for_char_basic(this_char, entity_table.table, &rep, &rep_len);
			}

			if (rep != NULL) {
				/* rep holds the entity name only; add the delimiters here */
				replaced[len++] = '&';
				memcpy(&replaced[len], rep, rep_len);
				len += rep_len;
				replaced[len++] = ';';
			} else {
				/* we did not find an entity for this char.
				 * check for its validity, if its valid pass it unchanged */
				if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) {
					if (CHARSET_UNICODE_COMPAT(charset)) {
						if (!unicode_cp_is_allowed(this_char, doctype)) {
							mbsequence = replacement;
							mbseqlen = replacement_len;
						}
					} else if (to_uni_table) {
						if (!all) /* otherwise we already did this */
							map_to_unicode(this_char, to_uni_table, &this_char);
						if (!unicode_cp_is_allowed(this_char, doctype)) {
							mbsequence = replacement;
							mbseqlen = replacement_len;
						}
					} else {
						/* not a unicode code point, unless, coincidentally, it's in
						 * the 0x20..0x7D range (except 0x5C in sjis). We know nothing
						 * about other code points, because we have no tables. Since
						 * Unicode code points in that range are not disallowed in any
						 * document type, we could do nothing. However, conversion
						 * tables frequently map 0x00-0x1F to the respective C0 code
						 * points. Let's play it safe and admit that's the case */
						if (this_char <= 0x7D &&
								!unicode_cp_is_allowed(this_char, doctype)) {
							mbsequence = replacement;
							mbseqlen = replacement_len;
						}
					}
				}
pass_char_through:
				/* copy the original bytes (or the replacement chosen above) */
				if (mbseqlen > 1) {
					memcpy(replaced + len, mbsequence, mbseqlen);
					len += mbseqlen;
				} else {
					replaced[len++] = mbsequence[0];
				}
			}
		} else { /* this_char == '&' */
			if (double_encode) {
encode_amp:
				/* The ampersand itself must be written in escaped form;
				 * copying a bare "&" here would leave the input unescaped. */
				memcpy(&replaced[len], "&amp;", sizeof("&amp;") - 1);
				len += sizeof("&amp;") - 1;
			} else { /* no double encode */
				/* check if entity is valid */
				size_t ent_len; /* not counting & or ; */
				/* peek at next char */
				/* NOTE(review): cursor may equal oldlen here, so this reads
				 * one byte past the counted input; presumably the caller's
				 * buffer is NUL-terminated -- confirm. */
				if (old[cursor] == '#') { /* numeric entity */
					unsigned code_point;
					int valid;
					char *pos = (char*)&old[cursor+1];
					valid = process_numeric_entity((const char **)&pos, &code_point);
					if (valid == FAILURE)
						goto encode_amp;
					if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) {
						if (!numeric_entity_is_allowed(code_point, doctype))
							goto encode_amp;
					}
					ent_len = pos - (char*)&old[cursor];
				} else { /* named entity */
					/* check for vality of named entity */
					const char *start = &old[cursor],
							   *next = start;
					unsigned   dummy1, dummy2;

					if (process_named_entity_html(&next, &start, &ent_len) == FAILURE)
						goto encode_amp;
					if (resolve_named_entity_html(start, ent_len, inv_map, &dummy1, &dummy2) == FAILURE) {
						if (!(doctype == ENT_HTML_DOC_XHTML && ent_len == 4 && start[0] == 'a'
									&& start[1] == 'p' && start[2] == 'o' && start[3] == 's')) {
							/* uses html4 inv_map, which doesn't include apos;. This is a
							 * hack to support it */
							goto encode_amp;
						}
					}
				}
				/* checks passed; copy entity to result */
				/* entity size is unbounded, we may need more memory */
				/* at this point maxlen - len >= 40 */
				if (maxlen - len < ent_len + 2 /* & and ; */) {
					/* ent_len < oldlen, which is certainly <= SIZE_MAX/2 */
					replaced = safe_erealloc(replaced, maxlen, 1, ent_len + 128 + 1);
					maxlen += ent_len + 128;
				}
				replaced[len++] = '&';
				memcpy(&replaced[len], &old[cursor], ent_len);
				len += ent_len;
				replaced[len++] = ';';
				/* skip the entity body and its terminating ';' in the input */
				cursor += ent_len + 1;
			}
		}
	}
	replaced[len] = '\0';
	*newlen = len;

	return replaced;
}



你可能感兴趣的:(xss的htmlspecialchars绕过)