场景:
1.在没有HTML库时(谁有好的html库介绍下,C/C++的?tinyXML?),以SAX方式解析HTML时,会读入特殊字符,这时候需要转义成正常字符才能使用。
2.耗时,4-6小时(被打扰)。
3.replace虽然挺好,但是会循环整个字符串执行替换,效率应该没有一次过替换高.
#include <algorithm>
void replace( iterator start, iterator end, const TYPE& old_value, const TYPE& new_value );
4.所有转义字符的网址:
http://114.xixik.com/character/
文件1:test_htmlescape.cpp
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>   // isalpha / isdigit are used by the unescape routine further down
#include <iostream>
#include <assert.h>

using namespace std;

// Returns 1 when the host stores the least significant byte of an int at the
// lowest address (little-endian), 0 otherwise.
int IsLittleEndian()
{
    const int probe = 1;
    return (*reinterpret_cast<const char*>(&probe) == 1) ? 1 : 0;
}

// Converts a decimal entity code (e.g. "8736") into a two-byte, little-endian
// code unit.
//
// entity_code: NUL-terminated decimal number.
// html_char:   output buffer of at least 2 bytes; byte 0 receives the low
//              byte and byte 1 the high byte of the parsed value.
//
// Only the low 16 bits of the parsed value are stored, so code points above
// 0xFFFF cannot be represented through this function.
//
// The bytes are produced with shifts instead of by peeking at the in-memory
// layout of an int, so the result is the same on every host. The previous
// layout-based code picked the two most significant bytes on big-endian
// machines, which are zero for any BMP value.
void HtmEscapeEntityCodeToUnicode(const char* entity_code, char* html_char)
{
    // strtol has defined behaviour on overflow, unlike atoi.
    const unsigned long code_point =
        static_cast<unsigned long>(strtol(entity_code, NULL, 10));

    html_char[0] = static_cast<char>(code_point & 0xFF);
    html_char[1] = static_cast<char>((code_point >> 8) & 0xFF);
}

// Encodes a single code point as UTF-8.
//
// unicode_char:        the code point as little-endian bytes (the format
//                      written by HtmEscapeEntityCodeToUnicode).
// unicode_char_length: number of bytes available in unicode_char.
// utf_char:            output buffer; 1 to 4 bytes are written depending on
//                      the code point. No terminating NUL is written.
//
// Example: 0x192 (110010010) becomes 0xC6 0x92 (11000110 10010010).
//
// Values above 0x10FFFF are reported on stderr and trip an assertion; nothing
// is written in that case.
void OneUnicode2UTF8(const char* unicode_char, size_t unicode_char_length,
                     char* utf_char)
{
    // Assemble the value byte by byte: this is independent of host byte
    // order and cannot write past the accumulator, whatever length the
    // caller passes in.
    unsigned long value = 0;
    bool too_big = false;
    for (size_t i = 0; i < unicode_char_length; ++i) {
        const unsigned long byte = static_cast<unsigned char>(unicode_char[i]);
        if (i < 4) {
            value |= byte << (8 * i);
        } else if (byte != 0) {
            // Any non-zero byte beyond the fourth is far outside Unicode.
            too_big = true;
        }
    }

    if (too_big || value > 0x10FFFF) {
        cerr << "value too big." << endl;
        assert(0);
        return;
    }

    if (value <= 0x007F) {
        utf_char[0] = static_cast<char>(value);
    } else if (value <= 0x07FF) {
        utf_char[0] = static_cast<char>((value >> 6) | 0xC0);
        utf_char[1] = static_cast<char>((value & 0x3F) | 0x80);
    } else if (value <= 0xFFFF) {
        utf_char[0] = static_cast<char>((value >> 12) | 0xE0);
        utf_char[1] = static_cast<char>(((value >> 6) & 0x3F) | 0x80);
        utf_char[2] = static_cast<char>((value & 0x3F) | 0x80);
    } else {
        utf_char[0] = static_cast<char>((value >> 18) | 0xF0);
        utf_char[1] = static_cast<char>(((value >> 12) & 0x3F) | 0x80);
        utf_char[2] = static_cast<char>(((value >> 6) & 0x3F) | 0x80);
        utf_char[3] = static_cast<char>((value & 0x3F) | 0x80);
    }
}

// Flat list of (entity name, decimal code point) pairs: even indices hold the
// name, the following odd index holds its code.
// 1. There is a lot of room for optimisation here.
static const char* kEntityNameToEntityCodeMap[] =
{
    "oelig", "339",
    "amp",   "38",
    "rArr",  "8658",
    "fnof",  "402"
};
const char* HtmEscapeEntityNameToEntityCode(const char* entity_name) { static size_t length = sizeof(kEntityNameToEntityCodeMap)/sizeof(char*); for(size_t i = 0; i < length; i+=2) { if(!strcmp(entity_name,kEntityNameToEntityCodeMap[i])) { return kEntityNameToEntityCodeMap[i+1]; } } return NULL; } string UnescapeUTFHTMLContent(const char* str) { string temp; char* pos_amp = NULL; char* pos_semicolon = (char*)str; const char* start_amp = str; int entity_length = 0; char entity_code[5]; const int kMaxEntityLength = 4; char entity_name[20]; const int kMaxEntityNameLength = 18; char unicode[3]; char utf8[4]; while(true) { if(!start_amp || !(*start_amp)) { break; } pos_amp = strchr(start_amp,'&'); if(!pos_amp) { temp.append(start_amp); break; } int pos_no = pos_amp - pos_semicolon; if(pos_no > 0) { temp.append(start_amp,pos_no); start_amp = pos_amp; } char* pos_amp1 = pos_amp+1; if(!pos_amp1 || !(*pos_amp1)) { string t2(start_amp); temp.append(start_amp); break; } if(isalpha(*pos_amp1)) { pos_semicolon = strchr(pos_amp1,';'); if(pos_semicolon) { //调用 HtmEscapeEntityNameToEntityCode memset(entity_name,0,sizeof(entity_name)); entity_length = ((pos_semicolon - pos_amp1) > kMaxEntityNameLength)?kMaxEntityNameLength: (pos_semicolon - pos_amp1); strncpy(entity_name,pos_amp1,entity_length); const char* entity_code_c = HtmEscapeEntityNameToEntityCode(entity_name); if(entity_code_c) { memset(unicode,0,sizeof(unicode)); memset(utf8,0,sizeof(utf8)); HtmEscapeEntityCodeToUnicode(entity_code_c,unicode); OneUnicode2UTF8(unicode,2,utf8); temp.append(utf8); }else { temp.append(entity_name); } //1.entity_name转换为entity_code之后再转换为utf8字符. start_amp = pos_semicolon + 1; pos_semicolon+=1; }else { start_amp = pos_amp1; } }else if(*pos_amp1 =='#') { char* pos_digit = (pos_amp1+1); if(!pos_digit) { break; } if(isdigit(*pos_digit)) { //1.需要判断数值小于10000. 
pos_semicolon = strchr(start_amp,';'); if(pos_semicolon) { memset(entity_code,0,sizeof(entity_code)); entity_length = ((pos_semicolon - pos_digit) > kMaxEntityLength)?kMaxEntityLength: (pos_semicolon - pos_digit); strncpy(entity_code,pos_digit,entity_length); memset(unicode,0,sizeof(unicode)); memset(utf8,0,sizeof(utf8)); HtmEscapeEntityCodeToUnicode(entity_code,unicode); OneUnicode2UTF8(unicode,2,utf8); temp.append(utf8); start_amp = pos_semicolon + 1; pos_semicolon+=1; }else { start_amp = pos_digit; } } }else { string sa(start_amp,pos_amp1 - start_amp); temp.append(sa); start_amp = pos_amp1; } } return temp; } int main(int argc, char *argv[]) { printf("Hello, world\n"); string str; const char *html_str = "ΖabcdΕhello©<a>⇒⇒" "ƒ…</a>" "asfas‡dfeΥΨΩ<img>n↓n⋅nωmmm</img>1jh"; str = UnescapeUTFHTMLContent(html_str); cout << "str: " << str << endl; html_str = "<td>&#8736;</td>"; str = UnescapeUTFHTMLContent(html_str); cout << "str: " << str << endl; html_str = "<td>&#8736;</td>;;#8736;"; str = UnescapeUTFHTMLContent(html_str); cout << "str: " << str << endl; html_str = "<td>&#8736;</td>;&"; str = UnescapeUTFHTMLContent(html_str); cout << "str: " << str << endl; html_str = "<td>&#8736;</td>;&"; str = UnescapeUTFHTMLContent(html_str); cout << "str: " << str << endl; html_str = "<td>&#8736;</td>;&"; str = UnescapeUTFHTMLContent(html_str); cout << "str1: " << str << endl; html_str = "ab"; str = UnescapeUTFHTMLContent(html_str); cout << "ab: " << str << endl; return 0; }
str: ΖabcdΕhello©<a>⇒⇒ƒ…</a>asfas‡dfeΥPsiΩ<img>n↓nsdotnωmmm</img>1jh str: <td>∠</td> str: <td>∠</td>;;#8736; str: <td>∠</td>;& str: <td>∠</td>;amp str1: <td>∠</td>;& ab: ab