LingosHook:使用TinyHtml解析的LingosHook
先来张Picture,展示一下使用TinyHtmlParser解析的结果。
处理HTML果然比处理纯文本(TEXT)简单、清晰得多。只需下面两个函数就可以分解出结果,这也说明Lingoes输出的结果还是很有规律的,嘿嘿……
int CViconECDictResultParser::ParserHTML( const wxString & html, CDBAccess::TRecordDataVector & vct) const
{
std::wstring str(html.begin(), html.end());
TinyHtmlParser::CDocumentObject doc;
if(doc.Load(str) != 0)
return -1;
vct.clear();
std::wstring body(ID.begin(), ID.end());
body = L"dict_body_" + body;
const TinyHtmlParser::CElementObject* pe = doc.FindFirstElement(L"DIV");
while(pe != NULL)
{
const TinyHtmlParser::CAttributeObject* pa = pe->FindAttribute(L"id");
if(pa != NULL && pa->value == body)
{
TinyHtmlParser::CDocumentObject::TElementStack tmpstack;
const TinyHtmlParser::CElementObject* pr = doc.FindFirstElement(pe, L"DIV", tmpstack);
while(pr != NULL)
{
pa = pr->FindAttribute(L"style");
if(pa != NULL && pa->value == L"\"MARGIN: 5px 0px\"")
{
CDBAccess::TRecordData rec;
if(GetRecord(&doc, pr, rec) != 0)
return -1;
rec.m_strHTML = html;
vct.push_back(rec);
}
pr = doc.FindNextElement(pe, L"DIV", tmpstack);
}
}
pe = doc.FindNextElement();
}
return 0;
}
// Fills rec from one entry DIV (pr): the header DIV supplies the word and its
// phonetic symbol, every result DIV supplies one (class, meaning) pair.
// Returns 0 on success, -1 as soon as a DIV lacks a style attribute or does
// not have the expected child structure.
int CViconECDictResultParser::GetRecord(TinyHtmlParser::CDocumentObject * doc, const TinyHtmlParser::CElementObject * pr, CDBAccess::TRecordData & rec) const
{
    CDBAccess::TResultVector results;
    TinyHtmlParser::CDocumentObject::TElementStack walk;

    for(const TinyHtmlParser::CElementObject* div = doc->FindFirstElement(pr, L"DIV", walk);
        div != NULL;
        div = doc->FindNextElement(pr, L"DIV", walk))
    {
        const TinyHtmlParser::CAttributeObject* style = div->FindAttribute(L"style");
        if(style == NULL)
            return -1;

        const TinyHtmlParser::CElementObject* first = div->child;

        if(style->value == L"\"MARGIN: 0px 0px 5px; COLOR: #808080; LINE-HEIGHT: normal\"")
        {
            // Header: first child holds the word, its sibling holds the symbol.
            if(first == NULL)
                return -1;

            const TinyHtmlParser::CElementObject* wordNode = first->child;
            if(wordNode == NULL || wordNode->type != TinyHtmlParser::ET_ELEMENT)
                return -1;
            rec.m_strWord = wxString(wordNode->value.c_str(), wxConvISO8859_1);

            const TinyHtmlParser::CElementObject* symbolHolder = first->sibling;
            if(symbolHolder == NULL || symbolHolder->child == NULL || symbolHolder->child->type != TinyHtmlParser::ET_ELEMENT)
                return -1;
            rec.m_strSymbol = wxString(symbolHolder->child->value.c_str(), wxConvISO8859_1);
        }
        else if(style->value == L"\"MARGIN: 0px 0px 5px\"")
        {
            // Result line: the child's own text is the meaning, the text of
            // its child is the word class.
            CDBAccess::TResultPair result;
            if(first == NULL || first->type != TinyHtmlParser::ET_ELEMENT)
                return -1;
            result.second = wxString(first->value.c_str(),wxConvISO8859_1);
            result.second.Trim(false);

            const TinyHtmlParser::CElementObject* classNode = first->child;
            if(classNode == NULL || classNode->type != TinyHtmlParser::ET_ELEMENT)
                return -1;
            result.first = StrToWC(wxString(classNode->value.c_str(), wxConvISO8859_1));
            results.push_back(result);
        }
    }

    rec.m_vctResult.push_back(std::make_pair(TITLE, results));
    return 0;
}
将TinyHtmlParser集成到LingosHook里面才发现一个大问题——中文处理失败。因为整个解析过程全部使用的是std::string,而不是std::wstring,导致在字符串分解过程中丢失宽字符信息。于是——改:将std::string全部换成std::wstring。整个过程比我想象的简单,半小时搞定。怎么说呢,STL真好……
下面是TinyHtmlParser的代码,不长,下次再做实现说明,不过,常言道--“代码在手,天下我有”。。。
1
#ifndef __TINYHTMLPARSER_H__
2 #define __TINYHTMLPARSER_H__
3
4 #include < iostream >
5 #include < string >
6 #include < queue >
7 #include < stack >
8
9 namespace TinyHtmlParser
10 {
11
// Classification of a parsed element.
enum ElementType
{
    ET_UNKNOWN = -1,   // not classified yet
    ET_TAG     = 0,    // just a tag
    ET_NODE,           // no value
    ET_ELEMENT         // have value
};
13
// One attribute of an element, kept as a node of a singly linked list.
class CAttributeObject
{
public:
    // Builds an unlinked node holding the attribute name a and value v.
    CAttributeObject(const std::wstring& a, const std::wstring& v)
        : attr(a)
        , value(v)
        , next(NULL)
    {
    }

    virtual ~CAttributeObject()
    {
    }

    // Writes the name and value of this attribute to os.
    void Show(std::wostream& os) const;

public:
    std::wstring attr;        // attribute name
    std::wstring value;       // attribute value
    CAttributeObject* next;   // following attribute, NULL for the last one
};
29
30class CElementObject
31{
32public:
33 CElementObject();
34 virtual ~CElementObject();
35
36 virtual int Analyse();
37
38 const CAttributeObject* FindAttribute(const std::wstring& attr) const;
39
40 void Show(std::wostream& os) const;
41protected:
42 int AnalyseAttribute(const std::wstring& attr);
43 int MakeAttribute(const std::wstring& attr);
44 int MakeAttribute(const std::wstring& attr, const std::wstring& value);
45 void FreeAnalyseAttribute();
46 int AnalyseValue();
47public:
48 ElementType type;
49 size_t level;
50 CElementObject* parent;
51 CElementObject* child;
52 CElementObject* sibling;
53
54 CAttributeObject* attrib;
55public:
56 std::wstring tag;
57 std::wstring value;
58};
59
// Position record produced by the pre-parser: a half-open character range of
// the HTML text plus what kind of item that range is.
class CParserData
{
public:
    enum DataType { DT_UNKNOWN = -1, DT_TAG = 0, DT_VALUE, DT_END, DT_DONE, DT_TAG_VALUE };
public:
    // Every position starts at 0 so that copies of a freshly built object
    // never carry indeterminate values.
    CParserData()
    : type(DT_UNKNOWN)
    , start(0)
    , end(0)
    , vstart(0)
    , vend(0)
    {
    }
    virtual ~CParserData() {}

public:
    DataType type;   // kind of item
    size_t start;    // first character of the item
    size_t end;      // end position of the item
    size_t vstart;   // start of the text attached to a DT_TAG_VALUE tag
    size_t vend;     // end of the text attached to a DT_TAG_VALUE tag
};
79
80class CDocumentObject
81{
82protected:
83 static const wchar_t TAG_LT = L'<';
84 static const wchar_t TAG_GT = L'>';
85 static const wchar_t TAG_SLASH = L'/';
86 static const wchar_t TAG_BSLASH = L'\\';
87 static const wchar_t TAG_AND = L'&';
88
89 typedef std::vector<CParserData> TDataVector;
90
91 typedef std::stack<CParserData> TDataStack;
92 struct TNodeData
93 {
94 size_t level;
95 CParserData tag;
96 CParserData value;
97// CParserData end;
98 };
99 typedef std::deque<TNodeData> TNodeQueue;
100public:
101 typedef std::stack<const CElementObject*> TElementStack;
102public:
103 CDocumentObject();
104 virtual ~CDocumentObject();
105
106 int Load(const std::wstring& str);
107
108 const CElementObject* Root() const;
109
110 const CElementObject* FindFirstElement(const std::wstring& tag);
111 const CElementObject* FindNextElement();
112
113 const CElementObject* FindFirstElement(const CElementObject* element, const std::wstring& tag, TElementStack& tmpstack);
114 const CElementObject* FindNextElement(const CElementObject* element, const std::wstring& tag, TElementStack& tmpstack);
115
116 const CAttributeObject* FindAttribute(const CElementObject* element, const std::wstring& attr);
117
118 void Show(std::wostream& os) const;
119protected:
120 int PreProcess(const std::wstring& str, std::wstring& html);
121 int PreParser(const std::wstring& html, TNodeQueue& vct);
122 int Parser(const std::wstring& html, TNodeQueue& que);
123private:
124 int PreParserLT(const std::wstring& html, std::wstring::size_type& pos, CParserData& data);
125 int PushValueData(const CParserData& data, TDataStack& datastack) const;
126 int PushTagData(const std::wstring& html, const CParserData& data, TDataStack& datatstack, TNodeQueue& nodeque) const;
127
128 int CheckSpecialTag(const std::wstring& html, const CParserData& data) const;
129 int CheckTag(const std::wstring& html, const CParserData& tag, const CParserData& end) const;
130 CElementObject* MakeElement(const std::wstring& html, const TNodeData& node, CElementObject* parent, CElementObject* sibling) const;
131
132 void CDocumentObject::ShowElement(std::wostream& os, const CElementObject* e) const;
133
134 void FreeElement(CElementObject* root);
135
136 const CElementObject* FindElement(const CElementObject* root, const CElementObject* pe, const std::wstring& tag, TElementStack& stack);
137private:
138 CElementObject* _root;
139private:
140 std::wstring _findtag;
141 TElementStack _findstack;
142};
143
144}
145
146 #endif
147
2 #define __TINYHTMLPARSER_H__
3
4 #include < iostream >
5 #include < string >
6 #include < queue >
7 #include < stack >
8
9 namespace TinyHtmlParser
10 {
11
// Classification of a parsed element.
enum ElementType
{
    ET_UNKNOWN = -1,   // not classified yet
    ET_TAG     = 0,    // just a tag
    ET_NODE,           // no value
    ET_ELEMENT         // have value
};
13
// One attribute of an element, kept as a node of a singly linked list.
class CAttributeObject
{
public:
    // Builds an unlinked node holding the attribute name a and value v.
    CAttributeObject(const std::wstring& a, const std::wstring& v)
        : attr(a)
        , value(v)
        , next(NULL)
    {
    }

    virtual ~CAttributeObject()
    {
    }

    // Writes the name and value of this attribute to os.
    void Show(std::wostream& os) const;

public:
    std::wstring attr;        // attribute name
    std::wstring value;       // attribute value
    CAttributeObject* next;   // following attribute, NULL for the last one
};
29
30class CElementObject
31{
32public:
33 CElementObject();
34 virtual ~CElementObject();
35
36 virtual int Analyse();
37
38 const CAttributeObject* FindAttribute(const std::wstring& attr) const;
39
40 void Show(std::wostream& os) const;
41protected:
42 int AnalyseAttribute(const std::wstring& attr);
43 int MakeAttribute(const std::wstring& attr);
44 int MakeAttribute(const std::wstring& attr, const std::wstring& value);
45 void FreeAnalyseAttribute();
46 int AnalyseValue();
47public:
48 ElementType type;
49 size_t level;
50 CElementObject* parent;
51 CElementObject* child;
52 CElementObject* sibling;
53
54 CAttributeObject* attrib;
55public:
56 std::wstring tag;
57 std::wstring value;
58};
59
// Position record produced by the pre-parser: a half-open character range of
// the HTML text plus what kind of item that range is.
class CParserData
{
public:
    enum DataType { DT_UNKNOWN = -1, DT_TAG = 0, DT_VALUE, DT_END, DT_DONE, DT_TAG_VALUE };
public:
    // Every position starts at 0 so that copies of a freshly built object
    // never carry indeterminate values.
    CParserData()
    : type(DT_UNKNOWN)
    , start(0)
    , end(0)
    , vstart(0)
    , vend(0)
    {
    }
    virtual ~CParserData() {}

public:
    DataType type;   // kind of item
    size_t start;    // first character of the item
    size_t end;      // end position of the item
    size_t vstart;   // start of the text attached to a DT_TAG_VALUE tag
    size_t vend;     // end of the text attached to a DT_TAG_VALUE tag
};
79
80class CDocumentObject
81{
82protected:
83 static const wchar_t TAG_LT = L'<';
84 static const wchar_t TAG_GT = L'>';
85 static const wchar_t TAG_SLASH = L'/';
86 static const wchar_t TAG_BSLASH = L'\\';
87 static const wchar_t TAG_AND = L'&';
88
89 typedef std::vector<CParserData> TDataVector;
90
91 typedef std::stack<CParserData> TDataStack;
92 struct TNodeData
93 {
94 size_t level;
95 CParserData tag;
96 CParserData value;
97// CParserData end;
98 };
99 typedef std::deque<TNodeData> TNodeQueue;
100public:
101 typedef std::stack<const CElementObject*> TElementStack;
102public:
103 CDocumentObject();
104 virtual ~CDocumentObject();
105
106 int Load(const std::wstring& str);
107
108 const CElementObject* Root() const;
109
110 const CElementObject* FindFirstElement(const std::wstring& tag);
111 const CElementObject* FindNextElement();
112
113 const CElementObject* FindFirstElement(const CElementObject* element, const std::wstring& tag, TElementStack& tmpstack);
114 const CElementObject* FindNextElement(const CElementObject* element, const std::wstring& tag, TElementStack& tmpstack);
115
116 const CAttributeObject* FindAttribute(const CElementObject* element, const std::wstring& attr);
117
118 void Show(std::wostream& os) const;
119protected:
120 int PreProcess(const std::wstring& str, std::wstring& html);
121 int PreParser(const std::wstring& html, TNodeQueue& vct);
122 int Parser(const std::wstring& html, TNodeQueue& que);
123private:
124 int PreParserLT(const std::wstring& html, std::wstring::size_type& pos, CParserData& data);
125 int PushValueData(const CParserData& data, TDataStack& datastack) const;
126 int PushTagData(const std::wstring& html, const CParserData& data, TDataStack& datatstack, TNodeQueue& nodeque) const;
127
128 int CheckSpecialTag(const std::wstring& html, const CParserData& data) const;
129 int CheckTag(const std::wstring& html, const CParserData& tag, const CParserData& end) const;
130 CElementObject* MakeElement(const std::wstring& html, const TNodeData& node, CElementObject* parent, CElementObject* sibling) const;
131
132 void CDocumentObject::ShowElement(std::wostream& os, const CElementObject* e) const;
133
134 void FreeElement(CElementObject* root);
135
136 const CElementObject* FindElement(const CElementObject* root, const CElementObject* pe, const std::wstring& tag, TElementStack& stack);
137private:
138 CElementObject* _root;
139private:
140 std::wstring _findtag;
141 TElementStack _findstack;
142};
143
144}
145
146 #endif
147
1
2 #include " TinyHtmlParser.h "
3
4 namespace TinyHtmlParser
5 {
6
7void CAttributeObject::Show(std::wostream& os) const
8{
9 os << " attr : " << this->attr << " -- value = " << this->value << std::endl;
10}
11
12CElementObject::CElementObject()
13: type(ET_UNKNOWN)
14, level(0)
15, parent(NULL)
16, child(NULL)
17, sibling(NULL)
18, attrib(NULL)
19{
20}
21
22CElementObject::~CElementObject()
23{
24 FreeAnalyseAttribute();
25}
26
27int CElementObject::Analyse()
28{
29 std::wstring str = tag;
30
31 std::wstring::size_type pos = str.find(L" ");
32 if(pos != std::wstring::npos)
33 {
34 tag = str.substr(0, pos);
35
36 str = str.substr(pos + 1);
37 if(AnalyseAttribute(str) != 0)
38 {
39 return -1;
40 }
41 }
42 if(type == ET_ELEMENT)
43 {
44 if(AnalyseValue() != 0)
45 return -1;
46 }
47 return 0;
48}
49
50int CElementObject::AnalyseAttribute(const std::wstring& attr)
51{
52 if(attr.size() == 0)
53 return 0;
54
55 std::wstring a, v;
56 std::wstring::size_type pos = attr.find(L"="), start = 0;
57 while(pos != std::wstring::npos)
58 {
59 a = attr.substr(start, pos - start);
60 if(pos == attr.size() - 1)
61 return -1;
62 start = pos + 1;
63 if(attr[pos + 1] == L'\"')
64 {
65 pos = attr.find(L"\"", start + 1);
66 if(pos == std::wstring::npos)
67 return -1;
68 v = attr.substr(start, pos - start + 1);
69 start = pos + 2;
70 }
71 else
72 {
73 pos = attr.find(L" ", start);
74 if(pos == std::wstring::npos)
75 pos = attr.size();
76 v = attr.substr(start, pos - start);
77 start = pos + 1;
78 }
79 if(MakeAttribute(a, v) != 0)
80 return -1;
81
82 if(start >= attr.size())
83 break;
84
85 pos = attr.find(L"=", start);
86 }
87 return 0;
88}
89
90int CElementObject::MakeAttribute(const std::wstring &attr)
91{
92 std::wstring::size_type pos = attr.find(L"=");
93 if(pos == std::wstring::npos)
94 return -1;
95
96 return MakeAttribute(attr.substr(0, pos), attr.substr(pos));
97}
98
99int CElementObject::MakeAttribute(const std::wstring &attr, const std::wstring& value)
100{
101 std::auto_ptr<CAttributeObject> obj(new CAttributeObject(attr, value));//attr.substr(0, pos), attr.substr(pos)));
102
103 if(attrib != NULL)
104 {
105 CAttributeObject* tmp = attrib;
106 while(tmp->next != NULL)
107 tmp = tmp->next;
108 tmp->next = obj.release();
109 }
110 else
111 {
112 attrib = obj.release();
113 }
114 return 0;
115}
116
117
118void CElementObject::FreeAnalyseAttribute()
119{
120 CAttributeObject* tmp = attrib;
121 while(attrib != NULL)
122 {
123 tmp = attrib->next;
124 delete attrib;
125 attrib = tmp;
126 }
127
128}
129
130int CElementObject::AnalyseValue()
131{
132 std::wstring::size_type pos = this->value.find(L" ");
133 while(pos != std::wstring::npos)
134 {
135 this->value.replace(pos, 6, L" ");
136 pos = this->value.find(L" ", pos + 1);
137 }
138
139 return 0;
140}
141
142const CAttributeObject* CElementObject::FindAttribute(const std::wstring& attr) const
143{
144 const CAttributeObject* pa = this->attrib;
145 while(pa != NULL)
146 {
147 if(pa->attr == attr)
148 return pa;
149 pa = pa->next;
150 }
151 return pa;
152}
153
154void CElementObject::Show(std::wostream& os) const
155{
156 os << "[" << this->level << "]" << "Tag : " << this->tag;
157 if(this->type == ET_ELEMENT)
158 os << " -- value = " << this->value;
159 os << std::endl;
160
161 const CAttributeObject* attr = this->attrib;
162 while(attr != NULL)
163 {
164 attr->Show(os);
165 attr = attr->next;
166 }
167 os << std::endl;
168}
169//
170
171CDocumentObject::CDocumentObject()
172: _root(NULL)
173{
174}
175
176CDocumentObject::~CDocumentObject()
177{
178 if(_root != NULL)
179 FreeElement(_root);
180}
181
182int CDocumentObject::Load(const std::wstring &str)
183{
184 std::wstring html;
185 if(PreProcess(str, html) != 0)
186 return -1;
187 TNodeQueue que;
188 if(PreParser(html, que) != 0)
189 return -1;
190 if(Parser(html, que) != 0)
191 return -1;
192 return 0;
193}
194
195int CDocumentObject::PreProcess(const std::wstring& str, std::wstring& html)
196{
197 bool tag = false;
198 for(std::wstring::const_iterator it = str.begin(); it != str.end(); ++ it)
199 {
200 if(*it == TAG_LT)
201 {
202 if(tag == true)
203 return -1;
204 tag = true;
205 }
206 else if(*it == TAG_GT)
207 {
208 if(tag == false)
209 return -1;
210 tag = false;
211 }
212 else
213 {
214 if(tag == false)
215 {
216 if(isspace((unsigned char)*it) != 0)
217 continue;
218 }
219 }
220 html += *it;
221 }
222
223 return 0;
224}
225
226int CDocumentObject::PreParser(const std::wstring& html, CDocumentObject::TNodeQueue& que)
227{
228 std::wstring::size_type pos = 0;
229
230 if(html.size() == 0)
231 return -1;
232 if(html[pos] != TAG_LT)
233 return -1;
234
235 TDataStack datastack;
236
237 CParserData data;
238
239 while(pos < html.size())
240 {
241 if(html[pos] == TAG_LT)
242 {
243 if(pos > data.start)
244 {
245 data.type = CParserData::DT_VALUE;
246 data.end = pos;
247
248// std::cout << "VALUE - " << html.substr(data.start, data.end - data.start) << std::endl;
249 if(PushValueData(data, datastack) != 0)
250 return -1;
251 }
252
253 if(PreParserLT(html, pos, data) != 0)
254 return -1;
255// std::cout << "TAG - " << html.substr(data.start, data.end - data.start) << std::endl;
256 if(PushTagData(html, data, datastack, que) != 0)
257 return -1;
258
259 ++ pos;
260 data.start = pos;
261 }
262 //else if(html[pos] == TAG_GT || html[pos] == TAG_SLASH)
263 //{
264 // return -1;
265 //}
266 else
267 {
268 ++ pos;
269 }
270// std::cout << (char)html[pos] << std::endl;
271 }
272
273 return 0;
274}
275
276int CDocumentObject::Parser(const std::wstring& html, CDocumentObject::TNodeQueue& que)
277{
278 CElementObject *pe = NULL, *pp = NULL, *ps = NULL;
279 size_t level = 0;
280 while(que.size()> 0)
281 {
282 const TNodeData &node = que.front();
283 if(level < que.front().level)
284 {
285 pp = pe;
286 ps = NULL;
287 }
288 else if(level == que.front().level)
289 {
290 ps = pe;
291 }
292 else//>
293 {
294 ps = pe;
295 pp = pe->parent;
296 int t = level - que.front().level;
297 while(t > 0)
298 {
299 ps = ps->parent;
300 pp = pp->parent;
301 -- t;
302 }
303 }
304 level = que.front().level;
305
306 pe = MakeElement(html, que.front(), pp, ps);
307
308 if(pe == NULL)
309 return -1;
310
311 que.pop_front();
312 }
313
314 if(pp != NULL)
315 {
316 while(pp->parent != NULL)
317 pp = pp->parent;
318 _root = pp;
319 }
320 else
321 _root = pe;
322
323 return 0;
324}
325
326int CDocumentObject::PreParserLT(const std::wstring& html, std::wstring::size_type& pos, CParserData& data)
327{
328 if(pos == html.size() - 1)
329 return -1;
330
331 data.start = pos;
332
333 ++ pos;
334
335 if(html[pos] != TAG_SLASH)
336 {
337 data.type = CParserData::DT_TAG;
338 }
339 else
340 {
341 data.type = CParserData::DT_END;
342 ++ pos;
343 }
344
345 while(pos < html.size())
346 {
347 if(html[pos] == TAG_GT)
348 {
349 if(html[pos - 1] == TAG_SLASH)
350 {
351 data.type = CParserData::DT_DONE;
352 }
353
354 data.end = pos;
355
356 return 0;
357 }
358 else if(html[pos] == TAG_LT)
359 {
360 return -1;
361 }
362
363 ++ pos;
364 }
365
366 return -1;
367}
368
369int CDocumentObject::PushValueData(const TinyHtmlParser::CParserData &data, CDocumentObject::TDataStack &datastack) const
370{
371 if(datastack.size() == 0)
372 return -1;
373 datastack.push(data);
374 return 0;
375}
376
377int CDocumentObject::PushTagData(const std::wstring& html, const CParserData& data, CDocumentObject::TDataStack& datastack, CDocumentObject::TNodeQueue& nodeque) const
378{
379 if(data.type == CParserData::DT_TAG)
380 {
381 if(CheckSpecialTag(html, data) == 0)
382 {
383 TNodeData node;
384 node.tag = data;
385
386 node.level = datastack.size();
387 nodeque.push_front(node);
388 return 0;
389 }
390
391 if(datastack.size() > 0 && datastack.top().type == CParserData::DT_VALUE)
392 {
393 CParserData data = datastack.top();
394 datastack.pop();
395 if(datastack.top().type != CParserData::DT_TAG)
396 return -1;
397 datastack.top().type = CParserData::DT_TAG_VALUE;
398 datastack.top().vstart = data.start;
399 datastack.top().vend = data.end;
400 }
401
402 datastack.push(data);
403 }
404 else if(data.type == CParserData::DT_END)
405 {
406 if(datastack.size() == 0)
407 return -1;
408
409 TNodeData node;
410 if(datastack.top().type == CParserData::DT_TAG || datastack.top().type == CParserData::DT_TAG_VALUE)
411 {
412 node.tag = datastack.top();
413 datastack.pop();
414 }
415 else if(datastack.top().type == CParserData::DT_VALUE)
416 {
417 node.value = datastack.top();
418
419// std::cout << "value - " << html.substr(node.value.start, node.value.end - node.value.start) << std::endl;
420
421 datastack.pop();
422
423 if(datastack.size() == 0)
424 return -1;
425
426 if(datastack.top().type == CParserData::DT_TAG)
427 {
428 node.tag = datastack.top();
429 }
430 else if(datastack.top().type == CParserData::DT_TAG_VALUE)
431 {
432 node.tag = datastack.top();
433 }
434 else
435 {
436 return -1;
437 }
438
439 //node.tag = datastack.top();
440 //else if(datastack.top().type == CParserData::DT_TAG_VALUE)
441 //{
442 // node.tag = datastack.top();
443 //}
444 datastack.pop();
445 }
446 else
447 {
448// std::cout << "type : " << datastack.top().type << std::endl;
449 return -1;
450 }
451
452 if(CheckTag(html, node.tag, data) != 0)
453 return -1;
454
455 node.level = datastack.size();
456 nodeque.push_front(node);
457 }
458 else if(data.type == CParserData::DT_DONE)
459 {
460 if(datastack.size() > 0 && datastack.top().type == CParserData::DT_VALUE)
461 {
462 CParserData data = datastack.top();
463 datastack.pop();
464 if(datastack.top().type != CParserData::DT_TAG)
465 return -1;
466 datastack.top().type = CParserData::DT_TAG_VALUE;
467 datastack.top().vstart = data.start;
468 datastack.top().vend = data.end;
469 }
470
471// datastack.push(data);
472
473 TNodeData node;
474 node.tag = data;
475
476 node.level = datastack.size();
477 nodeque.push_front(node);
478 }
479 else
480 {
481 return -1;
482 }
483 return 0;
484}
485
486int CDocumentObject::CheckSpecialTag(const std::wstring& html, const CParserData& data) const
487{
488 std::wstring tag = html.substr(data.start + 1, data.end - data.start - 1);
489 std::wstring::size_type pos = tag.find(L" ");
490 if(pos != std::wstring::npos)
491 tag = tag.substr(0, pos);
492
493 if(tag == L"IMG")
494 return 0;
495
496 return -1;
497}
498
499int CDocumentObject::CheckTag(const std::wstring& html, const CParserData& tag, const CParserData& end) const
500{
501 std::wstring str = html.substr(tag.start + 1, tag.end - tag.start - 1);
502 std::wstring::size_type pos = str.find(L" ");
503 if(pos != std::wstring::npos)
504 str = str.substr(0, pos);
505
506 if(str != html.substr(end.start + 2, end.end - end.start - 2))
507 {
508// std::cout << "tag : " << str << " -- end : " << html.substr(end.start + 2, end.end - end.start - 2) << std::endl;
509 return -1;
510 }
511 return 0;
512}
513
514CElementObject* CDocumentObject::MakeElement(const std::wstring& html, const CDocumentObject::TNodeData &node, CElementObject *parent, CElementObject *sibling) const
515{
516 std::auto_ptr<CElementObject> ele(new CElementObject);
517
518 ele->level = node.level;
519
520 if(node.tag.type == CParserData::DT_TAG)
521 {
522 ele->type = ET_NODE;
523 ele->tag = html.substr(node.tag.start + 1, node.tag.end - node.tag.start - 1);
524 }
525 else if(node.tag.type == CParserData::DT_DONE)
526 {
527 ele->type = ET_TAG;
528 ele->tag = html.substr(node.tag.start + 1, node.tag.end - node.tag.start - 2);
529 }
530 else if(node.tag.type == CParserData::DT_TAG_VALUE)
531 {
532 ele->tag = ET_NODE;
533 ele->tag = html.substr(node.tag.start + 1, node.tag.end - node.tag.start - 1);
534 }
535 else
536 return NULL;
537
538 if(node.value.type == CParserData::DT_VALUE)
539 {
540 ele->type = ET_ELEMENT;
541 if(node.tag.type == CParserData::DT_TAG)
542 ele->value = html.substr(node.value.start, node.value.end - node.value.start);
543 else
544 ele->value = html.substr(node.tag.vstart, node.tag.vend - node.tag.vstart) + L"%" + html.substr(node.value.start, node.value.end - node.value.start);
545 }
546
547 if(ele->Analyse() != 0)
548 {
549 return NULL;
550 }
551
552 if(parent != NULL)
553 parent->child = ele.get();
554 ele->parent = parent;
555 ele->sibling = sibling;
556
557 //std::cout << "element: tag - " << ele->tag << std::endl;
558
559 return ele.release();
560}
561
562void CDocumentObject::Show(std::wostream &os) const
563{
564 if(_root != NULL)
565 ShowElement(os, _root);
566}
567
568void CDocumentObject::ShowElement(std::wostream& os, const CElementObject* e) const
569{
570 const CElementObject* pe = e, *ps = e->sibling;
571
572 pe->Show(os);
573
574 pe = pe->child;
575 if(pe != NULL)
576 {
577 ShowElement(os, pe);
578 }
579 if(ps != NULL)
580 {
581 ShowElement(os, ps);
582 }
583}
584
585void CDocumentObject::FreeElement(CElementObject* root)
586{
587 CElementObject* pe = root->child, *ps = root->sibling;
588
589// std::cout << "free:" << root->tag << std::endl;
590
591 if(root != NULL)
592 {
593 free(root);
594 root = NULL;
595 }
596
597 if(pe != NULL)
598 {
599 FreeElement(pe);
600 }
601 if(ps != NULL)
602 {
603 FreeElement(ps);
604 }
605}
606
607const CElementObject* CDocumentObject::FindFirstElement(const std::wstring &tag)
608{
609 if(_root == NULL)
610 return NULL;
611
612 _findtag = tag;
613 while(!_findstack.empty())
614 _findstack.pop();
615
616 return FindElement(NULL, _root, _findtag, _findstack);
617}
618
619const CElementObject* CDocumentObject::FindNextElement()
620{
621 if(_findstack.empty())
622 return NULL;
623
624 return FindElement(NULL, _findstack.top()->child, _findtag, _findstack);
625}
626
627const CElementObject* CDocumentObject::FindFirstElement(const CElementObject* element, const std::wstring& tag, TElementStack& tmpstack)
628{
629 if(element == NULL)
630 return NULL;
631
632 while(!tmpstack.empty())
633 tmpstack.pop();
634
635 return FindElement(element, element, tag, tmpstack);
636}
637
638const CElementObject* CDocumentObject::FindNextElement(const CElementObject* element, const std::wstring& tag, TElementStack& tmpstack)
639{
640 if(tmpstack.empty())
641 return NULL;
642
643 return FindElement(element, tmpstack.top()->child, tag, tmpstack);
644}
645
646const CElementObject* CDocumentObject::FindElement(const CElementObject* root, const CElementObject* pe, const std::wstring& tag, TElementStack& stack)
647{
648 while(pe != NULL)
649 {
650 stack.push(pe);
651 if(pe->tag == tag)
652 return pe;
653 pe = pe->child;
654 }
655
656 while(!stack.empty() && stack.top() != root && pe == NULL)
657 {
658 pe = stack.top()->sibling;
659 stack.pop();
660 }
661
662 if(pe == NULL)
663 return NULL;
664
665 return FindElement(root, pe, tag, stack);
666}
667
668const CAttributeObject* CDocumentObject::FindAttribute(const TinyHtmlParser::CElementObject *element, const std::wstring &attr)
669{
670 if(element == NULL)
671 return NULL;
672
673 const CAttributeObject* pa = element->attrib;
674 while(pa != NULL)
675 {
676 if(pa->attr == attr)
677 return pa;
678 pa = pa->next;
679 }
680 return pa;
681}
682
683}
2 #include " TinyHtmlParser.h "
3
4 namespace TinyHtmlParser
5 {
6
7void CAttributeObject::Show(std::wostream& os) const
8{
9 os << " attr : " << this->attr << " -- value = " << this->value << std::endl;
10}
11
12CElementObject::CElementObject()
13: type(ET_UNKNOWN)
14, level(0)
15, parent(NULL)
16, child(NULL)
17, sibling(NULL)
18, attrib(NULL)
19{
20}
21
22CElementObject::~CElementObject()
23{
24 FreeAnalyseAttribute();
25}
26
27int CElementObject::Analyse()
28{
29 std::wstring str = tag;
30
31 std::wstring::size_type pos = str.find(L" ");
32 if(pos != std::wstring::npos)
33 {
34 tag = str.substr(0, pos);
35
36 str = str.substr(pos + 1);
37 if(AnalyseAttribute(str) != 0)
38 {
39 return -1;
40 }
41 }
42 if(type == ET_ELEMENT)
43 {
44 if(AnalyseValue() != 0)
45 return -1;
46 }
47 return 0;
48}
49
50int CElementObject::AnalyseAttribute(const std::wstring& attr)
51{
52 if(attr.size() == 0)
53 return 0;
54
55 std::wstring a, v;
56 std::wstring::size_type pos = attr.find(L"="), start = 0;
57 while(pos != std::wstring::npos)
58 {
59 a = attr.substr(start, pos - start);
60 if(pos == attr.size() - 1)
61 return -1;
62 start = pos + 1;
63 if(attr[pos + 1] == L'\"')
64 {
65 pos = attr.find(L"\"", start + 1);
66 if(pos == std::wstring::npos)
67 return -1;
68 v = attr.substr(start, pos - start + 1);
69 start = pos + 2;
70 }
71 else
72 {
73 pos = attr.find(L" ", start);
74 if(pos == std::wstring::npos)
75 pos = attr.size();
76 v = attr.substr(start, pos - start);
77 start = pos + 1;
78 }
79 if(MakeAttribute(a, v) != 0)
80 return -1;
81
82 if(start >= attr.size())
83 break;
84
85 pos = attr.find(L"=", start);
86 }
87 return 0;
88}
89
90int CElementObject::MakeAttribute(const std::wstring &attr)
91{
92 std::wstring::size_type pos = attr.find(L"=");
93 if(pos == std::wstring::npos)
94 return -1;
95
96 return MakeAttribute(attr.substr(0, pos), attr.substr(pos));
97}
98
99int CElementObject::MakeAttribute(const std::wstring &attr, const std::wstring& value)
100{
101 std::auto_ptr<CAttributeObject> obj(new CAttributeObject(attr, value));//attr.substr(0, pos), attr.substr(pos)));
102
103 if(attrib != NULL)
104 {
105 CAttributeObject* tmp = attrib;
106 while(tmp->next != NULL)
107 tmp = tmp->next;
108 tmp->next = obj.release();
109 }
110 else
111 {
112 attrib = obj.release();
113 }
114 return 0;
115}
116
117
118void CElementObject::FreeAnalyseAttribute()
119{
120 CAttributeObject* tmp = attrib;
121 while(attrib != NULL)
122 {
123 tmp = attrib->next;
124 delete attrib;
125 attrib = tmp;
126 }
127
128}
129
130int CElementObject::AnalyseValue()
131{
132 std::wstring::size_type pos = this->value.find(L" ");
133 while(pos != std::wstring::npos)
134 {
135 this->value.replace(pos, 6, L" ");
136 pos = this->value.find(L" ", pos + 1);
137 }
138
139 return 0;
140}
141
142const CAttributeObject* CElementObject::FindAttribute(const std::wstring& attr) const
143{
144 const CAttributeObject* pa = this->attrib;
145 while(pa != NULL)
146 {
147 if(pa->attr == attr)
148 return pa;
149 pa = pa->next;
150 }
151 return pa;
152}
153
154void CElementObject::Show(std::wostream& os) const
155{
156 os << "[" << this->level << "]" << "Tag : " << this->tag;
157 if(this->type == ET_ELEMENT)
158 os << " -- value = " << this->value;
159 os << std::endl;
160
161 const CAttributeObject* attr = this->attrib;
162 while(attr != NULL)
163 {
164 attr->Show(os);
165 attr = attr->next;
166 }
167 os << std::endl;
168}
169//
170
171CDocumentObject::CDocumentObject()
172: _root(NULL)
173{
174}
175
176CDocumentObject::~CDocumentObject()
177{
178 if(_root != NULL)
179 FreeElement(_root);
180}
181
182int CDocumentObject::Load(const std::wstring &str)
183{
184 std::wstring html;
185 if(PreProcess(str, html) != 0)
186 return -1;
187 TNodeQueue que;
188 if(PreParser(html, que) != 0)
189 return -1;
190 if(Parser(html, que) != 0)
191 return -1;
192 return 0;
193}
194
195int CDocumentObject::PreProcess(const std::wstring& str, std::wstring& html)
196{
197 bool tag = false;
198 for(std::wstring::const_iterator it = str.begin(); it != str.end(); ++ it)
199 {
200 if(*it == TAG_LT)
201 {
202 if(tag == true)
203 return -1;
204 tag = true;
205 }
206 else if(*it == TAG_GT)
207 {
208 if(tag == false)
209 return -1;
210 tag = false;
211 }
212 else
213 {
214 if(tag == false)
215 {
216 if(isspace((unsigned char)*it) != 0)
217 continue;
218 }
219 }
220 html += *it;
221 }
222
223 return 0;
224}
225
226int CDocumentObject::PreParser(const std::wstring& html, CDocumentObject::TNodeQueue& que)
227{
228 std::wstring::size_type pos = 0;
229
230 if(html.size() == 0)
231 return -1;
232 if(html[pos] != TAG_LT)
233 return -1;
234
235 TDataStack datastack;
236
237 CParserData data;
238
239 while(pos < html.size())
240 {
241 if(html[pos] == TAG_LT)
242 {
243 if(pos > data.start)
244 {
245 data.type = CParserData::DT_VALUE;
246 data.end = pos;
247
248// std::cout << "VALUE - " << html.substr(data.start, data.end - data.start) << std::endl;
249 if(PushValueData(data, datastack) != 0)
250 return -1;
251 }
252
253 if(PreParserLT(html, pos, data) != 0)
254 return -1;
255// std::cout << "TAG - " << html.substr(data.start, data.end - data.start) << std::endl;
256 if(PushTagData(html, data, datastack, que) != 0)
257 return -1;
258
259 ++ pos;
260 data.start = pos;
261 }
262 //else if(html[pos] == TAG_GT || html[pos] == TAG_SLASH)
263 //{
264 // return -1;
265 //}
266 else
267 {
268 ++ pos;
269 }
270// std::cout << (char)html[pos] << std::endl;
271 }
272
273 return 0;
274}
275
276int CDocumentObject::Parser(const std::wstring& html, CDocumentObject::TNodeQueue& que)
277{
278 CElementObject *pe = NULL, *pp = NULL, *ps = NULL;
279 size_t level = 0;
280 while(que.size()> 0)
281 {
282 const TNodeData &node = que.front();
283 if(level < que.front().level)
284 {
285 pp = pe;
286 ps = NULL;
287 }
288 else if(level == que.front().level)
289 {
290 ps = pe;
291 }
292 else//>
293 {
294 ps = pe;
295 pp = pe->parent;
296 int t = level - que.front().level;
297 while(t > 0)
298 {
299 ps = ps->parent;
300 pp = pp->parent;
301 -- t;
302 }
303 }
304 level = que.front().level;
305
306 pe = MakeElement(html, que.front(), pp, ps);
307
308 if(pe == NULL)
309 return -1;
310
311 que.pop_front();
312 }
313
314 if(pp != NULL)
315 {
316 while(pp->parent != NULL)
317 pp = pp->parent;
318 _root = pp;
319 }
320 else
321 _root = pe;
322
323 return 0;
324}
325
326int CDocumentObject::PreParserLT(const std::wstring& html, std::wstring::size_type& pos, CParserData& data)
327{
328 if(pos == html.size() - 1)
329 return -1;
330
331 data.start = pos;
332
333 ++ pos;
334
335 if(html[pos] != TAG_SLASH)
336 {
337 data.type = CParserData::DT_TAG;
338 }
339 else
340 {
341 data.type = CParserData::DT_END;
342 ++ pos;
343 }
344
345 while(pos < html.size())
346 {
347 if(html[pos] == TAG_GT)
348 {
349 if(html[pos - 1] == TAG_SLASH)
350 {
351 data.type = CParserData::DT_DONE;
352 }
353
354 data.end = pos;
355
356 return 0;
357 }
358 else if(html[pos] == TAG_LT)
359 {
360 return -1;
361 }
362
363 ++ pos;
364 }
365
366 return -1;
367}
368
369int CDocumentObject::PushValueData(const TinyHtmlParser::CParserData &data, CDocumentObject::TDataStack &datastack) const
370{
371 if(datastack.size() == 0)
372 return -1;
373 datastack.push(data);
374 return 0;
375}
376
377int CDocumentObject::PushTagData(const std::wstring& html, const CParserData& data, CDocumentObject::TDataStack& datastack, CDocumentObject::TNodeQueue& nodeque) const
378{
379 if(data.type == CParserData::DT_TAG)
380 {
381 if(CheckSpecialTag(html, data) == 0)
382 {
383 TNodeData node;
384 node.tag = data;
385
386 node.level = datastack.size();
387 nodeque.push_front(node);
388 return 0;
389 }
390
391 if(datastack.size() > 0 && datastack.top().type == CParserData::DT_VALUE)
392 {
393 CParserData data = datastack.top();
394 datastack.pop();
395 if(datastack.top().type != CParserData::DT_TAG)
396 return -1;
397 datastack.top().type = CParserData::DT_TAG_VALUE;
398 datastack.top().vstart = data.start;
399 datastack.top().vend = data.end;
400 }
401
402 datastack.push(data);
403 }
404 else if(data.type == CParserData::DT_END)
405 {
406 if(datastack.size() == 0)
407 return -1;
408
409 TNodeData node;
410 if(datastack.top().type == CParserData::DT_TAG || datastack.top().type == CParserData::DT_TAG_VALUE)
411 {
412 node.tag = datastack.top();
413 datastack.pop();
414 }
415 else if(datastack.top().type == CParserData::DT_VALUE)
416 {
417 node.value = datastack.top();
418
419// std::cout << "value - " << html.substr(node.value.start, node.value.end - node.value.start) << std::endl;
420
421 datastack.pop();
422
423 if(datastack.size() == 0)
424 return -1;
425
426 if(datastack.top().type == CParserData::DT_TAG)
427 {
428 node.tag = datastack.top();
429 }
430 else if(datastack.top().type == CParserData::DT_TAG_VALUE)
431 {
432 node.tag = datastack.top();
433 }
434 else
435 {
436 return -1;
437 }
438
439 //node.tag = datastack.top();
440 //else if(datastack.top().type == CParserData::DT_TAG_VALUE)
441 //{
442 // node.tag = datastack.top();
443 //}
444 datastack.pop();
445 }
446 else
447 {
448// std::cout << "type : " << datastack.top().type << std::endl;
449 return -1;
450 }
451
452 if(CheckTag(html, node.tag, data) != 0)
453 return -1;
454
455 node.level = datastack.size();
456 nodeque.push_front(node);
457 }
458 else if(data.type == CParserData::DT_DONE)
459 {
460 if(datastack.size() > 0 && datastack.top().type == CParserData::DT_VALUE)
461 {
462 CParserData data = datastack.top();
463 datastack.pop();
464 if(datastack.top().type != CParserData::DT_TAG)
465 return -1;
466 datastack.top().type = CParserData::DT_TAG_VALUE;
467 datastack.top().vstart = data.start;
468 datastack.top().vend = data.end;
469 }
470
471// datastack.push(data);
472
473 TNodeData node;
474 node.tag = data;
475
476 node.level = datastack.size();
477 nodeque.push_front(node);
478 }
479 else
480 {
481 return -1;
482 }
483 return 0;
484}
485
486int CDocumentObject::CheckSpecialTag(const std::wstring& html, const CParserData& data) const
487{
488 std::wstring tag = html.substr(data.start + 1, data.end - data.start - 1);
489 std::wstring::size_type pos = tag.find(L" ");
490 if(pos != std::wstring::npos)
491 tag = tag.substr(0, pos);
492
493 if(tag == L"IMG")
494 return 0;
495
496 return -1;
497}
498
499int CDocumentObject::CheckTag(const std::wstring& html, const CParserData& tag, const CParserData& end) const
500{
501 std::wstring str = html.substr(tag.start + 1, tag.end - tag.start - 1);
502 std::wstring::size_type pos = str.find(L" ");
503 if(pos != std::wstring::npos)
504 str = str.substr(0, pos);
505
506 if(str != html.substr(end.start + 2, end.end - end.start - 2))
507 {
508// std::cout << "tag : " << str << " -- end : " << html.substr(end.start + 2, end.end - end.start - 2) << std::endl;
509 return -1;
510 }
511 return 0;
512}
513
514CElementObject* CDocumentObject::MakeElement(const std::wstring& html, const CDocumentObject::TNodeData &node, CElementObject *parent, CElementObject *sibling) const
515{
516 std::auto_ptr<CElementObject> ele(new CElementObject);
517
518 ele->level = node.level;
519
520 if(node.tag.type == CParserData::DT_TAG)
521 {
522 ele->type = ET_NODE;
523 ele->tag = html.substr(node.tag.start + 1, node.tag.end - node.tag.start - 1);
524 }
525 else if(node.tag.type == CParserData::DT_DONE)
526 {
527 ele->type = ET_TAG;
528 ele->tag = html.substr(node.tag.start + 1, node.tag.end - node.tag.start - 2);
529 }
530 else if(node.tag.type == CParserData::DT_TAG_VALUE)
531 {
532 ele->tag = ET_NODE;
533 ele->tag = html.substr(node.tag.start + 1, node.tag.end - node.tag.start - 1);
534 }
535 else
536 return NULL;
537
538 if(node.value.type == CParserData::DT_VALUE)
539 {
540 ele->type = ET_ELEMENT;
541 if(node.tag.type == CParserData::DT_TAG)
542 ele->value = html.substr(node.value.start, node.value.end - node.value.start);
543 else
544 ele->value = html.substr(node.tag.vstart, node.tag.vend - node.tag.vstart) + L"%" + html.substr(node.value.start, node.value.end - node.value.start);
545 }
546
547 if(ele->Analyse() != 0)
548 {
549 return NULL;
550 }
551
552 if(parent != NULL)
553 parent->child = ele.get();
554 ele->parent = parent;
555 ele->sibling = sibling;
556
557 //std::cout << "element: tag - " << ele->tag << std::endl;
558
559 return ele.release();
560}
561
562void CDocumentObject::Show(std::wostream &os) const
563{
564 if(_root != NULL)
565 ShowElement(os, _root);
566}
567
568void CDocumentObject::ShowElement(std::wostream& os, const CElementObject* e) const
569{
570 const CElementObject* pe = e, *ps = e->sibling;
571
572 pe->Show(os);
573
574 pe = pe->child;
575 if(pe != NULL)
576 {
577 ShowElement(os, pe);
578 }
579 if(ps != NULL)
580 {
581 ShowElement(os, ps);
582 }
583}
584
585void CDocumentObject::FreeElement(CElementObject* root)
586{
587 CElementObject* pe = root->child, *ps = root->sibling;
588
589// std::cout << "free:" << root->tag << std::endl;
590
591 if(root != NULL)
592 {
593 free(root);
594 root = NULL;
595 }
596
597 if(pe != NULL)
598 {
599 FreeElement(pe);
600 }
601 if(ps != NULL)
602 {
603 FreeElement(ps);
604 }
605}
606
607const CElementObject* CDocumentObject::FindFirstElement(const std::wstring &tag)
608{
609 if(_root == NULL)
610 return NULL;
611
612 _findtag = tag;
613 while(!_findstack.empty())
614 _findstack.pop();
615
616 return FindElement(NULL, _root, _findtag, _findstack);
617}
618
619const CElementObject* CDocumentObject::FindNextElement()
620{
621 if(_findstack.empty())
622 return NULL;
623
624 return FindElement(NULL, _findstack.top()->child, _findtag, _findstack);
625}
626
627const CElementObject* CDocumentObject::FindFirstElement(const CElementObject* element, const std::wstring& tag, TElementStack& tmpstack)
628{
629 if(element == NULL)
630 return NULL;
631
632 while(!tmpstack.empty())
633 tmpstack.pop();
634
635 return FindElement(element, element, tag, tmpstack);
636}
637
638const CElementObject* CDocumentObject::FindNextElement(const CElementObject* element, const std::wstring& tag, TElementStack& tmpstack)
639{
640 if(tmpstack.empty())
641 return NULL;
642
643 return FindElement(element, tmpstack.top()->child, tag, tmpstack);
644}
645
646const CElementObject* CDocumentObject::FindElement(const CElementObject* root, const CElementObject* pe, const std::wstring& tag, TElementStack& stack)
647{
648 while(pe != NULL)
649 {
650 stack.push(pe);
651 if(pe->tag == tag)
652 return pe;
653 pe = pe->child;
654 }
655
656 while(!stack.empty() && stack.top() != root && pe == NULL)
657 {
658 pe = stack.top()->sibling;
659 stack.pop();
660 }
661
662 if(pe == NULL)
663 return NULL;
664
665 return FindElement(root, pe, tag, stack);
666}
667
668const CAttributeObject* CDocumentObject::FindAttribute(const TinyHtmlParser::CElementObject *element, const std::wstring &attr)
669{
670 if(element == NULL)
671 return NULL;
672
673 const CAttributeObject* pa = element->attrib;
674 while(pa != NULL)
675 {
676 if(pa->attr == attr)
677 return pa;
678 pa = pa->next;
679 }
680 return pa;
681}
682
683}