xml,json都有大量的库来解析,我们如何解析html呢?
TFHpple是一个小型的封装,可以用来解析html,它是对libxml的封装,语法是xpath。
今天我看到一篇直接用libxml来解析html的文章,参看:http://www.cocoanetics.com/2011/09/taming-html-parsing-with-libxml-1/#comment-3090 文中那张图画得一目了然,很值得收藏。这篇文章中的源码不能遍历所有的html,我做了一点修改,可以将整个html遍历并打印出来:
// Parses an HTML document with libxml2 and logs every node in document order.
//
// Inputs expected from the surrounding scope:
//   data     - NSData containing the document data
//   encoding - the NSStringEncoding of the data
//   baseURL  - the document's base URL, i.e. its location

// Translate the Cocoa encoding into an IANA charset name for libxml.
// Both CF calls may yield NULL; NULL is an accepted value for the parser's
// encoding argument, so it is passed through instead of being dereferenced.
CFStringEncoding cfenc = CFStringConvertNSStringEncodingToEncoding(encoding);
CFStringRef cfencstr = CFStringConvertEncodingToIANACharSetName(cfenc);
const char *enc = cfencstr ? CFStringGetCStringPtr(cfencstr, 0) : NULL;

// NSData bytes are not NUL-terminated, so the buffer is parsed with an explicit
// length instead of being treated as a C string.
htmlDocPtr _htmlDocument = htmlReadMemory([data bytes],
                                          (int)[data length],
                                          [[baseURL absoluteString] UTF8String],
                                          enc,
                                          XML_PARSE_NOERROR | XML_PARSE_NOWARNING);

// The walk starts at this node and ends as soon as it climbs back to it.
xmlNodePtr rootNode = (xmlNodePtr)_htmlDocument;
xmlNodePtr currentNode = rootNode;

while (currentNode) {
    // Output the node being visited.
    if (currentNode->type == XML_ELEMENT_NODE) {
        NSMutableArray *attrArray = [NSMutableArray array];

        for (xmlAttrPtr attrNode = currentNode->properties; attrNode; attrNode = attrNode->next) {
            // Valueless attributes (e.g. <input disabled>) have no child node.
            xmlNodePtr contents = attrNode->children;
            const xmlChar *value = (contents && contents->content) ? contents->content
                                                                   : (const xmlChar *)"";
            [attrArray addObject:[NSString stringWithFormat:@"%s='%s'", attrNode->name, value]];
        }

        NSString *attrString = [attrArray componentsJoinedByString:@" "];
        if ([attrString length]) {
            attrString = [@" " stringByAppendingString:attrString];
        }

        NSLog(@"<%s%@>", currentNode->name, attrString);
    } else if (currentNode->type == XML_TEXT_NODE && currentNode->content) {
        NSLog(@"%@", [NSString stringWithCString:(const char *)currentNode->content
                                        encoding:NSUTF8StringEncoding]);
    } else if (currentNode->type == XML_COMMENT_NODE) {
        NSLog(@"/* %s */", currentNode->name);
    }

    // Advance: first child, else next sibling, else climb until a sibling exists.
    if (currentNode->children) {
        currentNode = currentNode->children;
    } else if (currentNode->next) {
        currentNode = currentNode->next;
    } else {
        currentNode = currentNode->parent;

        // Close the parent element we just finished.
        if (currentNode && currentNode->type == XML_ELEMENT_NODE) {
            NSLog(@"</%s>", currentNode->name);
        }

        // Ran off the top of the tree, or returned to the starting node: done.
        if (!currentNode || currentNode == rootNode) {
            break;
        }

        if (currentNode->next) {
            currentNode = currentNode->next;
        } else {
            // Keep climbing, closing each ancestor, until one has a next sibling.
            while (currentNode) {
                currentNode = currentNode->parent;

                if (currentNode && currentNode->type == XML_ELEMENT_NODE) {
                    NSLog(@"</%s>", currentNode->name);
                    if (strcmp((const char *)currentNode->name, "table") == 0) {
                        NSLog(@"over");
                    }
                }

                if (currentNode == rootNode) {
                    break;
                }

                if (currentNode && currentNode->next) {
                    currentNode = currentNode->next;
                    break;
                }
            }
        }
    }

    if (currentNode == rootNode) {
        break;
    }
}

// Release the document only after the walk; every node above belongs to it.
if (_htmlDocument) {
    xmlFreeDoc(_htmlDocument);
}
不过我还是喜欢用TFHpple,因为它很简单,也好用,但是它的功能不是很完善。比如,不能获取children node,我就写了两个方法,一个是获取children node,一个是获取所有的contents。还有,node属性的content的key与node自身content的key一样,都是@"nodeContent",正确情况下属性的key应是@"attributeContent",
所以我写了这个方法,同时修改node属性的content key.
/// Recursively converts a libxml node into a dictionary describing it.
///
/// Keys written to the returned dictionary:
///   @"nodeName"           - the node's name
///   @"nodeAttributeArray" - one dictionary per attribute, holding @"attributeName"
///                           and, when the attribute has a value, @"attributeContent"
///   @"nodeChildArray"     - the dictionaries produced for the node's children
///
/// A text node whose parent is an element or an attribute produces no dictionary
/// of its own: its text is stored on parentResult instead (under @"nodeContent"
/// for an element parent, or trimmed under @"attributeContent" for an attribute
/// parent) and nil is returned.
///
/// @param currentNode  The node to convert.
/// @param parentResult The dictionary being built for the node's parent; receives
///                     the text of text nodes.
/// @return The dictionary for currentNode, or nil when the node was a text node
///         merged into parentResult.
NSDictionary *DictionaryForNode2(xmlNodePtr currentNode, NSMutableDictionary *parentResult) {
    NSMutableDictionary *resultForNode = [NSMutableDictionary dictionary];

    if (currentNode->name) {
        NSString *nodeName = [NSString stringWithCString:(const char *)currentNode->name
                                                encoding:NSUTF8StringEncoding];
        // The conversion yields nil for bytes that are not valid UTF-8; inserting
        // nil into a dictionary raises, so skip the key in that case.
        if (nodeName) {
            [resultForNode setObject:nodeName forKey:@"nodeName"];
        }
    }

    // Text nodes are folded into the parent's dictionary rather than reported
    // as separate children.
    if (currentNode->content && currentNode->type == XML_TEXT_NODE && currentNode->parent) {
        xmlElementType parentType = currentNode->parent->type;

        if (parentType == XML_ELEMENT_NODE || parentType == XML_ATTRIBUTE_NODE) {
            NSString *text = [NSString stringWithCString:(const char *)currentNode->content
                                                encoding:NSUTF8StringEncoding];
            if (text) {
                if (parentType == XML_ELEMENT_NODE) {
                    [parentResult setObject:text forKey:@"nodeContent"];
                } else {
                    NSCharacterSet *blanks = [NSCharacterSet whitespaceAndNewlineCharacterSet];
                    [parentResult setObject:[text stringByTrimmingCharactersInSet:blanks]
                                     forKey:@"attributeContent"];
                }
            }
            return nil;
        }
    }

    // Attributes.
    if (currentNode->properties) {
        NSMutableArray *attributeArray = [NSMutableArray array];

        for (xmlAttr *attribute = currentNode->properties; attribute; attribute = attribute->next) {
            NSMutableDictionary *attributeDictionary = [NSMutableDictionary dictionary];

            NSString *attributeName = nil;
            if (attribute->name) {
                attributeName = [NSString stringWithCString:(const char *)attribute->name
                                                   encoding:NSUTF8StringEncoding];
            }
            if (attributeName) {
                [attributeDictionary setObject:attributeName forKey:@"attributeName"];
            }

            if (attribute->children) {
                // A text child writes @"attributeContent" into attributeDictionary
                // itself and returns nil; any other child comes back as a dictionary.
                NSDictionary *childDictionary = DictionaryForNode2(attribute->children, attributeDictionary);
                if (childDictionary) {
                    [attributeDictionary setObject:childDictionary forKey:@"attributeContent"];
                }
            }

            if ([attributeDictionary count] > 0) {
                [attributeArray addObject:attributeDictionary];
            }
        }

        if ([attributeArray count] > 0) {
            [resultForNode setObject:attributeArray forKey:@"nodeAttributeArray"];
        }
    }

    // Child nodes.
    if (currentNode->children) {
        NSMutableArray *childContentArray = [NSMutableArray array];

        for (xmlNodePtr childNode = currentNode->children; childNode; childNode = childNode->next) {
            NSDictionary *childDictionary = DictionaryForNode2(childNode, resultForNode);
            if (childDictionary) {
                [childContentArray addObject:childDictionary];
            }
        }

        if ([childContentArray count] > 0) {
            [resultForNode setObject:childContentArray forKey:@"nodeChildArray"];
        }
    }

    return resultForNode;
}
// Dictionary key under which an attribute's value is stored.
NSString * const TFHppleNodeAttributeContentKey = @"attributeContent";
// Dictionary key under which a node's array of child dictionaries is stored.
NSString * const TFHppleNodeChildArrayKey = @"nodeChildArray";
/// Flattens the node's attribute array into a single name -> value dictionary.
///
/// Each entry of the attribute array is a dictionary holding the attribute's name
/// and, when it has one, its value. An attribute without a value (for example
/// <input disabled>) has no value entry; it is reported with an empty string so
/// that it stays visible to callers. Entries without a name are skipped.
///
/// @return A dictionary mapping attribute names to attribute values; empty when
///         the node has no attributes.
- (NSDictionary *) attributes {
    NSMutableDictionary *translatedAttributes = [NSMutableDictionary dictionary];

    for (NSDictionary *attributeDict in [node objectForKey:TFHppleNodeAttributeArrayKey]) {
        id attributeName = [attributeDict objectForKey:TFHppleNodeAttributeNameKey];
        if (!attributeName) {
            continue;
        }

        // setObject:forKey: raises on a nil object, so substitute an empty value.
        id attributeValue = [attributeDict objectForKey:TFHppleNodeAttributeContentKey];
        [translatedAttributes setObject:(attributeValue ? attributeValue : @"")
                                 forKey:attributeName];
    }

    return translatedAttributes;
}
/// Reports whether the node dictionary holds an entry under the child-array key.
- (BOOL) hasChildren {
    return [node objectForKey: TFHppleNodeChildArrayKey] != nil;
}

/// Returns the object stored under the child-array key, or nil when
/// -hasChildren reports NO.
- (NSArray *) children {
    if (![self hasChildren]) {
        return nil;
    }
    return [node objectForKey: TFHppleNodeChildArrayKey];
}
// Declaration only; the implementation is not shown here.
// NOTE(review): presumably returns the combined text contents of the nodes matched
// by the given XPath or CSS query — confirm against the implementation.
- (NSString *)contentsAt:(NSString *)xPathOrCss;
参看:http://giles-wang.blogspot.com/2011/08/iphoneansi.html