使用python的lxml解析html

ibm有技术文章:

使用由 Python 编写的 lxml 实现高性能 XML 解析

http://www.ibm.com/developerworks/cn/xml/x-hiperfparse/



首先是安装lxml 
doc.xpath()返回一个满足条件的list 


# coding=gb2312

from lxml import *
import lxml.html
import urllib2
import lxml.html as H

def getjarinfo(url):
    c=urllib2.urlopen(url)
    
    f=c.read()
    doc = H.document_fromstring(f)
    tables=doc.xpath("//table[@id='xiazai']")    
    pinpais=doc.xpath("//td[@id='pinpai']")
    jixings=doc.xpath("//div[@id='jixing']")
    jars = doc.xpath("//table[@id='xiazai']//tr[2]/td[1]/a[1]")
    for j in range(len(pinpais)):
      print jars[j].get('href')
      print pinpais[j].text_content()         
      print jixings[j].text_content()
    e=doc.xpath(u"//div[text()='%s']" % u"游戏介绍")
    describe=e[0].getnext().text_content()
    #r = doc.xpath("//table[@id='xiazai']//tr[2]/td[1]/a[1]")[0]
    #jarurl=r.get('href')
    
if __name__ == '__main__':
    url='http://game.3533.com/game/30862.htm'
    getjarinfo(url)


你可能感兴趣的:(html,xml,python,IBM,url,import)