XPath(XML路径语言)

XPath(XML路径语言)

XPath 是一门在XML文档中查找信息的语言,可用来在XML文档中对元素和属性进行遍历。

W3School官方文档:http://www.w3school.com.cn/xpath/index.asp

XPath开发工具

开源的XPath表达式编辑工具:XMLQuire(XML格式文件可用)

Chrome插件XPath Helper

Firefox插件XPath Checker

XPath 语法

 

 

| 表达式 | 描述 | 用法 | 说明 |
| --- | --- | --- | --- |
| nodename | 选取此节点的所有子节点。 | xpath('span') | 选取span元素的所有子节点 |
| / | 从根节点选取。 | xpath('/div') | 从根节点上选取div节点 |
| // | 从匹配选择的当前节点选择文档中的节点,而不考虑它们的位置。 | xpath('//div') | 从当前节点选取含有div节点的标签 |
| . | 选取当前节点。 | xpath('./div') | 选取当前节点下的div标签 |
| .. | 选取当前节点的父节点。 | xpath('..') | 回到上一级节点 |
| @ | 选取属性。 | xpath("//div[@id='1001']") | 获取div标签中,含有ID属性且值为1001的标签 |

XPath的常见用法大全

    from lxml import etree

    # Sample document queried by the XPath examples that follow.
    # NOTE(review): the element tags were missing from this literal, so every
    # query returned an empty list. The markup below is reconstructed from the
    # element and attribute names used by those queries (bookstore, book,
    # title/@lang, price, @category, @cover) -- confirm against the original
    # article.
    html = '''
    <bookstore>
        <book category="cooking">
            <title lang="en">Everyday Italian</title>
            <author>Giada De Laurentiis</author>
            <year>2005</year>
            <price>30.00</price>
        </book>

        <book category="children">
            <title lang="en">Harry Potter</title>
            <author>J K. Rowling</author>
            <year>2005</year>
            <price>29.99</price>
        </book>

        <book category="web">
            <title lang="en">XQuery Kick Start</title>
            <author>James McGovern</author>
            <author>Per Bothner</author>
            <author>Kurt Cagle</author>
            <author>James Linn</author>
            <author>Vaidyanathan Nagarajan</author>
            <year>2003</year>
            <price>49.99</price>
        </book>

        <book category="web" cover="paperback">
            <title lang="en">Learning XML</title>
            <author>Erik T. Ray</author>
            <year>2003</year>
            <price>39.95</price>
        </book>
    </bookstore>
    '''
    html = etree.HTML(html)  # parse the document from a string

    # html = etree.parse('temp.html')  # alternative: load the document from a file

    # etree.HTML(): builds an object that XPath can be run against and
    #               automatically repairs the HTML text.
    # etree.tostring(): serialises the repaired tree; the result is bytes.

    # Build an XPath rule to extract data. Exactly one `res = ...` line must be
    # active, otherwise the print() below fails with NameError; swap in any of
    # the commented alternatives to try a different expression.
    res = html.xpath('//bookstore/book/title/text()')
    # res = html.xpath('//book/@cover | //book/@category')
    # res = html.xpath('//bookstore/book[1]/price/text()')
    # res = html.xpath('//bookstore/book[position()<2]')  # first book; position() is the index and starts at 1
    # res = html.xpath('//title[@lang]')
    # res = html.xpath('//title[@lang="en"]/text()')
    # res = html.xpath('//bookstore/book[price>35.00]/title/text()')
    # res = html.xpath('//bookstore/*')
    # res = html.xpath('//bookstore//*')
    # res = html.xpath('//title[@*]')
    # res = html.xpath('//book/title | //book/price')
    # res = html.xpath('//*[@category="web"]')

    print(res)

58房源案例1

    from lxml import etree
    import requests

    # Listing page to scrape.
    base_url = 'http://bj.58.com/chuzu/?utm_source=market&spm=b-31580022738699-me-f-862.mingzhan&PGTID=0d100000-0000-17cd-3f99-94d590fc655b&ClickID=1'
    response = requests.get(base_url)

    # Parse the response body into an element tree that XPath can query.
    html = etree.HTML(response.text)

    # Each listing is an <li> directly under the <ul class="listUl"> element.
    li_list = html.xpath('//ul[@class="listUl"]/li')
    for li in li_list:
        # Entries without a title link are skipped.
        title = li.xpath('.//h2/a/text()')
        if not title:
            continue
        title = title[0].strip()

        # Guard the second field the same way as the title: an entry with no
        # <p> text used to abort the whole loop with IndexError.
        square = li.xpath('.//p[1]/text()')
        square = square[0].replace(' ','').replace('\xa0','') if square else ''
        print(title,square)

58房源案例2

    from lxml import etree
    import requests
    import json

    # 详情页请求
    def get_detail(url, f):
        """Scrape one rental detail page and append it to ``f`` as a JSON line.

        Args:
            url: Address of the detail page to request.
            f: Writable text file object. On success one JSON object followed
                by a newline is written to it.

        Returns:
            None. Responses whose status is not 2xx are skipped without output.

        Raises:
            SystemExit: If an expected field is missing from the page. The
                error and the offending URL are printed first, so the XPath
                rules can be checked against that page.
        """
        response = requests.get(url)
        # Only 2xx is success; 300 is already a redirection status, so the
        # upper bound is exclusive.
        if not 200 <= response.status_code < 300:
            return

        # Parse only after the status check so error bodies are never processed.
        html = etree.HTML(response.text)
        try:
            title = html.xpath('//h1/text()')[0]

            price = html.xpath('//span[@class="c_ff552e"]/b/text()')[0]

            # The deposit line is optional; fall back to a placeholder.
            margin = html.xpath('//span[@class="c_333"]/text()')
            margin = margin[0] if margin else '无'

            rent_type = html.xpath('//ul[@class="f14"]/li[1]/span[2]/text()')[0]
            house_type = html.xpath('//ul[@class="f14"]/li[2]/span[2]/text()')[0]

            direction = html.xpath('//ul[@class="f14"]/li[3]/span[2]/text()')[0]
            host = html.xpath('//ul[@class="f14"]/li[4]/span[2]/a/text()')[0]

            # The area can span several links, e.g. ['昌平', '立水桥'];
            # concatenate them into one string.
            area = ''.join(html.xpath('//ul[@class="f14"]/li[5]/span[2]/a/text()'))

            addr = html.xpath('//ul[@class="f14"]/li[6]/span[2]/text()')[0]
        except IndexError as e:
            # A missing field means the page layout did not match the rules.
            print(e)
            print(url)
            # Same effect as exit(), without relying on the helper that the
            # `site` module injects.
            raise SystemExit

        data = {
            'title': title,
            'price': price,
            'margin': margin,
            'rent_type': rent_type,
            'house_type': house_type,
            'direction': direction,
            'host': host,
            'area': area,
            'addr': addr,
        }
        # Remove every plain and non-breaking space from each field.
        data = {
            key: value.replace(' ', '').replace('\xa0', '').strip()
            for key, value in data.items()
        }

        # Save the record to the file, one JSON object per line.
        print(data['title'])
        f.write(json.dumps(data, ensure_ascii=False) + '\n')


    def getPage():
        """Fetch the first listing page and scrape every listing's detail page.

        Takes no arguments and returns nothing. Each detail link found on the
        page is passed to get_detail() together with the module-level file
        object ``f``, which must already be open when this function is called.
        """
        listing_url = 'http://bj.58.com/chuzu/pn1/'
        page = etree.HTML(requests.get(listing_url).text)

        # Each listing is an <li> directly under the <ul class="listUl"> element.
        for entry in page.xpath('//ul[@class="listUl"]/li'):
            # Take the detail link from the listing; entries without one are skipped.
            links = entry.xpath('.//h2/a/@href')
            if not links:
                continue
            # Request and store the detail page.
            get_detail(links[0], f)

    if __name__ == '__main__':
        # `f` is deliberately bound at module level: getPage() reads it as a
        # global. The with-block guarantees the file is flushed and closed even
        # when the crawl raises or exits part-way through.
        with open('house.json','w', encoding='utf-8') as f:
            getPage()

补充

    from lxml import etree

    html = '''
        
>  按学科:
''' # 查看解析以后的html 注意a标签的不规范 # result = etree.tostring(html) # print(result.decode('utf-8')) # 自定义Parser html = etree.HTML(html,parser=etree.HTMLParser()) print(html.xpath('//div[@class="php_zuopin_fenlei"]//a/text()'))

你可能感兴趣的:(python)