Python爬虫:获取DOM树各个节点的xpath路径

   在使用Python编写网络爬虫、把网页解析成DOM树时,有时需要获取DOM树中各个节点的xpath路径。具体代码如下:

1. 生成DOM各节点的xpath路径

方法1:直接调用lxml提供的getpath()获取每个节点的xpath路径:

import lxml
from lxml import etree
import collections

doc = '''

 
  
  Example website
 
 
  
 

'''
# NOTE(review): this sample contains no markup tags, while the results listed
# later in the article show head/base/title/div/a/... nodes. The original HTML
# was presumably stripped when the page was extracted -- confirm against the
# source post before relying on the sample.

# Parse the markup and get hold of the tree object that owns the parsed root.
html = etree.HTML(doc)
tree = html.getroottree()

# '//*' matches every element in the document.
all_nodes = html.xpath('//*')

# Ask the tree for the path of each element; the list is index-aligned with
# all_nodes so both can be walked together below.
xpath = [tree.getpath(element) for element in all_nodes]

for element, location in zip(all_nodes, xpath):
    print("{0}:{1}".format(element.tag, location))

方法2:按层次(广度优先)遍历DOM树,手动拼接每个节点的xpath路径:

import lxml
from lxml import etree
import collections

doc = '''

 
  
  Example website
 
 
  
 

'''
# NOTE(review): this sample contains no markup tags, while the results listed
# later in the article show head/base/title/div/a/... nodes. The original HTML
# was presumably stripped when the page was extracted -- confirm against the
# source post before relying on the sample.


def _element_children(node):
    """Return the child elements of *node*, leaving out non-element nodes."""
    # Iterating an lxml element also yields comments and processing
    # instructions, whose .tag is a factory function rather than a string.
    # They cannot be addressed by a tag-name step (and '/' + tag would raise
    # TypeError), so they are skipped here.
    return [child for child in node if isinstance(child.tag, str)]


def collect_xpaths(root):
    """Walk the tree below *root* breadth-first and build a path per element.

    Args:
        root: The top element of the tree. It must expose a string ``tag``
            and yield its children when iterated.

    Returns:
        A ``(nodes, paths)`` tuple of two index-aligned lists: every element
        reached, in breadth-first order, and the path string built for it.
    """
    nodes = [root]
    paths = ['/' + root.tag]
    # ``nodes`` doubles as the traversal queue: children are appended at the
    # end while the cursor moves forward, which yields level-by-level order.
    cursor = 0
    while cursor < len(nodes):
        parent_path = paths[cursor]
        children = _element_children(nodes[cursor])
        # A positional index is only written for tags that occur more than
        # once among the siblings; unique tags get a bare name step.
        tag_totals = collections.Counter(child.tag for child in children)
        seen = collections.Counter()
        for child in children:
            step = parent_path + '/' + child.tag
            if tag_totals[child.tag] > 1:
                seen[child.tag] += 1  # positions are 1-based
                step += '[' + str(seen[child.tag]) + ']'
            nodes.append(child)
            paths.append(step)
        cursor += 1
    return nodes, paths


html = etree.HTML(doc)
# Start from the <html> element and collect every element with its path.
all_nodes, xpath = collect_xpaths(html.xpath('/html')[0])

for node, path in zip(all_nodes, xpath):
    print("{} {}".format(node.tag, path))

方法2的输出结果如下(注:上文doc变量中的示例HTML标签疑似在转载时丢失,以下结果对应带有完整标签的原始HTML):

html  /html
head  /html/head
body  /html/body
base  /html/head/base
title  /html/head/title
div  /html/body/div
a  /html/body/div/a[1]
h5  /html/body/div/h5
a  /html/body/div/a[2]
a  /html/body/div/a[3]
a  /html/body/div/a[4]
a  /html/body/div/a[5]
a  /html/body/div/a[6]
br  /html/body/div/a[1]/br
img  /html/body/div/a[1]/img
br  /html/body/div/a[2]/br
img  /html/body/div/a[2]/img
br  /html/body/div/a[3]/br
img  /html/body/div/a[3]/img
br  /html/body/div/a[4]/br
img  /html/body/div/a[4]/img
br  /html/body/div/a[5]/br
img  /html/body/div/a[5]/img
span  /html/body/div/a[6]/span
br  /html/body/div/a[6]/br
img  /html/body/div/a[6]/img
h5  /html/body/div/a[6]/span/h5
参考资料
  1. https://blog.csdn.net/together_cz/article/details/74015599

你可能感兴趣的:(爬虫,python,爬虫,开发语言)