在使用 Python 编写网络爬虫、将网页解析成 DOM 树时,有时需要获取 DOM 树中各个节点的 XPath 路径。具体代码如下:
方法1:
import lxml
from lxml import etree
import collections
# Method 1: let lxml compute the XPath of every element via getpath().
doc='''
Example website
'''
# Parse the markup; `html` is the root element of the resulting tree.
html = etree.HTML(doc)
# getpath() is a method of the ElementTree wrapper, not of the element,
# so fetch the tree that owns the root element.
tree = html.getroottree()
# '//*' selects element nodes only (no comments or text nodes).
all_nodes = html.xpath('//*')
# xpath[i] is the path of all_nodes[i].
xpath = [tree.getpath(node) for node in all_nodes]
for node, path in zip(all_nodes, xpath):
    print("{}:{}".format(node.tag, path))
方法2:
import lxml
from lxml import etree
import collections
# Method 2: build the XPath of every element by hand with a breadth-first
# walk that starts at the <html> root.
doc='''
Example website
'''
html = etree.HTML(doc)
# Breadth-first queue; it ends up holding every element of the DOM tree.
# Invariant: xpath[i] is the path of all_nodes[i].
all_nodes = html.xpath('/html')
xpath = ['/' + str(all_nodes[0].tag)]

idx = 0  # position of the next queued node whose children must be expanded
while idx < len(all_nodes):
    # Child elements of all_nodes[idx].  Comments and processing
    # instructions are children too, but their `tag` is not a string, so
    # concatenating it into a path would raise TypeError; skip them.
    c_nodes = [child for child in all_nodes[idx] if isinstance(child.tag, str)]
    # How many siblings share each tag: a positional predicate ("[n]") is
    # only emitted when a tag occurs more than once under the same parent.
    tag_totals = collections.Counter(child.tag for child in c_nodes)
    # 1-based running position of each tag among the siblings seen so far.
    tag_position = collections.Counter()
    parent_path = xpath[idx]
    for child in c_nodes:
        tag_position[child.tag] += 1
        if tag_totals[child.tag] > 1:
            xpath.append(
                parent_path + '/' + child.tag
                + '[' + str(tag_position[child.tag]) + ']'
            )
        else:
            xpath.append(parent_path + '/' + child.tag)
    # Queue the children so that their own children are expanded later.
    all_nodes.extend(c_nodes)
    idx += 1

for node, path in zip(all_nodes, xpath):
    print("{} {}".format(node.tag, path))
其计算结果如下:
html /html
head /html/head
body /html/body
base /html/head/base
title /html/head/title
div /html/body/div
a /html/body/div/a[1]
h5 /html/body/div/h5
a /html/body/div/a[2]
a /html/body/div/a[3]
a /html/body/div/a[4]
a /html/body/div/a[5]
a /html/body/div/a[6]
br /html/body/div/a[1]/br
img /html/body/div/a[1]/img
br /html/body/div/a[2]/br
img /html/body/div/a[2]/img
br /html/body/div/a[3]/br
img /html/body/div/a[3]/img
br /html/body/div/a[4]/br
img /html/body/div/a[4]/img
br /html/body/div/a[5]/br
img /html/body/div/a[5]/img
span /html/body/div/a[6]/span
br /html/body/div/a[6]/br
img /html/body/div/a[6]/img
h5 /html/body/div/a[6]/span/h5