绝对路径:HTML / body / div / a
相对路径: ./a
专业术语
树:整个HTML或XML结构
节点:HTML中的每个标签,XML中标签就是节点
根节点:树的第一个节点,HTML的根节点就是HTML标签
属性:节点属性(HTML中就是标签属性)
from lxml import etree
XML数据格式
json数据与XML数据是两种通用的数据格式,用于不同语言之间进行数据交流
将一个超市的商品数据进行传输:
json:
{
"name":"永辉超市",
"address":"肖家河",
"goods":[
{"name":"泡面","price":3.5,"count":50},
{"name":"火腿肠","price":3,"count":200},
{"name":"矿泉水","price":2,"count":30}
]
}
XML:
<supermarket>
<name>永辉超市</name>
<address>肖家河</address>
<goodsList>
<goods name = "泡面" price = "3.5" count = "50"></goods>
<goods name = "火腿肠" price = "3" count = "200"></goods>
<goods name = "矿泉水" price = "2" count = "30"></goods>
</goodsList>
<workerList>
<cashier name = "张三" pay = "4000"></cashier>
<shoppingGuide name = "李四" pay = "3000"></shoppingGuide>
</workerList>
</supermarket>
# Sample XML document describing a supermarket; it is parsed below so the
# XPath examples have a tree to query.  The markup had been stripped from this
# literal (only the two text nodes were left), which etree.XML() cannot parse,
# so the full document is restored here.
xml_data = """
<supermarket>
    <name>永辉超市</name>
    <address>肖家河</address>
    <goodsList>
        <goods name = "泡面" price = "3.5" count = "50"></goods>
        <goods name = "火腿肠" price = "3" count = "200"></goods>
        <goods name = "矿泉水" price = "2" count = "30"></goods>
    </goodsList>
    <workerList>
        <cashier name = "张三" pay = "4000"></cashier>
        <shoppingGuide name = "李四" pay = "3000"></shoppingGuide>
    </workerList>
</supermarket>
"""
# etree.XML returns the root element (<supermarket>) of the parsed document.
supermarket = etree.XML(xml_data)
获取标签(获取节点)
节点对象.xpath(路径)
a.写绝对路径:不管xpath前面的节点对象是什么,路径从根节点开始写
写法:/绝对路径
# Absolute path: it starts at the root element no matter which node xpath()
# is called on.
cashier = supermarket.xpath('/supermarket/workerList/cashier')
b.相对路径:用.表示当前节点,xpath前面是谁,当前节点就是谁
.. 表示当前节点的上层节点
注意:./ 可省略
# Relative path: '.' is the node xpath() is called on (here `supermarket`).
cashier = supermarket.xpath('./workerList/cashier')
print(cashier)  # original note shows "[]"; the matched Element repr looks stripped - TODO confirm
# '..' steps up to the parent of the current node before descending.
cashier = supermarket.xpath('../workerList/cashier')
print(cashier)
c.//路径 — 从任意位置开始全局搜索
查找方式和功能和xpath前的节点无关
# '//' searches the whole document, independent of the node xpath() is called on.
result = supermarket.xpath('//cashier')
print(result)  # original note shows "[]"; the matched Element repr looks stripped - TODO confirm
goods = supermarket.xpath('//goodsList/goods')
print(goods)  # original note shows "[, , ]": three items whose Element reprs look stripped
获取节点内容
语法:获取节点的路径/text()
# Appending /text() to a path yields the node's text content instead of the node.
name = supermarket.xpath('./name/text()')
print(name)  # ['永辉超市']
获取节点属性值
语法:获取节点的路径/@属性名
# Appending /@attr to a path yields that attribute's value for each matched node.
goods = supermarket.xpath('//goodsList/goods/@name')
print(goods)  # ['泡面', '火腿肠', '矿泉水']
# Parse the sample page.  The context manager closes the file handle, which
# the original one-liner (open(...).read()) left open.
with open('test.html', 'r', encoding='utf-8') as html_file:
    html = etree.HTML(html_file.read())

# Absolute path from the document root.
h1 = html.xpath('/html/body/h1')
print(h1)  # original note shows "[]"; the matched Element repr looks stripped - TODO confirm
# '//' finds the same tag by searching the whole document.
h1 = html.xpath('//h1')
print(h1)  # original note shows "[]"; the matched Element repr looks stripped - TODO confirm
加谓语(加条件)
语法:选中标签的路径[谓语]
1)、[N] — 获取同层的第N个标签
# [N] keeps the N-th matching sibling: here the text of every <p> that is the
# first <p> child of its parent.
p = html.xpath('//p[1]/text()')
print(p)  # ['肖家河大厦', '泡面', '矿泉水', '面包', '充电宝']
# Text of the <p> elements that are direct children of <body>.
p = html.xpath('./body/p/text()')
print(p)  # ['肖家河大厦']
# Text of every <p> inside the second <li>.
result = html.xpath('body/ul/li[2]/p/text()')
print(result)  # ['矿泉水', '2', '120']
2)、[last()] — 获取同层的最后一个标签
[last() - N] — 获取同层的倒数第(N+1)个标签
# Text of the last <p> in each <li>.
counts = html.xpath('body/ul/li/p[last()]/text()')
print(counts)  # ['15', '120', '42', '10']
# Last <p> of the second-to-last <li>.
bread = html.xpath('body/ul/li[last() - 1]/p[last()]/text()')
print(bread)  # ['42']
3)、[position() > N]
[position() >= N]
# position() < 3 keeps the first two <li> elements.
goods = html.xpath('body/ul/li[position() < 3]/p/text()')
print(goods)  # ['泡面', '3.5', '15', '矿泉水', '2', '120']
4)、[@属性名] — 获取有指定属性的标签
# [@class] keeps only the <p> elements that carry a class attribute.
result = html.xpath('body/div/p[@class]/text()')
print(result)  # ['p1', 'p2', 'p4']
[@属性名 = 属性值] — 获取指定属性为指定值的标签
# [@class = "c1"] keeps only the <p> elements whose class attribute equals "c1".
result = html.xpath('body/div/p[@class = "c1"]/text()')
print(result)
5)、[标签 >/</>=/<=/= 数据] — 将标签按照指定子标签的内容进行筛选
# Keep the <li> elements whose second <p> child has a value greater than 2,
# then take the text of all their <p> children.
prices = html.xpath('body/ul/li[p[2]>2]/p/text()')
# Same query written with the explicit './' prefix; this result replaces the one above.
prices = html.xpath('./body/ul/li[p[2]>2]/p/text()')
print(prices)
6、通配符 *
获取最后一个div下的所有标签的内容
# '*' matches any tag: text of every child element of the last <div>.
result = html.xpath('body/div[last()]/*/text()')
print(result)  # ['p1', 'p2', 'a1', 'span1']
# Any child of the last <div> that has a class attribute.
result = html.xpath('body/div[last()]/*[@class]/text()')
print(result)  # ['p1', 'span1']
# '@*' matches any attribute: children of the last <div> with at least one attribute.
result = html.xpath('body/div[last()]/*[@*]/text()')
print(result)  # ['p1', 'a1', 'span1']
# Every attribute value of every <img> in the document.
result = html.xpath('//img/@*')
print(result)  # ['https://image1.guazistatic.com/qn2107010956026670c8553db23db93154432c791292ae.jpg?imageView2/1/w/270/h/180/q/88', '']
7、分支(获取若干个路径)
# '|' combines the results of two paths: the first and second <p> of every <li>.
result = html.xpath('body/ul/li/p[1]/text()|body/ul/li/p[2]/text()')
print(result)  # ['泡面', '3.5', '矿泉水', '2', '面包', '5', '充电宝', '150']
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <h1>永辉超市</h1>
    <p>肖家河大厦</p>
    <ul>
        <li>
            <p class="name">泡面</p>
            <p class="price">3.5</p>
            <p class="count">15</p>
        </li>
        <li>
            <p class="name">矿泉水</p>
            <p class="price">2</p>
            <p class="count">120</p>
        </li>
        <li>
            <p class="name">面包</p>
            <p class="price">5</p>
            <p class="count">42</p>
        </li>
        <li>
            <p class="name">充电宝</p>
            <p class="price">150</p>
            <p class="count">10</p>
        </li>
    </ul>
    <div>
        <p class="">p1</p>
        <p class="c1">p2</p>
        <p id="p1">p3</p>
        <p class="c2">p4</p>
    </div>
    <div id="div1">
        <p class="">p1</p>
        <p>p2</p>
        <a href="">a1</a>
        <span class="">span1</span>
        <img src="https://image1.guazistatic.com/qn2107010956026670c8553db23db93154432c791292ae.jpg?imageView2/1/w/270/h/180/q/88" alt="">
    </div>
</body>
</html>
导包
from selenium.webdriver import Chrome
from lxml import etree
import csv
import time
获取更多数据(翻页)
def get_more():
    """Click the page's "more" control to load further results.

    Operates on the module-level ``browser`` WebDriver and clicks the first
    element matching the CSS selector ``.more``.
    """
    # find_element_by_css_selector was removed in Selenium 4.3; the generic
    # find_element(by, value) form works on both older and current releases.
    from selenium.webdriver.common.by import By

    browser.find_element(By.CSS_SELECTOR, '.more').click()
获取网页数据
def get_message():
    """Collect title, score, poster and detail link for each node in ``movies``.

    Reads the module-level ``movies`` list and evaluates four relative XPath
    expressions against every node in it.

    Returns:
        A ``zip`` iterator of 4-tuples ``(name, score, poster, detail)``.
        Each field is the raw list returned by ``xpath`` for that node, not
        an unwrapped string.
    """
    field_paths = (
        'div/img/@alt',                       # title, taken from the poster's alt text
        'p/strong/text()',                    # score
        'div[@class = "cover-wp"]/img/@src',  # poster image URL
        '@href',                              # detail page link on the node itself
    )
    # One column per field, each holding one xpath result per movie node.
    columns = [[node.xpath(path) for node in movies] for path in field_paths]
    return zip(*columns)
保存数据
def writer(m_message):
    """Write movie rows to ``files/douban/movies.csv``, replacing any old file.

    Args:
        m_message: Iterable of rows; each row is written as one CSV record
            after the fixed four-column header.

    Raises:
        OSError: If the target file cannot be opened (for example when the
            ``files/douban`` directory does not exist).
    """
    header = ['电影名称', '电影评分', '电影海报', '电影详情']
    # The with-block closes the file even when a row fails to serialise.
    # newline='' is what the csv module expects, so it controls line endings
    # itself and no blank line appears between records on Windows.
    with open('files/douban/movies.csv', 'w', encoding='utf-8', newline='') as file:
        csv_out = csv.writer(file)
        csv_out.writerow(header)
        csv_out.writerows(m_message)
调用
# Driver script: open Chrome, step through the listing 20 entries at a time,
# and hand each parsed page to get_message() / writer().
browser = Chrome()

# Relative path (from the <html> root) to the <a> element of each movie card.
movie_link_path = (
    'body/div[@id = "wrapper"]/div[@id = "content"]/div[@*]'
    '/div[@class = "article"]/div[@class = "gaia"]'
    '/div[@class = "list-wp"]/div/a'
)

for index in range(0, 121, 20):
    page_url = f'https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start={index}'
    browser.get(page_url)
    tree = etree.HTML(browser.page_source)
    # get_message() reads this module-level name, so it must stay `movies`.
    movies = tree.xpath(movie_link_path)
    # NOTE(review): writer() opens movies.csv in 'w' mode on every call, so
    # the file only ever holds the rows from the most recent call - confirm
    # that is intended.
    writer(get_message())
    get_more()
    print(index)
    # Pause before the next page request.
    time.sleep(2)