What is XPath? Both the Runoob tutorial and Baidu Baike describe it as a language for locating nodes in XML (and HTML) documents.
pip install lxml
from lxml import etree

# Two ways to build an etree object:
tree = etree.parse(filepath)    # filepath is the path of a local HTML document
tree = etree.HTML(page_text)    # page_text is the response text fetched from a request
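As a runnable sketch of both routes (the file name page.html and the URL below are placeholders; passing an explicit etree.HTMLParser() helps parse() cope with HTML that is not clean XML):

from lxml import etree
import requests

# Local file: etree.parse() defaults to strict XML parsing, so an HTML
# parser is safer for real-world pages ('page.html' is a placeholder).
tree = etree.parse('page.html', etree.HTMLParser())

# Response text: build the tree directly from the fetched string.
page_text = requests.get('http://example.com').text
tree = etree.HTML(page_text)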
Locating starts from the root node, e.g. /html/body/div.
/   denotes a single level, e.g. /html/body/div
//  denotes multiple levels, e.g. /html//div
//  can also mean locating from any position, e.g. //div
./  starts from the current node, e.g. div.xpath('./ul') selects the ul under the div obtained earlier
The three expressions /html/body/div, /html//div and //div select the same div elements here.
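A quick sketch of these path expressions on a made-up HTML snippet (the markup below is invented purely for illustration):

from lxml import etree

html = '<html><body><div><ul><li>a</li><li>b</li></ul></div></body></html>'
tree = etree.HTML(html)

# All three select the same <div> in this snippet:
print(tree.xpath('/html/body/div'))
print(tree.xpath('/html//div'))
print(tree.xpath('//div'))

# './' continues from an element we already hold:
div = tree.xpath('//div')[0]
print(div.xpath('./ul'))   # the <ul> directly under that div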
tree.xpath('//div[@class="some_class"]')   # locate by the class attribute value
tree.xpath('//div[@id="some_id"]')         # locate by the id attribute value
Note: XPath indexes start at 1, not 0.
tree.xpath('//div[@class="some_class"]/p[3]')   # the 3rd <p> tag under the div with that class
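For instance, a minimal sketch with an invented fragment (the class and id values are placeholders, not from any real page):

from lxml import etree

html = '<div class="content" id="main"><p>one</p><p>two</p><p>three</p></div>'
tree = etree.HTML(html)

print(tree.xpath('//div[@class="content"]'))       # locate by class
print(tree.xpath('//div[@id="main"]'))              # locate by id
print(tree.xpath('//div[@class="content"]/p[3]'))   # the 3rd <p>; XPath counts from 1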
Ⅰ. Getting text
Direct (child) text: /text()
All nested text: //text()
Ⅱ. Getting attributes
/@attribute_name
For example, to get the src attribute of an img:
img/@src
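A small sketch combining text and attribute extraction, again on an invented fragment:

from lxml import etree

html = '<div class="pic"><a href="/detail">link text<img src="/img/1.jpg"></a>photo</div>'
tree = etree.HTML(html)

print(tree.xpath('//div[@class="pic"]/text()'))    # direct text only: ['photo']
print(tree.xpath('//div[@class="pic"]//text()'))   # all nested text: ['link text', 'photo']
print(tree.xpath('//img/@src'))                    # attribute value: ['/img/1.jpg']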
# Example 1: scrape rental listing titles from 58.com
from lxml import etree
import requests

url = 'https://wx.58.com/binhu/chuzu/?utm_source=sem-sales-baidu-pc&spm=115207576396.22031703733&utm_campaign=sell&utm_medium=cpc&showpjs=pc_fg&PGTID=0d3090a7-0005-d44c-7fc9-8ceee1d40818&ClickID=2'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36 Edg/86.0.622.56'
}
# headers must be passed as a keyword argument
text_page = requests.get(url, headers=headers).text
tree = etree.HTML(text_page)
# Every listing is an <li class="house-cell"> inside <ul class="house-list">
li_list = tree.xpath('//ul[@class="house-list"]/li[@class="house-cell"]')
fp = open('58.txt', 'w', encoding='utf-8')
for i in range(len(li_list)):   # list indexes start at 0, unlike XPath
    title = li_list[i].xpath('./div[@class="des"]/h2/a/text()')[0]
    print(title)
    fp.write(title + '\n')
fp.close()
######## The same can be done by iterating over the elements directly #############
# for detail in li_list:
#     print(detail.xpath('./div[@class="des"]/h2/a/text()')[0])
#     fp.write(detail.xpath('./div[@class="des"]/h2/a/text()')[0] + '\n')
Example 2: download wallpaper images from http://pic.netbian.com/4kmeinv/
import requests
from lxml import etree
import os

print("Please enter the listing page URL")
# e.g. http://pic.netbian.com/
url = input("")
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36 Edg/86.0.622.56'
}
response = requests.get(url, headers=header)
# response.encoding = 'utf-8'  # mojibake fix, method 1: set the response encoding explicitly
page_text = response.text
tree = etree.HTML(page_text)
# Get the <li> elements that hold the thumbnails
li_list = tree.xpath('//div[@class="slist"]/ul[@class="clearfix"]/li')
# Create the output folder
if not os.path.exists('./meinvpic'):
    os.mkdir('./meinvpic')
for nav in li_list:
    # Detail page of each image
    img_url = 'http://pic.netbian.com' + nav.xpath('./a/@href')[0]
    # print(img_url)
    detail_text = requests.get(img_url, headers=header).text
    detail_tree = etree.HTML(detail_text)
    # Extract the image address and name
    img_src = 'http://pic.netbian.com' + detail_tree.xpath('//div[2]/div[1]/div[@class="photo"]/div[1]/div[2]/a/img/@src')[0]
    name = detail_tree.xpath('//div[2]/div[1]/div[@class="photo"]/div[1]/div[1]/h1/text()')[0] + '.jpg'
    # Mojibake fix, method 2: re-encode the garbled string and decode it as GBK
    name = name.encode('iso-8859-1').decode('gbk')
    img_data = requests.get(img_src, headers=header).content
    # Save the image
    with open('./meinvpic/' + name, 'wb') as s:
        s.write(img_data)
    print(name)
    print(img_src)
# Example 3: collect the city names listed on aqistudy.cn
import requests
import os
from lxml import etree

if __name__ == '__main__':
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36 Edg/86.0.622.56'
    }
    url = 'https://www.aqistudy.cn/historydata/'
    # Create the output folder
    if not os.path.exists('./city'):
        os.mkdir('./city')
    # Text file for the city names
    f = open('./city/city.txt', 'w', encoding='utf-8')
    response_text = requests.get(url, headers=header).text
    tree = etree.HTML(response_text)
    # <li> elements of the ordinary (non-hot) cities
    normal_li_list = tree.xpath('//div[@class="container"]//div[@class="all"]/div[2]/ul/div[2]/li')
    # Walk through each <li> and write its city name
    for normal_li in normal_li_list:
        detail = normal_li.xpath('./a/text()')
        f.write(detail[0] + '\n')
    f.close()
# Example 4: download the free resume templates from sc.chinaz.com, page by page
import requests
import os
from lxml import etree

url = 'http://sc.chinaz.com/jianli/free.html'
if not os.path.exists('./moban'):
    os.mkdir('./moban')
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36 Edg/86.0.622.56'
}
if __name__ == '__main__':
    while True:
        print(url)
        response = requests.get(url, headers=header)
        response.encoding = 'utf-8'   # avoid garbled template names
        page_text = response.text
        detail_tree = etree.HTML(page_text)
        # One <div> per template on the listing page
        mo_list = detail_tree.xpath('//div[@class="bggray clearfix pt20"]/div[3]/div/div/div')
        for src in mo_list:
            mo_url = src.xpath('./a/@href')[0]    # link to the template's detail page
            name = src.xpath('./a/img/@alt')[0]   # template name
            # name = name.encode('iso-8859-1').decode('gbk')
            detail_text = requests.get('http:' + mo_url, headers=header).text
            tree = etree.HTML(detail_text)
            # First download link on the detail page
            source = tree.xpath('//div[@class="bggray clearfix"]/div[2]/div[2]/div[1]/div[@class="down_wrap"]/div[2]//li/a/@href')[0]
            resource = requests.get(source, headers=header).content
            with open('./moban/' + name + '.rar', 'wb') as s:
                s.write(resource)
            print(name)
            print(source)
        # Follow the "next page" link until there is none
        next_list = detail_tree.xpath('//div[@class="bggray clearfix pt20"]/div[4]/div/a[@class="nextpage"]/@href')
        if not next_list:
            break
        url = 'http://sc.chinaz.com/jianli/' + next_list[0]