python学习目录传送门
XPath即为XML路径语言,它是一种用来确定XML文档中某部分位置的语言,同样适用于HTML文档的检索
import random
import time

import requests
from fake_useragent import UserAgent
from lxml import etree
class LianjiaSpider(object):
    """Scrape second-hand housing listings from Lianjia (Beijing board)."""

    # Field names for the pipe-separated houseInfo string, in source order.
    DETAIL_KEYS = ('model', 'area', 'direct', 'perfect', 'floor', 'year', 'type')

    def __init__(self):
        # {} is filled with the page number by run()
        self.url = 'https://bj.lianjia.com/ershoufang/pg{}/'

    def parse_html(self, url):
        """Fetch one listing page and hand the decoded HTML to get_data().

        :param url: fully formatted page URL
        """
        # Random User-Agent per request to reduce the chance of being blocked
        headers = {'User-Agent': UserAgent().random}
        html = requests.get(url=url, headers=headers).content.decode('utf8', 'ignore')
        # BUG FIX: the fetched HTML was previously discarded without parsing
        self.get_data(html)

    def get_data(self, html):
        """Extract one dict per listing, print it, and return all dicts.

        :param html: decoded HTML of a listing page
        :return: list of item dicts (empty if no listings matched)
        """
        p = etree.HTML(html)
        # Base xpath: one <li> node per listing
        li_list = p.xpath('//ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]')
        items = []
        for li in li_list:
            # BUG FIX: build a fresh dict per listing — a single shared dict
            # created outside the loop would alias every row.
            item = {}
            # name + district
            name_list = li.xpath('.//div[@class="positionInfo"]/a[1]/text()')
            item['name'] = name_list[0].strip() if name_list else None
            address_list = li.xpath('.//div[@class="positionInfo"]/a[2]/text()')
            item['address'] = address_list[0].strip() if address_list else None
            # layout | area | facing | decoration | floor | year | type
            h_list = li.xpath('.//div[@class="houseInfo"]/text()')
            info_list = h_list[0].split('|') if h_list else []
            if len(info_list) == 7:
                for key, value in zip(self.DETAIL_KEYS, info_list):
                    item[key] = value.strip()
            else:
                # Unexpected format (or missing node): null out all detail fields
                for key in self.DETAIL_KEYS:
                    item[key] = None
            # total price + unit price
            total_list = li.xpath('.//div[@class="totalPrice"]/span/text()')
            item['total'] = total_list[0].strip() if total_list else None
            unit_list = li.xpath('.//div[@class="unitPrice"]/span/text()')
            item['unit'] = unit_list[0].strip() if unit_list else None
            print(item)
            items.append(item)
        return items

    def run(self):
        """Crawl page 1 (widen the range to crawl more pages)."""
        for page in range(1, 2):
            url = self.url.format(page)
            self.parse_html(url)
            # Throttle requests; requires `import time` at file top
            time.sleep(random.randint(1, 2))
if __name__ == '__main__':
    # Script entry point: build the spider and start crawling.
    LianjiaSpider().run()
【注意】 1> 只要涉及到条件,加 [] : //dl[@class="xxx"] //dl/dd[2] 2> 只要获取属性值,加 @ : //dl[@class="xxx"] //p/a/@href
【1】// : 从所有节点中查找(包括子节点和后代节点)
【2】@ : 获取属性值
2.1> 使用场景1(属性值作为条件)
//div[@class="movie-item-info"]
2.2> 使用场景2(直接获取属性值)
//div[@class="movie-item-info"]/a/img/@src
【3】练习 - 猫眼电影top100
3.1> 匹配电影名称
//div[@class="movie-item-info"]/p[1]/a/@title
3.2> 匹配电影主演
//div[@class="movie-item-info"]/p[2]/text()
3.3> 匹配上映时间
//div[@class="movie-item-info"]/p[3]/text()
3.4> 匹配电影链接
//div[@class="movie-item-info"]/p[1]/a/@href
xpath表达式1 | xpath表达式2 | xpath表达式3
【1】text() :获取节点的文本内容
xpath表达式末尾不加 /text() :则得到的结果为节点对象
xpath表达式末尾加 /text() 或者 /@href : 则得到结果为字符串
【2】contains() : 匹配属性值中包含某些字符串节点
匹配class属性值中包含 'movie-item' 这个字符串的 div 节点
//div[contains(@class,"movie-item")]
【1】字符串: xpath表达式的末尾为: /text() 、/@href 得到的列表中为'字符串'
【2】节点对象: 其他剩余所有情况得到的列表中均为'节点对象'
[<element dd at xxxa>,<element dd at xxxb>,<element dd at xxxc>]
[<element div at xxxa>,<element div at xxxb>]
[<element p at xxxa>,<element p at xxxb>,<element p at xxxc>]
【1】Ubuntu: sudo pip3 install lxml
【2】Windows: python -m pip install lxml
1、导模块
from lxml import etree
2、创建解析对象
parse_html = etree.HTML(html)
3、解析对象调用xpath
r_list = parse_html.xpath('xpath表达式')
【1】基准xpath: 匹配所有电影信息的节点对象列表
//dl[@class="board-wrapper"]/dd
[<element dd at xxx>,<element dd at xxx>,...]
【2】遍历对象列表,依次获取每个电影信息
item = {}
for dd in dd_list:
item['name'] = dd.xpath('.//p[@class="name"]/a/text()')[0].strip()
item['star'] = dd.xpath('.//p[@class="star"]/text()')[0].strip()
item['time'] = dd.xpath('.//p[@class="releasetime"]/text()')[0].strip()
"""
猫眼电影top100抓取(电影名称、主演、上映时间)
"""
import requests
import time
import random
from lxml import etree
class MaoyanSpider:
    """Scrape the Maoyan top-100 board: movie name, stars, release time."""

    def __init__(self):
        # {} is filled with the paging offset (0, 10, ..., 90) by run()
        self.url = 'https://maoyan.com/board/4?offset={}'
        self.headers = {'User-Agent':'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'}

    def get_html(self, url):
        """Fetch one board page and pass the HTML straight to the parser.

        :param url: fully formatted board URL
        """
        html = requests.get(url=url, headers=self.headers).text
        # Hand off directly to the parse step
        self.parse_html(html)

    def parse_html(self, html):
        """Extract name/star/time for each movie, print it, return all items.

        :param html: HTML text of one board page
        :return: list of item dicts (empty if no movies matched)
        """
        p = etree.HTML(html)
        # Base xpath: one <dd> node per movie entry
        dd_list = p.xpath('//dl[@class="board-wrapper"]/dd')
        items = []
        for dd in dd_list:
            # Fresh dict per movie (a shared dict would alias every row)
            item = {}
            # ROBUSTNESS FIX: guard [0] indexing — a missing field previously
            # raised IndexError and killed the whole crawl.
            name_list = dd.xpath('.//p[@class="name"]/a/@title')
            item['name'] = name_list[0].strip() if name_list else None
            star_list = dd.xpath('.//p[@class="star"]/text()')
            item['star'] = star_list[0].strip() if star_list else None
            time_list = dd.xpath('.//p[@class="releasetime"]/text()')
            item['time'] = time_list[0].strip() if time_list else None
            print(item)
            items.append(item)
        return items

    def run(self):
        """Program entry: crawl all 10 board pages (offsets 0..90)."""
        for offset in range(0, 91, 10):
            url = self.url.format(offset)
            self.get_html(url=url)
            # Throttle: uniform() yields a float in [0, 1) seconds
            time.sleep(random.uniform(0, 1))
if __name__ == '__main__':
    # Script entry point: build the spider and start crawling.
    MaoyanSpider().run()