爬虫 解析 笔记1

解析:xpath

  • xpath
  • 案例1:获取百度网站的百度一下
  • 案例2:爬取站长素材的图片

xpath

xpath基本语法:

  1. 路径查询
    //:查找所有子孙节点,不考虑层级关系
    /:找直接子节点
  2. 谓词查询
    //div[@id]
    //div[@id="maincontent"]
  3. 属性查询
    //@class
  4. 模糊查询
    //div[contains(@id, "he")]
    //div[starts-with(@id, "he")]
  5. 内容查询
    //div/h1/text()
  6. 逻辑运算
    //div[@id="head" and @class="s_down"]
    //title | //price
    注意:xpath既可以解析本地文件也可以解析服务器响应的文件
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8"/>
    <title>Title</title>
</head>
<body>
    <ul>
        <li>北京</li>
        <li>上海</li>
        <li>广州</li>
        <li>深圳</li>
    </ul>
    <ul>
        <li id="l1" class="c1">大连</li>
        <li id="l2">哈尔滨</li>
        <li id="c3">沈阳</li>
        <li id="c4">长春</li>
    </ul>
</body>
</html>
from lxml import etree

# xpath parsing works on two kinds of input:
# (1) a local HTML file
# (2) data from a server response: response.read().decode('utf-8')

# Parse a local file with etree.parse()
# NOTE(review): assumes 'spider_test.html' (the sample HTML above) sits next to this script
tree = etree.parse('spider_test.html')

# tree.xpath('xpath expression') — the return value is always a list
# Find the li elements under body/ul
li_list = tree.xpath("//body/ul/li")
print(li_list)
print(len(li_list))
# Find the text of all li tags that carry an id attribute
li_list = tree.xpath("//ul/li[@id]/text()") # path query
print(li_list)
print(len(li_list))
# Find the text of the li whose id is 'l1'
li_list = tree.xpath("//ul/li[@id='l1']/text()") # predicate query
print(li_list)
print(len(li_list))
# Get the class attribute value of the li whose id is 'l1'
li_list = tree.xpath("//ul/li[@id='l1']/@class") # attribute query
print(li_list)
print(len(li_list))
# Find the text of li tags whose id contains the letter 'l'
li_list =tree.xpath('//ul/li[contains(@id, "l")]/text()') # fuzzy query
print(li_list)
print(len(li_list))
# Find the text of li tags whose id starts with 'c'
li_list =tree.xpath('//ul/li[starts-with(@id, "c")]/text()') # fuzzy query
print(li_list)
print(len(li_list))
# Find the li whose id is 'l1' AND whose class is 'c1'
li_list =tree.xpath('//ul/li[@id="l1" and @class="c1"]/text()') # logical and
print(li_list)
print(len(li_list))
# Find the li with id 'l1' plus the li with id 'c3' (union of two paths)
li_list =tree.xpath('//ul/li[@id="l1"]/text() | //ul/li[@id="c3"]/text()') # union
print(li_list)
print(len(li_list))

案例1:获取百度网站的百度一下

from lxml import etree
import urllib.request

# Case 1: fetch baidu.com and print the label of the "百度一下" search button.
# Steps: (1) fetch the page source (2) parse it (3) print the result
url = 'http://www.baidu.com'
headers = {
    # Placeholder — substitute a real browser User-Agent string before running
    'User-Agent':'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
}
# Customize the Request so our headers are sent with it
request = urllib.request.Request(url=url, headers=headers)
# Pretend to be a browser; the context manager closes the connection
# (the original left the response open — a resource leak)
with urllib.request.urlopen(request) as response:
    # Page source as text
    content = response.read().decode('utf-8')
# Parse the HTML string
tree = etree.HTML(content)
# xpath returns a list; the button label is stored in the input's value attribute
result = tree.xpath('//input[@id="su"]/@value')

print(result[0])

案例2:爬取站长素材的图片

#(1)获取网页源码 (2)解析 (3)下载

#url = 'https://sc.chinaz.com/tupian/shengdanjietupian.html'    # page 1
#url = 'https://sc.chinaz.com/tupian/shengdanjietupian_2.html'  # page 2
#url = 'https://sc.chinaz.com/tupian/shengdanjietupian_3.html'  # page 3

#请求对象的定制
def create_request(page):
    """Build the Request for one page of the Christmas-image listing.

    Args:
        page: 1-based page number; page 1 has no ``_N`` suffix in its URL.

    Returns:
        urllib.request.Request with a (placeholder) User-Agent header.
    """
    if page == 1:
        url = 'https://sc.chinaz.com/tupian/shengdanjietupian.html'
    else:
        url = 'https://sc.chinaz.com/tupian/shengdanjietupian_' + str(page) + '.html'
    headers = {
        # BUG FIX: the key was misspelled 'User-Agegnt', so no User-Agent
        # was actually being spoofed. Replace the value with a real UA string.
        'User-Agent':'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
    }

    request = urllib.request.Request(url=url, headers=headers)
    return request

def get_content(request):
    """Fetch *request* and return the response body decoded as UTF-8.

    Args:
        request: a prepared urllib.request.Request.

    Returns:
        str: the page source.
    """
    # Act as a browser talking to the server; the context manager closes
    # the connection (the original never closed it — a resource leak).
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')

def download_jpg(content):
    """Parse one listing page and save every image under ./Christmas_img/.

    Args:
        content: HTML source of a chinaz.com image-listing page.
    """
    import os  # local import keeps this notes snippet self-contained

    # Parse the page source
    tree = etree.HTML(content)
    # The alt text doubles as the output file name
    name_list = tree.xpath('//body//div[@class="container"]//div[@class="item"]/img/@alt')
    # Image sites usually lazy-load: the real URL sits in data-original, not src
    src_list = tree.xpath('//body//div[@class="container"]//div[@class="item"]/img/@data-original')
    # Ensure the target directory exists (the original crashed when it was missing)
    os.makedirs('./Christmas_img', exist_ok=True)
    # zip pairs names with URLs and stops at the shorter list, avoiding the
    # IndexError the original range(len(...)) loop risked on unequal lists
    for name, src in zip(name_list, src_list):
        # data-original values are protocol-relative ('//...'), so add the scheme
        url = 'https:' + src
        urllib.request.urlretrieve(url=url, filename='./Christmas_img/' + name + '.jpg')

if __name__ == '__main__':
    # Ask the user for an inclusive page range, then crawl page by page.
    first_page = int(input("请输入起始页码:"))
    last_page = int(input("请输入结束页码:"))
    page_num = first_page
    while page_num <= last_page:
        # Build the request, fetch the page source, then save its images.
        download_jpg(get_content(create_request(page_num)))
        page_num += 1
        

你可能感兴趣的:(爬虫,爬虫)