Python Web Scraping


Page structure (this is the 51.html used in the XPath and bs4 examples below)

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8"/>
    <title>Title</title>
</head>
<body>

    <table width="200px" height="200px" border="1px">
        <tr>
            <td>姓名</td>
            <td>年龄</td>
            <td>性别</td>
        </tr>
        <tr>
            <td>张三</td>
            <td>18</td>
            <td></td>
        </tr>
    </table>

        <ul>
            <li id="1" class="c1">铁锅炖大鹅</li>
            <li id="2">小鸡炖蘑菇</li>
            <li id="c3">锅包肉</li>
            <li id="c4">小炒鱼</li>
            <li id="l2">荷包鲊</li>
            <li id="2l">牛腩煲</li>
        </ul>

        <ol>
            <li>穿衣</li>
            <li>洗漱</li>
        </ol>


<a href="https://www.bilibili.com/video/BV1rq4y1U7Rz?p=51">页面</a>
</body>
</html>

urllib

import urllib.request

url = 'http://www.baidu.com'

# Simulate a browser request
response = urllib.request.urlopen(url)
# read() returns the response body as bytes
# bytes -> string: decode('encoding')
content = response.read().decode('utf-8')
print(content)
# One type and six methods
# type(response)
print(type(response))

# read() with no argument reads the entire body
# response.read()

# Read at most 1024 bytes
# content = response.read(1024)
# print(content)

# Read a single line
# content = response.readline()
# print(content)

# Read all remaining lines into a list
# content = response.readlines()
# print(content)

# HTTP status code
print(response.getcode())

# The URL that was requested
print(response.geturl())

# The response headers
print(response.getheaders())

Anti-scraping workaround 1: User-Agent

The HTTPS version of the site checks the User-Agent header and blocks bare requests, so the request has to be disguised as a browser:

import urllib.request

url_page = 'https://www.baidu.com'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}

request = urllib.request.Request(url=url_page, headers=headers)
# urlopen accepts either a URL string or a Request object
response = urllib.request.urlopen(request)

content = response.read().decode('utf-8')
print(content)

The quote method

quote percent-encodes Chinese (non-ASCII) characters so they can be used in a URL:

import urllib.parse

url_page = 'https://www.baidu.com/s?wd='

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}

name = urllib.parse.quote('毛不易')
url_page += name
print(url_page)
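
The encoded URL can then be requested exactly as before:

import urllib.request

request = urllib.request.Request(url=url_page, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)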

urlencode

When the query string has multiple parameters that need encoding, use the urlencode method:

import urllib.parse

base_url = 'https://www.baidu.com/s?'

data = {
    'wd': '毛不易',
    'sex': '男',
    'location': '大陆',
}

new_data = urllib.parse.urlencode(data)

url = base_url + new_data
print(url)

handler

More complex requests need a handler, for example when going through a proxy IP (a proxy sketch follows the basic example below):

import urllib.request

url = 'https://www.baidu.com'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}

request = urllib.request.Request(url=url, headers=headers)

# The pattern: handler -> build_opener -> open

handler = urllib.request.HTTPHandler()

opener = urllib.request.build_opener(handler)

response = opener.open(request)

content = response.read().decode('utf-8')
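
To route through a proxy, swap HTTPHandler for ProxyHandler. A minimal sketch, assuming a placeholder proxy address (replace it with a live one; https URLs would also need an 'https' key):

import urllib.request

url = 'http://www.baidu.com'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}

request = urllib.request.Request(url=url, headers=headers)

# The proxies dict maps scheme to 'host:port'; this address is a made-up placeholder
proxies = {'http': '118.24.219.151:16817'}

handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)

content = response.read().decode('utf-8')
print(content)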

Downloading

urlretrieve takes the URL and the filename to save it to:

urllib.request.urlretrieve(url, filename)
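
For example, a minimal sketch that saves the Baidu homepage locally (the filename is arbitrary):

import urllib.request

# Fetch the page and write it to baidu.html in the current directory
urllib.request.urlretrieve('http://www.baidu.com', 'baidu.html')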

XPath plugin and lxml

To install the XPath plugin, search for "XPath Helper" in the Chrome Web Store and install it.

Install the lxml library (run pip from Python's Scripts folder if pip is not on your PATH):

pip install lxml -i https://pypi.douban.com/simple

51.html is the page structure shown at the top of this article.

XPath basics
1. Path queries
	// : select all descendant nodes, regardless of depth
	/ : select direct children only
2. Predicate queries
	//div[@id]
	//div[@id="maincontent"]
3. Attribute queries
	//@class
4. Fuzzy matching
	//div[contains(@id,"he")]
	//div[starts-with(@id,"he")]
5. Text content
	//div/h1/text()
6. Logical operators
	//div[@id="head" and @class="s_down"]
	//title | //price
# _*_ coding : utf-8 _*_
# @Time : 2021/9/15 22:56
# @Author : [email protected]
# @File : basic xpath usage
# @Project : python basics

from lxml import etree

# Parse a local file with XPath
# (etree.parse expects well-formed markup; pass etree.HTMLParser() as a second argument for loose HTML)
tree = etree.parse('51.html')
print(tree)

# tree.xpath('xpath expression')
# li_list = tree.xpath('//body//li')
# li_list = tree.xpath('//body/ul/li')


# Find all li tags that have an id attribute
# li_list = tree.xpath('//ul/li[@id]')
# text() extracts the tag's text content
# li_list = tree.xpath('//ul/li[@id]/text()')

# Find the text of the li with id "1" (note the double quotes)
# li_list = tree.xpath('//ul/li[@id="1"]/text()')

# Get the class attribute of the li with id "1"
# li_list = tree.xpath('//ul/li[@id="1"]/@class')

# Find li tags whose id contains "c"
# li_list = tree.xpath('//ul/li[contains(@id,"c")]/text()')
# Find li tags whose id starts with "l"
li_list = tree.xpath('//ul/li[starts-with(@id,"l")]/text()')
print(li_list)

jsonpath

Use jsonpath to query fields in JSON data; its usage is similar to XPath. Reference:

https://blog.csdn.net/luxideyao/article/details/77802389
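
A minimal sketch using the third-party jsonpath package (pip install jsonpath); the sample data is made up:

import json
from jsonpath import jsonpath

obj = json.loads('{"store": {"book": [{"title": "红楼梦"}, {"title": "西游记"}]}}')

# $..title selects every title field, much like //title in XPath
titles = jsonpath(obj, '$..title')
print(titles)  # a list of matches; jsonpath returns False when nothing matches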

BeautifulSoup

Short for bs4. It is less efficient than lxml, but its API is friendlier and easier to use.

Installation

pip install bs4 -i https://pypi.douban.com/simple
from bs4 import BeautifulSoup

If the import fails even though the library is installed, check whether a file named bs4.py in your project is shadowing the package.

Three main functions: find, find_all, and select. A quick sketch on 51.html follows, then a real-page example.
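
A minimal sketch of the three functions on the 51.html page from the top of this article (the id and class values come from that file):

from bs4 import BeautifulSoup

soup = BeautifulSoup(open('51.html', encoding='utf-8'), 'lxml')

# find returns the first matching tag
print(soup.find('li', id='1'))
# find_all returns a list of every match
print(soup.find_all('li'))
# select takes a CSS selector and also returns a list
print(soup.select('.c1'))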

import urllib.request
from bs4 import BeautifulSoup

url = 'https://www.starbucks.com.cn/menu/'

response = urllib.request.urlopen(url)

content = response.read().decode('utf-8')

soup = BeautifulSoup(content, 'lxml')
# The equivalent XPath rule:
# //ul[@class='grid padded-3 product']//strong/text()

name_list = soup.select('ul[class="grid padded-3 product"] strong')

for name in name_list:
    print(name.string)

selenium

Selenium drives a real browser to request data. Anti-scraping measures can hide data from plain HTTP clients, in which case a real browser is the only way to get it.

Driver download:

https://chromedriver.storage.googleapis.com/index.html

Download the driver that matches your Chrome version.

Install selenium:

pip install selenium -i https://pypi.douban.com/simple

# 1. Import selenium
from selenium import webdriver

# 2. Create a browser object
path = 'chromedriver.exe'

browser = webdriver.Chrome(path)

# 3. Visit a site

url = 'https://www.jd.com/'

browser.get(url)

content = browser.page_source
print(content)

After unzipping the driver, put chromedriver.exe in the same directory as your .py file.

Element locating

# Locate by id
# button = browser.find_element_by_id('su')
# Locate by the value of the name attribute
# button = browser.find_element_by_name('wd')
# Locate by an XPath expression
# button = browser.find_element_by_xpath('//input[@id="su"]')
# Locate by tag name
# button = browser.find_elements_by_tag_name('input')
# Locate by CSS selector (bs4-style syntax)
# button = browser.find_elements_by_css_selector('#su')


url = 'https://www.baidu.com'
browser.get(url)

# Locate by link text (the page must be loaded before elements can be found)
button = browser.find_element_by_link_text('直播')

button = browser.find_element_by_id('su')
print(button.get_attribute('value'))
print(button.tag_name)

Output:
百度一下
input

Interaction example

# 1. Import selenium
from selenium import webdriver
import time

# 2. Create a browser object
path = 'chromedriver.exe'

browser = webdriver.Chrome(path)

# 3. Visit the site

url = 'https://www.baidu.com'

browser.get(url)

time.sleep(2)

# Type into the search box
input_box = browser.find_element_by_id('kw')
input_box.send_keys('zhoujielun')
time.sleep(2)

# Click the search button
button = browser.find_element_by_id('su')
button.click()
time.sleep(2)

# Scroll to the bottom of the page
js_bottom = 'document.documentElement.scrollTop=100000'
browser.execute_script(js_bottom)
time.sleep(2)

# Find and click the 'next page' button
next_button = browser.find_element_by_xpath('//a[@class="n"]')
next_button.click()

time.sleep(2)
# Go back one page
browser.back()
time.sleep(2)
# And forward again
browser.forward()
time.sleep(3)

browser.quit()

Headless browser mode

Chrome headless lets you use Chrome without opening a UI window.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def share_browser():
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')

    # path is the location of your Chrome executable
    path = r'C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chrome.exe'
    chrome_options.binary_location = path

    browser = webdriver.Chrome(chrome_options=chrome_options)
    return browser


browser = share_browser()

url = 'https://www.baidu.com'
browser.get(url)

requests

Installation

pip install requests -i https://pypi.douban.com/simple

import requests

url = 'https://www.baidu.com'

response = requests.get(url=url)

# One type and six attributes
# The type of the response object
print(type(response))

# Set the response encoding
response.encoding = 'utf-8'
# The page source as a string
print(response.text)
# The requested URL
print(response.url)
# The body as raw bytes
print(response.content)
# The HTTP status code
print(response.status_code)
# The response headers
print(response.headers)

GET requests

import requests

url = 'https://www.baidu.com/s?'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}

data = {
    'wd': '北京'
}

response = requests.get(url=url, params=data, headers=headers)

content = response.text
print(content)

POST requests are similar (requests.post); overall, requests is simpler than urllib. A sketch follows.
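
A minimal POST sketch. The Baidu Fanyi suggestion endpoint is a common tutorial target; treat the URL and payload as assumptions that may have changed:

import requests

url = 'https://fanyi.baidu.com/sug'  # assumed endpoint

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}

data = {
    'kw': 'spider'
}

# For POST, the payload goes in data= rather than params=
response = requests.post(url=url, data=data, headers=headers)
print(response.json())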

When a flow needs several requests that the site expects to come from the same visit, use a Session from requests, so that cookies persist and the requests count as one session.

The page's form may also contain hidden fields whose values have to be scraped first; a sketch of both ideas follows.
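
A sketch of the session plus hidden-field pattern; the URL and the __token field name are made up for illustration:

import requests
from bs4 import BeautifulSoup

session = requests.Session()  # cookies persist across requests made on this session

# 1. Fetch the login page and scrape the hidden form field (field name is hypothetical)
login_page = session.get('https://example.com/login')
soup = BeautifulSoup(login_page.text, 'lxml')
token = soup.find('input', attrs={'name': '__token'})['value']

# 2. Post the form including the hidden value; the same session carries the cookies
data = {'username': 'user', 'password': 'pass', '__token': token}
session.post('https://example.com/login', data=data)

# 3. Later requests on this session are treated as the same visit
profile = session.get('https://example.com/profile')
print(profile.status_code)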

scrapy

The scraping framework most widely used in industry.

pip install scrapy -i https://pypi.douban.com/simple

Creating a project

1. scrapy startproject project_name (the name must not start with a digit or contain Chinese characters)

2. Create the spider file; it must be created inside the spiders folder:

cd project_name\project_name\spiders

Then generate the spider file:

scrapy genspider spider_name site_to_scrape

scrapy genspider baidu www.baidu.com

3. Run the spider:

scrapy crawl baidu
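
For reference, the baidu spider generated in step 2 looks roughly like this; the parse body below is an illustrative addition (the generated file just contains pass), and settings.py may need ROBOTSTXT_OBEY = False before Baidu returns data:

import scrapy


class BaiduSpider(scrapy.Spider):
    # The name used with 'scrapy crawl'
    name = 'baidu'
    allowed_domains = ['www.baidu.com']
    start_urls = ['http://www.baidu.com/']

    def parse(self, response):
        # The response object supports XPath directly
        print(response.xpath('//title/text()').extract_first())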

scrapy shell provides an interactive terminal for debugging; a short example follows.
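
For example:

scrapy shell www.baidu.com

Inside the shell, a response object is already populated, so expressions such as response.xpath('//title/text()').extract_first() can be tried interactively before being written into a spider.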
