<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8"/>
    <title>Title</title>
</head>
<body>
<table width="200" height="200" border="1">
    <tr>
        <td>姓名</td>
        <td>年龄</td>
        <td>性别</td>
    </tr>
    <tr>
        <td>张三</td>
        <td>18</td>
        <td>男</td>
    </tr>
</table>
<ul>
    <li id="1" class="c1">铁锅炖大鹅</li>
    <li id="2">小鸡炖蘑菇</li>
    <li id="c3">锅包肉</li>
    <li id="c4">小炒鱼</li>
    <li id="l2">荷包鲊</li>
    <li id="2l">牛腩煲</li>
</ul>
<ol>
    <li>穿衣</li>
    <li>洗漱</li>
</ol>
<a href="https://www.bilibili.com/video/BV1rq4y1U7Rz?p=51">页面</a>
</body>
</html>
import urllib.request

url = 'http://www.baidu.com'
# Simulate a browser request
response = urllib.request.urlopen(url)
# read() returns the response body as bytes
# bytes -> str: decode('encoding')
content = response.read().decode('utf-8')
print(content)

# One type and six methods
# type(response)
print(type(response))
# read() with no argument reads the whole body as bytes
# response.read()
# Read 1024 bytes
# content = response.read(1024)
# print(content)
# Read one line
# content = response.readline()
# print(content)
# Read all lines
# content = response.readlines()
# print(content)
# Status code
print(response.getcode())
# The requested URL
print(response.geturl())
# The response headers
print(response.getheaders())
Anti-scraping issue 1: the User-Agent check (UA lists are easy to find online).
HTTPS pages often reject plain requests because of a User-Agent check, so the request has to be disguised as a browser.
import urllib.request
url_page = 'https://www.baidu.com'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}
request = urllib.request.Request(url=url_page, headers=headers)
# urlopen accepts either a URL string or a Request object
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)
Encoding Chinese characters for a URL: use urllib.parse.quote to percent-encode a single value.
import urllib.parse

url_page = 'https://www.baidu.com/s?wd='
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}
name = urllib.parse.quote('毛不易')
url_page += name
print(url_page)
When several parameters need to be encoded at once, use the urlencode method.
import urllib.parse

base_url = 'https://www.baidu.com/s?'
data = {
'wd': '毛不易',
'sex': '男',
'location': '大陆',
}
new_data = urllib.parse.urlencode(data)
url = base_url + new_data
print(url)
More complex requests need a handler, for example when a proxy IP is required (a proxy sketch follows the HTTPHandler example below).
url = 'https://www.baidu.com'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
# Three steps: create a handler, build an opener, then open the request
handler = urllib.request.HTTPHandler()
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
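The HTTPHandler above behaves just like a plain urlopen. For the proxy case mentioned above, a ProxyHandler can be swapped in; a minimal sketch, where the proxy address is only a placeholder:

import urllib.request

url = 'https://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
# Map scheme -> 'ip:port'; 127.0.0.1:7890 is a placeholder, not a working proxy
proxies = {'http': '127.0.0.1:7890', 'https': '127.0.0.1:7890'}
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
print(content)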
Downloading straight to a file: pass in the URL and the filename to save to.
urllib.request.urlretrieve(url, filename)
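A minimal sketch, saving the Baidu homepage to disk (the output name baidu.html is just an example):

import urllib.request

url = 'http://www.baidu.com'
# Download the resource at url and write it to baidu.html
urllib.request.urlretrieve(url, 'baidu.html')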
Install the XPath Helper plugin: search for "xpath helper" in the Chrome Web Store and install it.
Install the lxml library (pip usually lives in the Scripts folder of the Python installation):
pip install lxml -i https://pypi.douban.com/simple
The 51.html file used below is the one shown at the very top.
from lxml import etree
# Parse a local file for xpath queries
tree = etree.parse('51.html')
print(tree)
Basic XPath syntax (a small sketch applying these rules follows the list):
1. Path queries
   // : selects all descendant nodes, regardless of depth
   /  : selects direct children
2. Predicate queries
   //div[@id]
   //div[@id="maincontent"]
3. Attribute queries
   //@class
4. Fuzzy queries
   //div[contains(@id,"he")]
   //div[starts-with(@id,"he")]
5. Content queries
   //div/h1/text()
6. Logical operators
   //div[@id="head" and @class="s_down"]
   //title | //price
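A minimal sketch of these rules against an in-memory snippet (the HTML string and ids below are made up for illustration; etree.HTML parses a string rather than a file):

from lxml import etree

html = '<div id="head" class="s_down"><h1>Hello</h1></div><div id="header2"><h1>World</h1></div>'
tree = etree.HTML(html)
print(tree.xpath('//div[@id]/@id'))                                   # predicate + attribute query
print(tree.xpath('//div[starts-with(@id,"he")]/h1/text()'))           # fuzzy + content query
print(tree.xpath('//div[@id="head" and @class="s_down"]/h1/text()'))  # logical and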
# _*_ coding : utf-8 _*_
# @Time : 2021/9/15 22:56
# @Author : [email protected]
# @File : basic usage of xpath
# @Project : python basics
from lxml import etree
# Parse the local 51.html file for xpath queries
tree = etree.parse('51.html')
print(tree)
# tree.xpath('xpath expression')
# li_list = tree.xpath('//body//li')
# li_list = tree.xpath('//body/ul/li')
# Find all li tags that have an id attribute
# li_list = tree.xpath('//ul/li[@id]')
# text() gets the tag's text content
# li_list = tree.xpath('//ul/li[@id]/text()')
# Find the content of the li whose id is "1" (note the quotes)
# li_list = tree.xpath('//ul/li[@id="1"]/text()')
# Find the class attribute value of the li whose id is "1"
# li_list = tree.xpath('//ul/li[@id="1"]/@class')
# Find li tags whose id contains "c"
# li_list = tree.xpath('//ul/li[contains(@id,"c")]/text()')
# Find li tags whose id starts with "l"
li_list = tree.xpath('//ul/li[starts-with(@id,"l")]/text()')
print(li_list)
To extract fields from JSON, use jsonpath; its usage is similar to XPath and is covered in the reference below. A small sketch follows the link.
https://blog.csdn.net/luxideyao/article/details/77802389
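A minimal sketch, assuming the third-party jsonpath package (pip install jsonpath) and a made-up JSON structure:

import json
from jsonpath import jsonpath   # pip install jsonpath

obj = json.loads('{"store": {"book": [{"author": "Zhou", "price": 10}, {"author": "Mao", "price": 20}]}}')
# $ is the root, .. searches recursively, [*] matches every list element
authors = jsonpath(obj, '$.store.book[*].author')   # ['Zhou', 'Mao']
prices = jsonpath(obj, '$..price')                  # [10, 20]
print(authors, prices)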
BeautifulSoup (bs4 for short) is less efficient than lxml, but its API is friendlier and easier to use.
Install:
pip install bs4 -i https://pypi.douban.com/simple
from bs4 import BeautifulSoup
If the import fails even though the library is installed, check whether a local file named bs4.py is shadowing it.
The three main lookup functions are find, find_all and select; a small sketch of each follows.
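A minimal sketch of the three functions on an invented snippet (the HTML string, ids and class names here are made up for illustration):

from bs4 import BeautifulSoup

html = '<ul class="menu"><li id="l1">soup</li><li id="l2" class="hot">noodles</li></ul>'
soup = BeautifulSoup(html, 'lxml')
print(soup.find('li'))                          # find: the first matching tag
print(soup.find('li', class_='hot').string)     # keyword filters; .string -> 'noodles'
print(soup.find_all('li'))                      # find_all: a list of every match
print(soup.select('#l2'))                       # select: CSS selectors, returns a list
print(soup.select('ul.menu li')[0].get_text())  # descendant selector -> 'soup'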
import urllib.request
url = 'https://www.starbucks.com.cn/menu/'
response = urllib.request.urlopen(url)
content = response.read().decode('utf-8')
from bs4 import BeautifulSoup
soup = BeautifulSoup(content, 'lxml')
# Equivalent XPath rule, for comparison:
# //ul[@class='grid padded-3 product']//strong/text()
name_list = soup.select('ul[class="grid padded-3 product"] strong')
for name in name_list:
print(name.string)
Selenium drives a real browser to fetch data; anti-scraping measures hide some data from plain HTTP requests, so it can only be obtained through a real browser.
Driver download:
https://chromedriver.storage.googleapis.com/index.html
Download the driver that matches your browser version.
Install selenium:
pip install selenium -i https://pypi.douban.com/simple
# 1. Import selenium
from selenium import webdriver
# 2. Create the browser object
path = 'chromedriver.exe'
browser = webdriver.Chrome(path)
# 3. Visit the site
url = 'https://www.jd.com/'
browser.get(url)
content = browser.page_source
print(content)
After unzipping the driver, put chromedriver.exe in the same directory as the .py file.
Element location methods:
# Find an element by id
# button = browser.find_element_by_id('su')
# Find an element by the value of its name attribute
# button = browser.find_element_by_name('wd')
# Find an element with an XPath expression
# button = browser.find_element_by_xpath('//input[@id="su"]')
# Find elements by tag name
# button = browser.find_elements_by_tag_name('input')
# Find elements with a CSS selector (bs4-style syntax)
# button = browser.find_elements_by_css_selector('#su')
button = browser.find_element_by_link_text('直播')
url = 'https://www.baidu.com'
browser.get(url)
button = browser.find_element_by_id('su')
print(button.get_attribute('value'))
print(button.tag_name)
Output:
百度一下
input
Interaction example
# 1. Import selenium
from selenium import webdriver
# 2. Create the browser object
path = 'chromedriver.exe'
browser = webdriver.Chrome(path)
# 3. Visit the site
url = 'https://www.baidu.com'
browser.get(url)
import time
time.sleep(2)
input = browser.find_element_by_id('kw')
input.send_keys('zhoujielun')
time.sleep(2)
button = browser.find_element_by_id('su')
button.click()
time.sleep(2)
# Scroll to the bottom of the page
js_bottom = 'document.documentElement.scrollTop=100000'
browser.execute_script(js_bottom)
time.sleep(2)
# Find the 'next page' button
next = browser.find_element_by_xpath('//a[@class="n"]')
next.click()
time.sleep(2)
# Go back to the previous page
browser.back()
time.sleep(2)
# Go forward again
browser.forward()
time.sleep(3)
browser.quit()
Headless browser mode
Chrome headless lets you use Chrome without opening a UI window.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
def share_browser():
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # path is the location of the Chrome executable on this machine
    path = r'C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chrome.exe'
    chrome_options.binary_location = path
    browser = webdriver.Chrome(chrome_options=chrome_options)
    return browser
browser = share_browser()
url = 'https://www.baidu.com'
browser.get(url)
Install:
pip install requests -i https://pypi.douban.com/simple
import requests
url = 'https://www.baidu.com'
response = requests.get(url=url)
# One type and six attributes
# The type of response
print(type(response))
# Set the response encoding
response.encoding = 'utf-8'
# Page source as a string
print(response.text)
# The response URL
print(response.url)
# Response body as bytes
print(response.content)
# HTTP status code
print(response.status_code)
# Response headers
print(response.headers)
GET request
import requests
url = 'https://www.baidu.com/s?'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}
data = {
'wd': '北京'
}
response = requests.get(url=url, params=data, headers=headers)
content = response.text
print(content)
POST requests work the same way with requests.post; overall, requests is simpler than urllib. A small sketch follows.
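A minimal POST sketch; https://httpbin.org/post is used here only because it echoes back whatever it receives:

import requests

url = 'https://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}
data = {'kw': '北京'}
# Form fields go in data= (use json= to send a JSON body instead)
response = requests.post(url=url, data=data, headers=headers)
response.encoding = 'utf-8'
print(response.text)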
When a page has to be requested several times but the site expects them to belong to the same visit, use requests' Session object so that all requests share one session (cookies).
Some pages also contain hidden form fields whose values must be scraped first and sent back with the next request; a sketch of both ideas follows.
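A minimal sketch of the session + hidden-field idea, assuming a made-up login page whose form carries a hidden __token input (the URL, field names and credentials are all placeholders):

import requests
from bs4 import BeautifulSoup

session = requests.Session()                 # every request below shares the same cookies
login_page = 'https://example.com/login'     # placeholder URL

# 1. Fetch the form page first and scrape the hidden field
resp = session.get(login_page)
soup = BeautifulSoup(resp.text, 'lxml')
token = soup.find('input', attrs={'name': '__token'})['value']   # hypothetical hidden field

# 2. Post the form within the same session, sending the hidden value back
data = {'username': 'user', 'password': 'pass', '__token': token}
resp = session.post(login_page, data=data)
print(resp.status_code)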
Scrapy is the scraping framework most widely used in industry. Install it with:
pip install scrapy -i https://pypi.douban.com/simple
1. scrapy startproject project_name (the name cannot start with a digit or contain Chinese characters)
2. Create the spider file; it must be created inside the spiders folder:
cd project_name\project_name\spiders
Create the spider:
scrapy genspider spider_name website_to_crawl
scrapy genspider baidu www.baidu.com
3. Run the spider:
scrapy crawl baidu
Use scrapy shell for interactive debugging in a terminal. The generated spider skeleton looks roughly like the sketch below.
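Roughly what scrapy genspider baidu www.baidu.com generates inside spiders/baidu.py; the parse body is my own minimal addition:

import scrapy

class BaiduSpider(scrapy.Spider):
    name = 'baidu'                          # used with `scrapy crawl baidu`
    allowed_domains = ['www.baidu.com']     # requests outside this domain are filtered out
    start_urls = ['http://www.baidu.com/']  # first URLs to fetch

    def parse(self, response):
        # response supports .text and .xpath(); print the page title as a sanity check
        print(response.xpath('//title/text()').get())

Note that Scrapy obeys robots.txt by default (ROBOTSTXT_OBEY = True in settings.py), which may block requests on some sites.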