一. urllib
urllib is a module bundled with Python's standard library that is commonly used for crawling.
Purpose: it lets code simulate a browser and send requests.
Usage flow:
1. A first urllib crawler
# Requirement: crawl the page data of the Sogou homepage
import urllib.request
# 1. Specify the url
url = 'https://www.sogou.com/'
# 2. Send the request: urlopen sends a request to the specified url and returns a response object
response = urllib.request.urlopen(url=url)
# 3. Get the page data: read() returns the page data stored in the response object
page_text = response.read()
# 4. Persist the data
with open('./sougou.html', 'wb') as f:
    f.write(page_text)
print("Done")
2. URL encoding with urllib
# Requirement: crawl the search-result page for a given keyword
import urllib.request
import urllib.parse
# Specify the url
url = 'https://www.sogou.com/web?query='
# URL restriction: a url may not contain non-ASCII characters, and Chinese characters are not part of ASCII
word = urllib.parse.quote("宝马")
url += word
# Send the request
response = urllib.request.urlopen(url=url)
# Get the page data
page_text = response.read()
# Persist it
with open('./bmw.html', 'wb') as f:
    f.write(page_text)
3. POST requests with urllib
# Sending a POST request with the urllib module
# Requirement: crawl Baidu Translate's translation result
import urllib.request
import urllib.parse
# 1. Specify the url
url = 'https://fanyi.baidu.com/sug'
# Process the parameters carried by the POST request
# Steps:
# 1). Pack the POST parameters into a dict
data = {
    'kw': "苹果"
}
# 2). Encode them with parse.urlencode (which returns a str)
query = urllib.parse.urlencode(data)
# 3). Convert the result of step 2 into bytes
data = query.encode()
# 2. Send the POST request: urlopen's data parameter is the processed POST payload
response = urllib.request.urlopen(url=url, data=data)
print(response.read())
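The bytes printed above are a JSON document, so they can be decoded and parsed. A minimal sketch (it sends the request again, since read() can only be consumed once; the exact shape of the parsed dict depends on the live API and is not guaranteed here):
import json
response = urllib.request.urlopen(url=url, data=data)
result = json.loads(response.read().decode('utf-8'))  # parse the JSON body returned by the sug endpoint
print(result)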
二. The requests module
requests is a third-party Python module for network requests; it simulates a browser sending requests (install it with pip install requests).
1. GET requests with requests:
1) A simple GET request
import requests
# Requirement: crawl the page data of the Sogou homepage
url = 'https://www.sogou.com/'
# Send a GET request; get() returns the response object of a successful request
response = requests.get(url=url)
# Get the data in the response: text gives the page data of the response object as a string
page_data = response.text
# Other important attributes of the response object
# content gives the page data of the response object as bytes
# print(response.content)
# the response status code
# print(response.status_code)
# the response headers
# print(response.headers)
# the requested url
# print(response.url)
# Persist the data
with open('./sougou_requests.html', 'w', encoding='utf-8') as f:
    f.write(page_data)
2) GET requests carrying parameters with requests
Approach 1:
# Handling a GET request with parameters using the requests module
# Requirement: given a keyword, get the Sogou search-result page for it
# 1. Specify the url
url = 'https://www.sogou.com/web?query=宝马&ie=utf-8'
response = requests.get(url=url)
page_text = response.text
with open('./bmw_requests.html', 'w', encoding='utf-8') as f:
    f.write(page_text)
Approach 2:
# Handling a GET request with parameters using the requests module
# Requirement: given a keyword, get the Sogou search-result page for it
url = 'https://www.sogou.com/web'
# Pack the parameters into a dict
params = {
'query': '宝马',
'ie': 'utf-8'
}
response = requests.get(url=url, params=params)
print(response.text)
3) Custom request headers
# Custom request headers
import requests
url = 'https://www.sogou.com/web'
params = {
'query': '宝马',
'ie': 'utf-8'
}
# Custom request headers
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
}
response = requests.get(url=url, params=params, headers=headers)
print(response.text)
2. POST requests with requests
import requests
# 1. Specify the url of the POST request
url = 'https://accounts.douban.com/login'
# 2. Send the POST request
data = {
'source': 'movie',
'redir': 'https://movie.douban.com/',
'form_email' : '[email protected]',
'form_password' : 'xxx',
'login' : '登录'
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
}
response = requests.post(url=url, data=data, headers=headers)
# 3. Get the page data from the response object
page_text = response.text
# 4. Persist the data
with open('./douban_request.html', 'w', encoding='utf-8') as f:
    f.write(page_text)
3. AJAX GET requests
import requests
url = 'https://movie.douban.com/j/chart/top_list?'
# Pack the parameters carried by the AJAX GET request
params = {
'type' : '13',
'interval_id' : '100:90',
'action': '',
'start': '0',
'limit': '20',
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
}
response = requests.get(url, params=params, headers=headers)
print(response.text)
4. AJAX POST requests
import requests
# 1. Specify the url
post_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
# Process the parameters of the POST request
data = {
'cname': '',
'pid': '',
'keyword': '上海',
'pageIndex': 1,
'pageSize': 10
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
}
# 2. Send the AJAX-style POST request
response = requests.post(url=post_url, data=data, headers=headers)
print(response.text)
5. A small end-to-end exercise
import requests
import os
# Create a folder for the pages
if not os.path.exists('./zhihu_pages'):
    os.mkdir('./zhihu_pages')
word = input('enter a word: ')
# Let the user choose the page range dynamically
start_pageNum = int(input('enter a start pageNum: '))
end_pageNum = int(input('enter a end pageNum: '))
# Specify the url: design it as a reusable, generic url
url = 'http://zhihu.sogou.com/zhihu'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
}
for page in range(start_pageNum, end_pageNum+1):
    params = {
        'query': word,
        'page': page,
        'ie': 'utf-8'
    }
    response = requests.get(url=url, params=params, headers=headers)
    # Get the page data from the response
    page_text = response.text
    # Persist it
    filename = word + str(page) + '.html'
    filePath = 'zhihu_pages/' + filename
    with open(filePath, 'w', encoding='utf-8') as f:
        f.write(page_text)
    print("page %s written successfully" % page)
6. Carrying cookies with requests
What cookies are for: the server uses cookies to record state information about the client.
Flow: 1. perform the login (which obtains the cookie); 2. when requesting the personal homepage, carry that cookie along with the request.
import requests
# Get a session object
session = requests.session()
# 1. Send the login request: the cookie is obtained and stored in the session object
login_url = 'https://accounts.douban.com/login'
data = {
'source': 'None',
'redir': 'https://www.douban.com/people/186539740/',
'form_email' : '[email protected]',
'form_password' : 'xxx',
'login' : '登录'
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
}
# Send the POST request through the session
login_response = session.post(url=login_url, data=data, headers=headers)
# 2. Request the personal homepage (the session carries the cookie) and get its page data
url = 'https://www.douban.com/people/186539740/'
response = session.get(url=url, headers=headers)
page_text = response.text
with open('./douban_person1.html', 'w', encoding='utf-8') as f:
    f.write(page_text)
7. Proxies with the requests module
Proxy types: 1. forward proxy: proxies on behalf of the client to fetch data; 2. reverse proxy: proxies on behalf of the server to provide data.
Sites that offer free proxy IPs: www.goubanjia.com; kuaidaili; xicidaili
import requests
url = 'https://www.baidu.com/s?ie=utf-8&wd=ip'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
}
# Pack the proxy ip into a dict
proxy = {
'http': '39.137.77.66:8080'
}
# Switch the outgoing IP
response = requests.get(url=url, proxies=proxy, headers=headers)
with open('./proxy1.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
8. Data parsing
1) Regular expressions
# Requirement: use a regex to parse and download the images on the qiushibaike picture page
import requests
import re
import os
# Specify the url
url = 'https://www.qiushibaike.com/pic/'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
}
# Send the request
response = requests.get(url=url, headers=headers)
# Get the page data
page_text = response.text
# Parse the data: capture the src attribute of the img inside each div with class "thumb"
img_list = re.findall('<div class="thumb">.*?<img src="(.*?)".*?</div>', page_text, re.S)
# Create a folder to store the image data
if not os.path.exists('./imgs_qiushi'):
    os.mkdir('imgs_qiushi')
for url in img_list:
    # Join the image url into a complete url
    img_url = 'https:' + url
    # Persist the data: what is stored is the image data itself, not the url
    # Get the binary data of the image
    img_data = requests.get(url=img_url, headers=headers).content
    imgName = url.split('/')[-1]
    imgPath = 'imgs_qiushi/' + imgName
    with open(imgPath, 'wb') as f:
        f.write(img_data)
    print(imgName + " written")
2) xpath
How xpath is used in a crawler:
(1) Install: pip install lxml
(2) Import: from lxml import etree
(3) Create an etree object and parse the target data with it
Commonly used xpath expressions (a small runnable demo follows this list):
(1) Locating by attribute:
# find the div tag whose class attribute is song: "//div[@class='song']"
(2) Locating by hierarchy & index:
# find the a tag that is a direct child of the 2nd li under the ul directly inside the div whose class is tang: "//div[@class='tang']/ul/li[2]/a"
(3) Logical operators:
# find the a tag whose href attribute is empty and whose class attribute is du: "//a[@href='' and @class='du']"
(4) Fuzzy matching:
# "//div[contains(@class, 'ng')]"
# "//div[starts-with(@class, 'ta')]"
(5) Getting text:
# / gets the text directly inside a tag
# // gets the text of a tag and of all of its descendant tags
# "//div[@class='song']/p[1]/text()"
# "//div[@class='tang']//text()"
(6) Getting an attribute:
# "//div[@class='tang']//li[2]/a/@href"
# Requirement: use xpath to parse the titles and contents of the jokes on ishuo.cn, and persist them
import requests
from lxml import etree
# 1. Specify the url
url = 'https://ishuo.cn/joke'
# 2. Send the request
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
# 3. Get the page content
page_text = response.text
# 4. Parse the data
tree = etree.HTML(page_text)
# Get all of the li tags
li_list = tree.xpath("//div[@id='list']/ul/li")
with open('./duanzi.txt', 'w', encoding='utf-8') as f:
    for li in li_list:
        content = li.xpath("./div[@class='content']/text()")[0]
        title = li.xpath("./div[@class='info']/a/text()")[0]
        # 5. Persist
        f.write(title + ":" + content + "\n\n")
3) bs4
Core idea: convert the HTML document into a BeautifulSoup object, then use that object's attributes and methods to locate and look up the desired content in the HTML document.
Attributes and methods (a small runnable demo follows this list):
(1) Look up by tag name
soup.a  # only finds the first tag that matches
(2) Get attributes
soup.a.attrs  # get all of a's attributes and values, returned as a dict
soup.a.attrs['href']  # get the href attribute
soup.a['href']  # shorthand for the same thing
(3) Get content
soup.a.string
soup.a.text
soup.a.get_text()
Note: if the tag contains other tags, string returns None, while the other two still return the text content.
(4) find: find the first tag that matches
soup.find('a')  # first match
soup.find('a', title='xxx')
soup.find('a', alt='xxx')
soup.find('a', class_='xxx')
soup.find('a', id='xxx')
(5) find_all: find all tags that match
soup.find_all('a')
soup.find_all(['a', 'b'])  # find all a and b tags
soup.find_all('a', limit=2)  # only the first two
(6) Select content with CSS selectors
select: soup.select('#feng')
- common selectors: tag selector (a), class selector (.), id selector (#), hierarchy selectors
hierarchy selectors:
div .dudu #lala .meme .xixi   # descendant selectors, any number of levels down
div > p > a > .lala           # child selectors, one level at a time
Note: select always returns a list, so the target object has to be pulled out by index.
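The following sketch exercises a few of these lookups on a tiny hand-written HTML string (the markup and the #feng id are invented for illustration only):
from bs4 import BeautifulSoup

html = '''
<div>
  <p id="feng">hello <a href="http://example.com" class="du" title="t1">link1</a></p>
  <p><a href="http://example.org" class="du">link2</a></p>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.a['href'])               # http://example.com
print(soup.find('a', class_='du'))  # first a tag with class "du"
print(len(soup.find_all('a')))      # 2
print(soup.select('#feng')[0].text) # text of the p tag whose id is "feng"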
# Requirement: crawl the chapter titles and contents of the Three Kingdoms novel from shicimingju.com
import requests
from bs4 import BeautifulSoup
url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
}
# Given a chapter url, return the article content for that chapter title
def get_content(url):
    content_page = requests.get(url=url, headers=headers).text
    soup = BeautifulSoup(content_page, 'lxml')
    div = soup.find('div', class_='chapter_content')
    return div.text
page_text = requests.get(url=url, headers=headers).text
soup = BeautifulSoup(page_text, 'lxml')
a_list = soup.select('.book-mulu > ul > li > a')
with open('./threekingdoms.txt', 'w', encoding='utf-8') as f:
    for a in a_list:
        title = a.string
        content_url = 'http://www.shicimingju.com' + a['href']
        content = get_content(content_url)
        f.write(title + ":\n" + content + "\n\n\n")
        print(title + ": written")
9. Crawling dynamically loaded page data
1) selenium: a third-party library that can drive a browser to perform automated operations
(1) Environment setup
a. Install: pip install selenium
b. Get the browser driver:
Chrome driver download: chromedriver.storage.googleapis.com/index.html
The downloaded driver must match the browser version; see the mapping table: http://blog.csdn.net/huilan_same/article/details/51896672
from selenium import webdriver
# Create a browser object; executable_path is the path to the driver
b = webdriver.Chrome(executable_path='./chromedriver')
# get() takes a url and makes the browser request it
url = 'https://www.baidu.com'
b.get(url)
# Use the methods below to locate the elements to operate on
# find_element_by_id            locate a node by id
# find_elements_by_name         locate by name
# find_elements_by_xpath        locate by xpath
# find_elements_by_tag_name     locate by tag name
# find_elements_by_class_name   locate by class name
# Make Baidu search for a given keyword
text = b.find_element_by_id('kw')  # located the text input box
text.send_keys('人民币')  # send_keys types the given content into the box
button = b.find_element_by_id('su')
button.click()  # click performs a click
b.quit()  # close the browser
2) phantomjs
from selenium import webdriver
b = webdriver.PhantomJS(executable_path='phantomjs')
# Open the page
b.get('https://www.baidu.com')
# Take a screenshot
b.save_screenshot('./1.png')
text = b.find_element_by_id('kw')
text.send_keys('人民币')
b.save_screenshot('./2.png')
b.quit()
3) A combined example
from selenium import webdriver
b = webdriver.PhantomJS(executable_path='phantomjs')
url = 'https://movie.douban.com/typerank?type_name=%E5%8A%A8%E4%BD%9C&type=5&interval_id=100:90&action='
b.get(url)
js = 'window.scrollTo(0, document.body.scrollHeight)'
b.execute_script(js)
page_text = b.page_source
print(page_text)
Supplement:
Case 1: crawl the news page of autohome.com.cn
'''
Crawl the news page data of autohome.com.cn
'''
import requests
from bs4 import BeautifulSoup
import os
# make sure the imgs folder exists (the loop below writes the images into it)
if not os.path.exists("imgs"):
    os.mkdir("imgs")
# 1. Pretend to be a browser and send the request
r1 = requests.get(url="https://www.autohome.com.cn/news/")
r1.encoding = "gbk"
# 2. Parse the data we want out of the response body
soup = BeautifulSoup(r1.text, "html.parser")
# 3. Find by rule: the first div tag whose id is "auto-channel-lazyload-article"
container = soup.find(name="div", attrs={"id": "auto-channel-lazyload-article"})
# 4. Find all of the li tags inside the container; a list is returned
li_list = container.find_all(name="li")
for tag in li_list:
    title = tag.find(name="h3")
    if not title:
        continue
    summary = tag.find(name='p')
    a_tag = tag.find(name='a')
    url = "https:" + a_tag.attrs.get('href')
    img = tag.find(name="img")
    img_url = "https:" + img.get("src")
    print(title.text)
    print(summary.text)
    print(url)
    print(img_url)
    r2 = requests.get(url=img_url)
    img_name = img_url.rsplit('/', maxsplit=1)[1]
    img_path = os.path.join("imgs", img_name)
    with open(img_path, 'wb') as f:
        f.write(r2.content)
    print("_______________________________________________________________")
Case 2: crawl the front-page news of the Chouti hot list and save the cover images
'''
Crawl Chouti hot-list news
'''
import requests
from bs4 import BeautifulSoup
import os
r1 = requests.get(
url="https://dig.chouti.com/",
headers={
'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
}
)
s1 = BeautifulSoup(r1.text, 'html.parser')
container = s1.find(name="div", attrs={'id': 'content-list'})
news_list = container.find_all(name='div', attrs={'class': 'item'})
for news in news_list:
    title = news.find(name='a', attrs={'class': 'show-content'})
    summary = news.find(name='span', attrs={'class': 'summary'})
    print("Title: ", title.text)
    img_div = news.find(name='div', attrs={'class': 'news-pic'})
    img_url = "https:" + img_div.find(name='img').get('original')
    r2 = requests.get(img_url)
    img_name = img_url.rsplit('/', maxsplit=1)[1].split('?')[0]
    img_path = os.path.join('imgs', img_name)
    with open(img_path, 'wb') as f:
        f.write(r2.content)
    if not summary:
        print("----------------------------------------------------------")
        continue
    print("Summary: ", summary.text)
    print("----------------------------------------------------------")
Case 3: log in to Chouti and upvote a post
'''
Log in automatically via code, then upvote
'''
import requests
import bs4
# Part 1: log in
r1 = requests.get(
url="https://dig.chouti.com/",
headers={
'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
}
)
r1_cookie_dict = r1.cookies.get_dict()
r2 = requests.post(
url="https://dig.chouti.com/login",
headers={
'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
},
data={
'phone':"999999",
'password': "XXXXXX",
'oneMonth': 1,
},
cookies = r1_cookie_dict
)
print(r2.text)
# Part 2: upvote
r3 = requests.post(
url='https://dig.chouti.com/link/vote?linksId=20843176',
headers={
'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
},
cookies=r1_cookie_dict
)
print(r3.text)
Case 4: crawl short videos from Douyin
import requests
user_id = '58841646784'
# Get all of the user's works
"""
signature = _bytedAcrawler.sign('user id')
douyin_falcon:node_modules/byted-acrawler/dist/runtime
"""
import subprocess
signature = subprocess.getoutput('node s1.js %s' %user_id)
user_video_list = []
# ############################# get the user's own videos ##########################
user_video_params = {
'user_id': str(user_id),
'count': '21',
'max_cursor': '0',
'aid': '1128',
'_signature': signature,
'dytk': 'b4dceed99803a04a1c4395ffc81f3dbc' # '114f1984d1917343ccfb14d94e7ce5f5'
}
def get_aweme_list(max_cursor=None):
    if max_cursor:
        user_video_params['max_cursor'] = str(max_cursor)
    res = requests.get(
        url="https://www.douyin.com/aweme/v1/aweme/post/",
        params=user_video_params,
        headers={
            'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'x-requested-with':'XMLHttpRequest',
            'referer':'https://www.douyin.com/share/user/58841646784',
        }
    )
    content_json = res.json()
    aweme_list = content_json.get('aweme_list', [])
    user_video_list.extend(aweme_list)
    if content_json.get('has_more') == 1:
        return get_aweme_list(content_json.get('max_cursor'))
get_aweme_list()
# ############################# get the user's liked videos ##########################
favor_video_list = []
favor_video_params = {
'user_id': str(user_id),
'count': '21',
'max_cursor': '0',
'aid': '1128',
'_signature': signature,
'dytk': 'b4dceed99803a04a1c4395ffc81f3dbc'
}
def get_favor_list(max_cursor=None):
    if max_cursor:
        favor_video_params['max_cursor'] = str(max_cursor)
    res = requests.get(
        url="https://www.douyin.com/aweme/v1/aweme/favorite/",
        params=favor_video_params,
        headers={
            'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'x-requested-with':'XMLHttpRequest',
            'referer':'https://www.douyin.com/share/user/58841646784',
        }
    )
    content_json = res.json()
    aweme_list = content_json.get('aweme_list', [])
    favor_video_list.extend(aweme_list)
    if content_json.get('has_more') == 1:
        return get_favor_list(content_json.get('max_cursor'))
get_favor_list()
# ############################# download the videos ##########################
for item in user_video_list:
    video_id = item['video']['play_addr']['uri']
    video = requests.get(
        url='https://aweme.snssdk.com/aweme/v1/playwm/',
        params={
            'video_id': video_id
        },
        headers={
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest',
            'referer': 'https://www.douyin.com/share/user/58841646784',
        },
        stream=True
    )
    file_name = video_id + '.mp4'
    with open(file_name, 'wb') as f:
        for line in video.iter_content():
            f.write(line)
for item in favor_video_list:
    video_id = item['video']['play_addr']['uri']
    video = requests.get(
        url='https://aweme.snssdk.com/aweme/v1/playwm/',
        params={
            'video_id': video_id
        },
        headers={
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest',
            'referer': 'https://www.douyin.com/share/user/58841646784',
        },
        stream=True
    )
    file_name = video_id + '.mp4'
    with open(file_name, 'wb') as f:
        for line in video.iter_content():
            f.write(line)
Other request methods
requests.get(url, params=None, **kwargs)
requests.post(url, data=None, json=None, **kwargs)
requests.put(url, data=None, **kwargs)
requests.head(url, **kwargs)
requests.delete(url, **kwargs)
requests.patch(url, data=None, **kwargs)
requests.options(url, **kwargs)
# all of the methods above are built on top of this one
requests.request(method, url, **kwargs)
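A quick way to see that these are all thin wrappers around requests.request is to point a couple of them at httpbin.org (used here only as a public echo service; the response shapes below are what that service currently returns):
import requests

r = requests.put('http://httpbin.org/put', data={'k': 'v'})   # echoes the form body back
print(r.status_code, r.json()['form'])
r = requests.delete('http://httpbin.org/delete')
print(r.status_code)
r = requests.request('GET', 'http://httpbin.org/get', params={'q': '1'})  # the generic entry point
print(r.json()['args'])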
More parameters
def request(method, url, **kwargs):
"""Constructs and sends a :class:`Request `.
:param method: method for the new :class:`Request` object.
:param url: URL for the new :class:`Request` object.
:param params: (optional) Dictionary or bytes to be sent in the query string for the :class:`Request`.
:param data: (optional) Dictionary, bytes, or file-like object to send in the body of the :class:`Request`.
:param json: (optional) json data to send in the body of the :class:`Request`.
:param headers: (optional) Dictionary of HTTP Headers to send with the :class:`Request`.
:param cookies: (optional) Dict or CookieJar object to send with the :class:`Request`.
:param files: (optional) Dictionary of ``'name': file-like-objects`` (or ``{'name': file-tuple}``) for multipart encoding upload.
``file-tuple`` can be a 2-tuple ``('filename', fileobj)``, 3-tuple ``('filename', fileobj, 'content_type')``
or a 4-tuple ``('filename', fileobj, 'content_type', custom_headers)``, where ``'content-type'`` is a string
defining the content type of the given file and ``custom_headers`` a dict-like object containing additional headers
to add for the file.
:param auth: (optional) Auth tuple to enable Basic/Digest/Custom HTTP Auth.
:param timeout: (optional) How long to wait for the server to send data
before giving up, as a float, or a :ref:`(connect timeout, read
timeout) ` tuple.
:type timeout: float or tuple
:param allow_redirects: (optional) Boolean. Set to True if POST/PUT/DELETE redirect following is allowed.
:type allow_redirects: bool
:param proxies: (optional) Dictionary mapping protocol to the URL of the proxy.
:param verify: (optional) whether the SSL cert will be verified. A CA_BUNDLE path can also be provided. Defaults to ``True``.
:param stream: (optional) if ``False``, the response content will be immediately downloaded.
:param cert: (optional) if String, path to ssl client cert file (.pem). If Tuple, ('cert', 'key') pair.
:return: :class:`Response ` object
:rtype: requests.Response
Usage::
>>> import requests
>>> req = requests.request('GET', 'http://httpbin.org/get')
"""
Parameter examples
def param_method_url():
# requests.request(method='get', url='http://127.0.0.1:8000/test/')
# requests.request(method='post', url='http://127.0.0.1:8000/test/')
pass
def param_param():
# - can be a dict
# - can be a string
# - can be bytes (ASCII-only)
# requests.request(method='get',
# url='http://127.0.0.1:8000/test/',
# params={'k1': 'v1', 'k2': '水电费'})
# requests.request(method='get',
# url='http://127.0.0.1:8000/test/',
# params="k1=v1&k2=水电费&k3=v3&k3=vv3")
# requests.request(method='get',
# url='http://127.0.0.1:8000/test/',
# params=bytes("k1=v1&k2=k2&k3=v3&k3=vv3", encoding='utf8'))
# wrong (non-ASCII bytes)
# requests.request(method='get',
# url='http://127.0.0.1:8000/test/',
# params=bytes("k1=v1&k2=水电费&k3=v3&k3=vv3", encoding='utf8'))
pass
def param_data():
# can be a dict
# can be a string
# can be bytes
# can be a file object
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# data={'k1': 'v1', 'k2': '水电费'})
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# data="k1=v1; k2=v2; k3=v3; k3=v4"
# )
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# data="k1=v1;k2=v2;k3=v3;k3=v4",
# headers={'Content-Type': 'application/x-www-form-urlencoded'}
# )
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# data=open('data_file.py', mode='r', encoding='utf-8'), # the file content is: k1=v1;k2=v2;k3=v3;k3=v4
# headers={'Content-Type': 'application/x-www-form-urlencoded'}
# )
pass
def param_json():
# the json data is serialized into a string with json.dumps(...)
# and sent in the request body, with Content-Type set to {'Content-Type': 'application/json'}
requests.request(method='POST',
url='http://127.0.0.1:8000/test/',
json={'k1': 'v1', 'k2': '水电费'})
def param_headers():
# send request headers to the server
requests.request(method='POST',
url='http://127.0.0.1:8000/test/',
json={'k1': 'v1', 'k2': '水电费'},
headers={'Content-Type': 'application/x-www-form-urlencoded'}
)
def param_cookies():
# send cookies to the server
requests.request(method='POST',
url='http://127.0.0.1:8000/test/',
data={'k1': 'v1', 'k2': 'v2'},
cookies={'cook1': 'value1'},
)
# a CookieJar can also be used (the dict form is just a wrapper around it)
from http.cookiejar import CookieJar
from http.cookiejar import Cookie
obj = CookieJar()
obj.set_cookie(Cookie(version=0, name='c1', value='v1', port=None, domain='', path='/', secure=False, expires=None,
discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False,
port_specified=False, domain_specified=False, domain_initial_dot=False, path_specified=False)
)
requests.request(method='POST',
url='http://127.0.0.1:8000/test/',
data={'k1': 'v1', 'k2': 'v2'},
cookies=obj)
def param_files():
# send a file
# file_dict = {
# 'f1': open('readme', 'rb')
# }
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# files=file_dict)
# send a file with a custom file name
# file_dict = {
# 'f1': ('test.txt', open('readme', 'rb'))
# }
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# files=file_dict)
# send a file with a custom file name
# file_dict = {
# 'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf")
# }
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# files=file_dict)
# send a file with a custom file name
# file_dict = {
# 'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf", 'application/text', {'k1': '0'})
# }
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# files=file_dict)
pass
def param_auth():
from requests.auth import HTTPBasicAuth, HTTPDigestAuth
ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))
print(ret.text)
# ret = requests.get('http://192.168.1.1',
# auth=HTTPBasicAuth('admin', 'admin'))
# ret.encoding = 'gbk'
# print(ret.text)
# ret = requests.get('http://httpbin.org/digest-auth/auth/user/pass', auth=HTTPDigestAuth('user', 'pass'))
# print(ret)
#
def param_timeout():
# ret = requests.get('http://google.com/', timeout=1)
# print(ret)
# ret = requests.get('http://google.com/', timeout=(5, 1))
# print(ret)
pass
def param_allow_redirects():
ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
print(ret.text)
def param_proxies():
# proxies = {
# "http": "61.172.249.96:80",
# "https": "http://61.185.219.126:3128",
# }
# proxies = {'http://10.20.1.128': 'http://10.10.1.10:5323'}
# ret = requests.get("http://www.proxy360.cn/Proxy", proxies=proxies)
# print(ret.headers)
# from requests.auth import HTTPProxyAuth
#
# proxyDict = {
# 'http': '77.75.105.165',
# 'https': '77.75.105.165'
# }
# auth = HTTPProxyAuth('username', 'mypassword')
#
# r = requests.get("http://www.google.com", proxies=proxyDict, auth=auth)
# print(r.text)
pass
def param_stream():
ret = requests.get('http://127.0.0.1:8000/test/', stream=True)
print(ret.content)
ret.close()
# from contextlib import closing
# with closing(requests.get('http://httpbin.org/get', stream=True)) as r:
# # process the response here.
# for i in r.iter_content():
# print(i)
def requests_session():
import requests
session = requests.Session()
### 1. First visit any page to obtain a cookie
i1 = session.get(url="http://dig.chouti.com/help/service")
### 2. Log in, carrying the cookie from the previous request; the backend authorizes the gpsd value in that cookie
i2 = session.post(
url="http://dig.chouti.com/login",
data={
'phone': "8615131255089",
'password': "xxxxxx",
'oneMonth': ""
}
)
i3 = session.post(
url="http://dig.chouti.com/link/vote?linksId=8589623",
)
print(i3.text)
The bs4 module
BeautifulSoup is a module that takes an HTML or XML string and structures it; afterwards the methods it provides can be used to quickly look up specific elements, which makes finding a given element in HTML or XML simple.
Install: pip3 install beautifulsoup4
Usage example:
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>
"""
soup = BeautifulSoup(html_doc, features="lxml")
1. name, the tag's name
# tag = soup.find('a')
# name = tag.name  # get
# print(name)
# tag.name = 'span'  # set
# print(soup)
2. attrs, the tag's attributes
# tag = soup.find('a')
# attrs = tag.attrs  # get
# print(attrs)
# tag.attrs = {'ik': 123}  # set
# tag.attrs['id'] = 'iiiii'  # set
# print(soup)
3. children, all direct child tags
# body = soup.find('body')
# v = body.children
4. descendants, all descendant tags
# body = soup.find('body')
# v = body.descendants
5. clear, empty out all of a tag's children (the tag itself is kept)
# tag = soup.find('body')
# tag.clear()
# print(soup)
6. decompose, recursively delete the tag and all of its children
# body = soup.find('body')
# body.decompose()
# print(soup)
7. extract, recursively remove the tag and all of its children, and return what was removed
# body = soup.find('body')
# v = body.extract()
# print(soup)
8. decode, convert to a string (including the current tag); decode_contents (excluding the current tag)
# body = soup.find('body')
# v = body.decode()
# v = body.decode_contents()
# print(v)
9. encode, convert to bytes (including the current tag); encode_contents (excluding the current tag)
# body = soup.find('body')
# v = body.encode()
# v = body.encode_contents()
# print(v)
10. find, get the first matching tag
# tag = soup.find('a')
# print(tag)
# tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tag)
11. find_all, get all matching tags
# tags = soup.find_all('a')
# print(tags)
# tags = soup.find_all('a',limit=1)
# print(tags)
# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)
# ####### lists #######
# v = soup.find_all(name=['a','div'])
# print(v)
# v = soup.find_all(class_=['sister0', 'sister'])
# print(v)
# v = soup.find_all(text=['Tillie'])
# print(v, type(v[0]))
# v = soup.find_all(id=['link1','link2'])
# print(v)
# v = soup.find_all(href=['link1','link2'])
# print(v)
# ####### regular expressions #######
import re
# rep = re.compile('p')
# rep = re.compile('^p')
# v = soup.find_all(name=rep)
# print(v)
# rep = re.compile('sister.*')
# v = soup.find_all(class_=rep)
# print(v)
# rep = re.compile('http://www.oldboy.com/static/.*')
# v = soup.find_all(href=rep)
# print(v)
# ####### filtering with a function #######
# def func(tag):
# return tag.has_attr('class') and tag.has_attr('id')
# v = soup.find_all(name=func)
# print(v)
# ## get, read a tag attribute
# tag = soup.find('a')
# v = tag.get('id')
# print(v)
12. has_attr, check whether the tag has the given attribute
# tag = soup.find('a')
# v = tag.has_attr('id')
# print(v)
13. get_text, get the text content inside the tag
# tag = soup.find('a')
# v = tag.get_text('id')
# print(v)
14. index, get a tag's index position inside another tag
# tag = soup.find('body')
# v = tag.index(tag.find('div'))
# print(v)
# tag = soup.find('body')
# for i,v in enumerate(tag):
# print(i,v)
15. is_empty_element, whether the tag is an empty (self-closing) tag,
i.e. one of: 'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'
# tag = soup.find('br')
# v = tag.is_empty_element
# print(v)
16. Related tags of the current tag
# soup.next
# soup.next_element
# soup.next_elements
# soup.next_sibling
# soup.next_siblings
#
# tag.previous
# tag.previous_element
# tag.previous_elements
# tag.previous_sibling
# tag.previous_siblings
#
# tag.parent
# tag.parents
17. Searching for a tag's related tags
# tag.find_next(...)
# tag.find_all_next(...)
# tag.find_next_sibling(...)
# tag.find_next_siblings(...)
# tag.find_previous(...)
# tag.find_all_previous(...)
# tag.find_previous_sibling(...)
# tag.find_previous_siblings(...)
# tag.find_parent(...)
# tag.find_parents(...)
# same parameters as find_all
18. select, select_one: CSS selectors
soup.select("title")
soup.select("p nth-of-type(3)")
soup.select("body a")
soup.select("html head title")
tag = soup.select("span,a")
soup.select("head > title")
soup.select("p > a")
soup.select("p > a:nth-of-type(2)")
soup.select("p > #link1")
soup.select("body > a")
soup.select("#link1 ~ .sister")
soup.select("#link1 + .sister")
soup.select(".sister")
soup.select("[class~=sister]")
soup.select("#link1")
soup.select("a#link2")
soup.select('a[href]')
soup.select('a[href="http://example.com/elsie"]')
soup.select('a[href^="http://example.com/"]')
soup.select('a[href$="tillie"]')
soup.select('a[href*=".com/el"]')
from bs4.element import Tag
def default_candidate_generator(tag):
for child in tag.descendants:
if not isinstance(child, Tag):
continue
if not child.has_attr('href'):
continue
yield child
tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator)
print(type(tags), tags)
from bs4.element import Tag
def default_candidate_generator(tag):
for child in tag.descendants:
if not isinstance(child, Tag):
continue
if not child.has_attr('href'):
continue
yield child
tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator, limit=1)
print(type(tags), tags)
19. A tag's content
# tag = soup.find('span')
# print(tag.string)  # get
# tag.string = 'new content'  # set
# print(soup)
# tag = soup.find('body')
# print(tag.string)
# tag.string = 'xxx'
# print(soup)
# tag = soup.find('body')
# v = tag.stripped_strings  # recursively get the text of all tags inside
# print(v)
20. append, append a tag inside the current tag
# tag = soup.find('body')
# tag.append(soup.find('a'))
# print(soup)
#
# from bs4.element import Tag
# obj = Tag(name='i',attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.append(obj)
# print(soup)
21. insert, insert a tag at a given position inside the current tag
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.insert(2, obj)
# print(soup)
22. insert_after, insert_before: insert after or before the current tag
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# # tag.insert_before(obj)
# tag.insert_after(obj)
# print(soup)
23. replace_with, replace the current tag with the given tag
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('div')
# tag.replace_with(obj)
# print(soup)
24. Creating relationships between tags
# tag = soup.find('div')
# a = soup.find('a')
# tag.setup(previous_sibling=a)
# print(tag.previous_sibling)
25. wrap, wrap the current tag in the given tag
# from bs4.element import Tag
# obj1 = Tag(name='div', attrs={'id': 'it'})
# obj1.string = '我是一个新来的'
#
# tag = soup.find('a')
# v = tag.wrap(obj1)
# print(soup)
# tag = soup.find('a')
# v = tag.wrap(soup.find('p'))
# print(soup)
26. unwrap, remove the current tag and keep what it wrapped
# tag = soup.find('a')
# v = tag.unwrap()
# print(soup)
三. Scrapy
Installing Scrapy on Windows
a. pip3 install wheel
b. Download Twisted from http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
c. Go to the download directory and run: pip3 install Twisted‑17.1.0‑cp35‑cp35m‑win_amd64.whl
d. pip3 install pywin32
e. pip3 install scrapy
1. Basic commands
1. scrapy startproject project_name
- creates a project in the current directory (similar to Django)
2. scrapy genspider [-t template] spider_name start_domain
- creates a spider
e.g.:
scrapy genspider -t xmlfeed autohome autohome.com.cn
PS:
list all templates: scrapy genspider -l
show a template:    scrapy genspider -d template_name
3. scrapy list
- lists the spiders in the project
4. scrapy crawl spider_name
- runs a single spider
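Put together, a typical session looks like this (the project and spider names are made up for illustration):
scrapy startproject qiubai                    # creates the qiubai/ project skeleton
cd qiubai
scrapy genspider qiushibaike qiushibaike.com  # creates qiubai/spiders/qiushibaike.py
scrapy list                                   # -> qiushibaike
scrapy crawl qiushibaike                      # runs the spider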
2. Project layout and a quick look at a spider
project_name/
    scrapy.cfg        # the project's main config (the real crawler settings live in settings.py)
    project_name/
        __init__.py
        items.py      # data-storage templates for structured data, similar to Django's Model
        pipelines.py  # data persistence
        settings.py   # configuration, e.g. recursion depth, concurrency, download delay, ...
        spiders/      # spider directory
            __init__.py
            爬虫1.py
            爬虫2.py
            爬虫3.py
Notes on the files:
Note: spider files are usually named after the website's domain.
3. Minimal configuration
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
4. Persistence
1) To a file on disk
(1) Via a command-line flag
a. make sure the parse method returns an iterable object (holding the parsed page content); see the sketch right after this list
b. use a command-line flag to write the data to a file on disk:
scrapy crawl spider_name -o output_file.json
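A minimal sketch of option (1): as long as parse yields (or returns) dicts or items, the -o flag can serialize them. The spider name below is invented for illustration; the xpath expressions mirror the qiushibaike example used later in these notes.
import scrapy

class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        for div in response.xpath("//div[@id='content-left']/div"):
            # each yielded dict becomes one record in the file passed to -o
            yield {
                'author': div.xpath("./div/a[2]/h2/text()").extract_first(),
                'content': div.xpath(".//div[@class='content']/span/text()").extract_first(),
            }
# run with:  scrapy crawl demo -o demo.json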
(2) Via a pipeline
a. items.py: stores the parsed page data
b. pipelines.py: handles the persistence-related work
c. implementation flow (a minimal file pipeline is sketched below):
a) store the parsed page data in an item object
b) use the yield keyword to hand the item to the pipeline file
c) write the code that stores the data in the pipeline file
d) enable the pipeline in the settings file
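And a minimal sketch of option (2): a pipeline that appends every item to a local file (the class name and file name are made up; it still has to be registered in ITEM_PIPELINES as described in step d):
class FilePipeline(object):
    def open_spider(self, spider):
        # called once when the spider starts
        self.f = open('./data.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # called for every item the spider yields
        self.f.write(item['author'] + ':' + item['content'] + '\n')
        return item

    def close_spider(self, spider):
        # called once when the spider closes
        self.f.close()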
2) To a database
(1) MySQL
import pymysql
class QiubaiPipeline(object):
    conn = None
    cursor = None
    def open_spider(self, spider):
        # 1. connect to the database
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='root', db='qiubai')
    def process_item(self, item, spider):
        # 2. build the sql statement
        sql = 'insert into qiubai values("%s", "%s")' % (item['author'], item['content'])
        self.cursor = self.conn.cursor()
        try:
            # 3. execute the statement and commit the transaction
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item
    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
# -*- coding: utf-8 -*-
import scrapy
from qiubai.items import QiubaiItem
class QiushibaikeSpider(scrapy.Spider):
    name = 'qiushibaike'
    # allowed_domains = ['www.qiushibaike.com/text']
    start_urls = ['https://www.qiushibaike.com/text/']
    def parse(self, response):
        # parse out the content and author of each joke
        div_list = response.xpath("//div[@id='content-left']/div")
        for div in div_list:
            author = div.xpath("./div/a[2]/h2/text()").extract()[0]
            content = div.xpath(".//div[@class='content']/span/text()").extract()[0]
            item = QiubaiItem(author=author, content=content)
            yield item
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class QiubaiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    author = scrapy.Field()
    content = scrapy.Field()
(2) Redis
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import redis
class QiubaiPipeline(object):
    conn = None
    def open_spider(self, spider):
        self.conn = redis.Redis(host='127.0.0.1', port=6379)
    def process_item(self, item, spider):
        data = {
            'author': item['author'],
            'content': item['content']
        }
        # redis stores flat values, so serialize the dict before pushing it
        self.conn.lpush('data', json.dumps(data))
        return item
    def close_spider(self, spider):
        pass
(3) Implementation flow (the settings entry for step d is sketched below):
a) store the parsed page data in an item object
b) use the yield keyword to hand the item to the pipeline file
c) write the code that stores the data in the pipeline file
d) enable the pipeline in the settings file
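For step d), registering the pipeline in settings.py looks roughly like this (the module path assumes a project named qiubai; adjust it to the real project layout):
# settings.py
ITEM_PIPELINES = {
    'qiubai.pipelines.QiubaiPipeline': 300,   # lower number = runs earlier
}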
Case 1: crawl the Chouti hot-list news and save it to a local file
Code of chouti.py
# -*- coding: utf-8 -*-
import scrapy
from ..items import XzxItem
from scrapy.http import Request
# import sys, os, io
# sys.stdout=io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://chouti.com/']
    def parse(self, response):
        # 1. parse the text and pull out the title and summary
        item_list = response.xpath("//div[@id='content-list']/div[@class='item']")
        for item in item_list:
            title = item.xpath(".//div[@class='part1']/a/text()").extract_first().strip()
            # item.xpath(".//div[@class='part1']/a[0]/text()")
            # summary = item.xpath(".//div[@class='area-summary']/span/text()").extract_first().strip()
            # print(summary)
            href = item.xpath(".//div[@class='part1']/a/@href").extract_first().strip()
            yield XzxItem(title=title, href=href)
        # 2. get the pagination links
        page_list = response.xpath("//div[@id='dig_lcpage']//a/@href").extract()
        for url in page_list:
            url = "https://dig.chouti.com" + url
            yield Request(url=url, callback=self.parse)  # pass dont_filter=True to turn off dedup filtering
Code of items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class XzxItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    href = scrapy.Field()
Code of settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for xzx project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'xzx'
SPIDER_MODULES = ['xzx.spiders']
NEWSPIDER_MODULE = 'xzx.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'xzx.middlewares.XzxSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'xzx.middlewares.XzxDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'xzx.pipelines.XzxPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
CHOUTI_NEWS_PATH = 'x1.log'
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
'''
1. The class is first checked for from_crawler:
   if present: it is called and must return an instance of this class
   if absent: it is not called, and the constructor is used to create the instance
2. The other methods are then called on that instance
'''
class XzxPipeline(object):
    def __init__(self, file_path):
        self.f = None
        self.file_path = file_path
    @classmethod
    def from_crawler(cls, crawler):
        file_path = crawler.settings.get("CHOUTI_NEWS_PATH")
        return cls(file_path)
    def open_spider(self, spider):
        '''
        called when the spider starts
        :param spider:
        :return:
        '''
        print("spider started")
        self.f = open(self.file_path, "a+", encoding="utf-8")
    def process_item(self, item, spider):
        '''
        :param item: the item object yielded by the spider, holding title and href
        :param spider: the spider object
        :return:
        '''
        print(item)
        self.f.write(item['title'] + "\n")
        self.f.flush()
        return item
    def close_spider(self, spider):
        '''
        called when the spider closes
        :param spider:
        :return:
        '''
        self.f.close()
        print("spider finished")
5. Crawling data from multiple urls
Solution: send the follow-up requests manually
import scrapy
from qiubai.items import QiubaiItem
class QiushibaikeSpider(scrapy.Spider):
    name = 'qiushibaike'
    # allowed_domains = ['www.qiushibaike.com/text']
    start_urls = ['https://www.qiushibaike.com/text/']
    # design a generic url template
    url = 'https://www.qiushibaike.com/text/page/%d'
    pageNum = 1
    def parse(self, response):
        # parse out the content and author of each joke
        div_list = response.xpath("//div[@id='content-left']/div")
        for div in div_list:
            author = div.xpath("./div/a[2]/h2/text()").extract()[0]
            content = div.xpath(".//div[@class='content']/span/text()").extract()[0]
            item = QiubaiItem(author=author, content=content)
            yield item
        # send the next request manually
        if self.pageNum <= 13:
            self.pageNum += 1
            new_url = format(self.url % self.pageNum)
            yield scrapy.Request(url=new_url, callback=self.parse)
6. Scrapy core components
Engine (Scrapy): handles the data flow of the whole system and triggers events (the core of the framework).
Pipeline: processes the entities (items) the spider extracts from pages; its main jobs are persisting entities, validating them, and discarding unneeded information. After a page is parsed by a spider, the result is sent to the item pipeline and passes through several processing steps in a fixed order.
Scheduler: receives requests from the engine, pushes them onto a queue, and hands them back when the engine asks again. Think of it as a priority queue of urls (the addresses to crawl) that decides what to fetch next and removes duplicate urls.
Downloader: downloads page content and returns it to the spiders (the downloader is built on twisted, an efficient asynchronous model).
Spiders: do the main work, extracting the needed information, i.e. the so-called entities (items), from specific pages. Links can also be extracted from them so Scrapy goes on to crawl the next page.
7. POST requests
How to send a POST request: the start_requests method has to be overridden
# -*- coding: utf-8 -*-
import scrapy
class PostdemoSpider(scrapy.Spider):
    name = 'postDemo'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://fanyi.baidu.com/sug']
    # this method comes from the parent class: by default it sends GET requests for the urls in start_urls
    # to send a POST request instead:
    # 1. pass method='POST' to Request, or
    # 2. use FormRequest(), which sends POST requests (recommended)
    def start_requests(self):
        data = {
            'kw': 'dog'
        }
        for url in self.start_urls:
            yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse)
    def parse(self, response):
        print(response.text)
8. Working with cookies
Requirement: log in to douban.com, then fetch the page data of the second-level page that is the user's personal homepage
# -*- coding: utf-8 -*-
import scrapy
class DoubanSpider(scrapy.Spider):
    name = 'douban'
    allowed_domains = ['www.douban.com']
    start_urls = ['https://www.douban.com/accounts/login']
    # override start_requests
    def start_requests(self):
        # pack the request parameters into a dict
        data = {
            'source': 'index_nav',
            'form_email': 'xxxx',
            'form_password': 'xxx'
        }
        for url in self.start_urls:
            yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse)
    def parseSecondPage(self, response):
        f = open('second.html', 'w', encoding='utf-8')
        f.write(response.text)
    def parse(self, response):
        f = open('main.html', 'w', encoding='utf-8')
        f.write(response.text)
        url = 'https://www.douban.com/people/xxx/'
        yield scrapy.Request(url=url, callback=self.parseSecondPage)
9. Using a proxy
Flow:
1. Write a custom downloader-middleware class
1) inherit from object
2) override the process_request(self, request, spider) method
2. Enable the downloader middleware in the settings file (see the settings snippet below)
from scrapy import signals
# a custom downloader middleware that implements process_request (handles the requests the middleware intercepts)
class MyProxy(object):
    def process_request(self, request, spider):
        # swap the request's outgoing ip for the proxy
        request.meta['proxy'] = 'http://120.76.77.152:9999'
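Step 2 is just a settings entry. Assuming the class above lives in the default middlewares.py module of a project called proxyPro (both the module path and the priority value are illustrative), it would look roughly like this:
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'proxyPro.middlewares.MyProxy': 543,   # register the custom middleware with a priority
}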
10. Log levels
ERROR: errors
WARNING: warnings
INFO: general information
DEBUG: debug information (the default)
Set the log level in settings.py:  LOG_LEVEL = 'ERROR'
To write the log to a file instead of showing it in the terminal, in settings.py:  LOG_FILE = 'log.txt'
11. Passing data between requests (meta)
# -*- coding: utf-8 -*-
import scrapy
from moviePro.moviePro.items import MovieproItem
class MovieSpider(scrapy.Spider):
    name = 'movie'
    allowed_domains = ['www.id97.com']
    start_urls = ['http://www.id97.com/movie']
    # dedicated to parsing the data on the second-level detail page
    def parseBySecondPage(self, response):
        actor = response.xpath('/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[1]/td[2]/a/text()').extract_first()
        language = response.xpath('/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[6]/td[2]/text()').extract_first()
        lastTime = response.xpath('/html/body/div[1]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[8]/td[2]/text()').extract_first()
        # take the dict passed in through Request's meta parameter (response.meta)
        item = response.meta['item']
        item['actor'] = actor
        item['language'] = language
        item['lastTime'] = lastTime
        # hand the item to the pipeline
        yield item
    def parse(self, response):
        # name, type, director, language, runtime
        div_list = response.xpath('/html/body/div[1]/div[1]/div[2]/div')
        for div in div_list:
            name = div.xpath('.//div[@class="meta"]/h1/a/text()').extract_first()
            # the xpath expression below returns a list
            type = div.xpath('.//div[@class="otherinfo"]//text()').extract()
            # join the type list into a string
            type = "".join(type)
            url = div.xpath('.//div[@class="meta"]/h1/a/@href').extract_first()
            # create the item object
            item = MovieproItem()
            item['name'] = name
            item['type'] = type
            # a request has to be sent for this url to fetch and parse the detail page
            # meta only accepts a dict, so wrap the item object in one
            yield scrapy.Request(url=url, callback=self.parseBySecondPage, meta={'item': item})
12. CrawlSpider
What a CrawlSpider is: CrawlSpider is a subclass of Spider. It is more powerful because it adds link extractors and rule parsers.
Code:
1) create a spider file based on CrawlSpider
scrapy genspider -t crawl xxx xxx.com
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class ChoutiSpider(CrawlSpider):
    name = 'chouti'
    allowed_domains = ['dig.chouti.com']
    start_urls = ['http://dig.chouti.com/']
    # instantiate a link extractor object
    # link extractor: extracts the specified links (urls)
    # the allow parameter takes a regular expression
    # the link extractor extracts the links on the page that match the regex
    # every extracted link is handed to the rule parser
    link = LinkExtractor(allow=r'/all/hot/recent/\d+')
    rules = (
        # instantiate a rule parser object
        # after the rule parser receives the links from the link extractor, it requests them, fetches the corresponding pages, and parses them according to the given rule
        # callback: the parsing rule (a method/function name)
        # follow: whether to keep applying the link extractor to the pages behind the links it extracted
        Rule(link, callback='parse_item', follow=True),
    )
    def parse_item(self, response):
        i = {}
        print(response.text)
        #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        #i['name'] = response.xpath('//div[@id="name"]').extract()
        #i['description'] = response.xpath('//div[@id="description"]').extract()
        return i
13. Distributed crawling
1) Concept: run the same spider on multiple machines so that a site's data is crawled in a distributed way.
2) Plain scrapy cannot do distributed crawling on its own.
Reasons: a. the scheduler cannot be shared
         b. the pipeline cannot be shared
3) The scrapy-redis component: a set of components developed specifically for scrapy; it lets scrapy crawl in a distributed way.
Install: pip install scrapy-redis
4) Distributed crawling workflow:
a. Edit the redis config file:
   i.  comment out "bind 127.0.0.1"
   ii. set "protected-mode no" to turn off protected mode
b. Start the redis server using that config file.
c. After creating the scrapy project, create a CrawlSpider-based spider file.
d. Import the RedisCrawlSpider class and change the spider file to be based on it.
e. Replace start_urls with redis_key.
f. Point the project's pipeline and scheduler at the ones provided by the scrapy_redis component:
-- use the scheduler wrapped by scrapy-redis, which stores all urls in one shared scheduler, so every machine shares the scheduler.
# use scrapy-redis's dedup queue
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# use scrapy-redis's own scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# whether pausing/resuming is allowed
SCHEDULER_PERSIST = True
-- use the pipeline wrapped by scrapy-redis, which stores the data every machine crawls into the redis database through that pipeline, so every machine shares the pipeline.
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
g. Configure the redis server's ip address and port:
# if the redis server is not on this machine, configure:
# REDIS_HOST = 'ip address of the redis server'
# REDIS_PORT = 6379
h. Run the spider file:
   scrapy runspider xxx.py
i. Push the start url onto the scheduler's queue, in redis-cli:  lpush queue_name(redis_key) start_url
Code:
The spider file
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redisPro.redisPro.items import RedisproItem
from scrapy_redis.spiders import RedisCrawlSpider
class QiubaiSpider(RedisCrawlSpider):
    name = 'qiubai'
    # allowed_domains = ['www.qiushibaike.com/pic']
    # start_urls = ['http://www.qiushibaike.com/pic/']
    # name of the scheduler queue
    redis_key = 'qiubaispider'  # plays the same role as start_urls
    link = LinkExtractor(allow=r'/pic/page/\d+')
    rules = (
        Rule(link, callback='parse_item', follow=True),
    )
    def parse_item(self, response):
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            img_url = "https:" + div.xpath('.//div[@class="thumb"]/a/img/@src').extract_first()
            item = RedisproItem()
            item['img_url'] = img_url
            yield item
The settings.py file
# -*- coding: utf-8 -*-
# Scrapy settings for redisPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'redisPro'
SPIDER_MODULES = ['redisPro.spiders']
NEWSPIDER_MODULE = 'redisPro.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'redisPro (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'redisPro.middlewares.RedisproSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'redisPro.middlewares.RedisproDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
#'redisPro.pipelines.RedisproPipeline': 300,
'scrapy_redis.pipelines.RedisPipeline': 400,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# use scrapy-redis's own scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# whether pausing/resuming is allowed
SCHEDULER_PERSIST = True
# if the redis server is not on this machine, configure:
# REDIS_HOST = 'ip address of the redis server'
# REDIS_PORT = 6379
The items.py file
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class RedisproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    img_url = scrapy.Field()
Supplement:
1. Log in to Chouti and upvote
# -*- coding: utf-8 -*-
import scrapy
from ..items import XzxItem
from scrapy.http import Request
from scrapy.http.cookies import CookieJar
# import sys, os, io
# sys.stdout=io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/r/ask/hot/1']
    cookie_dict = {}
    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse)
    def parse(self, response):
        # 1. get the cookie from the first page visit
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)
        for k, v in cookie_jar._cookies.items():
            for i, j in v.items():
                for m, n in j.items():
                    self.cookie_dict[m] = n.value
        yield Request(
            url='https://dig.chouti.com/login',
            method='POST',
            body='phone=00000000&password=xxxxxxx&oneMonth=1',
            cookies=self.cookie_dict,
            headers={
                'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
                'content-type':'application/x-www-form-urlencoded; charset=UTF-8',
            },
            callback=self.check_login
        )
    def check_login(self, response):
        yield Request(url='https://dig.chouti.com/', callback=self.index)
    def index(self, response):
        news_id_list = response.xpath('//div[@id="content-list"]//div[@class="part2"]/@share-linkid').extract()
        for news_id in news_id_list:
            news_url = "https://dig.chouti.com/link/vote?linksId=%s" % (news_id,)
            yield Request(
                url=news_url,
                method="POST",
                cookies=self.cookie_dict,
                callback=self.output,
            )
        page_list = response.xpath('//*[@id="dig_lcpage"]//a/@href').extract()
        for url in page_list:
            url = "https://dig.chouti.com" + url
            yield Request(url=url, callback=self.index)
    def output(self, response):
        print(response.text)
# def parse(self, response):
# # 1. parse the text and pull out the title and summary
# item_list = response.xpath("//div[@id='content-list']/div[@class='item']")
#
# for item in item_list:
# title = item.xpath(".//div[@class='part1']/a/text()").extract_first().strip()
# # item.xpath(".//div[@class='part1']/a[0]/text()")
# # summary = item.xpath(".//div[@class='area-summary']/span/text()").extract_first().strip()
# # print(summary)
# href = item.xpath(".//div[@class='part1']/a/@href").extract_first().strip()
# yield XzxItem(title=title, href=href)
#
# # 2. get the pagination links
# page_list = response.xpath("//div[@id='dig_lcpage']//a/@href").extract()
# for url in page_list:
# url = "https://dig.chouti.com" + url
# yield Request(url=url, callback=self.parse)  # pass dont_filter=True to turn off dedup filtering
2. xpath syntax
# hxs = HtmlXPathSelector(response)
# print(hxs)
# hxs = Selector(response=response).xpath('//a')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[2]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id="i1"]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/text()').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()
# print(hxs)
3. Overriding the dedup rule
from scrapy.dupefilter import BaseDupeFilter
import redis
from scrapy.utils.request import request_fingerprint
class XzxDupefilter(BaseDupeFilter):
    def __init__(self, key):
        self.conn = None
        self.key = key
    @classmethod
    def from_settings(cls, settings):
        key = settings.get('DUP_REDIS_KEY')
        return cls(key)
    def open(self):
        self.conn = redis.Redis(host='127.0.0.1', port=6379)
    def request_seen(self, request):
        fp = request_fingerprint(request)
        added = self.conn.sadd(self.key, fp)
        return added == 0
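For the custom filter to take effect it has to be pointed at from settings.py; assuming the class above lives in a module called dupefilters.py inside a project named xzx (both names are illustrative), the entries would look roughly like this:
# settings.py
DUPEFILTER_CLASS = 'xzx.dupefilters.XzxDupefilter'   # tell scrapy to use the custom filter
DUP_REDIS_KEY = 'xzx:dupefilter'                     # the redis set key read by from_settings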