If your IP gets banned, add a proxy
http://www.goubanjia.com/
- Causes:
    - sending requests at too high a frequency in a short time gets the IP banned
    - the connections in the HTTP connection pool get exhausted
- Solutions (a short sketch follows this list):
    - use a proxy
    - add Connection: "close" to the headers
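A minimal sketch of both fixes with requests; the proxy address is a placeholder you would replace with a live ip:port taken from a proxy site such as goubanjia:

import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36",
    "Connection": "close",  # close the connection after each request so the pool is not exhausted
}
proxies = {
    "https": "https://111.111.111.111:9999",  # placeholder proxy, not a real address
}
response = requests.get("https://www.qiushibaike.com/", headers=headers, proxies=proxies)
print(response.status_code)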
Data parsing is what turns a general crawler into a focused crawler that keeps only the data we care about.
Let's use regular expressions to scrape some Qiushibaike videos.
import requests
import re
import os
dirName = './videos/'
if not os.path.exists(dirName):
    os.mkdir(dirName)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"
}
url = 'https://www.qiushibaike.com/video/'
response = requests.get(url=url, headers=headers)
page_text = response.text
# regex that captures the video src from the page source; the original pattern was lost here,
# this is a reconstruction assuming the videos sit in <source ... type="video/mp4"> tags
ex = '<source src="(.*?)" type="video/mp4">'
page_list_video = re.findall(ex, page_text, re.S)
for v in page_list_video:
    v = "https:" + v
    video_name = dirName + v.split("/")[-1]
    # Option 1: download with requests
    # response_video = requests.get(v, headers=headers).content  # bytes
    # with open(video_name, "wb") as fp:
    #     fp.write(response_video)
    # Option 2: download with urllib
    from urllib import request
    request.urlretrieve(url=v, filename=video_name)
How bs4 parsing works: instantiate a BeautifulSoup object, load the page source into it, then call the object's methods and attributes to locate tags and pull out their text or attribute values.
Instantiating BeautifulSoup: BeautifulSoup(page_text, 'lxml') parses a string of page source, BeautifulSoup(fp, 'lxml') parses a local HTML file object.
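A short sketch of both instantiation styles and the selection calls used below; the test.html file name and the inline snippet are only placeholders for illustration:

from bs4 import BeautifulSoup

# from a local file (placeholder file name)
# with open('./test.html', 'r', encoding='utf-8') as fp:
#     soup = BeautifulSoup(fp, 'lxml')

# from a string of page source
soup = BeautifulSoup('<div class="book-mulu"><ul><li><a href="/book/1.html">chapter 1</a></li></ul></div>', 'lxml')

a = soup.find('a')                                 # first matching tag
a_list = soup.select('.book-mulu > ul > li > a')   # CSS selector, > means one level down
print(a.string)                                    # direct text of the tag
print(a['href'])                                   # attribute value
print(soup.find('div').text)                       # all text under the tag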
Let's download Romance of the Three Kingdoms (sanguoyanyi) with BeautifulSoup.
from bs4 import BeautifulSoup
import requests
url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
}
response_text = requests.get(url=url, headers=headers).text
soup = BeautifulSoup(response_text, 'lxml')
a_list = soup.select('.book-mulu > ul > li > a')
fp = open('./sg.txt', 'w', encoding='utf-8')
for a in a_list:
    title = a.string
    content_url = "https://www.shicimingju.com" + a['href']
    content_text = requests.get(url=content_url, headers=headers).text
    soup = BeautifulSoup(content_text, 'lxml')
    content = soup.find('div', class_="chapter_content").text
    fp.write(f"{title}\n{content}\n\n")
    print(f'{title} downloaded!')
fp.close()
How xpath parsing works: instantiate an etree object, load the page source into it, then call the etree object's xpath method with an xpath expression to locate tags and extract data.
xpath expressions (demonstrated in the snippet after this list):
- a leading / means the expression must locate tags level by level starting from the root tag
- a leading // means the expression can locate the tag from any position in the document
- a non-leading / stands for a single level
- a non-leading // spans multiple levels
- two expressions can be joined with the | pipe operator, which makes one expression cover several page layouts
Locating tags: attribute-based locating looks like //div[@class='xxx'] (used in the examples below)
Extracting text: /text() takes the direct text of a tag, //text() takes all the text under it
Extracting attributes: /@attrName
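A minimal demo of these rules; the inline HTML snippet is made up purely for illustration:

from lxml import etree

page_text = '''
<html>
  <body>
    <div class="song">
      <p>first <b>line</b></p>
      <a href="/a1">link1</a>
    </div>
    <div class="tang">
      <a href="/a2">link2</a>
    </div>
  </body>
</html>'''
tree = etree.HTML(page_text)

print(tree.xpath('/html/body/div'))                   # leading /: walk level by level from the root tag
print(tree.xpath('//div[@class="song"]'))             # leading //: locate from anywhere, filtered by attribute
print(tree.xpath('//div[@class="song"]/p/text()'))    # /text(): direct text only -> ['first ']
print(tree.xpath('//div[@class="song"]/p//text()'))   # //text(): all text underneath -> ['first ', 'line']
print(tree.xpath('//a/@href'))                        # /@attrName: attribute values -> ['/a1', '/a2']
print(tree.xpath('//div[@class="song"]/a/text() | //div[@class="tang"]/a/text()'))  # | joins two expressions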
Let's do a quick scrape of the Qiushibaike text page with xpath.
import requests
from lxml import etree
url = 'https://www.qiushibaike.com/text/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
}
response_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(response_text)
div_list = tree.xpath("//div[@class='article block untagged mb15 typs_hot']")
for div in div_list:
    author = div.xpath("./div/a[2]/h2/text()")[0]  # ./ makes the expression relative to this div (local parsing)
    content = div.xpath("./a[1]/div/span/text()")  # the text is split by <br> tags, so join the pieces
    content = "".join(content)
    print(author, content)
Using the | pipe operator to join two expressions (scraping the city list from aqistudy):
import requests
from lxml import etree
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"
}
url = "https://www.aqistudy.cn/historydata/"
response_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(response_text)
citys = tree.xpath("//div[@class='bottom']/ul/li/a/text()| //div[@class='bottom']/ul/div[2]/li/a/text()")
print(citys)
If scraped text comes back garbled, fix it with data.encode("iso-8859-1").decode("gbk"): re-encode with iso-8859-1 (it covers a wider range of byte values, so the round trip is lossless), then decode with the page's real encoding, gbk here.
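A minimal sketch of the trick, assuming the server actually serves GBK but requests falls back to ISO-8859-1 when decoding the body:

import requests

response = requests.get("http://pic.netbian.com/4kmeinv/")
garbled = response.text                              # decoded with the wrong codec -> mojibake
fixed = garbled.encode("iso-8859-1").decode("gbk")   # undo the wrong decode, redo it with gbk
# an equivalent fix is to set the codec before reading .text:
# response.encoding = "gbk"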
Let's scrape some beauty pictures (the 4K gallery on pic.netbian).
import requests
from lxml import etree
import os
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"
}
dirName = './mv'
if not os.path.exists(dirName):
    os.mkdir(dirName)
url = "http://pic.netbian.com/4kmeinv/index_%d.html"
for page in range(1, 10):
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/index.html'
    else:
        new_url = url % page
    response_text = requests.get(url=new_url, headers=headers).text
    tree = etree.HTML(response_text)
    a_list = tree.xpath("//div[@class='slist']/ul/li/a")
    for i in a_list:
        pic_path = "http://pic.netbian.com/" + i.xpath("./img/@src")[0]
        pic_name = i.xpath("./b/text()")[0] + ".jpg"
        pic_name = pic_name.encode("iso-8859-1").decode("gbk")  # fix the garbled file name (the page is gbk)
        pic = requests.get(url=pic_path, headers=headers).content
        with open(dirName + "/" + pic_name, "wb") as fp:
            fp.write(pic)