# Scraping images
import json
import re
import requests

if __name__ == '__main__':
    # url = 'https://tiebapic.baidu.com/forum/w%3D580%3B/sign=778fd1cb54dfa9ecfd2e561f52ebf603/500fd9f9d72a6059a07152cd6d34349b023bba80.jpg?tbpicau=2022-12-21-05_344dce9c8cf09b1124ad41710692f68a'
    # img_data = requests.get(url=url).content  # .content returns the body as bytes (the raw image), .text returns a str, .json() parses it into a Python object
    # with open('01.jpg', 'wb') as wstream:
    #     wstream.write(img_data)
    url = 'http://jandan.net/pic'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'
    }
    # Use a generic crawler to fetch the whole page behind the url
    page_data = requests.get(url=url, headers=headers).text
    # print(page_data)
    # The original regex was lost when these notes were exported; the pattern below
    # is only an assumed example -- adjust it to the page's actual <img> markup
    ex = '<img src="(.*?)"'
    imgsrc_list = re.findall(ex, page_data, re.S)  # re.S lets '.' also match newlines; re.M makes ^/$ match per line
    print(imgsrc_list)
    i = 0
    for img in imgsrc_list:
        # The src values are protocol-relative (they start with //), so prepend the scheme
        img = 'http:' + img
        img_data = requests.get(url=img, headers=headers).content
        imgName = 'img/img' + str(i) + '.jpg'  # assumes the img/ directory already exists
        with open(imgName, 'wb') as wstream:
            wstream.write(img_data)
        i += 1
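- As the comment in the snippet above notes, a requests response body can be read three ways; a minimal sketch (httpbin.org is just a convenient public test endpoint, not part of the original notes):
-
import requests

resp = requests.get('https://httpbin.org/json')
raw_bytes = resp.content   # bytes -- what gets written to disk for images/archives
text = resp.text           # the body decoded to a str
data = resp.json()         # the body parsed from JSON into Python objects
print(type(raw_bytes), type(text), type(data))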
- Scrape a site's images, optionally page by page
-
import re
import requests

if __name__ == '__main__':
    # imgs starts at 0 and is used later to number the image file names
    imgs = 0
    # Target url; the value after p= is the page number
    url = 'https://www.nyato.com/forum/f2?post_type=1&p={}'
    # UA spoofing
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'
    }
    # Only fetch the first page for now
    for i in range(1, 2):
        # Build the url for this page
        new_url = url.format(i)
        # print(new_url)
        # Fetch the page
        page_text = requests.get(url=new_url, headers=headers).text
        # print(page_text)
        # Inspecting the page shows where the image urls live. Note: do not include the
        # trailing !360*360cut, otherwise you only get the small thumbnail.
        # The original regex was truncated in these notes; the pattern below is an
        # assumed example -- check it against the real markup
        ex = r'<img src="(.*?)!360\*360cut"'
        imgsrc_list = re.findall(ex, page_text, re.S)
        # The download loop was also missing from the notes; it is reconstructed
        # along the same lines as the previous example
        for src in imgsrc_list:
            # If the src is protocol-relative, prepend 'https:' as in the previous example
            img_data = requests.get(url=src, headers=headers).content
            img_name = 'img/img' + str(imgs) + '.jpg'  # assumes the img/ directory exists
            with open(img_name, 'wb') as fp:
                fp.write(img_data)
            imgs += 1
- Methods and attributes BeautifulSoup provides for parsing data (see the short sketch after this list)
- soup.tagName: returns the first tag named tagName that appears in the document
- soup.find('tagName') is equivalent to soup.tagName
- soup.find('tagName', class_/id/attr='xxx'): locate by attribute, also returns only the first match
- soup.find_all('tagName') # returns all matching tags as a list
- soup.tagName.text/string/get_text(): get the text between tags, can be combined with find
- text/get_text(): returns all the text inside a tag, including nested tags
- string: only returns the text that sits directly under the tag itself
- soup.a['href']: get an attribute value from a tag, can be combined with find
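- A minimal sketch of the accessors above, run against a tiny made-up HTML snippet (the markup and URLs are illustrative only):
-
from bs4 import BeautifulSoup

html = '''
<div id="box">
    <a href="https://example.com" class="link">hello <b>world</b></a>
    <a href="https://example.org" class="link">second</a>
</div>
'''
soup = BeautifulSoup(html, 'lxml')

print(soup.a)                         # first <a> in the document
print(soup.find('a', class_='link'))  # attribute-based lookup, still only the first match
print(soup.find_all('a'))             # every matching tag, returned as a list
print(soup.a.text)                    # 'hello world' -- all nested text
print(soup.a.string)                  # None here, because the <a> has mixed children
print(soup.a['href'])                 # attribute value: 'https://example.com'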
- Worked examples
- Scrape every chapter title and chapter body of a novel
-
# Scrape every chapter title and chapter body of the novel 三国演义
import time
import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    url = 'http://www.ujxsw.com/read/35958/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'
    }
    # Fetch the index page
    page_text = requests.get(url=url, headers=headers).text
    # Parse the chapter titles and the detail-page URLs out of the index page
    soup = BeautifulSoup(page_text, 'lxml')
    a_list = soup.select('#readerlist > ul > li > a')
    i = 0
    fp = open('./xiaoshuo.txt', 'w', encoding='utf-8')
    for a in a_list:
        # Stop after 200 chapters
        if i == 200:
            break
        i += 1
        time.sleep(2)
        title = a.string
        detail_url = 'http://www.ujxsw.com/' + a['href']
        # Request the detail page, then parse the chapter body out of it
        detail = requests.get(url=detail_url, headers=headers).text
        soup2 = BeautifulSoup(detail, 'lxml')
        detail_text = soup2.find('div', class_='read-content').text
        print(detail_text)
        # replace() returns a new string, so the result has to be assigned
        detail_text = detail_text.replace(u' ', u'')
        fp.write(title + ':' + detail_text + '\n\n')
        print(title, '爬取成功!!!')
    fp.close()
import requests
from bs4 import BeautifulSoup
import time

if __name__ == '__main__':
    # Target url
    url = 'https://www.513gp.org/book/4961/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'
    }
    # Fetch the index page
    page_text = requests.get(url=url, headers=headers).text
    # Fix the encoding by hand: requests decoded the gbk page as iso-8859-1, so reverse that
    page_text = page_text.encode('iso-8859-1').decode('gbk')
    # Instantiate a BeautifulSoup object
    soup = BeautifulSoup(page_text, 'lxml')
    # Collect every <a> tag that has an href attribute
    a_list = soup.select('a[href]')
    # Keep only the chapter links (drop the first, non-chapter link)
    a_list = a_list[1:]
    # Open the output file
    fp = open('./红寿寺.txt', 'w', encoding='utf-8')
    for a in a_list:
        time.sleep(2)
        # The chapter title is the link text
        title = a.string
        # Build the detail-page url
        detail_url = 'https://www.513gp.org/book/4961/' + a['href']
        # Fetch the detail page and fix its encoding the same way
        detail_page = requests.get(url=detail_url, headers=headers).text
        detail_page = detail_page.encode('iso-8859-1').decode('gbk')
        soup2 = BeautifulSoup(detail_page, 'lxml')
        # Extract the chapter body
        detail = soup2.find('div', class_='bookcontent clearfix').text
        fp.write(title + ':' + detail + '\n\n')
        print(title + '爬取成功!!!')
    fp.close()
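- Both novel scripts above fix garbled text by re-encoding the wrongly decoded response through iso-8859-1 and decoding it as gbk. A minimal alternative sketch (not from the original notes) is to set the response encoding before reading .text:
-
import requests

resp = requests.get('https://www.513gp.org/book/4961/')
# Either set the charset explicitly (resp.encoding = 'gbk') or let requests
# re-detect it from the body instead of trusting the HTTP headers
resp.encoding = resp.apparent_encoding
page_text = resp.text  # decoded with the charset chosen above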
- xpath
- The most commonly used, and the most convenient and efficient, parsing approach
- 爬虫数据提取-xpath_黑马蓝汐的博客-CSDN博客_xpath爬取数据
- lxml.etree.XMLSyntaxError解决方法_sjyOvO的博客-CSDN博客
- How it works
- 1. Instantiate an etree object and load the page source to be parsed into it
- 2. Call the etree object's xpath method with an xpath expression to locate tags and capture their content
- How to instantiate an etree object (see the sketch after this list)
- from lxml import etree
- 1. Load the source of a local html file into the etree object
- etree.parse(filepath)
- 2. Load source fetched from the internet into the object
- etree.HTML(page_text)
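- A minimal sketch of both constructions; local.html is a placeholder file name, and passing an HTMLParser is what avoids the lxml.etree.XMLSyntaxError mentioned in the link above when a saved page is not strict XML:
-
from lxml import etree

# 1. From a local html file -- an HTML parser makes lxml tolerate loose markup
tree_from_file = etree.parse('local.html', etree.HTMLParser())

# 2. From page source fetched over the network (here just an inline string)
page_text = '<html><body><div class="top-nav"><ul><li>a</li><li>b</li></ul></div></body></html>'
tree_from_text = etree.HTML(page_text)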
- xpath expressions (see the sketch after this list)
- / : locate starting from the root node; each slash is one level
- // : spans any number of levels, e.g. //div locates every div tag anywhere under the html tag
- //tag[@attrName="attrValue"] : locate by attribute
- //tag[@attrName="attrValue"][number] : locate by index (xpath indexes start at 1)
- eg: r = tree.xpath('//div[@class="top-nav"]/ul/li[3]')
- /@attrName : take the value of the given attribute
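- A minimal sketch of these expressions against a tiny made-up snippet (the markup is illustrative only):
-
from lxml import etree

page_text = '''
<html><body>
<div class="top-nav">
    <ul>
        <li><a href="/a">first</a></li>
        <li><a href="/b">second</a></li>
        <li><a href="/c">third</a></li>
    </ul>
</div>
</body></html>
'''
tree = etree.HTML(page_text)

print(tree.xpath('/html/body/div'))                    # level by level from the root
print(tree.xpath('//div[@class="top-nav"]'))           # attribute predicate
print(tree.xpath('//div[@class="top-nav"]/ul/li[3]'))  # index predicate, 1-based
print(tree.xpath('//li/a/@href'))                      # attribute values: ['/a', '/b', '/c']
print(tree.xpath('//li/a/text()'))                     # text nodes: ['first', 'second', 'third']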
- Practice
- Scrape the listing information from a second-hand housing site
-
# Scrape the listing titles
import requests
from lxml.html import etree

if __name__ == '__main__':
    # Fetch the page source
    url = 'https://zhoukou.esf.fang.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'
    }
    page_text = requests.get(url=url, headers=headers).text
    # Parse the data
    tree = etree.HTML(page_text)
    assert isinstance(tree, etree._Element)  # hint for the IDE's type checker
    # Each listing sits in a <dl class="clearfix"> under the shop_list container
    dl_list = tree.xpath('//div[@class="shop_list shop_list_4"]/dl[@class="clearfix"]')
    # print(dl_list)
    fp = open('58.txt', 'w', encoding='utf-8')
    for dl in dl_list:
        # Relative xpath: the title lives in ./dd/h4/a/span
        title = dl.xpath('./dd/h4/a/span/text()')
        title = title[0]
        print(title)
        fp.write(title + '\n')
    fp.close()
- Scrape high-resolution images
-
# Parse and download the image data
import requests
from lxml.html import etree

if __name__ == '__main__':
    # Target url for this site
    # url = 'https://pic.netbian.com/4kmeinv/'
    # url = 'https://pic.netbian.com/4kmeinv/index_3.html'
    url = input('输入url:')
    # UA spoofing
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'
    }
    # Fetch the listing page
    page_text = requests.get(url=url, headers=headers).text
    # Fix the encoding by hand (the site serves gbk)
    page_text = page_text.encode('iso-8859-1').decode('gbk')
    # Instantiate the etree object
    tree = etree.HTML(page_text)
    # The thumbnails sit in <li> elements under the clearfix <ul>
    li_list = tree.xpath('//div/ul[@class="clearfix"]/li')
    # Starting index for the saved file names
    i = int(input('图片从数字几开始命名:'))
    # i = 40
    for li in li_list:
        # Take the href of each thumbnail and build the url of the full-size page
        src = li.xpath('./a/@href')[0]
        new_page = 'https://pic.netbian.com' + src
        # Fetch the full-size page and set its encoding
        page = requests.get(url=new_page, headers=headers)
        page.encoding = 'gbk'
        page = page.text
        # Parse the full-size page
        tree2 = etree.HTML(page)
        # Take the img src and build the final image url
        img = tree2.xpath('//div[@class="photo-pic"]/a/img/@src')[0]
        zuihou_page = 'https://pic.netbian.com' + img
        # Local path to save to (assumes the meinv/ directory already exists)
        img_local = 'meinv/img' + str(i) + '.jpg'
        # Download the image bytes
        img_zuizong = requests.get(url=zuihou_page, headers=headers).content
        # Persist to disk
        fp = open(img_local, 'wb')
        fp.write(img_zuizong)
        fp.close()
        print('图片' + str(i) + '下载完成!!!')
        i += 1
- Get the names of all cities
-
from lxml import etree
import requests

if __name__ == '__main__':
    # # First version: hot cities and regular cities parsed in two separate passes
    # # UA spoofing
    # headers = {
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'
    # }
    # # Target url
    # url = 'https://www.aqistudy.cn/historydata/'
    # # Fetch the page
    # page_text = requests.get(url=url, headers=headers).text
    # # print(page_text)
    # # Instantiate the etree object
    # tree = etree.HTML(page_text)
    # # The hot cities sit in li tags under div.bottom/ul
    # hot_list_li = tree.xpath('//div[@class="bottom"]/ul/li')
    # # print(hot_list_li)
    # # Empty list to collect the city names
    # all_city_names = []
    # # Collect the hot-city names
    # for li in hot_list_li:
    #     # Take each city name and append it to the list
    #     hot_li = li.xpath('./a/text()')[0]
    #     all_city_names.append(hot_li)
    # # print(all_city_names)
    # # The regular cities sit in the other ul elements
    # all_list_ul = tree.xpath('//div[@class="bottom"]/ul')
    # # print(all_list_ul)
    # for ul in all_list_ul:  # loop over the ul elements
    #     all_city = ul.xpath('./div/li/a/text()')  # all city names in this ul's group
    #     # print(all_city)
    #     all_city_names.extend(all_city)  # add the names to the list
    # print(all_city_names)
    # with open('./city.txt', 'w') as fp:  # persist to disk
    #     for city_name in all_city_names:
    #         # print(city_name)
    #         fp.write(city_name + '|')
    ###############################################################################################
    # Second version: one xpath with the union operator | grabs both groups at once
    # UA spoofing
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'
    }
    # Target url
    url = 'https://www.aqistudy.cn/historydata/'
    # Fetch the page
    page_text = requests.get(url=url, headers=headers).text
    # print(page_text)
    # Instantiate the etree object
    tree = etree.HTML(page_text)
    # Parse the <a> tags of the hot cities and of all other cities in one expression
    # //div[@class="bottom"]/ul/li/a
    # //div[@class="bottom"]/ul/div[2]/li/a
    a_list = tree.xpath('//div[@class="bottom"]/ul/li/a | //div[@class="bottom"]/ul/div[2]/li/a')
    city_names = []
    for a in a_list:
        # print(a)
        city_name = a.xpath('./text()')[0]
        city_names.append(city_name)
    print(city_names)
    # Deduplicate (note: set() does not preserve order)
    city_names = set(city_names)
    print(city_names)
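- The set() call above removes duplicates but loses the original order; a minimal alternative (not from the original notes) that deduplicates while keeping order:
-
# dict keys preserve insertion order in Python 3.7+, so this deduplicates without reordering
city_names = list(dict.fromkeys(city_names))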
- Free resume template assets
-
# Scrape all the free resume templates from 站长素材
import requests
from lxml import etree
import time

if __name__ == '__main__':
    # https://sc.chinaz.com/jianli/free.html
    url = input('输入url:')
    i = int(input('请输入从几开始储存:'))
    # UA spoofing
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54'
    }
    # Fetch the listing page source
    page_text = requests.get(url=url, headers=headers).text
    # Instantiate the etree object
    tree = etree.HTML(page_text)
    # Get the detail-page link of every template
    a_srcs = tree.xpath('//*[@id="container"]/div/a/@href')
    # print(a_srcs)
    for a in a_srcs:
        # Output file name (assumes the 模板/ directory already exists)
        filename = '模板/简历' + str(i) + '.rar'
        # Fetch the template's detail page
        a_page_text = requests.get(url=a, headers=headers).text
        # Instantiate an etree object for the detail page
        a_tree = etree.HTML(a_page_text)
        # Take the first download link
        src = a_tree.xpath('//*[@id="down"]/div[2]/ul/li[1]/a/@href')[0]
        # print(src)
        # Download the archive and persist it
        rar = requests.get(url=src, headers=headers).content
        time.sleep(1)
        with open(filename, 'wb') as fp:
            fp.write(rar)
        print('简历' + str(i) + '爬取完毕!')
        i += 1
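- The script above processes a single listing url typed in by hand; to walk several listing pages, the url.format() pattern from the paginated image example near the top can be reused. The page-url template here is an assumption -- verify it against the site's real pagination links before relying on it:
-
import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0'}
# Assumed pagination template: page 1 is free.html, later pages are assumed to be free_{n}.html
page_url = 'https://sc.chinaz.com/jianli/free_{}.html'
for page in range(2, 4):
    listing_url = page_url.format(page)
    page_text = requests.get(url=listing_url, headers=headers).text
    tree = etree.HTML(page_text)
    # Same xpath as in the script above; feed these hrefs into the same download loop
    a_srcs = tree.xpath('//*[@id="container"]/div/a/@href')
    print(listing_url, len(a_srcs))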