Contents
一、General-purpose crawlers
1. books.toscrape
2. Douban Top 250 (GET)
3. Scraping Sogou search results for a keyword (GET)
4. Baidu Translate (POST)
5. Scraping the Douban chart top list
6. Scraping the KFC store list
7. Scraping the KFC store list, improved
Encoding issues encountered
Course videos:
Python超强爬虫8天速成(完整版)爬取各种网站数据实战案例_哔哩哔哩_bilibili
【Python+爬虫】爆肝两个月!拜托三连了!这绝对是全B站最用心(没有之一)的Python+爬虫公开课程,从入门到(不)入狱 !_哔哩哔哩_bilibili
Robots protocol: robots.txt
Practice site: All products | Books to Scrape - Sandbox
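Before crawling a site it is worth checking its robots.txt. A minimal sketch using the standard-library urllib.robotparser (my own addition, assuming the host serves a robots.txt at the usual /robots.txt path; if the file is missing, RobotFileParser treats everything as allowed):

from urllib import robotparser

# Fetch and parse the site's robots.txt from its standard location
rp = robotparser.RobotFileParser()
rp.set_url("https://movie.douban.com/robots.txt")
rp.read()

# Ask whether a given user agent may fetch a given URL
print(rp.can_fetch("*", "https://movie.douban.com/top250"))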
一、General-purpose crawlers
(also called whole-web crawlers): a general-purpose crawler is the kind widely used by search engines. Its goal is to cover as much of the internet as possible and fetch as many pages as it can. It automatically discovers and follows links across the web and builds a huge index that backs the search engine's query function. Well-known search engines such as Google and Baidu all run general-purpose crawlers.
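The "discover links and follow them" part can be illustrated with a tiny sketch (my own illustration, not from the course): fetch one page, collect its <a href> targets, and queue them for later fetching.

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

start_url = "https://books.toscrape.com"
to_visit = [start_url]      # frontier of URLs still to fetch
seen = set(to_visit)

page = requests.get(to_visit.pop(0)).text
soup = BeautifulSoup(page, "html.parser")
for a in soup.findAll("a", href=True):
    link = urljoin(start_url, a["href"])   # resolve relative links against the page URL
    if link not in seen:
        seen.add(link)
        to_visit.append(link)
print(len(to_visit), "links discovered on the first page")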
1. books.toscrape
from bs4 import BeautifulSoup
import requests

# Store the page's HTML in content (as a string)
content = requests.get("https://books.toscrape.com").text
# print(content)

# Pick a parser and parse the HTML into a tree structure
soup = BeautifulSoup(content, "html.parser")
# soup.p is the first <p> tag together with everything inside it
# print(soup.p)

# findAll returns an iterable result set that can be walked with for
all_price = soup.findAll("p")
# print(all_price)
# print(all_price[1])  # £51.77
# print(all_price[2])  # In stock

# Find every element that matches a tag name and/or attributes
all_price2 = soup.findAll("p", attrs={"class": "price_color"})
print(all_price2)
for price in all_price2:
    print(price.string)  # .string gets only the tag's text content; price.string[2] would index a single character of it

# Find all h3 elements
all_titles = soup.findAll("h3")
# Then collect the <a> tags under each h3 and print their text content
for title in all_titles:
    all_links = title.findAll("a")
    for link in all_links:
        print(link.string)

print(type(all_titles))  # <class 'bs4.element.ResultSet'>
print(type(title))       # <class 'bs4.element.Tag'>
print(type(all_links))   # <class 'bs4.element.ResultSet'>
print(type(link))        # <class 'bs4.element.Tag'>
print(type(soup))        # <class 'bs4.BeautifulSoup'>
print(type(content))     # <class 'str'>
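A side note, not from the course: findAll is the older camelCase name; current BeautifulSoup code usually writes find_all, and CSS selectors via select do the same job:

from bs4 import BeautifulSoup
import requests

soup = BeautifulSoup(requests.get("https://books.toscrape.com").text, "html.parser")

# find_all with the class_ shortcut instead of an attrs dict
prices = soup.find_all("p", class_="price_color")
# select takes a CSS selector and returns a comparable list of tags
prices_css = soup.select("p.price_color")
print(prices[0].string, prices_css[0].string)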
2. Douban Top 250 (GET)
In short, it all comes down to imitating a browser. The steps:
import requests
from bs4 import BeautifulSoup

# Pretend to be a regular browser
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"}

for start_num in range(0, 250, 25):
    print(start_num)
    # start could also be passed via a separate params argument
    response = requests.get(f"https://movie.douban.com/top250?start={start_num}", headers=headers)
    print(response)  # <Response [200]> means the request went through
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    # print(soup)
    all_titles = soup.findAll("span", attrs={"class": "title"})
    # print(all_titles[2])
    for title in all_titles:
        title_string = title.string
        if "/" not in title_string:  # spans containing "/" are the alternate (foreign) titles
            print(title_string)
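As the comment above says, start does not have to be spelled into the URL by hand; a sketch of the params variant (same headers as above):

import requests

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"}
for start_num in range(0, 250, 25):
    # requests builds the ?start=... query string from the params dict
    response = requests.get("https://movie.douban.com/top250",
                            params={"start": start_num},
                            headers=headers)
    print(response.status_code, response.url)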
3. Scraping Sogou search results for a keyword (GET)
import requests

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0"}
# url = 'https://www.sogou.com/web?query=slx'
# in the URL the parameter is written with '='
word = input('Enter a search term: ')
url = 'https://www.sogou.com/web'
params = {
    'query': word
}
# in the params dict it is written with ':'
response = requests.get(url=url, params=params, headers=headers)
html = response.text

fileName = word + '.html'
with open(fileName, 'w', encoding='utf-8') as fp:
    fp.write(html)
# When the with block ends, the file is closed automatically; there is no need to call close() explicitly.
# The with statement provides a context manager that releases the resource when the block exits.
# "as fp" binds the opened file object to fp, so we can use fp to operate on the file.
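To see how the dict ends up in the query string, response.url shows the final URL that requests built (a small check of my own; the printed value may differ slightly if the site redirects):

import requests

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0"}
response = requests.get('https://www.sogou.com/web',
                        params={'query': 'slx'},
                        headers=headers)
# The params dict was serialized into the query string for us
print(response.url)  # something like https://www.sogou.com/web?query=slx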
4. Baidu Translate (POST)
import requests
import json

# Pretend to be a browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}
# Endpoint
url = "https://fanyi.baidu.com/sug"
# POST form data (writing the parameter into the URL would also work)
data = {
    "kw": "dog"
}
response = requests.post(url=url, data=data, headers=headers)

# The response's Content-Type says it is JSON, so call json() to get a Python object instead of using .text
dict_obj = response.json()
fp = open("./dog.json", 'w', encoding='utf-8')
json.dump(dict_obj, fp=fp, ensure_ascii=False)
fp.close()  # close the file explicitly, or use the with form below
# with open("./dog2.json", 'w', encoding='utf-8') as fp:
#     json.dump(dict_obj, fp=fp, ensure_ascii=False)
# json.dump writes the dict_obj dictionary to fp as JSON; ensure_ascii=False keeps non-ASCII characters unescaped.
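Whether to reach for .json() or .text can be read off the response headers; a small sketch of that check (my own addition), reusing the request above:

import requests

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"}
response = requests.post("https://fanyi.baidu.com/sug", data={"kw": "dog"}, headers=headers)

content_type = response.headers.get("Content-Type", "")
if "json" in content_type:
    payload = response.json()   # parsed into dicts/lists
else:
    payload = response.text     # fall back to the raw text
print(content_type, type(payload))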
5. Scraping the Douban chart top list
import requests
import json

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}
url = "https://movie.douban.com/j/chart/top_list"
# https://movie.douban.com/explore
for i in range(0, 200, 20):
    params = {
        'type': 24,
        'interval_id': '100:90',
        'action': '',
        'start': i,
        'limit': 20,
    }
    response = requests.get(url=url, params=params, headers=headers)
    print(response)
    # print(response.text)
    li = response.json()  # this endpoint returns a JSON list
    # fp = open("./doubanxisjv.json", 'w', encoding='utf-8')
    # json.dump(li, fp=fp, ensure_ascii=False)
    for j in range(len(li)):
        print("No. {:d}  Title: {:10s}  Score: {:.1f}".format(i + j, li[j]['title'], float(li[j]['score'])))
What I tried to scrape first was this URL (https://m.douban.com/rexxar/api/v2/movie/recommend?refresh=0&start=40&count=20&selected_categories=%7B), but it never worked and kept returning a 400 error, so I switched to a different endpoint. Along the way I noticed that for the GET request that works, you can double-click the request (i.e. copy the URL straight into the browser and open it) and see the JSON string right in the page, while the one that failed could not be verified that way.
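A small sketch of how to check what requests actually sent and whether the server accepted it (my own addition, using the same endpoint and parameters as the working example above):

import requests

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"}
response = requests.get(
    "https://movie.douban.com/j/chart/top_list",
    params={'type': 24, 'interval_id': '100:90', 'action': '', 'start': 0, 'limit': 20},
    headers=headers,
)
print(response.url)          # the exact URL that was requested; paste it into a browser to double-check
print(response.status_code)  # 200 is OK, 400 means the server rejected the request
response.raise_for_status()  # raise an HTTPError on 4xx/5xx instead of failing later on bad data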
6. Scraping the KFC store list
import requests
import json

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}
url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword"
result_str = ""
for i in range(1, 21):  # pages 1-20, hard-coded
    params = {
        'cname': '',
        'pid': '',
        'keyword': '北京',
        'pageIndex': i,
        'pageSize': '10',
    }
    response = requests.get(url=url, params=params, headers=headers)
    # print(response)
    # print(response.text)
    json_data = response.json()
    # with open('./kfc_jsondata.json', 'w', encoding='utf-8') as fp:
    #     json.dump(json_data, fp=fp, ensure_ascii=False)
    for j in range(len(json_data["Table1"])):
        # print(i, j)
        print((i - 1) * 10 + j + 1, ' :', json_data["Table1"][j]["storeName"])
        str1 = str((i - 1) * 10 + j + 1) + ' :' + json_data["Table1"][j]["storeName"] + '\n'
        result_str += str1

with open('./kfctext.text', 'w', encoding='utf-8') as fp:
    fp.write(result_str)
7. Scraping the KFC store list, improved
Previously the number of pages was hard-coded, which is inflexible; the improved version adapts to whatever the keyword returns.
Look at the shape of the JSON data: it already hands you the total store count, so just use it directly (a leaner variant is sketched after the code below).
import requests
import json

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}
url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword"
result_str = ""
i = 1  # page number
while True:
    params = {
        'cname': '',
        'pid': '',
        'keyword': '大连',
        'pageIndex': i,
        'pageSize': '10',
    }
    response = requests.get(url=url, params=params, headers=headers)
    json_data = response.json()
    print(json_data)
    rowcount = json_data['Table'][0]['rowcount']  # total number of stores for this keyword
    ye = (rowcount + 9) // 10  # ceiling division: total pages at 10 stores per page, e.g. 47 stores -> 5 pages
    print(rowcount)
    for j in range(len(json_data["Table1"])):
        print(json_data["Table1"][j]['rownum'], ' :', json_data["Table1"][j]["storeName"])
        str1 = str(json_data["Table1"][j]['rownum']) + ' :' + json_data["Table1"][j]["storeName"] + '\n'
        result_str += str1
    i += 1
    if i > ye:
        break

with open('./kfctext2.text', 'w', encoding='utf-8') as fp:
    fp.write(result_str)
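Since rowcount is known after the first response, the same idea can be written without while True/break: fetch page 1, compute the page count, then loop over the remaining pages with a plain range. A sketch under the same response-format assumptions as above (Table/rowcount and Table1/storeName); the fetch_page helper is my own wrapper, not part of the original code:

import requests

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"}
url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword"

def fetch_page(keyword, page, size=10):
    # hypothetical helper wrapping the same GET request as above
    params = {'cname': '', 'pid': '', 'keyword': keyword,
              'pageIndex': page, 'pageSize': str(size)}
    return requests.get(url=url, params=params, headers=headers).json()

first = fetch_page('大连', 1)
rowcount = first['Table'][0]['rowcount']
pages = (rowcount + 9) // 10          # ceiling division, e.g. 47 stores -> 5 pages
stores = [row['storeName'] for row in first['Table1']]
for page in range(2, pages + 1):      # remaining pages 2..pages
    stores += [row['storeName'] for row in fetch_page('大连', page)['Table1']]
print(len(stores), 'stores in total')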
http://t.csdn.cn/9cz6t