A web crawler (also called a web spider or web robot; less common names include ant, automatic indexer, emulator, and worm) is essentially a program or script that automatically fetches and downloads pages from the World Wide Web according to a defined set of rules and algorithms, and it is a key component of search engines.
Modern browsers ship with built-in developer tools that let you inspect the requests a page sends, which is usually enough to work out the headers and parameters a crawler needs. Here we will also use a dedicated packet-capture tool, Fiddler, which can be downloaded from:
https://dl.softmgr.qq.com/original/Development/FiddlerSetup_5.0.20192.25091.exe
"""
爬取百度首页
"""
import requests
# 指定要爬取的路径
url = "http://www.baidu.com/s"
# 设置请求头,添加UA字段,模拟浏览器操作
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0",
"Cookie": "BAIDUID=CF508B611BADFBB09E8BB0DAAF132C76:FG=1; BIDUPSID=CF508B611BADFBB09E8BB0DAAF132C76; PSTM=1571369387; BD_UPN=19314753; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; delPer=0; BD_CK_SAM=1; PSINO=6; H_PS_PSSID=1434_21111_18559_29568_29699_29221_22157; H_PS_645EC=07c3Mw0o%2BVf2M5TrM%2FKqo9%2FUBks%2BMbN03sBiFZyAYg1ag%2FJTsXGc0IDFpmU; COOKIE_SESSION=0_0_0_0_1_0_0_0_0_0_0_0_0_0_7_0_1571369556_0_1571369549%7C1%230_0_1571369549%7C1; BD_HOME=0",
"Host": "www.baidu.com"
}
# 设置请求参数
params = {
"wd": "中国"
}
# 获取地址,获得响应
response = requests.get(url, params=params, headers=headers)
# 获取文本编码
print(f"响应状态码:{response.encoding}")
print(f"响应状态码:{response.status_code}")
# 设置编码格式
response.encoding = "UTF-8"
print(response.text)
# 以往的写法(获取到数据需要关闭)
# response.text 获取响应码
# f = open("百度。html", "w", encoding="UTF-8")
# f.write(response.text)
# f.close()
# 现在(取别名自动关闭)
with open("中国.html", "w", encoding="UTF-8") as f:
f.write(response.text)
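
Hard-coding UTF-8 happens to work for this page, but a more robust approach is to let requests detect the charset from the response body. Below is a minimal sketch of that variant; the trimmed-down User-Agent stands in for the full header set above, and the URL and query are the same as in the example.

"""
Variant: detect the encoding instead of hard-coding it (a sketch).
"""
import requests

response = requests.get("http://www.baidu.com/s",
                        params={"wd": "中国"},
                        headers={"User-Agent": "Mozilla/5.0"})
# requests derives response.encoding from the HTTP headers and falls back to
# ISO-8859-1 when no charset is declared; apparent_encoding re-detects the
# charset from the body bytes, which is usually more reliable.
if not response.encoding or response.encoding.lower() == "iso-8859-1":
    response.encoding = response.apparent_encoding
print(f"Detected encoding: {response.encoding}")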
"""
爬取豆瓣电影TOP250,分页保存电影数据
"""
import requests
import time
headers = {
"User-Agent": "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14"
}
for i in range(10):
url = f"https://movie.douban.com/top250?start={i*25}"
response = requests.get(url, headers=headers, verify=False)
print(response.status_code)
if response.status_code == 200:
# 获取网页数据
with open(f"第{i+1}页.txt", "w", encoding="UTF-8") as f:
f.write(response.text)
print(f"{url} 保存成功")
time.sleep(2)
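
Saving the raw HTML is only the first step; the pages still have to be parsed. The sketch below pulls movie titles out of one saved page with a regular expression. The <span class="title"> pattern is an assumption about Douban's current markup, so verify it against a saved file before relying on it.

"""
A minimal parsing sketch. Assumes the files saved by the loop above and that
Douban wraps each title in <span class="title">...</span> (an assumption;
check the saved HTML first, as the markup may change).
"""
import re

with open("第1页.html", "r", encoding="UTF-8") as f:
    html = f.read()

# Non-greedy match of the text inside each title span. Each movie also has an
# alternate-title span beginning with "&nbsp;/", which we filter out.
titles = [t for t in re.findall(r'<span class="title">(.*?)</span>', html)
          if not t.startswith("&nbsp;")]
for rank, title in enumerate(titles, start=1):
    print(rank, title)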