Python Web Scraping with the requests Module (Part 1)

  • What a web crawler is
  • Packet capture tools
  • Fetching static web page data

What a web crawler is

A web crawler (also known as a web spider or web robot, and occasionally by less common names such as ant, automatic indexer, emulator, or worm) is, at its core, a program or script that automatically fetches and downloads pages from the World Wide Web according to a set of rules. Crawlers are an essential component of search engines, which rely on them to discover and index pages.
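To make this concrete, here is a minimal sketch of the fetch-and-follow loop at the heart of every crawler (my own illustration, not production code: the seed URL http://example.com is a placeholder, links are extracted with a naive regular expression, and a real crawler would also respect robots.txt, deduplicate more carefully, and rate-limit itself):

import re
import requests

def crawl(seed_url, max_pages=5):
    # Minimal breadth-first crawl: fetch a page, harvest its links, repeat
    to_visit, seen = [seed_url], set()
    while to_visit and len(seen) < max_pages:
        url = to_visit.pop(0)
        if url in seen:
            continue
        seen.add(url)
        response = requests.get(url, timeout=10)
        print(url, response.status_code)
        # Naive link extraction; real crawlers use an HTML parser instead
        to_visit.extend(re.findall(r'href="(http[^"]+)"', response.text))

crawl("http://example.com")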

Packet capture tools

Generally speaking, every modern browser ships with its own tool for inspecting requests: the developer tools (usually opened with F12), whose Network panel shows each request's URL, headers, and response.
For the examples here we will use a dedicated packet capture tool, Fiddler, which can be downloaded from:
https://dl.softmgr.qq.com/original/Development/FiddlerSetup_5.0.20192.25091.exe

Fetching static web page data

The complete example below sends a GET request to Baidu's search endpoint, mimicking a real browser via the request headers, and saves the response to a local HTML file.


"""
爬取百度首页
"""

import requests

# Target URL: Baidu's search endpoint
url = "http://www.baidu.com/s"

# Request headers copied from a captured browser request; the User-Agent (UA) field makes the request look like it comes from a real browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0",
    "Cookie": "BAIDUID=CF508B611BADFBB09E8BB0DAAF132C76:FG=1; BIDUPSID=CF508B611BADFBB09E8BB0DAAF132C76; PSTM=1571369387; BD_UPN=19314753; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; delPer=0; BD_CK_SAM=1; PSINO=6; H_PS_PSSID=1434_21111_18559_29568_29699_29221_22157; H_PS_645EC=07c3Mw0o%2BVf2M5TrM%2FKqo9%2FUBks%2BMbN03sBiFZyAYg1ag%2FJTsXGc0IDFpmU; COOKIE_SESSION=0_0_0_0_1_0_0_0_0_0_0_0_0_0_7_0_1571369556_0_1571369549%7C1%230_0_1571369549%7C1; BD_HOME=0",
    "Host": "www.baidu.com"
}


# Query-string parameters: wd is Baidu's search keyword parameter
params = {
    "wd": "中国"
}

# Send the GET request and receive the response
response = requests.get(url, params=params, headers=headers)
# Print the text encoding requests inferred from the response headers
print(f"Response encoding: {response.encoding}")

print(f"Status code: {response.status_code}")

# Force the encoding so that response.text decodes correctly
response.encoding = "UTF-8"
print(response.text)

# The old way (the file object has to be closed explicitly)
# response.text is the body of the response as text
# f = open("baidu.html", "w", encoding="UTF-8")
# f.write(response.text)
# f.close()


# The modern way: the with statement closes the file automatically
with open("中国.html", "w", encoding="UTF-8") as f:
    f.write(response.text)
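
The script above works, but it silently trusts both the network and the server. A slightly more defensive variant of the same request is sketched below; the timeout value and the raise_for_status()/apparent_encoding calls are my additions rather than part of the original example:

import requests

url = "http://www.baidu.com/s"
headers = {"User-Agent": "Mozilla/5.0"}  # abbreviated UA, just for brevity
params = {"wd": "中国"}

# timeout keeps the call from hanging forever on an unresponsive server
response = requests.get(url, params=params, headers=headers, timeout=10)
# raise_for_status() turns 4xx/5xx responses into exceptions
response.raise_for_status()
# apparent_encoding guesses the charset from the body itself, which is
# often more reliable than the header-derived default
response.encoding = response.apparent_encoding
with open("中国.html", "w", encoding="UTF-8") as f:
    f.write(response.text)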


Scraping the Douban Movie Top 250 and saving the data page by page

# Request headers: a browser User-Agent is enough for this site
headers = {
    "User-Agent": "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14"
}
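
If a site starts rejecting repeated requests that all carry the same User-Agent, a common workaround is to rotate through several. This is only a sketch of the idea, and the UA strings in the pool are examples:

import random

# Example pool of browser UA strings to rotate through
USER_AGENTS = [
    "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0",
]

# Pick a different UA for each request
headers = {"User-Agent": random.choice(USER_AGENTS)}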

Paginating through the list

Douban serves the Top 250 as ten pages of 25 movies each; the start query parameter selects the offset, so page i begins at start=i*25:

for i in range(10):  # pages 0..9
    url = f"https://movie.douban.com/top250?start={i*25}"
    # verify=False skips TLS certificate verification (only for sites whose certificate cannot be validated)
    response = requests.get(url, headers=headers, verify=False)
    if response.status_code == 200:
        with open(f"page_{i+1}.html", "w", encoding="UTF-8") as f:
            f.write(response.text)
            print(f"{url} saved")
    time.sleep(2)  # sleep two seconds between pages
"""
爬取豆瓣电影TOP250,分页保存电影数据
"""
import requests
import time


headers = {
    "User-Agent": "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14"
}

for i in range(10):  # 10 pages x 25 movies = Top 250
    url = f"https://movie.douban.com/top250?start={i*25}"
    # verify=False skips TLS certificate verification
    response = requests.get(url, headers=headers, verify=False)
    print(response.status_code)
    if response.status_code == 200:
        # Save the page's HTML to a local file
        with open(f"page_{i+1}.html", "w", encoding="UTF-8") as f:
            f.write(response.text)
            print(f"{url} saved")
    time.sleep(2)  # be polite: pause between requests
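
One side effect of verify=False is that urllib3 emits an InsecureRequestWarning for every request. If you have deliberately accepted that risk, the warning can be silenced as sketched below; on a site with a valid certificate, simply dropping verify=False is the better fix:

import urllib3

# Suppress the InsecureRequestWarning triggered by verify=False
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)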

