Python Crawler Basics

Table of Contents

        • 1. What is a crawler
        • 2. The value of crawlers
        • 3. The crawling workflow
        • 4. Crawling Pear Video
        • 5. Adding request headers
        • 6. Requests with cookies
        • 7. Simulating a GitHub login
        • 8. Sending requests with a session

1. What is a crawler

A crawler is an application that fetches valuable data from the internet. In essence, it is a client-side program.

2. The value of crawlers

Data is the most valuable thing on the internet. A crawler's primary task is to fetch data from servers over the network and turn it into value of your own.

3. The crawling workflow

1. Analyze the request
	Web pages: the browser's developer tools (Chrome)
    Native apps: a packet-capture tool such as Charles
2. Simulate the request
	The third-party requests module
    The built-in urllib module
    The selenium module (an automated-testing tool), which drives a real browser from code
    Charles, for intercepting the requests an app sends
3. Get the response data
	A browser receives the response and renders it into a page for display
    requests and urllib return the response body directly
4. Parse the data
	The re module
    The BeautifulSoup module
5. Store the data
   Files
   Relational databases such as MySQL
   Non-relational databases such as MongoDB and Redis

A minimal end-to-end sketch of steps 2-5 follows.
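The sketch below walks a single page through steps 2-5 with requests, re, and a plain file. The target URL and the regex are placeholders; point them at a real page and a pattern that matches its markup.

import re
import requests

# Step 2: simulate the request (example.com is a stand-in target)
resp = requests.get("https://example.com/")

# Step 3: requests returns the response body directly
html = resp.text

# Step 4: parse with re (this pattern just grabs the page title)
title = re.search(r"<title>(.*?)</title>", html, re.S).group(1)

# Step 5: store the result in a file
with open("result.txt", "w", encoding="utf-8") as f:
    f.write(title)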

4. Crawling Pear Video

import requests
import re
import json
import os
from concurrent.futures import ThreadPoolExecutor


dir = os.path.dirname(__file__)
# Parsed results are collected here
datas = []
# Number of list pages to crawl
page_num = 1


def get_details(resp):
    # Pull the relative detail-page links out of the list page. The patterns
    # in this function follow pearvideo.com's markup at the time of writing
    # and may need adjusting if the site changes its HTML.
    res = re.findall(r'<a href="(video_\d+)"', resp.text)
    base_url = "https://www.pearvideo.com/"
    for i in res:
        # Build the absolute detail-page URL
        detail_url = base_url + i
        detail_resp = requests.get(detail_url)
        # Title
        title = re.search(r'<h1 class="video-tt">(.*?)</h1>', detail_resp.text).group(1)
        # Publish date
        subdate = re.search(r'<div class="date">(.*?)</div>', detail_resp.text).group(1)
        # Like count
        f_count = re.search(r'<div class="fav" data-id="\d+">(\d+)</div>', detail_resp.text).group(1)
        # Author
        author = re.search(r'<div class="name">(.*?)</div>', detail_resp.text).group(1)
        # Description
        content = re.search(r'<div class="summary">(.*?)</div>', detail_resp.text).group(1)
        # Video address
        video_url = re.search(r'srcUrl="(.*?)"', detail_resp.text).group(1)
        dic = {"title": title, "subdate": subdate, "f_count": f_count,
               "author": author, "content": content, "video_url": video_url}
        # Hand the download off to the thread pool asynchronously
        pool.submit(download_video, video_url, title)
        datas.append(dic)


# Request the list pages of one category
def get_page_data(categoryId):
    url = "https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=%s&start=" % categoryId
    for i in range(page_num):
        # Each list page holds 12 items, so the offset advances by 12
        url1 = url + str(i * 12)
        resp = requests.get(url1)
        if resp.status_code == 200:
            print("Request returned successfully!")
            get_details(resp)


def download_video(video_url, video_name):
    try:
        print("Starting download:", video_name)
        resp = requests.get(video_url)
        # Strip characters that are illegal in file names
        video_name = video_name.replace('"', "")
        video_name = video_name.replace('?', "")
        file_path = os.path.join(dir, "videos", video_name + ".mp4")
        if os.path.exists(file_path):
            print(video_name, "has already been downloaded!")
            return
        with open(file_path, "wb") as f:
            f.write(resp.content)
    except Exception as e:
        print("Download task failed:", e)


# Dump the parsed data to a JSON file
def write_json():
    with open("datas.json", "wt") as f:
        json.dump(datas, f)


if __name__ == '__main__':
    # Make sure the target directory exists before any download starts
    os.makedirs(os.path.join(dir, "videos"), exist_ok=True)
    # Start the thread pool
    pool = ThreadPoolExecutor()
    get_page_data(5)
    # Write the parsed metadata
    write_json()
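One note on the concurrency here: ThreadPoolExecutor() with no arguments chooses its own worker count, and the script leans on the interpreter waiting for worker threads at exit. A slightly more explicit main block, as a sketch, caps the pool size and blocks until every download has finished:

if __name__ == '__main__':
    os.makedirs(os.path.join(dir, "videos"), exist_ok=True)
    pool = ThreadPoolExecutor(max_workers=4)  # cap concurrent downloads
    get_page_data(5)
    write_json()
    pool.shutdown(wait=True)  # block until every queued download completes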

5. Adding request headers

import requests
key = input("Enter a keyword: ")

# Manual URL-encoding of the Chinese keyword (requests encodes params automatically):
# from urllib.parse import urlencode
# print(urlencode({"wd": key}, encoding="utf-8"))

url = "https://www.baidu.com/s"
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
resp = requests.get(url,
                    params={"wd": key},
                    headers={"User-Agent": user_agent})
with open('baidu.html','wb') as f:
    f.write(resp.content)
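requests URL-encodes the params dict on its own, which is why the manual urlencode call above stays commented out. To confirm what was actually sent, print the prepared request that went over the wire (continuing the snippet above):

# resp.request is the PreparedRequest that requests actually sent
print(resp.request.url)  # e.g. https://www.baidu.com/s?wd=%E7%88%AC%E8%99%AB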

6. Requests with cookies

import requests

url = "https://github.com/"

user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"

cookie = "has_recent_activity=1; _ga=GA1.2.2022750455.1554880344; _gat=1; _octo=GH1.1.1046972625.1554880344; tz=Asia%2FShanghai; _device_id=1741760db0afe98d7421095e093591fa; user_session=_dKVhga81lwDQz8GcEHih1EHDSnVlHIgJA6PKu_xIyxGgnxN; __Host-user_session_same_site=_dKVhga81lwDQz8GcEHih1EHDSnVlHIgJA6PKu_xIyxGgnxN; logged_in=yes; dotcom_user=oldboyedujerry; _gh_sess=U0NJZy85U1BaVmtSd3piY3BGZ0M3V0RJenRjcEZCZHI5TGtuNWZqZjNud1pBOVR0K1lUZk9XRTNUY2ViSGtIQ0pVbDV4Y0ZEdk9GeGhYZ3BtVXlYYmtIYWc3ckJtMFZYdkpMK1JYZ1ZrWm5GNlM2VzdrWVlZNndlaUM3V0NEU1pETjEydzlMRWtEODRUY2dJbFNKMW1mMjM0bnZmL0Z1SVV0TldpUXBZSmppTGR6Tm9KaUpDTndaNnVZWmFDRmxlS3d3TlJTZGpOSVpzaHZhZW03R3Z5eVVWQWJVaFUvNlBka3B5OFZpdmJldWl6dDJCbjEwMVorM0dGK3pOUXMzRHZQU2NQQVJlUzFmT012ZmhKWFEycTExZDZSQ1VnREhNbHEvR2xudUNNbW1PT0o1MXkwSnI0OC84anRUNjhqYWlOK1hLdUFnQlVpWTg4MFFTeldBRkVpK2dQMTF1Q1hmNnJtVDlyQjB3T3VZPS0tTkxCSmx1QUdMVWg3M1BEbmhZMXN0UT09--782e5f85764c258336a9fe66faea11ddbf687a8b"
resp = requests.get(url,
                    headers={"user-agent": user_agent,
                             "Cookie": cookie},
                    # cookies={"has_recent_activity": "1"}  # or add cookies as key-value pairs
                    )

print(resp.status_code)
print("oldboyedujerry/testProject" in resp.text)

7. Simulating a GitHub login

# A POST request differs only in that the payload is supplied through the data parameter of post()
import requests
import re
# 1. Fetch the CSRF token from the login page
login_page_url = "https://github.com/login"
resp = requests.get(login_page_url)

# Grab the cookies returned with the login page
cookie = resp.cookies.get_dict()
token = re.search('authenticity_token" value="(.*?)" /> ',resp.text).group(1)

# 2. Post to the login endpoint
login_url = "https://github.com/session"
resp2 = requests.post(login_url,
              headers={
                  "Referer": "https://github.com/login",
                  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
              },
              data={"commit": "Sign in",
                    "utf8": "✓",
                    "authenticity_token": token,
                    "login": "oldboyedujerry",
                    "password": "123654asdAsd",
                    "webauthn-support": "supported"},
              cookies = cookie)
print(resp2.status_code)
with open('github_home.html','wb') as f:
    f.write(resp2.content)

print("oldboyedujerry/testProject" in resp2.text)

8. Sending requests with a session

# Use a session to complete the requests; the session saves and resubmits cookies for us automatically
import requests
import re

session = requests.Session()

# 1. Fetch the CSRF token
login_page_url = "https://github.com/login"
resp = session.get(login_page_url)  # request the login page
token = re.search('authenticity_token" value="(.*?)" /> ', resp.text).group(1)  # extract the token

# 2. Post to the login endpoint
login_url = "https://github.com/session"
resp2 = session.post(login_url,
              headers={
                  "Referer": "https://github.com/login",
                  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
              },
              data={"commit": "Sign in",
                    "utf8": "✓",
                    "authenticity_token": token,
                    "login": "oldboyedujerry",
                    "password": "123654asdAsd",
                    "webauthn-support": "supported"},
              # allow_redirects=False  # uncomment to get the raw 302 instead of following it
                     )
print(resp2.status_code)
with open("github_home.html","wb") as f:
    f.write(resp2.content)
print("oldboyedujerry/testProject" in resp2.text)
