Python爬虫爬取各大热门短视频平台视频

1、开发工具

Python 3.9

requests库

其他一些Python内置库

PyCharm

2、第三方库

安装第三方库

pip install requests

3、实现思路

1、利用tkinter库实例化一个GUI界面,包含提示框、输入框、选择按钮、功能按钮。

2、用requests发送get请求,获得下载链接

3、将下载到的文件保存到本地。


4、实现效果

 

 Python爬虫爬取各大热门短视频平台视频_第1张图片

 

Python爬虫爬取各大热门短视频平台视频_第2张图片

Python爬虫爬取各大热门短视频平台视频_第3张图片 


 

5、实现过程

1、B站视频爬虫

import requests
import re
import os
# Make sure the output folder "video" exists before anything is downloaded.
# makedirs(exist_ok=True) creates it only when missing and avoids the
# check-then-create race of "if not exists: mkdir".
filename='video\\'
os.makedirs(filename, exist_ok=True)
# Crawler function, called by the main program.
def UrlCrawler(url, name):
    """Download the video referenced by a Bilibili page and save it as an MP4.

    The page at ``url`` is fetched, the direct video address is extracted from
    the ``readyVideoUrl`` field in the page source, and the video bytes are
    written to ``video/<name>.mp4`` (relative to the working directory).

    Args:
        url: Address of the video page.
        name: File name (without extension) for the saved video.

    Returns:
        The string ``"Error"`` when the host cannot be reached (connection
        failure or timeout while probing ``url``); ``None`` on success.

    Raises:
        IndexError: The page source contains no ``readyVideoUrl`` entry.
    """
    # Custom request headers.
    # NOTE(review): the cookie is a hard-coded login session; it will expire
    # and should be supplied by the user rather than kept in source.
    headers = {
        "cookie":"buvid3=0D3353AC-5B77-680A-697F-8B66493826D160198infoc; b_nut=1670493160; CURRENT_FNVAL=4048; _uuid=AA102510B8-6113-12F5-10674-C7E67642D65561585infoc; rpdid=|(YukRR|mR|0J'uY~|RmJuYk; i-wanna-go-back=-1; fingerprint=9a9c4cc60b4c3b41bce4cf46c57c55ea; buvid_fp_plain=undefined; buvid4=60372B7A-A671-65D8-6993-2FB5D6E3B2CD61117-022120817-lih1xoB%2FrWiTqxe5epW4Zg%3D%3D; buvid_fp=9a9c4cc60b4c3b41bce4cf46c57c55ea; nostalgia_conf=-1; DedeUserID=3493087921833988; DedeUserID__ckMd5=f79b1c5b37110e69; b_ut=5; bp_video_offset_3493087921833988=undefined; PVID=1; SESSDATA=9b3f3db2%2C1691137529%2C03f5b%2A22; bili_jct=5587e773eeb7161f147d72322112dd01; b_lsid=46D44D61_18620C2A453; innersign=1; sid=nxtxooce",
        "origin":"https://www.bilibili.com",
        "user-agent":"Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Mobile Safari/537.36",
    }
    # Without a timeout, requests waits forever on a stalled connection.
    timeout = 10

    # Probe the address first so an unreachable link is reported to the
    # caller as "Error" instead of raising.
    try:
        requests.head(url=url, timeout=timeout)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        return "Error"

    # Parse the direct video address out of the page source.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    matches = re.findall('readyVideoUrl":"(.*?)","readyDuration', response.text)
    if not matches:
        # Same exception type as the original bare [0] lookup, but with a
        # message that says what went wrong.
        raise IndexError("readyVideoUrl not found in the page source of %r" % (url,))
    video_bytes = requests.get(url=matches[0], headers=headers, timeout=timeout).content

    # Write the downloaded data to disk; os.path.join keeps the path valid on
    # every platform (it is still video\<name>.mp4 on Windows).
    os.makedirs("video", exist_ok=True)
    with open(os.path.join("video", name + ".mp4"), mode="wb") as f:
        f.write(video_bytes)

2、抖音视频爬虫

import requests

import re

import os

# Create the output folder "video" if it does not exist yet.
# makedirs(exist_ok=True) avoids the check-then-create race of
# "if not exists: mkdir".
filename='video\\'

os.makedirs(filename, exist_ok=True)

#定义函数,供主函数调用

def TikTok(url,name):

    response = requests.get(url=url)

    response = response.url

#检查主函数输入的链接是否正确 

    try:

        id = re.findall('video/(.*)/\?', response)[0]

    except IndexError:

        return 1

    # https://www.douyin.com/video/7197438641520610595

    print(id)

    Url = 'https://www.douyin.com/video/' + id

#定制请求头

    headers = {

        

        'cookie': 'douyin.com; ttcid=7a9f94f5337c4fb6a4937e5937748bb021; passport_csrf_token=a0ac1ccb642a36adda5944f1c015d48e; passport_csrf_token_default=a0ac1ccb642a36adda5944f1c015d48e; s_v_web_id=verify_lcbu5s5w_28wEiVmM_z3Vd_40ES_B51w_NzAlySlNxpX2; xgplayer_user_id=242308523073; ttwid=1%7CC40qHPAKUiS-rZzMjoNnaRQLVAmjwFKkQYJKPFgoP8w%7C1675328733%7C7c9d6834b4963ac8874725a61b87e3dd9557431c57f3f7751fac875fdc5db078; d_ticket=3c3e91316b0d2f2293d28fe6652d905cc7869; passport_assist_user=CkEfjpnNrq_dwTxeUvZDgvxiJJGX4vK_V2bvkG4hXT2Y93RD7N7Wpv9DwPWJ2-RIiM6ryeR0t7a7jrfKZx1645n2bRpICjwfL4B7J61soinEUkLo0zIVrDu52cNfrSTRckp__Zh7a6qAaiefB0n-jw85LEuN7fTxx3zgjB_uycKsyg4Qrb-oDRiJr9ZUIgEDxmngyQ%3D%3D; n_mh=EyHH0OrPqAYMNqnG7-FHeaRIMyHdxcz5bczy1ihDzJY; sso_auth_status=44d73d0e0c92093e78edc25f7dbd4ffe; sso_auth_status_ss=44d73d0e0c92093e78edc25f7dbd4ffe; sso_uid_tt=ace0d555e37a065789e9cfd86cc68d49; sso_uid_tt_ss=ace0d555e37a065789e9cfd86cc68d49; toutiao_sso_user=4410a2614fa77ef6cbbe4c90d8319abe; toutiao_sso_user_ss=4410a2614fa77ef6cbbe4c90d8319abe; sid_ucp_sso_v1=1.0.0-KDIwNGI4ZjQzMDZjZWY3ZjVmOWIxNTExNjQwOTcyMmMzNTA4MDE4OTEKHwjohYGDiY3dAxD47_6eBhjvMSAMMISOpJQGOAJA8QcaAmxmIiA0NDEwYTI2MTRmYTc3ZWY2Y2JiZTRjOTBkODMxOWFiZQ; ssid_ucp_sso_v1=1.0.0-KDIwNGI4ZjQzMDZjZWY3ZjVmOWIxNTExNjQwOTcyMmMzNTA4MDE4OTEKHwjohYGDiY3dAxD47_6eBhjvMSAMMISOpJQGOAJA8QcaAmxmIiA0NDEwYTI2MTRmYTc3ZWY2Y2JiZTRjOTBkODMxOWFiZQ; odin_tt=07cb42e67dc0b9fa65d040b535cc327a7a483b2242828f4c84668fe6a4fcae69b5a3ef54e4dfbdba322ef19aab7c1e73a008921d15e1a48378be1dd2dfa28fb1; passport_auth_status=61c50bbfa9400bbf2fef96292e2465be%2Cded43c4315dd52db289d582351d64d63; passport_auth_status_ss=61c50bbfa9400bbf2fef96292e2465be%2Cded43c4315dd52db289d582351d64d63; uid_tt=159fc791276b24a5528a79ac5776dcf7; uid_tt_ss=159fc791276b24a5528a79ac5776dcf7; sid_tt=748a279e94b51380eefa350d30df8041; sessionid=748a279e94b51380eefa350d30df8041; sessionid_ss=748a279e94b51380eefa350d30df8041; _tea_utm_cache_2018=undefined; LOGIN_STATUS=1; store-region=cn-gs; 
store-region-src=uid; sid_guard=748a279e94b51380eefa350d30df8041%7C1675606018%7C5183990%7CThu%2C+06-Apr-2023+14%3A06%3A48+GMT; sid_ucp_v1=1.0.0-KGFjZTk1YjdlZTQzZjhjMmM0NDM4MDYxMGExNmJiNTQyYjBjZDZiYTEKGQjohYGDiY3dAxCC8P6eBhjvMSAMOAJA8QcaAmhsIiA3NDhhMjc5ZTk0YjUxMzgwZWVmYTM1MGQzMGRmODA0MQ; ssid_ucp_v1=1.0.0-KGFjZTk1YjdlZTQzZjhjMmM0NDM4MDYxMGExNmJiNTQyYjBjZDZiYTEKGQjohYGDiY3dAxCC8P6eBhjvMSAMOAJA8QcaAmhsIiA3NDhhMjc5ZTk0YjUxMzgwZWVmYTM1MGQzMGRmODA0MQ; download_guide=%223%2F20230205%22; FOLLOW_LIVE_POINT_INFO=%22MS4wLjABAAAAbAE0Tv5yvVMPsAjb-4wSSB90utPobsmULQ_7kgLejFDqnfufpNzDNwIfkvJpjuCt%2F1675699200000%2F1675620287678%2F0%2F1675647531415%22; SEARCH_RESULT_LIST_TYPE=%22single%22; FOLLOW_NUMBER_YELLOW_POINT_INFO=%22MS4wLjABAAAAbAE0Tv5yvVMPsAjb-4wSSB90utPobsmULQ_7kgLejFDqnfufpNzDNwIfkvJpjuCt%2F1675699200000%2F1675649307074%2F1675649217197%2F0%22; live_can_add_dy_2_desktop=%220%22; VIDEO_FILTER_MEMO_SELECT=%7B%22expireTime%22%3A1676254485724%2C%22type%22%3A1%7D; __ac_nonce=063e3575c005659758d70; __ac_signature=_02B4Z6wo00f01D6bw0AAAIDBLsUmeUz5Ijg-u8fAAGxMkIVlgPVhkXvACKDrW5PQhox9NT7.sU9JfmICX4vwHkzh6YJTURiVvfV0V6JSqJjgtexaAwvibswH5m4jxG-hbyvx.CQFY7vWHr9Obb; passport_fe_beating_status=true; csrf_session_id=7b1abe19e2b6358087568b75dd1a0f95; strategyABtestKey=%221675844690.634%22; home_can_add_dy_2_desktop=%221%22; msToken=GJXwPYvB3xxwqGpTA9SHiEyyNOtqkIOLQ-aC53WzuItS77HThruQXqUa8KWSorSeTMCWREe_-H06gJ1D4iOk4wV1iOiJT6wRTyo_nTX7c129ED0TB2BjmeLdw5qIWaQ=; msToken=6p8d3ygLZuKLiISQm_63XijKvLSI0sqW04sHI1LzOhZLbRhIaYsqS59QJwZs6y6eEmEYSAuTNpmz9BhVG0t5I1LuUvaWbBxZyrCjlItMH9yZm2RaYk9ZonDx62JygVw=; tt_scid=2FhmuwuvP-leuEyOg46jFNIcPED5l4jUxFsh3H9PwiHLvTImQ1lgmXM5N3.33RFac36f',

        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78',   }

    response = requests.get(url=Url, headers=headers)

    date = response.text

#解析重定向后的地址    

    try:

        html_date = re.findall('

你可能感兴趣的:(python爬虫,python,爬虫)