【源码】爬虫---西瓜视频

1、瀑布流获取

import requests
import json
import time
import math
import hashlib
import re
import random
from zlib import crc32
from bs4 import BeautifulSoup

# Known channel ids:
#   61887739373 -> film & TV
#   6797027941  -> recommended
def get_url(channelId = '6797027941'):
    """Build the feed API URL for a channel, print it and return it.

    ``channelId`` is substituted into the ``channelId`` query parameter;
    ``count`` is fixed at 18 and ``maxTime`` is left empty.
    """
    endpoint = 'https://www.ixigua.com/api/feedv2/feedById'
    query = '?&channelId={}&count=18&maxTime='.format(channelId)
    feed_url = endpoint + query
    print(feed_url)
    return feed_url

# Module-level placeholder, initialised to an empty string. Nothing in the
# visible code reads it; presumably meant to hold a "tt_webid" cookie value
# -- TODO confirm or remove.
tt_webid = ""

def get_item(url):
    """Fetch one page of the channel feed and record every video in it.

    For each feed entry the title and detail-page URL are appended to
    ``bbb.txt`` via ``writer``, printed, and handed to ``getinfo``. The loop
    sleeps one second after each entry.

    Args:
        url: Feed API URL, e.g. the value returned by ``get_url``.

    Returns:
        0 when the response's ``BaseResp.StatusMessage`` equals ``'error'``,
        otherwise None.

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
        ValueError: when the response body is not valid JSON.
        KeyError: when the JSON lacks the expected ``data``/``channelFeed``
            structure.
    """
    headers = {'user-agent':"Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Mobile Safari/537.36"}
    # A timeout keeps a stalled connection from hanging the crawler forever.
    # The former cookies='' argument was a no-op of the wrong type and is dropped.
    resp = requests.get(url, headers=headers, timeout=10)
    payload = json.loads(resp.content.decode('utf-8'))

    channel_feed = payload['data']['channelFeed']
    if channel_feed['BaseResp']['StatusMessage'] == 'error':
        return 0

    # Treat a missing or null 'Data' field as an empty page instead of crashing.
    for news in channel_feed.get('Data') or []:
        title = news['videoTitle']
        # format() also copes with a numeric videoId, unlike str concatenation.
        news_url = "https://www.ixigua.com/i{}".format(news['videoId'])
        writer("bbb.txt", title, news_url)

        print(title, news_url)
        getinfo(news_url)
        # Pause after each entry before handling the next one.
        time.sleep(1)
    return

# Append one record to a text file
def writer(filename, content, source='', time='', tags=''):
    """Append a formatted record to *filename* (opened in append mode, UTF-8).

    The record consists of a content section, then author (*source*),
    time and tags lines, and finally a separator line followed by a blank
    line. Note that the *time* parameter shadows the ``time`` module inside
    this function.
    """
    labelled_fields = (('作者:', source), ('时间:', time), ('标签:', tags))
    with open(filename, 'a', encoding='utf-8') as out:
        out.write('内容:' + '\n')
        # writelines accepts a plain string as well as an iterable of strings
        out.writelines(content)
        out.write('\n\n')
        for label, value in labelled_fields:
            out.write(label + value + '\n')
        out.write('------------------------分割线------------------------' + '\n\n')

def getinfo(video_id):
    """Placeholder hook: ignores *video_id*, does nothing and returns None."""
    return None

def main(refresh = 10):
    """Fetch the default channel feed ``refresh + 1`` times.

    Each round prints its zero-based index, builds the feed URL with
    ``get_url`` and processes it with ``get_item``.
    """
    for round_no in range(refresh + 1):
        print(str(round_no))
        get_item(get_url())

# Run the feed crawler only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

2、详情url获取

# coding:utf8
#参考:https://www.jianshu.com/p/6ca1344a09db
import re
import requests
import random
from zlib import crc32
from base64 import b64decode

# Request headers shared by every HTTP call in this script.
# The User-Agent token is "KHTML" (it was previously misspelled "KHTHL").
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
}
    
def get_video_url_api(video_id='v02004190000bjqdaaq0ifkkoafl5rg0'):
    """Build the signed API URL whose response carries the video address.

    The request path gets a random numeric ``r`` query parameter, and the
    CRC32 of path-plus-query is appended as the ``s`` parameter.

    Args:
        video_id: Video id as returned by ``get_video_id``.

    Returns:
        The full ``https://ib.365yg.com/...`` URL as a string.
    """
    # Draw the digits directly: str(random.random())[2:] degrades to text such
    # as '-05' when a tiny float is rendered in scientific notation.
    r = str(random.randrange(10 ** 16))
    # The API version segment is the digit 1 ("/v/1/"), not the letter "l".
    url_part = "/video/urls/v/1/toutiao/mp4/{}?r={}".format(video_id, r)
    # Mask to an unsigned 32-bit value so the signature is never negative.
    s = crc32(url_part.encode()) & 0xffffffff
    return "https://ib.365yg.com{}&s={}".format(url_part, s)

def get_video_url(url):
    """Fetch the API response at *url* and return the decoded video address.

    The address is read from ``data.video_list.video_1.main_url`` in the JSON
    body, where it is stored base64-encoded.

    Args:
        url: Signed API URL, e.g. the value returned by ``get_video_url_api``.

    Returns:
        The base64-decoded ``main_url`` as a string.

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
        KeyError: when the JSON lacks the expected structure.
    """
    # A timeout keeps a stalled connection from blocking the script forever.
    resp = requests.get(url, headers=headers, timeout=10)
    j_resp = resp.json()
    encoded_url = j_resp['data']['video_list']['video_1']['main_url']
    return b64decode(encoded_url.encode()).decode()

def get_video_id(url):
    """Download the page at *url* and extract the video id embedded in it.

    The id is taken from the first ``"vid":"...",`` occurrence in the
    response text and printed before being returned.

    Args:
        url: Address of the video detail page.

    Returns:
        The video id string, or None when the page contains no ``"vid"``
        entry. The caller ``main`` tests for None, but the previous code
        raised AttributeError on a failed match instead.

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
    """
    # A timeout keeps a stalled connection from blocking the script forever.
    resp = requests.get(url, headers=headers, timeout=10)
    match = re.search(r'"vid":"([^"]+)",', resp.text)
    if match is None:
        return None
    video_id = match.group(1)
    print(video_id)
    return video_id

def main():
    """Resolve and print the playable address of one hard-coded video page.

    Steps: extract the video id from the page, build the signed API URL,
    print that URL (as bytes), then fetch and print the decoded video address.
    Prints an error message and stops early when no video id is obtained.
    """
    url = "https://www.ixigua.com/i6705550728884142600/"
    video_id = get_video_id(url)
    # Identity comparison is the correct test for None.
    if video_id is None:
        print("get video_id error")
        return
    video_url_api = get_video_url_api(video_id)
    print(video_url_api.encode())
    video_url = get_video_url(video_url_api)
    print(video_url)

# Run the video-URL lookup only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

结束语

以上就是全部源码,希望对你有用。欢迎大家关注我们微信公众号,来交流程序员的技术。如果能留言或者点个赞,我也是很开心的,非常感谢!

你可能感兴趣的:(python,文章附属文件)