爬虫爬热榜

视频有讲解

爬虫

# -*- coding: utf-8 -*-
# @Time : 2021/1/26 10:19
# @Author : 老七疯狂吸氧
# @file hotlist1.py
# @Software:PyCharm

import requests
import re
import time
import urllib.parse
def main():
    """Scrape every configured tophub.today ranking and save its entries.

    For each board the page is downloaded with ``get_html`` and parsed with
    ``saveurl``. The parsed result is used as a mapping whose keys are entry
    titles and whose values are indexable with at least two string fields.
    Up to the board's configured number of entries are formatted as
    ``"<rank>.<title>  <field0>  <field1>"`` and passed to ``savelist``
    together with the board name and today's date (``YYYY-MM-DD``).
    """
    # With no explicit time tuple, strftime formats the current local time.
    t = time.strftime('%Y-%m-%d')

    # Board name -> [page URL, number of entries to keep].
    urllist = {
        "微博今日热榜": ["https://tophub.today/n/KqndgxeLl9", 50],
        "虎嗅网热文榜": ["https://tophub.today/n/5VaobgvAj1", 15],
        "csdn技术区热帖": ["https://tophub.today/n/K7GdajgeQy", 50],
        "知乎热榜": ["https://tophub.today/n/mproPpoq6O", 50],
        "B站日榜": ["https://tophub.today/n/74KvxwokxM", 100],
        "six氪日榜": ["https://tophub.today/n/Q1Vd5Ko85R", 10],
        "吾爱破解日榜": ["https://tophub.today/n/NKGoRAzel6", 15],
        "豆瓣电影新片榜": ["https://tophub.today/n/mDOvnyBoEB", 10],
    }

    for key, (url, limit) in urllist.items():
        datalist = get_html(url)
        hotname = saveurl(datalist)
        # Slice instead of indexing 0..limit-1: a page that yields fewer
        # entries than `limit` would otherwise raise IndexError and abort
        # the remaining boards.
        entries = list(hotname.items())[:limit]
        for n, (title, fields) in enumerate(entries, start=1):
            content = str(n) + "." + title + "  " + fields[0] + "  " + fields[1]
            savelist(content, key, t)
        print("爬取", key, "完毕")
def get_html(url, timeout=10):
    """Download *url* with a single GET request and return the body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout an unresponsive server would
            block the script indefinitely.

    Returns:
        The decoded response body (``response.text``). The HTTP status code
        is not checked, so an error page is returned as-is.
    """
    headers = {
        # Replace with your own browser's User-Agent string if needed.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
def saveurl(baseurl):     #筛选内容
    findlink=re.compile(r'(.*?)')
    findlink2=re.compile(r'(.*?)')
    findlink3=re.compile(r'
    next_url = urllib.parse.urljoin(url , get_url )
    return next_url
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()


如果觉得好用,可以点个赞、加个关注。

你可能感兴趣的:(笔记,python,爬虫)