今日头条爬取

第一部分爬取user-agent到本地并且随机读取一个

import requests
import json
#从网站读取并且保存
def write_browser_info_to_file():
    """Download the fake-useragent browser catalogue and cache it locally.

    NOTE(review): the raw response *text* is passed to ``json.dump``, so the
    cache file ends up double-encoded (a JSON string whose content is JSON).
    The reader therefore has to decode twice; kept as-is so existing readers
    of ``browser_info.json`` keep working.
    """
    response = requests.get("https://fake-useragent.herokuapp.com/browsers/0.1.11")
    with open("browser_info.json", "w") as cache:
        json.dump(response.text, cache)
#write_browser_info_to_file()      #第一次运行代码时需要它,以后把它注释掉
#从本地读取
import random
def get_random_browser():
    """Pick a random user-agent string from the locally cached browser list.

    Reads ``browser_info.json`` written by ``write_browser_info_to_file``.

    Returns:
        str: one user-agent string, chosen uniformly at random across the
        cached browser families and their UA lists.
    """
    with open("browser_info.json", "r") as f:
        # The cache is double-encoded (the raw response text was dumped as a
        # JSON string): load() yields a str, loads() yields the dict.
        browsers_json = json.loads(json.load(f))
    browsers = browsers_json["browsers"]
    # Bug fix: the original mapped randint(0, len(browsers)) (upper bound
    # inclusive, so "safari" was double-weighted) onto hard-coded names, then
    # indexed the UA list with randint(0, len(browsers_name)-1) — the length
    # of the *key string* (e.g. 6 for "chrome"), not of the UA list, which
    # biased the pick and could raise IndexError on short lists.
    browser_name = random.choice(list(browsers))
    return random.choice(browsers[browser_name])
# Smoke test: pick one user-agent (result is discarded).
get_random_browser()

第二部分爬取今日头条当日新闻

import requests
import  time
import json
from utils import get_random_browser
#准备以及获取一些必要参数
def get_request_url_and_headers(user_agent=None):
    """Build the Toutiao feed request URL, headers and proxy mapping.

    Args:
        user_agent: optional user-agent string; when None (the default,
            preserving the original behavior), a random one is picked via
            ``get_random_browser()``.

    Returns:
        tuple: ``(request_url, headers, proxies)`` ready for ``requests.get``.
    """
    if user_agent is None:
        user_agent = get_random_browser()
    headers = {"user-agent": user_agent}
    # max_behot_time = "now", so the feed starts from the latest posts.
    current_time = int(time.time())
    # Bug fix: the original used a backslash continuation *inside* the string
    # literal, which spliced the next line's leading spaces into the URL.
    # NOTE(review): as/cp/_signature are anti-crawler tokens captured from a
    # browser session; they expire and must be refreshed periodically.
    base_url = (
        "https://www.toutiao.com/api/pc/feed/?"
        "max_behot_time=" + str(current_time) + "&"
        "category=__all__&utm_source=toutiao&widen=1&"
        "tadrequire=true&as=A1954D857F84D3F&"
        "cp=5D5FB45D63AF4E1&_signature=RWvYahAcGDZMJ6J.h5y2iEVr2H"
    )
    # Bug fix: requests expects scheme keys ("http"/"https") in the proxies
    # mapping; the original {"url": ...} entry was silently ignored.
    proxies = {"http": "http://114.235.23.172:9000"}
    return base_url, headers, proxies
#爬取
def get_respense_html():
    """Fetch the Toutiao feed and return the decoded JSON payload.

    Retries with a fresh URL/headers/proxy when the API answers with
    ``message == "error"`` (the anti-crawler check rejecting the request),
    but with a bounded number of attempts instead of the original's
    unbounded recursion.  The payload is also stored in the module-level
    ``response_json`` global so ``data_to_file()`` can read it.
    """
    global response_json  # shared with data_to_file(); kept for compatibility
    # Bug fix: the original printed response_json *before* its first
    # assignment, raising NameError on the first call.
    for _ in range(10):  # bounded retries instead of unbounded recursion
        request_url, headers, proxies = get_request_url_and_headers()
        response = requests.get(request_url, headers=headers, proxies=proxies)
        response_json = json.loads(response.text)
        # .get() avoids a KeyError when the response has no "message" key.
        if response_json.get("message") != "error":
            break
    return response_json

#保存数据到当地
def data_to_file(payload=None):
    """Append each feed entry to ``toutiao.json``, one JSON object per line.

    Args:
        payload: optional response dict containing a ``"data"`` list; when
            None (the default, preserving the original behavior), falls back
            to the module-level ``response_json`` set by ``get_respense_html``.
    """
    if payload is None:
        payload = response_json
    # Open the file once instead of re-opening it for every entry, and pin
    # the encoding since ensure_ascii=False writes raw non-ASCII characters.
    with open("toutiao.json", "a+", encoding="utf-8") as f:
        for entry in payload["data"]:
            json.dump(entry, f, ensure_ascii=False)
            f.write("\n")
# Script entry: fetch today's feed, echo it, and append entries to toutiao.json.
json_content=get_respense_html()
print(json_content)
data_to_file()

import pandas as pd
# Also produce an Excel copy: read the newline-delimited JSON dump
# (one object per line) back into a DataFrame and write toutiao.xlsx.
df = pd.read_json("toutiao.json",lines=True)
print(df)
df.to_excel("toutiao.xlsx")

你可能感兴趣的:(今日头条爬取)