Crawling the Baidu Index with Python
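
The script below pulls daily Baidu Index (search-volume) data for one keyword: it requests the SearchApi endpoint with logged-in cookies, reads the encrypted daily series and its uniqid from the JSON response, fetches the matching ptbk decryption key, decodes the series with a simple character-substitution table, and writes the values to a CSV.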

import datetime
import requests
import sys
import time
import json
import pandas as pd
import numpy as np

# thumbnail endpoint for a keyword's index card; defined here but not used below
word_url = 'http://index.baidu.com/api/SearchApi/thumbnail?area=0&word={}'

def get_html(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
        "Host": "index.baidu.com",
        "Referer": "http://index.baidu.com/v2/main/index.html",
        "Cipher-Text": "1652425237825_1652501356206_VBpwl9UG8Dvs2fAi91KToRTSAP7sDsQU5phHL97raPDFJdYz3fHf9hBAQrGGCs+qJoP7yb44Uvf91F7vqJLVL0tKnIWE+W3jXAI30xx340rhcwUDQZ162FPAe0a1jsCluJRmMLZtiIplubGMW/QoE/0Pw+2caH39Ok8IsudE4wGLBUdYg1/bKl4MGwLrJZ7H6wbhR0vT5X0OdCX4bMJE7vcwRCSGquRjam03pWDGZ51X15fOlO0qMZ2kqa3BmxwNlfEZ81l3L9nZdrc3/Tl4+mNpaLM7vA5WNEQhTBoDVZs6GBRcJc/FSjd6e4aFGAiCp1Y8MD66chTiykjIN51s7gbJ44JfVS0NjBnsvuF55bs="
    }
    # requests' `cookies` argument expects name/value pairs, so the raw
    # Cookie header string (copied from a logged-in browser session; replace
    # it with your own) is split into a dict first
    raw_cookie = "__bid_n=18379203a6589298324207; BIDUPSID=2E02EE98DCCE13B3871E90D13B293B90; PSTM=1667567321; FEID=v10-907dcd0a5cc6fa5de7eb6ddc03204ab967433e10; BDUSS=UNWM2dOeDZhU215U09IaUVISHp1UjdsNUEzTVJHYmlRdkFVWFRwSGpRN1JjY0pqSVFBQUFBJCQAAAAAAAAAAAEAAAAEyXrqAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANHkmmPR5JpjM; BAIDU_WISE_UID=wapp_1671101300284_101; Hm_up_d101ea4d2a5c67dab98251f0b5de24dc=%7B%22uid_%22%3A%7B%22value%22%3A%223933915396%22%2C%22scope%22%3A1%7D%7D; BAIDUID=16C58FBA50EF47EF0A64399091E5A874:FG=1; BAIDUID_BFESS=16C58FBA50EF47EF0A64399091E5A874:FG=1; __xaf_fpstarttimer__=1672804972789; __xaf_thstime__=1672804973003; FPTOKEN=Ld+1YJ0/2IOF8n5w0o9hNT9x6Dgh37ZoNq6WLF+HyHg0O78kbrB2WuS6INjPP+1FBoMs/grBXUsbMe94Tcye08EvzsRFC2lD8+it5o50bMEnjRMzvEHqSO4EXjIJxCUdY8IvYeYgOK9IQVcGLklv7nEupfxOC2ncG7YSHLLczZPlFutqxJfGbs58AFqien3endohJmslnornN5poweWkL0TgmE1al+fqpEhxenk3vhGXJBxfdnvJX7cdtVJnt+N2p+CNnKvE7PtC8jDMs/BSVA01hw/vqjNYtVdLc7J+x7A0vpx4w/ZszYgPOA+ciMXDrz+kKFb353Zmv4Eof09tccc5n9qg/bdakV3Q7G3vzPu7ABZYQB+Y+J+qQZQvrDiAwe5Y/JzFlNCb/jAWKZm07w==|+zObTsWM9XyZ2tS/c7u+nKD0/ryhdm0WbmNhXuttlXM=|10|99a32f4aa90fa36396e141f8e300f55a; __xaf_fptokentimer__=1672804973023; Hm_lvt_d101ea4d2a5c67dab98251f0b5de24dc=1672218950,1672832223; bdindexid=taqk72m8ot9a3r7inhsu68pvj7; ZD_ENTRY=bing; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a04228953144t9K20c2FoG8ggPbsWRxmrGL9e3rTAF6eV4RJpk2phsJdVN9JSOlIBDGmvpI073W%2FlQOooVmfokGXaOUpF8Pul3BX2s4CiZjN%2FS036h7uD9dHHWwAmZQJYrlk2deNccShmF3ydoajbR612EEhqFINPyny0KROmA6zCGO1tObUWfjlPZ7SqlBHS6zqsOW47Xl7G%2FcoNrunrO1HT2M6dxm3uvmmtiHAPnVPB1e0%2B5gHL2rEy4C02%2FM88MFy87n%2F7lG%2BJZjwO8ITbgTbQgM3G7hZnxEIGuhnVOu%2Buu%2BaZaMDbFQ%3D18035950298843985659026702870995; __cas__rn__=422895314; __cas__st__212=187a74c8ca54618d46bf34074a2d06fa782febc8b31a4f0e29ffc36fc72e2394448db3087327d0a15f86363e; __cas__id__212=44855536; CPID_212=44855536; CPTK_212=266828396; RT=z=1&dm=baidu.com&si=a0738dc7-5b13-4a2c-9aa4-9be92b5bdc68&ss=lcifbk18&sl=0&tt=0&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf; Hm_lpvt_d101ea4d2a5c67dab98251f0b5de24dc=1672891956; BDUSS_BFESS=UNWM2dOeDZhU215U09IaUVISHp1UjdsNUEzTVJHYmlRdkFVWFRwSGpRN1JjY0pqSVFBQUFBJCQAAAAAAAAAAAEAAAAEyXrqAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANHkmmPR5JpjM; ab_sr=1.0.1_OTViZTdmNjAwZjljZTQwN2Q4OWQ0NWM0ODgwNzFlNDRjMThhNjlkOGY3MjFiMDQ5ZGZjMGVmNmQ4ZDEzYWJmMjI5ZThiM2NjZDI4N2E0MDcyMTkwNDYxZTVkMjU2Mjc5YzU4MTk4ZmQ2NDhhZWYxZDdmNDY2ZDFjNjU1YzhlOGFhYjQzOTRjYTNhNGMxZTFhN2Y5MTNiMGNhNWUzMzg1Mg=="
    cookies = dict(pair.split('=', 1) for pair in raw_cookie.split('; '))
    response = requests.get(url, headers=headers, cookies=cookies)
    return response.text
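
# Baidu answers with a JSON payload whose "status" is non-zero and carries no
# real "data" once the cookies above expire; a small sanity check (assuming
# that response shape) avoids a confusing KeyError in get_data below:
def is_logged_in(resp_text):
    payload = json.loads(resp_text)
    return payload.get('status') == 0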


def decrypt(t, e):
    # t is the ptbk key, e the encrypted series. The second half of the key
    # holds the plain characters and the first half the cipher characters
    # that stand for them; build the mapping, then translate e through it.
    n = list(t)
    a = {}
    result = []
    ln = len(n) // 2
    start = n[ln:]  # plain characters
    end = n[:ln]    # cipher characters
    for j, k in zip(start, end):
        a.update({k: j})
    for j in e:
        result.append(a.get(j))
    return ''.join(result)
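
# Toy check with a made-up key: 'abcd0123' maps 'a'->'0', 'b'->'1', 'c'->'2',
# 'd'->'3', so the ciphertext 'badc' decodes to '1032'. (The real key also
# covers ',', since Baidu's daily series is comma-separated.)
assert decrypt('abcd0123', 'badc') == '1032'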


def get_ptbk(uniqid):
    # fetch the decryption key that pairs with this response's uniqid
    url = 'http://index.baidu.com/Interface/ptbk?uniqid={}'
    resp = get_html(url.format(uniqid))
    return json.loads(resp)['data']


def get_data(keyword, start='2011-01-02', end='2022-01-02'):
    url = "https://index.baidu.com/api/SearchApi/index?area=0&word=[[%7B%22name%22:%22{}%22,%22wordType%22:1%7D]]&startDate={}&endDate={}".format(keyword, start, end)
    data = get_html(url)
    data = json.loads(data)
    uniqid = data['data']['uniqid']  # pairs this response with its ptbk key
    data = data['data']['userIndexes'][0]['all']['data']  # encrypted series
    ptbk = get_ptbk(uniqid)
    result = decrypt(ptbk, data)
    result = result.split(',')
    # split the function's own parameters, not the globals from __main__
    start = start.split("-")
    end = end.split("-")
    a = datetime.date(int(start[0]), int(start[1]), int(start[2]))
    b = datetime.date(int(end[0]), int(end[1]), int(end[2]))
    node = 0
    y=[]
    for i in range(a.toordinal(), b.toordinal() + 1):  # the decrypted series covers both endpoints
        date = datetime.date.fromordinal(i)
        print(date, result[node])
        y.append(result[node])
        node += 1

    y = np.array(y).flatten()
    y = pd.DataFrame(y)
    # adjust the output path to match the keyword and date range
    y.to_csv(r'D:\alldata\pythonfiles\四姑娘山\百度指数数据\四姑娘山-百度20190101-20200101.csv')
    



if __name__ == '__main__':
    keyword = "四姑娘山"
    start_date = "2019-01-01"
    end_date = "2020-01-01"
    get_data(keyword, start_date, end_date)
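
    # Sketch of a variation: pandas can pair each value with its date, so the
    # CSV carries dates too (hypothetical, not what get_data above does):
    #   dates = pd.date_range(start_date, end_date)  # inclusive of both ends
    #   df = pd.DataFrame({'date': dates, 'index': values})
    #   df.to_csv('siguniangshan_with_dates.csv', index=False)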


