数据分析(一)百度指数,代码如下:

# -*- coding:utf-8 -*-
#coding=gbk
from urllib.parse import quote, unquote, urlencode
import csv,datetime
import time
import requests

# Anti-scraping setup / simulated login: Baidu Index requires a logged-in
# session, so every request carries a full browser-like header set including
# a valid login Cookie.
url = 'https://index.baidu.com/v2/index.html#/'
word = '新冠'  # search keyword to query (Chinese for "COVID-19")
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    # !!! Replace with your own cookies !!! — everything else can be copied as-is.
    'Cookie':'BIDUPSID=188ECF3688970AA816EFD7848E7C0F44; PSTM=1591517927; HMACCOUNT_BFESS=6E3C01ABA16BF895; __yjs_duid=1_923b4f0bc44c01f81654a1d6d3544c9e1619153600097; BAIDUID=AC784725B5B88D0700F1CB01DE9031BB:FG=1; BDUSS=TZVTU5GZjVUQ0ZMMkpRc0F4OWhBd2JXemtNeE1PbjhNUDFyTUpsc3lXVlFkV1pqSUFBQUFBJCQAAAAAAAAAAAEAAAB4QQOXy6fLpzEyMzA2MwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFDoPmNQ6D5je; MCITY=-%3A; BAIDUID_BFESS=AC784725B5B88D0700F1CB01DE9031BB:FG=1; ZFY=kjXXqzNsrP57jKcTmHkBRN9IDLkx1FBcsA1mc0sHpt0:C; BA_HECTOR=8001818k2k240g20a5akah771hrkh8f1l; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; PSINO=2; delPer=0; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; bdindexid=9dsdpp4535ale7knipi9cstbb7; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a04231644633NTBta7dDFTEZ3s5fcgtz%2F0H5s3z57ilrkjgZQdGZbM4Ka%2FwdZNY9HVoDAXj9Vjpi6yxuge5K6VrIGHmHOjXsVHbbUtOC7BA6bP8CWJDwAFjG3eEhrSS2wcMYuEOAv8XigLDpmBYr33hOF5vzG7inOC8mi1PK6zOXiVU09AejQHAlINuvH3x0WJ6VYAMhkL7Bo8Jqz3z9GJtPBJ2XqO3GAjjE%2F9ij3PVv3%2BlBxMas%2BPTXTFTjSRKIIRRIl5N0HV0FqcwWCJGr1TEZ6Vvd1%2FgQUMONsV%2FhryRRutU%2BTXWxR%2FQ%3D98134841434633228578078353368783; ___wk_scode_token=cF%2BoF04TSVdbah5YS%2ByoXCq4Sjj%2BnI99sSFtfa96cGc%3D; H_PS_PSSID=36543_37971_37646_37906_37625_36920_38035_37989_37929_38040_26350_37881; BDUSS_BFESS=TZVTU5GZjVUQ0ZMMkpRc0F4OWhBd2JXemtNeE1PbjhNUDFyTUpsc3lXVlFkV1pqSUFBQUFBJCQAAAAAAAAAAAEAAAB4QQOXy6fLpzEyMzA2MwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFDoPmNQ6D5je; ab_sr=1.0.1_ZGNjZGRhZDdhNjExYjI2MzVmYWI5NmFiYTM1ZDdlODExYTRlNjU1MTQxYmYwMDFkZmVkY2RiYzQ4ZTljZmQ5Yjc5MGQyMjM1N2MzMzljZGZlNGE1ZGVhNWYzNjg3N2U4NWEyMjgwNjAyMDBkY2Q4OTAzOGVjNmVjMmY2ZWZjOTMyM2IwYzFjODAzNzY1MGExNzA3NzA3ZWZlYTY4OGQ4Mg==; RT="z=1&dm=baidu.com&si=783f8bbd-e797-4dc9-9404-397bb91f4a35&ss=lcmviax6&sl=6&tt=9xi&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=768d"',
    'Host': 'index.baidu.com',
    'Referer': 'https://index.baidu.com/v2/main/index.html',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'sec-ch-ua-mobile': '?0',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'
}
# Data scraping: step forward one week at a time and append that week's
# related-word graph ("wordGraph") for `word` to a per-month CSV file.
#
# NOTE(review): the page URL defined above ('.../v2/index.html#/') returns
# HTML, so calling .json() on its response cannot succeed; the wordGraph
# JSON is served by the WordGraph API endpoint, which is queried here
# directly with the same params and login headers.
api_url = 'https://index.baidu.com/api/WordGraph/multi'
start_time = datetime.date(2022, 12, 1)  # one week before the first date to fetch
for _ in range(1, 3):  # number of weeks to fetch
    next_time = start_time + datetime.timedelta(7)  # advance one week
    if next_time.month == 6 and next_time.day == 22:  # hard stop date
        break
    # One CSV file per calendar month.  The original special-cased the month
    # rollover, but when start_time and next_time share a month their month
    # numbers are equal, so next_time.month names the right file in both
    # cases and a new month automatically starts a new file.
    with open('baiduzhishu' + str(next_time.month) + '.csv', 'a', newline='', encoding='gbk') as f:
        writer = csv.writer(f)
        params = {
            'wordlist[]': str(word),
            'datelist': next_time.strftime('%Y%m%d'),  # start date of the week
        }
        res_index = requests.get(api_url, params=params, headers=headers)
        json_index = res_index.json()
        print(json_index)  # debug: inspect the raw API response
        list_index = json_index['data']['wordlist'][0]['wordGraph']
        # Each wordGraph entry carries a related keyword under 'word';
        # write one keyword per CSV row.
        for entry in list_index:
            writer.writerow([entry['word']])
        time.sleep(15)  # throttle requests to avoid the anti-scraping ban
    start_time = next_time

你可能感兴趣的:(python)