360指数爬取

起因是正准备跳槽的我被一位 HR 问到有没有爬过 360 指数。我回答说没有,但如果这是考核标准,我完全可以抽空爬爬看。结果那位 HR 之后就没有再回复我,好气哦!!!闲来无事,我打开 360 指数看了看,发现确实有一定的反爬措施:一是需要登录,二是部分指数以图片形式展示,需要通过图像识别才能取得。还行,360 还是留了点余地的。

1.登陆

我直接使用浏览器登录后得到的 Cookie。

2.指数识别

像图片中的这个指数 82032,它其实是用图片展示出来的,而不是文本。

360指数爬取_第1张图片

360指数爬取_第2张图片

https://trends.so.com/index/csssprite?q=肖战&area=全国&from=20190916&to=20191015&click=13&t=index

#click是第13个点,从20190916到20191015是30个点,从1开始数
#t是类别 index是关注趋势 media是曝光量
#得到的数据 
{
    "status":0,
    "data":{
        "肖战":{
            "css":"",
            "img":""
        }
    },
    "msg":"success"
}

得到的数据中,img 是一张 base64 编码的图片。

在zhishu.js中找到了处理的函数,就是根据定位只露出相应的文字

function E(e, t, n, r, s, u) {
        e.$http.post("/index/csssprite?q=" + o.WIN.encodeURIComponent(t.join(",")) + "&area=" + (e[n.mod + "Param"] && e[n.mod + "Param"].region || "\u5168\u56fd") + "&from=" + n.period.from.split("-").join("") + "&to=" + n.period.to.split("-").join("") + "&click=" + r + "&t=" + (/^trend/.test(n.mod) ? "index" : n.mod) + (e.tk ? "&tk=" + e.tk : ""), {
            before: function(t) {
                this.spriteRequest && this.spriteRequest.abort(),
                this.spriteRequest = t
            }
        }).then(function(e) {
            if (e.body && e.body.status === 0 && e.body.data) {
                var t = e.body.data
                  , r = "";
                n.data.forEach(function(e) {
                    var n = e.query;
                    if (t[n] && t[n].css && t[n].img) {
                        r || (r += '
    '); var s = n.split("+").map(function(e) { return (0, i.cutStr)(e, v, "...") }).join("+"); r += '
  • ' + s + '
    ' + t[n].css + "
  • " } else r || (r += '
      '), r += "
    • " }); if (r) { r += "
    ", s.innerHTML = r; var o = s.querySelectorAll("span") , a = 0 , f = o.length; for (; a < f; a++) w({ backgroundImage: "url(" + o[a].parentNode.getAttribute("data-img") + ")", height: h + "px" }, o[a]) } } else s.innerHTML = '
    \u6682\u65e0\u6570\u636e
    ' }, function() { s.innerHTML = '
    \u6682\u65e0\u6570\u636e
    ' }) }

 识别文字的函数,用了pytesseract,可以自行安装一下,需要识别的文字工工整整挺好识别的。

这个图片真的特别小:把数字按定位裁剪下来并拼接到一起之后,仍然识别不出来;放大三倍之后,pytesseract 就能成功识别了,而且准确率非常高,我还没有遇到识别出错的情况。

def get_rep_json(url):
    """
    Request ``url`` with the session headers and return the decoded JSON body.

    The decoded body is also printed to stdout.

    :param url: full URL of the endpoint to request
    :return: the value produced by ``response.json()``
    """
    headers = {
        # Paste the Cookie header of a logged-in browser session here.
        "Cookie": "",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    }
    # requests applies no timeout by default, so a stalled connection would
    # block the crawl forever; fail after 10 seconds instead.
    response = requests.get(url, headers=headers, timeout=10)
    response_data = response.json()
    print(response_data)
    return response_data

class Base4ToNumber:
    """
    Recover an index value that 360 Trends renders as a CSS sprite.

    The ``csssprite`` endpoint answers with a base64 data-URI image plus a CSS
    fragment whose ``background-position`` rules select glyphs from that
    image. This class fetches both, crops the glyphs, pastes them side by side
    and runs OCR on the result.
    """

    # Horizontal distance, in pixels, between glyphs pasted on the OCR canvas.
    _GLYPH_PITCH = 20
    # Enlargement factor applied to every cropped glyph before OCR.
    _SCALE = 3

    def __init__(self, q, area, fr, to):
        """
        :param q: keyword string, sent verbatim as the ``q`` URL parameter
        :param area: region name, sent verbatim as ``area``
        :param fr: start of the period, sent verbatim as ``from``
        :param to: end of the period, sent verbatim as ``to``
        """
        self.q = q
        self.area = area
        self.fr = fr
        self.to = to
        # Filled in by get_image_base4() / base4_to_image().
        self.data_item = None
        self.image_bg = None

    def base4_to_image(self):
        """
        Decode the data-URI in ``self.data_item["img"]`` into ``self.image_bg``.
        """
        data_uri = self.data_item["img"]
        # Keep only the part after the first comma (the base64 payload).
        payload = data_uri.split(',')[1]
        self.image_bg = Image.open(BytesIO(base64.b64decode(payload)))

    @staticmethod
    def _parse_px(token):
        """Convert one numeric CSS token to a number without evaluating it as code."""
        token = token.strip()
        try:
            return int(token)
        except ValueError:
            return float(token)

    @staticmethod
    def _parse_positions(css):
        """Return one list of absolute pixel offsets per ``background-position`` rule in ``css``."""
        positions = []
        for raw in re.findall("""background-position:(.*?)['";]""", css):
            # The CSS comes from the server, so parse the numbers explicitly
            # instead of handing the text to eval().
            positions.append(
                [abs(Base4ToNumber._parse_px(token))
                 for token in raw.strip().split("px") if token.strip()]
            )
        return positions

    def get_number_for_img(self):
        """
        Crop the glyphs selected by the CSS out of ``self.image_bg`` and OCR them.

        Reads ``self.data_item["css"]`` and ``self.image_bg``; both the stitched
        canvas and ``self.image_bg`` are closed before returning, even when OCR
        fails.

        :return: the text returned by ``pytesseract.image_to_string``
        """
        positions = self._parse_positions(self.data_item["css"])

        canvas = Image.new("RGBA", (200, 38), (255, 255, 255))
        try:
            for i, position in enumerate(positions):
                # NOTE(review): the right edge of the crop is the sum of the
                # parsed offsets; presumably the second offset equals the glyph
                # width - confirm against a live response.
                glyph = self.image_bg.crop(
                    (position[0], 0, sum(position), self.image_bg.height)
                )
                # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the
                # same filter under its current name.
                glyph = glyph.resize(
                    (glyph.width * self._SCALE, glyph.height * self._SCALE),
                    Image.LANCZOS,
                )
                canvas.paste(glyph, (i * self._GLYPH_PITCH, 0))

            return pytesseract.image_to_string(canvas)
        finally:
            canvas.close()
            self.image_bg.close()

    def get_image_base4(self, index, t):
        """
        Fetch the sprite for one data point and return the recognised text.

        Example response shape::

            {"status": 0,
             "data": {"<q>": {"css": "...", "img": "data:image/png;base64,..."}},
             "msg": "success"}

        :param index: position of the data point within the period, counted from 1
        :param t: series type sent as ``t``
        :return: the OCR text for that data point
        """
        url = f"https://trends.so.com/index/csssprite?q={self.q}&area={self.area}&from={self.fr}&to={self.to}&click={index}&t={t}"
        response_data = get_rep_json(url)
        self.data_item = response_data["data"][self.q]

        self.base4_to_image()
        return self.get_number_for_img()

这俩问题解决了就没啥问题了,总结一下数据接口

 

关注度:https://trends.so.com/index/overviewJson?area=全国&q=肖战
关注趋势:https://trends.so.com/index/soIndexJson?area=全国&q=肖战&from=20181016&to=20191015&s=0
曝光量:https://trends.so.com/index/soMediaJson?q=肖战&from=20181016&to=20191015&s=0
24小时关注:https://trends.so.com/index/indexqueryhour?q=肖战&t=30
需求分布:https://trends.so.com/index/radarJson?t=30&q=肖战
用户画像:https://trends.so.com/index/indexquerygraph?t=30&area=全国&q=肖战

 

360指数爬取_第3张图片

360指数爬取_第4张图片

从返回的数据中得到 step(步长),然后从 from 日期开始按步长逐个获取指数;曝光量同理。

    def get_soIndexJson(self):
        """
        Fetch the attention-trend series into ``self.soIndexList``.

        One ``{"date", "value"}`` dict is appended per step between ``self.fr``
        and ``self.to`` (both included). ``value`` is the OCR text of the
        sprite for that point. ``date`` is the start date when the step is one
        day, otherwise a ``"<from>-<to>"`` string. The list stays empty when
        the response status is not 0.

        :raises ValueError: if the response reports a step smaller than one day
        """
        url = f"https://trends.so.com/index/soIndexJson?area={self.area}&q={self.q}&from={self.strftime(self.fr)}&to={self.strftime(self.to)}&s=0"
        response = get_rep_json(url)
        self.soIndexList = []

        if response['status'] != 0:
            return

        step_days = response['data']['step']
        if step_days < 1:
            # A zero or negative step would never reach self.to.
            raise ValueError(f"unexpected step: {step_days!r}")
        one_step = datetime.timedelta(days=step_days)

        fr = self.fr
        index = 0
        # Inclusive bound: the window [fr, to] counts both end days, so
        # stopping at `fr < to` would skip the final data point.
        while fr <= self.to:
            index += 1
            number = self.number_img.get_image_base4(index, "index")
            to = fr + one_step
            date = fr if step_days == 1 else f"{self.strftime(fr)}-{self.strftime(to)}"
            self.soIndexList.append({
                "date": date,
                "value": number
            })
            fr = to

 

用户画像收到数据,年龄和省份可以在zhishu.js中找到对应

360指数爬取_第5张图片

360指数爬取_第6张图片

完整代码(没有存储)

import pytesseract
import base64
from PIL import Image
from io import BytesIO
import re
import requests
import datetime



# Region code (two-digit string) -> region name, used to label the
# "province" entries of the user-profile response.
PROVINCE = {
    '01': '北京', '03': '天津', '14': '河北', '16': '山西', '07': '内蒙古',
    '17': '辽宁', '18': '吉林', '19': '黑龙江',
    '02': '上海', '20': '江苏', '21': '浙江', '23': '安徽', '24': '福建', '22': '江西', '15': '山东',
    '13': '河南', '08': '湖北', '09': '湖南', '10': '广东', '11': '广西', '12': '海南',
    '04': '重庆', '25': '四川', '26': '贵州', '27': '云南', '05': '西藏',
    '28': '陕西', '29': '甘肃', '30': '青海', '31': '宁夏', '06': '新疆',
    '32': '香港', '33': '澳门', '34': '台湾',
}

# Age-bracket code (two-digit string) -> bracket label, used to label the
# "age" entries of the user-profile response.
AGE = {
    "01": "18\u5c81\u53ca\u4ee5\u4e0b",
    "02": "19-24\u5c81",
    "03": "25-34\u5c81",
    "04": "35-49\u5c81",
    "05": "50\u5c81\u53ca\u4ee5\u4e0a",
}
def get_rep_json(url):
    """
    Request ``url`` with the session headers and return the decoded JSON body.

    The decoded body is also printed to stdout.

    :param url: full URL of the endpoint to request
    :return: the value produced by ``response.json()``
    """
    headers = {
        # Paste the Cookie header of a logged-in browser session here.
        "Cookie": "",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    }
    # requests applies no timeout by default, so a stalled connection would
    # block the crawl forever; fail after 10 seconds instead.
    response = requests.get(url, headers=headers, timeout=10)
    response_data = response.json()
    print(response_data)
    return response_data


def yesterday():
    """
    Return the calendar date one day before today.

    :return: ``datetime.date.today()`` minus one day
    """
    return datetime.date.today() - datetime.timedelta(days=1)


class Base4ToNumber:
    """
    Recover an index value that 360 Trends renders as a CSS sprite.

    The ``csssprite`` endpoint answers with a base64 data-URI image plus a CSS
    fragment whose ``background-position`` rules select glyphs from that
    image. This class fetches both, crops the glyphs, pastes them side by side
    and runs OCR on the result.
    """

    # Horizontal distance, in pixels, between glyphs pasted on the OCR canvas.
    _GLYPH_PITCH = 20
    # Enlargement factor applied to every cropped glyph before OCR.
    _SCALE = 3

    def __init__(self, q, area, fr, to):
        """
        :param q: keyword string, sent verbatim as the ``q`` URL parameter
        :param area: region name, sent verbatim as ``area``
        :param fr: start of the period, sent verbatim as ``from``
        :param to: end of the period, sent verbatim as ``to``
        """
        self.q = q
        self.area = area
        self.fr = fr
        self.to = to
        # Filled in by get_image_base4() / base4_to_image().
        self.data_item = None
        self.image_bg = None

    def base4_to_image(self):
        """
        Decode the data-URI in ``self.data_item["img"]`` into ``self.image_bg``.
        """
        data_uri = self.data_item["img"]
        # Keep only the part after the first comma (the base64 payload).
        payload = data_uri.split(',')[1]
        self.image_bg = Image.open(BytesIO(base64.b64decode(payload)))

    @staticmethod
    def _parse_px(token):
        """Convert one numeric CSS token to a number without evaluating it as code."""
        token = token.strip()
        try:
            return int(token)
        except ValueError:
            return float(token)

    @staticmethod
    def _parse_positions(css):
        """Return one list of absolute pixel offsets per ``background-position`` rule in ``css``."""
        positions = []
        for raw in re.findall("""background-position:(.*?)['";]""", css):
            # The CSS comes from the server, so parse the numbers explicitly
            # instead of handing the text to eval().
            positions.append(
                [abs(Base4ToNumber._parse_px(token))
                 for token in raw.strip().split("px") if token.strip()]
            )
        return positions

    def get_number_for_img(self):
        """
        Crop the glyphs selected by the CSS out of ``self.image_bg`` and OCR them.

        Reads ``self.data_item["css"]`` and ``self.image_bg``; both the stitched
        canvas and ``self.image_bg`` are closed before returning, even when OCR
        fails.

        :return: the text returned by ``pytesseract.image_to_string``
        """
        positions = self._parse_positions(self.data_item["css"])

        canvas = Image.new("RGBA", (200, 38), (255, 255, 255))
        try:
            for i, position in enumerate(positions):
                # NOTE(review): the right edge of the crop is the sum of the
                # parsed offsets; presumably the second offset equals the glyph
                # width - confirm against a live response.
                glyph = self.image_bg.crop(
                    (position[0], 0, sum(position), self.image_bg.height)
                )
                # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the
                # same filter under its current name.
                glyph = glyph.resize(
                    (glyph.width * self._SCALE, glyph.height * self._SCALE),
                    Image.LANCZOS,
                )
                canvas.paste(glyph, (i * self._GLYPH_PITCH, 0))

            return pytesseract.image_to_string(canvas)
        finally:
            canvas.close()
            self.image_bg.close()

    def get_image_base4(self, index, t):
        """
        Fetch the sprite for one data point and return the recognised text.

        Example response shape::

            {"status": 0,
             "data": {"<q>": {"css": "...", "img": "data:image/png;base64,..."}},
             "msg": "success"}

        :param index: position of the data point within the period, counted from 1
        :param t: series type sent as ``t``
        :return: the OCR text for that data point
        """
        url = f"https://trends.so.com/index/csssprite?q={self.q}&area={self.area}&from={self.fr}&to={self.to}&click={index}&t={t}"
        response_data = get_rep_json(url)
        self.data_item = response_data["data"][self.q]

        self.base4_to_image()
        return self.get_number_for_img()


class Trends360:
    """
    Client for the 360 Trends (trends.so.com) JSON endpoints of one keyword set.

    Every ``get_*`` method performs its request(s) and stores the result on
    the instance; none of them returns a value.
    """

    def __init__(self, keywords: list, area="全国", to=None, fr_step=30):
        """
        :param keywords: keywords to query; joined with "+" into one ``q`` value
        :param area: region name sent as ``area``
        :param to: last day of the period; ``None`` (the default) means
            yesterday, resolved when the instance is created
        :param fr_step: number of days in the period, counting ``to`` itself
        """
        # A `to=yesterday()` default is evaluated once, when the class is
        # defined, and goes stale in a long-running process.
        if to is None:
            to = yesterday()
        self.q = "+".join(keywords)
        self.area = area
        self.to = to
        self.fr_date(fr_step - 1)
        self.number_img = Base4ToNumber(self.q, area, self.strftime(self.fr), self.strftime(to))
        self.overviewJson = None

    def fr_date(self, num):
        """
        Set ``self.fr`` to the date ``num`` days before ``self.to``.

        :param num: number of days to go back
        :return: None
        """
        self.fr = self.to - datetime.timedelta(days=num)

    @staticmethod
    def strftime(date):
        """
        :param date: object with a ``strftime`` method (date or datetime)
        :return: the date formatted as ``YYYYMMDD``
        """
        return date.strftime("%Y%m%d")

    def get_overviewJson(self):
        """
        Fetch the attention overview into ``self.overviewJson``.

        A response with status 7001 is replaced by a placeholder dict
        ("--" ratios and zero indexes).
        """
        url = f"https://trends.so.com/index/overviewJson?area={self.area}&q={self.q}"
        response = get_rep_json(url)
        none_data = {'week_year_ratio': '--',
                     'month_year_ratio': '--',
                     'week_chain_ratio': '--',
                     'month_chain_ratio': '--',
                     'week_index': 0, 'month_index': 0}

        self.overviewJson = none_data if response['status'] == 7001 else response['data'][0]['data']

    def _fill_series(self, series, response, kind):
        """Append one {"date", "value"} dict per step of ``response`` to ``series``, reading values via sprite type ``kind``."""
        if response['status'] != 0:
            return

        step_days = response['data']['step']
        if step_days < 1:
            # A zero or negative step would never reach self.to.
            raise ValueError(f"unexpected step: {step_days!r}")
        one_step = datetime.timedelta(days=step_days)

        fr = self.fr
        index = 0
        # Inclusive bound: __init__ places fr (fr_step - 1) days before to so
        # that [fr, to] holds fr_step days; stopping at `fr < to` dropped the
        # final data point.
        while fr <= self.to:
            index += 1
            number = self.number_img.get_image_base4(index, kind)
            to = fr + one_step
            date = fr if step_days == 1 else f"{self.strftime(fr)}-{self.strftime(to)}"
            series.append({
                "date": date,
                "value": number
            })
            fr = to

    def get_soIndexJson(self):
        """
        Fetch the attention-trend series into ``self.soIndexList``.

        Each entry is ``{"date", "value"}``. ``value`` is the OCR text of the
        sprite for that point. ``date`` is the start date when the step is one
        day, otherwise a ``"<from>-<to>"`` string. The list stays empty when
        the response status is not 0.

        :raises ValueError: if the response reports a step smaller than one day
        """
        url = f"https://trends.so.com/index/soIndexJson?area={self.area}&q={self.q}&from={self.strftime(self.fr)}&to={self.strftime(self.to)}&s=0"
        response = get_rep_json(url)
        self.soIndexList = []
        self._fill_series(self.soIndexList, response, "index")

    def get_soMediaJson(self):
        """
        Fetch the media-exposure series into ``self.soMediaJson``.

        Entries have the same shape as those of ``get_soIndexJson``.

        :raises ValueError: if the response reports a step smaller than one day
        """
        url = f"https://trends.so.com/index/soMediaJson?q={self.q}&from={self.strftime(self.fr)}&to={self.strftime(self.to)}&s=0"
        response = get_rep_json(url)
        self.soMediaJson = []
        self._fill_series(self.soMediaJson, response, "media")

    def get_indexqueryhour(self):
        """
        Fetch the "24-hour attention" series into ``self.indexqueryhour``.

        The "|"-separated values of ``data.index[q]`` are paired with
        consecutive datetimes starting at ``data.period.from``.
        """
        url = f"https://trends.so.com/index/indexqueryhour?q={self.q}&t=30"
        response = get_rep_json(url)
        self.indexqueryhour = []
        if response['status'] == 0:
            fr = datetime.datetime.strptime(response['data']["period"]['from'], '%Y-%m-%d')
            # NOTE(review): values are spaced one day apart although the
            # endpoint name suggests hourly data - confirm the granularity.
            one_step = datetime.timedelta(days=1)
            for number in response['data']['index'][self.q].split("|"):
                self.indexqueryhour.append({
                    "date": fr,
                    "value": number
                })
                fr += one_step

    def get_radarJson(self):
        """
        Fetch the demand-distribution data.

        Stores ``from_query``, ``list``, ``to_query`` and ``burst_query`` from
        the response on the instance; each is an empty list when the status is
        not 0.
        """
        url = f"https://trends.so.com/index/radarJson?t=30&q={self.q}"
        response = get_rep_json(url)

        succeeded = response['status'] == 0
        for field in ('from_query', 'list', 'to_query', 'burst_query'):
            setattr(self, field, response['data'][field] if succeeded else [])

    def get_indexquerygraph(self):
        """
        Fetch the user-profile data (interest, sex, age, province).

        Province and age codes are replaced by their labels from ``PROVINCE``
        and ``AGE``. Nothing is stored when the status is not 0.
        """
        url = f"https://trends.so.com/index/indexquerygraph?t=30&area={self.area}&q={self.q}"
        response = get_rep_json(url)
        if response['status'] == 0:
            province = response["data"]['province']
            self.interest = response["data"]['interest']
            self.sex = response["data"]['sex']
            self.age = response["data"]['age']
            # The lookup tables are keyed by two-digit strings; zero-pad so a
            # code that arrives as an int (1 rather than "01") still resolves.
            self.province = [
                {"entity": PROVINCE[str(each['entity']).zfill(2)], "percent": each['percent']}
                for each in province
            ]
            for each in self.age:
                each['entity'] = AGE[str(each["entity"]).zfill(2)]

    def run(self):
        """
        Call every ``get_*`` method once, in a fixed order.
        """
        self.get_overviewJson()
        self.get_soIndexJson()
        self.get_soMediaJson()
        self.get_indexqueryhour()
        # Previously omitted, which left the demand-distribution attributes unset.
        self.get_radarJson()
        self.get_indexquerygraph()


if __name__ == "__main__":
    # Stand-alone check of the sprite OCR for a single data point:
    # number_img = Base4ToNumber("陈情令", "全国", 20190916, 20191015)
    # num=number_img.get_image_base4(1,"index")
    # print(num)
    # Run the crawl for one keyword with the default area and period.
    trends360 = Trends360(["陈情令"])
    trends360.run()


360指数爬取_第7张图片

你可能感兴趣的:(爬虫)