Python 3 Web Crawler: Scraping Ad Data from the Tencent News App

No more preamble, let's go straight to the code.
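The snippets below are methods of a crawler class; the post shows them without the surrounding module. A minimal scaffold they assume (the class name TencentAdSpider and the default image directory are my own placeholders):

import json
import time

import requests

class TencentAdSpider:
    # Util, DataInfo and MySqlManager are the author's own helpers (not shown in the post)
    def __init__(self, path="./ad_pics/"):
        # directory the downloaded ad images are written to (placeholder default)
        self.path = path
        self.url = ""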

def startGetData(self):
    # request the feed three times, two seconds apart
    for _ in range(3):
        self.url = (
            "http://r.inews.qq.com/getQQNewsUnreadList?"
            "idfa=18454932-A441-4720-8973-776284A58B7F&apptype=ios&rtAd=1&screen_height=667&network_type=wifi&"
            "store=1&activefrom=&global_info=0%7C1%7C1%7C1%7C1%7C4%7C2%7C1%7C2%7C0%7C1%7C2%7C2%7C0%7C&"
            "screen_scale=2&adcode=440112&screen_width=375&"
            "__qnr=208e" + str(self.getRondomStr()) +
            "&isJailbreak=1&qqnews_refpage=QNLaunchWindowViewController&"
            "omgid=96192500048fe94e120b203d6b2be528edb2001011321e&device_model=iPhone7%2C2&"
            "pagestartFrom=icon&startFrom=icon&startarticleid=&"
            "devid=F3D6D07C-4D87-40B5-8533-A6E8D1331C89&"
            "omgbizid=1a9a8d53c482a14b7e7bd33737409def8cae006011321e&"
            "appver=9.3.1_qqnews_5.5.60"
        )
        time.sleep(2)
        self.parse_url(self.url, self.getBody(), self.getHeader())

This is the entry function: it fires the feed request three times, sleeping two seconds between rounds.
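startGetData calls self.getRondomStr() to randomize the __qnr parameter, but that helper isn't shown in the post. A minimal sketch of what it might look like (pure assumption, the real implementation may differ):

import random
import string

def getRondomStr(self, length=8):
    # hypothetical helper: return a random hex-style suffix for the __qnr parameter
    return "".join(random.choice(string.hexdigits.lower()) for _ in range(length))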

def parse_url(self, url, body, header):
    # POST the already URL-encoded form body; verify=False skips certificate checks
    response = requests.post(url, data=body, headers=header, verify=False)
    self.parse_json(response.content.decode("utf-8"))

This sends the POST request and hands the response body to the JSON parser.
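parse_url also takes self.getHeader(), which the post doesn't show. Since the body is sent as an already URL-encoded string, something along these lines is a reasonable guess (the header values here are assumptions, not the captured ones):

def getHeader(self):
    # hypothetical headers; the real capture from the app may carry more fields
    return {
        "Content-Type": "application/x-www-form-urlencoded",
        "User-Agent": "qqnews/5.5.60 (iPhone; iOS 11.2; Scale/2.00)",
        "Accept": "*/*",
    }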

def getBody(self):
    body = "cachedCount=12&adReqData=%7B%22chid%22%3A2%2C%22adtype%22%3A0%2C%22pf%22%3A%22iphone%22%2C%22launch%22%3A%220%22%2C%22ext%2" \
           "2%3A%7B%22mob%22%3A%7B%22mobstr%22%3A%22Aejy45%2BNeSZw4VxymYnnIhMV%2BMEM%2B6sW9RSCUl%5C%2F9xdgy%2BxrCQ6TAqjN60uZp%5C%2FkwPlA4" \
           "%2BwjM1T8AXgIGEry2mILkUiCjeN70vErvi4%5C%2FmstXaNzifAOoa7z%5C%2FtBFHEaFMuXU7nT4QWZqSqFPkQZtFndYKCzXlF0BpxSjiix7NQ55fW7kauVHkYlEI" \
           "UF%2BKIlWeD42St4AymSqB7MqMT343a%2BdeUKG9Qm6YMZrmG%5C%2FWgZZZ7LiwfgC7hox%5C%2FhhCwOHmOwSbiZKRdUGgs%2ByCxL6FPLofYZdVDSABgNzm95ie%2" \
           "B2Wdp25gHy%5C%2FyadnSpx6BokKCKMXOa6oZsz%5C%2FLtqRwFTPco7vKuptgpNHTn29wMeH2zfK4Kmb8Gg%2B9fTe8y3xQuC1lV81EZdDhL5QZfYkXh5%2BTcri%5C" \
           "%2F6sH0i3714dC0jB475JLTIeKL5H3i3sYt8CECIpGKyexU6tUwrYl%5C%2FlL%2BGfQy15pG09WxlMBWraD1iQuS9jLwcCrRRlsOe86Y8gun1094i74NgAsud%2B9HTiMsB" \
           "V4%5C%2FEItskEhTGXXV1V7Ps7yB8vzgsjWtHXyEacaUyWAnSfJDD%2BCFbwcPxNWuz%5C%2FYfeTryrTWyuMJh5mLiX2Ly9cuIuSYDciv679xopKEGDMtEC4tUq0x%2BKY" \
           "QY9R8EaEDfTcPZrG6BPhIwRA9WiG0oTCOzhEiewq7FTG373fbMjpLuikxQ%5C%2FUbf7B5SAnLj%5C%2F4MsIfX5XeJAfmbsvzqG8zZbEpYwbebgjYthwoEwqgmNcO28KEISy2" \
           "Z8uNW95qyuRlXokQBsVYPMb7l8isNsVZr0r9rCRFueMIlMtke6lkI1peXQNJbrhoOgqAUhcloxe7Ot%2Bqn9o0YutSr0RbWhycUV0%2Bc2DMAipZM4vtct7cMBYsVUuXP1GLBP1G" \
           "TInbkGKdpRPDKl7HXaLq0Zn9Cvs59zCbJc6ND0wQXfq%2BgGTFCIAcysbbNIejC2CiRcjlyUBLdsqp4tqD6uGTX3FByULEkDrE1DO7AHE%5C%2FPqg3An7CFR0BkRh1KsCd34GWerx" \
           "mB9WQIEa0tIUKZuRWkW3qZKJyo5eUieVcAI78Ul09C5JHwebRfPLQzSy1fTl4lgaKtmM2y3Lo6WY48P9PLCeQbA1lDSKw1Ku8U3wzOnmyieimQvdNAc0lEpOgykMhSfAva4lGvYGhvd" \
           "M7RzTq%2BaoJh4p2ip2Oa30gojevgjc%22%7D%7D%2C%22ver%22%3A%225.5.60%22%2C%22slot%22%3A%5B%7B%22islocal%22%3A0%2C%22orders_info%22%3A%5B%22503856" \
           "81%2C2120191%2C3602870493%2C19%2C101%2C110%2C1%22%2C%2250708076%2C2787691%2C2897359378%2C19%2C101%2C110%2C1%22%2C%2249679576%2C1918850882%2C28" \
           "90192620%2C19%2C4307%2C110%2C1%22%2C%2250645127%2C6868108%2C4173790238%2C1000%2C705%2C110%2C2%22%5D%2C%22channel%22%3A%22news_news_top%22%2C%22r" \
           "efresh_type%22%3A1%2C%22loid%22%3A%221%2C13%22%2C%22recent_rot%22%3A%5B%221%2C2%2C3%22%2C%224%22%5D%7D%5D%2C%22appversion%22%3A%22180319%22%7D&lon" \
           "=113.4367974175347&uid=A6D2B510-4986-4884-8250-964B34B8FB22&chlid=news_news_top&is_new_user=0&feedbackNewsId=NEW2018042302871600%7C0%2CTWF20180423" \
           "04111100%7C1%2CNEW2016111603351800%7C3%2CFIN2018042400905700%7C0%2C20180424A0FFXO00%7C0%2C20180424A0CP7I00%7C2%2C20180422A0UM4R00%7C0%2C20180423A1SOC" \
           "P00%7C2%2C20180423A0PJNH00%7C2%2CHVD2017TOP000000000%7C10%2C20180422A127CW00%7C0%2C20180423A1VAON00%7C0%2C20180423A1WOUH00%7C2%2C20180423A02FOZ00%7C0" \
           "%2C20180423A1VFAF00%7C0&newsTopPage=1&user_chlid=news_video_top%2Cnews_news_19%2Cnews_news_gz%2Cnews_news_ent%2Cnews_news_sports%2Cnews_news_mil&town_n" \
           "ame=Unknown&addPushNews=0&lat=23.10431803385417&feedbackModulePos=%28null%29%7C3%2C10&channelPosition=0&page=2&picType=0%2C0%2C0%2C0%2C0%2C2%2C0%2C2%2C" \
           "2%2C0%2C0%2C0%2C2%2C0%2C0&forward=0&adcode=440112&village_name=Unknown"
    return body

These are the parameters the POST request has to carry.
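The body is one big application/x-www-form-urlencoded string, and its adReqData field is itself URL-encoded JSON. A small debugging helper (my own addition, not part of the crawler) to decode and inspect it:

import json
from urllib.parse import parse_qs

def dump_ad_request(body):
    # body is the form string returned by getBody() above
    params = parse_qs(body)
    ad_req = json.loads(params["adReqData"][0])  # parse_qs already URL-decodes the value
    print(ad_req["chid"], ad_req["ver"], ad_req["slot"][0]["channel"])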

def parse_json(self, jsonStr):
    print(jsonStr)
    DataInfo.time = Util().getCurrTime()
    try:
        json_object = json.loads(jsonStr)
        if "adList" in json_object:
            # adList is itself a JSON string; the ad orders sit under its "order" key
            adList = json_object["adList"]
            json_list = json.loads(adList)["order"]
            for json_str in json_list:
                self.saveDataInfo(json_str)
    except (KeyError, ValueError) as x:
        # ValueError also covers json.JSONDecodeError on malformed responses
        print(x)

This parses the fetched response and walks the list of ad orders.
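parse_json and the methods below lean on two helpers the post doesn't include, DataInfo and Util. From how they're used, DataInfo is just a bag of class-level fields and getCurrTime returns a timestamp; a rough stand-in for context (the layout is assumed):

import time

class DataInfo:
    # plain holder for the fields filled in by the crawler (assumed layout)
    time = ""
    title = ""
    channel = ""
    appdownload = ""
    pic_list = None
    pic_path = None
    device_type = ""
    type = 0
    source_type = 0

class Util:
    def getCurrTime(self):
        # hypothetical: current time as a formatted string
        return time.strftime("%Y-%m-%d %H:%M:%S")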

def saveDataInfo(self, json_str):
    # copy the fields of one ad order into DataInfo, then persist it
    DataInfo.title = json_str["title"]
    DataInfo.channel = "tengxunxinwen"
    DataInfo.appdownload = json_str["url"]
    DataInfo.pic_list = self.getBitmap(json_str)  # also downloads the ad images
    DataInfo.device_type = "ios"
    DataInfo.type = 1
    MySqlManager().insert_inspection_list(1)

This copies the fields of one ad into DataInfo and writes a row to MySQL.
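MySqlManager isn't shown in the post either. As a rough idea of what insert_inspection_list could look like with pymysql (the table name, columns and connection details are all my own assumptions):

import pymysql

class MySqlManager:
    def insert_inspection_list(self, type_id):
        # hypothetical sketch: write the current DataInfo fields into MySQL
        conn = pymysql.connect(host="localhost", user="root", password="***",
                               db="spider", charset="utf8mb4")
        try:
            with conn.cursor() as cursor:
                sql = ("INSERT INTO inspection_list "
                       "(title, channel, appdownload, device_type, type, create_time) "
                       "VALUES (%s, %s, %s, %s, %s, %s)")
                cursor.execute(sql, (DataInfo.title, DataInfo.channel,
                                     DataInfo.appdownload, DataInfo.device_type,
                                     type_id, DataInfo.time))
            conn.commit()
        finally:
            conn.close()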

def getBitmap(self, json_str):
    file_path = self.path
    filename = str(int(time.time() * 1000000)) + ".jpg"
    bitmap = {}
    bitmap_path = {}
    if "resource_url0" in json_str:
        # single-image ad
        DataInfo.source_type = 1
        bitmap["pic1"] = json_str["resource_url0"]
        bitmap_path["pic_path1"] = file_path + "pic1_" + filename
        DataInfo.pic_path = bitmap_path
        Util().save_img(json_str["resource_url0"], "pic1_" + filename, file_path)
    elif "resource_urlList" in json_str:
        # multi-image ad: save up to the first three pictures
        DataInfo.source_type = 2
        for i, item in enumerate(json_str["resource_urlList"][:3], start=1):
            key = "pic" + str(i)
            bitmap[key] = item["url"]
            bitmap_path["pic_path" + str(i)] = file_path + key + "_" + filename
            Util().save_img(item["url"], key + "_" + filename, file_path)
        DataInfo.pic_path = bitmap_path
    return bitmap

This downloads the ad images and saves them locally.
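Util().save_img is used above but not listed in the post; its signature is inferred from how it's called (url, file name, directory). A minimal version with requests might look like this:

import os

import requests

def save_img(self, img_url, filename, file_path):
    # hypothetical sketch: download an image and write it under file_path
    os.makedirs(file_path, exist_ok=True)
    resp = requests.get(img_url, timeout=10)
    if resp.status_code == 200:
        with open(os.path.join(file_path, filename), "wb") as f:
            f.write(resp.content)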
