Scraping Taylor Swift's Instagram Followers (the JSON Way)


Since Instagram is a dynamic website, I fetch the information I want through its JSON interface.

Using the Network tab in Chrome's developer tools, I found a request whose name begins with ?query; its response is a JSON document.


So how do we parse it? The 'end_cursor' field inside is effectively the key that unlocks the next page of the follower list, while each follower's information on the current page sits under node.
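Abridged, the response has roughly the shape below (the field names are exactly what the script reads; the values are made up for illustration):

{
  "data": {
    "user": {
      "edge_followed_by": {
        "edges": [
          {"node": {"id": "123", "username": "some_follower"}}
        ],
        "page_info": {
          "has_next_page": true,
          "end_cursor": "AQDJzGlG..."
        }
      }
    }
  }
}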

Enough talk; here is the code.


#coding:utf-8
import simplejson
import requests
import time

### Load the JSON and read its end_cursor, the marker for the next page; splice it into the URL for the next request.
headers = {"user-agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
           "cookie":"mid=WVR6LgAEAAErsKhZPudqXWAngh6j; sessionid=IGSC57d73fa3ccf2201c703d81d9c0b9d4d3bb458c1602e61cb3d7df2bbe67047528%3AJg91xL101zJVIqP9UwfUNDJumcZVWeOf%3A%7B%22_auth_user_id%22%3A1629140917%2C%22_auth_user_backend%22%3A%22accounts.backends.CaseInsensitiveModelBackend%22%2C%22_auth_user_hash%22%3A%22%22%2C%22_token_ver%22%3A2%2C%22_token%22%3A%221629140917%3Aei7DIObawCMQSfDV95i4aEvXX1WDXkFV%3A2d6a7c67da37170938c99a94681265481447a9ec766692a703bb64c93bd55913%22%2C%22_platform%22%3A4%2C%22last_refreshed%22%3A1498982339.7495558262%2C%22asns%22%3A%7B%22time%22%3A1498982336%2C%2247.74.12.219%22%3A45102%7D%7D; ig_vw=375; ig_pr=2; rur=ATN; csrftoken=jnlAmritgONlURyLWUkFhUae6O5387ZN; ds_user_id=1629140917",
           "referer":"https://www.instagram.com/ddlovato/",
           "host":"www.instagram.com"
}
baseUrl = 'https://www.instagram.com/graphql/query/?query_id=17851374694183129&variables=%7B%22id%22%3A%2211830955%22%2C%22first%22%3A20%7D' # first page: query_id plus URL-encoded variables {"id":"11830955","first":20}
list_next_page = [] # page URLs generated so far; the last entry is used to resume after an interruption
outputfile = open('output.txt','a') # file that collects the scraped usernames
count = 0 # counts the recursive calls, i.e. the pages fetched
def get_json_dict(url): # fetch the URL and parse the JSON response into a Python dict
    # Requests are routed through a local proxy here (needed to reach Instagram from my network); drop proxies= if you connect directly.
    response = requests.get(url, headers=headers, proxies={'https': 'http://127.0.0.1:58258'})
    json_dict = simplejson.loads(response.text)
    return json_dict

def get_end_cursor(url):
    global list_next_page, outputfile, count
    json_dict = get_json_dict(url)
    count += 1
    ### Use has_next_page to check whether another page exists; if so grab end_cursor, otherwise stop.
    if (json_dict['data']['user']['edge_followed_by']['page_info']['has_next_page']):
        try:
            end_cursor = json_dict['data']['user']['edge_followed_by']['page_info']['end_cursor']
            list_edges = json_dict['data']['user']['edge_followed_by']['edges']
            next_page = 'https://www.instagram.com/graphql/query/?query_id=17851374694183129&variables=%7B%22id%22%3A%2211830955%22%2C%22first%22%3A10%2C%22after%22%3A%22'+end_cursor+'%22%7D'
        except KeyError:
            return 0 # unexpected response shape; stop rather than crash below on an undefined next_page
        if len(list_next_page) < 400: # caps how many pages are fetched; remove the check to fetch them all
            list_next_page.append(next_page)
            # time.sleep(2)
            try:
                # Network hiccups may interrupt a request; if that happens, resume from the last saved page.
                print("Fetched %s pages!" % str(count))
                for e in list_edges:
                    outputfile.write(e['node']['username']+'\n')
                # time.sleep(2)
                get_end_cursor(next_page)
            except:
                # time.sleep(2)
                last_page = list_next_page[-1]
                get_end_cursor(last_page)
        else:
            print(next_page)
    else:
        return 0



get_end_cursor(baseUrl)
outputfile.close()
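A side note on those long query URLs: the variables parameter is nothing but URL-encoded JSON, so rather than splicing percent-encoded strings by hand you can build the URL programmatically. A minimal sketch (assuming Python 3 for urllib.parse; build_query_url is my own helper name, not part of the script above):

from urllib.parse import quote

def build_query_url(user_id, first, after=None):
    # Serialize the GraphQL 'variables' JSON without spaces, then percent-encode it,
    # reproducing the hand-written URLs used above.
    variables = {"id": user_id, "first": first}
    if after is not None:
        variables["after"] = after
    encoded = quote(simplejson.dumps(variables, separators=(',', ':')))
    return 'https://www.instagram.com/graphql/query/?query_id=17851374694183129&variables=' + encoded

For example, build_query_url('11830955', 20) yields the same string as baseUrl.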


A follow-up post will add a way to scrape the images as well.
