Sina Weibo Data Mining Recipes, Part 15: Crawler (Crawling a User's Friends)

#!/usr/bin/python 
# -*- coding: utf-8 -*-

'''
Created on 2015-1-11
@author: beyondzhou
@name: crawl_friendship_graph.py
'''

# Crawl friendship graph
def crawl_friendship_graph():
    
    # Imports
    from login import weibo_login
    from users import crawl_weibo_followers
    
    # Get an authenticated handle to the Sina Weibo API
    weibo_api = weibo_login()
    
    # Crawl the follower graph of the seed user (depth 1, at most 10 follower ids per user)
    screen_name = 'beyondzhou8'
    crawl_weibo_followers(weibo_api, screen_name, depth=1, limit=10)
    
if __name__ == '__main__':
    crawl_friendship_graph()
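The driver above imports weibo_login from a login module that is not reproduced in this post. The sketch below shows one way such a helper could look, assuming the sinaweibopy APIClient and the standard OAuth2 authorization-code flow; APP_KEY, APP_SECRET, and CALLBACK_URL are placeholders for your own application's credentials, not values used in this series.

# login.py -- a minimal sketch of weibo_login, assuming the sinaweibopy client library
from weibo import APIClient

APP_KEY = 'your app key'          # placeholder
APP_SECRET = 'your app secret'    # placeholder
CALLBACK_URL = 'your callback'    # placeholder

def weibo_login():

    client = APIClient(app_key=APP_KEY, app_secret=APP_SECRET, redirect_uri=CALLBACK_URL)

    # Send the user to the authorization page, then paste the 'code' parameter
    # from the callback URL back into the console
    print(client.get_authorize_url())
    code = raw_input('Paste the code parameter from the callback URL: ')

    # Exchange the authorization code for an access token and bind it to the client
    token = client.request_access_token(code)
    client.set_access_token(token.access_token, token.expires_in)

    return client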

# Crawl a follower graph breadth-first up to the given depth
def crawl_weibo_followers(weibo_api, screen_name, limit=1000000, depth=2):
    
    from data import save_to_mongo
    
    # Resolve the ID for screen_name and start working with IDs for consistency in storage
    seed_id = str(weibo_api.users.show.get(screen_name=screen_name)['id'])
    # Only follower ids are needed for the crawl, so friends_limit is set to 0
    _, next_queue = get_friends_followers_ids(weibo_api, user_id=seed_id, friends_limit=0, followers_limit=limit)
    
    # Store a seed_id => _follower_ids mapping in MongoDB
    save_to_mongo({'followers' : [ _id for _id in next_queue ]}, 'followers_crawl', '{0}-follower_ids'.format(seed_id))

    # Breadth-first expansion: each pass crawls the followers of everyone found in the previous pass
    d = 1
    while d < depth:
        d += 1
        (queue, next_queue) = (next_queue, [])
        for fid in queue:
            # Unpack the same two-value return used for the seed; friend ids are ignored
            _, follower_ids = get_friends_followers_ids(weibo_api, user_id=fid, friends_limit=0, followers_limit=limit)

            # Store a fid => follower_ids mapping in MongoDB
            save_to_mongo({'followers' : [ _id for _id in follower_ids ]}, 'followers_crawl', '{0}-follower_ids'.format(fid))
            next_queue += follower_ids
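With depth=1 (as in the driver above), only the seed's follower ids are stored; with depth=2, a second pass also crawls and stores the followers of each of those followers.

crawl_weibo_followers also relies on two helpers from earlier recipes in this series that are not reproduced here: get_friends_followers_ids (in the users module) and save_to_mongo (in the data module). The sketches below show roughly how they could be written, assuming the sinaweibopy client for the friendships/friends/ids and friendships/followers/ids endpoints and pymongo for storage; the cursoring details and parameter names are illustrative, not the exact code from the earlier recipes.

# users.py -- a minimal sketch of get_friends_followers_ids, assuming the sinaweibopy client
def get_friends_followers_ids(weibo_api, user_id, friends_limit=5000, followers_limit=5000):

    friends_ids, followers_ids = [], []

    # Harvest friend ids and follower ids with cursor-based paging
    for ids, limit, resource in [(friends_ids, friends_limit, weibo_api.friendships.friends.ids),
                                 (followers_ids, followers_limit, weibo_api.friendships.followers.ids)]:
        cursor = 0
        while len(ids) < limit:
            response = resource.get(uid=user_id, cursor=cursor, count=200)
            ids += response['ids']
            cursor = response['next_cursor']
            if cursor == 0:  # no more pages
                break

    return friends_ids[:friends_limit], followers_ids[:followers_limit]

# data.py -- a minimal sketch of save_to_mongo, assuming pymongo
import pymongo

def save_to_mongo(data, mongo_db, mongo_db_coll, **mongo_conn_kw):

    # Connect to MongoDB and insert the document into the named database and collection
    client = pymongo.MongoClient(**mongo_conn_kw)
    db = client[mongo_db]
    coll = db[mongo_db_coll]
    return coll.insert_one(data).inserted_id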
