新浪微博数据挖掘食谱之十: 元素篇 (提取转发微博的元素)

#!/usr/bin/python 
# -*- coding: utf-8 -*-

'''
Created on 2015-1-6
@author: beyondzhou
@name: extract_repost_attributions.py
'''

# Extract repost attributions
def extract_repost_attributions():
    """Demo: print a reposted weibo and the attributions of its first repost.

    The steps are: log in, search for the topic 'iphone', decode the search
    result into parallel lists, pick the first weibo whose repost count is
    above zero, fetch its repost timeline, extract the attributions of the
    first repost in that timeline, then print the reposted weibo as JSON
    followed by one attribution per line.

    If no search hit has been reposted, a notice is printed and the function
    returns (this case used to raise NameError). If the repost timeline is
    empty, no attributions are printed (this case used to raise IndexError).

    Returns:
        None. All results are written to standard output.
    """
    # Project-local helpers, plus json for pretty-printing the status.
    from search import weibo_search
    from entities import weibo_entities
    from login import weibo_login
    from statuses import fetch_repost_timeline, fetch_weibo_status, get_rt_attributions
    import json

    # Access to sina api
    weibo_api = weibo_login()

    # Do the search
    subject = weibo_search(topic='iphone')

    # Decode entities; only mids and reposts are used below.
    (mids, names, texts, dates, reposts, comments, likes) = weibo_entities(subject)

    # Find the first weibo whose repost count is above zero.
    # NOTE(review): assumes `reposts` holds numeric counts aligned with
    # `mids` -- confirm against weibo_entities.
    for mid, repost_count in zip(mids, reposts):
        if repost_count > 0:
            weibo_id_reposted = mid
            print('reposts number: %s, weibo_id_reposted: %s' % (repost_count, weibo_id_reposted))
            break
    else:
        # The loop finished without a break: nothing to look up.
        print('No reposted weibo found in the search results.')
        return

    # Find repost timeline
    repost_timeline = fetch_repost_timeline(weibo_api, count=200, page=1, weibo_id=weibo_id_reposted)

    # Extract repost attribution (the first record of repost_timeline serves
    # as the example); an empty timeline simply yields no attributions.
    if repost_timeline:
        repost_attributions = get_rt_attributions(repost_timeline[0])
    else:
        repost_attributions = []

    # Output repost weibo
    repost_weibo = fetch_weibo_status(weibo_api, weibo_id=weibo_id_reposted)
    print(json.dumps(repost_weibo, indent=1))
    print('Output repost weibo done!\n')

    # Output repost attribution
    for repost_att in repost_attributions:
        print(repost_att)
    print('Output repost attribution done!\n')
        
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    extract_repost_attributions()
# Get the repost timeline of a weibo
def fetch_repost_timeline(weibo_api, count = 200, page = 1, weibo_id = 1):
    """Return the 'reposts' entry of a repost-timeline response.

    Calls ``weibo_api.statuses.repost_timeline.get`` with ``count``, ``page``
    and ``id=weibo_id`` and hands back the value stored under the
    ``'reposts'`` key of the response.

    Args:
        weibo_api: client object exposing ``statuses.repost_timeline.get``.
        count: forwarded as the ``count`` keyword.
        page: forwarded as the ``page`` keyword.
        weibo_id: forwarded as the ``id`` keyword.
    """
    endpoint = weibo_api.statuses.repost_timeline
    response = endpoint.get(count=count, page=page, id=weibo_id)
    return response['reposts']

# Get weibo status
def fetch_weibo_status(weibo_api, weibo_id = 1):
    """Return whatever ``weibo_api.statuses.show.get(id=weibo_id)`` returns.

    Args:
        weibo_api: client object exposing ``statuses.show.get``.
        weibo_id: forwarded as the ``id`` keyword.
    """
    show_endpoint = weibo_api.statuses.show
    return show_endpoint.get(id=weibo_id)

# get repost attributions
def get_rt_attributions(repost):
    """Return the de-duplicated, lower-cased screen names a repost credits.

    Two sources are combined:

    * the author of ``repost['retweeted_status']`` when that key is present;
    * the "@mentions" that follow the first in-text marker found in
      ``repost['text']``. Recognised markers are "RT", "via" (both
      case-insensitive) and the Chinese word for "repost" (U+8F6C U+53D1).

    Example: ``{'text': 'RT @SocialWebMining'}`` gives ``['socialwebmining']``.

    Args:
        repost: mapping with a ``'text'`` entry and, optionally, a
            ``'retweeted_status'`` entry providing ``['user']['screen_name']``.

    Returns:
        A list of screen names without the leading "@", lower-cased, each
        appearing once. The order is unspecified because duplicates are
        removed through a set.

    Raises:
        KeyError: if ``'text'`` is missing, or if ``'retweeted_status'`` is
            present but lacks ``['user']['screen_name']``.
    """
    import re

    # Regex adapted from Stack Overflow (http://bit.ly/1821y0J).
    # re.UNICODE is required on Python 2 so that \w and \b treat CJK
    # characters as word characters; without it the Chinese marker can never
    # be followed by a \b match and Chinese screen names are never captured.
    # The pattern is spelled as u"..." + r"..." so it parses on Python 2 and 3.
    rt_patterns = re.compile(
        u"(RT|via|\u8f6c\u53d1)" + r"((?:\b\W*@\w+)+)",
        re.IGNORECASE | re.UNICODE)
    mention_pattern = re.compile(r"@(\w+)", re.UNICODE)

    rt_attributions = []

    # When the original status is embedded in the repost, credit its author.
    if 'retweeted_status' in repost:
        rt_attributions.append(
            repost['retweeted_status']['user']['screen_name'].lower())

    # Also inspect the text for "legacy" markers such as "RT" and "via".
    # findall() returns (marker, mentions) tuples, e.g.
    # [('RT', ' @SocialWebMining')]; only the first marker's run of mentions
    # is used.
    matches = rt_patterns.findall(repost['text'])
    if matches:
        # Extract names with a regex instead of str.split() so punctuation
        # between the marker and the "@" (e.g. "RT: @user") does not end up
        # in the result as a bogus attribution.
        rt_attributions.extend(mention_pattern.findall(matches[0][1]))

    # Filter out any duplicates
    return list(set(name.strip("@").lower() for name in rt_attributions))

Result:

callback_url: https://api.weibo.com/oauth2/authorize?redirect_uri=http%3A//apps.weibo.com/guaguastd&response_type=code&client_id=2925245021
return_redirect_uri: http://weibo.com/login.php?url=http%3A%2F%2Fapps.weibo.com%2Fguaguastd%3Fcode%3D9d0a0ecb4df4db1d8d1a6ef5460c5e82
code: ['9d0a0ecb4df4db1d8d1a6ef5460c5e82']
now_handle: ce2b7c50-9531-11e4-b8c2-7bd88716b5dd
http://passport.weibo.com/
all_handles: [u'ce2b7c50-9531-11e4-b8c2-7bd88716b5dd', u'd3ba1000-9531-11e4-b8c2-7bd88716b5dd']
search done!
mids entities done!
names entities done!
texts entities done!
dates entities done!
reposts entities done!
comments entities done!
likes entities done!
reposts number: 6964, weibo_id_reposted: 3795801400243898
{
 "reposts_count": 6975, 
 "truncated": false, 
 "text": "1 toy 1 day\uff0c\u7b2c178\u671f\uff1a\u7f8e\u56fdBluelounge\uff0diPhone 5/5s\u6700\u4f73\u89c2\u770b\u89d2\u5ea6\u5145\u7535\u57fa\u5ea7\u3002\u624b\u673a\u653e\u5728\u684c\u4e0a\u5145\u7535\uff0c\u60f3\u770b\u4e00\u4e9b\u4e1c\u897f\uff0c\u611f\u89c9\u603b\u662f\u4e0d\u8212\u670d\u3002\u6709\u4e86\u5b83\uff0c\u4e0d\u4ec5\u5916\u89c2\u9ad8\u5927\u4e0a\uff0c\u8fd8\u8ba9\u4f60\u6709\u4e2a\u66f4\u597d\u7684\u89c2\u770b\u89d2\u5ea6\uff0c\u5145\u7535\u65f6\u7528\u8d77\u6765\u4e5f\u662f\u90a3\u4e48\u987a\u7545\u81ea\u5982\uff08\u8fd9\u662f\u6211\u9001\u51fa\u7684\u7b2c2232\u4ef6\u793c\u7269\uff0c\u5173\u8f6c\uff0c1\u67086\u65e5\u62bd\uff0c\u4e0d\u52301\u5929\uff0c\u5bf9\uff0c\u53ea\u5728\u7231\u8d34\uff0c\u56e0\u4e3a\u6709\u4f60\uff01\uff09", 
 "visible": {
  "type": 0, 
  "list_id": 0
 }, 
 "in_reply_to_status_id": "", 
 "bmiddle_pic": "http://ww1.sinaimg.cn/bmiddle/005wRYdajw1enz2uspb4xj313y0pgmza.jpg", 
 "id": 3795801400243898, 
 "thumbnail_pic": "http://ww1.sinaimg.cn/thumbnail/005wRYdajw1enz2uspb4xj313y0pgmza.jpg", 
 "mid": "3795801400243898", 
 "source": "\u5fae\u535a weibo.com", 
 "attitudes_count": 187, 
 "in_reply_to_screen_name": "", 
 "pic_urls": [
  {
   "thumbnail_pic": "http://ww1.sinaimg.cn/thumbnail/005wRYdajw1enz2uspb4xj313y0pgmza.jpg"
  }, 
  {
   "thumbnail_pic": "http://ww2.sinaimg.cn/thumbnail/005wRYdajw1enz2p7nulhj30go0egmxu.jpg"
  }, 
  {
   "thumbnail_pic": "http://ww3.sinaimg.cn/thumbnail/005wRYdajw1enz2pa8b0fj30i20i20uj.jpg"
  }, 
  {
   "thumbnail_pic": "http://ww3.sinaimg.cn/thumbnail/005wRYdajw1enz2pd1z82j312w12w40j.jpg"
  }, 
  {
   "thumbnail_pic": "http://ww3.sinaimg.cn/thumbnail/005wRYdajw1enz2us8x7vj31420mytaf.jpg"
  }, 
  {
   "thumbnail_pic": "http://ww3.sinaimg.cn/thumbnail/005wRYdajw1enz2utiqwqj30r00n2dgk.jpg"
  }, 
  {
   "thumbnail_pic": "http://ww3.sinaimg.cn/thumbnail/005wRYdajw1enz2uu0mcuj30us0ps76b.jpg"
  }, 
  {
   "thumbnail_pic": "http://ww2.sinaimg.cn/thumbnail/005wRYdajw1enz2vbtjkxj30lo0c7q53.jpg"
  }, 
  {
   "thumbnail_pic": "http://ww3.sinaimg.cn/thumbnail/005wRYdajw1enz2vmki74j30sg0ilab2.jpg"
  }
 ], 
 "in_reply_to_user_id": "", 
 "darwin_tags": [], 
 "favorited": false, 
 "original_pic": "http://ww1.sinaimg.cn/large/005wRYdajw1enz2uspb4xj313y0pgmza.jpg", 
 "idstr": "3795801400243898", 
 "source_type": 1, 
 "user": {
  "cover_image": "http://ww4.sinaimg.cn/crop.0.0.920.300/005wRYdajw1emok192jcyj30pk08cgoi.jpg", 
  "bi_followers_count": 3, 
  "domain": "", 
  "avatar_large": "http://tp1.sinaimg.cn/5066369752/180/5712388302/1", 
  "verified_source": "", 
  "ptype": 0, 
  "cover_image_phone": "http://ww2.sinaimg.cn/crop.0.0.0.0/005wRYdajw1emovpmsh52j30hs0hrwhh.jpg", 
  "statuses_count": 12132, 
  "id": 5066369752, 
  "verified_reason_url": "", 
  "city": "1000", 
  "verified": true, 
  "friends_count": 4, 
  "verified_reason_modified": "", 
  "credit_score": 80, 
  "block_app": 1, 
  "follow_me": false, 
  "verified_reason": "\u5317\u4eac\u7231\u8d34\u8fbe\u4eba\u7f51\u7edc\u6280\u672f\u6709\u9650\u516c\u53f8", 
  "followers_count": 634775, 
  "location": "\u5317\u4eac", 
  "verified_state": 0, 
  "verified_trade": "", 
  "mbtype": 12, 
  "verified_source_url": "", 
  "profile_url": "u/5066369752", 
  "block_word": 0, 
  "avatar_hd": "http://ww1.sinaimg.cn/crop.0.0.943.943.1024/005wRYdajw1emu8osezk8j30q90q9jsp.jpg", 
  "star": 0, 
  "description": "\u6211\u9001\u7684\u4e0d\u662f\u793c\u7269\uff0c\u662f\u4efd\u5e0c\u671b\uff0c\u6bcf\u592910\u4e2a\uff0c\u5bf9\uff0c\u53ea\u5728\u7231\u8d34\uff01", 
  "verified_contact_email": "[email protected]", 
  "online_status": 0, 
  "mbrank": 4, 
  "verified_level": 3, 
  "profile_image_url": "http://tp1.sinaimg.cn/5066369752/50/5712388302/1", 
  "idstr": "5066369752", 
  "verified_contact_mobile": "", 
  "allow_all_act_msg": false, 
  "allow_all_comment": true, 
  "geo_enabled": true, 
  "class": 1, 
  "screen_name": "\u7231\u8d34", 
  "lang": "zh-cn", 
  "weihao": "", 
  "remark": "", 
  "favourites_count": 21, 
  "name": "\u7231\u8d34", 
  "url": "", 
  "province": "11", 
  "created_at": "Tue Mar 11 20:16:07 +0800 2014", 
  "verified_contact_name": "Jason", 
  "verified_type": 2, 
  "gender": "m", 
  "following": false, 
  "pagefriends_count": 0, 
  "urank": 15
 }, 
 "geo": null, 
 "created_at": "Mon Jan 05 23:50:39 +0800 2015", 
 "mlevel": 0, 
 "comments_count": 745
}
Output repost weibo done!

爱贴
Output repost attribution done!


你可能感兴趣的:(业务掘金)