Scraping Weibo
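The script below wraps a handful of Sina Weibo Open API v2 endpoints (user_timeline, tags, users/show, statuses/show, repost_timeline, statuses/count) in a Sina class, adds optional HTTP-proxy support, and includes a few ad-hoc rate-limit tests. It is Python 2 code and relies on requests/grequests plus some project-local helpers (common.constants, common.utils, pool.ippool) that are not shown here. As a quick orientation, a single timeline request boils down to the sketch below; the access token and uid are placeholders, not working values.

# Minimal sketch of one user_timeline call (token and uid are placeholders)
import requests

params = {
    'access_token': '<YOUR_ACCESS_TOKEN>',
    'uid': '1234567890',
    'trim_user': '1',
    'count': '5',
}
r = requests.get('https://api.weibo.com/2/statuses/user_timeline.json',
                 params=params, timeout=10)
print r.text  # raw JSON; the class below parses out uid/text/id per status

The complete script follows.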

import os

from os import path

import sys

# add the project root (two levels up from this file) to the import path
working_dir = os.path.abspath(os.path.realpath(__file__) + '/../../')

sys.path.append(working_dir)

import requests

import common.constants as CT

import common.utils as CU

import json

import simplejson as SJ

from Queue import Queue

import threading

from socialplatform import SocialPlatform

import common.errorcodes as ERROR

from pool.ippool import IPPool

import traceback

import re

import grequests  # still used by get_fwdcn_by_ids and the repost helpers below


class Sina(SocialPlatform):

    TIMELINE_BASE_URL = 'https://api.weibo.com/2/statuses/user_timeline.json?'

    TAG_BASE_URL = 'https://api.weibo.com/2/tags.json'

    BASEINFO_BASE_URL = 'https://api.weibo.com/2/users/show.json'

    WEIBO_BASE_URL = 'https://api.weibo.com/2/statuses/show.json'

    REPOST_WEIBO_LIST_BASE_URL = 'https://api.weibo.com/2/statuses/repost_timeline.json'

    REPOST_WEIBOID_LIST_BASE_URL = 'https://api.weibo.com/2/statuses/repost_timeline/ids.json' 

    FWDCN_BASE_URL = 'https://api.weibo.com/2/statuses/count.json'

    save_queue = Queue()

    sina = None


    def __init__(self):

        super(Sina, self).__init__()

        #th = threading.Thread(target = self.store)

        #th.start()


    @classmethod

    def get_instance(cls):

        if cls.sina is None:

            cls.sina = Sina()

            print 'new sina instance'

        else:

            print 'sina obj existed'

        return cls.sina


    # Fetch a user's timeline; by default requests original posts only (feature=1) with trimmed user objects (trim_user=1) and prints the raw JSON.

    def get_timeline(self, access_token, uid, feature='1', trim_user='1', since_id='0', count=20, proxy = None):

        res = None

        params = {

                  #'source':'1917566200',

                  'access_token':access_token, 

                  'uid':uid, 

                  'feature':str(feature), 

                  'trim_user':str(trim_user),

                  'since_id':str(since_id),

                  'count':str(count)}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            res = requests.get(url = Sina.TIMELINE_BASE_URL, params=params, timeout=CT.TIMEOUT, proxies=proxies)    

            #res = grequests.request('GET', Sina.TIMELINE_BASE_URL, params=params, timeout=CT.TIMEOUT, proxies=proxies)

        except Exception as E:

            print 'sina get timeline exception:', E

            #self.request_timeout(proxy)

        

        if res is not None:
            print res.text

        #if res is not None:

        #    res = grequests.map([res])[0]

        #    self.check_response(res.text)

        #    #self.save_queue.put({'type':'origin','content':res.text+'\n'})

        #    since_id = self.parse_weibo_json(res.text)

        #    return since_id


    def check_response(self, text):
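        # Raise if the API returned an error payload (error_code present) instead of the expected data.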

        data = SJ.loads(text)

        if data is None:

            error_msg = {'msg':'',

                         'error_code':ERROR.NO_RETURN_VALUE,

                         'caller':''}

            raise Exception(str(error_msg))

        elif type(data) == dict:

            if data.get('error_code') is not None:

                error_msg = {'msg':data.get('error'),

                             'error_code':data.get('error_code'),

                             'caller':data.get('request')}

                raise Exception(str(error_msg))


    def request_timeout(self, proxy):

        error_msg = {'msg':'%s request time out'%proxy,

                     'error_code':ERROR.REQUEST_TIMEOUT,

                     'proxy':proxy}

        raise Exception(str(error_msg))

        


    def get_tags(self, access_token, uid, count = 20, proxy=None):
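        # Fetch up to 'count' tags for the user (tags.json) and print the raw JSON; parsing is currently commented out.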

        res = None

        tags = []

        params = {'access_token':access_token, 

                  'uid':uid,

                  'count':count} 

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            #res = grequests.request('GET', url = Sina.TAG_BASE_URL, params=params, timeout=CT.TIMEOUT, proxies=proxies)    

            res = requests.get (url = Sina.TAG_BASE_URL, params=params, timeout=CT.TIMEOUT, proxies=proxies)    

        except Exception as E:
            print 'sina get tags exception:', E
            #self.request_timeout(proxy)
        if res is not None:
            print res.text

        #if res is not None:

        #    res = grequests.map([res])[0]

        #    self.check_response(res.text)

        #    tags = self.parse_tag_json(res.text)

        #return tags


    def get_basicinfo(self, access_token, uid, proxy):
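        # Fetch the user's profile (users/show) and print the raw JSON; parsing is currently commented out.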

        res = None

        params = {'access_token':access_token, 

                  'uid':uid}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            #res = grequests.request('GET', url = Sina.BASEINFO_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies)    

            res = requests.get (url = Sina.BASEINFO_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies)    

        except Exception as E:

            print 'sina get basicinfo exception:', E

            #self.request_timeout(proxy)

        if res is not None:
            print res.text

        #if res is not None:

        #    res = grequests.map([res])[0]

        #    self.check_response(res.text)

        #    basicinfo = self.parse_basicinfo_json(res.text)

        #return basicinfo


    def get_weibo_by_id(self, access_token, weiboid, proxy):
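        # Fetch a single status by id (statuses/show) and print the raw JSON.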

        res = None

        params = {'access_token':access_token,

                  'id':weiboid}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            #res = grequests.request('GET', url = Sina.WEIBO_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies) 

            res = requests.get (url = Sina.WEIBO_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies) 

        except Exception as E:

            print 'sina get weibo by id exception:', E

            #self.request_timeout(proxy)

        if res is not None:
            print res.text

        #if res is not None:

        #    res = grequests.map([res])[0]

        #    self.check_response(res.text)

        #    tmp_data = SJ.loads(res.text)

        #    userdata = SJ.dumps(tmp_data.get('user'))

        #    userdata = self.parse_basicinfo_json(userdata)

        #    weibodata = self.parse_single_weibo(res.text)

        #    return {'userdata':userdata, 'weibodata':weibodata}


    def get_fwdcn_by_ids(self, access_token, weiboids, proxy):
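        # Fetch repost/comment counts for a batch of status ids (statuses/count.json).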

        res = None

        weiboids = ','.join(weiboids)

        params = {'access_token':access_token,

                  'ids':weiboids}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            res = grequests.request('GET', url = Sina.FWDCN_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies) 

            res = grequests.map([res])[0]

            print res.text

        except Exception as E:

            print E

            self.request_timeout(proxy)



    def get_repost_weibo_list(self, access_token, weiboid, proxy):
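        # Fetch the repost timeline (full statuses) for a given weibo id.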

        res = None

        params = {'access_token':access_token,

                  'id':weiboid}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            res = grequests.request('GET', url = Sina.REPOST_WEIBO_LIST_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies) 

            res = grequests.map([res])[0]

            print res.text

        except Exception as E:

            print 'sina repost weibo exception:', E

            self.request_timeout(proxy)


    def get_repost_weiboid_list(self, access_token, weiboid, proxy):
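        # Fetch only the ids of reposts for a given weibo id.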

        res = None

        params = {'access_token':access_token,

                  'id':weiboid}

        if proxy is not None:

            proxies = {'http':"http://%s"%proxy}

        else:

            proxies = None

        try:

            res = grequests.request('GET', url = Sina.REPOST_WEIBOID_LIST_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies) 

            res = grequests.map([res])[0]

            print res.text

        except Exception as E:

            print E

            self.request_timeout(proxy)


    def parse_tag_json(self, text):
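        # Collect the tag values from a tags.json response, skipping each item's 'weight' field.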

        data = SJ.loads(text)

        tags = []

        try:

            for item in data:

                for k, v in item.iteritems():

                    if k != 'weight':

                        tags.append(v)

            return tags

        except Exception as E:

            print E

            traceback.print_stack()

            print data


    def parse_weibo_json(self, text):
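        # Walk the statuses in a user_timeline response and return the newest status id as since_id.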

        save_content = ''

        since_id = '0'

        try:

            data = SJ.loads(text)

            timeline = data.get('statuses',[])

            for l in timeline:

                uid = str(l.get('uid'))

                text = l.get('text')

                weiboid = str(l.get('id'))

                save_content = '%s\t%s\t%s\n'%(uid, text, weiboid)

                #self.save_queue.put({'type':'trimmed', 'content':save_content, 'usid':uid})

            if timeline not in [None,[]]:

                l = timeline[0]

                since_id = str(l.get('id'))

        except Exception as E:

            print E

            traceback.print_stack()

        finally:

            return since_id


    def parse_basicinfo_json(self, text):
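        # Flatten a users/show response into the profile fields used downstream.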

        try:

            PROFILE_HEADER = 'http://weibo.com/'

            data = SJ.loads(text)

            location = CU.convert_utf8(data.get('location',''))

            gender = data.get('gender','')

            isverified = data.get('verified','')

            username = CU.convert_utf8(data.get('screen_name',''))

            icon = data.get('avatar_large','')

            usid = data.get('idstr','')

            city_code = data.get('city','')

            province_code = data.get('province','')

            followers_ct = data.get('followers_count', 0)

            profile_url = ''
            if usid not in ['', None]:
                profile_url = PROFILE_HEADER + usid

            if isverified:

                isverified = 1

                description = data.get('verified_reason')

            else:

                isverified = 0

                description = data.get('description')

            if description is not None:

                description = CU.convert_utf8(description)

            return {'location':location,

                    'gender':gender,

                    'isverified':isverified,

                    'username':username,

                    'icon':icon,

                    'description':description,

                    'profile_url':profile_url, 

                    'city_code':city_code,

                    'province_code':province_code, 

                    'followers_ct':followers_ct}

        except Exception as E:

            print E

            traceback.print_stack()


    def parse_single_weibo(self, text):
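        # Convert one status JSON blob into {'date': millis, 'content': text, 'image': bmiddle_pic}.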

        try:

            data = SJ.loads(text)

            time = data.get('created_at')

            tp_time = ''.join(re.findall(r'(\+)(\d+?)( )',time)[0])

            time = time.replace(tp_time,'')

            time = CU.formatted_str_to_millis(time, '%a %b %d %H:%M:%S %Y')

            text = data.get('text')

            source = data.get('source')

            image = data.get('bmiddle_pic','')

            return {'date':time, 'content':text, 'image':image}    

        except Exception as E:

            print E

             

    def check(self):
        # Block until every crawl thread and every save thread has finished.
        while True:
            dead_crawl_thread_count = 0
            crawl_thread_over = False
            dead_save_thread_count = 0
            save_thread_over = False

            for thread in self.crawl_thread_pool:
                if not thread.isAlive():
                    dead_crawl_thread_count += 1
            if dead_crawl_thread_count == len(self.crawl_thread_pool):
                crawl_thread_over = True

            for thread in self.save_thread_pool:
                if not thread.isAlive():
                    dead_save_thread_count += 1
            if dead_save_thread_count == len(self.save_thread_pool):
                save_thread_over = True

            if save_thread_over and crawl_thread_over:
                break


    def run_crawler(self):
        # Spawn the crawl threads. Pass the bound method itself rather than calling it;
        # get_timeline's access_token/uid would still need to be supplied via args= here.
        for i in range(self.crawl_thread_amount):
            crawl_thread = threading.Thread(target=self.get_timeline)
            self.crawl_thread_pool.append(crawl_thread)



    def test_usage_limit(self, access_token, proxy):
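        # Query account/rate_limit_status to see how much API quota remains for this token/proxy pair.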

        url = 'https://api.weibo.com/2/account/rate_limit_status.json'

        params = {'access_token':access_token,

                   'source':'1917566200'}

        proxies = {'http':"http://%s"%proxy}

        res = requests.get(url = url, params=params, timeout = CT.TIMEOUT, proxies=proxies)

        print res.text



    def test_unit(self):

        jobs = []

        import gevent

        for i in range(100):

            jobs.append(gevent.spawn(self.test_ip_limit))

        gevent.joinall(jobs)


    def test_loop(self):

        while(True):

            self.test_unit()


    def test_ip_limit(self):
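        # Fire one timeline request through a fixed proxy and bump its usage counter in IPPool.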

        plat = 'sina'

        #ip_port = '223.4.241.244:3128'

        #ip_port = '80.18.170.245:3128'

        ip_port = '218.57.136.202:80'
        # get_timeline expects the bare "host:port" string and builds the proxies dict itself
        self.get_timeline('2.008kKrVCGZulFC85b3b9496f0iMsYD', '1816963684', count=1, proxy=ip_port)

        IPPool.inc_ip_used_count(key={'proxy':ip_port}, plat=plat, step = 1)

        print IPPool.get_ip_used_count(key={'proxy':ip_port}, plat = plat)


if __name__ == '__main__':

    sina = Sina.get_instance()

    #ip_port = '42.62.5.100:5978'

    #ip_port = '183.60.97.98:80'

    #ip_port = '222.178.37.42:1337'

    ##acs_token = '2.008kKrVCS5lgJB773f46691cvRBfDD'

    acs_token = '2.008kKrVCGZulFC85b3b9496f0iMsYD'

    #th = threading.Thread(target = sina.test_loop)

    #th.start()

    sina.get_timeline(acs_token,'2958598935', count=100, since_id = '3497205966678321')

    #sina.get_tags (acs_token, '2958598935')

    #sina.get_basicinfo (acs_token, '2958598935', None)

    #sina.get_weibo_by_id (acs_token,'3592778635816523', None)

    

    #res = sina.get_basicinfo('2.008kKrVCGZulFC5bab580682RGYueB','1881428463',ip_port)

    #res = sina.get_weibo_by_id(acs_token, '3562960347780645', None)

    #sina.test_ip_limit()

    #wids = ['354464853658119', '354316072507065', '354285872621913']

    #sina.get_fwdcn_by_ids(acs_token, wids, None)

    #sina.get_repost_weibo_list('2.008kKrVCGZulFC5bab580682RGYueB', '3557946866271328', None)

    #sina.get_timeline(acs_token,'1881428463', count=3, proxy = None)

    #sina.get_tags('2.008kKrVCGZulFCdacfff9787oYTfeE','1881428463')

    #sina.get_timeline('2.0','1881428463', count=100, since_id = '3497205966678321', proxy = proxy)

    #sina.test_usage_limit('2.00khtAoBGZulFC3b30a5e1bepMslNB', '223.4.241.244:3128')

    #sina.test_usage_limit('2.00khtAoBGZulFC3b30a5e1bepMslNB', '183.60.97.98:80')

    #proxy = '222.197.214.91:808'

    #requests.get('http://211.151.139.231:8031', proxies = {'http':"http://%s"%proxy})

    

