import os
from os import path
import sys
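# make the project root (two directories above this file) importable so common/ and pool/ resolve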
working_dir = os.path.abspath(os.path.realpath(__file__) + '/../../')
sys.path.append(working_dir)
import requests
import common.constants as CT
import common.utils as CU
import json
import simplejson as SJ
from Queue import Queue
import threading
from socialplatform import SocialPlatform
import common.errorcodes as ERROR
from pool.ippool import IPPool
import traceback
import re
#import grequests
class Sina(SocialPlatform):
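    # Weibo open API v2 endpoints used by this crawler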
TIMELINE_BASE_URL = 'https://api.weibo.com/2/statuses/user_timeline.json?'
TAG_BASE_URL = 'https://api.weibo.com/2/tags.json'
BASEINFO_BASE_URL = 'https://api.weibo.com/2/users/show.json'
WEIBO_BASE_URL = 'https://api.weibo.com/2/statuses/show.json'
REPOST_WEIBO_LIST_BASE_URL = 'https://api.weibo.com/2/statuses/repost_timeline.json'
REPOST_WEIBOID_LIST_BASE_URL = 'https://api.weibo.com/2/statuses/repost_timeline/ids.json'
FWDCN_BASE_URL = 'https://api.weibo.com/2/statuses/count.json'
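    # class-level queue for results waiting to be saved, plus the cached singleton instance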
save_queue = Queue()
sina = None
def __init__(self):
super(Sina, self).__init__()
#th = threading.Thread(target = self.store)
#th.start()
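    # simple module-wide singleton accessor; note it is not thread-safe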
@classmethod
def get_instance(cls):
if cls.sina is None:
cls.sina = Sina()
print 'new sina instance'
else:
print 'sina obj existed'
return cls.sina
    #get trimmed original weibos by default; parsing and the since_id return are currently commented out
def get_timeline(self, access_token, uid, feature='1', trim_user='1', since_id='0', count=20, proxy = None):
res = None
params = {
#'source':'1917566200',
'access_token':access_token,
'uid':uid,
'feature':str(feature),
'trim_user':str(trim_user),
'since_id':str(since_id),
'count':str(count)}
if proxy is not None:
proxies = {'http':"http://%s"%proxy}
else:
proxies = None
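        # note: only an 'http' proxy key is set, so requests will not route these https:// API calls
        # through the proxy unless an 'https' entry is added as well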
        try:
            res = requests.get(url = Sina.TIMELINE_BASE_URL, params=params, timeout=CT.TIMEOUT, proxies=proxies)
            #res = grequests.request('GET', Sina.TIMELINE_BASE_URL, params=params, timeout=CT.TIMEOUT, proxies=proxies)
        except Exception as E:
            print 'sina get timeline exception:', E
            #self.request_timeout(proxy)
        if res is not None:
            print res.text
#if res is not None:
# res = grequests.map([res])[0]
# self.check_response(res.text)
# #self.save_queue.put({'type':'origin','content':res.text+'\n'})
# since_id = self.parse_weibo_json(res.text)
# return since_id
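    # raise if the API returned nothing, or returned an error payload ('error', 'error_code', 'request')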
def check_response(self, text):
data = SJ.loads(text)
if data is None:
error_msg = {'msg':'',
'error_code':ERROR.NO_RETURN_VALUE,
'caller':''}
raise Exception(str(error_msg))
elif type(data) == dict:
if data.get('error_code') is not None:
error_msg = {'msg':data.get('error'),
'error_code':data.get('error_code'),
'caller':data.get('request')}
raise Exception(str(error_msg))
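    # surface a proxy timeout as a structured exception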
def request_timeout(self, proxy):
error_msg = {'msg':'%s request time out'%proxy,
'error_code':ERROR.REQUEST_TIMEOUT,
'proxy':proxy}
raise Exception(str(error_msg))
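    # fetch a user's tag list via tags.json; currently only prints the raw response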
def get_tags(self, access_token, uid, count = 20, proxy=None):
res = None
tags = []
params = {'access_token':access_token,
'uid':uid,
'count':count}
if proxy is not None:
proxies = {'http':"http://%s"%proxy}
else:
proxies = None
        try:
            #res = grequests.request('GET', url = Sina.TAG_BASE_URL, params=params, timeout=CT.TIMEOUT, proxies=proxies)
            res = requests.get(url = Sina.TAG_BASE_URL, params=params, timeout=CT.TIMEOUT, proxies=proxies)
        except Exception as E:
            print 'sina get tags exception:', E
            #self.request_timeout(proxy)
        if res is not None:
            print res.text
#if res is not None:
# res = grequests.map([res])[0]
# self.check_response(res.text)
# tags = self.parse_tag_json(res.text)
#return tags
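    # fetch a user's profile via users/show.json; parsing into a dict is currently commented out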
def get_basicinfo(self, access_token, uid, proxy):
res = None
params = {'access_token':access_token,
'uid':uid}
if proxy is not None:
proxies = {'http':"http://%s"%proxy}
else:
proxies = None
        try:
            #res = grequests.request('GET', url = Sina.BASEINFO_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies)
            res = requests.get(url = Sina.BASEINFO_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies)
        except Exception as E:
            print 'sina get basicinfo exception:', E
            #self.request_timeout(proxy)
        if res is not None:
            print res.text
#if res is not None:
# res = grequests.map([res])[0]
# self.check_response(res.text)
# basicinfo = self.parse_basicinfo_json(res.text)
#return basicinfo
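    # fetch a single weibo by its id via statuses/show.json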
def get_weibo_by_id(self, access_token, weiboid, proxy):
res = None
params = {'access_token':access_token,
'id':weiboid}
if proxy is not None:
proxies = {'http':"http://%s"%proxy}
else:
proxies = None
        try:
            #res = grequests.request('GET', url = Sina.WEIBO_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies)
            res = requests.get(url = Sina.WEIBO_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies)
        except Exception as E:
            print 'sina get weibo by id exception:', E
            #self.request_timeout(proxy)
        if res is not None:
            print res.text
#if res is not None:
# res = grequests.map([res])[0]
# self.check_response(res.text)
# tmp_data = SJ.loads(res.text)
# userdata = SJ.dumps(tmp_data.get('user'))
# userdata = self.parse_basicinfo_json(userdata)
# weibodata = self.parse_single_weibo(res.text)
# return {'userdata':userdata, 'weibodata':weibodata}
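    # fetch repost/comment counts for a comma-joined batch of weibo ids via statuses/count.json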
def get_fwdcn_by_ids(self, access_token, weiboids, proxy):
res = None
weiboids = ','.join(weiboids)
params = {'access_token':access_token,
'ids':weiboids}
if proxy is not None:
proxies = {'http':"http://%s"%proxy}
else:
proxies = None
        try:
            #res = grequests.request('GET', url = Sina.FWDCN_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies)
            res = requests.get(url = Sina.FWDCN_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies)
            print res.text
        except Exception as E:
            print 'sina get fwdcn exception:', E
            self.request_timeout(proxy)
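    # fetch the full repost timeline of a weibo via repost_timeline.json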
def get_repost_weibo_list(self, access_token, weiboid, proxy):
res = None
params = {'access_token':access_token,
'id':weiboid}
if proxy is not None:
proxies = {'http':"http://%s"%proxy}
else:
proxies = None
        try:
            #res = grequests.request('GET', url = Sina.REPOST_WEIBO_LIST_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies)
            res = requests.get(url = Sina.REPOST_WEIBO_LIST_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies)
            print res.text
        except Exception as E:
            print 'sina repost weibo exception:', E
            self.request_timeout(proxy)
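    # fetch only the ids of a weibo's reposts via repost_timeline/ids.json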
def get_repost_weiboid_list(self, access_token, weiboid, proxy):
res = None
params = {'access_token':access_token,
'id':weiboid}
if proxy is not None:
proxies = {'http':"http://%s"%proxy}
else:
proxies = None
        try:
            #res = grequests.request('GET', url = Sina.REPOST_WEIBOID_LIST_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies)
            res = requests.get(url = Sina.REPOST_WEIBOID_LIST_BASE_URL, params=params, timeout = CT.TIMEOUT, proxies=proxies)
            print res.text
        except Exception as E:
            print 'sina repost weiboid exception:', E
            self.request_timeout(proxy)
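    # collect every field value except 'weight' from each tag object in the tags.json response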
def parse_tag_json(self, text):
data = SJ.loads(text)
tags = []
try:
for item in data:
for k, v in item.iteritems():
if k != 'weight':
tags.append(v)
return tags
except Exception as E:
print E
traceback.print_stack()
print data
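    # pull (uid, text, id) out of each status and return the newest status id as the next since_id ('0' on failure)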
def parse_weibo_json(self, text):
save_content = ''
since_id = '0'
try:
data = SJ.loads(text)
timeline = data.get('statuses',[])
for l in timeline:
uid = str(l.get('uid'))
text = l.get('text')
weiboid = str(l.get('id'))
save_content = '%s\t%s\t%s\n'%(uid, text, weiboid)
#self.save_queue.put({'type':'trimmed', 'content':save_content, 'usid':uid})
if timeline not in [None,[]]:
l = timeline[0]
since_id = str(l.get('id'))
except Exception as E:
print E
traceback.print_stack()
finally:
return since_id
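    # normalize a users/show.json payload into the profile fields stored by the crawler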
def parse_basicinfo_json(self, text):
try:
PROFILE_HEADER = 'http://weibo.com/'
data = SJ.loads(text)
location = CU.convert_utf8(data.get('location',''))
gender = data.get('gender','')
isverified = data.get('verified','')
username = CU.convert_utf8(data.get('screen_name',''))
icon = data.get('avatar_large','')
usid = data.get('idstr','')
city_code = data.get('city','')
province_code = data.get('province','')
followers_ct = data.get('followers_count', 0)
            profile_url = ''
            if usid not in ['', None]:
                profile_url = PROFILE_HEADER + usid
if isverified:
isverified = 1
description = data.get('verified_reason')
else:
isverified = 0
description = data.get('description')
if description is not None:
description = CU.convert_utf8(description)
return {'location':location,
'gender':gender,
'isverified':isverified,
'username':username,
'icon':icon,
'description':description,
'profile_url':profile_url,
'city_code':city_code,
'province_code':province_code,
'followers_ct':followers_ct}
except Exception as E:
print E
traceback.print_stack()
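    # extract date/content/image from a single weibo; the timezone token (e.g. +0800) is stripped
    # from created_at before it is converted to milliseconds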
def parse_single_weibo(self, text):
try:
data = SJ.loads(text)
time = data.get('created_at')
tp_time = ''.join(re.findall(r'(\+)(\d+?)( )',time)[0])
time = time.replace(tp_time,'')
time = CU.formatted_str_to_millis(time, '%a %b %d %H:%M:%S %Y')
text = data.get('text')
source = data.get('source')
image = data.get('bmiddle_pic','')
return {'date':time, 'content':text, 'image':image}
except Exception as E:
print E
    def check(self):
        # block until every crawl thread and every save thread has finished
        while True:
            dead_crawl_thread_count = 0
            crawl_thread_over = False
            dead_save_thread_count = 0
            save_thread_over = False
            for thread in self.crawl_thread_pool:
                if not thread.isAlive():
                    dead_crawl_thread_count += 1
            if dead_crawl_thread_count == len(self.crawl_thread_pool):
                crawl_thread_over = True
            for thread in self.save_thread_pool:
                if not thread.isAlive():
                    dead_save_thread_count += 1
            if dead_save_thread_count == len(self.save_thread_pool):
                save_thread_over = True
            if save_thread_over and crawl_thread_over:
                break
    def run_crawler(self):
        for i in range(self.crawl_thread_amount):
            # pass the bound method itself rather than calling it; the thread still needs its
            # arguments supplied, start() called and a slot in crawl_thread_pool before check() can track it
            crawl_thread = threading.Thread(target=self.get_timeline)
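    # query account/rate_limit_status.json to inspect the remaining API quota for a token/proxy pair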
def test_usage_limit(self, access_token, proxy):
url = 'https://api.weibo.com/2/account/rate_limit_status.json'
params = {'access_token':access_token,
'source':'1917566200'}
proxies = {'http':"http://%s"%proxy}
res = requests.get(url = url, params=params, timeout = CT.TIMEOUT, proxies=proxies)
print res.text
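    # spawn 100 greenlets hitting test_ip_limit; without gevent monkey-patching the blocking
    # requests calls will not actually overlap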
def test_unit(self):
jobs = []
import gevent
for i in range(100):
jobs.append(gevent.spawn(self.test_ip_limit))
gevent.joinall(jobs)
def test_loop(self):
while(True):
self.test_unit()
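    # hit the timeline API through a hard-coded proxy and bump/print the per-IP usage counter in IPPool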
def test_ip_limit(self):
plat = 'sina'
#ip_port = '223.4.241.244:3128'
#ip_port = '80.18.170.245:3128'
ip_port = '218.57.136.202:80'
        # get_timeline expects a bare host:port string and builds the proxies dict itself
        self.get_timeline('2.008kKrVCGZulFC85b3b9496f0iMsYD', '1816963684', count=1, proxy=ip_port)
IPPool.inc_ip_used_count(key={'proxy':ip_port}, plat=plat, step = 1)
print IPPool.get_ip_used_count(key={'proxy':ip_port}, plat = plat)
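# ad-hoc manual tests; most calls are kept commented out for reference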
if __name__ == '__main__':
sina = Sina.get_instance()
#ip_port = '42.62.5.100:5978'
#ip_port = '183.60.97.98:80'
#ip_port = '222.178.37.42:1337'
##acs_token = '2.008kKrVCS5lgJB773f46691cvRBfDD'
acs_token = '2.008kKrVCGZulFC85b3b9496f0iMsYD'
#th = threading.Thread(target = sina.test_loop)
#th.start()
sina.get_timeline(acs_token,'2958598935', count=100, since_id = '3497205966678321')
#sina.get_tags (acs_token, '2958598935')
#sina.get_basicinfo (acs_token, '2958598935', None)
#sina.get_weibo_by_id (acs_token,'3592778635816523', None)
#res = sina.get_basicinfo('2.008kKrVCGZulFC5bab580682RGYueB','1881428463',ip_port)
#res = sina.get_weibo_by_id(acs_token, '3562960347780645', None)
#sina.test_ip_limit()
#wids = ['354464853658119', '354316072507065', '354285872621913']
#sina.get_fwdcn_by_ids(acs_token, wids, None)
#sina.get_repost_weibo_list('2.008kKrVCGZulFC5bab580682RGYueB', '3557946866271328', None)
#sina.get_timeline(acs_token,'1881428463', count=3, proxy = None)
#sina.get_tags('2.008kKrVCGZulFCdacfff9787oYTfeE','1881428463')
#sina.get_timeline('2.0','1881428463', count=100, since_id = '3497205966678321', proxy = proxy)
#sina.test_usage_limit('2.00khtAoBGZulFC3b30a5e1bepMslNB', '223.4.241.244:3128')
#sina.test_usage_limit('2.00khtAoBGZulFC3b30a5e1bepMslNB', '183.60.97.98:80')
#proxy = '222.197.214.91:808'
#requests.get('http://211.151.139.231:8031', proxies = {'http':"http://%s"%proxy})