最近由于实验室的研究需求,需要对 Twitter15 和 Twitter16 数据集进行扩展,具体包括:1. 根据 user_id 获取用户画像;2. 根据 tweet_id 获取该推文下的评论。
首先,使用自己申请的 Twitter 开发者账号连接 Twitter API:
import tweepy
import time
import csv
import pandas as pd
import json
from collections import OrderedDict
import datetime
import re
# Fill in the developer key/secret and access token/secret issued by Twitter
# ('XXX' values are placeholders to be replaced before running).
consumer_key = 'XXX'
consumer_secret = 'XXX'
access_token = 'XXX'
access_token_secret = 'XXX'
# Build the OAuth handler from the consumer credentials, then attach the
# access token pair to it.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# Create the API client that the rest of the script calls (`api.get_status`,
# `api.get_user`); it is configured with a local proxy address.
api = tweepy.API(auth, proxy='127.0.0.1:7890')
# Load every tweet ID to process: the first column of each row in uid.csv.
# The file is opened with newline='' as the csv module recommends, and blank
# rows are skipped so a stray empty line cannot raise IndexError.
with open('uid.csv', 'r', encoding='utf-8', newline='') as f1:
    reader = csv.reader(f1)
    tweets_id = [row[0] for row in reader if row]
# Echo what was loaded so the run can be sanity-checked by eye.
print(tweets_id)
print(len(tweets_id))
# No explicit f1.close() is needed: leaving the with-block closes the file.
# Twitter API error codes that mean a tweet can never be fetched, mapped to
# the message printed for each.  Retrying these is pointless, so only the
# comment ID is written to the output.
_UNAVAILABLE_COMMENT_CODES = {
    179: "没有权限查看此条评论!",
    144: "此条评论已被删除!",
    63: "用户被冻结!",
    34: "该页面不存在!",
}

# Longest pause (in seconds) between two retries of a failing request.
_MAX_RETRY_DELAY = 60

# Patterns that strip @mentions from tweet text; compiled once instead of
# once per comment.
_MENTION_THEN_SPACE = re.compile(r'\@(?:.|\s)*? ')
_MENTION_TO_END = re.compile(r'\@(?:.|\s)*?$')


def _api_error_codes(exc):
    """Return the set of Twitter API error codes carried by *exc*.

    Reads `api_code` (single value) and `api_codes` (iterable) when the
    exception has them; exceptions without either yield an empty set.
    """
    codes = set(getattr(exc, 'api_codes', None) or ())
    single = getattr(exc, 'api_code', None)
    if single is not None:
        codes.add(single)
    return codes


def _clean_comment(text):
    """Remove @mentions from *text* and replace newlines with spaces."""
    # Drop everything from an '@' up to and including the next space.
    text = _MENTION_THEN_SPACE.sub('', text)
    # Drop a remaining '@...' that runs to the end of the text.
    text = _MENTION_TO_END.sub('', text)
    # Remove any stray '@' and flatten the text onto one line.
    return text.replace('@', '').replace('\n', ' ')


def _fetch_comment_text(comment_id):
    """Return the text of status *comment_id*, or None if it is unavailable.

    Errors whose API code is in `_UNAVAILABLE_COMMENT_CODES` end the attempt.
    Any other error is retried indefinitely, pausing between attempts with a
    capped exponential backoff so a rate limit or outage does not turn into
    a tight request loop.
    """
    delay = 1
    while True:
        try:
            return api.get_status(id=comment_id).text
        except Exception as e:
            print(repr(e))
            codes = _api_error_codes(e)
            for code, message in _UNAVAILABLE_COMMENT_CODES.items():
                if code in codes:
                    print(message)
                    return None
            print("网络不稳定,正常重新连接...")
            time.sleep(delay)
            delay = min(delay * 2, _MAX_RETRY_DELAY)


# For every tweet, read its comment IDs from comments_id/<tweet_id>.csv and
# append one row per comment to comments/<tweet_id>.csv: [id, cleaned text]
# on success, or [id] alone when the comment cannot be retrieved.
for tweet_id in tweets_id:
    print(f"正在获取 tweet_id = {tweet_id} 的推特的评论...")
    with open('comments_id/' + tweet_id + '.csv', 'r', encoding='utf-8', newline='') as f2:
        # Blank rows are skipped so they cannot raise IndexError.
        comments_id = [row[0] for row in csv.reader(f2) if row]
    # newline='' stops the csv writer from emitting blank lines on Windows.
    with open('comments/' + tweet_id + '.csv', 'a', encoding='utf-8', newline='') as f3:
        writer = csv.writer(f3)
        for comment_id in comments_id:
            print(f"正在获取 comment_id = {comment_id} 的评论内容...")
            comment = _fetch_comment_text(comment_id)
            if comment is None:
                writer.writerow([comment_id])
                continue
            print("成功找的此条评论!")
            print(comment)
            comment = _clean_comment(comment)
            print(comment)
            writer.writerow([comment_id, comment])
# Load every user ID to look up: the second column of each row in uid.csv.
# The file is read once; rows with fewer than two columns (including blank
# lines) are skipped so they cannot raise IndexError.
with open('uid.csv', 'r', encoding='utf-8', newline='') as f1:
    reader = csv.reader(f1)
    user_ids = [row[1] for row in reader if len(row) > 1]
# Echo what was loaded so the run can be sanity-checked by eye.
print(user_ids)
print(len(user_ids))
# Twitter API error codes that mean a user profile can never be fetched,
# mapped to (message to print, CSV file the user ID is appended to).
_USER_LOOKUP_FAILURES = {
    50: ('用户不存在!', 'user_not_found.csv'),
    63: ('用户被冻结!', 'user_has_been_suspended.csv'),
}

# Longest pause (in seconds) between two retries of a failing request.
_MAX_RETRY_DELAY = 60

# Profile attributes copied into each user's JSON file, in output order.
_USER_FIELDS = (
    "name",
    "screen_name",
    "location",
    "profile_location",
    "description",
    "protected",
    "followers_count",
    "friends_count",
    "listed_count",
    "created_at",
    "favourites_count",
    "utc_offset",
    "time_zone",
    "geo_enabled",
    "verified",
    "statuses_count",
    "lang",
    "contributors_enabled",
    "is_translator",
    "is_translation_enabled",
    "profile_background_tile",
    "profile_use_background_image",
    "has_extended_profile",
    "default_profile",
    "default_profile_image",
    "following",
    "follow_request_sent",
    "notifications",
    "translator_type",
)


def _api_error_codes(exc):
    """Return the set of Twitter API error codes carried by *exc*.

    Reads `api_code` (single value) and `api_codes` (iterable) when the
    exception has them; exceptions without either yield an empty set.
    """
    codes = set(getattr(exc, 'api_codes', None) or ())
    single = getattr(exc, 'api_code', None)
    if single is not None:
        codes.add(single)
    return codes


def _fetch_user(user_id):
    """Return the user object for *user_id*, or None if it is unavailable.

    When the API reports a code listed in `_USER_LOOKUP_FAILURES`, the ID is
    appended as a single row to the matching CSV file and None is returned.
    Any other error is retried indefinitely with a capped exponential
    backoff between attempts.
    """
    delay = 1
    while True:
        try:
            return api.get_user(user_id=user_id)
        except Exception as e:
            print(repr(e))
            codes = _api_error_codes(e)
            for code, (message, log_path) in _USER_LOOKUP_FAILURES.items():
                if code in codes:
                    print(message)
                    with open(log_path, 'a', encoding='utf-8', newline='') as f:
                        # writerow with a one-item list keeps the ID in a
                        # single cell instead of one character per row.
                        csv.writer(f).writerow([user_id])
                    return None
            print("网络不稳定,正在尝试重新连接...")
            time.sleep(delay)
            delay = min(delay * 2, _MAX_RETRY_DELAY)


def _user_profile(user_id, user):
    """Build the JSON-ready profile dict for *user*, keyed like `_USER_FIELDS`."""
    profile = {"user_id": user_id}
    for field in _USER_FIELDS:
        # An attribute absent from the response is stored as None rather
        # than aborting the whole run with AttributeError.
        value = getattr(user, field, None)
        if field == "created_at" and value is not None:
            # Stored as a string so json.dumps can serialise it.
            value = str(value)
        profile[field] = value
    return profile


# Look up each user and write the profile to <user_id>.json; users that do
# not exist or are suspended are logged to their CSV file and skipped.
for user_id in user_ids:
    print(f"正在查找用户ID={user_id}的用户信息...")
    user = _fetch_user(user_id)
    if user is None:
        continue
    user_dict = _user_profile(user_id, user)
    print(user_dict)
    json_str = json.dumps(user_dict, indent=4, ensure_ascii=False)
    with open(user_id + '.json', 'w', encoding='utf-8') as f2:
        f2.write(json_str)