继续探索Twitter API的使用,这次获取一下Twitter的实时推文。
1、sample-stream 样本流
这是Twitter提供的代码,比较简单,只需要更改一下"bearer_token"即可使用,相对获得的数据单一,满足不了需求,下文讲解如何设定搜索规则
# -*- coding: utf-8 -*-
# @Time : 2021/11/22 10:47
# @Author:yuchuan
# @File : twitter-sample-steam.py
# @Software : PyCharm
import requests
import os
import json
# Prefer supplying the token through the environment so it never has to be
# written into this file; in your terminal run:
#   export BEARER_TOKEN='<your token>'
# The placeholder below is only used when BEARER_TOKEN is not set. A real token
# looks like "AAAAAAAAAAAAAAAAAAAAOHKRwEAAAAAXF5NOvPXXUPATBLLo*********".
bearer_token = os.environ.get("BEARER_TOKEN", 'your bearer_token')
def create_url():
    """Return the URL of the Twitter API v2 sampled-stream endpoint."""
    return "https://api.twitter.com/2/tweets/sample/stream"
def bearer_oauth(r):
    """Attach bearer-token authentication headers to an outgoing request.

    Intended to be passed as the ``auth=`` argument of a ``requests`` call.

    Args:
        r: The request object whose ``headers`` mapping is updated in place.

    Returns:
        The same request object, with ``Authorization`` and ``User-Agent`` set.
    """
    r.headers["Authorization"] = f"Bearer {bearer_token}"
    r.headers["User-Agent"] = "v2SampledStreamPython"
    return r
def connect_to_endpoint(url):
    """Open the stream at *url* and pretty-print every JSON line it delivers.

    The HTTP status code is printed first. Each non-empty line of the streamed
    body is then parsed as JSON and printed with sorted keys and 4-space
    indentation. The function returns when the server ends the stream.

    Args:
        url: Full URL of the streaming endpoint to connect to.

    Raises:
        Exception: If the endpoint answers with anything other than HTTP 200.
    """
    # The context manager guarantees the streamed connection is released even
    # when parsing fails or the caller interrupts the loop.
    with requests.request("GET", url, auth=bearer_oauth, stream=True) as response:
        print(response.status_code)
        # Check the status BEFORE iterating: once iter_lines() has consumed a
        # streamed body, response.text can no longer be read for the message.
        if response.status_code != 200:
            raise Exception(
                "Request returned an error: {} {}".format(
                    response.status_code, response.text
                )
            )
        for response_line in response.iter_lines():
            # Skip empty lines so json.loads is never fed a blank payload.
            if response_line:
                json_response = json.loads(response_line)
                print(json.dumps(json_response, indent=4, sort_keys=True))
def main():
    """Stream sampled Tweets indefinitely, reconnecting whenever a stream ends."""
    url = create_url()
    # connect_to_endpoint() returns once the stream is exhausted, so loop to
    # reconnect. Any exception it raises propagates and ends the program.
    while True:
        connect_to_endpoint(url)


if __name__ == "__main__":
    main()
简单更改过BEARER_TOKEN后,就可以得到下面这样的数据啦
{
"data": {
"id": "1462613720471900162",
"text": "RT @shinslighterr: nao mas e essa official art de shingeki eu to me mijando https://t.co/m63s6sFGsJ"
}
}
{
"data": {
"id": "1462613720442581000",
"text": "RT @Corinthians: SE BUSCA RIVAL EN AMERICA \nSE BUSCA RIVAL EN AMERICA\nSE BUSCA RIVAL EN AMERICA\nSE BUSCA RIVAL EN AMERICA\nSE BUSCA RIVAL EN\u2026"
}
}
{
"data": {
"id": "1462613720463331328",
"text": "@kimkai_kggk yuk, gwe jaga lilin yh loe muter\ud83d\ude0d"
}
}
这个样本得到的只有单纯的id,text,如果你需要其他元数据就需要进阶版的Stream Tweets了,下面就开始进阶讲解
2、Stream Tweets in real-time
1、首先讲解一下搜索内容的添加规则
详细规则可参考:
https://developer.twitter.com/en/docs/twitter-api/tweets/filtered-stream/integrate/build-a-rule
{"value": "dog has:images", "tag": "dog pictures"},
{"value": "cat has:images -grumpy", "tag": "cat pictures"}
Example :
"cat has:images -grumpy"：匹配包含关键词 "cat"、带有图片且不包含 "grumpy" 的推文
"tag": "cat pictures"：为该规则分配的标签
lang:en 仅过滤英语推文
def set_rules(delete):
    """Register the sample filter rules with the filtered-stream rules endpoint.

    Args:
        delete: Not used by this function; accepted so existing call sites
            that pass it keep working.

    Raises:
        Exception: If the rules endpoint does not answer with HTTP 201.
    """
    # You can adjust the rules if needed
    sample_rules = [
        {"value": "dog has:images", "tag": "dog pictures"},
        {"value": "cat has:images -grumpy", "tag": "cat pictures"},
    ]
    payload = {"add": sample_rules}
    response = requests.post(
        "https://api.twitter.com/2/tweets/search/stream/rules",
        auth=bearer_oauth,
        json=payload,
    )
    # Surface a rejected request instead of silently discarding the response.
    if response.status_code != 201:
        raise Exception(
            "Cannot add rules (HTTP {}): {}".format(response.status_code, response.text)
        )
    print(json.dumps(response.json()))
2、搜索内容的返回字段设定
成功连接到流之后,默认响应Tweet字段:id、text,如果要获取这之外的内容,可以通过设置fields和扩展参数来指定
https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/tweet
这是Twitter给出的参考文件,关于field 和 expansion的说明,可以根据需要自己组合
比如我需要的内容有,author.id , tweets , entities ( hashtags , urls , mentions ) , public_metrics ( like , reply , retweet )
Key | Value | Returned fields |
---|---|---|
tweet.fields | public_metrics | like,reply,retweet |
expansions | author_id | includes.users.id, includes.users.name, includes.users.username |
tweet.fields | entities | hashtags,urls,mentions |
根据需要的扩展部分参数设定url,就可以获得相应的返回字段。
tweet_fields = "tweet.fields=public_metrics,entities"
# Bind the assembled URL to a name; as a bare expression its value was discarded.
url = "https://api.twitter.com/2/tweets/search/stream?{}&expansions=author_id".format(tweet_fields)
def get_stream(set):
    """Connect to the filtered stream and pretty-print every JSON line received.

    The request asks for the ``public_metrics`` and ``entities`` Tweet fields
    plus the ``author_id`` expansion. The HTTP status code is printed first;
    each non-empty line of the body is then parsed as JSON and printed with
    sorted keys and 4-space indentation.

    Args:
        set: Not used by this function; accepted for call-site compatibility.
            (The name shadows the builtin but is part of the signature.)

    Raises:
        Exception: If the stream endpoint does not answer with HTTP 200.
    """
    tweet_fields = "tweet.fields=public_metrics,entities"
    url = "https://api.twitter.com/2/tweets/search/stream?{}&expansions=author_id".format(tweet_fields)
    # The context manager releases the streamed connection on every exit path.
    with requests.get(url, auth=bearer_oauth, stream=True) as response:
        print(response.status_code)
        if response.status_code != 200:
            raise Exception(
                "Cannot get stream (HTTP {}): {}".format(
                    response.status_code, response.text
                )
            )
        for response_line in response.iter_lines():
            # Skip empty lines so json.loads is never fed a blank payload.
            if response_line:
                json_response = json.loads(response_line)
                print(json.dumps(json_response, indent=4, sort_keys=True))
3.总结一下
①set_rules(requests.post)设定搜索内容
②get_stream(requests.get)设定获取内容
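发现了一个问题：我在设计搜索规则的时候debug了好几次，每一次更改规则后，都是在原来关键词的stream中增加新关键词的stream。原因是通过requests.post添加的规则会一直保存在Twitter服务器端，每次运行只会追加新规则，而不会覆盖旧规则；如果想从头开始，需要先查询已有规则并按id删除（即下面代码中被注释掉的get_rules和delete_all_rules），再调用set_rules。如果不删除旧规则，爬出的内容就要进行过滤保存，过滤时精确一点即可。这个搜索出来的内容应该是不占用Twitter api中每月限制爬取数额的（是否计入额度请以官方Tweet caps文档为准）。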
# -*- coding: utf-8 -*-
# @Time : 2021/11/22 15:28
# @Author:yuchuan
# @File : twitter_stream_tweets.py
# @Software : PyCharm
import requests
import os
import json
# Prefer supplying the token through the environment so it never has to be
# written into this file; in your terminal run:
#   export BEARER_TOKEN='<your token>'
# Falls back to the empty string when BEARER_TOKEN is not set. A real token
# looks like 'AAAAAAAAAAAAAAAAAAAAAOHKRwEA**********'.
bearer_token = os.environ.get("BEARER_TOKEN", '')
def bearer_oauth(r):
    """Attach bearer-token authentication headers to an outgoing request.

    Intended to be passed as the ``auth=`` argument of a ``requests`` call.

    Args:
        r: The request object whose ``headers`` mapping is updated in place.

    Returns:
        The same request object, with ``Authorization`` and ``User-Agent`` set.
    """
    r.headers["Authorization"] = f"Bearer {bearer_token}"
    r.headers["User-Agent"] = "v2FilteredStreamPython"
    return r
#
# def get_rules():
# response = requests.get(
# "https://api.twitter.com/2/tweets/search/stream/rules", auth=bearer_oauth
# )
# if response.status_code != 200:
# raise Exception(
# "Cannot get rules (HTTP {}): {}".format(response.status_code, response.text)
# )
# print(json.dumps(response.json()))
# return response.json()
#
#
# def delete_all_rules(rules):
# if rules is None or "data" not in rules:
# return None
#
# ids = list(map(lambda rule: rule["id"], rules["data"]))
# payload = {"delete": {"ids": ids}}
# response = requests.post(
# "https://api.twitter.com/2/tweets/search/stream/rules",
# auth=bearer_oauth,
# json=payload
# )
# if response.status_code != 200:
# raise Exception(
# "Cannot delete rules (HTTP {}): {}".format(
# response.status_code, response.text
# )
# )
# print(json.dumps(response.json()))
#
#
def set_rules():
    """Add the filter rules to the filtered-stream rules endpoint.

    Posts a single rule matching the keyword "Russia" and prints the JSON body
    of the API's answer.

    Raises:
        Exception: If the rules endpoint does not answer with HTTP 201.
    """
    # You can adjust the rules if needed
    sample_rules = [
        {"value": "Russia"},
    ]
    payload = {"add": sample_rules}
    response = requests.post(
        "https://api.twitter.com/2/tweets/search/stream/rules",
        auth=bearer_oauth,
        json=payload,
    )
    if response.status_code != 201:
        raise Exception(
            "Cannot add rules (HTTP {}): {}".format(response.status_code, response.text)
        )
    print(json.dumps(response.json()))
def get_stream(set):
    """Connect to the filtered stream and pretty-print every JSON line received.

    The request asks for the ``public_metrics`` and ``entities`` Tweet fields
    plus the ``author_id`` expansion. The HTTP status code is printed first;
    each non-empty line of the body is then parsed as JSON and printed with
    sorted keys and 4-space indentation.

    Args:
        set: Not used by this function; accepted for call-site compatibility.
            (The name shadows the builtin but is part of the signature.)

    Raises:
        Exception: If the stream endpoint does not answer with HTTP 200.
    """
    tweet_fields = "tweet.fields=public_metrics,entities"
    url = "https://api.twitter.com/2/tweets/search/stream?{}&expansions=author_id".format(tweet_fields)
    # The context manager releases the streamed connection on every exit path.
    with requests.get(url, auth=bearer_oauth, stream=True) as response:
        print(response.status_code)
        if response.status_code != 200:
            raise Exception(
                "Cannot get stream (HTTP {}): {}".format(
                    response.status_code, response.text
                )
            )
        for response_line in response.iter_lines():
            # Skip empty lines so json.loads is never fed a blank payload.
            if response_line:
                json_response = json.loads(response_line)
                print(json.dumps(json_response, indent=4, sort_keys=True))
def _delete_existing_rules():
    """Delete every rule currently registered on the filtered stream.

    Raises:
        Exception: If listing or deleting the rules does not return HTTP 200.
    """
    rules_url = "https://api.twitter.com/2/tweets/search/stream/rules"
    response = requests.get(rules_url, auth=bearer_oauth)
    if response.status_code != 200:
        raise Exception(
            "Cannot get rules (HTTP {}): {}".format(response.status_code, response.text)
        )
    rules = response.json()
    # No "data" key means there is nothing to delete.
    if not rules or "data" not in rules:
        return
    ids = [rule["id"] for rule in rules["data"]]
    response = requests.post(
        rules_url,
        auth=bearer_oauth,
        json={"delete": {"ids": ids}},
    )
    if response.status_code != 200:
        raise Exception(
            "Cannot delete rules (HTTP {}): {}".format(
                response.status_code, response.text
            )
        )


def main():
    """Reset the stream rules, add the current ones, then print the stream."""
    # Rules added through the rules endpoint stay registered between runs, so
    # without this cleanup every edited rule is matched in addition to the
    # earlier ones instead of replacing them.
    _delete_existing_rules()
    added = set_rules()
    get_stream(added)


if __name__ == "__main__":
    main()
本人创建了一个公众号,分享科研路上的小问题,新发现,欢迎关注公众号,给我留言!!!
一起奋发向上,攻克难题吧~~