API · Reddit 阅读说明
OAuth2 · Reddit create an app,得到client id和secret
acquire a token
import requests
import requests.auth
# HTTP Basic auth carries the app's client id and secret.
client_auth = requests.auth.HTTPBasicAuth('client_id','secret')
# "password" grant: the account's username/password are exchanged for a token.
post_data = {"grant_type": "password", "username": "XXX", "password": "XXX"}
headers = {"User-Agent": "ChangeMeClient/0.1 by YourUsername"}
# requests never times out on its own, so bound the call explicitly to keep
# the script from hanging forever on a stalled connection.
response = requests.post(
    "https://www.reddit.com/api/v1/access_token",
    auth=client_auth,
    data=post_data,
    headers=headers,
    timeout=10,
)
print(response.json())
得到结果(会过期)
{'access_token': 'XXX', 'token_type': 'bearer', 'expires_in': 3600, 'scope': '*'}
参数 | 类型 | 描述 |
---|---|---|
id | int | 唯一标识 |
url | varchar(255) | |
url_md5 | varchar(255) | |
title | varchar(255) | |
author | varchar(255) | |
created_utc | datetime | |
selftext | text | 本身的文本或跳转到的超链接 |
score | int | 得分 |
num_comments | int | 评论的数量 |
upvote_ratio | float | up的比例 |
需要注意的是,在标题、文本等地方可能出现表情符号,所以需要代码和数据库统一编码为utf8mb4
PRAW: The Python Reddit API Wrapper
PRAW是一个reddit API的包装器,提供了使用API的接口。
爬虫:
class RedditSpider(scrapy.Spider):
    """Scrapy spider that collects r/dapps submissions through PRAW.

    The Scrapy request to ``start_urls`` only triggers ``parse``; the
    submission data itself is read through a PRAW client, and the raw body of
    the Scrapy response is attached to every yielded item.
    """

    name = "reddit"
    allowed_domains = ["reddit.com"]
    start_urls = [
        "https://www.reddit.com"
    ]

    def parse(self, response):
        """Yield one ``RedditItem`` per submission in the r/dapps "new" listing.

        Args:
            response: the Scrapy response for the start URL; its body is stored
                in ``item['html']``.
        """
        # Log in with the client id and secret.
        reddit = praw.Reddit(client_id='XXX', client_secret='XXX',
                             grant_type='client_credentials',
                             user_agent='mytestscripts/1.0')

        # A single submission can also be looked up by id, e.g.:
        #     sub = reddit.submission(id='9klf7s')
        #     print(sub.title)
        #     pprint.pprint(vars(sub))
        #
        # subreddit.stream.submissions() can be used to watch a subreddit for
        # newly created posts:
        #     subreddit = reddit.subreddit('dapps')
        #     for sub in subreddit.stream.submissions():

        # limit=None asks for all available posts (the default limit is 100).
        # The set of attributes returned may differ from one fetch to the next.
        subs = reddit.subreddit('dapps').new(limit=None)
        for sub in subs:
            item = RedditItem()
            item['html'] = response.body

            # permalink is only the site-relative path of the post; join it
            # with the site domain to build the full link.
            url = 'https://{}{}'.format(self.allowed_domains[0], sub.permalink)
            item['url'] = url
            # ...... (further field assignments are elided in the original
            # write-up; presumably url_md5/title, which the pipeline's insert
            # statement expects -- confirm)

            # The author may be missing, in which case sub.author is None.
            redditor = sub.author
            item['author'] = redditor.name if redditor is not None else ""

            # sub.created_utc is a UTC timestamp; convert it to a datetime.
            item['created_time'] = datetime.datetime.utcfromtimestamp(sub.created_utc)

            # If the post is just a hyperlink, sub.selftext is empty, so the
            # target URL is stored instead.
            item['selftext'] = sub.selftext
            if not sub.is_self:
                item['selftext'] = sub.url
            # ...... (further field assignments are elided in the original
            # write-up; presumably score/num_comments/upvote_ratio -- confirm)

            yield item
pipeline:
# Parameterized statements: the database driver quotes and escapes every value
# bound to a %s placeholder, so no manual escaping or string formatting of
# scraped (untrusted) text is needed.
html_insert = '''insert into reddit_dapps_html(html) values(%s)'''
reddit_insert = '''insert into reddit_dapps(url, url_md5, title, author,
created_time, selftext, score, num_comments, upvote_ratio)
values(%s, %s, %s, %s,
%s, %s, %s, %s, %s)'''

def process_item(self, item, spider):
    """Normalize the scraped fields of *item* and insert it into the database.

    The raw HTML goes into ``reddit_dapps_html`` and the submission fields go
    into ``reddit_dapps``. Values are passed to ``cursor.execute`` as bound
    parameters instead of being formatted into the SQL text.

    Args:
        item: mapping with the keys ``html``, ``url``, ``url_md5``, ``title``,
            ``author``, ``created_time``, ``selftext``, ``score``,
            ``num_comments`` and ``upvote_ratio``.
        spider: the spider that produced the item (unused).

    Returns:
        The same *item*, with ``html``, ``created_time`` and ``selftext``
        normalized in place.
    """
    html = item['html']
    if html:
        # The spider stores the raw response body (bytes); keep it as text.
        item['html'] = html.strip().decode(encoding="utf-8")
    # ...... (further field clean-up is elided in the original write-up)

    # Format the creation time as a string.
    created_time = item['created_time']
    if created_time:
        item['created_time'] = created_time.strftime("%Y-%m-%d %H:%M:%S")
    selftext = item['selftext']
    if selftext:
        # NOTE(review): as rendered, the second replace() swaps a space for a
        # space; the original argument may have been a non-breaking space or
        # "&nbsp;" lost in extraction -- confirm.
        item['selftext'] = selftext.replace('\n', '').replace(' ', ' ')
    # ...... (further field clean-up is elided in the original write-up)

    self.cursor.execute(self.html_insert, (item['html'],))
    # Numeric fields (score etc.) are bound as-is; the driver converts them.
    self.cursor.execute(self.reddit_insert, (
        item['url'],
        item['url_md5'],
        item['title'],
        item['author'],
        item['created_time'],
        item['selftext'],
        item['score'],
        item['num_comments'],
        item['upvote_ratio'],
    ))
    return item
def open_spider(self, spider):
    """Open the MySQL connection and the cursor used by ``process_item``.

    Connection parameters are read from ``self.settings`` (``MYSQL_HOST``,
    ``MYSQL_PORT``, ``MYSQL_DBNAME``, ``MYSQL_USER``, ``MYSQL_PASSWD``).

    Args:
        spider: the spider being opened (unused).
    """
    # NOTE(review): self.settings is not assigned anywhere in the visible
    # code; presumably it is set when the pipeline is constructed -- confirm.
    # Connect to the database with the 'utf8mb4' character set, so that
    # emoji in titles and post text can be stored.
    self.connect = pymysql.connect(
        host=self.settings.get('MYSQL_HOST'),
        port=self.settings.get('MYSQL_PORT'),
        db=self.settings.get('MYSQL_DBNAME'),
        user=self.settings.get('MYSQL_USER'),
        passwd=self.settings.get('MYSQL_PASSWD'),
        charset='utf8mb4',
        use_unicode=True)
    # process_item executes its inserts through self.cursor, so create it here.
    self.cursor = self.connect.cursor()
    # NOTE(review): no commit is visible in this write-up; make sure the
    # inserts are committed somewhere (e.g. when the spider closes).
数据库字符集的设定
oauth token
访问
Many endpoints on reddit use the same protocol for controlling pagination and filtering. These endpoints are called Listings and share five common parameters: `after` / `before`, `limit`, `count`, and `show`.
Listings do not use page numbers because their content changes so frequently. Instead, they allow you to view slices of the underlying data. Listing JSON responses contain `after` and `before` fields which are equivalent to the “next” and “prev” buttons on the site and in combination with `count` can be used to page through the listing.
The common parameters are as follows:
- `after` / `before` - only one should be specified. these indicate the fullname of an item in the listing to use as the anchor point of the slice.
- `limit` - the maximum number of items to return in this slice of the listing.
- `count` - the number of items already seen in this listing. on the html site, the builder uses this to determine when to give values for `before` and `after` in the response.
- `show` - optional parameter; if `all` is passed, filters such as “hide links that I have voted on” will be disabled.
To page through a listing, start by fetching the first page without specifying values for `after` and `count`. The response will contain an `after` value which you can pass in the next request. It is a good idea, but not required, to send an updated value for `count` which should be the number of items already fetched.
# Placeholder value: substitute the token_type and access_token returned by
# the token request.
# NOTE(review): no User-Agent header is sent here, unlike the token request;
# Reddit's API rules ask for a descriptive one -- consider adding it.
slice_headers = {'Authorization':'token_type access_token'}
print(slice_headers)
params = {'limit':'1'}  # maximum number of items returned per request
count = 1
# Fetch at most two slices of the listing (count = 1, 2).
while count < 3:
    # Bound the request so a stalled connection cannot hang the loop.
    response = requests.get("https://oauth.reddit.com/r/dapps/new",
                            headers=slice_headers, params=params, timeout=10)
    print(response.status_code)
    # The status code must be checked: only a 200 response carries a listing.
    if response.status_code != 200:
        print("null")
        break
    response_json = response.json()
    # The submission data is in json['data']['children'] of the response.
    for child in response_json['data']['children']:
        print("submission json:",child)
        # permalink is site-relative. The domain is written out because no
        # `self` exists in this stand-alone script; it matches the spider's
        # allowed_domains[0].
        url = 'https://{}{}'.format('reddit.com', child['data']['permalink'])
        print("url:", url)
        print("title:", child['data']['title'])
        print("author:", child['data']['author'])
        print("created time:", datetime.datetime.utcfromtimestamp(child['data']['created_utc']))
        # A link post has no text of its own; show its target URL instead.
        if not child['data']['is_self']:
            print("self text:", child['data']['url'])
        else:
            print("self text:", child['data']['selftext'])
        print("score: ", child['data']['score'])
        print("num comments:", child['data']['num_comments'])
    # 'after' anchors the next slice; None means the listing is exhausted.
    after = response_json['data']['after']
    if after is None:
        break
    params = {'limit':'1', 'after':after}
    # Advance the page counter; without this the `count < 3` bound never
    # takes effect and the loop runs until the listing is exhausted.
    count += 1