大家好,我是空空star,本篇给大家分享一下
《通过Python+Selenium获取我的所有文章质量分》
。
目前在质量分查询界面只能一篇文章一篇文章查,如果我们想快速知道我们哪些文章质量分不是高质量,手动一篇一篇查太慢,借助Selenium代替手动,快速查出我们所有文章的质量分,然后再保存成excle。这样我们就可以很直观得看到哪些文章不是80分以上。
当然,也可以直接通过质量分接口去查询,接口做了HMAC认证,感兴趣的同学可以尝试。
浏览器:本篇使用的是Chrome
Chrome驱动版本:110.0.5481.77
Python版本:Python3.8
selenium版本: 4.8.2
Selenium基础篇之环境准备
def get_article_total(username):
url = 'https://blog.csdn.net/community/home-api/v1/get-tab-total'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763'
}
params = {'username': username}
response = requests.get(url, params=params, headers=headers)
return response.json()['data']['blog']
通过请求参数,可以看到每次最多返回20个,那么翻页数就是:
max_page = math.ceil(blog_total/20)
def get_article_list(username):
article_list = []
url = 'https://blog.csdn.net/community/home-api/v1/get-business-list'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763'
}
blog_total = get_article_total(username)
max_page = math.ceil(blog_total/20)
for page in range(1, max_page+1):
params = {'page': page, 'size': 20, 'businessType': 'blog', 'username': username}
response = requests.get(url, params=params, headers=headers)
for item in response.json()['data']['list']:
article_list.append(item['url'])
return article_list
def save_excle(filename,post_time_list,title_list,score_list,remark_list,article_list):
data = {'作者-发布时间': post_time_list, '文章标题': title_list, '质量分': score_list, '分数说明': remark_list, '文章url': article_list}
df = pd.DataFrame(data)
writer = pd.ExcelWriter(filename+'.xlsx')
df.to_excel(writer, index=False)
writer.save()
def get_score(username):
post_time_list = []
article_list = get_article_list(username)
title_list = []
score_list = []
remark_list = []
cnt = 0
for blog_url in article_list:
cnt += 1
driver.find_elements(By.CLASS_NAME, 'el-input__inner')[0].send_keys(blog_url)
print(f'开始进行第{cnt}次查询')
driver.find_elements(By.CLASS_NAME, 'trends-input-box-btn')[0].click()
time.sleep(1)
texts = driver.find_elements(By.TAG_NAME, 'p')
title = driver.find_elements(By.TAG_NAME, 'span')[3].text
post_time = driver.find_elements(By.TAG_NAME, 'span')[4].text
post_time_list.append(post_time)
title_list.append(title)
score_list.append(texts[1].text)
remark_list.append(texts[2].text)
print('查询完毕!开始写入excle。')
save_excle(username, post_time_list, title_list, score_list, remark_list, article_list)
print('写入excle完毕!')
只需要把username改成你的即可
if __name__ == '__main__':
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
driver.get('https://www.csdn.net/qc')
driver.maximize_window()
get_score('weixin_38093452')
driver.quit()
import math
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
import requests
import pandas as pd