2023 博客之星活动已经过去半年之久。出于好奇，我想看看到目前为止到底有多少人参与了。由于小助手每次只发布单个赛道的数据，无法窥其全貌并进行对比，因此我写了这个脚本来汇总分析一下。看到结果之后，很想放弃啊，太卷了。
from selenium import webdriver
import json
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time
from datetime import datetime
import pandas as pd
from openpyxl import Workbook, load_workbook
# Scrape today's BlogStar 2023 posts from the CSDN forum page and copy every
# table row they contain into a single Excel workbook ('博客之星.xlsx').

# Imported here so this script section stays self-contained: Service is how
# Selenium 4 receives an explicit chromedriver path.
from selenium.webdriver.chrome.service import Service

# Path to the chromedriver executable; leave empty to use Selenium's default
# driver lookup.
driver_path = ''

# Current Selenium 4 releases no longer accept the driver path as a positional
# argument of webdriver.Chrome, so wrap it in a Service when one is given.
if driver_path:
    driver = webdriver.Chrome(service=Service(driver_path))
else:
    driver = webdriver.Chrome()

try:
    url = 'https://bbs.csdn.net/forums/blogstar2023?typeId=3092730&spm=1001.2014.3001.9616'
    driver.get(url)
    # Fixed pause after navigation (presumably to let the page finish rendering).
    time.sleep(5)

    # Each "tab-list-item" under "user-tabs" is one post entry in the forum list.
    results = driver.find_element(By.CLASS_NAME, "user-tabs").find_elements(By.CLASS_NAME, "tab-list-item")

    data = []
    current_datetime = datetime.now()
    current_date = current_datetime.date()
    # str(date) is the ISO form YYYY-MM-DD; only titles containing it are kept.
    today_text = str(current_date)

    result_workbook = Workbook()
    result_sheet = result_workbook.active
    # Header row of the output sheet.
    result_sheet.append(['排名',"用户名","总原力值","当月获得原力值","2023年获得原力值","2023年高质量博文数"])

    # Pass 1: collect title/link of every post published today.
    for result in results:
        # NOTE(review): this per-item pause looks unnecessary because no
        # navigation happens in this loop -- confirm before removing it.
        time.sleep(5)
        # Look the wrapper up once and read both fields from it.
        wrapper = result.find_element(By.CLASS_NAME, 'content-wrapper')
        title = wrapper.find_element(By.CLASS_NAME, 'long-text-title').text
        link = wrapper.find_element(By.CLASS_NAME, 'align-items-center').get_attribute("href")
        if today_text in title:
            data.append({
                'title': title,
                'link': link,
            })
        else:
            print(f'不是今天的不做处理. 标题{title}')
    print(data)

    # Pass 2: open each collected post and copy its first table into the sheet.
    for item in data:
        print(item['link'])
        driver.get(item['link'])
        time.sleep(5)
        table_element = driver.find_element(By.CLASS_NAME, 'markdown_views').find_element(By.TAG_NAME, 'table')
        rows = table_element.find_elements(By.TAG_NAME, 'tr')
        for row in rows:
            row_data = []
            for column in row.find_elements(By.TAG_NAME, 'td'):
                cell_data = column.text
                row_data.append(cell_data)
                print(cell_data)
            # Rows without any <td> (e.g. a <th>-only header row) would
            # otherwise be written as blank lines in the sheet.
            if row_data:
                result_sheet.append(row_data)
finally:
    # Always shut the browser down, even when scraping fails part-way.
    driver.quit()

# Reached only when scraping succeeded; write the collected rows to disk.
result_workbook.save('博客之星.xlsx')
from selenium import webdriver
import json
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time
from datetime import datetime
import pandas as pd
from openpyxl import Workbook, load_workbook
# Scrape today's BlogStar 2023 posts from the CSDN forum page and copy every
# table row they contain into a single Excel workbook ('博客之星.xlsx').

# Imported here so this script section stays self-contained: Service is how
# Selenium 4 receives an explicit chromedriver path.
from selenium.webdriver.chrome.service import Service

# Path to the ChromeDriver executable; leave empty to use Selenium's default
# driver lookup.
driver_path = ''

# Create the Chrome WebDriver. Current Selenium 4 releases no longer accept
# the driver path as a positional argument, so pass it through a Service.
if driver_path:
    driver = webdriver.Chrome(service=Service(driver_path))
else:
    driver = webdriver.Chrome()

try:
    # Open the forum page.
    url = 'https://bbs.csdn.net/forums/blogstar2023?typeId=3092730&spm=1001.2014.3001.9616'
    driver.get(url)
    # Fixed pause after navigation (presumably to let the page finish rendering).
    time.sleep(5)

    # Locate the post entries: every "tab-list-item" under "user-tabs".
    results = driver.find_element(By.CLASS_NAME, "user-tabs").find_elements(By.CLASS_NAME, "tab-list-item")

    # Collected {'title', 'link'} dicts for today's posts.
    data = []

    # Current local date; str(date) is the ISO form YYYY-MM-DD, and only
    # titles containing that text are kept.
    current_datetime = datetime.now()
    current_date = current_datetime.date()
    today_text = str(current_date)

    # Create a new Excel workbook and write the header row.
    result_workbook = Workbook()
    result_sheet = result_workbook.active
    result_sheet.append(['排名',"用户名","总原力值","当月获得原力值","2023年获得原力值","2023年高质量博文数"])

    # Walk the post entries and extract title and link from each.
    for result in results:
        # NOTE(review): this per-item pause looks unnecessary because no
        # navigation happens in this loop -- confirm before removing it.
        time.sleep(5)
        # Look the wrapper up once and read both fields from it.
        wrapper = result.find_element(By.CLASS_NAME, 'content-wrapper')
        title = wrapper.find_element(By.CLASS_NAME, 'long-text-title').text
        link = wrapper.find_element(By.CLASS_NAME, 'align-items-center').get_attribute("href")
        if today_text in title:
            # Store the extracted fields as a dict in the data list.
            data.append({
                'title': title,
                'link': link,
            })
        else:
            print(f'不是今天的不做处理. 标题{title}')
    print(data)

    # Visit every collected link and scrape the table inside the post body.
    for item in data:
        print(item['link'])
        # Navigate to the post.
        driver.get(item['link'])
        time.sleep(5)
        table_element = driver.find_element(By.CLASS_NAME, 'markdown_views').find_element(By.TAG_NAME, 'table')
        # All rows of the table.
        rows = table_element.find_elements(By.TAG_NAME, 'tr')
        for row in rows:
            row_data = []
            # All <td> cells of this row.
            for column in row.find_elements(By.TAG_NAME, 'td'):
                cell_data = column.text
                row_data.append(cell_data)
                print(cell_data)
            # Rows without any <td> (e.g. a <th>-only header row) would
            # otherwise be written as blank lines in the sheet.
            if row_data:
                result_sheet.append(row_data)
finally:
    # Always close the browser driver, even when scraping fails part-way.
    driver.quit()

# Reached only when scraping succeeded; save the results to a new Excel file.
result_workbook.save('博客之星.xlsx')
太难了，卷不起啊！