【Goal】
Fetch the list of all articles under a blog without logging in, including each article's URL, title, view count, like count, and so on, so the posts can be downloaded later. The implementation uses Python with Selenium. Along the way, a JavaScript snippet is executed to hide elements (login pop-ups and the like) that interfere with the page display.
【Example】
# pip install selenium
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


def exec_js(browser, js_str):
    # Run a JavaScript snippet in the current page
    browser.execute_script(js_str)


chrome_options = Options()
browser1 = webdriver.Chrome(options=chrome_options)

url = 'https://blog.csdn.net/cdl3'
browser1.get(url)
browser1.maximize_window()
time.sleep(2)

# Hide the various login pop-ups that cover the page
exec_js(browser1, '$(".login-box").css("display","none")')
exec_js(browser1, '$(".passport-login-tip-container").css("display","none")')
exec_js(browser1, '$(".csdn-toolbar-creative-mp").css("display","none")')

# The article list sits inside the element with class "mainContent";
# each <article> holds one post's link, title, summary, view count and like count
main_content = browser1.find_element(By.CLASS_NAME, 'mainContent')
if main_content:
    article_list = main_content.find_elements(By.TAG_NAME, 'article')
    for article in article_list:
        url2 = article.find_element(By.TAG_NAME, 'a').get_attribute('href')
        title = article.find_element(By.TAG_NAME, 'h4').text
        detail = article.find_element(By.CLASS_NAME, 'blog-list-content').text
        view_num = article.find_element(By.CLASS_NAME, 'view-num').text.replace('·', '')
        dianzan_num = article.find_element(By.CLASS_NAME, 'give-like-num').text.replace('·', '')
        print(url2, title, detail, view_num, dianzan_num)

# Keep the browser open until Enter is pressed
input('Press Enter to exit....')
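A note on the hiding step: the $(...) calls above assume jQuery is available on the CSDN page. If it is not, the same pop-ups can be hidden with plain DOM APIs; a minimal sketch, reusing the selectors from the example (which may change as the site is updated), looks like this:

# Fallback: hide the same elements without relying on jQuery
hide_js = """
['.login-box', '.passport-login-tip-container', '.csdn-toolbar-creative-mp'].forEach(function (sel) {
    document.querySelectorAll(sel).forEach(function (el) { el.style.display = 'none'; });
});
"""
exec_js(browser1, hide_js)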
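Since the goal is to keep the list for later downloading, the printed fields can also be saved to a file. The sketch below writes them to a CSV; the file name articles.csv and the utf-8-sig encoding are my own choices, not part of the original example, and it assumes article_list from the code above is still in scope.

import csv

# Collect each article's fields into rows, then write them out
rows = []
for article in article_list:
    rows.append([
        article.find_element(By.TAG_NAME, 'a').get_attribute('href'),
        article.find_element(By.TAG_NAME, 'h4').text,
        article.find_element(By.CLASS_NAME, 'view-num').text.replace('·', ''),
        article.find_element(By.CLASS_NAME, 'give-like-num').text.replace('·', ''),
    ])

# utf-8-sig keeps Chinese titles readable when the CSV is opened in Excel
with open('articles.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(['url', 'title', 'views', 'likes'])
    writer.writerows(rows)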
【Result】
Running the script prints one line per article to the console: the article URL, title, summary, view count, and like count.
【Writing these posts takes effort. Please like or follow to show your support. Thank you!】