CNKI 论文爬取自动下载,基于 Selenium

CNKI 论文爬取自动下载,基于 Selenium

import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
def _click_page_downloads(driver, rows=20):
    """Click the download icon in each of the first *rows* rows of the result table.

    Propagates whatever Selenium raises when a row or its download icon
    cannot be located; the caller decides how to handle that.
    """
    for row in range(1, rows + 1):
        # Re-select the results window (second window handle) before every
        # lookup, exactly as the original per-row flow did.
        driver.switch_to.window(driver.window_handles[1])
        icon_xpath = '//*[@id="gridTable"]/table/tbody/tr[' + str(row) + ']/td[9]/a[1]/i'
        icon = driver.find_element(By.XPATH, icon_xpath)
        # Trigger the click through JavaScript instead of a native click.
        driver.execute_script("arguments[0].click();", icon)
        time.sleep(1)


def download(key, page, download_dir=None):
    """Log in via the school portal, search CNKI for *key* and click every download link.

    The first result page is always processed; pages 2 through *page*
    (inclusive) are then opened one by one via their ``page<N>`` element.

    Args:
        key: Search keyword typed into the CNKI search box.
        page: Number of result pages to download from.
        download_dir: Directory Chrome should save files to. When omitted,
            the module-level ``path`` set by the script entry point is used,
            which keeps existing ``download(key, page)`` callers working.

    Raises:
        Any Selenium exception raised while logging in or searching. Errors
        during the download loop itself are printed and not re-raised.
    """
    url = "https://web.bit.edu.cn/login"  # school login page
    username = ""  # account name
    password = ""  # password
    if download_dir is None:
        download_dir = path

    opt = webdriver.ChromeOptions()
    prefs = {
        # 0 disables the download pop-up window
        'profile.default_content_setting.popups': 0,
        # directory downloaded files are written to
        'download.default_directory': download_dir,
    }
    opt.add_experimental_option('prefs', prefs)

    driver = webdriver.Chrome(options=opt)
    try:
        driver.get(url)
        time.sleep(2)  # wait for the page to load
        driver.find_element(By.ID, 'cas_login').click()
        time.sleep(1)
        driver.find_element(By.ID, 'username').send_keys(username)
        driver.find_element(By.ID, 'password').send_keys(password)
        driver.find_element(By.ID, 'login_submit').click()
        time.sleep(1)

        # Locate the CNKI entry on the portal page and click it.
        zhiwang = driver.find_element(
            By.XPATH,
            '/html/body/div[1]/div[2]/div[2]/div[4]/div[18]/div/div[1]/div/span')
        driver.execute_script("arguments[0].click();", zhiwang)
        time.sleep(0.5)
        # Continue in the most recently opened window.
        driver.switch_to.window(driver.window_handles[-1])

        # Type the keyword into the search box and press the search button.
        driver.find_element(By.ID, 'txt_SearchText').send_keys(key)
        search = driver.find_element(
            By.XPATH, '/html/body/div[1]/div[2]/div/div[1]/input[2]')
        driver.execute_script("arguments[0].click();", search)
        time.sleep(1)  # wait for the results to load

        try:
            # First result page.
            _click_page_downloads(driver)
            time.sleep(1)
            # Remaining pages; the upper bound is inclusive so that asking
            # for N pages really visits N pages.
            for page_no in range(2, page + 1):
                driver.find_element(By.ID, 'page' + str(page_no)).click()
                _click_page_downloads(driver)
                time.sleep(1)
        except Exception as e:
            # Best effort: report the failure and keep what was downloaded.
            print(e)
        # Pause before shutting the browser down, as the original did.
        time.sleep(5)
    finally:
        # quit() ends the whole session (all windows plus the driver
        # process); close() would only close the current window.
        driver.quit()
if __name__ == '__main__':
    # Ask for the search keyword and the number of result pages.
    keyword = input("请输入要搜索的关键词:")
    page_count = int(input("请输入要下载的页数:"))
    # Per-keyword download folder. It must stay bound to the module-level
    # name `path`, because download() reads that global.
    path = '\\'.join((r'F:\论文下载', keyword))
    if not os.path.exists(path):
        os.makedirs(path)
    # Print any error from the run instead of letting it propagate.
    try:
        download(keyword, page_count)
    except Exception as err:
        print(err)

你可能感兴趣的:(python编程,selenium,python,chrome)