知网论文抓取selenium

import os
import re
import time

import xlwt
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# import numpy
# import pymysql
class cnki():
    """Scrape paper PDFs from CNKI (China National Knowledge Infrastructure).

    Workflow: open the advanced-search page, switch to the professional
    search tab, submit a search expression, then walk every result page
    (20 hits per page) and click each article's PDF-download button when
    one is present.
    """

    def __init__(self):
        # Selenium 4 removed the `executable_path` keyword; a Service
        # object is the supported way to point at a local chromedriver.
        # Raw string so the backslashes are not treated as escapes.
        self.driver = webdriver.Chrome(
            service=Service(r'D:\chromweb\chromedriver.exe'))
        # Generous 300 s timeout: CNKI pages can be slow to render.
        self.wait = WebDriverWait(self.driver, 300)
        self.driver.maximize_window()

    def get_info(self, name1, name2):
        """Search CNKI with `name1` and download every result's PDF.

        name1 -- professional-search expression, e.g. "TI='...'".
        name2 -- human-readable label used only in progress messages.
        """
        try:
            self.driver.get('https://kns.cnki.net/kns8/AdvSearch?dbprefix=SCDB&&crossDbcodes=CJFQ%2CCDMD%2CCIPD%2CCCND%2CCISD%2CSNAD%2CBDZK%2CCJFN%2CCCJD')
            # Explicit wait, then open the professional-search tab.
            self.wait.until(EC.element_to_be_clickable(
                (By.XPATH, '//ul[@class="search-classify-menu"]/li[4]'))).click()
            # Locate the query textarea (renamed from `input`, which
            # shadowed the builtin) and type the search expression.
            search_box = self.wait.until(
                EC.presence_of_element_located(
                    (By.XPATH, '//textarea[@class="textarea-major ac_input"]'))
            )
            search_box.clear()
            search_box.send_keys(name1)
            # Submit the search.
            self.wait.until(
                EC.element_to_be_clickable((By.XPATH, '//input[@class="btn-search"]'))
            ).click()

            time.sleep(3)
            total = self.driver.find_element(
                By.XPATH, '//*[@id="countPageDiv"]/span/em').text
            print(name2 + "一共有" + total + "条数据")
            # The hit count may contain thousands separators; keep digits only.
            total = re.sub(r"\D", "", total)
            # Ceiling division at 20 results per page. The original
            # `// 20 + 1` over-counted by one page whenever the total
            # was an exact multiple of 20.
            page = (int(total) + 19) // 20
            print('一共有{}'.format(page) + '页文章')
            a = 1  # running counter of processed articles
            for p in range(page):
                for i in range(1, 21):
                    # Raises if row `i` is absent; caught by the outer
                    # handler (same behavior as the original lookup).
                    link = self.driver.find_element(
                        By.XPATH,
                        '//*[@id="gridTable"]/table/tbody/tr[%d]/td[2]/a' % i)
                    print(link)
                    flag1 = self.isElementExist(
                        '//*[@id="gridTable"]/table/tbody/tr[%d]/td[2]' % i)
                    if flag1:
                        # Scroll the row to the top of the viewport so
                        # the click lands on a visible element.
                        self.driver.execute_script(
                            "arguments[0].scrollIntoView();", link)
                        time.sleep(3)
                        # Click via ActionChains to mimic a real user.
                        actions = ActionChains(self.driver)
                        actions.move_to_element(link)
                        actions.click(link)
                        actions.perform()

                        time.sleep(10)
                        # The article opens in a new tab; switch to it.
                        windows = self.driver.window_handles
                        self.driver.switch_to.window(windows[-1])
                        time.sleep(3)
                        # Download the PDF if a download button exists.
                        try:
                            flag2 = self.isElementExist('//*[@id="pdfDown"]')
                            if flag2:
                                pdf = self.driver.find_element(
                                    By.XPATH, '//*[@id="pdfDown"]')
                                self.driver.execute_script(
                                    "arguments[0].scrollIntoView();", pdf)
                                time.sleep(3)
                                self.wait.until(EC.presence_of_element_located(
                                    (By.XPATH, '//*[@id="pdfDown"]'))).click()
                            else:
                                print('错误')
                        except Exception as exc:
                            print(exc)
                        time.sleep(10)
                        # Close the article tab, return to the result list.
                        self.driver.close()
                        time.sleep(5)
                        self.driver.switch_to.window(windows[0])
                        print("-----正在爬取--" + name2 + '--药品的第'
                              + str(int(p) + 1) + '页' + str(a) + "条数据------")
                        a = a + 1
                    else:
                        print('cw')
                        break

                # Advance to the next result page if the button exists.
                flag3 = self.isElementExist('//*[@id="PageNext"]')
                if flag3:
                    time.sleep(10)
                    next_page = self.driver.find_element(
                        By.XPATH, '//*[@id="PageNext"]')
                    self.driver.execute_script(
                        "arguments[0].scrollIntoView();", next_page)
                    self.wait.until(EC.element_to_be_clickable(
                        (By.XPATH, '//*[@id="PageNext"]'))).click()
                    time.sleep(15)
                else:
                    break
        except Exception as exc:
            print(exc)

    def isElementExist(self, element):
        """Return True if an element matching the XPath `element` exists."""
        try:
            self.driver.find_element(By.XPATH, element)
            return True
        except NoSuchElementException:
            # Narrowed from a bare `except:` — only "not found" means False.
            return False

if __name__ == '__main__':
    # Professional-search expression plus a human-readable label that is
    # only used in progress messages.
    query_expr = "TI='经济与环境保护'"
    label = '经济'
    scraper = cnki()
    scraper.get_info(query_expr, label)

注释在里面都有写,大家可以自行观看,有需要交流的地方可以评论!

你可能感兴趣的:(selenium,爬虫,selenium)