python+selenium爬取某网专利数据

记录一下爬某网专利数据经历~~

新的这个某网页面,检索数据是在主页面嵌套了一个iframe页面,直接通过页面爬的话,还不能直接拿到数据,本次经历主要有以下几个麻烦点:

1、搜索的结果,最多只有300页数据,每页20条,如果需要爬的数据量大的话,只能按时间区间一点一点爬。

2、不能通过URL或者其他参数进行直接翻页,如果中途报错,需重新一页一页翻。

3、无法通过xpath定位到每页显示值进行设置每页显示数量(反正我定位一直报错)。

4、xpath定位验证码,中途偶尔会获取不到验证码,报错。

5、可以通过页面源码找到嵌套的iframe内具体嵌套的table地址,页面可以打开,但是爬虫获取不到页面数据,应该是某些参数的问题,同时中途遇到验证码时,无法提交验证码。

下面直接附源码:


import requests

from lxml import etree
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd
import ddddocr
from PIL import Image
import datetime

def get_html(start_page=16, max_page=83, date_from='1992-05-16', date_to='1992-05-24'):
    """Crawl patent search results with Selenium, one page at a time.

    The result table lives inside an iframe, pagination cannot be driven by
    URL parameters, and a captcha may appear at any moment, so the whole run
    is driven through a real browser session.

    Args:
        start_page: first page whose rows are actually saved. Pages before it
            are only clicked through, which lets a crashed run resume
            (pagination cannot jump directly to page N).
        max_page: last page to visit (the site caps any search at 300 pages).
        date_from: publication-date window start typed into the search form.
        date_to: publication-date window end; narrow windows keep the result
            count under the site's page cap.
    """
    url = "*************************"
    # Paste the real search-page URL here.

    # Raw string: the plain literal "C:\Program Files\..." contains \P, \G,
    # \C and \A, which are invalid escape sequences (a syntax error on
    # modern Python versions).
    driver = webdriver.Chrome(r"C:\Program Files\Google\Chrome\Application\chromedriver.exe")
    driver.get(url)

    qzsj = driver.find_element("id", "publishdate_from")
    jssj = driver.find_element("id", "publishdate_to")
    js = driver.find_element(By.CLASS_NAME, "buttOther")

    # The date inputs do not accept send_keys reliably, so set their values
    # through JavaScript instead.
    driver.execute_script("arguments[0].value= '" + date_from + "'", qzsj)
    driver.execute_script("arguments[0].value= '" + date_to + "'", jssj)
    js.click()
    time.sleep(3)

    # size = driver.find_element('xpath', '//*[@id="id_grid_display_num"]/a[3]/font')
    # The per-page size control could never be located (locator kept for
    # reference), so the site default of 20 rows per page is used.
    # size.click()
    iframe = driver.find_element("xpath", "//*[@id='iframeResult']")
    driver.switch_to.frame(iframe)

    page = 1
    while page <= max_page:
        print("第" + str(page) + "页")

        data_list = []
        links = driver.find_elements(By.CLASS_NAME, "fz14")
        print("这是list-----" + str(links))
        # An empty result list means a captcha page has replaced the table;
        # keep solving captchas until the rows come back.
        while not links:
            get_yzm(driver)
            links = driver.find_elements(By.CLASS_NAME, "fz14")
            print("这是list-----" + str(links))

        if page >= start_page:  # skip pages already saved before a crash
            for link in links:
                href = link.get_attribute("href")
                param = "dbcode" + href.split("&dbcode")[1]
                new_url = "************************" + param
                # Paste the real detail-page URL prefix here.
                print(new_url)
                data_list.append(get_data(new_url))
            time.sleep(1)
            print(str(data_list))
            # Append (mode='a') so a resumed run extends the same CSV file.
            df = pd.DataFrame(data_list)
            df.to_csv('专利数据_92.csv', mode='a', index=False, header=False, encoding='gb18030')
        time.sleep(1)

        if page == 1:
            # On page 1 the "next page" link sits at a different position
            # in the pager than on every subsequent page.
            next_btn = driver.find_element('xpath', '//*[@id="ctl00"]/table/tbody/tr[3]/td/table/tbody/tr/td/div/a[9]')
            next_btn.click()
        else:
            try:
                next_btn = driver.find_element('xpath', '//*[@id="ctl00"]/table/tbody/tr[3]/td/table/tbody/tr/td/div/a[11]')
                next_btn.click()
            except Exception:
                # No "next page" link means this was the last page: stop
                # instead of re-scraping the same page until max_page.
                print("已完成当前数据段采集")
                break

        page = page + 1
    print("当前段数据爬取完成")



def get_data(url):
    """Fetch one patent detail page and return its fields as a flat list.

    Args:
        url: absolute URL of the patent detail page.

    Returns:
        [url, title, *funding paragraphs, claims?, abstract?] — the last two
        entries are appended only when the page actually contains them.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
    }
    # Timeout so a single stalled detail page cannot hang the whole crawl.
    response = requests.get(url, headers=header, timeout=30)
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    # Page titles look like "<patent name>-<site name>"; keep the name part.
    title = soup.find("title").text.split('-')[0]
    content = soup.find_all('p', attrs={'class': 'funds'})
    row = [url, title]  # renamed from 'list' to avoid shadowing the builtin
    for item in content:
        row.append(item.text)
    try:
        zqx = soup.find('div', attrs={'class': 'claim-text'}).text  # claims
        row.append(zqx)
    except Exception as e:
        # Pages without a claims block: find() returns None -> AttributeError.
        print(e)
    try:
        zy = soup.find('div', attrs={'class': 'abstract-text'}).text  # abstract
        row.append(zy)
    except Exception as e:
        print(e)

    return row


def get_yzm(driver):
    """Solve the captcha that interrupts the crawl mid-run.

    Scrolls the page top into view, screenshots the captcha image, runs it
    through ddddocr, fills the recognized text in via JavaScript and submits
    the form. If OCR produces nothing, the whole routine retries once more
    before any submit happens.
    """
    element = driver.find_element('xpath', '/html/body')
    driver.execute_script("return arguments[0].scrollIntoView(true);", element)
    try:
        img = driver.find_element('xpath', '//*[@id="CheckCodeImg"]')
    except Exception:
        # The captcha image is sometimes not rendered yet; wait and retry once.
        time.sleep(3)
        img = driver.find_element('xpath', '//*[@id="CheckCodeImg"]')
    img.screenshot('2.png')
    with open('2.png', 'rb') as f:
        img_bytes = f.read()
    ocr = ddddocr.DdddOcr()
    res = ocr.classification(img_bytes)

    # Bug fix: the original compared the WebElement object itself to ""/None,
    # which is always False, so the empty-OCR retry never fired. Check the
    # OCR result string instead.
    if not res:
        get_yzm(driver)
        return  # the recursive call already submitted; avoid a double click

    code_input = driver.find_element('id', 'CheckCode')  # renamed: 'input' shadows the builtin
    button = driver.find_element('xpath', '/html/body/p[1]/input[2]')
    driver.execute_script("arguments[0].value= '" + res.upper() + "'", code_input)
    print("---" + str(code_input) + "---")
    button.click()

def main():
    """Script entry point: start the crawl for the configured date window."""
    get_html()


if __name__ == '__main__':
    main()

第一次发,不喜勿喷,需改善之处,请多指教。

你可能感兴趣的:(selenium,爬虫,测试工具)