import csv
import random
import time

import pymysql
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
# Open the MySQL connection used to store scraped rows.  Without it the script
# cannot do anything useful, so a connection failure stops the program here
# instead of continuing with `db` / `cur` undefined and launching a browser.
try:
    db = pymysql.connect(
        host="localhost",
        user="root",
        password="",
        database="zhuanlidata",  # `db=` is a deprecated alias of `database=`
        port=3306,
        # Explicit so that non-ASCII text is stored intact regardless of the
        # driver's default charset (which has differed between versions).
        charset="utf8mb4",
    )
    cur = db.cursor()
except pymysql.MySQLError as exc:
    print("数据库连接出错", exc)
    raise SystemExit(1)

# Single shared Chrome session, started only once the database is available.
browser = webdriver.Chrome()
browser.minimize_window()
# Detail-page URL; `{number}` is the numeric part of the file name.
_DETAIL_URL = (
    "https://kns8.cnki.net/kcms/detail/detail.aspx"
    "?dbcode=SCPD&dbname=SCPD2020&filename=CN20{number}U"
)

# Common XPath prefix of the container that holds every scraped field.
_FIELDS_ROOT = "/html/body/div[2]/div/div[1]/div[1]/div/div[2]"

# Parameterized insert; column order must match the list built by
# `_scrape_record`.
_INSERT_SQL = (
    "insert into cn_u("
    "专利名称,申请号,申请日,公开号,公开日,申请人,地址,发明人,"
    "国省代码,摘要,主权项,页数,主分类号,专利分类号,专利代理机构,代理人"
    ") values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);"
)


def _text_at(relative_xpath):
    """Return the text of the element at ``_FIELDS_ROOT + relative_xpath``.

    Raises:
        NoSuchElementException: if no element matches.
    """
    return browser.find_element(By.XPATH, _FIELDS_ROOT + relative_xpath).text


def _scrape_record():
    """Read all fields from the page currently loaded in ``browser``.

    Returns:
        A list of 16 strings ordered like the columns of ``_INSERT_SQL``.

    Raises:
        NoSuchElementException: if a mandatory field is missing.
    """
    title = _text_at("/div[1]/h1")
    application_no = _text_at("/div[3]/div[1]/p")
    application_date = _text_at("/div[3]/div[2]/p")
    publication_no = _text_at("/div[4]/div[1]/p")
    publication_date = _text_at("/div[4]/div[2]/p")

    # The applicant is read from a link when there is one, otherwise from the
    # enclosing paragraph.
    try:
        applicant = _text_at("/div[5]/p/a")
    except NoSuchElementException:
        applicant = _text_at("/div[5]/p")

    address = _text_at("/div[6]/p")

    # `find_elements` returns an empty list rather than raising, so the
    # plain-paragraph fallback has to be selected by an emptiness check.
    inventor_links = browser.find_elements(
        By.XPATH, _FIELDS_ROOT + "/div[7]/p/a"
    )
    if inventor_links:
        inventor = "".join(link.text for link in inventor_links)
    else:
        try:
            inventor = _text_at("/div[7]/p")
        except NoSuchElementException:
            inventor = ""

    # Agency and agent are optional: both default to '' when either is absent.
    try:
        agency = _text_at("/div[11]/div[1]/p")
        agent = _text_at("/div[11]/div[2]/p")
    except NoSuchElementException:
        agency = ""
        agent = ""

    region_code = _text_at("/div[9]/div[2]/p")

    # The abstract / main-claim container is looked up at div[12], then
    # div[13], then div[14]; the first position that has both parts wins.
    for position in (12, 13, 14):
        try:
            abstract = _text_at("/div[%d]/div[2]/div" % position)
            main_claim = _text_at("/div[%d]/div[1]/div" % position)
        except NoSuchElementException:
            if position == 14:
                raise
        else:
            break

    page_count = _text_at("/div[10]/p")
    main_class_no = _text_at("/div[9]/div[1]/p")
    patent_class_no = _text_at("/div[8]/p")

    return [
        title,
        application_no,
        application_date,
        publication_no,
        publication_date,
        applicant,
        address,
        inventor,
        region_code,
        abstract,
        main_claim,
        page_count,
        main_class_no,
        patent_class_no,
        agency,
        agent,
    ]


def start_spider(start=1601683, stop=1601684):
    """Scrape detail pages for ids in ``range(start, stop)`` into table ``cn_u``.

    For each id the detail page is loaded in the module-level ``browser``, its
    fields are extracted and one row is inserted through the module-level
    ``cur`` / ``db``.  The loop stops at the first page that cannot be
    scraped or stored.

    Args:
        start: First numeric id to fetch (inclusive).
        stop: Id at which to stop (exclusive).
    """
    for number in range(start, stop):
        try:
            print(number)
            browser.get(_DETAIL_URL.format(number=number))
            # Random 0-2 s pause between page load and scraping.
            time.sleep(random.randint(0, 2))

            try:
                record = _scrape_record()
                print(record[0])
            except NoSuchElementException:
                print('爬取失败')
                break

            try:
                print(record)
                cur.execute(_INSERT_SQL, record)
                db.commit()
            except pymysql.MySQLError as exc:
                # Discard the failed statement so the connection is left clean.
                db.rollback()
                print("数据导入数据库失败", exc)
                break
        except Exception as exc:
            # Anything else (navigation failure, driver error, ...) ends the
            # run; the exception is printed so the cause is not lost.
            print('这个号码不行', exc)
            break
if __name__ == '__main__':
    # Run the spider, then always release the database handles and shut down
    # the browser, even if the spider raised.
    try:
        start_spider()
    finally:
        try:
            cur.close()
            db.close()
        finally:
            # quit() ends the WebDriver session and its browser process.
            browser.quit()
    print("爬取完成,请到相应数据库查看!")