Scraping patent data from Google Patents

URL:
https://patents.google.com/

Open the browser DevTools (F12) and find the XHR request that carries the search query:

https://patents.google.com/xhr/query?url=q=(google)&oq=google&exp=&tags=
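
Before building the full script, it is worth probing this endpoint directly with requests to confirm it answers outside the browser. This is a minimal sketch, not part of the original script; whether it returns the search results or blocks the request depends on Google's rate limiting, so the prints are only for inspection:

import requests

# Probe the XHR endpoint found via DevTools, using the same query string the browser sent
url = 'https://patents.google.com/xhr/query?url=q=(google)&oq=google&exp=&tags='
resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
print(resp.status_code)    # 200 means the request went through
print(resp.text[:300])     # peek at the start of the response body (search results)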


# -*- coding: utf-8 -*-
import csv
import io
import os
import random
import sys
import time

import requests
import xlrd

# Force UTF-8 output so company names print correctly regardless of console encoding
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

def read_company():
    """Read the list of company names from the first column of sample.xlsx."""
    file_name = 'sample.xlsx'
    bk = xlrd.open_workbook(file_name)
    try:
        sh = bk.sheet_by_name("Sheet1")
    except xlrd.XLRDError:
        print("Sheet1 not found in sample.xlsx")
        raise
    # ncols = sh.ncols  # number of columns
    # nrows = sh.nrows  # number of rows
    companies = sh.col_values(0)[1:]   # column 0: company name (skip the header row)
    # tickers = sh.col_values(1)[1:]   # column 1: ticker symbol, not used below
    return companies

def start_requests():
    """Single-company test: download the patent search results for one assignee as CSV."""
    base_url = 'https://patents.google.com/xhr/query?'
    company = 'CANCERVAX CORP'
    patent_query = 'url=assignee=' + company + '&oq=' + company + '&exp=&download=true'
    suburl = base_url + patent_query
    print(suburl)
    file_name = 'test.csv'
    r = requests.get(suburl)
    with open(file_name, 'wb') as fo:   # binary mode ('wb'): r.content is the raw response body as bytes
        fo.write(r.content)
      
# start_requests()   # uncomment to run the single-company test first
base_url = 'https://patents.google.com/xhr/query?'
company_list = read_company()
proxies = {"https": "https://127.0.0.1:1080", "http": "http://127.0.0.1:1080"}   # default local proxy; overwritten per request below
# Request headers copied from DevTools; the User-Agent is rotated per request below
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en,en-US;q=0.9,zh-CN;q=0.8,zh;q=0.7',
    'cookie': '_ga=GA1.3.650546010.1557842690; 1P_JAR=2019-06-13-03; NID=185=HFLQWsc9gyTy7jWJiX-sZ242_kqMdEVUKf89m0r0R8jrCT1n2jN8cuSFmh6abb50bDB8u6qYhcF7KXWHgZy4TPj-zkheFl9g6kiLCqFrNEf6G_2hLhWzCfjwkz7EjLB8jrROilpayn5NIIKf0WLZsZCBemnNt88RdO4Tik_zYwg; _gid=GA1.3.814134454.1560407883; _gat=1',
}
user_agent_pool = ["Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0 ",
"Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14",
"Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"]
ip_pool = ['114.230.69.170:9999', '61.135.155.82:443']   # example proxy IPs; replace with live proxies
print(company_list)
os.makedirs('./company_patent', exist_ok=True)                # make sure the output folder exists
for company in company_list:
    patent_query = 'url=assignee=' + company + '&oq=' + company + '&exp=&download=true'
    url = base_url + patent_query
    filename = './company_patent/' + company + '.csv'
    headers['User-Agent'] = random.choice(user_agent_pool)    # rotate the User-Agent
    proxy_ip = 'http://' + random.choice(ip_pool)             # rotate the proxy IP
    proxies = {'http': proxy_ip, 'https': proxy_ip}           # the target URL is https, so set both keys
    r = requests.get(url, headers=headers, proxies=proxies)
    with open(filename, 'wb') as fo:                          # binary mode: r.content is the raw CSV bytes
        fo.write(r.content)
    # Optional sanity check: read the file back and print each row
    # with open(filename, newline='') as f:
    #     for line in csv.reader(f):
    #         print(line)
    time.sleep(15)                                            # throttle: wait 15 seconds between companies
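
One caveat on the Excel step: xlrd 2.x removed support for .xlsx files, so read_company() above needs an older xlrd (1.2.0 or earlier). If pinning xlrd is not an option, a reader based on openpyxl works as a drop-in replacement. A sketch, assuming the same sample.xlsx layout with company names in the first column of Sheet1:

from openpyxl import load_workbook

def read_company_xlsx(file_name='sample.xlsx'):
    """Drop-in alternative to read_company() that does not depend on xlrd."""
    book = load_workbook(file_name, read_only=True)
    sheet = book['Sheet1']
    companies = []
    for row in sheet.iter_rows(min_row=2, max_col=1, values_only=True):  # skip the header row
        if row[0]:
            companies.append(str(row[0]))
    return companies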

  • Import the required libraries (requests, xlrd, csv, time, random, etc.);
  • read_company() reads the company names from the Excel sheet;
  • start_requests() sends a GET request for one company's patents and writes the response to a CSV file;
  • finally, loop over all companies, send a GET request for each one with requests, and write each response to its own CSV file (the files can be read back as shown in the sketch after this list).
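
For reference, the downloaded per-company files can be read back with the csv module. This sketch is not part of the original script, and it assumes the export places one metadata line (the search URL) above the real header row, which is what the Google Patents CSV download looked like at the time, so the first readline() skips it; adjust if the format differs:

import csv

def load_patents(file_name):
    """Read one downloaded CSV and return its rows as dicts keyed by the header row."""
    with open(file_name, newline='', encoding='utf-8') as f:
        f.readline()                    # assumed metadata line ("search URL: ...") above the header
        return list(csv.DictReader(f))  # remaining lines: header row + one row per patent

rows = load_patents('./company_patent/CANCERVAX CORP.csv')
print(len(rows), 'patent records')
if rows:
    print(rows[0])                      # column names depend on the export format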

Anti-scraping countermeasures (combined into one helper in the sketch after this list):

(1) a pool of proxy IPs, picked at random for each request
(2) a pool of User-Agent strings, picked at random for each request
(3) a delay between successive requests
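
A compact way to apply all three is a single fetch helper that picks a fresh User-Agent and proxy for every request and sleeps a randomized delay afterwards. This is a sketch, assuming the same user_agent_pool and ip_pool lists as in the script above; the proxy entries are placeholders and must be live proxies for the https key to take effect:

import random
import time
import requests

def fetch_with_rotation(url, user_agent_pool, ip_pool, delay_range=(10, 20)):
    """GET one URL with a randomly chosen User-Agent and proxy, then sleep a random delay."""
    headers = {'User-Agent': random.choice(user_agent_pool)}
    proxy = 'http://' + random.choice(ip_pool)
    proxies = {'http': proxy, 'https': proxy}      # set both keys, since patents.google.com is https
    r = requests.get(url, headers=headers, proxies=proxies, timeout=30)
    time.sleep(random.uniform(*delay_range))       # randomized delay instead of a fixed sleep(15)
    return r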
