百度信用(xin.baidu.com)公司商标信息和图片爬取。IP 代理和动态 header 没做,这只是个测试小脚本,可以在这个基础上继续修改,小改动后再选择自己的存储方式直接存储就好。
希望对大家有帮助,直接复制粘贴即可使用。
import re
import time
import requests
from lxml import etree
import json
import execjs
import uuid,oss2
from requests.packages.urllib3.exceptions import InsecureRequestWarning
# Silence urllib3's InsecureRequestWarning, which would otherwise be emitted
# for the requests in this file that are made with verify=False.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
def get_company_list(company_names=None):
    """Search xin.baidu.com for companies and process every result link.

    For each company name the search page is fetched, the detail-page links
    (``a.zx-list-item-url`` inside ``.zx-list-wrap``) are extracted, and each
    link is handed to ``get_markinfo`` together with the company name.

    Args:
        company_names: Iterable of company names to search for. Defaults to
            the single sample company that was hard-coded in the original
            script.
    """
    if company_names is None:
        company_names = ['徐州灵匠信息科技有限公司']

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36"}

    for company_name in company_names:
        # Pass the query through ``params`` so that names containing
        # characters such as '&', '#' or '%' are percent-encoded instead of
        # corrupting the query string.
        # NOTE(review): TLS verification is disabled (verify=False), as in the
        # original script.
        resp = requests.get(
            "https://xin.baidu.com/s",
            params={"q": company_name, "t": 0},
            headers=headers,
            verify=False,
            timeout=10,  # never hang forever on a stalled connection
        )
        html = etree.HTML(resp.content.decode())
        if html is None:
            # lxml could not build a tree (e.g. empty body): nothing to follow.
            continue
        urls = html.xpath('//*[@class="zx-list-wrap"]/div//a[@class="zx-list-item-url"]/@href')
        for url in urls:
            # hrefs are site-relative, so prefix the host.
            get_markinfo(company_name, "https://xin.baidu.com" + url)
def _first_match(pattern, text, what, flags=0):
    """Return the first ``re.findall`` hit of *pattern* in *text*, or raise IndexError naming *what*."""
    matches = re.findall(pattern, text, flags)
    if not matches:
        raise IndexError("get_markinfo: could not find {} in the detail page".format(what))
    return matches[0]


def get_markinfo(company_name, url):
    """Fetch a company detail page and derive the ``pid``/``tk`` request parameters.

    The page is scraped for:
      * the text of the element with id ``baiducode``,
      * the ``pid`` value that precedes ``"defTags"`` in the embedded data,
      * the attribute name used in a
        ``document.getElementById(...).getAttribute(...)`` snippet, and the
        value that attribute carries in the markup,
      * the source of the page's JavaScript ``mix`` function.

    ``mix(<attribute value>, <baiducode text>)`` is then executed through
    execjs to produce the token, a ``markAjax`` URL is built from ``pid``, the
    token and a millisecond timestamp, and the URL is passed to
    ``get_company_mark_info``.

    Args:
        company_name: Forwarded unchanged to ``get_company_mark_info``.
        url: Absolute URL of the company detail page.

    Raises:
        IndexError: If one of the expected markers is missing from the page.
    """
    import ast  # local import: only needed to parse the scraped pid literal

    # NOTE(review): TLS verification is disabled (verify=False), as in the
    # original script.
    resp = requests.get(url, headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36"},
        verify=False,
        timeout=10)  # never hang forever on a stalled connection
    text = resp.text
    html2 = etree.HTML(resp.content.decode())

    codes = html2.xpath('//*[@id="baiducode"]/text()') if html2 is not None else []
    if not codes:
        raise IndexError("get_markinfo: could not find the baiducode element in the detail page")
    d = codes[0]

    # The pid is scraped from a remote page, so it must not be fed to eval();
    # literal_eval accepts the same quoted-string / number literals without
    # executing arbitrary code.
    pid_literal = _first_match(r'"pid":(.*?)\,.*?"defTags"', text, "the pid value", re.S)
    pid = ast.literal_eval(pid_literal.strip())

    # Only the attribute name is needed; the element id is not used.
    _element_id, att = _first_match(
        r"document\.getElementById\('(.*?)'\)\.getAttribute\('(.*?)'\)", text,
        "the getElementById/getAttribute snippet")

    tk_func = "function mix(" + _first_match(
        r'mix\((.*?)\(function', text, "the mix() function source", re.S)

    # Escape the scraped attribute name so it is matched literally.
    tk = _first_match(re.escape(att) + r'="(.*?)"\>', text, "the token attribute value")
    tk = execjs.compile(tk_func).call('mix', tk, d)

    time1 = int(time.time() * 1000)
    url1 = "https://xin.baidu.com/detail/markAjax?pid={}&tot={}&_={}".format(pid, tk, time1)
    get_company_mark_info(url1, pid, tk, company_name)
def _mark_record(da):
    """Pick the trademark fields of interest out of one raw list entry."""
    return {
        'markName': da['markName'],            # trademark name
        'markNo': da['markRegNo'],             # trademark registration number
        'markImg': da['markStyle'],            # trademark image URL
        'markType': da['markType'],            # trademark type
        'markValidTime': da['markValidTime'],  # validity period
        'markStatus': da['markStatus'],        # status (original comment: registration method)
    }


def get_company_mark_info(url, pid, tk, company_name, follow_pages=True):
    """Request one page of a company's trademark data and walk the remaining pages.

    The JSON answer is expected to carry ``data.pageCount``, ``data.page`` and
    ``data.list``. Each list entry is reduced to the fields of interest via
    ``_mark_record``; storing the record is left to the user of this script.

    Args:
        url: Fully built ``markAjax`` URL for the page to fetch.
        pid: Company id, reused to build the URLs of the following pages.
        tk: Request token, reused to build the URLs of the following pages.
        company_name: Only used in the progress messages.
        follow_pages: When true (the default, used for the first page) the
            remaining pages ``2..pageCount`` are fetched as well. The nested
            calls pass ``False`` so that each page is requested exactly once;
            previously every non-last page fanned out again and recursed
            without bound.

    Any exception is caught and reported with a printed message (best effort).
    """
    try:
        resp1 = requests.get(url, headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36"},
            timeout=10)  # never hang forever on a stalled connection
        # json.loads already understands \uXXXX escapes. Decoding the bytes
        # with 'unicode_escape' first would also unescape \" and mangle raw
        # non-ASCII bytes, producing invalid JSON.
        data = json.loads(resp1.content.decode('utf-8'))
        pageCount = data['data']['pageCount']

        if pageCount == 0:
            print('-------', company_name, '暂未有注册的商标数据')
            return

        if pageCount == 1:
            print('-------', company_name, '只有一页数据')
            for da in data['data']['list']:
                record = _mark_record(da)  # hook: persist `record` here
            return

        page = data['data']['page']
        print('-------', company_name, '有多页数据,当前是第%d页---' % page)
        for da in data['data']['list']:
            # The image URL in record['markImg'] can optionally be mirrored to
            # OSS with update_img() before the record is stored.
            record = _mark_record(da)  # hook: persist `record` here
            print(da)

        # Compare by value: `is not` on ints only works by accident for small
        # numbers.
        if follow_pages and page != pageCount:
            for i in range(2, pageCount + 1):
                print(i)
                url1 = "https://xin.baidu.com/detail/markAjax?pid={}&tot={}&_={}&p={}".format(
                    pid, tk, int(time.time() * 1000), i)
                get_company_mark_info(url1, pid, tk, company_name, follow_pages=False)
        elif page == pageCount:
            print('-------', company_name, '商标数据循环输出完毕')
    except Exception as e:
        print('get_company_mark_info方法出现错误:', e)
def update_img(url):
    """Download the image at *url* and upload it to the ``juhe-app`` OSS bucket.

    The object is stored as ``zb_news/<uuid1>.jpg``.

    Args:
        url: Address of the trademark image to mirror.

    Returns:
        The public URL of the uploaded object when the upload answers with
        status 200, otherwise ``None`` (including when the download or the
        upload fails).
    """
    # Placeholders: replace with real OSS credentials before use.
    account = '账号'
    key = '密码'
    auth = oss2.Auth(account, key)
    bucket = oss2.Bucket(auth, 'http://oss-cn-hangzhou.aliyuncs.com', 'juhe-app')
    path = 'zb_news/%s.jpg' % uuid.uuid1()  # storage path
    try:
        image = requests.get(url, timeout=10)  # never hang forever on a stalled connection
        # Do not upload an HTTP error page as if it were the image.
        image.raise_for_status()
        result = bucket.put_object(path, image.content)
    except Exception as e:
        # Best effort: report the failure instead of silently swallowing it
        # (a bare `except:` would also trap KeyboardInterrupt/SystemExit).
        print('update_img方法出现错误:', e)
        return None
    if result.status == 200:
        return 'http://juhe-app.oss-cn-hangzhou.aliyuncs.com/' + path
    return None
def run():
    """Start the crawl by running the company search."""
    get_company_list()
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    run()