百度信用公司商标信息爬取

本脚本用于爬取百度信用(xin.baidu.com)上的公司商标信息和商标图片。IP 代理和动态 header 尚未实现,这只是一个测试用的小脚本;可以在此基础上继续修改,稍作改动并选择自己的存储方式后即可直接存储。
希望对大家有帮助。安装好依赖(requests、lxml、PyExecJS、oss2)后,直接复制粘贴即可使用。

import re
import time
import requests
from lxml import etree
import json
import execjs
import uuid,oss2
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Suppress urllib3's InsecureRequestWarning, which would otherwise be emitted
# by the requests in this file that are sent with verify=False.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Fetch the search-result list for each company name.
def get_company_list(company_names=None):
    """Search xin.baidu.com for companies and scrape each hit's trademarks.

    Args:
        company_names: A company name or an iterable of company names to
            search for. Defaults to the single sample company that the script
            originally hard-coded.

    Every detail-page link found in a search result is handed to
    ``get_markinfo`` together with the company name that produced it.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import list.
    from urllib.parse import urljoin

    if company_names is None:
        company_names = ['徐州灵匠信息科技有限公司']
    elif isinstance(company_names, str):
        # A bare string would otherwise be iterated character by character.
        company_names = [company_names]

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36"}

    for company_name in company_names:
        # Let requests build the query string so that names containing
        # characters such as '&' or '#' are encoded correctly.
        resp = requests.get(url="https://xin.baidu.com/s",
                            params={"q": company_name, "t": 0},
                            headers=headers,
                            verify=False,
                            timeout=10)
        html = etree.HTML(resp.content.decode())
        urls = html.xpath('//*[@class="zx-list-wrap"]/div//a[@class="zx-list-item-url"]/@href')
        for url in urls:
            # urljoin copes with both site-relative and absolute hrefs.
            get_markinfo(company_name, urljoin("https://xin.baidu.com", url))

# Extract the request parameters (pid, tk) needed by the trademark endpoint.
def get_markinfo(company_name, url):
    """Scrape a company detail page and request its trademark data.

    The detail page is downloaded and three things are pulled out of it: the
    ``pid`` value, the text of the ``baiducode`` element, and a raw token
    stored in an HTML attribute whose name is read from the page's own
    JavaScript. The page's ``mix`` JavaScript function is then executed via
    execjs on the raw token and the ``baiducode`` text to obtain the final
    ``tk``, and ``get_company_mark_info`` is called with the resulting
    ``markAjax`` URL.

    Args:
        company_name: Forwarded unchanged to ``get_company_mark_info``.
        url: Absolute URL of the company detail page.

    Raises:
        IndexError: If one of the expected fragments is not found in the page.
        ValueError: If the extracted ``pid`` is not a valid JSON literal.
    """
    resp = requests.get(url, headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36"},
                        verify=False,
                        timeout=10)
    text = resp.text
    html2 = etree.HTML(resp.content.decode())
    d = html2.xpath('//*[@id="baiducode"]/text()')[0]

    # The pid is a JSON literal embedded in the page. Parse it as data;
    # eval() on remote content would execute arbitrary Python.
    pid = json.loads(re.findall(r'"pid":(.*?)\,.*?"defTags"', text, re.S)[0])

    # Only the attribute name is needed; the element id is ignored.
    _, att = re.findall(r"document\.getElementById\('(.*?)'\)\.getAttribute\('(.*?)'\)", text)[0]

    # Rebuild the source of the page's JavaScript `mix` function from the
    # text between "mix(" and the next "(function".
    tk_func = "function mix(" + re.findall(r'mix\((.*?)\(function', text, re.S)[0]

    # Escape the attribute name so regex metacharacters in it match literally.
    tk = re.findall(re.escape(att) + r'="(.*?)"\>', text)[0]
    tk = execjs.compile(tk_func).call('mix', tk, d)

    # Millisecond timestamp sent as the "_" query parameter.
    time1 = int(time.time() * 1000)
    url1 = "https://xin.baidu.com/detail/markAjax?pid={}&tot={}&_={}".format(pid, tk, time1)
    get_company_mark_info(url1, pid, tk, company_name)

# Request the trademark endpoint and print the trademark records.
def _fetch_mark_page(url):
    """Download one page of the trademark endpoint and return its 'data' dict."""
    resp = requests.get(url, headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36"},
                        timeout=10)
    # json.loads resolves \uXXXX escapes itself. Pre-decoding the body with
    # 'unicode_escape' turns \" into a bare quote and mangles non-ASCII bytes,
    # which breaks or corrupts the JSON.
    return json.loads(resp.content.decode('utf-8'))['data']


def _print_mark_records(records):
    """Print every trademark record of one result page."""
    for record in records:
        # Fields used by the original script:
        #   markName      - trademark name
        #   markRegNo     - trademark number
        #   markStyle     - trademark image address
        #   markType      - trademark type
        #   markValidTime - validity period
        #   markStatus    - registration method (per the original comment)
        # To store the image in OSS, pass record['markStyle'] to update_img here.
        print(record)


def get_company_mark_info(url, pid, tk, company_name):
    """Fetch and print all trademark pages for one company.

    The page at ``url`` is fetched first. If the result spans several pages,
    the remaining pages (2..pageCount) are fetched one after another, each
    exactly once. Errors are reported with ``print`` rather than raised, and a
    failure on one page does not stop the following pages from being fetched.

    Args:
        url: ``markAjax`` URL of the page to start from.
        pid: Company id used to build the URLs of the following pages.
        tk: Token used to build the URLs of the following pages.
        company_name: Only used in the progress messages.
    """
    try:
        data = _fetch_mark_page(url)
        page_count = int(data['pageCount'])

        if page_count == 0:
            print('-------', company_name, '暂未有注册的商标数据')
            return
        if page_count == 1:
            print('-------', company_name, '只有一页数据')
            _print_mark_records(data['list'])
            return

        first_page = int(data['page'])
        print('-------', company_name, '有多页数据,当前是第%d页---' % first_page)
        _print_mark_records(data['list'])
    except Exception as e:
        print('get_company_mark_info方法出现错误:', e)
        return

    # Walk the remaining pages iteratively. The previous recursive fan-out
    # restarted the 2..pageCount loop from every non-last page and therefore
    # never terminated once there were three or more pages.
    for page_no in range(2, page_count + 1):
        if page_no == first_page:
            continue  # already printed above
        print(page_no)
        page_url = "https://xin.baidu.com/detail/markAjax?pid={}&tot={}&_={}&p={}".format(
            pid, tk, int(time.time() * 1000), page_no)
        try:
            page_data = _fetch_mark_page(page_url)
            print('-------', company_name, '有多页数据,当前是第%d页---' % int(page_data['page']))
            _print_mark_records(page_data['list'])
        except Exception as e:
            print('get_company_mark_info方法出现错误:', e)

    print('-------', company_name, '商标数据循环输出完毕')

# Upload a trademark image to OSS (OSS is used to store the images).
def update_img(url, account='账号', key='密码'):
    """Download the image at ``url`` and store it in the 'juhe-app' OSS bucket.

    Args:
        url: Address of the image to download.
        account: OSS access key id. The default is the placeholder from the
            original script ("account") and must be replaced to upload.
        key: OSS access key secret. The default is the placeholder from the
            original script ("password") and must be replaced to upload.

    Returns:
        The public URL of the stored object, or ``None`` if the download or
        the upload failed.
    """
    auth = oss2.Auth(account, key)
    bucket = oss2.Bucket(auth, 'http://oss-cn-hangzhou.aliyuncs.com', 'juhe-app')
    try:
        image = requests.get(url, timeout=10)
        # Do not upload an HTTP error page as if it were the image.
        image.raise_for_status()
        path = 'zb_news/%s.jpg' % uuid.uuid1()  # storage path inside the bucket
        result = bucket.put_object(path, image.content)
        if result.status == 200:
            return 'http://juhe-app.oss-cn-hangzhou.aliyuncs.com/' + path
        return None
    except Exception as e:
        # Best-effort upload: report the failure and signal it with None.
        # A bare except would also have swallowed KeyboardInterrupt/SystemExit.
        print('update_img方法出现错误:', e)
        return None


# Start the program.
def run():
    """Run the scraper by calling ``get_company_list``."""
    get_company_list()

# Start scraping only when this file is executed as a script, not on import.
if __name__=='__main__':
    run()

你可能感兴趣的:(python学习)