天眼查新方式信息爬取

#本文通过新的方式爬取突破

  1. 由于公司列表页信息很少反爬,除了公司名称其他信息都没有,所以可以取巧提取注册时间注册资本信息
  2. 访问过多过快也会封,测试可以通过随机UA突破
  3. 另外公司具体信息详情页可能不同公司展示xpath位置不一样,所以用re
  4. 另外经营范围下载回来出现 &#x 开头的乱码;&#xxxx; 格式其实是 HTML 数字字符引用(即 unicode 码点的十进制/十六进制转义),可用标准库 html.unescape 解析还原
    在这里插入图片描述
    天眼查新方式信息爬取_第1张图片
    在这里插入图片描述#最后实现结果:基本3秒能查出5家相关企业具体公司信息
    **更新下,天眼查有更新,现在可以通过企业名和工商号或者纳税号进行查询

# Standard library
import html  # html.unescape — replacement for HTMLParser.unescape (removed in Python 3.9)
import random
import re
# import HTMLParser            # Python 2 module name; superseded by html.parser below
from html.parser import HTMLParser

# Third-party
import requests
from lxml import etree

# One proxy endpoint per URL scheme, as requests expects.
# BUG FIX: the original dict literal repeated the "http" key, so Python kept
# only the second entry — and that entry is an https endpoint, which ended up
# (wrongly) registered under the "http" scheme.
proxy = {
    "http": 'http://125.70.13.77:8080',
    "https": 'https://183.6.129.212:41949'
}
# Pool of desktop-browser User-Agent strings.  One is picked at random per
# run so repeated requests look less uniform to Tianyancha's anti-bot checks
# (see note 2 at the top of this post).
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
# The User-Agent is chosen once per run, not per request.
dd=random.choice(USER_AGENTS)
print(dd)
# Shared request headers; the Referer makes the visit look like a click-through
# from Baidu search results.
headers={
    "Referer": "https://www.baidu.com/",

    "User-Agent": "%s"%dd
}
def down_load(url):
    """Fetch *url* through the configured proxy and return its body as text.

    Uses the module-level ``headers`` and ``proxy``.  A timeout is set so a
    dead proxy cannot hang the scraper indefinitely (the original had none).
    """
    cc = requests.get(url=url, headers=headers, proxies=proxy, timeout=10)
    # Force UTF-8: requests sometimes mis-detects the charset of this site.
    cc.encoding = "utf-8"
    return cc.text

# ---------------------------------------------------------------------------
# Interactive mode: search once, print details of every matching company.
# ---------------------------------------------------------------------------
i = input("请输入企业相关信息(企业名、工商号或纳税人号):")
first_url = "https://m.tianyancha.com/search?key=%s" % i
# first_url="http://www.baidu.com"
a = down_load(first_url)
a = etree.HTML(a)
# The mobile search page exposes, per hit: the detail-page link, the legal
# representative, the registered capital and the registration date.
detail_url = a.xpath('//div[contains(@class,"col-xs-10")]/a/@href')
boss = a.xpath('//div[@class="search_row_new_mobil"]//a/text()')
the_registered_capital = a.xpath('//div[@class="search_row_new_mobil"]/div/div[2]/span/text()')
the_registered_time = a.xpath('//div[@class="search_row_new_mobil"]/div/div[3]/span/text()')


def _field(label, page):
    """Return the text after ``label:`` on the detail page, or "" if absent.

    NOTE(review): the closing delimiter of the original regexes (an HTML tag)
    was lost when this post was published; ``<`` approximates "capture up to
    the next tag" — confirm against the live page markup.
    """
    found = re.findall(label + ":(.*?)<", page, re.S)
    return found[0].strip() if found else ""


gs = []   # one row (list of labelled strings) per company
gs1 = {}  # the same rows keyed by 1-based result index
for ii in range(len(boss)):
    aa = down_load(detail_url[ii])
    bb = etree.HTML(aa)
    # The detail-page xpath varies between companies (note 3 above), hence
    # regex extraction for most fields.
    company_nodes = bb.xpath('//div[@class="over-hide"]/div/text()')
    company = company_nodes[0] if company_nodes else ""
    industry = _field("行业", aa)  # extracted but not included in the output row
    the_enterprise_type = _field("企业类型", aa)
    registration_number = _field("工商注册号", aa)
    organization_code = _field("组织结构代码", aa)
    credit_code = _field("统一信用代码", aa)
    business_period = _field("经营期限", aa)
    registration_authority = _field("登记机关", aa)
    registered_address = _field("注册地址", aa)
    scope_of_business = _field("经营范围", aa)
    # The business scope arrives as &#xxxx; numeric character references.
    # html.unescape decodes them (HTMLParser.unescape was removed in py3.9).
    scope_of_business = html.unescape(scope_of_business)
    new = ["公司名:" + company, "法人:" + boss[ii],
           "注册时间:" + the_registered_time[ii],
           "注册资本:" + the_registered_capital[ii],
           "企业类型:" + the_enterprise_type,
           "工商注册号:" + registration_number,
           "组织结构代码:" + organization_code,
           "统一信用代码:" + credit_code,
           "经营年限:" + business_period,
           "登记机关:" + registration_authority,
           "注册地址:" + registered_address,
           "经营范围:" + scope_of_business]
    # print(new)
    gs1[ii + 1] = list(new)
    gs.append(new)
print(gs)
# print(gs1)

也可以参考看下上篇:通过scrapy结合selenium抓取天眼查



后续通过tkinter制作的可视化界面


天眼查新方式信息爬取_第2张图片


import tkinter as tk
import requests
from lxml import etree
import random
import  re
from html.parser import HTMLParser

# --- tkinter UI scaffolding ------------------------------------------------
window=tk.Tk()
window.title("公司信息查询器")
# Geometry string: width x height + x-offset + y-offset.
window.geometry("790x550+500+200")

# Static labels for the query row and the result area.
l=tk.Label(window,text="企业名、工商号或纳税人号:",font="微软雅黑 11",height=2)
l.grid()
l1=tk.Label(window,text="这就是为你查询的结果:",font="微软雅黑 11",height=2)
l1.grid()

# NOTE(review): `var` is never used — e1 below ended up a Text widget, not
# the Entry bound to this StringVar (see the commented-out line).
var=tk.StringVar()

# Single-line entry for the query; multi-line text box for the results.
e=tk.Entry(window,width=62)
e.grid(row=0,column=1)
e1=tk.Text(window,height=30)
# e1=tk.Entry(window,textvariable=var,width=60,)
e1.grid(row=2,column=1)



def click():
    """Query Tianyancha for the text in the entry box and render the results.

    Reads the query from the module-level Entry ``e`` and appends one list of
    labelled fields per matching company into the Text widget ``e1``.
    """
    content = e.get()
    # BUG FIX: requests allows one proxy per scheme; the original dict
    # repeated the "http" key, silently keeping only the (https) second value.
    proxy = {
        "http": 'http://125.70.13.77:8080',
        "https": 'https://183.6.129.212:41949',
    }
    USER_AGENTS = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    ]
    dd = random.choice(USER_AGENTS)

    headers = {
        "Referer": "https://www.baidu.com/",
        "User-Agent": "%s" % dd
    }

    def down_load(url):
        """Fetch *url* via the proxy and return UTF-8 text (10 s timeout)."""
        cc = requests.get(url=url, headers=headers, proxies=proxy, timeout=10)
        cc.encoding = "utf-8"
        return cc.text

    def field(label, page):
        """Regex-extract the text after ``label:``, or "" if not present.

        NOTE(review): the closing HTML-tag delimiter of the original regexes
        was lost in publication; ``<`` approximates "up to the next tag".
        """
        found = re.findall(label + ":(.*?)<", page, re.S)
        return found[0].strip() if found else ""

    first_url = "https://m.tianyancha.com/search?key=%s" % content
    a = etree.HTML(down_load(first_url))
    detail_url = a.xpath('//div[contains(@class,"col-xs-10")]/a/@href')
    boss = a.xpath('//div[@class="search_row_new_mobil"]//a/text()')
    the_registered_capital = a.xpath('//div[@class="search_row_new_mobil"]/div/div[2]/span/text()')
    the_registered_time = a.xpath('//div[@class="search_row_new_mobil"]/div/div[3]/span/text()')

    for ii in range(len(boss)):
        aa = down_load(detail_url[ii])
        bb = etree.HTML(aa)
        company_nodes = bb.xpath('//div[@class="over-hide"]/div/text()')
        company = company_nodes[0] if company_nodes else ""
        the_enterprise_type = field("企业类型", aa)
        registration_number = field("工商注册号", aa)
        organization_code = field("组织结构代码", aa)
        credit_code = field("统一信用代码", aa)
        business_period = field("经营期限", aa)
        registration_authority = field("登记机关", aa)
        registered_address = field("注册地址", aa)
        # &#xxxx; numeric character references -> readable text.
        # (html.unescape replaces HTMLParser.unescape, removed in py3.9.)
        scope_of_business = html.unescape(field("经营范围", aa))
        new = [ii + 1,
               "公司名:" + company, "法人:" + boss[ii],
               "注册时间:" + the_registered_time[ii],
               "注册资本:" + the_registered_capital[ii],
               "企业类型:" + the_enterprise_type,
               "工商注册号:" + registration_number,
               "组织结构代码:" + organization_code,
               "统一信用代码:" + credit_code,
               "经营年限:" + business_period,
               "登记机关:" + registration_authority,
               "注册地址:" + registered_address,
               "经营范围:" + scope_of_business]
        e1.insert("end", new)
        e1.insert("end", "\n\n")  # blank line between companies


# Query / quit buttons and the main loop.
b = tk.Button(window, text="点击查询", command=click, width=10, font="微软雅黑 12")
b.grid(row=6, column=0)
b1 = tk.Button(window, text="退出", command=window.quit, width=10, font="微软雅黑 12")
b1.grid(row=6, column=1)
window.mainloop()

后续有目标的爬取1000来家公司信息,进行了一定的改造

1. pandas 转化成 list —

# pandas 转化成 list
content=pd.read_csv(r"C:\Users\Administrator\Desktop\5005.csv",encoding="utf-8").values.tolist()

2.大批量数据程序的异常判断问题 try,不然程序中途爬的全部丢失重头来过,主要两个方面去重和断点续爬

天眼查新方式信息爬取_第3张图片

import requests
from lxml import etree
import random
import re
import csv
import pandas as pd
# from multiprocess import Pool
# import HTMLParser
from html.parser import HTMLParser
from fake_useragent import UserAgent

# ua=UserAgent()

# requests' ``proxies`` mapping allows only ONE proxy per URL scheme, so the
# original dict literal with repeated "http"/"https" keys silently kept just
# the LAST value of each.  The unused candidates are preserved here as a pool
# for future rotation (e.g. random.choice per request).
PROXY_POOL = {
    "http": [
        'http://47.105.151.97:80',
        'http://221.193.222.7:8060',
        'http://115.223.222.206:9000',
        'http://106.12.3.84:80',
        'http://49.81.125.62:9000',
        'http://119.29.26.242:8080',
        'http://118.24.98.96:9999',
        'http://183.129.207.84:52264',
        'http://121.10.71.82:8118',
        'http://113.16.160.101:8118',
    ],
    "https": [
        'https://114.116.10.21:3128',
        'https://113.200.56.13:8010',
        'https://14.20.235.220:9797',
        'https://119.31.210.170:7777',
    ],
}
# Keep the effective behavior of the original literal: the last entry of
# each scheme wins.
proxy = {
    "http": PROXY_POOL["http"][-1],
    "https": PROXY_POOL["https"][-1],
}
# Pool of desktop-browser User-Agent strings; one is chosen at random per run
# to vary the request fingerprint (same pool as the scripts above).
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
# Chosen once per run; the fake_useragent alternative is left commented out.
dd=random.choice(USER_AGENTS)

# Shared request headers for every download in this batch run.
headers={
    "Referer": "https://www.baidu.com/",

    "User-Agent": "%s"%dd,
    # "User-Agent":ua.random
}
def down_load(url):
    """Fetch *url* through the configured proxy and return its body as text.

    Uses the module-level ``headers`` and ``proxy``.  A timeout is set so a
    dead proxy cannot stall the 1000-company batch run (the original had
    none, which is fatal for long unattended runs).
    """
    cc = requests.get(url=url, headers=headers, proxies=proxy, verify=True,
                      timeout=10)
    # Force UTF-8: requests sometimes mis-detects the charset of this site.
    cc.encoding = "utf-8"
    return cc.text
# pandas -> list: every CSV row becomes a list; column 0 holds the company
# name / registration number to search for.
content = pd.read_csv(r"C:\Users\Administrator\Desktop\5005.csv", encoding="utf-8").values.tolist()

CSV_HEADER = ["编号", "公司名", "法人", "注册时间", "注册资本", "企业类型", "工商注册号",
              "组织结构代码", "统一信用代码", "经营年限", "登记机关", "注册地址", "经营范围"]


def _save(rows, path):
    """Rewrite *path* with a header plus all collected *rows*.

    Called after every company so an interrupted run loses nothing
    (the "断点续爬" snapshot mentioned in the post).
    """
    with open(path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, dialect="excel")
        writer.writerow(CSV_HEADER)
        writer.writerows(rows)


def _field(label, page):
    """Regex-extract the text after ``label:`` from the detail page, or "".

    NOTE(review): the closing delimiter of the original regexes (an HTML tag)
    was lost when this post was published; ``<`` approximates "capture up to
    the next tag" — confirm against the live page markup.
    """
    found = re.findall(label + ":(.*?)<", page, re.S)
    return found[0].strip() if found else ""


gs = []

# 1101 is the resume point of a previously interrupted run.
# BUG FIX: the original used range(1101, len(content) + 1), which indexes one
# past the end of `content` on the final iteration (IndexError).
for m in range(1101, len(content)):
    i = content[m][0]
    first_url = "https://m.tianyancha.com/search?key=%s" % i
    a = etree.HTML(down_load(first_url))
    links = a.xpath('//div[contains(@class,"col-xs-10")]/a/@href')
    status = a.xpath('/html/body/div[3]/div[3]/div[1]/div[4]/div/div/div[4]/span/text()')
    capital = a.xpath('//div[@class="search_row_new_mobil"]/div/div[2]/span/text()')
    reg_time = a.xpath('//div[@class="search_row_new_mobil"]/div/div[3]/span/text()')
    boss_names = a.xpath('//div[@class="search_row_new_mobil"]//a/text()')

    # Skip listings with no usable data.  BUG FIX: the original compared the
    # xpath result *list* to a string ("未公开"/"仍注册"), which is always
    # False, and indexed [0] inside the condition, which raises on an empty
    # result; compare the first text node instead and guard emptiness.
    if (not links or not capital or not reg_time or not boss_names
            or capital[0] == "-" or reg_time[0] == "-"
            or (status and status[0] in ("未公开", "仍注册"))):
        continue

    detail_url = links[0]
    print(detail_url)
    aa = down_load(detail_url)
    bb = etree.HTML(aa)
    try:
        company = bb.xpath('//div[@class="over-hide"]/div/text()')[0]
        the_enterprise_type = _field("企业类型", aa)
        registration_number = _field("工商注册号", aa)
        organization_code = _field("组织结构代码", aa)
        credit_code = _field("统一信用代码", aa)
        business_period = _field("经营期限", aa)
        registration_authority = _field("登记机关", aa)
        registered_address = _field("注册地址", aa)
        # &#xxxx; numeric character references -> readable text.
        # (html.unescape replaces HTMLParser.unescape, removed in py3.9.)
        scope_of_business = html.unescape(_field("经营范围", aa))
        row = [str(m + 1), company, boss_names[0], reg_time[0], capital[0],
               the_enterprise_type, registration_number, organization_code,
               credit_code, business_period, registration_authority,
               registered_address, scope_of_business]
        gs.append(row)
        print(m + 1)
    except IndexError:
        # Detail page did not match the expected layout; skip this company
        # but keep the batch running.
        pass
    # BUG FIX: the original forced this snapshot via `raise exception` — a
    # NameError (lowercase) swallowed by a bare `except`.  Write it
    # unconditionally instead.
    _save(gs, "5006663.csv")

# Final complete dump once the batch finishes.
_save(gs, "500666666666.csv")
# print(gs)

你可能感兴趣的:(爬虫)