# This post takes a new approach to get past Tianyancha's anti-scraping measures.
import requests
from lxml import etree
import random
import re
import html  # html.unescape() decodes "&#NNN;" entities (HTMLParser.unescape was removed in Python 3.9)
# A dict keeps only one value per key, so duplicate "http" entries silently
# overwrite each other; use exactly one proxy per scheme.
proxy = {
    "http": 'http://125.70.13.77:8080',
    "https": 'https://183.6.129.212:41949',
}
USER_AGENTS = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
dd = random.choice(USER_AGENTS)
print(dd)
headers = {
    "Referer": "https://www.baidu.com/",
    "User-Agent": dd,  # dd is already a string; no "%s" formatting needed
}
def down_load(url):
    cc = requests.get(url=url, headers=headers, proxies=proxy)
    cc.encoding = "utf-8"  # force UTF-8 so the Chinese page text decodes correctly
    return cc.text
i = input("请输入企业相关信息(企业名、工商号或纳税人号):")
first_url = "https://m.tianyancha.com/search?key=%s" % i
a = down_load(first_url)
a = etree.HTML(a)
detail_url = a.xpath('//div[contains(@class,"col-xs-10")]/a/@href')
boss = a.xpath('//div[@class="search_row_new_mobil"]//a/text()')
the_registered_capital = a.xpath('//div[@class="search_row_new_mobil"]/div/div[2]/span/text()')
the_registered_time = a.xpath('//div[@class="search_row_new_mobil"]/div/div[3]/span/text()')
gs = []
gs1 = {}
for ii in range(len(boss)):
    aa = down_load(detail_url[ii])
    bb = etree.HTML(aa)
    company = bb.xpath('//div[@class="over-hide"]/div/text()')[0]
    # Note: the closing delimiter of each pattern below was evidently stripped
    # when this post was published; a lazy "(.*?)" with nothing after it always
    # matches the empty string, so restore the original closing tag before use.
    industry = re.findall("行业:(.*?)", aa, re.S)[0]
    the_enterprise_type = re.findall("企业类型:(.*?)", aa, re.S)[0]
    registration_number = re.findall("工商注册号:(.*?)", aa, re.S)[0]
    organization_code = re.findall("组织结构代码:(.*?)", aa, re.S)[0]
    credit_code = re.findall("统一信用代码:(.*?)", aa, re.S)[0]
    business_period = re.findall("经营期限:(.*?)", aa, re.S)[0]
    # approval_date = aa.xpath('/html/body/div[3]/div[1]/div[7]/div/div[11]/span[2]/text()')[0]
    registration_authority = re.findall("登记机关:(.*?)", aa, re.S)[0]
    registered_address = re.findall("注册地址:(.*?)", aa, re.S)[0]
    scope_of_business = re.findall('(.*?) ', aa, re.S)[0]
    # "&#NNN;" entities encode Unicode code points in decimal; html.unescape()
    # turns them back into readable text
    scope_of_business = html.unescape(scope_of_business)
    new = ["公司名:" + company, "法人:" + boss[ii], "注册时间:" + the_registered_time[ii],
           "注册资本:" + the_registered_capital[ii], "企业类型:" + the_enterprise_type,
           "工商注册号:" + registration_number, "组织结构代码:" + organization_code,
           "统一信用代码:" + credit_code, "经营年限:" + business_period,
           "登记机关:" + registration_authority, "注册地址:" + registered_address,
           "经营范围:" + scope_of_business]
    gs1[ii + 1] = new  # same record, keyed by result number
    gs.append(new)
print(gs)
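The entity-decoding step above is worth a quick standalone illustration: html.unescape() turns decimal "&#NNN;" character references back into the characters they encode. A minimal sketch; the entity string here is made up for demonstration:
import html

# "&#NNN;" is an HTML numeric character reference: NNN is the decimal Unicode
# code point. Tianyancha obfuscates some field text this way.
raw = "&#32463;&#33829;&#33539;&#22260;"  # hypothetical obfuscated snippet
print(html.unescape(raw))  # -> 经营范围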
You can also refer to the previous post: scraping Tianyancha with Scrapy combined with Selenium.
import tkinter as tk
import requests
from lxml import etree
import random
import re
import html  # html.unescape() decodes "&#NNN;" numeric entities
window = tk.Tk()
window.title("公司信息查询器")
window.geometry("790x550+500+200")
l = tk.Label(window, text="企业名、工商号或纳税人号:", font="微软雅黑 11", height=2)
l.grid()
l1 = tk.Label(window, text="这就是为你查询的结果:", font="微软雅黑 11", height=2)
l1.grid()
var = tk.StringVar()
e = tk.Entry(window, width=62)
e.grid(row=0, column=1)
e1 = tk.Text(window, height=30)
e1.grid(row=2, column=1)
def click():
    content = e.get()
    # As above: duplicate dict keys overwrite each other, so keep exactly one
    # proxy per scheme.
    proxy = {
        "http": 'http://125.70.13.77:8080',
        "https": 'https://183.6.129.212:41949',
    }
    USER_AGENTS = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    ]
    dd = random.choice(USER_AGENTS)
    headers = {
        "Referer": "https://www.baidu.com/",
        "User-Agent": dd,
    }

    def down_load(url):
        cc = requests.get(url=url, headers=headers, proxies=proxy)
        cc.encoding = "utf-8"
        return cc.text
    first_url = "https://m.tianyancha.com/search?key=%s" % content
    a = down_load(first_url)
    a = etree.HTML(a)
    detail_url = a.xpath('//div[contains(@class,"col-xs-10")]/a/@href')
    boss = a.xpath('//div[@class="search_row_new_mobil"]//a/text()')
    the_registered_capital = a.xpath('//div[@class="search_row_new_mobil"]/div/div[2]/span/text()')
    the_registered_time = a.xpath('//div[@class="search_row_new_mobil"]/div/div[3]/span/text()')
    gs = []
    gs1 = {}
    for ii in range(len(boss)):
        aa = down_load(detail_url[ii])
        bb = etree.HTML(aa)
        company = bb.xpath('//div[@class="over-hide"]/div/text()')[0]
        # As in the first script, the closing delimiters of these patterns were
        # stripped in publishing and must be restored before running.
        industry = re.findall("行业:(.*?)", aa, re.S)[0]
        the_enterprise_type = re.findall("企业类型:(.*?)", aa, re.S)[0]
        registration_number = re.findall("工商注册号:(.*?)", aa, re.S)[0]
        organization_code = re.findall("组织结构代码:(.*?)", aa, re.S)[0]
        credit_code = re.findall("统一信用代码:(.*?)", aa, re.S)[0]
        business_period = re.findall("经营期限:(.*?)", aa, re.S)[0]
        registration_authority = re.findall("登记机关:(.*?)", aa, re.S)[0]
        registered_address = re.findall("注册地址:(.*?)", aa, re.S)[0]
        scope_of_business = re.findall('(.*?) ', aa, re.S)[0]
        scope_of_business = html.unescape(scope_of_business)  # decode "&#NNN;" entities
        new = [ii + 1, "公司名:" + company, "法人:" + boss[ii], "注册时间:" + the_registered_time[ii],
               "注册资本:" + the_registered_capital[ii], "企业类型:" + the_enterprise_type, "工商注册号:" + registration_number,
               "组织结构代码:" + organization_code, "统一信用代码:" + credit_code, "经营年限:" + business_period,
               "登记机关:" + registration_authority, "注册地址:" + registered_address, "经营范围:" + scope_of_business]
        gs1[ii + 1] = new[1:]  # same record without the leading index
        e1.insert("end", new)
        e1.insert("end", "\n\n")  # blank line between results
b = tk.Button(window, text="点击查询", command=click, width=10, font="微软雅黑 12")
b.grid(row=6, column=0)
b1 = tk.Button(window, text="退出", command=window.quit, width=10, font="微软雅黑 12")
b1.grid(row=6, column=1)
window.mainloop()
Later I scraped a target list of about 1,000 companies, which required a couple of changes:
1. Convert the pandas DataFrame to a list:
# pandas DataFrame to list
content = pd.read_csv(r"C:\Users\Administrator\Desktop\5005.csv", encoding="utf-8").values.tolist()
2. Large batches need exception handling with try/except; otherwise a mid-run crash throws away everything scraped so far and you start over. The two key concerns are de-duplication and resuming from a breakpoint, sketched below.
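A minimal sketch of the dedup-plus-resume idea, under the assumption that the checkpoint file is the 5006663.csv written by the script below and that its second column is the company name; the helper names here are mine, not from the original code:
import csv
import os

CHECKPOINT = "5006663.csv"  # the output file the script below writes

def load_done(path=CHECKPOINT):
    """Return the set of company names already saved to the checkpoint CSV."""
    if not os.path.exists(path):
        return set()
    with open(path, encoding="utf-8", newline="") as f:
        rows = list(csv.reader(f))
    return {row[1] for row in rows[1:] if len(row) > 1}  # skip the header row

done = load_done()
for row in content:          # .values.tolist() gives one list per input CSV row
    name = row[0]            # column 0 of the input file is the company name
    if name in done:         # de-duplication doubles as breakpoint resume:
        continue             # anything already on disk is never re-fetched
    try:
        ...                  # fetch and parse the record as in the script below
        done.add(name)
    except Exception as exc:
        print(name, "failed:", exc)  # keep going; earlier work is already on disk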
import requests
from lxml import etree
import random
import re
import csv
import pandas as pd
import html  # html.unescape() decodes "&#NNN;" numeric entities
from fake_useragent import UserAgent
# ua=UserAgent()
# A dict keeps only the last value written to each key, so stacking a dozen
# proxies under "http"/"https" silently discards all but one. Keep a pool and
# build the proxies dict per request instead (pick_proxy is an editorial fix,
# not part of the original script).
PROXY_POOL = [
    'https://114.116.10.21:3128',
    'http://47.105.151.97:80',
    'https://113.200.56.13:8010',
    'https://14.20.235.220:9797',
    'https://119.31.210.170:7777',
    'http://221.193.222.7:8060',
    'http://115.223.222.206:9000',
    'http://106.12.3.84:80',
    'http://49.81.125.62:9000',
    'http://119.29.26.242:8080',
    'http://118.24.98.96:9999',
    'http://183.129.207.84:52264',
    'http://121.10.71.82:8118',
    'http://113.16.160.101:8118',
]

def pick_proxy():
    p = random.choice(PROXY_POOL)
    return {p.split(":", 1)[0]: p}  # e.g. {"http": "http://..."}
USER_AGENTS = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
dd = random.choice(USER_AGENTS)
headers = {
    "Referer": "https://www.baidu.com/",
    "User-Agent": dd,
    # "User-Agent": ua.random,  # fake_useragent alternative
}
def down_load(url):
    cc = requests.get(url=url, headers=headers, proxies=pick_proxy(), verify=True)
    cc.encoding = "utf-8"
    return cc.text
# pandas DataFrame to list
content = pd.read_csv(r"C:\Users\Administrator\Desktop\5005.csv", encoding="utf-8").values.tolist()
gs = []
for m in range(1101, len(content)):  # start at 1101 to resume a crashed run; len(content)+1 would run past the end
    i = content[m][0]
    first_url = "https://m.tianyancha.com/search?key=%s" % i
    a = down_load(first_url)
    a = etree.HTML(a)
    detail_url = a.xpath('//div[contains(@class,"col-xs-10")]/a/@href')[0]
    # xpath() returns a list, so comparing its result directly to a string (as
    # the original did) is always False; take the first hit before comparing.
    status = (a.xpath('/html/body/div[3]/div[3]/div[1]/div[4]/div/div/div[4]/span/text()') or [""])[0]
    capital = (a.xpath('//div[@class="search_row_new_mobil"]/div/div[2]/span/text()') or ["-"])[0]
    reg_time = (a.xpath('//div[@class="search_row_new_mobil"]/div/div[3]/span/text()') or ["-"])[0]
    if status in ("未公开", "仍注册") or capital == "-" or reg_time == "-":
        pass  # record is hidden or incomplete; skip it
    else:
        the_registered_capital = capital
        the_registered_time = reg_time
        boss = a.xpath('//div[@class="search_row_new_mobil"]//a/text()')[0]
        print(detail_url)
        aa = down_load(detail_url)
        bb = etree.HTML(aa)
        try:
            company = bb.xpath('//div[@class="over-hide"]/div/text()')[0]
            # As above, the closing delimiters of these patterns were stripped
            # in publishing and must be restored before running.
            the_enterprise_type = re.findall("企业类型:(.*?)", aa, re.S)[0]
            registration_number = re.findall("工商注册号:(.*?)", aa, re.S)[0]
            organization_code = re.findall("组织结构代码:(.*?)", aa, re.S)[0]
            credit_code = re.findall("统一信用代码:(.*?)", aa, re.S)[0]
            business_period = re.findall("经营期限:(.*?)", aa, re.S)[0]
            registration_authority = re.findall("登记机关:(.*?)", aa, re.S)[0]
            registered_address = re.findall("注册地址:(.*?)", aa, re.S)[0]
            scope_of_business = re.findall('(.*?) ', aa, re.S)[0]
            scope_of_business = html.unescape(scope_of_business)  # decode "&#NNN;" entities
            new = [str(m + 1), company, boss, the_registered_time, the_registered_capital,
                   the_enterprise_type, registration_number, organization_code,
                   credit_code, business_period, registration_authority,
                   registered_address, scope_of_business]
            print(m + 1)
            gs.append(new)
            raise Exception("checkpoint")  # deliberately jump to except and flush everything to disk
        except Exception:
            # Rewrite the checkpoint file with all rows collected so far, so a
            # crash never loses more than the current record.
            with open("5006663.csv", "w", encoding="utf-8", newline="") as f:
                k = csv.writer(f, dialect="excel")
                k.writerow(["编号", "公司名", "法人", "注册时间", "注册资本", "企业类型", "工商注册号",
                            "组织结构代码", "统一信用代码", "经营年限", "登记机关", "注册地址", "经营范围"])
                for row in gs:
                    k.writerow(row)
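Rewriting the whole checkpoint file for every record works, but it redoes quadratic work over a long run. A lighter variant, offered as a suggestion rather than part of the original post, appends each row as it is scraped and writes the header only when the file is first created:
import csv
import os

OUT = "5006663.csv"
HEADER = ["编号", "公司名", "法人", "注册时间", "注册资本", "企业类型", "工商注册号",
          "组织结构代码", "统一信用代码", "经营年限", "登记机关", "注册地址", "经营范围"]

def append_row(row, path=OUT):
    """Append one scraped record; emit the header only when creating the file."""
    new_file = not os.path.exists(path)
    with open(path, "a", encoding="utf-8", newline="") as f:
        k = csv.writer(f, dialect="excel")
        if new_file:
            k.writerow(HEADER)
        k.writerow(row)

Calling append_row(new) in place of the raise/except checkpoint keeps the same crash-safety with constant work per record.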