# 爬取企查查搜索结果的代码。没有 VIP 的话,企查查最多只能爬取 100 条数据。
# 注意:请求过于频繁会导致无法继续请求,此时请更换 Cookie。
# -*- coding: utf-8 -*-
import requests
import lxml
import sys
from bs4 import BeautifulSoup
import xlwt
import time
import urllib
import random
from pyquery import PyQuery as pq
def get_user_agent():
    """Return one User-Agent header value chosen at random.

    The candidates are a fixed set of desktop Chrome User-Agent strings;
    a new pick is made with ``random.choice`` on every call.
    """
    candidates = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    )
    return random.choice(candidates)
def get_ip():
    """Return one IPv4 address string chosen at random from a fixed pool.

    In this file the result is sent as the ``X-Forwarded-For`` request
    header by ``craw``. Most pool entries end in ``.0``.

    Returns:
        str: a dotted-quad IPv4 address taken verbatim from the pool.
    """
    # An immutable tuple with a descriptive name; the previous local was
    # called `list`, which shadowed the builtin inside this function.
    ip_pool = (
        "112.1.22.111", "200.34.98.11", "99.200.23.10", "122.234.143.15", "122.234.143.17", "122.234.143.63",
        "1.0.1.0", "1.0.2.0", "1.0.8.0", "1.0.32.0", "1.1.0.0", "1.1.2.0",
        "1.1.4.0", "1.1.8.0", "1.1.16.0", "1.1.32.0", "1.2.0.0", "1.2.2.0",
        "1.2.5.0", "1.2.6.0", "1.2.8.0", "1.2.16.0", "1.2.32.0", "1.2.64.0",
        "1.3.0.0", "1.4.1.0", "1.4.2.0", "1.4.4.0", "1.4.8.0", "1.4.16.0",
        "1.4.32.0", "1.4.64.0", "1.8.0.0", "1.8.64.0", "1.8.96.0", "1.8.100.0",
        "1.8.112.0", "1.8.128.0", "1.8.144.0", "1.8.148.0", "1.8.154.0", "1.8.156.0",
        "1.8.160.0", "1.8.192.0", "1.8.224.0", "1.8.244.0", "1.8.248.0", "1.10.0.0",
        "1.10.8.0", "1.10.11.0", "1.10.12.0", "1.10.16.0", "1.10.32.0", "1.10.64.0",
        "1.12.0.0", "1.24.0.0", "1.45.0.0", "1.48.0.0", "1.56.0.0", "1.68.0.0",
        "1.80.0.0", "1.116.0.0", "1.180.0.0", "1.184.0.0", "1.188.0.0", "1.192.0.0",
        "1.202.0.0", "1.204.0.0", "1.213.105.0", "12.118.130.0", "12.126.40.0", "14.0.0.0",
        "14.0.12.0", "14.1.0.0", "14.1.24.0", "14.1.108.0", "14.16.0.0", "14.102.128.0",
        "14.102.180.0", "14.103.0.0", "14.104.0.0", "14.112.0.0", "14.130.0.0", "14.134.0.0",
        "14.144.0.0", "14.192.56.0", "14.192.76.0", "14.196.0.0", "14.204.0.0", "14.208.0.0",
        "20.134.160.0", "20.139.160.0", "27.0.128.0", "27.0.160.0", "27.0.188.0", "27.8.0.0",
        "27.16.0.0", "27.34.232.0", "27.36.0.0", "27.40.0.0", "27.50.40.0", "27.50.128.0",
        "27.54.72.0", "27.54.152.0", "27.54.192.0", "27.98.208.0", "27.98.224.0", "27.99.128.0",
        "27.103.0.0", "27.106.128.0", "27.106.204.0", "27.109.32.0", "27.109.124.0", "27.112.0.0",
        "27.112.80.0", "27.112.112.0", "27.113.128.0", "27.115.0.0", "27.116.44.0", "27.121.72.0",
        "27.121.120.0", "27.128.0.0", "27.131.220.0", "27.144.0.0", "27.148.0.0", "27.152.0.0",
        "27.184.0.0", "27.192.0.0", "27.224.0.0", "36.0.0.0", "36.0.16.0", "36.0.32.0",
        "36.0.64.0", "36.0.128.0", "36.1.0.0", "36.4.0.0", "36.16.0.0", "36.32.0.0",
        "36.36.0.0", "36.37.0.0", "36.37.36.0", "36.37.39.0", "36.37.40.0", "36.37.48.0",
        "36.40.0.0", "36.48.0.0", "36.51.0.0", "36.51.128.0", "36.51.192.0", "36.51.224.0",
        "36.51.240.0", "36.51.248.0", "36.51.252.0", "36.56.0.0", "36.96.0.0", "36.128.0.0",
        "36.192.0.0", "36.248.0.0", "36.254.0.0", "36.255.116.0", "36.255.128.0", "36.255.164.0",
        "36.255.172.0", "36.255.176.0", "39.0.0.0", "39.0.2.0", "39.0.4.0", "39.0.8.0",
        "39.0.16.0", "39.0.32.0", "39.0.64.0", "39.0.128.0", "39.64.0.0", "39.96.0.0",
        "39.104.0.0", "39.108.0.0", "39.128.0.0", "40.0.176.0", "40.0.247.0", "40.0.248.0",
        "40.0.252.0", "40.0.255.0", "40.72.0.0", "40.125.128.0", "40.126.64.0", "40.198.10.0",
        "40.198.16.0", "40.198.24.0", "40.251.225.0", "40.251.227.0", "42.0.0.0", "42.0.8.0",
        "42.0.16.0", "42.0.24.0", "42.0.32.0", "42.0.128.0", "42.0.160.0", "42.0.176.0",
        "42.0.184.0", "42.0.186.0", "42.0.188.0", "42.0.192.0", "42.0.208.0", "42.0.216.0",
        "42.0.220.0", "42.0.223.0", "42.0.224.0", "42.1.0.0", "42.1.32.0", "42.1.48.0",
        "42.1.56.0", "42.4.0.0", "42.48.0.0", "42.56.0.0", "42.62.0.0", "42.62.128.0",
        "42.62.160.0", "42.62.180.0", "42.62.184.0", "42.63.0.0", "42.80.0.0", "42.83.64.0",
        "42.83.80.0", "42.83.88.0", "42.83.96.0", "42.83.128.0", "42.83.134.0", "42.83.140.0",
        "42.83.142.0", "42.83.144.0", "42.83.160.0", "42.83.192.0", "42.84.0.0", "42.88.0.0",
        "42.96.64.0", "42.96.96.0", "42.96.108.0", "42.96.112.0", "42.96.128.0", "42.97.0.0",
        "42.99.0.0", "42.99.64.0", "42.99.96.0", "42.99.112.0", "42.99.120.0", "42.100.0.0",
        "42.120.0.0", "42.122.0.0", "42.123.0.0", "42.123.36.0", "42.123.40.0", "42.123.48.0",
        "42.123.64.0", "42.123.128.0", "42.123.160.0", "42.123.164.0", "42.123.166.0", "42.123.168.0",
        "42.123.176.0", "42.123.192.0", "42.128.0.0", "42.156.0.0", "42.156.36.0", "42.156.40.0",
        "42.156.48.0", "42.156.64.0", "42.156.128.0", "42.157.0.0", "42.158.0.0", "42.160.0.0",
        "42.176.0.0", "42.184.0.0", "42.186.0.0", "42.187.0.0", "42.187.64.0", "42.187.96.0",
        "42.187.112.0", "42.187.120.0", "42.187.128.0", "42.192.0.0", "42.201.0.0", "42.202.0.0",
        "42.204.0.0", "42.208.0.0", "42.224.0.0", "42.240.0.0", "42.242.0.0", "42.244.0.0",
        "42.248.0.0", "43.224.12.0", "43.224.24.0", "43.224.44.0", "43.224.52.0", "43.224.56.0",
        "43.224.64.0", "43.224.72.0", "43.224.80.0", "43.224.100.0", "43.224.144.0", "43.224.160.0",
        "43.224.176.0", "43.224.184.0", "43.224.200.0", "43.224.208.0", "43.224.216.0", "43.224.240.0",
        "43.225.76.0", "43.225.84.0", "43.225.120.0", "43.225.180.0", "43.225.184.0", "43.225.208.0",
        "43.225.216.0", "43.225.224.0", "43.225.240.0", "43.225.252.0", "43.226.32.0", "43.226.64.0",
        "43.226.96.0", "43.226.112.0", "43.226.120.0", "43.226.128.0", "43.226.160.0", "43.226.236.0",
        "43.226.240.0", "43.227.0.0", "43.227.8.0", "43.227.32.0", "43.227.64.0", "43.227.104.0",
        "43.227.136.0", "43.227.144.0", "43.227.152.0", "43.227.160.0", "43.227.176.0", "43.227.188.0",
        "43.227.192.0", "43.227.232.0", "43.227.248.0", "43.228.0.0", "43.228.64.0", "43.228.76.0",
        "43.228.100.0", "43.228.116.0", "43.228.132.0", "43.228.136.0", "43.228.148.0", "43.228.152.0",
        "43.228.188.0", "43.229.40.0", "43.229.56.0", "43.229.96.0", "43.229.136.0", "43.229.168.0",
        "43.229.176.0", "43.229.192.0", "43.229.216.0", "43.229.232.0", "43.230.20.0", "43.230.32.0",
        "43.230.68.0", "43.230.72.0", "43.230.84.0", "43.230.124.0", "43.230.220.0", "43.230.224.0",
        "43.231.12.0", "43.231.32.0", "43.231.80.0", "43.231.96.0", "43.231.136.0", "43.231.144.0",
        "43.231.160.0", "43.231.176.0", "43.236.0.0", "43.238.0.0", "43.239.0.0", "43.239.32.0",
        "43.239.48.0", "43.239.116.0", "43.239.120.0", "43.239.172.0", "43.240.0.0", "43.240.56.0",
        "43.240.68.0", "43.240.72.0", "43.240.84.0", "43.240.124.0", "43.240.128.0", "43.240.136.0",
        "43.240.156.0", "43.240.160.0", "43.240.192.0", "43.240.240.0", "43.241.0.0", "43.241.16.0",
        "43.241.48.0", "43.241.76.0", "43.241.80.0", "43.241.112.0", "43.241.168.0", "43.241.176.0",
        "43.241.184.0", "43.241.208.0", "43.241.224.0", "43.241.240.0", "43.241.248.0", "43.242.8.0",
        "43.242.16.0", "43.242.48.0", "43.242.64.0", "43.242.72.0", "43.242.80.0", "43.242.96.0",
        "43.242.144.0", "43.242.160.0", "43.242.180.0",
    )
    return random.choice(ip_pool)
def _parse_result_row(tr):
    """Extract the nine exported fields from one search-result table row.

    Args:
        tr: PyQuery wrapper around a single ``<tr>`` of the result table.

    Returns:
        tuple: ``(name, tag, legal_person, capital, founded, email, phone,
        address, status)``, each taken from the row's text.
    """
    details = tr.find(".m-t-xs")
    first_line = details.eq(0)

    name = tr.find(".ma_h1").text()  # company name
    # Strip the label like the other fields do, so the tag is a plain string
    # rather than the list that str.split() used to produce.
    tag = tr.find("p").eq(3).text().replace(u"品牌/产品:", "")
    legal_person = first_line.find("a").text()
    capital = first_line.find(".m-l").eq(0).text().replace(u"注册资本:", "")
    founded = first_line.find(".m-l").eq(1).text().replace(u"成立日期:", "")
    # partition() never raises: a row without the phone label simply yields
    # an empty phone instead of an IndexError.
    email, _, phone = details.eq(1).text().partition(u"电话:")
    address = details.eq(2).text()
    status = tr.find(".nstatus").text()
    return (name, tag, legal_person, capital, founded, email, phone, address, status)


def craw(url, key_word, x):
    """Fetch one search-result page and append its rows to the global columns.

    Each parsed row is appended to the module-level lists ``g_name_list``,
    ``g_tag_list``, ``r_name_list``, ``g_money_list``, ``g_date_list``,
    ``r_email_list``, ``r_phone_list``, ``g_addr_list`` and ``g_state_list``,
    which must already exist when this function is called.

    Args:
        url: Full URL of the result page to request.
        key_word: Search keyword used to build the ``Referer`` header; the
            caller in this file passes it already URL-quoted.
        x: Page number. Accepted for interface compatibility; not used here.

    Returns:
        None. Request failures, non-200 responses and unparsable pages are
        reported with ``print`` and the function returns without appending.
    """
    referer = r'https://www.qichacha.com/search?key=' + key_word
    headers = {
        'Host': 'www.qichacha.com',
        'Connection': 'keep-alive',
        'Accept': r'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': get_user_agent(),
        'Referer': referer,
        'X-Forwarded-For': get_ip(),
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        # NOTE: hard-coded session cookie; replace it when requests start being rejected.
        'Cookie': r'QCCSESSID=ii39oo0rtltvogj0bq5ur1mul3; zg_did=%7B%22did%22%3A%20%2216c07be94cb9e-05ecf112418431-454c092b-1fa400-16c07be94cc690%22%7D; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201563498026192%2C%22updated%22%3A%201563498026197%2C%22info%22%3A%201563498026194%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22www.qichacha.com%22%7D; hasShow=1',
    }

    try:
        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=15)
    except requests.RequestException as exc:
        print('请求都不让,这企查查是想逆天吗???')
        print(exc)
        # Nothing was fetched, so there is nothing to parse.
        return

    # Decode as UTF-8 for every response, not only for failed ones.
    response.encoding = 'utf-8'
    if response.status_code != 200:
        print(response.status_code)
        print('ERROR')
        return

    try:
        rows = pq(response.text).find(".m_srchList").find("tr")
    except Exception as exc:
        # pyquery/lxml raise parser-specific errors on empty or malformed
        # bodies; treat any of them as "page not usable".
        print('好像被拒绝访问了呢...请稍后再试叭...')
        print(exc)
        return

    print('开始爬取数据,请勿打开excel')
    columns = (
        g_name_list, g_tag_list, r_name_list, g_money_list, g_date_list,
        r_email_list, r_phone_list, g_addr_list, g_state_list,
    )
    for tr in rows.items():
        try:
            record = _parse_result_row(tr)
        except (AttributeError, IndexError, TypeError) as exc:
            # One malformed row must not discard the rest of the page.
            print('该行解析失败,已跳过:', exc)
            continue
        print(record[1])
        # Append only after the whole row parsed, so all columns stay the same length.
        for column, value in zip(columns, record):
            column.append(value)
if __name__ == '__main__':
    # A bare `import urllib` does not guarantee that the `urllib.parse`
    # submodule is loaded, so import it explicitly before calling quote().
    import urllib.parse

    # Module-level result columns filled by craw(); entry i of every list
    # belongs to the same company. (Names assigned at module level are already
    # global, so no `global` statements are needed.)
    g_name_list = []
    g_tag_list = []
    r_name_list = []
    g_money_list = []
    g_date_list = []
    r_email_list = []
    r_phone_list = []
    g_addr_list = []
    g_state_list = []

    key_word = input('请输入您想搜索的关键词:')
    # +1 because the page loop below runs over range(1, num).
    num = int(input('请输入您想检索的次数:')) + 1
    sleep_time = int(input('请输入每次检索延时的秒数:'))
    key_word = urllib.parse.quote(key_word)
    print('正在搜索,请稍后')
    for x in range(1, num):
        url = r'https://www.qichacha.com/search_index?key={}&ajaxflag=1&p={}&'.format(key_word, x)
        craw(url, key_word, x)
        # Pause between pages to keep the request rate down.
        time.sleep(sleep_time)

    workbook = xlwt.Workbook()
    # Create the worksheet that receives the results.
    sheet1 = workbook.add_sheet('企查查数据', cell_overwrite_ok=True)
    # Cell style: default XFStyle with only the font name changed.
    style = xlwt.XFStyle()
    font = xlwt.Font()
    font.name = '仿宋'
    style.font = font

    print('正在存储数据,请勿打开excel')
    # Header row.
    name_list = ['公司名字', '品牌/产品', '法定法人', '注册资本', '成立日期', '法人邮箱', '法人电话', '公司地址', '公司状态']
    for cc, title in enumerate(name_list):
        sheet1.write(0, cc, title, style)

    # Data rows: the columns are listed in the same order as the header.
    columns = (
        g_name_list, g_tag_list, r_name_list, g_money_list, g_date_list,
        r_email_list, r_phone_list, g_addr_list, g_state_list,
    )
    for i, record in enumerate(zip(*columns), start=1):
        print(record[0])
        for col, value in enumerate(record):
            sheet1.write(i, col, value, style)

    # Save the workbook; an existing file with the same name is overwritten.
    workbook.save(r"D:\test .xls")
    print('保存完毕~')