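# Input: a CSV file of company names. Output: a CSV file with each company's name, capital, address, patents, scope, industry, shareholders, software copyrights and related information.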
# -*- coding: utf-8 -*-
from selenium import webdriver
import time
import sys,os
import imp
import csv
import pandas as pd
import re
from PIL import Image
from time import sleep
from io import BytesIO
from selenium.webdriver.common.action_chains import ActionChains
import time,random
# Re-import the already loaded ``sys`` module.
# NOTE(review): looks like a leftover of the Python 2 ``reload(sys)`` idiom;
# confirm whether it is still needed.
imp.reload(sys)

# JavaScript evaluated on every new document: makes ``navigator.webdriver``
# read as ``undefined``.
_HIDE_WEBDRIVER_SCRIPT = """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""

# Page opened as soon as the browser is up.
_LOGIN_URL = 'https://www.qichacha.com/user_login'

# Start Chrome without the "enable-automation" switch and with the
# automation extension turned off.
options = webdriver.ChromeOptions()
options.add_experimental_option(
    "excludeSwitches",
    ["enable-automation"],
)
options.add_experimental_option(
    'useAutomationExtension',
    False,
)
driver = webdriver.Chrome(options=options)

# Register the script through the Chrome DevTools Protocol before any page
# is loaded, then open the login page.
_cdp_arguments = {"source": _HIDE_WEBDRIVER_SCRIPT}
driver.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    _cdp_arguments,
)
driver.get(_LOGIN_URL)
# Title of the page shown when the site asks for a manual user verification.
_CAPTCHA_TITLE = '用户验证-企查查'

# How many times one company is retried after the verification page appears
# before it is recorded as failed (guards against an endless retry loop).
_MAX_CAPTCHA_RETRIES = 3

# Scraped columns, in the order they are written after FID and input name.
_COLUMNS = (
    'inc_full', 'money_c', 'money_u', 'addr', 'hangye', 'fanwei',
    'gudong', 'zl_count', 'zhuanli', 'rz_count', 'ruanzhu',
)

# XPath prefix of the first row of the search-result table.
_FIRST_HIT = '//*[@id="search-result"]/tr[1]/td[3]'


def _by_xpath(xpath):
    """Return the element matched by *xpath* on the current page."""
    # Function-scope import: the file-level import block is left untouched.
    from selenium.webdriver.common.by import By
    return driver.find_element(By.XPATH, xpath)


def _by_id(element_id):
    """Return the element whose ``id`` attribute is *element_id*."""
    from selenium.webdriver.common.by import By
    return driver.find_element(By.ID, element_id)


def _inner_text(xpath):
    """Return the stripped ``innerText`` of the element matched by *xpath*."""
    return _by_xpath(xpath).get_attribute('innerText').strip()


def _split_capital(capital):
    """Split a capital string into ``(first run of digits, non-digit chars)``.

    For example ``'500万元人民币'`` becomes ``('500', '万元人民币')``.
    Raises ``IndexError`` when *capital* contains no digit.
    """
    # NOTE(review): only the first run of digits is kept, so a value such as
    # '100.5万元' yields ('100', '.万元'); kept as-is to preserve the output
    # format of existing result files.
    amount = re.findall(r"\d+", capital)[0]
    unit = "".join(re.findall(r"[^0-9]", capital))
    return amount, unit


def _append_row(path, row):
    """Append *row* to the CSV file at *path* (append mode, never truncates)."""
    with open(path, 'a', newline='') as csv_file:
        csv.writer(csv_file).writerow(row)


def _submit_search(query, use_home_form):
    """Type *query* into a search box and submit it.

    When *use_home_form* is false the header search box is tried first; if any
    step of that fails, the home-page search form is used instead.
    """
    if not use_home_form:
        try:
            _by_xpath('//*[@id="clearSearchkey"]').click()  # clear the box
            _by_id('headerKey').send_keys(query)
            _by_xpath('/html/body/header/div/form/div/div/span/button').click()
            return
        except Exception:
            pass  # header form unavailable: fall through to the home form
    _by_id('searchkey').send_keys(query)
    _by_xpath('//*[@id="indexSearchForm"]/div/span/input').click()


def _scrape_basic_info(record):
    """Open the first hit's detail page and fill capital, industry and scope.

    Each field is best-effort: a field that cannot be read stays ``None``.
    """
    try:
        detail_url = _by_xpath(_FIRST_HIT + '/a').get_attribute("href")
        driver.get(detail_url)
    except Exception:
        return
    try:
        capital = _inner_text('//*[@id="Cominfo"]/table/tbody/tr[2]/td[2]')
        record['money_c'], record['money_u'] = _split_capital(capital)
    except Exception:
        pass
    try:
        record['hangye'] = _inner_text('//*[@id="Cominfo"]/table/tbody/tr[4]/td[6]')
    except Exception:
        pass
    try:
        record['fanwei'] = _inner_text('//*[@id="Cominfo"]/table/tbody/tr[9]/td[2]')
    except Exception:
        pass


def _scrape_shareholders():
    """Return the shareholders as ``'name,ratio,amount万元'`` strings.

    Returns ``None`` when the shareholder counter is missing or not positive.
    If a row cannot be read, the rows collected so far are returned.
    """
    try:
        total = int(_inner_text('//*[@id="partnerslist"]/div[1]/span'))
    except Exception:
        return None
    if total <= 0:
        return None
    holders = []
    try:
        # Data rows start at tr[2] in the shareholder table.
        for row in range(2, total + 2):
            base = '//*[@id="partnerslist"]/table/tbody/tr[' + str(row) + ']'
            gd_name = _inner_text(base + '/td[2]/table/tbody/tr/td[2]/a/h3')
            gd_bili = _inner_text(base + '/td[3]')
            gd_money = _inner_text(base + '/td[4]')
            holders.append("{0},{1},{2}".format(gd_name, gd_bili, gd_money + '万元'))
    except Exception:
        pass
    return holders


def _open_assets_page():
    """Click the assets tab, wait, then load the page its link points to."""
    _by_xpath('//*[@id="assets_title"]').click()
    time.sleep(2)
    assets_url = _by_xpath('//*[@id="assets_title"]').get_attribute("href")
    driver.get(assets_url)


def _scrape_counted_list(counter_xpath, row_xpath_template):
    """Return ``(count_text, names)`` for one list on the assets page.

    *counter_xpath* locates the element whose text contains the item count;
    *row_xpath_template* is formatted with the table row number of each item.
    ``count_text`` is the first run of digits of the counter (``None`` when it
    cannot be read); ``names`` is ``None`` when the count is zero or unknown.
    If a row cannot be read, the names collected so far are returned.
    """
    count_text = None
    names = None
    try:
        count_text = re.findall(r"\d+", _inner_text(counter_xpath))[0]
        item_total = int(count_text)
        if item_total > 0:
            names = []
            # Data rows start at tr[2] in the list tables.
            for row in range(2, item_total + 2):
                names.append(_inner_text(row_xpath_template.format(row)))
    except Exception:
        pass
    return count_text, names


def _scrape_company(query, use_home_form):
    """Search for *query* and return the scraped values in ``_COLUMNS`` order.

    Raises whatever the search or the lookup of the first hit's name raises;
    every other field is best-effort and left as ``None`` on failure.
    """
    _submit_search(query, use_home_form)
    # Start from all-None so a failed lookup can never reuse another
    # company's value or leave a name undefined.
    record = dict.fromkeys(_COLUMNS)
    record['inc_full'] = _by_xpath(_FIRST_HIT + '/a').text.strip()
    print(record['inc_full'])  # company name
    try:
        record['addr'] = _by_xpath(_FIRST_HIT + '/p[3]').text.strip()
    except Exception:
        pass
    _scrape_basic_info(record)
    record['gudong'] = _scrape_shareholders()
    try:
        _open_assets_page()
    except Exception:
        pass
    else:
        # Patents and software copyrights are read independently so that a
        # failure in one list does not discard the other.
        record['zl_count'], record['zhuanli'] = _scrape_counted_list(
            '//*[@id="assets_div"]/section[1]/div/a[2]',
            '//*[@id="zhuanlilist"]/table/tbody/tr[{}]/td[5]/a',
        )
        record['rz_count'], record['ruanzhu'] = _scrape_counted_list(
            '//*[@id="assets_div"]/section[1]/div/a[5]',
            '//*[@id="rjzzqlist"]/table/tbody/tr[{}]/td[2]',
        )
    return [record[column] for column in _COLUMNS]


def qichacha(inc_list, outputfile, city, username, password):
    """Log in through Weibo, look up every company and append the results.

    Uses the module-level ``driver`` and, for the progress file, the
    module-level names ``outputfile1`` and ``zongnum`` (defined outside this
    function).

    Args:
        inc_list: Table read as ``inc_list["FID"][i]`` and
            ``inc_list["name"][i]`` for ``i`` in ``range(len(inc_list))``
            (presumably a pandas DataFrame with a default index; verify
            against the caller).
        outputfile: Path of the CSV file that receives one appended row per
            company: FID, input name, then the ``_COLUMNS`` values.
        city: Text prepended to a company name that does not already
            contain it before searching.
        username: Account typed into the Weibo login form.
        password: Password typed into the Weibo login form.

    After each company the file ``outputfile1`` is rewritten with a single
    row: ``zongnum``, companies processed, companies failed, companies
    succeeded. When the user-verification page appears, the script pauses
    and retries the same company up to ``_MAX_CAPTCHA_RETRIES`` times.
    """
    _by_xpath('//*[@id="normalLogin"]').click()  # log in
    _by_xpath('//*[@class="btn-weibo m-l-xs"]').click()  # Weibo login
    # Fill in the user name and password.
    _by_id('userId').send_keys(username)
    _by_id('passwd').send_keys(password)
    # Pause so a manual verification step can be completed before the click.
    time.sleep(3)
    # Weibo QR-code scan login.
    _by_xpath('//*[@id="outer"]/div/div[2]/form/div/div[2]/div/p/a[1]').click()
    time.sleep(5)

    failed = 0
    captcha_retries = 0
    index = 0
    total = len(inc_list)
    while index < total:
        fid = inc_list["FID"][index]
        name = inc_list["name"][index]
        query = name if city in name else city + name
        time.sleep(2)
        # The very first search starts from the home page form; retries and
        # later searches try the header search box first.
        use_home_form = index == 0 and captcha_retries == 0
        try:
            values = _scrape_company(query, use_home_form)
            _append_row(outputfile, [fid, name] + values)
            try:
                # Close the pop-up dialog if one is shown.
                _by_xpath('//*[@id="firstcaseModal"]/div/div/div[2]/button').click()
            except Exception:
                pass
        except Exception:
            title = driver.title
            print(title)
            if title == _CAPTCHA_TITLE and captcha_retries < _MAX_CAPTCHA_RETRIES:
                # Wait for the operator to pass the verification, then retry
                # the same company instead of silently dropping it.
                os.system("pause")
                captcha_retries += 1
                continue
            failed += 1
            _append_row(outputfile, [fid, name] + [None] * len(_COLUMNS))
        captcha_retries = 0
        processed = index + 1
        # Progress summary: overwritten each time so it always holds one row.
        with open(outputfile1, 'w', newline='') as csv_file:
            csv.writer(csv_file).writerow(
                [zongnum, processed, failed, processed - failed]
            )
        index += 1