企查查爬虫

企查查网站爬取

输入:包含企业名称的 CSV 文件(代码会读取其中的 FID 和 name 两列);输出:对应企业的名称、资金、地址、行业、经营范围、股东、专利、软著等信息的 CSV 文件。

# -*- coding: utf-8 -*-
from selenium import webdriver
import time
import sys,os
import imp
import csv
import pandas as pd
import re
from PIL import Image
from time import sleep
from io import BytesIO
from selenium.webdriver.common.action_chains import ActionChains
import time,random
# Reload the sys module, exactly as the original script did.
# NOTE(review): this looks like a Python 2 leftover -- confirm it is still needed.
imp.reload(sys)

# Chrome experimental options applied before the browser is started; both
# entries switch off Chrome's automation-related switches/extension.
_EXPERIMENTAL_OPTIONS = (
    ("excludeSwitches", ["enable-automation"]),
    ('useAutomationExtension', False),
)

# JavaScript registered through the CDP command below: it redefines
# navigator.webdriver so that reading it yields undefined.
_HIDE_WEBDRIVER_JS = """
    Object.defineProperty(navigator, 'webdriver', {
      get: () => undefined
    })
  """

options = webdriver.ChromeOptions()
for _option_name, _option_value in _EXPERIMENTAL_OPTIONS:
    options.add_experimental_option(_option_name, _option_value)

# Module-level browser session; the scraping function below uses it by name.
driver = webdriver.Chrome(options=options)
driver.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    {"source": _HIDE_WEBDRIVER_JS},
)

# Start on the login page so the scraper can sign in first.
driver.get('https://www.qichacha.com/user_login')
# Page title shown when the site asks for a manual user verification.
_CAPTCHA_TITLE = '用户验证-企查查'
# Upper bound on verification retries per company, so a verification page
# that never goes away cannot stall the run forever.
_MAX_CAPTCHA_RETRIES = 3
# Number of scraped columns written after the FID and the original name.
_SCRAPED_FIELD_COUNT = 11


def _inner_text(xpath):
    """Return the stripped ``innerText`` of the first element matching *xpath*."""
    return driver.find_element_by_xpath(xpath).get_attribute('innerText').strip()


def _append_row(path, row):
    """Append *row* as one CSV record to the file at *path*."""
    with open(path, 'a', newline='') as csv_file:
        csv.writer(csv_file).writerow(row)


def _submit_search(query, use_home_form):
    """Type *query* into the site's search box and submit the search.

    With ``use_home_form`` true only the home-page form is used. Otherwise
    the header search box is tried first and the home-page form is the
    fallback when any header element is missing.
    """
    if not use_home_form:
        try:
            driver.find_element_by_xpath('//*[@id="clearSearchkey"]').click()
            driver.find_element_by_id('headerKey').send_keys(query)
            driver.find_element_by_xpath(
                '/html/body/header/div/form/div/div/span/button').click()
            return
        except Exception:
            # Header search unavailable on this page: use the home-page form.
            pass
    driver.find_element_by_id('searchkey').send_keys(query)
    driver.find_element_by_xpath('//*[@id="indexSearchForm"]/div/span/input').click()


def _scrape_company(query, use_home_form):
    """Search for *query* and scrape the first hit.

    Returns a list of ``_SCRAPED_FIELD_COUNT`` values in output-column order:
    full name, capital amount, capital unit, address, industry, scope,
    shareholders, patent count, patent names, software-copyright count,
    software-copyright names. Every optional section is best-effort: a
    section that cannot be read leaves its fields as ``None`` (or, for the
    list fields, as whatever was collected before the failure).

    Raises whatever Selenium raises when the search itself or the first
    result row cannot be found; the caller treats that as a failed company.
    """
    _submit_search(query, use_home_form)

    first_hit = '//*[@id="search-result"]/tr[1]/td[3]'
    inc_full = driver.find_element_by_xpath(first_hit + '/a').text.strip()
    print(inc_full)

    # Start every company from a clean slate so a failed section can never
    # leak the previous company's values into this row.
    money_c = money_u = addr = hangye = fanwei = None
    gudong = zl_count = zhuanli = rz_count = ruanzhu = None

    try:
        addr = driver.find_element_by_xpath(first_hit + '/p[3]').text.strip()
    except Exception:
        pass

    # --- company detail page: capital, industry, scope -------------------
    try:
        detail_url = driver.find_element_by_xpath(first_hit + '/a').get_attribute("href")
        driver.get(detail_url)
    except Exception:
        pass
    else:
        info_cell = '//*[@id="Cominfo"]/table/tbody/tr[{0}]/td[{1}]'
        try:
            capital = _inner_text(info_cell.format(2, 2))
            amount = re.search(r"\d+(?:\.\d+)?", capital)
            if amount:
                # Keep the fractional part with the amount; the unit is the
                # text left once digits and decimal points are removed.
                money_c = amount.group(0)
                money_u = re.sub(r"[\d.]", "", capital)
        except Exception:
            pass
        try:
            hangye = _inner_text(info_cell.format(4, 6))
        except Exception:
            pass
        try:
            fanwei = _inner_text(info_cell.format(9, 2))
        except Exception:
            pass

    # --- shareholders ------------------------------------------------------
    try:
        gd_total = int(_inner_text('//*[@id="partnerslist"]/div[1]/span'))
        if gd_total > 0:
            gudong = []
            # Data rows start at tr[2] in the shareholder table.
            for row_no in range(2, gd_total + 2):
                row = '//*[@id="partnerslist"]/table/tbody/tr[{0}]'.format(row_no)
                gd_name = _inner_text(row + '/td[2]/table/tbody/tr/td[2]/a/h3')
                gd_bili = _inner_text(row + '/td[3]')
                gd_money = _inner_text(row + '/td[4]')
                gudong.append("{0},{1},{2}".format(gd_name, gd_bili, gd_money + '万元'))
    except Exception:
        # Keep the shareholders collected so far.
        pass

    # --- assets page: patents and software copyrights ---------------------
    try:
        driver.find_element_by_xpath('//*[@id="assets_title"]').click()
        time.sleep(2)
        assets_url = driver.find_element_by_xpath(
            '//*[@id="assets_title"]').get_attribute("href")
        driver.get(assets_url)
    except Exception:
        pass
    else:
        # The two lists are read independently so a failure in the patent
        # table does not discard the software-copyright data.
        try:
            zl_count = re.findall(
                r"\d+", _inner_text('//*[@id="assets_div"]/section[1]/div/a[2]'))[0]
            if int(zl_count) > 0:
                zhuanli = []
                for row_no in range(2, int(zl_count) + 2):
                    zhuanli.append(_inner_text(
                        '//*[@id="zhuanlilist"]/table/tbody/tr[{0}]/td[5]/a'.format(row_no)))
        except Exception:
            pass
        try:
            rz_count = re.findall(
                r"\d+", _inner_text('//*[@id="assets_div"]/section[1]/div/a[5]'))[0]
            if int(rz_count) > 0:
                ruanzhu = []
                for row_no in range(2, int(rz_count) + 2):
                    ruanzhu.append(_inner_text(
                        '//*[@id="rjzzqlist"]/table/tbody/tr[{0}]/td[2]'.format(row_no)))
        except Exception:
            pass

    return [inc_full, money_c, money_u, addr, hangye, fanwei,
            gudong, zl_count, zhuanli, rz_count, ruanzhu]


def qichacha(inc_list, outputfile, city, username, password,
             outputfile1=None, zongnum=None):
    """Log in through the Weibo form, then scrape every company in *inc_list*.

    Uses the module-level ``driver``, which is expected to be showing the
    login page already. One CSV row per company is appended to *outputfile*:
    FID, original name, then the fields returned by ``_scrape_company``.
    A company whose search fails gets a row with those fields empty.

    Args:
        inc_list: Table read as ``inc_list["FID"][i]`` and
            ``inc_list["name"][i]`` for ``i`` in ``range(len(inc_list))``
            (presumably a pandas DataFrame with a default integer index --
            verify against the caller).
        outputfile: Path of the result CSV; rows are appended.
        city: Text prepended to a company name that does not already
            contain it, to form the search query.
        username: Account typed into the Weibo login form.
        password: Password typed into the Weibo login form.
        outputfile1: Optional path of a progress CSV that is overwritten
            after each company with ``[zongnum, processed, failed,
            succeeded]``. When omitted, a module-level ``outputfile1`` is
            used if one exists; otherwise no progress file is written.
        zongnum: First value of the progress row (presumably the total
            number of companies -- confirm with the caller). When omitted,
            a module-level ``zongnum`` is used if one exists, else
            ``len(inc_list)``.
    """
    # The original body read these two names as free variables; keep that
    # working for scripts that define them as module globals.
    if outputfile1 is None:
        outputfile1 = globals().get("outputfile1")
    if zongnum is None:
        zongnum = globals().get("zongnum", len(inc_list))

    # --- login ---------------------------------------------------------------
    driver.find_element_by_xpath('//*[@id="normalLogin"]').click()  # login tab
    driver.find_element_by_xpath('//*[@class="btn-weibo m-l-xs"]').click()  # Weibo login
    driver.find_element_by_id('userId').send_keys(username)
    driver.find_element_by_id('passwd').send_keys(password)
    time.sleep(3)  # pause so any manual verification step can be completed
    driver.find_element_by_xpath(
        '//*[@id="outer"]/div/div[2]/form/div/div[2]/div/p/a[1]').click()
    time.sleep(5)

    # --- scrape --------------------------------------------------------------
    count = 0  # companies written with empty fields because scraping failed
    captcha_retries = 0
    idx = 0
    total = len(inc_list)
    # A while loop (not for) so that a company can really be retried after
    # the verification page has been dealt with.
    while idx < total:
        fid = inc_list["FID"][idx]
        txt = inc_list["name"][idx]
        query = txt if city in txt else city + txt
        time.sleep(2)
        try:
            # Only the very first search starts from the home-page form;
            # retries go through the header search with its fallback.
            fields = _scrape_company(
                query, use_home_form=(idx == 0 and captcha_retries == 0))
            _append_row(outputfile, [fid, txt] + fields)
            try:
                # Dismiss the pop-up dialog if the page shows one.
                driver.find_element_by_xpath(
                    '//*[@id="firstcaseModal"]/div/div/div[2]/button').click()
            except Exception:
                pass
        except Exception:
            title = driver.title
            print(title)
            if title == _CAPTCHA_TITLE and captcha_retries < _MAX_CAPTCHA_RETRIES:
                # Wait for the operator to pass the verification, then retry
                # the same company instead of dropping it.
                captcha_retries += 1
                os.system("pause")
                continue
            count += 1
            _append_row(outputfile, [fid, txt] + [None] * _SCRAPED_FIELD_COUNT)

        captcha_retries = 0
        if outputfile1 is not None:
            # Overwrite on purpose: the file holds only the latest progress.
            with open(outputfile1, 'w', newline='') as csv_file:
                csv.writer(csv_file).writerow(
                    [zongnum, idx + 1, count, idx + 1 - count])
        idx += 1
            
            
      

你可能感兴趣的:(爬虫,python)