记录一次利用python调用chrome爬取天眼查网址工商信息的过程

1.首先准备工作:

 1.pyhton3.6(官网有下载)。https://www.python.org/downloads/release/python-363/

 2.pycharm 2017 开发工具。(官网有下载,破解方法百度)。

 下载地址:https://www.jetbrains.com/zh/pycharm/specials/pycharm/pycharm.html

 破解方法参考:http://blog.csdn.net/u014044812/article/details/78727496

 3.看看谷歌浏览器的版本然后下载对应的驱动,放到pyhton安装目录下。(因为本次爬虫由python代码调用chorme浏览器)

要下载哪个版本可以参考:http://blog.csdn.net/huilan_same/article/details/51896672

下载驱动的网址有时候需要梯子

记录一次利用python调用chrome爬取天眼查网址工商信息的过程_第1张图片

 4.验证安装完成。有如下提示,代表安装完成

记录一次利用python调用chrome爬取天眼查网址工商信息的过程_第2张图片

5.输入以下两个安装命令(pip install selenium; pip install pymysql;),因为爬虫程序要用到以下两个包。

我的已经安装过了,提示跟你的可能不一样。

记录一次利用python调用chrome爬取天眼查网址工商信息的过程_第3张图片


2.数据库

 捉取到的数据保存到mysql中,以下是我的两个表,实际需求实际分析

company表是任务列表(done字段 null代表待执行的任务,0代表失败,1代表成功,2代表没有查询到数据),

result是保存结果的表

记录一次利用python调用chrome爬取天眼查网址工商信息的过程_第4张图片

3.代码

记录一次利用python调用chrome爬取天眼查网址工商信息的过程_第5张图片

附上代码:

example:

import re
from selenium import webdriver
import time
import uuid
import mysqlDao

class mainAll(object):

    def __init__(self):
        self.url = 'https://www.tianyancha.com/login'
        self.username = ''  #自己的天眼查账号
        self.password = ''  #自己的密码
        self.word = '淘宝'
        self.driver = self.login()
        # self.getData(self.driver)
        # self.scrapy(self.driver)
        print("ok,the work is done!")

    def login(self):

        # driver = webdriver.Chrome()
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        driver = webdriver.Chrome(chrome_options=chrome_options)
        driver.get(self.url)

        # 模拟登陆
        driver.find_element_by_xpath(
            ".//*[@id='web-content']/div/div/div/div[2]/div/div[2]/div[2]/div[2]/div[2]/input"). \
            send_keys(self.username)
        driver.find_element_by_xpath(
            ".//*[@id='web-content']/div/div/div/div[2]/div/div[2]/div[2]/div[2]/div[3]/input"). \
            send_keys(self.password)
        driver.find_element_by_xpath(
            ".//*[@id='web-content']/div/div/div/div[2]/div/div[2]/div[2]/div[2]/div[5]").click()
        time.sleep(3)
        driver.refresh()
        # driver.get('https://www.tianyancha.com/company/28723141')

        # # 模拟登陆完成,输入搜索内容
        driver.find_element_by_xpath(".//*[@id='home-main-search']").send_keys(self.word)  # 输入搜索内容
        driver.find_element_by_xpath(".//*[@class='input-group-addon search_button']").click()  # 点击搜索
        driver.implicitly_wait(10)
        #
        #
        #
        # # 选择相关度最高的搜索结果 第一条搜索框,然后再
        # tag = driver.find_elements_by_xpath("//div[@class='search_right_item ml10']")
        # btn = tag[0].find_element_by_tag_name('a')
        # print(btn);
        # closeBtn = driver.find_element_by_id("bannerClose")
        # if not closeBtn is None:
        #     closeBtn.click()
        # btn.click()
        # driver.implicitly_wait(5)
        #
        # # 转化句柄
        # now_handle = driver.current_window_handle
        # all_handles = driver.window_handles
        # for handle in all_handles:
        #     if handle != now_handle:
        #         # 输出待选择的窗口句柄
        #         print(handle)
        #         driver.switch_to.window(handle)

        cons = conn_mysql.selectUnFinishCompany();
        for row in cons:
            driver.implicitly_wait(3)
            id = row[0]
            conpanyname = row[1]
            time.sleep(1)
            print("正在查询第"+str(id)+"个【"+conpanyname + "】")
            try:
                self.refsh(driver, id, conpanyname);
            except:
                print("出现异常!!!第" + str(id) + "个【" + conpanyname + "】")
                conn_mysql.updateConpanyFlase(id);
                driver.refresh();
            print("已经完成第" + str(id) + "个【" + conpanyname + "】")
        return driver

    def refsh(self,driver,id,conpanyname):
        # 模拟登陆完成,输入搜索内容
        driver.find_element_by_xpath("//input[@class='search_input form-control "
                                     "search_form input search-input-v1 f12 "
                                     "js-live-search']").clear();
        driver.find_element_by_xpath("//input[@class='search_input form-control "
                                     "search_form input search-input-v1 f12 "
                                     "js-live-search']").send_keys(conpanyname)  # 输入搜索内容
        driver.find_element_by_xpath("//div[@class='input-group-addon button-blue-sm pt0 pb0']").click()  # 点击搜索
        driver.implicitly_wait(10)

        # 选择相关度最高的搜索结果 第一条搜索框,然后再
        tag = driver.find_elements_by_xpath("//div[@class='search_right_item ml10']")
        if len(tag) ==0:
            print("没有查询到数据!!!第" + str(id) + "个【" + conpanyname + "】")
            conn_mysql.updateConpanyFlase(id);
            return 0;
        btn = tag[0].find_element_by_tag_name('a')
        print(btn);
        try:
            closeBtn = driver.find_element_by_id("bannerClose")
            if not closeBtn is None:
                closeBtn.click()
        except:
            print("")
        btn.click()
        driver.implicitly_wait(5)

        # 转化句柄
        now_handle = driver.current_window_handle
        all_handles = driver.window_handles
        for handle in all_handles:
            if handle != now_handle:
                # 输出待选择的窗口句柄
                print(handle)
                driver.switch_to.window(handle)
        self.getData(driver,id);
        driver.close();
        driver.switch_to_window(all_handles[0])
        return driver

    def getData(self, driver,id):

        re = self.baseInfo("test",driver,id);
        print(re);
        conn_mysql.insertCompany(re);
        conn_mysql.updateConpany(id);
        return 1;



    def baseInfo(self, idd,driver,id):
        # base = self.driver.find_element_by_xpath("//div[@class='company_header_width ie9Style position-rel']/div")
        # # base '淘宝(中国)软件有限公司浏览40770\n高新企业\n电话:18768440137邮箱:暂无\n网址:http://www.atpanel.com
        # # 地址:杭州市余杭区五常街道荆丰村'
        # name = base.text.split('浏览')[0]
        # tel = base.text.split('电话:')[1].split('邮箱:')[0]
        # email = base.text.split('邮箱:')[1].split('\n')[0]
        # web = base.text.split('网址:')[1].split('地址')[0]
        # address = base.text.split('地址:')[1]
        # abstract = self.driver.find_element_by_xpath("//div[@class='sec-c2 over-hide']//script")
        # # 获取隐藏内容
        # abstract = self.driver.execute_script("return arguments[0].textContent", abstract).strip()
        cname = driver.find_element_by_xpath(
            "//div[@class='position-rel']/span[@class='f18 in-block vertival-middle sec-c2']").text
        pname = driver.find_element_by_xpath(
            "//div[@class='f18 overflow-width sec-c3']/a").text
        tabs = driver.find_elements_by_tag_name('table')
        rows = tabs[1].find_elements_by_tag_name('tr')
        cols = rows[0].find_elements_by_tag_name('td' and 'th')
        # 工商注册号
        reg_code = rows[0].find_elements_by_tag_name('td')[1].text
        # 注册地址
        reg_address = rows[5].find_elements_by_tag_name('td')[1].text
        # 英文名称
        # english_name = rows[5].find_elements_by_tag_name('td')[1].text
        # 经营范围
        # ent_range = rows[6].find_elements_by_tag_name('td')[1].text
        # 统一信用代码
        creditcode = rows[1].find_elements_by_tag_name('td')[1].text
        # 纳税人识别号
        tax_code = rows[2].find_elements_by_tag_name('td')[1].text
        # 营业期限
        # deadline = rows[3].find_elements_by_tag_name('td')[1].text
        # 企业类型
        # ent_type = rows[1].find_elements_by_tag_name('td')[3].text

        # baseInfo = (idd, name, tel, email, web, address, abstract, reg_code, reg_address, english_name, ent_range,
        #             creditcode, tax_code, deadline, ent_type)

        return ( id,cname,pname , reg_code , creditcode , reg_address , tax_code )


mysqlDao:

import pymysql

def selectUnFinishCompany():
    # 名称 职位 公司名称  entuid
    conn = pymysql.connect(host='localhost', user='root', passwd='root', db='tianyan', port=3306, charset='utf8')
    cur = conn.cursor()  # 获取一个游标
    sql = "select * from company where done is null and id >= 2900 ORDER BY id desc"
    try:
        cur.execute(sql)
        results = cur.fetchall()
        return results
        # for row in results:
        #     companyname = row[0]
        #     id = row[1]
        #     done = row[2]

    except:
        print("Error: unable to fecth data")
    cur.close()  # 关闭游标
    conn.close()  # 释放数据库资源


def updateConpany(id):
    conn = pymysql.connect(host='localhost', user='root', passwd='root', db='tianyan', port=3306, charset='utf8')
    cur = conn.cursor()  # 获取一个游标
    sql = "update company set done = 1 where id = '%d'"
    print(sql)
    try:
        cur.execute(sql % id)
    except:
        print("Error: unable to fecth data")
    conn.commit()
    cur.close()  # 关闭游标
    conn.close()  # 释放数据库资源

def updateConpanyFlase(id):
    conn = pymysql.connect(host='localhost', user='root', passwd='root', db='tianyan', port=3306, charset='utf8')
    cur = conn.cursor()  # 获取一个游标
    sql = "update company set done = 0 where id = '%d'"
    print(sql)
    try:
        cur.execute(sql % id)
    except:
        print("Error: unable to fecth data")
    conn.commit()
    cur.close()  # 关闭游标
    conn.close()  # 释放数据库资源

def insertCompany(values):
    conn = pymysql.connect(host='localhost', user='root', passwd='root', db='tianyan', port=3306, charset='utf8')
    cur = conn.cursor()  # 获取一个游标
    sql = "INSERT INTO result (id, companyname,pname, reg_code, creditcode , reg_address, tax_code) " \
            "VALUES ( '%d', '%s','%s', '%s','%s', '%s','%s') "
    print(sql)
    try:
        cur.execute(sql % values)
    except:
        print("Error: unable to fecth data")
    conn.commit()
    cur.close()  # 关闭游标
    conn.close()  # 释放数据库资源





你可能感兴趣的:(python)