使用 Python 和 Selenium 的 ChromeDriver 模拟登录并爬取网站信息(使用 BeautifulSoup 解析)

需要爬取的信息很多,因此程序会记录断点,以便重启后从上次中断的位置继续爬取;程序异常中断后还会自动重启。

1.setting.py

对爬取的常量进行设置

"""
Basic crawler settings.
"""

# Site root; prepended to relative links found in the scraped pages.
base_url = "https://com"

# Sign-in page opened by the browser before crawling starts.
login_page = "https://com/users/sign_in"

# Search-results URL template: first {} is the jurisdiction code, second {} is the page number.
company_bsae_url = "https://com/companies/{}?branch=&commit=Go&inactive=false&mode=best_fields&nonprofit=&order=score&page={}&q=&search_fields%5B%5D=name&search_fields%5B%5D=previous_names&search_fields%5B%5D=company_number&utf8=%E2%9C%93"

# Highest result-page number requested for each jurisdiction.
max_page = 30

# MySQL connection parameters.
DATABASE = dict(
    host='127.0.0.1',     # database host
    port=3306,            # database port
    db='opencorporates',  # database name
    user='root',          # database user
    passwd='root',        # database password
    charset='utf8',       # connection character set
)

# Credentials typed into the sign-in form.
login_email = "[email protected]"
login_password = "15184378218lk"

# Mailbox that notification mails are sent from.
SENDEMAIL = 'com'
# Password used to log in to the sender mailbox.
EMAILPSD = ''
# Mailbox that notification mails are sent to.
TOEMAIL = 'com'

# Jurisdiction codes to crawl, in crawl order (131 entries). Besides country codes
# the list contains sub-national codes such as US states (us_*), Canadian provinces
# (ca_*) and UAE emirates (ae_*).
country_list = ['us_la', 'us_sd', 'be', 'us_ok', 'je', 'ca_nl', 'us_nm', 'ca_qc', 'bg', 'us_fl', 'rw', 'dk', 'us_ut', 'us_sc', 'do', 'mt', 'us_mo',
                'us_ks', 'us_ms', 'si', 'is', 'th', 'bm', 'aw', 'us_nd', 'me', 'us_or', 'us_az', 'ro', 'ae_az', 'gi', 'mx', 'bo', 'us_ri', 'fr',
                'us_wv', 'au', 'ch', 'jm', 'mm', 'vu', 'bz', 'us_wy', 'us_nv', 'us_co', 'jp', 'us_tx', 'us_ne', 'ca_ns', 'us_ak', 'bs', 'al', 'gb',
                'sk', 'za', 'us_ga', 'ua', 'gl', 'gg', 'mu', 'us_de', 'us_tn', 'nl', 'ir', 'us_me', 'bh', 'de', 'ca', 'us_nc', 'pa', 'li', 'pr',
                'us_id', 'no', 'us_nh', 'us_ar', 'il', 'us_ca', 'hr', 'se', 'us_al', 'us_mi', 'us_ny', 'us_mt', 'us_oh', 'es', 'us_ct', 'cw', 'us_ia',
                'br', 'ca_pe', 'us_va', 'im', 'bb', 'us_ky', 'bd', 'us_vt', 'us_mn', 'to', 'md', 'tj', 'my', 'nz', 'vn', 'us_pa', 'hk', 'kh', 'sg',
                'lv', 'fi', 'us_md', 'gr', 'us_in', 'us_wi', 'cy', 'tn', 'tz', 'pk', 'in', 'pl', 'ca_nb', 'ae_du', 'us_ma', 'ug', 'us_dc', 'us_hi',
                'ie', 'lu', 'us_wa', 'by', 'us_nj']

# Lookup table from jurisdiction code to its display name. The display name is the
# value stored and queried as the `jurisdiction` of a company.
# NOTE(review): the 'ca_nl' label is truncated ('Newfoundland and Labrador (Ca...');
# confirm whether the full name was intended before changing it, since existing
# rows may already use the truncated value.
country_dic = {'gl': 'Greenland', 'us_in': 'Indiana (US)', 'ie': 'Ireland', 'us_wi': 'Wisconsin (US)', 'jm': 'Jamaica', 'bg': 'Bulgaria',
               'tj': 'Tajikistan',
               'us_ok': 'Oklahoma (US)', 'im': 'Isle of Man', 'ug': 'Uganda', 'ro': 'Romania', 'ca_nb': 'New Brunswick (Canada)',
               'us_wv': 'West Virginia (US)',
               'in': 'India', 'mx': 'Mexico', 'pl': 'Poland', 'us_wy': 'Wyoming (US)', 'us_mo': 'Missouri (US)', 'us_pa': 'Pennsylvania (US)',
               'bb': 'Barbados',
               'us_az': 'Arizona (US)', 'us_dc': 'District of Columbia (US)', 'nl': 'Netherlands', 'gr': 'Greece', 'us_nm': 'New Mexico (US)',
               'aw': 'Aruba',
               'ir': 'Iran', 'fr': 'France', 'us_nc': 'North Carolina (US)', 'us_vt': 'Vermont (US)', 'us_ut': 'Utah (US)',
               'us_nh': 'New Hampshire (US)',
               'by': 'Belarus', 'ch': 'Switzerland', 'us_al': 'Alabama (US)', 'us_ar': 'Arkansas (US)', 'us_va': 'Virginia (US)', 'es': 'Spain',
               'mm': 'Myanmar', 'hk': 'Hong Kong', 'us_md': 'Maryland (US)', 'be': 'Belgium', 'lv': 'Latvia', 'ca_qc': 'Quebec (Canada)',
               'bz': 'Belize',
               'us_ny': 'New York (US)', 'us_sd': 'South Dakota (US)', 'us_mt': 'Montana (US)', 'us_oh': 'Ohio (US)', 'us_ma': 'Massachusetts (US)',
               'nz': 'New Zealand', 'br': 'Brazil', 'kh': 'Cambodia', 'pk': 'Pakistan', 'us_mn': 'Minnesota (US)', 'us_ct': 'Connecticut (US)',
               'us_wa': 'Washington (US)', 'us_me': 'Maine (US)', 'fi': 'Finland', 'gg': 'Guernsey', 'gi': 'Gibraltar', 'bd': 'Bangladesh',
               'us_ga': 'Georgia (US)', 'th': 'Thailand', 'do': 'Dominican Republic', 'li': 'Liechtenstein', 'us_ky': 'Kentucky (US)',
               'my': 'Malaysia',
               'je': 'Jersey', 'us_sc': 'South Carolina (US)', 'us_id': 'Idaho (US)', 'cw': 'Curaçao', 'se': 'Sweden', 'pa': 'Panama',
               'ca_nl': 'Newfoundland and Labrador (Ca...', 'de': 'Germany', 'vn': 'Viet Nam', 'us_nv': 'Nevada (US)', 'us_ne': 'Nebraska (US)',
               'ua': 'Ukraine', 'us_fl': 'Florida (US)', 'ca_pe': 'Prince Edward Island (Canada)', 'mt': 'Malta', 'us_hi': 'Hawaii (US)',
               'vu': 'Vanuatu',
               'si': 'Slovenia', 'rw': 'Rwanda', 'sg': 'Singapore', 'us_ks': 'Kansas (US)', 'me': 'Montenegro', 'is': 'Iceland', 'bh': 'Bahrain',
               'au': 'Australia', 'us_ms': 'Mississippi (US)', 'us_ak': 'Alaska (US)', 'us_co': 'Colorado (US)', 'pr': 'Puerto Rico', 'md': 'Moldova',
               'us_mi': 'Michigan (US)', 'to': 'Tonga', 'bo': 'Bolivia', 'za': 'South Africa', 'ca_ns': 'Nova Scotia (Canada)', 'il': 'Israel',
               'us_or': 'Oregon (US)', 'tz': 'Tanzania', 'us_tn': 'Tennessee (US)', 'ae_az': 'Abu Dhabi (UAE)', 'lu': 'Luxembourg', 'tn': 'Tunisia',
               'us_ca': 'California (US)', 'us_la': 'Louisiana (US)', 'al': 'Albania', 'us_ia': 'Iowa (US)', 'gb': 'United Kingdom',
               'mu': 'Mauritius',
               'hr': 'Croatia', 'dk': 'Denmark', 'sk': 'Slovakia', 'cy': 'Cyprus', 'ae_du': 'Dubai (UAE)', 'us_tx': 'Texas (US)',
               'us_de': 'Delaware (US)',
               'us_nd': 'North Dakota (US)', 'no': 'Norway', 'bm': 'Bermuda', 'jp': 'Japan', 'us_ri': 'Rhode Island (US)', 'bs': 'Bahamas',
               'ca': 'Canada',
               'us_nj': 'New Jersey (US)'}

2.allcountry.py

主要爬虫文件,利用浏览器驱动模拟登录网站,然后对网站信息进行爬取,如果爬取出错,继续下一个页面的爬取

# -*- coding: utf-8 -*-
import time
import datetime
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
from setting import *
from db import db_insert, db_update
from tools import format_str, send_email, dic_change_str, query_company_is_exist, find_breakpoint, save_breakpoint


# CSS selectors, relative to the company attribute list, of the fields whose
# text content is stored as-is. Keys are the db_insert/db_update parameter names.
_TEXT_FIELD_SELECTORS = {
    "company_number": 'dd.company_number',
    "status": 'dd.status',
    "company_type": 'dd.company_type',
    "business_number": 'dd.business_number',
    "agent_name": 'dd.agent_name',
    "agent_address": 'dd.agent_address',
    "branch": 'dd.branch',
    "industry_codes": 'dd.industry_codes',
    "ultimate_beneficial_owners": 'dd.ultimate_beneficial_owners',
    "latest_accounts_date": 'dd.latest_accounts_date',
    "alternative_names": 'dd.alternative_names',
}


def _page_soup(driver):
    """Parse the page currently loaded in *driver* with BeautifulSoup/lxml."""
    return BeautifulSoup(driver.page_source.encode('utf-8'), 'lxml')


def _first_text(node, selector):
    """Return the text of the first element under *node* matching *selector*, or ""."""
    matches = node.select(selector)
    return matches[0].get_text() if matches else ""


def _first_href(node, selector):
    """Return the href of the first element under *node* matching *selector*, or ""."""
    matches = node.select(selector)
    return (matches[0].get('href') or "") if matches else ""


def _parse_incorporation_date(company_dt, company_soup):
    """Return the founding date, falling back to the "Incorporated" timeline event."""
    founding = company_dt.select('span[itemprop=foundingDate]')
    if founding:
        return founding[0].get_text()
    incorporation_date = ""
    for event in company_soup.select('div#events div.oc-events-timeline dl'):
        event_text = event.get_text()
        if "Incorporated" in event_text:
            # The date is the text that precedes the word "Incorporated".
            incorporation_date = event_text.split('Incorporated')[0].strip()
    return incorporation_date


def _parse_officers(company_dt):
    """Return {officer name: {position, href, date}} for the listed officers."""
    officers = {}
    officer_cells = company_dt.select('dd.officers')
    if not officer_cells:
        return officers
    for item in officer_cells[0].select('ul.attribute_list li.attribute_item'):
        links = item.select('a.officer')
        if not links:
            continue
        name = links[0].get_text()
        href = base_url + links[0].get('href')
        position = ""
        date = ""
        parts = item.get_text().split(',')
        # The item text is split on commas as "name, position, date"; a comma
        # inside the name would shift the fields, so such items keep them empty.
        if ',' not in name and len(parts) > 2:
            position = parts[1]
            date = parts[2]
        officers[name] = {"directors_officers_position": position,
                          "directors_officers_href": href,
                          "directors_officers_date": date}
    print(officers)
    return officers


def _parse_registered_address(company_dt):
    """Return the registered address lines joined by single spaces, or ""."""
    address_cells = company_dt.select('dd.registered_address')
    if not address_cells:
        return ""
    lines = address_cells[0].select('ul.address_lines li.address_line')
    # get_text drops the list markup and any line-break tags inside a line.
    return ' '.join(line.get_text(' ', strip=True) for line in lines)


def _parse_company(company_soup):
    """Extract the stored fields from a company page.

    Returns a dict keyed by db_insert/db_update parameter names (all values are
    strings), or None when the page has no attribute list.
    """
    attribute_lists = company_soup.select('div#attributes dl.attributes')
    if not attribute_lists:
        return None
    company_dt = attribute_lists[0]

    fields = {name: _first_text(company_dt, selector)
              for name, selector in _TEXT_FIELD_SELECTORS.items()}
    fields["incorporation_date"] = _parse_incorporation_date(company_dt, company_soup)
    fields["registry_page"] = _first_href(company_dt, 'dd.registry_page a.url')
    fields["registered_address"] = _parse_registered_address(company_dt)
    fields["directors_officers"] = dic_change_str(_parse_officers(company_dt))
    fields["source_mechanism"] = _first_text(company_soup, 'div#source span.publisher')
    fields["source_page"] = _first_href(company_soup, 'div#source a.url')
    fields["source_time"] = _first_text(company_soup, 'div#source span.retrieved')
    return fields


def _crawl_company(driver, link, jurisdiction, page_number):
    """Open one search-result link, parse the company page and insert/update its row."""
    company_url = base_url + link.get('href')
    company_name = link.get_text()

    driver.get(company_url)
    fields = _parse_company(_page_soup(driver))
    if fields is None:
        # Nothing was parsed, so there is nothing to store for this company.
        print("没有抓取到该公司数据,url=" + company_url)
        return

    company_number = fields["company_number"]
    if not company_number:
        print(company_url + "中公司编号不存在")
    print(company_name, company_number, fields["status"], fields["incorporation_date"],
          fields["company_type"], jurisdiction)

    # Values are escaped with format_str because the db helpers build SQL by
    # string formatting. Arguments are passed by keyword so that each value
    # reaches the parameter of the same name in both db_insert and db_update.
    record = {name: format_str(value) for name, value in fields.items()}
    record.update(
        company_name=format_str(company_name),
        jurisdiction=format_str(jurisdiction),
        company_url=format_str(company_url),
        update_time=datetime.datetime.now(),
        page_number=page_number,
    )

    # Update the row when the company is already stored, otherwise insert it.
    if query_company_is_exist(company_number, jurisdiction):
        print("数据库中存在,更新" + company_number)
        db_update(**record)
    else:
        print("数据库中不存在,插入" + company_number)
        db_insert(**record)


def _crawl_result_page(driver, country, page_number):
    """Crawl every company listed on one search-result page of *country*."""
    data_page = company_bsae_url.format(country, page_number)
    print(data_page)
    jurisdiction = country_dic[country]

    driver.get(data_page)
    company_list = _page_soup(driver).select('li.search-result a.company_search_result')
    print(company_list)
    if not company_list:
        print("访问公司页码出错,出错url" + data_page + " 所属国家:" + country_dic[country])
        return
    for link in company_list:
        _crawl_company(driver, link, jurisdiction, page_number)


def main():
    """Log in through Chrome and crawl company data, resuming from the saved breakpoint.

    The breakpoint (jurisdiction code and page number) is read once at start-up
    and rewritten before every result page. Crawling resumes at the breakpoint's
    jurisdiction and page; every following jurisdiction starts at page 1. If the
    saved jurisdiction is not in country_list the crawl starts from the beginning.

    Any exception raised while crawling is printed, the browser is closed and,
    after a pause, run.py is launched again so the crawl resumes from the
    breakpoint.
    """
    chrome_profile = webdriver.ChromeOptions()

    # NOTE(review): executable_path / chrome_options are the older keyword names of
    # webdriver.Chrome; confirm the installed Selenium version still accepts them.
    driver = webdriver.Chrome(executable_path=r"D:\chromedriver\chromedriver.exe", chrome_options=chrome_profile)
    wait = WebDriverWait(driver, 60)
    driver.get(login_page)

    txtemail = wait.until(EC.presence_of_element_located((By.ID, 'user_email')))
    txtpassword = wait.until(EC.presence_of_element_located((By.ID, 'user_password')))
    btnsearch = wait.until(EC.presence_of_element_located((By.NAME, 'submit')))

    time.sleep(3)
    txtemail.clear()
    txtemail.send_keys(login_email)
    txtpassword.clear()
    txtpassword.send_keys(login_password)
    btnsearch.click()

    time.sleep(3)

    country, page_number = find_breakpoint()
    if country in country_list:
        new_country_list = country_list[country_list.index(country):]
    else:
        new_country_list = country_list
        page_number = 1

    try:
        for country in new_country_list:
            for page_number1 in range(page_number, max_page + 1):
                # Record progress before fetching so a restart repeats this page.
                print(page_number1)
                save_breakpoint(country, page_number1)
                _crawl_result_page(driver, country, page_number1)
                # Pause between result pages.
                time.sleep(5)
            # Only the resumed jurisdiction starts mid-way.
            page_number = 1
    except Exception as e:
        # Top-level boundary: report the error, then restart from the breakpoint.
        print(e)
        driver.quit()
        time.sleep(20)
        os.system('python run.py')
    else:
        driver.quit()

    3.tools.py

    通用函数文件,工具类

    """
    功能函数
    """
    from db import select_latest, select_all_by_company_number_jurisdiction
    from setting import SENDEMAIL, EMAILPSD, TOEMAIL
    import smtplib
    from email.mime.text import MIMEText
    
    import json
    
    
    # Escape characters that are special inside a quoted SQL string literal.
    def format_str(str1):
        """Return *str1* with backslashes and quotes backslash-escaped.

        Backslashes are escaped first so that the backslashes added for the
        quotes are not escaped again, and so that a backslash already present
        in the value (for example a trailing one, or a JSON escape sequence)
        is stored literally instead of being read as an escape character.

        :param str1: text to escape
        :return: the escaped text
        """
        escaped = str1.replace("\\", "\\\\")
        escaped = escaped.replace("'", "\\'")
        return escaped.replace('"', '\\"')
    
    
    # Look up the page number and jurisdiction of the most recently stored row.
    def query_page_and_country():
        """Return ``(page_number, jurisdiction)`` taken from the latest row.

        The values are read by position (indexes 20 and 6) from the row returned
        by select_latest(); presumably these are the page_number and jurisdiction
        columns of the company table - verify against the table's column order.
        """
        latest_row = select_latest()
        page_number, jurisdiction = latest_row[20], latest_row[6]
        print(page_number, jurisdiction)
        return page_number, jurisdiction
    
    
    # Send a notification mail.
    def send_email(title, articletext):
        """Send a plain-text mail from SENDEMAIL to TOEMAIL over SMTP-over-SSL.

        Failures (connection, login or delivery) are printed, not raised.

        :param title: subject of the mail
        :param articletext: body of the mail
        """
        msg_from = SENDEMAIL  # sender mailbox
        passwd = EMAILPSD  # sender password
        msg_to = TOEMAIL  # recipient mailbox

        msg = MIMEText(articletext)
        msg['Subject'] = title
        msg['From'] = msg_from
        msg['To'] = msg_to
        try:
            # The connection is opened inside the with-statement so that it is
            # only closed when it was actually established.
            with smtplib.SMTP_SSL("smtp.163.com", 465) as s:  # mail server and port
                s.login(msg_from, passwd)
                s.sendmail(msg_from, msg_to, msg.as_string())
                print("邮件发送成功")
        except Exception as e:
            print(e)
    
    
    # Serialize a dictionary to text using JSON.
    def dic_change_str(dic1):
        """Return *dic1* encoded as a JSON string."""
        return json.dumps(dic1)
    
    
    # Check whether a company is already stored in the database.
    def query_company_is_exist(company_number, jurisdiction):
        """Return True when a row matches the company number and jurisdiction, else False."""
        return bool(select_all_by_company_number_jurisdiction(company_number, jurisdiction))
    
    
    # if query_company_is_exist('43364123K', 'Louisiana (US)'):
    #     print(1)
    
    
    # Read the saved breakpoint.
    def find_breakpoint():
        """Return ``(country, page_number)`` read from breakpoint.txt.

        The file holds the two values separated by a comma; the page number is
        converted to int.
        """
        with open('breakpoint.txt', "r") as handle:
            fields = handle.read().split(',')
        return fields[0], int(fields[1])
    
    
    # Persist the breakpoint.
    def save_breakpoint(country, page_number):
        """Overwrite breakpoint.txt with ``country,page_number``."""
        record = ','.join([country, str(page_number)])
        with open('breakpoint.txt', 'w') as handle:
            handle.write(record)
    
    
            # print(find_breakpoint())
            # save_breakpoint('china',11)
    

    4.db.py

    数据库操作文件

    """
    数据库操作函数
    """
    import pymysql
    from setting import DATABASE
    
    
    # Insert one company row.
    def db_insert(company_name, company_number, status, incorporation_date, company_type, jurisdiction, business_number, agent_name, agent_address,
                  alternative_names, registered_address, registry_page, directors_officers, branch, ultimate_beneficial_owners, industry_codes,
                  latest_accounts_date, source_mechanism, source_page, source_time, company_url, update_time, page_number):
        """Insert a row into the ``company`` table and commit.

        Each parameter is written to the column of the same name. The statement
        is built by string formatting, so text values must already be escaped
        by the caller.
        NOTE(review): switching to a parameterized query would be safer, but it
        has to be done together with removing the callers' escaping, otherwise
        the escape characters would be stored literally.

        The connection is closed even when the statement fails.
        """
        conn = pymysql.connect(host=DATABASE['host'],
                               port=DATABASE['port'],
                               db=DATABASE['db'],
                               user=DATABASE['user'],
                               passwd=DATABASE['passwd'],
                               charset=DATABASE['charset'], )
        try:
            cur = conn.cursor()
            sql = "INSERT INTO company(company_name,company_number,status,incorporation_date,company_type,jurisdiction,business_number,agent_name,agent_address,alternative_names,registered_address,registry_page,directors_officers,branch,ultimate_beneficial_owners,industry_codes,latest_accounts_date,source_mechanism,source_page,source_time,company_url,update_time,page_number) VALUES ('{}', '{}','{}', '{}', '{}', '{}','{}','{}', '{}', '{}', '{}','{}','{}', '{}', '{}', '{}','{}','{}', '{}', '{}', '{}','{}','{}')".format(
                company_name, company_number, status, incorporation_date, company_type, jurisdiction, business_number, agent_name, agent_address,
                alternative_names, registered_address, registry_page, directors_officers, branch, ultimate_beneficial_owners, industry_codes,
                latest_accounts_date, source_mechanism, source_page, source_time, company_url, update_time, page_number)
            cur.execute(sql.encode('utf-8'))
            conn.commit()
        finally:
            conn.close()
    
    
    # Update one company row.
    def db_update(company_name, company_number, status, incorporation_date, company_type, jurisdiction, business_number, agent_name, agent_address,
                  alternative_names, registered_address, registry_page, directors_officers, branch, ultimate_beneficial_owners, industry_codes,
                  latest_accounts_date, source_mechanism, source_page, source_time, company_url, update_time, page_number):
        """Update the ``company`` row(s) matching company_number and jurisdiction, then commit.

        Every other parameter is written to the column of the same name. The
        statement is built by string formatting, so text values must already be
        escaped by the caller.
        NOTE(review): switching to a parameterized query would be safer, but it
        has to be done together with removing the callers' escaping, otherwise
        the escape characters would be stored literally.

        The connection is closed even when the statement fails.
        """
        conn = pymysql.connect(host=DATABASE['host'],
                               port=DATABASE['port'],
                               db=DATABASE['db'],
                               user=DATABASE['user'],
                               passwd=DATABASE['passwd'],
                               charset=DATABASE['charset'], )
        try:
            cur = conn.cursor()
            sql = "UPDATE company SET company_name='{}',status='{}',incorporation_date='{}',company_type='{}',business_number='{}',agent_name='{}',agent_address='{}',alternative_names='{}',registered_address='{}',registry_page='{}',directors_officers='{}',branch='{}',ultimate_beneficial_owners='{}',industry_codes='{}',latest_accounts_date='{}',source_mechanism='{}',source_page='{}',source_time='{}',company_url='{}',update_time='{}',page_number='{}' WHERE company_number='{}' AND jurisdiction='{}'".format(
                company_name, status, incorporation_date, company_type, business_number, agent_name, agent_address,
                alternative_names, registered_address, registry_page, directors_officers, branch, ultimate_beneficial_owners, industry_codes,
                latest_accounts_date, source_mechanism, source_page, source_time, company_url, update_time, page_number, company_number, jurisdiction)
            cur.execute(sql.encode('utf-8'))
            conn.commit()
        finally:
            conn.close()
    
    
    # Fetch the newest row.
    def select_latest():
        """Return the ``company`` row with the highest id, or None when the table is empty.

        The row is printed before it is returned. The connection is closed even
        when the query fails.
        """
        conn = pymysql.connect(host=DATABASE['host'],
                               port=DATABASE['port'],
                               db=DATABASE['db'],
                               user=DATABASE['user'],
                               passwd=DATABASE['passwd'],
                               charset=DATABASE['charset'], )
        try:
            cur = conn.cursor()
            sql = "SELECT * FROM company ORDER BY id DESC LIMIT 1"
            cur.execute(sql.encode('utf-8'))
            ret = cur.fetchone()
            cur.close()
        finally:
            conn.close()
        print(ret)
        return ret
    
    
    # List the company numbers stored for one result page.
    def select_all_by_page_number(page_number):
        """Return the company_number of every row whose page_number equals *page_number*.

        The value is passed as a query parameter instead of being formatted into
        the SQL text, and the cursor and connection are closed when done.

        :param page_number: result-page number to look up
        :return: list of company numbers (empty when nothing matches)
        """
        conn = pymysql.connect(host=DATABASE['host'],
                               port=DATABASE['port'],
                               db=DATABASE['db'],
                               user=DATABASE['user'],
                               passwd=DATABASE['passwd'],
                               charset=DATABASE['charset'], )
        try:
            cur = conn.cursor()
            sql = "SELECT company_number FROM company WHERE page_number=%s"
            print(sql % (page_number,))
            cur.execute(sql, (page_number,))
            results = cur.fetchall()
            cur.close()
        finally:
            conn.close()
        return [row[0] for row in results]
    
    
    # Look up a company by company number and jurisdiction.
    def select_all_by_company_number_jurisdiction(company_number, jurisdiction):
        """Return the first ``company`` row matching both values, or None.

        Both values are passed as query parameters, so quotes or backslashes in
        them cannot break or alter the statement. The connection is closed even
        when the query fails.

        :param company_number: company number to match
        :param jurisdiction: jurisdiction name to match
        """
        conn = pymysql.connect(host=DATABASE['host'],
                               port=DATABASE['port'],
                               db=DATABASE['db'],
                               user=DATABASE['user'],
                               passwd=DATABASE['passwd'],
                               charset=DATABASE['charset'], )
        try:
            cur = conn.cursor()
            sql = "SELECT * FROM company WHERE company_number=%s AND jurisdiction=%s"
            cur.execute(sql, (company_number, jurisdiction))
            ret = cur.fetchone()
            cur.close()
        finally:
            conn.close()
        return ret
    
    

    5.breakpoint.txt

    断点记录文件,程序重启后继续断点运行

    ca,21

    6.run.py

    启动文件

    # Launcher script: runs the crawler's main() when executed directly.
    # allcountry.main() starts this script again ('python run.py') after a failure.
    from allcountry import main
    
    if __name__ == '__main__':
        main()

     

    你可能感兴趣的:(Python爬虫)