对爬取的常量进行设置
"""
基本信息设置
"""
base_url = "https://com"
login_page = "https://com/users/sign_in"
company_bsae_url = "https://com/companies/{}?branch=&commit=Go&inactive=false&mode=best_fields&nonprofit=&order=score&page={}&q=&search_fields%5B%5D=name&search_fields%5B%5D=previous_names&search_fields%5B%5D=company_number&utf8=%E2%9C%93"
max_page = 30
DATABASE = {
"host": '127.0.0.1', # 数据库地址
"port": 3306, # 数据库端口
"db": 'opencorporates', # 数据库名
"user": 'root', # 数据库用户名
"passwd": 'root', # 数据库密码
"charset": 'utf8', # 编码方式
}
# 登陆信息
login_email = "[email protected]"
login_password = "15184378218lk"
# 发送方邮箱
SENDEMAIL = 'com'
# 邮箱地址
EMAILPSD = ''
# 收件人邮箱
TOEMAIL = 'com'
# Jurisdiction abbreviations — 131 countries/regions to crawl
country_list = ['us_la', 'us_sd', 'be', 'us_ok', 'je', 'ca_nl', 'us_nm', 'ca_qc', 'bg', 'us_fl', 'rw', 'dk', 'us_ut', 'us_sc', 'do', 'mt', 'us_mo',
                'us_ks', 'us_ms', 'si', 'is', 'th', 'bm', 'aw', 'us_nd', 'me', 'us_or', 'us_az', 'ro', 'ae_az', 'gi', 'mx', 'bo', 'us_ri', 'fr',
                'us_wv', 'au', 'ch', 'jm', 'mm', 'vu', 'bz', 'us_wy', 'us_nv', 'us_co', 'jp', 'us_tx', 'us_ne', 'ca_ns', 'us_ak', 'bs', 'al', 'gb',
                'sk', 'za', 'us_ga', 'ua', 'gl', 'gg', 'mu', 'us_de', 'us_tn', 'nl', 'ir', 'us_me', 'bh', 'de', 'ca', 'us_nc', 'pa', 'li', 'pr',
                'us_id', 'no', 'us_nh', 'us_ar', 'il', 'us_ca', 'hr', 'se', 'us_al', 'us_mi', 'us_ny', 'us_mt', 'us_oh', 'es', 'us_ct', 'cw', 'us_ia',
                'br', 'ca_pe', 'us_va', 'im', 'bb', 'us_ky', 'bd', 'us_vt', 'us_mn', 'to', 'md', 'tj', 'my', 'nz', 'vn', 'us_pa', 'hk', 'kh', 'sg',
                'lv', 'fi', 'us_md', 'gr', 'us_in', 'us_wi', 'cy', 'tn', 'tz', 'pk', 'in', 'pl', 'ca_nb', 'ae_du', 'us_ma', 'ug', 'us_dc', 'us_hi',
                'ie', 'lu', 'us_wa', 'by', 'us_nj']
# Maps each abbreviation in country_list to its human-readable jurisdiction name
country_dic = {'gl': 'Greenland', 'us_in': 'Indiana (US)', 'ie': 'Ireland', 'us_wi': 'Wisconsin (US)', 'jm': 'Jamaica', 'bg': 'Bulgaria',
               'tj': 'Tajikistan',
               'us_ok': 'Oklahoma (US)', 'im': 'Isle of Man', 'ug': 'Uganda', 'ro': 'Romania', 'ca_nb': 'New Brunswick (Canada)',
               'us_wv': 'West Virginia (US)',
               'in': 'India', 'mx': 'Mexico', 'pl': 'Poland', 'us_wy': 'Wyoming (US)', 'us_mo': 'Missouri (US)', 'us_pa': 'Pennsylvania (US)',
               'bb': 'Barbados',
               'us_az': 'Arizona (US)', 'us_dc': 'District of Columbia (US)', 'nl': 'Netherlands', 'gr': 'Greece', 'us_nm': 'New Mexico (US)',
               'aw': 'Aruba',
               'ir': 'Iran', 'fr': 'France', 'us_nc': 'North Carolina (US)', 'us_vt': 'Vermont (US)', 'us_ut': 'Utah (US)',
               'us_nh': 'New Hampshire (US)',
               'by': 'Belarus', 'ch': 'Switzerland', 'us_al': 'Alabama (US)', 'us_ar': 'Arkansas (US)', 'us_va': 'Virginia (US)', 'es': 'Spain',
               'mm': 'Myanmar', 'hk': 'Hong Kong', 'us_md': 'Maryland (US)', 'be': 'Belgium', 'lv': 'Latvia', 'ca_qc': 'Quebec (Canada)',
               'bz': 'Belize',
               'us_ny': 'New York (US)', 'us_sd': 'South Dakota (US)', 'us_mt': 'Montana (US)', 'us_oh': 'Ohio (US)', 'us_ma': 'Massachusetts (US)',
               'nz': 'New Zealand', 'br': 'Brazil', 'kh': 'Cambodia', 'pk': 'Pakistan', 'us_mn': 'Minnesota (US)', 'us_ct': 'Connecticut (US)',
               'us_wa': 'Washington (US)', 'us_me': 'Maine (US)', 'fi': 'Finland', 'gg': 'Guernsey', 'gi': 'Gibraltar', 'bd': 'Bangladesh',
               'us_ga': 'Georgia (US)', 'th': 'Thailand', 'do': 'Dominican Republic', 'li': 'Liechtenstein', 'us_ky': 'Kentucky (US)',
               'my': 'Malaysia',
               'je': 'Jersey', 'us_sc': 'South Carolina (US)', 'us_id': 'Idaho (US)', 'cw': 'Curaçao', 'se': 'Sweden', 'pa': 'Panama',
               'ca_nl': 'Newfoundland and Labrador (Ca...', 'de': 'Germany', 'vn': 'Viet Nam', 'us_nv': 'Nevada (US)', 'us_ne': 'Nebraska (US)',
               'ua': 'Ukraine', 'us_fl': 'Florida (US)', 'ca_pe': 'Prince Edward Island (Canada)', 'mt': 'Malta', 'us_hi': 'Hawaii (US)',
               'vu': 'Vanuatu',
               'si': 'Slovenia', 'rw': 'Rwanda', 'sg': 'Singapore', 'us_ks': 'Kansas (US)', 'me': 'Montenegro', 'is': 'Iceland', 'bh': 'Bahrain',
               'au': 'Australia', 'us_ms': 'Mississippi (US)', 'us_ak': 'Alaska (US)', 'us_co': 'Colorado (US)', 'pr': 'Puerto Rico', 'md': 'Moldova',
               'us_mi': 'Michigan (US)', 'to': 'Tonga', 'bo': 'Bolivia', 'za': 'South Africa', 'ca_ns': 'Nova Scotia (Canada)', 'il': 'Israel',
               'us_or': 'Oregon (US)', 'tz': 'Tanzania', 'us_tn': 'Tennessee (US)', 'ae_az': 'Abu Dhabi (UAE)', 'lu': 'Luxembourg', 'tn': 'Tunisia',
               'us_ca': 'California (US)', 'us_la': 'Louisiana (US)', 'al': 'Albania', 'us_ia': 'Iowa (US)', 'gb': 'United Kingdom',
               'mu': 'Mauritius',
               'hr': 'Croatia', 'dk': 'Denmark', 'sk': 'Slovakia', 'cy': 'Cyprus', 'ae_du': 'Dubai (UAE)', 'us_tx': 'Texas (US)',
               'us_de': 'Delaware (US)',
               'us_nd': 'North Dakota (US)', 'no': 'Norway', 'bm': 'Bermuda', 'jp': 'Japan', 'us_ri': 'Rhode Island (US)', 'bs': 'Bahamas',
               'ca': 'Canada',
               'us_nj': 'New Jersey (US)'}
主要爬虫文件,利用浏览器驱动模拟登录网站,然后对网站信息进行爬取,如果爬取出错,继续下一个页面的爬取
# -*- coding: utf-8 -*-
import time
import datetime
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
from setting import *
from db import db_insert, db_update
from tools import format_str, send_email, dic_change_str, query_company_is_exist, find_breakpoint, save_breakpoint
def _login(driver, wait):
    """Open the sign-in page and submit the configured credentials."""
    driver.get(login_page)
    txtemail = wait.until(
        EC.presence_of_element_located((By.ID, 'user_email'))
    )
    txtpassword = wait.until(
        EC.presence_of_element_located((By.ID, 'user_password'))
    )
    btnsearch = wait.until(
        EC.presence_of_element_located((By.NAME, 'submit'))
    )
    time.sleep(3)
    txtemail.clear()
    txtemail.send_keys(login_email)
    txtpassword.clear()
    txtpassword.send_keys(login_password)
    btnsearch.click()
    time.sleep(3)


def _dd_text(company_dt, selector):
    """Return the text of the first element matching *selector*, or ''."""
    found = company_dt.select(selector)
    return found[0].get_text() if found else ""


def _parse_incorporation_date(company_dt, bs_company_html):
    """Extract the incorporation date, falling back to the events timeline."""
    tagged = company_dt.select('span[itemprop=foundingDate]')
    if tagged:
        return tagged[0].get_text()
    incorporation_date = ""
    # Fallback: timeline entries read "<date> Incorporated ..."; keep the
    # last match, as the original loop did.
    for dl in bs_company_html.select('div#events div.oc-events-timeline dl'):
        text = dl.get_text()
        if "Incorporated" in text:
            incorporation_date = text.split('Incorporated')[0].strip()
    return incorporation_date


def _parse_officers(company_dt):
    """Parse directors/officers into {name: {position, href, date}}."""
    holders = company_dt.select('dd.officers')
    directors_officers_dic = {}
    if not holders:
        return directors_officers_dic
    for item in holders[0].select('ul.attribute_list li.attribute_item'):
        link = item.select('a.officer')
        if not link:
            continue  # entries without an officer link carry no usable data
        officer_href = base_url + link[0].get('href')
        officer_name = link[0].get_text()
        position = ""
        date = ""
        parts = item.get_text().split(',')
        # A comma inside the name would desynchronize the split fields,
        # so only trust them for comma-free names.
        if ',' not in officer_name and len(parts) > 2:
            position = parts[1]
            date = parts[2]
        directors_officers_dic[officer_name] = {
            "directors_officers_position": position,
            "directors_officers_href": officer_href,
            "directors_officers_date": date,
        }
    print(directors_officers_dic)
    return directors_officers_dic


def _parse_registered_address(company_dt):
    """Concatenate the registered-address lines into one string."""
    holders = company_dt.select('dd.registered_address')
    registered_address = ""
    if holders:
        for line in holders[0].select('ul.address_lines li.address_line'):
            # BUGFIX: the original replace() chain contained a corrupted
            # string literal (a raw newline inside quotes, a syntax error);
            # normalize line breaks and doubled spaces to single spaces.
            registered_address += str(line).replace('\n', ' ').replace('\r', ' ').replace('  ', ' ')
    return registered_address


def _parse_source(bs_company_html):
    """Return (publisher, source URL, retrieval date) from the #source div."""
    publisher = bs_company_html.select('div#source span.publisher')
    source_mechanism = publisher[0].get_text() if publisher else ""
    url_links = bs_company_html.select('div#source a.url')
    source_page = url_links[0].get('href') if url_links else ""
    retrieved = bs_company_html.select('div#source span.retrieved')
    source_time = retrieved[0].get_text() if retrieved else ""
    return source_mechanism, source_page, source_time


def _crawl_company(driver, company_url, company_name, jurisdiction, page_number1):
    """Fetch one company page, parse its attributes, and insert/update the DB row."""
    driver.get(company_url)
    bs_company_html = BeautifulSoup(driver.page_source.encode('utf-8'), 'lxml')
    company_dt_list = bs_company_html.select('div#attributes dl.attributes')
    if not company_dt_list:
        # BUGFIX: the original fell through to the DB write with undefined
        # field variables when the attributes block was missing.
        print("没有抓取到该公司数据,url=" + company_url)
        return
    company_dt = company_dt_list[0]
    company_number = _dd_text(company_dt, 'dd.company_number')
    if not company_number:
        print(company_url + "中公司编号不存在")
    status = _dd_text(company_dt, 'dd.status')
    incorporation_date = _parse_incorporation_date(company_dt, bs_company_html)
    company_type = _dd_text(company_dt, 'dd.company_type')
    business_number = _dd_text(company_dt, 'dd.business_number ')
    agent_name = _dd_text(company_dt, 'dd.agent_name')
    agent_address = _dd_text(company_dt, 'dd.agent_address')
    directors_officers_dic = _parse_officers(company_dt)
    registry_links = company_dt.select('dd.registry_page a.url')
    registry_page = registry_links[0].get('href') if registry_links else ""
    branch = _dd_text(company_dt, 'dd.branch')
    industry_codes = _dd_text(company_dt, 'dd.industry_codes')
    ultimate_beneficial_owners = _dd_text(company_dt, 'dd.ultimate_beneficial_owners')
    latest_accounts_date = _dd_text(company_dt, 'dd.latest_accounts_date')
    registered_address = _parse_registered_address(company_dt)
    alternative_names = _dd_text(company_dt, 'dd.alternative_names')
    print(company_name, company_number, status, incorporation_date, company_type, jurisdiction)
    source_mechanism, source_page, source_time = _parse_source(bs_company_html)
    update_time = datetime.datetime.now()
    # BUGFIX: the original called db_update positionally in an order that
    # did not match its signature, scrambling every column; keyword
    # arguments make both calls unambiguous.
    if query_company_is_exist(company_number, jurisdiction):
        print("数据库中存在,更新" + company_number)
        db_update(company_name=format_str(company_name),
                  company_number=format_str(company_number),
                  status=format_str(status),
                  incorporation_date=format_str(incorporation_date),
                  company_type=format_str(company_type),
                  jurisdiction=format_str(jurisdiction),
                  business_number=format_str(business_number),
                  agent_name=format_str(agent_name),
                  agent_address=format_str(agent_address),
                  alternative_names=format_str(alternative_names),
                  registered_address=format_str(registered_address),
                  registry_page=format_str(registry_page),
                  directors_officers=format_str(dic_change_str(directors_officers_dic)),
                  branch=format_str(branch),
                  ultimate_beneficial_owners=format_str(ultimate_beneficial_owners),
                  industry_codes=format_str(industry_codes),
                  latest_accounts_date=format_str(latest_accounts_date),
                  source_mechanism=format_str(source_mechanism),
                  source_page=format_str(source_page),
                  source_time=format_str(source_time),
                  company_url=format_str(company_url),
                  update_time=update_time,
                  page_number=page_number1)
    else:
        print("数据库中不存在,插入" + company_number)
        db_insert(company_name=format_str(company_name),
                  company_number=format_str(company_number),
                  status=format_str(status),
                  incorporation_date=format_str(incorporation_date),
                  company_type=format_str(company_type),
                  jurisdiction=format_str(jurisdiction),
                  business_number=format_str(business_number),
                  agent_name=format_str(agent_name),
                  agent_address=format_str(agent_address),
                  alternative_names=format_str(alternative_names),
                  registered_address=format_str(registered_address),
                  registry_page=format_str(registry_page),
                  directors_officers=format_str(dic_change_str(directors_officers_dic)),
                  branch=format_str(branch),
                  ultimate_beneficial_owners=format_str(ultimate_beneficial_owners),
                  industry_codes=format_str(industry_codes),
                  latest_accounts_date=format_str(latest_accounts_date),
                  source_mechanism=format_str(source_mechanism),
                  source_page=format_str(source_page),
                  source_time=format_str(source_time),
                  company_url=format_str(company_url),
                  update_time=update_time,
                  page_number=page_number1)


def main():
    """Log in through a Selenium-driven Chrome, then crawl company listings
    country by country, resuming from the breakpoint saved in breakpoint.txt.

    On any exception the driver is closed and the script relaunches run.py
    after 20 s, resuming from the last saved breakpoint.
    """
    chrome_profile = webdriver.ChromeOptions()
    # chrome_profile.add_argument("--disable-extensions")
    # chrome_profile.add_argument('--headless')
    driver = webdriver.Chrome(executable_path=r"D:\chromedriver\chromedriver.exe", chrome_options=chrome_profile)
    wait = WebDriverWait(driver, 60)
    _login(driver, wait)
    country, page_number = find_breakpoint()
    # BUGFIX: the original iterated over an undefined name `new_country_list`.
    # Resume from the saved country onward; start from the beginning if the
    # saved code is unknown.
    start = country_list.index(country) if country in country_list else 0
    try:
        for country in country_list[start:]:
            # Resume from the saved page for the first country only.
            for page_number1 in range(page_number, max_page + 1):
                print(page_number1)
                save_breakpoint(country, page_number1)  # persist progress before fetching
                data_page = company_bsae_url.format(country, page_number1)
                print(data_page)
                jurisdiction = country_dic[country]
                driver.get(data_page)
                html_page = driver.page_source.encode('utf-8')
                html_page_soup = BeautifulSoup(html_page, 'lxml')
                search_results = html_page_soup.select('li.search-result a.company_search_result')
                print(search_results)
                if search_results:
                    for link in search_results:
                        _crawl_company(driver, base_url + link.get('href'), link.get_text(),
                                       jurisdiction, page_number1)
                else:
                    print("访问公司页码出错,出错url" + data_page + " 所属国家:" + country_dic[country])
                time.sleep(5)  # be polite between page fetches
            page_number = 1  # every subsequent country starts at page 1
    except Exception as e:
        # Any failure: report, shut the browser down, and restart the crawler
        # (it will pick up at the breakpoint saved above).
        print(e)
        # send_email("爬取出现异常", "异常")  # optional notification hook
        driver.quit()
        time.sleep(20)
        os.system('python run.py')
通用函数文件,工具类
"""
功能函数
"""
from db import select_latest, select_all_by_company_number_jurisdiction
from setting import SENDEMAIL, EMAILPSD, TOEMAIL
import smtplib
from email.mime.text import MIMEText
import json
# 转换特殊字符
def format_str(str1):
    """Backslash-escape single and double quotes so the value can be
    embedded inside a quoted SQL string literal."""
    return str1.replace("'", "\\'").replace('"', '\\"')
# 查询最近爬取数据的页码和国家
def query_page_and_country():
    """Return (page_number, jurisdiction) of the most recently stored row."""
    latest_row = select_latest()
    page_number, jurisdiction = latest_row[20], latest_row[6]
    print(page_number, jurisdiction)
    return page_number, jurisdiction
# 发邮件
def send_email(title, articletext):
    """Send a plain-text notification e-mail via the 163.com SMTP server.

    title -- message subject
    articletext -- message body
    Best-effort: failures are printed, never raised.
    """
    msg = MIMEText(articletext)
    msg['Subject'] = title
    msg['From'] = SENDEMAIL
    msg['To'] = TOEMAIL
    s = None
    try:
        s = smtplib.SMTP_SSL("smtp.163.com", 465)  # mail server and SSL port
        s.login(SENDEMAIL, EMAILPSD)
        s.sendmail(SENDEMAIL, TOEMAIL, msg.as_string())
        print("邮件发送成功")
    except Exception as e:
        print(e)
    finally:
        # BUGFIX: the original referenced `s` unconditionally in finally,
        # raising NameError when SMTP_SSL() itself failed to connect.
        if s is not None:
            s.quit()
# 将字典转换为字符串 利用json
def dic_change_str(dic1):
    """Serialize a dict to its JSON string representation."""
    return json.dumps(dic1)
# 查询数据库中是否存在该公司
def query_company_is_exist(company_number, jurisdiction):
    """Return True when a row for (company_number, jurisdiction) already exists."""
    return bool(select_all_by_company_number_jurisdiction(company_number, jurisdiction))
# if query_company_is_exist('43364123K', 'Louisiana (US)'):
# print(1)
# 找到断点,返回国家和页码
def find_breakpoint():
    """Read breakpoint.txt ('country,page') and return (country, page_number)."""
    with open('breakpoint.txt', "r") as f:
        fields = f.read().split(',')
    return fields[0], int(fields[1])
# 存断点,传入国家和页码
def save_breakpoint(country, page_number):
    """Persist the crawl position to breakpoint.txt as 'country,page'."""
    record = "{},{}".format(country, page_number)
    with open('breakpoint.txt', 'w') as f:
        f.write(record)
# print(find_breakpoint())
# save_breakpoint('china',11)
数据库操作文件
"""
数据库操作函数
"""
import pymysql
from setting import DATABASE
# 插入数据
def db_insert(company_name, company_number, status, incorporation_date, company_type, jurisdiction, business_number, agent_name, agent_address,
              alternative_names, registered_address, registry_page, directors_officers, branch, ultimate_beneficial_owners, industry_codes,
              latest_accounts_date, source_mechanism, source_page, source_time, company_url, update_time, page_number):
    """Insert one row into the company table.

    NOTE(review): the SQL is built with str.format and relies on callers
    pre-escaping quotes via tools.format_str; parameterized queries
    (cur.execute(sql, args)) would be safer against SQL injection.
    """
    conn = pymysql.connect(host=DATABASE['host'],
                           port=DATABASE['port'],
                           db=DATABASE['db'],
                           user=DATABASE['user'],
                           passwd=DATABASE['passwd'],
                           charset=DATABASE['charset'])
    try:
        with conn.cursor() as cur:
            sql = "INSERT INTO company(company_name,company_number,status,incorporation_date,company_type,jurisdiction,business_number,agent_name,agent_address,alternative_names,registered_address,registry_page,directors_officers,branch,ultimate_beneficial_owners,industry_codes,latest_accounts_date,source_mechanism,source_page,source_time,company_url,update_time,page_number) VALUES ('{}', '{}','{}', '{}', '{}', '{}','{}','{}', '{}', '{}', '{}','{}','{}', '{}', '{}', '{}','{}','{}', '{}', '{}', '{}','{}','{}')".format(
                company_name, company_number, status, incorporation_date, company_type, jurisdiction, business_number, agent_name, agent_address,
                alternative_names, registered_address, registry_page, directors_officers, branch, ultimate_beneficial_owners, industry_codes,
                latest_accounts_date, source_mechanism, source_page, source_time, company_url, update_time, page_number)
            cur.execute(sql.encode('utf-8'))
        conn.commit()
    finally:
        # BUGFIX: always release the connection, even when execute() raises
        # (the original leaked it on error and never closed the cursor).
        conn.close()
# 更新数据
def db_update(company_name, company_number, status, incorporation_date, company_type, jurisdiction, business_number, agent_name, agent_address,
              alternative_names, registered_address, registry_page, directors_officers, branch, ultimate_beneficial_owners, industry_codes,
              latest_accounts_date, source_mechanism, source_page, source_time, company_url, update_time, page_number):
    """Update the company row identified by (company_number, jurisdiction).

    NOTE(review): SQL is format-built and relies on callers pre-escaping
    quotes via tools.format_str; parameterized queries would be safer.
    """
    conn = pymysql.connect(host=DATABASE['host'],
                           port=DATABASE['port'],
                           db=DATABASE['db'],
                           user=DATABASE['user'],
                           passwd=DATABASE['passwd'],
                           charset=DATABASE['charset'])
    try:
        with conn.cursor() as cur:
            sql = "UPDATE company SET company_name='{}',status='{}',incorporation_date='{}',company_type='{}',business_number='{}',agent_name='{}',agent_address='{}',alternative_names='{}',registered_address='{}',registry_page='{}',directors_officers='{}',branch='{}',ultimate_beneficial_owners='{}',industry_codes='{}',latest_accounts_date='{}',source_mechanism='{}',source_page='{}',source_time='{}',company_url='{}',update_time='{}',page_number='{}' WHERE company_number='{}' AND jurisdiction='{}'".format(
                company_name, status, incorporation_date, company_type, business_number, agent_name, agent_address,
                alternative_names, registered_address, registry_page, directors_officers, branch, ultimate_beneficial_owners, industry_codes,
                latest_accounts_date, source_mechanism, source_page, source_time, company_url, update_time, page_number, company_number, jurisdiction)
            cur.execute(sql.encode('utf-8'))
        conn.commit()
    finally:
        # BUGFIX: always release the connection, even when execute() raises.
        conn.close()
# 查询最新一条记录
def select_latest():
    """Return the most recently inserted company row (highest id), or None."""
    conn = pymysql.connect(host=DATABASE['host'],
                           port=DATABASE['port'],
                           db=DATABASE['db'],
                           user=DATABASE['user'],
                           passwd=DATABASE['passwd'],
                           charset=DATABASE['charset'])
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT * FROM company ORDER BY id DESC LIMIT 1")
            ret = cur.fetchone()
    finally:
        # BUGFIX: close the connection even when the query raises.
        conn.close()
    print(ret)
    return ret
# 根据页码查询该页码的公司编号
def select_all_by_page_number(page_number):
    """Return the list of company_number values stored for *page_number*."""
    conn = pymysql.connect(host=DATABASE['host'],
                           port=DATABASE['port'],
                           db=DATABASE['db'],
                           user=DATABASE['user'],
                           passwd=DATABASE['passwd'],
                           charset=DATABASE['charset'])
    try:
        with conn.cursor() as cur:
            # Parameterized query (the original interpolated the value into
            # the SQL string directly).
            sql = "SELECT company_number FROM company WHERE page_number=%s"
            print(sql)
            cur.execute(sql, (page_number,))
            results = cur.fetchall()
    finally:
        # BUGFIX: the original never closed the cursor or the connection.
        conn.close()
    return [row[0] for row in results]
# 通过国家和公司编号查询公司信息
def select_all_by_company_number_jurisdiction(company_number, jurisdiction):
    """Return the company row for (company_number, jurisdiction), or None."""
    conn = pymysql.connect(host=DATABASE['host'],
                           port=DATABASE['port'],
                           db=DATABASE['db'],
                           user=DATABASE['user'],
                           passwd=DATABASE['passwd'],
                           charset=DATABASE['charset'])
    try:
        with conn.cursor() as cur:
            # BUGFIX: parameterized query -- the original interpolated raw
            # (unescaped) values, so a quote in company_number broke the SQL
            # and allowed injection from scraped page content.
            sql = "SELECT * FROM company WHERE company_number=%s AND jurisdiction=%s"
            cur.execute(sql, (company_number, jurisdiction))
            ret = cur.fetchone()
    finally:
        # Close the connection even when the query raises.
        conn.close()
    return ret
断点记录文件,程序重启后继续断点运行
ca,21
run.py
启动文件
from allcountry import main
if __name__ == '__main__':
    # Entry point: launch the crawler's main loop (allcountry.main).
    main()