#!python3
#coding:utf-8
'''
################################
@ MyBlog: blog.csdn.net/hjxzt1
          www.mykurol.com
  github: https://github.com/kurolz
################################
Scrape bond data from the Jisilu website.
The data can currently be saved as either a txt or an xlsx file.
When a rise or drop exceeds the configured limit, an e-mail
notification can be sent.
#### Please fill in the password of the sending mailbox ####
'''

import smtplib
import ssl
import time
from email.header import Header
from email.mime.text import MIMEText

# Page that is scraped.
html = 'https://www.jisilu.cn/data/cbnew/#tlink_3'

# Column whose value is checked against ALERT_LIMIT while scraping.
ALERT_COLUMN = "4"
# Threshold handed to limitsendMail(); 0.05 means 5%.
ALERT_LIMIT = 0.05


def sendMail(id, uplift):
    """Send an e-mail notification for one bond.

    Args:
        id: Bond code as a string (the name shadows the builtin but is kept
            so existing keyword callers keep working).
        uplift: The rise or drop as text, e.g. ``'5.20%'`` or ``'-3.10%'``.

    Returns:
        None. Success or failure is reported with ``print``; SMTP and
        connection errors are caught so a mail problem does not abort the
        scrape.
    """
    # Third-party SMTP service.
    mail_host = "smtp.163.com"        # SMTP server, e.g. smtp.163.com
    mail_user = "[email protected]"    # user name of the sending mailbox
    mail_pass = "******"              # password of the sending mailbox

    sender = '[email protected]'       # sending address
    receivers = '[email protected]'    # receiving address

    # A leading minus sign marks a drop; everything else is reported as a rise.
    if uplift.strip().startswith('-'):
        text = '债券代码:' + id + ', 跌幅为:' + uplift
    else:
        text = '债券代码:' + id + ', 涨幅为:' + uplift

    message = MIMEText(text)
    message['From'] = sender
    message['To'] = receivers
    # The subject repeats the body text.
    message['Subject'] = Header(text)

    try:
        # 25 is the SMTP port. The context manager sends QUIT and closes the
        # socket even when login or sending fails.
        with smtplib.SMTP(mail_host, 25) as smtp_conn:
            smtp_conn.login(mail_user, mail_pass)
            smtp_conn.sendmail(sender, receivers, message.as_string())
        print("邮件发送成功")
    except (smtplib.SMTPException, OSError) as e:
        # OSError covers DNS failures and refused connections, which are not
        # always reported as SMTPException.
        print("Error: 无法发送邮件" + str(e))


def parse_percent(text):
    """Convert percentage text such as ``'5.20%'`` into a fraction (0.052).

    A trailing ``%`` is optional; the number is always interpreted as a
    percentage. Returns None when ``text`` is not a string or does not
    contain a number (for example an empty cell or a ``'-'`` placeholder).
    """
    if not isinstance(text, str):
        return None
    number = text.strip().rstrip('%').strip()
    try:
        return float(number) / 100
    except ValueError:
        return None


def exceeds_limit(limit, data):
    """Return True when the change ``data`` is beyond ``limit``.

    Args:
        limit: Threshold as a fraction. A negative limit watches for drops
            (change below the limit); zero or a positive limit watches for
            rises (change above the limit).
        data: The change as percentage text, e.g. ``'10.00%'``.

    The comparison is numeric. Comparing the formatted strings would order
    ``'10.00%'`` before ``'5%'`` and miss every double-digit move.
    """
    change = parse_percent(data)
    if change is None:
        return False
    if limit < 0:
        return change < limit
    return change > limit


def limitsendMail(limit, i, data):
    """Send a notification when a bond's change exceeds ``limit``.

    Args:
        limit: Threshold as a fraction (see ``exceeds_limit``).
        i: Bond code passed on to ``sendMail``.
        data: The change as percentage text; it is forwarded unchanged to
            ``sendMail`` when the limit is exceeded.
    """
    if exceeds_limit(limit, data):
        sendMail(i, data)


def data_row():
    """Collect the ``id`` attribute of the table rows to scrape.

    Rows of the table ``flex3`` are visited starting at row 2 in steps of 2
    until a lookup fails, which is taken as the end of the table.

    Returns:
        A list of row ids (the bond codes to scrape); an empty list when no
        driver is available.
    """
    if driver is None:
        return []

    jisilu_id = []   # bond codes to scrape
    # Optional tuning: start at 0 for even rows or -1 for odd rows, and use
    # a step of 1 instead of 2 to visit every row.
    row_index = 0
    row_step = 2
    while True:
        row_index += row_step
        try:
            element = driver.find_element_by_xpath(
                '//*[@id="flex3"]/tbody/tr[' + str(row_index) + ']')
            data_id = element.get_attribute("id")
        except Exception:
            # Any lookup failure ends the scan (best effort); unlike a bare
            # except this still lets KeyboardInterrupt through.
            break
        jisilu_id.append(data_id)
    return jisilu_id


def data_colum():
    """Return the column numbers to scrape, as strings.

    Columns 1 to 23 are used, except for the three unrelated columns
    12, 13 and 21.
    """
    skipped = (12, 13, 21)
    return [str(num) for num in range(1, 24) if num not in skipped]


def scrape_bonds():
    """Read every selected row and column of the table into a dict.

    Returns:
        ``{bond code: {title: value, ...}, ...}``. While reading, the value
        in ALERT_COLUMN is checked with ``limitsendMail``.
    """
    table = {}
    bond_ids = data_row()
    if not bond_ids:
        return table

    columns = data_colum()
    # The header row is the same for every bond, so read it once.
    titles = {}
    for col in columns:
        header = driver.find_element_by_xpath(
            '//*[@id="flex3"]/thead/tr[2]/th[' + col + ']').text
        titles[col] = header.replace("\n", "")   # drop line breaks in titles

    for bond_id in bond_ids:
        row = {}
        for col in columns:
            # The id is quoted so XPath compares it as a string, not a number.
            value = driver.find_element_by_xpath(
                '//*[@id="' + bond_id + '"]/td[' + col + ']').text
            row[titles[col]] = value
            if col == ALERT_COLUMN:
                limitsendMail(ALERT_LIMIT, bond_id, value)
        table[bond_id] = row
    return table


# Set up the browser driver. ``driver`` stays None when it cannot be created.
driver = None
try:
    from selenium import webdriver
except ImportError:
    print('No module named selenium. 请安装selenium模块')
else:
    # NOTE(security): this disables HTTPS certificate verification for the
    # whole process; kept because the original script relies on it.
    ssl._create_default_https_context = ssl._create_unverified_context
    try:
        driver = webdriver.PhantomJS()
        driver.get(html)
        # Element lookups may wait up to 3 seconds before failing.
        driver.implicitly_wait(3)
    except Exception:
        print('请安装phantomjs')
        if driver is not None:
            # The browser started but the page could not be loaded; do not
            # leave the process running.
            try:
                driver.quit()
            except Exception:
                pass
            driver = None

# Scraped data, stored as: a = {bond code: {title: value, ...}, ...}
a = {}
if driver is not None:
    try:
        a = scrape_bonds()
    finally:
        # The browser is not needed once the data is in memory.
        driver.quit()


class print_data(object):
    """Write the scraped data ``a`` to a file."""

    def __init__(self, filename):
        """Remember the output file name."""
        self.filename = filename

    def printTxt(self):
        """Append the data to a UTF-8 text file.

        A timestamp banner is written first, followed by one line per bond
        in the form ``title:value,title:value,...``.
        """
        nowtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        pieces = ["\n@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ " + nowtime
                  + " @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n"]
        for row in a.values():
            pieces.extend(title + ":" + value + ","
                          for title, value in row.items())
            pieces.append("\n")
        # Binary append keeps "\n" untranslated on every platform; the file
        # is opened once instead of once per cell.
        with open(self.filename, 'ab') as out:
            out.write("".join(pieces).encode('utf-8'))

    def printXlsx(self):
        """Write the data to an xlsx file using xlsxwriter.

        Row 0 holds the titles of the first bond; each following row holds
        one bond's values in the order they were scraped. When xlsxwriter
        is not installed a message is printed and nothing is written.
        """
        try:
            import xlsxwriter
        except ImportError:
            print('No module named xlsxwriter,输出为xlsx文件需要安装xlsxwriter模块,或重新定义输出为txt文件')
            return

        workbook = xlsxwriter.Workbook(self.filename)   # create the Excel file
        try:
            worksheet = workbook.add_worksheet()        # create a worksheet
            rows = list(a.values())
            if rows:
                # Write the titles.
                for col_num, title in enumerate(rows[0]):
                    worksheet.write(0, col_num, title)
            # Write the values.
            for row_num, row in enumerate(rows, start=1):
                for col_num, value in enumerate(row.values()):
                    worksheet.write(row_num, col_num, value)
        finally:
            # xlsxwriter only writes the file to disk on close().
            workbook.close()


if __name__ == "__main__":
    printfilename = '07150240-2.xlsx'
    writer = print_data(printfilename)
    # Text after the last dot; a name without a dot falls through to the
    # error message instead of raising IndexError.
    extension = printfilename.rsplit('.', 1)[-1]
    if extension == 'xlsx':
        writer.printXlsx()
    elif extension == 'txt':
        writer.printTxt()
    else:
        print('输出文件名定义错误,无法输出,只能为xlsx或txt格式')