Python爬虫抓取携程网机票信息并发邮件通知

背景:
由于要买机票,所以一直进行搜索,爬虫可以帮我解决这个问题;

  • 用Python抓取携程网机票信息 过程纪实(上篇)

解释得超级详细。
于是通过这一过程,基本了解了一些;

  • 查询 上海 到 西安 04.29~05.02 的机票:
#coding:utf-8

import urllib2
from lxml import etree
import json
import random
import sys
# Python 2 idiom: reload the sys module, then make UTF-8 the interpreter-wide
# default string encoding.
# NOTE(review): this affects every implicit str/unicode conversion in the
# process -- confirm it is still needed before copying it elsewhere.
reload(sys)
sys.setdefaultencoding('utf8')

def get_json2(date,rk,CK,r):
    '''根据构造出的url获取到航班数据'''
    url= "http://flights.ctrip.com/domesticsearch/search/SearchFirstRouteFlights?DCity1=SHA&ACity1=SIA&SearchType=S&DDate1=%s&IsNearAirportRecommond=0&rk=%s&CK=%s&r=%s"%(date,rk,CK,r)
    headers={'Host':"flights.ctrip.com",'User-Agent':"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0",'Referer':"http://flights.ctrip.com/booking/hrb-sha-day-1.html?ddate1=2017-04-29"}
    headers['Referer']="http://flights.ctrip.com/booking/hrb-sha-day-1.html?ddate1=%s"%date
    req=urllib2.Request(url,headers=headers)
    res=urllib2.urlopen(req)
    content=res.read()
    dict_content=json.loads(content,encoding="gb2312")
    length = len(dict_content['fis']) 
    # print length
    i = 0
    for i in range(length):
        if ((dict_content['fis'][i][u'lp']) < 600 ):
            print (dict_content['fis'][i][u'lp']),
            print (dict_content['fis'][i][u'dt']),
            print (dict_content['fis'][i][u'at']) 
            #print (dict_content['fis'][i][u'dpbn']) 


def get_parameter(date):
    '''Scrape the request tokens that the flight search URL needs.

    Downloads the booking page for ``date`` (format example: 2016-05-13),
    takes the text of the first <script> child of <body>, splits it on
    whitespace and cuts the tokens out at fixed character offsets.

    Returns the tuple (rk, CK, r).

    Raises IndexError when the script text is missing or shorter than the
    offsets expect.

    NOTE(review): every offset below depends on the exact layout of that
    inline script -- presumably it stops working when the page changes;
    verify against the live page.
    '''
    url = 'http://flights.ctrip.com/booking/hrb-sha-day-1.html?ddate1=%s' % date
    page = urllib2.urlopen(url)
    try:
        html = page.read()
    finally:
        # Release the HTTP connection even when the read fails.
        page.close()
    tree = etree.HTML(html)
    pp = tree.xpath('''//body/script[1]/text()''')[0].split()

    # CK: a slice near the end of the 4th chunk, with the character at
    # index 13 moved forward to index 5.
    CK_original = pp[3][-34:-2]
    CK = CK_original[0:5] + CK_original[13] + CK_original[5:13] + CK_original[14:]

    # rk: a random number in [0, 10) with 15 decimals, followed by six
    # characters taken from the last chunk.
    tail = pp[-1]
    rk = "%.15f" % (random.random() * 10) + tail[18:24]

    # r: the last chunk from index 27 up to its final three characters.
    r = tail[27:len(tail) - 3]

    return rk, CK, r

if __name__=='__main__':
    dates=['2017-04-29','2017-04-30','2017-05-01','2017-05-02']

    for date in dates:
        rk,CK,r=get_parameter(date)
        get_json2(date,rk,CK,r)
        print "-----"

发送邮件程序(我找到出处就放上来):

# -*- coding: utf-8 -*-

from email import encoders
from email.header import Header
from email.mime.text import MIMEText
from email.utils import parseaddr, formataddr
import smtplib

def _format_addr(s):
    '''Turn a "Name <address>" string into a header-safe address string.

    The display name is wrapped in a UTF-8 email Header and encoded; the
    address part is encoded to UTF-8 bytes when it is a unicode object and
    left untouched otherwise. Both parts are rejoined with formataddr().
    '''
    display_name, address = parseaddr(s)
    encoded_name = Header(display_name, 'utf-8').encode()
    if isinstance(address, unicode):
        address = address.encode('utf-8')
    return formataddr((encoded_name, address))

import getpass

# Ask for the account details interactively. The password is read with
# getpass so that it is not echoed to the terminal.
from_addr = raw_input('From: ')
password = getpass.getpass('Password: ')
to_addr = raw_input('To: ')
smtp_server = raw_input('SMTP server: ')

# Build a plain-text UTF-8 message with encoded From / To / Subject headers.
msg = MIMEText('Not just fly fight...', 'plain', 'utf-8')
msg['From'] = _format_addr(u'Air <%s>' % from_addr)
msg['To'] = _format_addr(u'126.Air <%s>' % to_addr)
msg['Subject'] = Header(u'flight……', 'utf-8').encode()

# NOTE(review): port 25 with no starttls() call -- presumably the login is
# sent unencrypted; confirm whether the provider offers an encrypted port.
server = smtplib.SMTP(smtp_server, 25)
try:
    server.set_debuglevel(1)  # set to 0 for real use
    server.login(from_addr, password)
    server.sendmail(from_addr, [to_addr], msg.as_string())
    server.quit()
finally:
    # Always release the socket, including when login or sendmail raised
    # before quit() was reached.
    server.close()

将查询信息和发送邮件的程序整合起来,大概就是这样:

#!/usr/bin/python
# -*- coding: utf-8 -*-

import urllib2
from lxml import etree
import json
import random
from email import encoders
from email.header import Header
from email.mime.text import MIMEText
from email.utils import parseaddr, formataddr
import smtplib
import sys
# Python 2 idiom: reload the sys module, then make UTF-8 the interpreter-wide
# default string encoding.
# NOTE(review): this affects every implicit str/unicode conversion in the
# process -- confirm it is still needed before copying it elsewhere.
reload(sys)
sys.setdefaultencoding('utf8')

import os

# Mail settings. Each value can be overridden through an environment variable
# so that real credentials do not have to be written into this file; the
# literals are the original placeholders and remain the fallback.
from_addr = os.environ.get('FLIGHT_MAIL_FROM', "****@126.com")
password = os.environ.get('FLIGHT_MAIL_PASSWORD', "******")
to_addr = os.environ.get('FLIGHT_MAIL_TO', "********@qq.com")
smtp_server = os.environ.get('FLIGHT_MAIL_SMTP', "smtp.126.com")

def _format_addr(s):
    '''Re-encode a "Name <address>" string so it is safe to use as a header.

    parseaddr() splits the input; the name goes through a UTF-8 email Header,
    the address is UTF-8 encoded only when it is a unicode object, and
    formataddr() joins the two parts again.
    '''
    name, addr = parseaddr(s)
    encoded_name = Header(name, 'utf-8').encode()
    encoded_addr = addr.encode('utf-8') if isinstance(addr, unicode) else addr
    return formataddr((encoded_name, encoded_addr))

def get_json2(date,rk,CK,r):
    '''根据构造出的url获取到航班数据'''
    url= "http://flights.ctrip.com/domesticsearch/search/SearchFirstRouteFlights?DCity1=SHA&ACity1=SIA&SearchType=S&DDate1=%s&IsNearAirportRecommond=0&rk=%s&CK=%s&r=%s"%(date,rk,CK,r)
    headers={'Host':"flights.ctrip.com",'User-Agent':"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0",'Referer':"http://flights.ctrip.com/booking/hrb-sha-day-1.html?ddate1=2017-04-29"}
    headers['Referer']="http://flights.ctrip.com/booking/hrb-sha-day-1.html?ddate1=%s"%date
    req=urllib2.Request(url,headers=headers)
    res=urllib2.urlopen(req)
    content=res.read()
    dict_content=json.loads(content,encoding="gb2312")
    length = len(dict_content['fis']) 
    # print length
    i = 0
    for i in range(length):
        if ((dict_content['fis'][i][u'lp']) < 600 ):
            print (dict_content['fis'][i][u'lp']),
            print (dict_content['fis'][i][u'dt']),
            print (dict_content['fis'][i][u'at']),
            print (dict_content['fis'][i][u'dpbn'])  
            if ((dict_content['fis'][i][u'lp']) <= 450 ):
                msg = MIMEText(('%r at %s in %s'% ((dict_content['fis'][i][u'lp']),(dict_content['fis'][i][u'dt']),(dict_content['fis'][i][u'dpbn']))),'plain', 'utf-8')
                msg['From'] = _format_addr(u'Air <%s>' % from_addr)
                msg['To'] = _format_addr(u'126.Air <%s>' % to_addr)
                msg['Subject'] = Header(u'flight…%r '%(dict_content['fis'][i][u'lp']), 'utf-8').encode()
                server = smtplib.SMTP(smtp_server, 25)
                server.set_debuglevel(0)
                server.login(from_addr, password)
                server.sendmail(from_addr, [to_addr], msg.as_string())
                server.quit()




def get_parameter(date):
    '''Scrape the request tokens that the flight search URL needs.

    Downloads the booking page for ``date`` (format example: 2016-05-13),
    takes the text of the first <script> child of <body>, splits it on
    whitespace and cuts the tokens out at fixed character offsets.

    Returns the tuple (rk, CK, r).

    Raises IndexError when the script text is missing or shorter than the
    offsets expect.

    NOTE(review): every offset below depends on the exact layout of that
    inline script -- presumably it stops working when the page changes;
    verify against the live page.
    '''
    url = 'http://flights.ctrip.com/booking/hrb-sha-day-1.html?ddate1=%s' % date
    page = urllib2.urlopen(url)
    try:
        html = page.read()
    finally:
        # Release the HTTP connection even when the read fails.
        page.close()
    tree = etree.HTML(html)
    pp = tree.xpath('''//body/script[1]/text()''')[0].split()

    # CK: a slice near the end of the 4th chunk, with the character at
    # index 13 moved forward to index 5.
    CK_original = pp[3][-34:-2]
    CK = CK_original[0:5] + CK_original[13] + CK_original[5:13] + CK_original[14:]

    # rk: a random number in [0, 10) with 15 decimals, followed by six
    # characters taken from the last chunk.
    tail = pp[-1]
    rk = "%.15f" % (random.random() * 10) + tail[18:24]

    # r: the last chunk from index 27 up to its final three characters.
    r = tail[27:len(tail) - 3]

    return rk, CK, r

if __name__=='__main__':
    dates=['2017-04-29','2017-04-30','2017-05-01']

    for date in dates:
        rk,CK,r=get_parameter(date)
        get_json2(date,rk,CK,r)
        print "-----"

然后使用 crontab 做一个定时任务,每 20 分钟执行一次;
基本格式 :

*  *  *  *  *  command
分  时  日  月  周  命令 

so,

0,20,40 * * * * python ~/test.py

尽管程序还有很多问题,我仍在学习中。

你可能感兴趣的:(技能心得)