爬虫--爬取csdn消息并邮箱通知(python3)

之前有很多同学给我发消息,咨询相关问题,我都没能及时回复解答。

主要原因是工作比较忙,博客没有每天登录查看消息。等到打开消息、看到一些同学的留言时,无奈都已经过去了多天。

所以这里写了个小脚本,每天爬取博客消息通知,如果有新消息,就发送到个人邮箱提醒。

代码如下:

#Version: python3
#Author: 程松
#-*- coding: utf-8 -*-

import http.cookiejar
import smtplib
import urllib
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
from email.header import Header
from email.mime.text import MIMEText
from email.utils import parseaddr, formataddr

# Scrape the CSDN blog message page for the unread-message count.
def run_spider():
    """Log in to CSDN, open the message page, and return the unread count.

    Returns:
        str: the unread-message count as text (contents of the first <b>
        element on the message page), or '' when no count element is found.

    Raises:
        RuntimeError: if the login form's hidden tokens cannot be located.
    """
    # Cookie-aware opener so the login session persists across requests.
    cookie_jar = http.cookiejar.CookieJar()
    handler = urllib.request.HTTPCookieProcessor(cookie_jar)
    opener = urllib.request.build_opener(handler)
    loginurl = "https://passport.csdn.net/account/login?from=http://my.csdn.net/my/mycsdn"

    # Pre-login step: scrape the hidden form tokens "lt" and "execution"
    # that the CSDN login form requires on POST.
    response = opener.open(loginurl)
    soup = BeautifulSoup(response.read(), "lxml")
    lt = execution = None  # initialize so a missing field fails loudly below
    # "field" instead of "input": don't shadow the builtin input().
    for field in soup.form.find_all("input"):
        if field.get("name") == "lt":
            lt = field.get("value")
        elif field.get("name") == "execution":
            execution = field.get("value")
    if lt is None or execution is None:
        raise RuntimeError("login form tokens 'lt'/'execution' not found")

    # POST payload for the login form.
    values = {
        "username": "*******@qq.com",  # blog account
        "password": "******",  # blog password
        "lt": lt,
        "execution": execution,
        "_eventId": "submit",
    }
    postdata = urllib.parse.urlencode(values).encode(encoding='UTF8')

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    }

    request = urllib.request.Request(loginurl, postdata, headers)
    opener.open(request)  # perform the login; the response body is unused

    # After login, the session cookies let us fetch the message page.
    url = "http://msg.csdn.net/"
    result_response = opener.open(url)
    result_soup = BeautifulSoup(result_response.read(), "html.parser")
    # The unread count is rendered as the first <b> element on the page.
    unread_elements = result_soup.find_all('b')
    if not unread_elements:
        return ''  # no count element: treat as "no new messages"
    return unread_elements[0].text

# Mail the scraped unread-message count to the notification inbox.
def send_email(result_unread_count_text):
    """Send a plain-text email announcing the unread-message count.

    Args:
        result_unread_count_text: unread count text returned by run_spider().
    """
    def _format_addr(s):
        # Render "Name <addr>" with the display name RFC 2047-encoded.
        name, addr = parseaddr(s)
        return formataddr((Header(name, 'utf-8').encode(), addr))

    from_addr = '*******@sina.com'  # sender mailbox account
    password = '******'  # sender mailbox password
    to_addr = '[email protected]'  # recipient (author's own inbox; please do not spam)
    smtp_server = 'smtp.sina.com'

    # BUG FIX: the original quoted the variable name as a string literal
    # ('result_unread_count_text'), so every mail carried the same text;
    # interpolate the actual scraped value instead.
    msg = MIMEText('博客有' + str(result_unread_count_text) + '条未读消息', 'plain', 'utf-8')  # body
    msg['From'] = _format_addr('程松 <%s>' % from_addr)  # sender
    msg['To'] = _format_addr('程松 <%s>' % to_addr)  # recipient
    msg['Subject'] = Header('博客有新消息啦……', 'utf-8').encode()  # subject

    server = smtplib.SMTP(smtp_server, 25)
    server.set_debuglevel(1)
    try:
        server.login(from_addr, password)
        server.sendmail(from_addr, [to_addr], msg.as_string())
    finally:
        server.quit()  # always close the SMTP session, even on failure
    print('已经成功将:" %s "发送到%s,请查收'%(str(result_unread_count_text),to_addr))

if __name__ == '__main__':
    # Run the spider once; send mail only when a non-empty count came back.
    unread_text = run_spider()
    if unread_text == '':
        print("博客没有新消息!")
    else:
        send_email(unread_text)

你可能感兴趣的:(数据挖掘(python))