Python:PhantomJS 无头浏览器获取通知内容

本文是文章 《Python:定时获取通知内容并发送到邮箱》 的重写版

无头浏览器即 headless browser,是一种没有图形界面的浏览器。

重写的原因在于原来的网站多加了安全措施,直接进行 POST 操作会被封掉;另外我也打算借此机会用面向对象的方式重写一次。

参考内容等请点进 《Python:定时获取通知内容并发送到邮箱》 查阅

要注意的是,如果用 apt 安装 PhantomJS,使用过程中会出现 Message: Error - Unable to load Atom 'find_element' from file ':/ghostdriver/./third_party/webdriver-atoms/find_element.js' 的错误。搜索后在 Stack Overflow 的问题 "python 3.x - PhantomJS with Selenium - Unable to load Atom 'find_element'" 中找到了解决办法,推荐按下面的方式安装:

apt purge phantomjs
wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2
tar xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2
cp /path/to/phantom/untar/bin/phantomjs /usr/bin/

没写多少注释,不懂的再看看之前的内容吧。

#!/usr/bin/python3
# -*- coding:utf-8 -*-

'''
【留意!!】
启动程序前要先留意 locale (Linux 命令)输出的内容是否为zh_CN.UTF-8
建议写 shell 脚本启动并在运行前 export LC_ALL=zh_CN.UTF-8
'''


from conf import *
from sys import argv
from urllib.parse import unquote
from bs4 import BeautifulSoup
from jinja2 import  Environment,FileSystemLoader,select_autoescape
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys

import re,os,json,time,redis,yagmail,requests

class SchoolNewsMail:
    """Collect weather data and new articles, render them and mail the result.

    The pipeline is driven by calling, in order: ``get_weather_data``,
    ``login``, ``parse``, ``get_update_data``, ``render`` and ``send_mail``.
    Configuration constants (``WEATHER_API_*``, ``LOGIN_*``, ``SEND_*``,
    ``VERSION``, ``ANNOUNCEMENT``) come from the ``conf`` module imported at
    the top of the file.
    """

    # Prefix an attachment link must contain, and the pattern used to pull the
    # file name (group 2) out of it.  Compiled once instead of per article.
    ATTACHMENT_URL_PREFIX = 'http://news.gdut.edu.cn/DepartmentUploadFiles'
    ATTACHMENT_URL_PATTERN = re.compile(r'http://news.gdut.edu.cn/DepartmentUploadFiles/(.+)/files/(.+)')

    def __init__(self):
        """Create the PhantomJS browser, template environment and empty state."""
        self.ROOT_URL = 'http://test.com'

        # Present a regular desktop Firefox user agent to the site.
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = ("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0")

        # Templates are looked up in ./template relative to the working directory.
        self.jinja2_env = Environment(
            loader=FileSystemLoader(os.getcwd() + '/template'),
            autoescape=select_autoescape(['html'])
        )

        self.browser = webdriver.PhantomJS(desired_capabilities=dcap)

        # One shared client for the "already seen" article ids.
        self.redis_client = redis.StrictRedis(host='localhost', port=6379, db=0)

        self.weather_data = []
        self.article_data = []
        self.update_content = ''

        self.mail_content = ''

    def get_weather_data(self):
        """Fetch weather JSON from the configured API into ``self.weather_data``.

        ``self.weather_data`` is set to an empty list when the request fails
        or the response status is not 200.
        """
        # NOTE(review): '&&' produces an empty query parameter; kept as-is
        # because the API presumably tolerates it -- confirm.
        request_url = WEATHER_API_URL + 'city=' + WEATHER_API_CITY + '&&key=' + WEATHER_API_KEY
        data = []
        try:
            # A timeout keeps a stalled weather API from hanging the whole run.
            response = requests.get(request_url, timeout=10)
        except requests.RequestException as e:
            self.printf('weather request failed: %s' % e)
        else:
            if response.status_code == 200:
                data = json.loads(response.content.decode(encoding='utf-8'))
        self.weather_data = data

    def get_update_data(self):
        """Build the "latest update" notice from ``VERSION`` and ``ANNOUNCEMENT``."""
        self.update_content = '最近更新:'+VERSION+':'+ANNOUNCEMENT+'\n'

    def login(self):
        """Log in through the site's login form and save a screenshot.

        Errors raised while filling in the form are logged, not re-raised.
        """
        self.printf('login begin')

        try:
            self.browser.get(self.ROOT_URL+'/UserLogin.aspx')
            self.browser.find_element_by_id('ContentPlaceHolder1_userEmail').send_keys(LOGIN_ACCOUNT)
            password_input = self.browser.find_element_by_id('ContentPlaceHolder1_userPassWord')
            password_input.send_keys(LOGIN_PASSWORD)
            time.sleep(2)
            # Submit the form by pressing Enter in the password field.
            password_input.send_keys(Keys.ENTER)
            time.sleep(5)
        except Exception as e:
            # Best effort: keep going so the screenshot below shows the page state.
            self.printf('login failed: %s' % e)

        time.sleep(3)

        self.browser.save_screenshot("index.png")  # save a screenshot

        self.printf('login end')

    def parse(self):
        """Scrape the article list and store unseen articles in ``self.article_data``.

        Each stored entry is a dict with the keys ``url``, ``date``, ``title``,
        ``author``, ``excerpt`` and ``attachment`` (a list of dicts with
        ``attach_name`` and ``attach_url``).  If the list container is not
        found, ``self.article_data`` is left untouched.
        """
        self.printf('parse begin')

        self.browser.get(self.ROOT_URL + '/ArticleList.aspx?category=4')
        self.browser.save_screenshot('parse.png')

        html_soup = BeautifulSoup(self.browser.page_source, 'lxml')
        articles = html_soup.find('div', attrs={'class': 'articles'})

        if not articles:
            self.printf('article not found')
            return

        article_result = []

        for val in articles.find_all('p'):
            anchor = val.find('a')
            spans = val.find_all('span')
            # Skip paragraphs that do not look like an article entry instead
            # of crashing on a missing tag or attribute.
            if anchor is None or not anchor.get('href') or len(spans) < 2:
                self.printf('skip malformed article entry')
                continue

            href = anchor['href']
            article_id = href[-6:]
            article_url = self.ROOT_URL + href[1:]
            article_date = spans[1].getText()[:-1]
            article_title = anchor.get('title', '')
            article_author = spans[0].get('title', '')

            self.printf(article_title)

            # NOTE: article_id_exist() records an unseen id as a side effect,
            # so an article skipped further down is not retried on a later run.
            if self.article_id_exist(article_id):
                self.printf('article exist in database %s' % (article_title))
                continue

            self.browser.get(article_url)

            article_soup = BeautifulSoup(self.browser.page_source, 'lxml')
            article_content = article_soup.find('div', attrs={'id': 'articleBody'})
            if article_content is None:
                self.printf('article body not found %s' % (article_url))
                continue

            article_attachment = self._collect_attachments(article_content)

            # Collapse all whitespace, then drop the title and author line so
            # the excerpt starts with the actual body text.
            info = ''.join(article_content.getText().split())
            info = info.replace(article_title, '')
            info = info.replace('单位:' + article_author, '')
            article_excerpt = info[:150]

            article_result.append(
                {
                    'url': article_url,
                    'date': article_date,
                    'title': article_title,
                    'author': article_author,
                    'excerpt': article_excerpt,
                    'attachment': article_attachment
                }
            )

        self.article_data = article_result

        self.printf('parse end')

    def _collect_attachments(self, article_content):
        """Return the attachment dicts for every upload link in *article_content*."""
        attachments = []
        # href=True skips anchors without an href attribute.
        for link in article_content.find_all('a', href=True):
            attachment_url = link['href']
            match = None
            if self.ATTACHMENT_URL_PREFIX in attachment_url:
                match = self.ATTACHMENT_URL_PATTERN.match(attachment_url)
            if not match:
                self.printf('%s do not have file attachment' % (attachment_url))
                continue

            attachment_name = match.group(2)
            # File names may be percent-encoded in the URL.
            if '%' in attachment_name:
                attachment_name = unquote(attachment_name)

            attachments.append({'attach_name': attachment_name, 'attach_url': attachment_url})
        return attachments

    def render(self):
        """Render the templates and assemble ``self.mail_content``.

        The mail body is the weather section, the article section and
        ``self.update_content``, concatenated in that order.  Each template is
        rendered without arguments when its data is empty.
        """
        weather_render = self.jinja2_env.get_template('weather.html')
        article_render = self.jinja2_env.get_template('article.html')

        if self.weather_data:
            current = self.weather_data['HeWeather5'][0]
            weather_content = weather_render.render(
                now=current['now'], forecast=current['hourly_forecast'])
        else:
            # Fixed: this used the not-yet-assigned article template.
            weather_content = weather_render.render()

        if self.article_data:
            article_content = article_render.render(articles=self.article_data)
        else:
            article_content = article_render.render()

        self.mail_content = weather_content + article_content + self.update_content

    def send_mail(self,type):
        """Send ``self.mail_content`` to every recipient, one mail per address.

        :param type: ``1`` sends to ``SEND_TO_LIST_TEST``; any other value
            sends to ``SEND_TO_LIST``.
        """
        mail_client = yagmail.SMTP(user=SEND_MAIL_USER, password=SEND_MAIL_PWD, host=SEND_MAIL_HOST,
                                   port=SEND_MAIL_PORT)
        if type == 1:
            recipients, label = SEND_TO_LIST_TEST, 'sending[test user]: '
        else:
            recipients, label = SEND_TO_LIST, 'sending : '

        for addr in recipients:
            self.printf(label + addr)
            mail_client.send(addr, subject=SEND_MAIL_SUBJECT, contents=self.mail_content)
            # Pause between mails.
            time.sleep(1)

    def article_id_exist(self,id):
        """Return whether *id* was seen before, recording it if it was not.

        :param id: article id as a string of digits.
        :returns: ``True`` if the id is already stored in Redis.  ``False`` for
            an empty or non-numeric id (nothing is stored) and for an unseen
            id, which is stored before returning.
        """
        if not id:
            self.printf('empty article id')
            return False

        if not id.isdigit():
            self.printf('need number instead of other value type')
            return False

        if self.redis_client.get(id):
            return True

        self.redis_client.set(id,"True")
        return False

    def printf(self,string):
        """Print *string* prefixed with the local time; empty values are ignored."""
        if string:
            print(time.strftime("%Y-%m-%d %H:%M:%S : ", time.localtime()) + string)


# Script entry: run the whole pipeline once.
obj = SchoolNewsMail()
try:
    obj.get_weather_data()
    obj.login()
    obj.parse()
    # Build the update notice BEFORE rendering: render() appends
    # obj.update_content to the mail body, so the old order always mailed an
    # empty notice.
    obj.get_update_data()
    obj.render()
    # A single '-t' argument sends to the test recipients only.
    if len(argv) == 2 and '-t' in argv:
        obj.send_mail(1)
    else:
        obj.send_mail(0)
finally:
    # Always shut the PhantomJS process down, even after a failure.
    obj.browser.quit()

来自个人 Python 文集

你可能感兴趣的:(Python:PhantomJS 无头浏览器获取通知内容)