python结合selenium.webdriver+PhantomJS登陆qq邮箱抓取数据

所有监控邮件发送到qq邮箱,需要分析一些服务器监控数据
QQ邮箱的登录验证过程较为复杂,用urllib或requests比较困难,可以用selenium.webdriver+PhantomJS以无窗口化(headless)方式登录QQ邮箱,只是速度稍慢,但也能接受
事先安装好selenium
pip install selenium
下载phantomjs.exe
http://phantomjs.org/download.html

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# selenium 模拟登陆QQ邮箱
import time
import os
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from bs4 import BeautifulSoup
from urllib import parse


# Working directory used by the original author; presumably where
# phantomjs.exe lives, since phajs_path below is a relative path.
# Only switch into it when it exists, so that importing this module on a
# machine without that directory does not fail with FileNotFoundError.
_WORK_DIR = 'E:\\python3\\爬虫\\'
if os.path.isdir(_WORK_DIR):
    os.chdir(_WORK_DIR)

# Account credentials. The QQ_MAIL_USER / QQ_MAIL_PASSWORD environment
# variables take precedence so real secrets can stay out of the source;
# the original placeholder strings remain the fallback.
u = os.environ.get('QQ_MAIL_USER', "邮箱帐号")
p = os.environ.get('QQ_MAIL_PASSWORD', "邮箱密码")

# Path handed to webdriver.PhantomJS as the executable to launch.
phajs_path = 'phantomjs.exe'

# Paths under https://mail.qq.com, keyed by a short label:
#   'base'   - site root, opened for the login step
#   'neicun' - mail-list endpoint fetched for every scraped page
CORP_API_TYPE = {
    'base': '/',
    'neicun': '/cgi-bin/mail_list',
}


class QQ_mail:
    """QQ Mail session driven by Selenium's PhantomJS WebDriver.

    Creating an instance logs in straight away with the module-level
    credentials ``u`` / ``p`` and keeps the resulting WebDriver on
    ``self.driver``. Call :meth:`tearDown` when finished to quit the
    browser.
    """

    # Custom headers registered with PhantomJS through the capabilities.
    headers = {':authority': 'mail.qq.com'}
    # Capabilities shared by every driver this class creates.
    cap = DesiredCapabilities.PHANTOMJS.copy()
    # Image loading is switched off; the scraper only reads page_source.
    cap["phantomjs.page.settings.loadImages"] = False
    # A comprehension (rather than a class-level ``for`` loop) keeps the
    # loop variables from lingering as accidental class attributes.
    cap.update({
        'phantomjs.page.customHeaders.{}'.format(name): value
        for name, value in headers.items()
    })

    # Four dot-separated digit groups, i.e. an IPv4-looking token.
    _IP_PATTERN = re.compile(r'\d+\.\d+\.\d+\.\d+')

    def __init__(self):
        """Log in to QQ Mail and store the live driver on ``self.driver``."""
        self.uname = u
        self.pwd = p
        # Announce first so the progress messages appear in the order the
        # steps actually happen (access, then login success).
        print("正在访问QQ邮箱")
        self.driver = self.test_mail_login(CORP_API_TYPE['base'])

    @staticmethod
    def __makeUrl(shortUrl):
        """Return the absolute ``https://mail.qq.com`` URL for ``shortUrl``.

        Leading slashes on ``shortUrl`` are dropped before joining, so
        ``'/cgi-bin/mail_list'`` yields a single-slash path instead of
        ``https://mail.qq.com//cgi-bin/mail_list``.
        """
        return "https://mail.qq.com/" + shortUrl.lstrip('/')

    @staticmethod
    def __appendArgs(url, args):
        """Append ``args`` to ``url`` as a percent-encoded query string.

        Args:
            url: Base URL, with or without an existing query string.
            args: Mapping of parameter names to values, or ``None``.

        Returns:
            ``url`` unchanged when ``args`` is ``None`` or empty; otherwise
            ``url`` followed by ``?`` (or ``&`` when it already contains a
            ``?``) and the encoded parameters.
        """
        if not args:
            return url
        separator = '&' if '?' in url else '?'
        return url + separator + parse.urlencode(args)

    def test_mail_login(self, shortUrl, args=None):
        """Open the login page and submit the account credentials.

        Args:
            shortUrl: Path under https://mail.qq.com to open.
            args: Optional mapping of query parameters added to that URL.

        Returns:
            The WebDriver used for the login. The form has been submitted,
            but the outcome of the login is not verified here.

        Raises:
            Exception: Whatever Selenium raised during the login steps,
                re-raised after the browser has been quit.
        """
        # NOTE(review): in Selenium 2/3 the WebDriver constructor already
        # opens a session with the given capabilities, so the former extra
        # start_session() call (which opened a second one) was dropped.
        driver = webdriver.PhantomJS(phajs_path, desired_capabilities=QQ_mail.cap)
        try:
            driver.implicitly_wait(30)
            driver.set_window_size(800, 600)
            driver.get(self.__appendArgs(self.__makeUrl(shortUrl), args))
            # The login form is inside an iframe.
            driver.switch_to.frame("login_frame")
            # Presumably switches the form to account/password mode -- confirm.
            driver.find_element_by_id("switcher_plogin").click()
            user_box = driver.find_element_by_id("u")
            user_box.clear()
            user_box.send_keys(self.uname)
            pwd_box = driver.find_element_by_id("p")
            pwd_box.clear()
            pwd_box.send_keys(self.pwd)
            pwd_box.send_keys(Keys.ENTER)
        except Exception:
            # Do not leave the PhantomJS process running after a failed login.
            driver.quit()
            raise
        print("登录QQ邮箱成功")
        return driver

    def get_floder_url(self, args):
        """Return the ``sid`` query parameter of a folder link.

        Args:
            args: DOM id of the folder link element, e.g. ``"folder_137"``.

        Returns:
            The first ``sid`` value in the query string of the element's
            ``href`` attribute.
        """
        # Fixed pause kept from the original flow; presumably lets the
        # post-login page settle -- TODO confirm it is still needed next to
        # the 30 s implicit wait.
        time.sleep(3)
        link = self.driver.find_element_by_id(args).get_attribute('href')
        link_args = parse.parse_qs(link.split('?')[1])
        return link_args['sid'][0]

    def parse_mail_list(self, shortUrl, args=None):
        """Load one mail-list page and collect the IP addresses in its rows.

        Each ``div`` with class ``tf no`` is treated as one row. A row is
        skipped when its ``u`` element text (presumably the mail subject)
        contains ``恢复`` or ``0个``, when either expected element is
        missing, or when no IP-looking token is found in the ``b`` element.

        Args:
            shortUrl: Path under https://mail.qq.com of the list page.
            args: Optional mapping of query parameters, for example
                ``folderid``, ``page``, ``sid`` and ``nocheckframe``.

        Returns:
            A list of IP strings, one per accepted row; duplicates are kept
            so callers can count occurrences.
        """
        url = self.__appendArgs(self.__makeUrl(shortUrl), args)
        print('开始抓取:', url)
        driver = self.driver
        driver.get(url)
        # Fixed pause before reading the rendered markup.
        time.sleep(3)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        ip_add = []
        for row in soup.find_all('div', {'class': 'tf no'}):
            title = row.find('u', {'class': 'black tt '})
            if title is None:
                continue
            subject = title.get_text()
            if '恢复' in subject or "0个" in subject:
                continue
            label = row.find('b', {'class': 'no '})
            if label is None:
                continue
            found = self._IP_PATTERN.search(label.get_text())
            if found:
                ip_add.append(found.group(0))
        return ip_add

    def tearDown(self):
        """Quit the WebDriver held on ``self.driver``."""
        self.driver.quit()


def main(folder_id='137', pages=12):
    """Log in, scrape a mail folder and print how often each IP appears.

    Args:
        folder_id: Folder id sent as the ``folderid`` query parameter; it
            also selects the ``folder_<id>`` link the session id is read
            from.
        pages: Number of list pages to fetch, starting at page 0.
    """
    from collections import Counter

    mail = QQ_mail()
    try:
        sid = mail.get_floder_url('folder_' + folder_id)
        alert_counts = Counter()
        for page in range(pages):
            alert_counts.update(mail.parse_mail_list(
                CORP_API_TYPE['neicun'],
                {
                    'folderid': folder_id,
                    'page': str(page),
                    'sid': sid,
                    'nocheckframe': 'true',
                },
            ))
    finally:
        # Always shut the browser down, even when scraping fails midway.
        mail.tearDown()

    print("ip地址             内存不足报警次数")
    for ip, count in alert_counts.items():
        print('{}:     {}'.format(ip, count))


if __name__ == "__main__":
    main()

你可能感兴趣的:(python)