Python 2.7 爬取微博热搜

闲来无聊,搞搞热搜,先拿微博练手

将爬取下来的数据发到自己的邮箱

相关的 module 通过 pip install 安装即可

效果展示

什么都是扯淡,直接贴代码

# -*- coding: utf-8 -*-
#!/usr/bin/python

import requests
from bs4 import BeautifulSoup
import smtplib
from email.mime.text import MIMEText
from email.header import Header
import traceback
import time
import sys
# Python 2 only: sys.setdefaultencoding is removed from the sys namespace at
# interpreter startup, so the module has to be reloaded to get it back.
reload(sys)
# Make UTF-8 the process-wide default codec for implicit str <-> unicode
# conversions (the rest of the script mixes byte strings and unicode strings).
sys.setdefaultencoding('utf-8')

# Base URL of the Weibo search site; used to build the request URL and to turn
# the relative hrefs found in the page into absolute links.
weibo_url = "http://s.weibo.com"


class HotSearchInfo:
    """Plain record describing one row of the hot-search ranking.

    Attributes are stored exactly as passed in:
    isForceTop (pinned marker), index (rank), title, url, num (heat value)
    and flag (label shown next to the entry).
    """

    def __init__(self, isForceTop, index, title, url, num, flag):
        self.isForceTop, self.index = isForceTop, index
        self.title, self.url = title, url
        self.num, self.flag = num, flag

    def __str__(self):
        # One-line, human-readable summary of every field.
        template = u'置顶: %s, 排名: %s, 标题: %s, 链接: %s, 热度: %s, 标识: %s'
        fields = (
            self.isForceTop,
            self.index,
            self.title,
            self.url,
            self.num,
            self.flag,
        )
        return template % fields


# Fetch the hot-search page
def get_html():
    """Download the real-time hot-search page and return its HTML text.

    Returns:
        The response body on HTTP 200; an empty string when the request
        fails or the server answers with any other status code.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    params = {
        'cate': 'realtimehot'
    }
    html = ""
    try:
        # Without a timeout requests waits indefinitely on a stalled server.
        r = requests.get('%s/top/summary?' % (weibo_url), params=params,
                         headers=headers, timeout=10)
    except requests.RequestException:
        # Only network/HTTP-layer failures are expected here; report the
        # cause instead of hiding it, and fall through to return "".
        print("error: %s" % traceback.format_exc())
    else:
        if r.status_code == 200:
            html = r.text
    return html


# Parse the hot-search page and extract the list of hot-search entries
def parse_html(html):
    """Turn the hot-search page HTML into a list of HotSearchInfo objects.

    Args:
        html: page source as returned by get_html(); may be empty.

    Returns:
        A list with one HotSearchInfo per table row that has the expected
        cells. An empty list is returned when the hot-search table cannot be
        found (for example when the download failed and html is "").
    """
    soup = BeautifulSoup(html, 'lxml')
    # The ranking lives in <div id="pl_top_realtimehot"> ... <tbody>.
    container = soup.find("div", attrs={"id": "pl_top_realtimehot"})
    tbody = container.find("tbody") if container is not None else None
    if tbody is None:
        return []

    result = []
    for tr in tbody.find_all('tr'):
        td01 = tr.find("td", attrs={"class": "td-01"})
        td02 = tr.find("td", attrs={"class": "td-02"})
        td03 = tr.find("td", attrs={"class": "td-03"})
        if td01 is None or td02 is None or td03 is None:
            # Not a regular ranking row; skip it rather than crash.
            continue
        link = td02.find("a")
        if link is None:
            continue

        # Pinned entries carry an <i class="icon-top"> marker in the rank cell.
        isForceTop = 0
        if td01.find("i", attrs={"class": "icon-top"}) is not None:
            isForceTop = 1

        # Rank: scraped text is untrusted, so it is parsed, never eval()'d.
        index = _parse_count(td01.text)

        title = link.text

        # Links in the page are site-relative; make them absolute.
        href = link.get('href')
        url = ""
        if href:
            url = "%s%s" % (weibo_url, href)

        # Heat value
        num = 0
        num_tag = td02.find("span")
        if num_tag is not None:
            num = _parse_count(num_tag.text)

        # Label
        flag = u"无"
        flag_tag = td03.find("i")
        if flag_tag is not None:
            flag = flag_tag.text

        result.append(HotSearchInfo(isForceTop, index, title, url, num, flag))
    return result


def _parse_count(text, default=0):
    """Return the last all-digit, whitespace-separated token of text as an int.

    Falls back to default when text is empty or contains no such token.
    """
    if not text:
        return default
    for token in reversed(text.split()):
        if token.isdigit():
            try:
                return int(token)
            except ValueError:
                # isdigit() accepts a few characters that int() rejects.
                continue
    return default


# Output: parse the page, render the entries and mail the result
def output(tr):
    """Run the parse -> render -> send pipeline on the page source in tr."""
    send_mail(build_content(parse_html(tr)))


def build_content(list):
    """Render hot-search entries as an HTML table used as the mail body.

    Args:
        list: iterable of objects exposing index, url, title, num and flag
            attributes, or None. (The parameter name shadows the builtin but
            is kept so existing callers are unaffected.)

    Returns:
        A unicode HTML table with a header row (rank, keyword, heat, label)
        followed by one row per entry; u"" when list is None.
    """
    if list is None:
        return u""
    table = (u"<table border=\"1\">"
             u"<tr><th>序号</th><th>关键词</th><th>热度</th><th>标识</th></tr>"
             u"%s</table>")
    # Five placeholders for five values: rank, link target, link text, heat, label.
    row = (u"<tr><td>%s</td><td><a href=\"%s\">%s</a></td>"
           u"<td>%s</td><td>%s</td></tr>")
    trs = [
        row % (
            _html_text(info.index),
            _html_text(info.url),
            _html_text(info.title),
            _html_text(info.num),
            _html_text(info.flag),
        )
        for info in list
    ]
    return table % u"".join(trs)


def _html_text(value):
    """Convert value to text and escape the characters special in HTML."""
    text = u"%s" % (value,)
    # '&' must be replaced first so the other entities are not double-escaped.
    return (text.replace(u"&", u"&amp;")
                .replace(u"<", u"&lt;")
                .replace(u">", u"&gt;")
                .replace(u'"', u"&quot;"))


def send_mail(content):
    """Send content as an HTML mail through the 163.com SMTP-over-SSL server.

    Failures raised as smtplib.SMTPException are reported on stdout and not
    re-raised; the connection is always closed if it was opened.
    """
    sender = '[email protected]'
    pwd = "xxx"
    receivers = ['[email protected]']
    message = MIMEText(content, 'html', 'utf-8')
    message['Subject'] = Header('微博热搜榜单-%s' % time.strftime("%Y-%m-%d"), 'utf-8')
    message['From'] = sender
    message['To'] = ",".join(receivers)
    # Bound before the try so the finally clause never hits an unbound name
    # when the connection itself cannot be established.
    smtp_obj = None
    try:
        # Passing the host to the constructor connects immediately and raises
        # SMTPConnectError (an SMTPException) on a bad server greeting.
        smtp_obj = smtplib.SMTP_SSL("smtp.163.com")
        # The password is the authorization code configured in the mailbox
        # settings, not the mailbox login password.
        smtp_obj.login(sender, pwd)
        smtp_obj.sendmail(sender, receivers, message.as_string())
        print("邮件发送成功")
    except smtplib.SMTPException:
        print("Error: 无法发送邮件, %s" % traceback.format_exc())
    finally:
        if smtp_obj is not None:
            smtp_obj.close()


def main():
    """Fetch the hot-search page and mail it; do nothing if the fetch failed."""
    html = get_html()
    # get_html() signals failure with an empty string, not None.
    if not html:
        print("get none")
        return
    output(html)


if __name__ == "__main__":
    main()

千万不要使用阿里云的邮箱,发不出去 

 

你可能感兴趣的:(其他,python,爬虫,微博热搜,邮件)