Weibo Hot Spider

# -*- coding: utf-8 -*-
# Author: @EM
import time
import requests
from lxml import etree
import pandas as pd
import pymysql as ps
import datetime
import os


class WeiboHot(object):
    """Scraper for the Weibo real-time hot-search ranking page.

    The page is fetched with the cookie supplied by the caller, the ranking
    table is parsed with XPath, and the result can be exported either to an
    Excel workbook (``save_to_excel``) or to a MySQL table (``save_to_mysql``).
    Both export methods re-download the page every time they are called.
    """

    # Seconds to wait for the server before the HTTP request is abandoned.
    _REQUEST_TIMEOUT = 10

    def __init__(self, cookie: str):
        """Store the request cookie and initialise empty result buffers.

        Args:
            cookie: Value sent as the ``cookie`` request header.
        """
        self.__url = 'https://s.weibo.com/top/summary?cate=realtimehot'
        self.__cookie = cookie
        self.__event_name = []
        self.__event_href = []
        self.__event_num = []
        # Captured once per instance; used to name the Excel file / SQL table.
        self.__time_now = datetime.datetime.now().timestamp()

    def __str__(self) -> str:
        """Return the human-readable name of the spider."""
        return '微博热搜爬虫'

    @property
    def check_cookie(self) -> str:
        """The cookie string this instance sends with its request."""
        return self.__cookie

    @property
    def check_url(self) -> str:
        """The URL of the ranking page this instance scrapes."""
        return self.__url

    def __auto_get_cookie(self) -> None:
        """Placeholder for automatic cookie retrieval; not implemented."""
        return None

    @staticmethod
    def _drop_blank_rows(names, nums, hrefs) -> tuple:
        """Remove every row whose hot number is blank.

        A row is considered blank when its entry in ``nums`` consists only of
        whitespace. The same positions are removed from all three sequences so
        they stay aligned. Sequences are filtered by index rather than zipped,
        so a length mismatch between them is preserved for the caller to
        detect instead of being silently truncated.

        Args:
            names: Event titles.
            nums: Hot-number strings, one per event.
            hrefs: Event URLs.

        Returns:
            A ``(names, nums, hrefs)`` tuple of new lists without blank rows.
        """
        blank = {pos for pos, value in enumerate(nums) if not value.strip()}
        if not blank:
            return list(names), list(nums), list(hrefs)

        def keep(seq):
            return [item for pos, item in enumerate(seq) if pos not in blank]

        return keep(names), keep(nums), keep(hrefs)

    def __get_event(self) -> None:
        """Download the ranking page and fill the event buffers.

        Raises:
            requests.RequestException: On connection problems, on timeout, or
                when the server answers with an HTTP error status.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36",
            'cookie': self.__cookie
        }

        response = requests.get(self.__url, headers=headers,
                                timeout=self._REQUEST_TIMEOUT)
        # Fail loudly instead of exporting an empty table from an error page.
        response.raise_for_status()
        response.encoding = "utf-8"
        page = etree.HTML(response.text)

        cell = '//*[@id="pl_top_realtimehot"]/table/tbody/tr/td[2]'
        # The first link is skipped: presumably the pinned top entry, which
        # has no hot-number <span> -- TODO confirm against the live page.
        events = page.xpath(cell + '/a/text()')[1:]
        hrefs = ['https://s.weibo.com/' + link
                 for link in page.xpath(cell + '/a/@href')][1:]
        nums = page.xpath(cell + '/span/text()')

        (self.__event_name,
         self.__event_num,
         self.__event_href) = self._drop_blank_rows(events, nums, hrefs)

    def __build_frame(self):
        """Assemble the scraped buffers into a DataFrame with a 1-based rank.

        Raises:
            ValueError: If the buffers do not all have the same length.
        """
        return pd.DataFrame(data={
            'num': list(range(1, len(self.__event_num) + 1)),
            'event': self.__event_name,
            'hot_num': self.__event_num,
            'url': self.__event_href
        })

    def save_to_excel(self) -> None:
        """Scrape the ranking and write it to ``data/TIME<timestamp>.xlsx``.

        The ``data`` directory is created when it does not exist yet.
        """
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs('data', exist_ok=True)
        self.__get_event()
        data = self.__build_frame()
        table_name = 'TIME' + str(int(self.__time_now)) + '.xlsx'
        data.to_excel('data/' + table_name, index=False)

    def save_to_mysql(self, mysql_config: dict) -> None:
        """Scrape the ranking and store it in a new ``RESULT_<timestamp>`` table.

        Rows are inserted and committed one at a time; a row that fails is
        reported on stdout and rolled back, and the remaining rows are still
        attempted. The connection is always closed, even when table creation
        fails.

        Args:
            mysql_config: Keyword arguments forwarded to ``pymysql.connect``.
        """
        self.__get_event()
        data = self.__build_frame()

        # The table name is derived from an integer timestamp, never from
        # scraped text, so interpolating it into the statement is safe.
        table_name = str(int(self.__time_now))
        sql_order_create_table = f"""
        CREATE TABLE RESULT_{table_name}(
        NUM INT NOT NULL,
        EVENT CHAR(100),
        HOT CHAR(20),
        HREF CHAR(255)
        )
        """
        # Values come from an external web page: bind them as parameters
        # rather than formatting them into the SQL text.
        sql_order_insert = f"""
        INSERT INTO RESULT_{table_name}(
        NUM,EVENT,HOT,HREF)
        VALUES
        (%s, %s, %s, %s)
        """

        db = ps.connect(**mysql_config)
        try:
            with db.cursor() as cursor:
                cursor.execute(sql_order_create_table)
                rows = data.itertuples(index=False, name=None)
                for num, event, hot_num, url in rows:
                    # Plain int/str so the driver can escape the values
                    # regardless of NumPy / lxml wrapper types.
                    params = (int(num), str(event), str(hot_num), str(url))
                    try:
                        cursor.execute(sql_order_insert, params)
                        db.commit()
                    except Exception as e:
                        # Deliberately best-effort: one bad row must not
                        # abort the export of the remaining rows.
                        print(f'error:{e}')
                        db.rollback()
        finally:
            db.close()


if __name__ == '__main__':
    # Placeholder value: replace it with the cookie string to send to Weibo.
    cookie = 'your cookies'
    # Connection settings for the optional MySQL export shown below.
    mysql_config = dict(
        host='localhost',
        user='root',
        password='123456',
        database='weibohot',
    )
    spider = WeiboHot(cookie=cookie)
    spider.save_to_excel()
    # To store the ranking in MySQL instead, call:
    # spider.save_to_mysql(mysql_config=mysql_config)

你可能感兴趣的:(python,爬虫,开发语言)