# -*- coding: utf-8 -*-
# Author: @EM
import time
import requests
from lxml import etree
import pandas as pd
import pymysql as ps
import datetime
import os
class WeiboHot(object):
    """Scraper for the Weibo real-time hot-search ranking page.

    An instance downloads the ranking page with the supplied cookie, extracts
    each entry's title, link and hot number, and can persist the result either
    as an Excel workbook (``save_to_excel``) or as a new MySQL table
    (``save_to_mysql``).

    The timestamp captured when the instance is created names every output it
    produces, so repeated exports from one instance target the same file or
    table name.
    """

    # Directory (relative to the working directory) receiving Excel exports.
    _OUTPUT_DIR = 'data'
    # Seconds to wait for the HTTP request before giving up, so an
    # unresponsive server cannot hang the scraper forever.
    _REQUEST_TIMEOUT = 10

    def __init__(self, cookie: str):
        """Store the request cookie and initialise empty result lists.

        Args:
            cookie: Value sent verbatim as the ``cookie`` request header.
        """
        self.__url = 'https://s.weibo.com/top/summary?cate=realtimehot'
        self.__cookie = cookie
        self.__event_name = []
        self.__event_href = []
        self.__event_num = []
        # Captured once; used to name the Excel file / MySQL table.
        self.__time_now = datetime.datetime.now().timestamp()

    def __str__(self) -> str:
        """Return a short human-readable label for the scraper."""
        return '微博热搜爬虫'

    @property
    def check_cookie(self) -> str:
        """The cookie string this instance sends with its request."""
        return self.__cookie

    @property
    def check_url(self) -> str:
        """The URL of the ranking page this instance scrapes."""
        return self.__url

    def __auto_get_cookie(self) -> None:
        """Placeholder for automatic cookie retrieval (not implemented)."""
        return None

    @staticmethod
    def _drop_blank_row(events: list, nums: list, hrefs: list) -> tuple:
        """Remove the first entry whose hot number is a single space.

        Args:
            events: Entry titles.
            nums: Hot-number strings, positionally aligned with ``events``.
            hrefs: Entry links, positionally aligned with ``events``.

        Returns:
            A ``(events, nums, hrefs)`` tuple. When ``nums`` contains ``' '``
            the element at that position is removed from all three lists;
            otherwise the three inputs are returned unchanged.
        """
        try:
            blank = nums.index(' ')
        except ValueError:
            # No blank hot number on the page: nothing to filter out.
            return events, nums, hrefs
        return (
            events[:blank] + events[blank + 1:],
            nums[:blank] + nums[blank + 1:],
            hrefs[:blank] + hrefs[blank + 1:],
        )

    def __get_event(self) -> None:
        """Download the ranking page and fill the title/number/link lists.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36",
            'cookie': self.__cookie
        }
        response = requests.get(
            self.__url, headers=headers, timeout=self._REQUEST_TIMEOUT
        )
        response.encoding = "utf-8"
        tree = etree.HTML(response.text)

        cell = '//*[@id="pl_top_realtimehot"]/table/tbody/tr/td[2]'
        # The first anchor is skipped for both titles and links; the hot
        # numbers are not sliced. NOTE(review): presumably the first row is a
        # pinned entry without a hot-number span - confirm against the page.
        events = tree.xpath(cell + '/a/text()')[1:]
        hrefs = [
            'https://s.weibo.com/' + link
            for link in tree.xpath(cell + '/a/@href')
        ][1:]
        nums = tree.xpath(cell + '/span/text()')

        (
            self.__event_name,
            self.__event_num,
            self.__event_href,
        ) = self._drop_blank_row(events, nums, hrefs)

    def __build_frame(self) -> pd.DataFrame:
        """Assemble the scraped lists into a table with a 1-based rank column.

        Returns:
            A DataFrame with columns ``num``, ``event``, ``hot_num``, ``url``.
        """
        return pd.DataFrame(data={
            'num': list(range(1, len(self.__event_num) + 1)),
            'event': self.__event_name,
            'hot_num': self.__event_num,
            'url': self.__event_href
        })

    def save_to_excel(self) -> None:
        """Scrape the ranking and write it to ``data/TIME<timestamp>.xlsx``.

        The ``data`` directory is created when it does not exist yet.
        """
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self._OUTPUT_DIR, exist_ok=True)
        self.__get_event()
        data = self.__build_frame()
        table_name = 'TIME' + str(int(self.__time_now)) + '.xlsx'
        data.to_excel(os.path.join(self._OUTPUT_DIR, table_name), index=False)

    def save_to_mysql(self, mysql_config: dict) -> None:
        """Scrape the ranking and store it in a new ``RESULT_<timestamp>`` table.

        Rows are inserted and committed one at a time; a row that fails is
        reported on stdout and rolled back without aborting the remaining
        rows.

        Args:
            mysql_config: Keyword arguments forwarded to ``pymysql.connect``.
        """
        self.__get_event()
        data = self.__build_frame()

        # Digits only (integer timestamp), so safe to embed as an identifier.
        table_name = str(int(self.__time_now))
        sql_order_create_table = f"""
        CREATE TABLE RESULT_{table_name}(
        NUM INT NOT NULL,
        EVENT CHAR(100),
        HOT CHAR(20),
        HREF CHAR(255)
        )
        """
        # Values are bound as parameters rather than formatted into the SQL
        # text: the titles come from a remote page and may contain quotes.
        sql_order_insert = (
            f"INSERT INTO RESULT_{table_name}(NUM,EVENT,HOT,HREF) "
            "VALUES (%s, %s, %s, %s)"
        )

        db = ps.connect(**mysql_config)
        try:
            with db.cursor() as cursor:
                cursor.execute(sql_order_create_table)
                for rank, event, hot, url in data.itertuples(
                        index=False, name=None):
                    # Normalise to plain built-in types before handing the
                    # values to the database driver.
                    row = (int(rank), str(event), str(hot), str(url))
                    try:
                        cursor.execute(sql_order_insert, row)
                        db.commit()
                    except Exception as e:
                        # Best effort: report the bad row and keep going.
                        print(f'error:{e}')
                        db.rollback()
        finally:
            # Always release the connection, even when table creation fails.
            db.close()
if __name__ == '__main__':
    # Script entry point: scrape once and export the ranking to Excel.

    # Placeholder - replace with a real cookie string before running.
    cookie = 'your cookies'

    # Connection settings for the optional MySQL export below.
    mysql_config = dict(
        host='localhost',
        user='root',
        password='123456',
        database='weibohot',
    )

    hot = WeiboHot(cookie=cookie)
    hot.save_to_excel()
    # Uncomment to store the ranking in MySQL as well:
    # hot.save_to_mysql(mysql_config=mysql_config)