import requests
from bs4 import BeautifulSoup
import pandas as pd
# Parse the page and extract the NetEase News Top 10 hot-ranking entries
def get_news_info(url, headers):
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    content = response.text
    # print(content)
    soup = BeautifulSoup(content, 'lxml')  # Create a BeautifulSoup object to parse the page body
    all_news = soup.find('div', class_='mod_hot_rank').find('ul').find_all('li')  # Grab the Top 10 hot-ranking entries
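    # The selectors above assume the hot-ranking markup looks roughly like
    # <div class="mod_hot_rank"><ul><li><em>rank</em><a href="...">title</a><span>posts</span></li>...</ul></div>;
    # if NetEase changes the page layout, the class name and tags here will need updating.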
    news_list = []  # Empty list to collect the entries
    for news in all_news:
        # print(news)
        news_rank = news.find('em').get_text()  # News rank
        news_title = news.find('a').get_text()  # News title
        # print(news_title)
        posts_num = news.find('span').get_text()  # Number of follow-up posts
        # print(posts_num)
        news_url = news.a['href']  # News link (value of the href attribute)
        # print(news_url)
        news_list.append([news_rank, news_title, posts_num, news_url])  # Collect each entry's rank, title, post count and link as one row
    return news_list
def save_as_csv(news_list):
    df = pd.DataFrame(news_list)
    df.to_csv('/home/qingjiao/news.csv', index=False, header=False)
    print('CSV file written')
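# Note: the rows are saved without a header row. If labeled columns are preferred, the
# DataFrame could instead be built as pd.DataFrame(news_list, columns=['rank', 'title', 'posts', 'url'])
# (hypothetical column names) and written with header=True.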
# Program entry point
if __name__ == '__main__':
    url = 'https://news.163.com/'  # NetEase News homepage
    # Define the request headers
    headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'
    }
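    # Sending a browser-like User-Agent makes the request look like an ordinary browser visit;
    # without it, some sites return reduced content or reject the request outright.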
    news_list = get_news_info(url, headers)  # Call the scraper to get the NetEase News Top 10 hot ranking
    print(news_list)  # Print the Top 10 entries
    save_as_csv(news_list)