一、根据已经加载好的网页(完整 HTML)来解析数据
# 微博热搜-网页获取数据
xuhao = 1 weibo = "https://s.weibo.com/top/summary?cate=realtimehot" header = { 'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Mobile Safari/537.36', 'Host': 's.weibo.com', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh-Hans;q=0.9', 'Accept-Encoding': 'gzip, deflate, br', 'Cookie': 'SINAGLOBAL=629307801125.1562.1647222894281; MEIQIA_TRACK_ID=270PU8S5yKqEfpAVWoDc06H6jyD; MEIQIA_VISIT_ID=270PU8zsVljRWyEBancxQrgvmYm; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWdaWTB3sRF6jd1XJfs0ZqD5JpX5KMhUgL.FozReK-XSo5XS0B2dJLoIppiC-LidspjdspjC-LiC-LiC-LiCgpjdspjdspj; UOR=,,login.sina.com.cn; ULV=1656646122000:68:2:8:1116980463544.4568.1656646121912:1656637191583; PC_TOKEN=180e205e98; ALF=1688431393; SSOLoginState=1656895394; SCF=AuBeGDPF_iwvKhM6y9Sg1l4czlcP34cIcTmRzZVkVT3XGUH0LYyg1_Cy51PWmYxV-WxitwqfJFkf_r6KtbvKRQM.; SUB=_2A25PxkfzDeRhGeRG6lcV9i7IzDiIHXVssj47rDV8PUNbmtAKLXTnkW9NUiglTzvBtqLbIN6bsZuRpc9iMTdo4MDQ' } html = requests.get(weibo, headers=header) html.encoding = 'utf-8' soup = BeautifulSoup(html.text, "html.parser") items = soup.find('section', {'class': 'list'}) # 根据节点 try: list_ = items.find_all('li') except (ValueError, ArithmeticError): print("程序发生了数字格式异常、算术异常之一") list_ = [] count = 0 with open(r'' + address + filename, 'a', encoding='utf-8-sig') as f: for n in list_: src = n.find('a').get('href') title = n.find('a').get_text() title = str.replace(title, "\n", '') f.write("{},{},{},{}\n".format(xuhao, title, '', 'https://s.weibo.com'+src)) count = count + 1 xuhao = xuhao + 1 f.close() print('微博热搜,' + str(count) + '篇')
二、根据http请求来,这种需要f12去看网页的请求和参数是什么
# 头条健康-网页获取数据
# Fetch the Toutiao "health" PC feed (JSON API), and append one
# "index,title,,url" CSV row per article to the same output file.
# NOTE(review): `headers`, `address`, `filename`, and the running `xuhao`
# counter are defined earlier in the full script — confirm before reuse.
url_toutiao = 'https://www.toutiao.com/api/pc/feed/?category=news_regimen'
result = requests.get(url_toutiao, headers=headers)
result.encoding = 'utf-8'
# The response body is JSON; the BeautifulSoup round-trip is kept because the
# original relied on it (it can unescape HTML entities in the payload).
soup = BeautifulSoup(result.text, "html.parser")
temp = str(soup)
res = json.loads(temp)  # json数据
li = res['data']
# BUG FIX: the original started index at 1 and incremented it once per item,
# so the final report counted one article too many. Start at 0 so the printed
# total equals the number of rows actually written.
index = 0
with open(address + filename, 'a', encoding='utf-8-sig') as f:
    for item in li:
        src = 'www.toutiao.com' + item['source_url']
        f.write("{},{},{},{}\n".format(xuhao, item['title'], '', src))
        index += 1
        xuhao = xuhao + 1
print('头条,' + str(index) + '篇')
最后,以下是需要导入的包,缺少的请先用 pip install 安装:
import requests import json from openpyxl.styles import Alignment from openpyxl.utils import get_column_letter from bs4 import BeautifulSoup import os from openpyxl import Workbook from openpyxl import load_workbook
注意事项:微博请求头中的 Cookie 会过期,失效后需登录微博重新抓取并替换。
将Python打包成exe命令:
pyinstaller -F D:\\py_workplace\\python_example\\news20220623.py -p D:\\py_workplace\\python_example\\venv\\Lib\\site-packages