Python爬网页数据的两种方式,并存到Excel

一、根据已经加载好的网页 HTML 来解析数据

# 微博热搜-网页获取数据
# Weibo hot search — fetch the rendered page and parse the hot list out of the HTML.
# Writes one CSV row per entry: sequence number, title, (empty column), absolute URL.
# NOTE(review): `address` and `filename` are defined elsewhere in the file.
xuhao = 1
weibo = "https://s.weibo.com/top/summary?cate=realtimehot"
header = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Mobile Safari/537.36',
    'Host': 's.weibo.com',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh-Hans;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Cookie': 'SINAGLOBAL=629307801125.1562.1647222894281; MEIQIA_TRACK_ID=270PU8S5yKqEfpAVWoDc06H6jyD; MEIQIA_VISIT_ID=270PU8zsVljRWyEBancxQrgvmYm; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWdaWTB3sRF6jd1XJfs0ZqD5JpX5KMhUgL.FozReK-XSo5XS0B2dJLoIppiC-LidspjdspjC-LiC-LiC-LiCgpjdspjdspj; UOR=,,login.sina.com.cn; ULV=1656646122000:68:2:8:1116980463544.4568.1656646121912:1656637191583; PC_TOKEN=180e205e98; ALF=1688431393; SSOLoginState=1656895394; SCF=AuBeGDPF_iwvKhM6y9Sg1l4czlcP34cIcTmRzZVkVT3XGUH0LYyg1_Cy51PWmYxV-WxitwqfJFkf_r6KtbvKRQM.; SUB=_2A25PxkfzDeRhGeRG6lcV9i7IzDiIHXVssj47rDV8PUNbmtAKLXTnkW9NUiglTzvBtqLbIN6bsZuRpc9iMTdo4MDQ'
}
html = requests.get(weibo, headers=header)
html.encoding = 'utf-8'
soup = BeautifulSoup(html.text, "html.parser")
items = soup.find('section', {'class': 'list'})  # hot-search list container node
try:
    # When the selector misses (layout change / expired Cookie), `items` is None
    # and this raises AttributeError — NOT ValueError/ArithmeticError as the
    # original except clause assumed.
    list_ = items.find_all('li')
except AttributeError:
    print("未找到热搜列表节点(页面结构变化或Cookie失效)")
    list_ = []
count = 0
with open(r'' + address + filename, 'a', encoding='utf-8-sig') as f:
    for n in list_:
        a = n.find('a')
        src = a.get('href')
        title = a.get_text().replace("\n", '')  # titles may contain embedded newlines
        f.write("{},{},{},{}\n".format(xuhao, title, '', 'https://s.weibo.com' + src))
        count += 1
        xuhao += 1
    # no explicit close: the with-statement closes the file on exit
print('微博热搜,' + str(count) + '篇')

二、根据 HTTP 接口请求来获取。这种方式需要按 F12 打开浏览器开发者工具,查看网页实际发出的请求地址和参数是什么。

# Toutiao health feed — call the JSON API directly and append rows to the same CSV.
# NOTE(review): `headers` is not defined in this snippet (the weibo section defines
# `header`) — presumably the full file defines it; verify before running.
url_toutiao = 'https://www.toutiao.com/api/pc/feed/?category=news_regimen'
result = requests.get(url_toutiao, headers=headers)
result.encoding = 'utf-8'
# The endpoint returns JSON, so parse the body directly instead of the original
# BeautifulSoup -> str -> json.loads round-trip, which added nothing.
res = json.loads(result.text)
li = res['data']
index = 0  # was 1 in the original, which made the final count print one too many
with open(r'' + address + filename, 'a', encoding='utf-8-sig') as f:
    # f.write("{}\n".format('头条'))
    for item in li:
        src = 'www.toutiao.com' + item['source_url']
        f.write("{},{},{},{}\n".format(xuhao, item['title'], '', src))
        index += 1
        xuhao = xuhao + 1
print('头条,' + str(index) + '篇')

最后,是脚本需要导入的包,缺少的请先用 pip install 安装。

import requests
import json
from openpyxl.styles import Alignment
from openpyxl.utils import get_column_letter
from bs4 import BeautifulSoup
import os
from openpyxl import Workbook
from openpyxl import load_workbook

注意事项:请求头里的 Cookie 有有效期,失效后需要换成自己登录微博后从浏览器复制的 Cookie,否则抓取不到数据。

将Python打包成exe命令:

pyinstaller -F D:\\py_workplace\\python_example\\news20220623.py  -p D:\\py_workplace\\python_example\\venv\\Lib\\site-packages

你可能感兴趣的:(python,开发语言)