获取实时的沪深京 A 股 Top40 股票数据,按照涨幅倒序排列,并输出到 Excel 表格,其中股票数据包含:
最终的结果如下:
beautifulsoup4==4.12.2
selenium==4.16.0
pandas~=2.1.4
其中依赖的 ChromeDriver 可以参考这篇文章下载。之所以要求安装 Chrome 浏览器,是因为有些由 JS 动态渲染的内容需要通过浏览器模拟才能拿到。
如果觉得麻烦,我这里上传了完整项目资源,可以点击这里下载: 《爬取此刻Top40的股票数据》
import os
from datetime import datetime
import pandas as pd
from selenium.webdriver.common.by import By
from spider import Spider
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
# Base directory for bundled tools and generated output: the parent of the
# directory that contains this file.
_module_dir = os.path.dirname(__file__)
root_path = os.path.dirname(_module_dir)
class YourTrader(Spider):
    """Scrape the first two pages of a stock ranking table and export them to Excel.

    The page is loaded in a headless Chrome session so that the table, which
    is filled in by JavaScript, is present in the HTML handed to BeautifulSoup.
    """

    def __init__(self, url):
        """Start a headless Chrome session for scraping ``url``.

        Args:
            url: Address of the ranking page; forwarded to ``Spider.__init__``.
        """
        super().__init__(url)
        option = webdriver.ChromeOptions()
        option.add_argument('headless')
        # Create the browser object through the ChromeDriver binary kept under
        # <root_path>/tools/.
        service = Service('{}/tools/chromedriver-mac-x64/chromedriver'.format(root_path))
        self.driver = webdriver.Chrome(service=service, options=option)

    def close(self):
        """Shut down the Chrome session started in ``__init__``."""
        self.driver.quit()

    def get_html(self):
        """Navigate the browser to ``self.url`` and return the page source."""
        # NOTE(review): ``self.url`` is presumably set by ``Spider.__init__`` -- confirm.
        self.driver.get(self.url)  # same as entering the URL in the address bar
        return self.driver.page_source

    def get_top_20_stocks(self):
        """Parse the ranking table from the page returned by ``get_html()``.

        Returns:
            A ``(columns, rows)`` tuple. ``columns`` is the list of header
            cell texts; ``rows`` holds one list of cell texts per ``<tr>`` of
            the table body.

        Raises:
            RuntimeError: If the table, its ``<thead>`` or its ``<tbody>`` is
                missing from the page source.
        """
        # The 'html5lib' parser requires the html5lib package to be installed.
        soup = BeautifulSoup(self.get_html(), 'html5lib')

        # Locate the ranking table and its two sections.
        table = soup.find('table', attrs={'id': 'table_wrapper-table'})
        if table is None:
            raise RuntimeError(
                "ranking table 'table_wrapper-table' not found in page source")
        table_thead = table.find('thead')
        table_tbody = table.find('tbody')
        if table_thead is None or table_tbody is None:
            raise RuntimeError(
                "ranking table 'table_wrapper-table' has no thead/tbody section")

        # Header texts, collected across every header row.
        columns = []
        for row in table_thead.find_all('tr'):
            columns.extend(col.text.strip() for col in row.find_all(['th']))

        # One list of stripped cell texts per body row.
        top_20_stocks = [
            [col.text.strip() for col in row.find_all(['td'])]
            for row in table_tbody.find_all('tr')
        ]
        return columns, top_20_stocks

    def do_biz(self):
        """Collect pages 1 and 2 of the ranking table and write them to an ``.xlsx`` file.

        The file goes to ``<root_path>/biz/top_40/top_40_<timestamp>.xlsx``;
        the directory is created when it does not exist yet. If no page-2
        button is found, a warning is issued and only the first page is
        written (previously this case failed with ``TypeError``).
        """
        import warnings

        columns, top_20_stocks = self.get_top_20_stocks()

        # Default to "no second page" so the concatenation below never sees None.
        top_20_40_stocks = []
        for btn in self.driver.find_elements(by=By.CLASS_NAME, value='paginate_button'):
            if btn.get_attribute('data-index') != "2":
                continue
            btn.click()
            # NOTE(review): get_top_20_stocks() calls get_html(), which issues
            # driver.get(self.url) again after the click -- confirm this keeps
            # page 2 displayed rather than loading page 1 a second time.
            _, top_20_40_stocks = self.get_top_20_stocks()
            break
        else:
            warnings.warn(
                "pagination button for page 2 not found; exporting first page only")

        top_40_stocks = top_20_stocks + top_20_40_stocks

        # NOTE(review): ':' in the timestamp is not a valid filename character
        # on Windows.
        out_path = '{0}/biz/top_40/top_40_{1}.xlsx'.format(
            root_path, datetime.now().strftime('%Y-%m-%d-%H:%M:%S'))
        # to_excel() does not create missing parent directories.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

        df = pd.DataFrame(top_40_stocks, columns=columns)
        df.to_excel(out_path, index=False)
if __name__ == '__main__':
    # Scrape the Shanghai/Shenzhen/Beijing A-share board and export it.
    your_trader = YourTrader('https://quote.eastmoney.com/center/gridlist.html#hs_a_board')
    try:
        your_trader.do_biz()
    finally:
        # Always end the browser session, even when scraping fails; otherwise
        # the headless Chrome/ChromeDriver processes are left running.
        your_trader.driver.quit()
欢迎关注 wx 公众号:一个比特定乾坤
时不时发布硬核的技术文章!