Scraping historical trading data for selected stocks from Sina Finance with Python

This post is for learning and exchange only. Please bear with any errors or omissions, and everyone is welcome to study and discuss it together!

    • References (thanks!)
    • Preparation
      • Scraping approach
      • Module 1: scraping the table data from the page
      • Module 2: appending extra data rows
    • Source code (likely to be revised again soon...)
      • Scraping the past month of historical trading data
      • Scraping the past year of historical trading data

References (thanks!)

配角七三 — How to scrape a table from a web page:
https://zhuanlan.zhihu.com/p/33986020

Preparation

import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time
import random

Scraping approach

Find the page that holds the data, use the browser's developer tools to check the page URL, the request status, the page source and so on, and locate the element that contains the data. Then write the code: request the page with the relevant functions, collect the data, process it, and save it locally. (Apologies if some details are glossed over; I will find time to write a separate, fuller summary.)
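For example, a quick pre-flight check (a minimal sketch, based on the quarterly URL format and the table id "FundHoldSharesTable" used later in this post) can confirm the page responds and the target table is actually present in the HTML:

import requests
from bs4 import BeautifulSoup

# Fetch one quarterly quote page and confirm the data table exists.
url = ('http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/'
       'stockid/601006.phtml?year=2019&jidu=2')
res = requests.get(url)
print(res.status_code)        # expect 200
res.encoding = 'gbk'          # the page is GBK-encoded
soup = BeautifulSoup(res.text, 'lxml')
table = soup.find('table', {'id': 'FundHoldSharesTable'})
print(table is not None)      # True means the target table was located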

Module 1: scraping the table data from the page

def get_stock_table(stockcode, i):
    # Build the quarterly history URL: i is the quarter ("jidu") of 2019.
    url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/' + str(
        stockcode) + '.phtml?year=2019&jidu=' + str(i)
    print(url)
    res = requests.get(url)
    res.encoding = 'gbk'  # the page is GBK-encoded
    soup = BeautifulSoup(res.text, 'lxml')
    # The daily history lives in the table with id="FundHoldSharesTable".
    tables = soup.find_all('table', {'id': 'FundHoldSharesTable'})
    df_list = []
    for table in tables:
        df_list.append(pd.concat(pd.read_html(table.prettify())))
    df = pd.concat(df_list)
    # The first parsed row is the header row; promote it to column names.
    df.columns = df.iloc[0]
    headers = df.iloc[0]
    df = pd.DataFrame(df.values[1:], columns=headers)
    # print(len(df) - 1)   # number of data rows in df
    if len(df) - 1 < 22:
        # Fewer than ~22 trading days in this quarter so far:
        # top up with rows from the previous quarter.
        c = len(df) - 1
        df = add_stock_table(stockcode, i, c, df)
    else:
        # Keep roughly one month (~21 trading days) of rows.
        df = pd.DataFrame(df.values[1:22], columns=headers)
    df = df.reset_index(drop=True)
    df.to_excel('...\\' + str(stockcode) + '.xlsx')  # change '...' to your own folder
    # Sleep a random 1-10 s between requests to be gentle on the server.
    sleeptime = random.randint(1, 10)
    # print(sleeptime)
    time.sleep(sleeptime)

Sometimes the function above cannot get a full month of data from a single quarter page, so a second function is needed to append rows from the previous quarter. To collect a whole year, you would wrap the fetching in a loop over quarters instead.
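Just to illustrate that loop idea, here is a compact, self-contained sketch (the names fetch_quarter and collect_year are illustrative only, not part of the original code; the working version, get_stock_yeartable, appears in the source code section below):

import pandas as pd
import requests
from bs4 import BeautifulSoup

def fetch_quarter(stockcode, year, quarter):
    # Fetch one quarter's history table and return it as a DataFrame
    # with the header row promoted to column names.
    url = ('http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/'
           'stockid/{}.phtml?year={}&jidu={}'.format(stockcode, year, quarter))
    res = requests.get(url)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'lxml')
    table = soup.find('table', {'id': 'FundHoldSharesTable'})
    df = pd.read_html(table.prettify())[0]
    df.columns = df.iloc[0]
    return df.iloc[1:].reset_index(drop=True)

def collect_year(stockcode, year=2019, quarter=2, target_rows=250):
    # Walk backwards quarter by quarter until ~250 trading days are collected.
    frames, total = [], 0
    while total < target_rows:
        frames.append(fetch_quarter(stockcode, year, quarter))
        total = sum(len(f) for f in frames)
        quarter -= 1
        if quarter == 0:            # wrap around to Q4 of the previous year
            quarter, year = 4, year - 1
    return pd.concat(frames, ignore_index=True).head(target_rows)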

Module 2: appending extra data rows

def add_stock_table(stockcode, i, c, df):
    # Move back one quarter and fetch that quarter's history table.
    i = i - 1
    url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/' + str(stockcode) + '.phtml?year=2019&jidu=' + str(i)
    # print(url)
    res = requests.get(url)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'lxml')
    tables = soup.find_all('table', {'id': 'FundHoldSharesTable'})
    df_addlist = []
    for table in tables:
        df_addlist.append(pd.concat(pd.read_html(table.prettify())))
    df_add = pd.concat(df_addlist)
    headers = df_add.iloc[0]
    # Take just enough rows from the previous quarter to top the month up
    # (c is the number of rows already collected).
    df_add = pd.DataFrame(df_add.values[1:random.randint(20, 22) - c], columns=headers)
    # print(df_add)
    # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent here.
    df_sum = pd.concat([df, df_add])
    # print(df_sum)
    # print(len(df_sum) - 1)
    return df_sum
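A quick usage sketch for the two functions above (601006 is the first code in the stock list used later in this post): get_stock_table fetches the current quarter and, when fewer than about 22 trading days are available, it calls add_stock_table to top the month up from the previous quarter.

# Fetch roughly one month of daily history for stock 601006, starting from 2019 Q2;
# rows from Q1 are appended automatically if Q2 alone is too short.
get_stock_table('601006', 2)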

Remember: this post is for learning and exchange only. Please forgive any errors or omissions, and corrections are welcome! The author is fairly laid back (lazy), so fixes will come as and when.

Source code (likely to be revised again soon...)

Note:
The source code cannot be used as-is!
The .xlsx file paths must be changed!!
Adjust anything else to suit your own needs.
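One way to make the path change in a single place (an illustrative sketch, not part of the original code; the folder name is hypothetical) is to build every output path from one base directory:

import os

BASE_DIR = r'D:\stock_data'    # hypothetical folder: change to your own
os.makedirs(BASE_DIR, exist_ok=True)

def output_path(stockcode):
    # Build the .xlsx path that the rest of the script writes to.
    return os.path.join(BASE_DIR, str(stockcode) + '.xlsx')

# e.g. replace  df.to_excel('...\\' + str(stockcode) + '.xlsx')
#      with     df.to_excel(output_path(stockcode))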

Scraping the past month of historical trading data

from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
import time
import random


def get_stock_table(stockcode,i):
    url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/' + str(
        stockcode) + '.phtml?year=2019&jidu=' + str(i)
    print(url)
    res = requests.get(url)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'lxml')
    tables = soup.find_all('table', {'id': 'FundHoldSharesTable'})
    df_list = []
    for table in tables:
        df_list.append(pd.concat(pd.read_html(table.prettify())))
    df = pd.concat(df_list)
    df.columns = df.iloc[0]
    headers = df.iloc[0]
    df = pd.DataFrame(df.values[1:], columns=headers)
    #print(len(df) - 1)   #df中有几行数据
    if len(df) - 1 < 22:
        c = len(df) - 1
        df = add_stock_table(stockcode, i, c, df)
    else:
        df = pd.DataFrame(df.values[1:22], columns=headers)
    df = df.reset_index(drop=True)
    df.to_excel('...\\' + str(stockcode) + '.xlsx')
    sleeptime = random.randint(1, 10)
    #print(sleeptime)
    time.sleep(sleeptime)


def add_stock_table(stockcode,i,c,df):
    i = i - 1
    url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/' + str(stockcode) + '.phtml?year=2019&jidu=' + str(i)
    #print(url)
    res = requests.get(url)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'lxml')
    tables = soup.find_all('table', {'id': 'FundHoldSharesTable'})
    df_addlist = []
    for table in tables:
        df_addlist.append(pd.concat(pd.read_html(table.prettify())))
    df_add = pd.concat(df_addlist)
    headers = df_add.iloc[0]
    df_add = pd.DataFrame(df_add.values[1:random.randint(20, 22)-c], columns=headers)
    #print(df_add)
    df_sum = pd.concat([df, df_add])  # DataFrame.append was removed in pandas 2.0
    #print(df_sum)
    #print(len(df_sum)-1)
    return df_sum






if __name__ == "__main__":
    # Remove a stale output file from a previous run, if any.
    if os.path.exists("...\\601006.xlsx"):
        os.remove("...\\601006.xlsx")

    stockcode = ['601006', '000046', '601398', '000069', '601939', '000402',
                 '000001', '000089', '000027', '399001', '000002', '000800',
                 '601111', '600050', '601600', '600028', '601857', '601988',
                 '000951', '601919']

    i = 2  # quarter ("jidu") of 2019 to fetch
    index = 1
    print("Scraping month_stock data...\n")
    print("---------------\n")
    print("Please be patient...\n")
    for x in stockcode:
        print(index)
        get_stock_table(x, i)
        index += 1

Scraping the past year of historical trading data

from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
import time
import random


def get_stock_yeartable(stockcode, s, y):
    # s is the starting quarter ("jidu"), y is the starting year.
    url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/' + str(
        stockcode) + '/type/S.phtml?year=' + str(y) + '&jidu=' + str(s)
    print(url)
    res = requests.get(url)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'lxml')
    tables = soup.find_all('table', {'id': 'FundHoldSharesTable'})
    df_list = []
    for table in tables:
        df_list.append(pd.concat(pd.read_html(table.prettify())))
    df = pd.concat(df_list)
    # The first parsed row is the header row; promote it to column names.
    df.columns = df.iloc[0]
    headers = df.iloc[0]
    df = pd.DataFrame(df.values[1:], columns=headers)
    # print(len(df) - 1)   # number of data rows in df
    # Keep walking backwards quarter by quarter (and then year by year)
    # until roughly a year (~250 trading days) of rows has been collected.
    while len(df) < 250:
        s -= 1
        while s > 0:
            df = add_stock_table(stockcode, s, y, df)
            s -= 1
        s = 5
        y -= 1
    df = df.reset_index(drop=True)
    # Trim to roughly 250 rows before writing the result.
    df = pd.DataFrame(df.values[1:250], columns=headers)
    df.to_excel('D:\\Workplace\\PyCharm\\MySpider\\sh' + str(stockcode) + '.xlsx')
    # Sleep a random 1-10 s between requests to be gentle on the server.
    sleeptime = random.randint(1, 10)
    # print(sleeptime)
    time.sleep(sleeptime)


def add_stock_table(stockcode, s, y, df):
    # Fetch quarter s of year y and append it to the rows collected so far.
    print(y, "-", s)
    url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/' + str(
        stockcode) + '/type/S.phtml?year=' + str(y) + '&jidu=' + str(s)
    print(url)
    res = requests.get(url)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'lxml')
    tables = soup.find_all('table', {'id': 'FundHoldSharesTable'})
    df_addlist = []
    for table in tables:
        df_addlist.append(pd.concat(pd.read_html(table.prettify())))
    df_add = pd.concat(df_addlist)
    headers = df_add.iloc[0]
    df_add = pd.DataFrame(df_add.values[1:], columns=headers)
    # print(df_add)
    # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent here.
    df_sum = pd.concat([df, df_add])
    # print(df_sum)
    # print(len(df_sum) - 1)
    return df_sum


if __name__ == "__main__":
    # Remove a stale output file from a previous run, if any.
    if os.path.exists("D:\\Workplace\\PyCharm\\MySpider\\sh000001.xlsx"):
        os.remove("D:\\Workplace\\PyCharm\\MySpider\\sh000001.xlsx")

    stockcode = ['000001']
    s = 2      # starting quarter ("jidu")
    y = 2019   # starting year
    index = 1
    print("Scraping year_sh_stock data...\n")
    print("---------------\n")
    print("Please be patient...\n")
    for x in stockcode:
        print(index)
        get_stock_yeartable(x, s, y)
        index += 1
