配角七三—如何抓取网页中的表格:
https://zhuanlan.zhihu.com/p/33986020
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time
import random
找到数据所在的网页,利用开发者工具查看网页 URL、请求状态、源代码等,然后定位数据元素。随后进行编程:利用相关函数模拟访问网页,采集数据,加以处理,并保存至本地。(细节之处若有不到位,还请见谅,博主会再找时间另外总结。)
def get_stock_table(stockcode, i):
    """Scrape quarter `i` of the 2019 daily-quote table for `stockcode`
    from Sina Finance and save roughly one month of rows to an .xlsx file.

    stockcode: stock code (str or int), e.g. '601006'.
    i: quarter number (1-4), passed through as the `jidu` URL parameter.

    Side effects: one HTTP GET to Sina (more via add_stock_table when the
    quarter is short), writes '<stockcode>.xlsx', then sleeps a random
    1-10 s to throttle requests.
    """
    url = ('http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/'
           + str(stockcode) + '.phtml?year=2019&jidu=' + str(i))
    print(url)
    res = requests.get(url)
    res.encoding = 'gbk'  # Sina serves this page GBK-encoded
    soup = BeautifulSoup(res.text, 'lxml')
    tables = soup.find_all('table', {'id': 'FundHoldSharesTable'})
    # Passing literal HTML to read_html is deprecated and removed in
    # pandas 2.x -- wrap the markup in StringIO instead.
    from io import StringIO
    df_list = [pd.concat(pd.read_html(StringIO(table.prettify())))
               for table in tables]
    df = pd.concat(df_list)
    # The first scraped row holds the column names; promote it to headers
    # and drop it from the data.
    headers = df.iloc[0]
    df = pd.DataFrame(df.values[1:], columns=headers)
    if len(df) - 1 < 22:
        # Fewer than ~22 trading days in this quarter: top up with rows
        # pulled from the previous quarter.
        c = len(df) - 1
        df = add_stock_table(stockcode, i, c, df)
    else:
        df = pd.DataFrame(df.values[1:22], columns=headers)
    df = df.reset_index(drop=True)
    # NOTE(review): '...' is a placeholder -- set a real output directory.
    df.to_excel('...\\' + str(stockcode) + '.xlsx')
    time.sleep(random.randint(1, 10))  # polite random delay between requests
不过,以上函数有时获取不到完整一个月的数据,因此还要再加一个函数,用以补充数据。如果要获取一整年的数据,则需要再加一个循环。
def add_stock_table(stockcode, i, c, df):
    """Fetch quarter `i - 1` of 2019 and append its rows to `df` so the
    result covers roughly one month of trading days.

    c: number of data rows `df` already holds (caller passes len(df) - 1).
    Returns the concatenated DataFrame.
    """
    i = i - 1  # step back one quarter
    url = ('http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/'
           + str(stockcode) + '.phtml?year=2019&jidu=' + str(i))
    res = requests.get(url)
    res.encoding = 'gbk'  # page is GBK-encoded
    soup = BeautifulSoup(res.text, 'lxml')
    tables = soup.find_all('table', {'id': 'FundHoldSharesTable'})
    # StringIO wrapper: literal-HTML input to read_html was removed in
    # pandas 2.x.
    from io import StringIO
    df_addlist = [pd.concat(pd.read_html(StringIO(table.prettify())))
                  for table in tables]
    df_add = pd.concat(df_addlist)
    headers = df_add.iloc[0]
    # Keep 20..22-c rows at random (the original author's fuzz factor) so
    # the combined frame lands near 22 trading days; row 0 is the header.
    df_add = pd.DataFrame(df_add.values[1:random.randint(20, 22) - c],
                          columns=headers)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the drop-in
    # replacement with the same default index behaviour.
    return pd.concat([df, df_add])
谨记!本文仅供学习交流,如有错误纰漏,还请原谅,欢迎指教!博主较佛(懒),随缘修改!
注意:
源代码无法直接套用!
.xlsx文件的路径需要修改!!
其他的根据自己的需要做变更。
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
import time
import random
def get_stock_table(stockcode, i):
    """Scrape quarter `i` of the 2019 daily-quote table for `stockcode`
    from Sina Finance and save roughly one month of rows to an .xlsx file.

    stockcode: stock code (str or int), e.g. '601006'.
    i: quarter number (1-4), passed through as the `jidu` URL parameter.

    Side effects: one HTTP GET to Sina (more via add_stock_table when the
    quarter is short), writes '<stockcode>.xlsx', then sleeps a random
    1-10 s to throttle requests.
    """
    url = ('http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/'
           + str(stockcode) + '.phtml?year=2019&jidu=' + str(i))
    print(url)
    res = requests.get(url)
    res.encoding = 'gbk'  # Sina serves this page GBK-encoded
    soup = BeautifulSoup(res.text, 'lxml')
    tables = soup.find_all('table', {'id': 'FundHoldSharesTable'})
    # Passing literal HTML to read_html is deprecated and removed in
    # pandas 2.x -- wrap the markup in StringIO instead.
    from io import StringIO
    df_list = [pd.concat(pd.read_html(StringIO(table.prettify())))
               for table in tables]
    df = pd.concat(df_list)
    # The first scraped row holds the column names; promote it to headers
    # and drop it from the data.
    headers = df.iloc[0]
    df = pd.DataFrame(df.values[1:], columns=headers)
    if len(df) - 1 < 22:
        # Fewer than ~22 trading days in this quarter: top up with rows
        # pulled from the previous quarter.
        c = len(df) - 1
        df = add_stock_table(stockcode, i, c, df)
    else:
        df = pd.DataFrame(df.values[1:22], columns=headers)
    df = df.reset_index(drop=True)
    # NOTE(review): '...' is a placeholder -- set a real output directory.
    df.to_excel('...\\' + str(stockcode) + '.xlsx')
    time.sleep(random.randint(1, 10))  # polite random delay between requests
def add_stock_table(stockcode, i, c, df):
    """Fetch quarter `i - 1` of 2019 and append its rows to `df` so the
    result covers roughly one month of trading days.

    c: number of data rows `df` already holds (caller passes len(df) - 1).
    Returns the concatenated DataFrame.
    """
    i = i - 1  # step back one quarter
    url = ('http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/'
           + str(stockcode) + '.phtml?year=2019&jidu=' + str(i))
    res = requests.get(url)
    res.encoding = 'gbk'  # page is GBK-encoded
    soup = BeautifulSoup(res.text, 'lxml')
    tables = soup.find_all('table', {'id': 'FundHoldSharesTable'})
    # StringIO wrapper: literal-HTML input to read_html was removed in
    # pandas 2.x.
    from io import StringIO
    df_addlist = [pd.concat(pd.read_html(StringIO(table.prettify())))
                  for table in tables]
    df_add = pd.concat(df_addlist)
    headers = df_add.iloc[0]
    # Keep 20..22-c rows at random (the original author's fuzz factor) so
    # the combined frame lands near 22 trading days; row 0 is the header.
    df_add = pd.DataFrame(df_add.values[1:random.randint(20, 22) - c],
                          columns=headers)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the drop-in
    # replacement with the same default index behaviour.
    return pd.concat([df, df_add])
if __name__ == "__main__":
    # Remove stale output from a previous run before re-scraping.
    # NOTE(review): '...' is a placeholder path -- point it at the real
    # output directory before running.
    if os.path.exists("...\\601006.xlsx"):  # never compare with `== True`
        os.remove("...\\601006.xlsx")
    stockcode = ['601006', '000046', '601398', '000069', '601939', '000402',
                 '000001', '000089', '000027', '399001', '000002', '000800',
                 '601111', '600050', '601600', '600028', '601857', '601988',
                 '000951', '601919']
    i = 2  # quarter to scrape
    print("正在爬取month_stock信息...\n")
    print("---------------\n")
    print("请耐心等待...\n")
    # enumerate replaces the hand-rolled `index` counter.
    for index, x in enumerate(stockcode, start=1):
        print(index)
        get_stock_table(x, i)
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
import time
import random
def get_stock_yeartable(stockcode, s, y):
    """Scrape daily quotes backwards from quarter `s` of year `y` until at
    least 250 rows (~one trading year) are collected, then save to .xlsx.

    stockcode: index code, e.g. '000001' (the /type/S pages).
    s: starting quarter (1-4); y: starting year.

    Side effects: several HTTP GETs, writes 'sh<stockcode>.xlsx', then
    sleeps a random 1-10 s to throttle requests.
    """
    url = ('http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/'
           + str(stockcode) + '/type/S.phtml?year=' + str(y) + '&jidu=' + str(s))
    print(url)
    res = requests.get(url)
    res.encoding = 'gbk'  # page is GBK-encoded
    soup = BeautifulSoup(res.text, 'lxml')
    tables = soup.find_all('table', {'id': 'FundHoldSharesTable'})
    # Passing literal HTML to read_html is deprecated and removed in
    # pandas 2.x -- wrap the markup in StringIO instead.
    from io import StringIO
    df_list = [pd.concat(pd.read_html(StringIO(table.prettify())))
               for table in tables]
    df = pd.concat(df_list)
    headers = df.iloc[0]  # first scraped row holds the column names
    df = pd.DataFrame(df.values[1:], columns=headers)
    # Walk backwards quarter by quarter (and then year by year) until at
    # least 250 rows are accumulated.
    while len(df) < 250:
        s -= 1
        while s > 0:
            df = add_stock_table(stockcode, s, y, df)
            s -= 1
        s = 5  # reset so the next outer pass starts at quarter 4
        y -= 1
    df = df.reset_index(drop=True)
    df = pd.DataFrame(df.values[1:250], columns=headers)  # trim to 249 rows
    # NOTE(review): hard-coded absolute path -- adjust for your machine.
    df.to_excel('D:\\Workplace\\PyCharm\\MySpider\\sh' + str(stockcode) + '.xlsx')
    time.sleep(random.randint(1, 10))  # polite random delay
def add_stock_table(stockcode, s, y, df):
    """Fetch quarter `s` of year `y` for `stockcode` (the /type/S index
    pages) and append every data row to `df`.

    Returns the combined DataFrame.
    """
    print(y, "-", s)
    url = ('http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/'
           + str(stockcode) + '/type/S.phtml?year=' + str(y) + '&jidu=' + str(s))
    print(url)
    res = requests.get(url)
    res.encoding = 'gbk'  # page is GBK-encoded
    soup = BeautifulSoup(res.text, 'lxml')
    tables = soup.find_all('table', {'id': 'FundHoldSharesTable'})
    # StringIO wrapper: literal-HTML input to read_html was removed in
    # pandas 2.x.
    from io import StringIO
    df_addlist = [pd.concat(pd.read_html(StringIO(table.prettify())))
                  for table in tables]
    df_add = pd.concat(df_addlist)
    headers = df_add.iloc[0]
    df_add = pd.DataFrame(df_add.values[1:], columns=headers)  # drop header row
    # DataFrame.append was removed in pandas 2.0; pd.concat is the drop-in
    # replacement with the same default index behaviour.
    return pd.concat([df, df_add])
if __name__ == "__main__":
    # Remove stale output from a previous run before re-scraping.
    # NOTE(review): hard-coded absolute path -- adjust for your machine.
    if os.path.exists("D:\\Workplace\\PyCharm\\MySpider\\sh000001.xlsx"):  # no `== True`
        os.remove("D:\\Workplace\\PyCharm\\MySpider\\sh000001.xlsx")
    stockcode = ['000001']
    s = 2      # starting quarter
    y = 2019   # starting year
    index = 1  # progress counter (list currently holds a single code)
    print("正在爬取year_sh_stock信息...\n")
    print("---------------\n")
    print("请耐心等待...\n")
    for x in stockcode:
        print(index)
        get_stock_yeartable(x, s, y)