不说太多描述,直接上代码。
目标网站:CoinMarketCap(https://coinmarketcap.com),一个数字货币行情网站
爬虫目标:爬取该网站上按市值排名前800的数字货币在2019-01-01至2020-07-01期间的每日价格数据,包括开盘价、最高价、最低价、收盘价、交易量和市值。
第一部分:获取排名前800的数字货币信息
import requests
from urllib.parse import urlencode
import pandas as pd
import time
import random
# Listing endpoint; the trailing '?' lets callers append an url-encoded query
# string directly.
baseUrl = 'https://web-api.coinmarketcap.com/v1/cryptocurrency/listings/latest?'

# Request headers sent with every listing call. They mirror what a desktop
# Chrome browser sends when the coinmarketcap.com front end calls the API.
headers = {
    'accept': 'application/json, text/plain, */*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9,und;q=0.8',
    'origin': 'https://coinmarketcap.com',
    'referer': 'https://coinmarketcap.com/',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
}
def get_json(num, timeout=30):
    """Fetch one page of the market-cap-sorted cryptocurrency listing.

    Args:
        num: 1-based rank of the first entry to request (sent as ``start``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.RequestException: On network failure, timeout,
            or a non-2xx HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    params = {
        'convert': 'USD,BTC,ETH,XRP,BCH,LTC',
        'cryptocurrency_type': 'all',
        'limit': 200,
        'sort': 'market_cap',
        'sort_dir': 'desc',
        'start': num,
    }
    url = baseUrl + urlencode(params)
    # TLS certificate verification is left at the requests default (enabled);
    # the original passed verify=False, which exposes the call to tampering.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail on HTTP errors here rather than with a confusing JSON decode error.
    response.raise_for_status()
    payload = response.json()
    # Long pause between listing requests to avoid being rate-limited.
    time.sleep(50 + random.random())
    return payload
# Part 1 driver: collect id / name / symbol / slug for the top 800 coins
# (4 pages of 200) and save them to bitID.xlsx for the price crawl.
records = []
for i in range(4):
    num = i * 200 + 1  # 1-based rank of the first coin on this page
    payload = get_json(num)  # not named `json`, to avoid shadowing the stdlib module
    # Walk whatever the API actually returned instead of assuming exactly 200
    # entries; a short page no longer raises IndexError.
    for coin in payload['data']:
        records.append([coin['id'], coin['name'], coin['symbol'], coin['slug']])

# Build the frame once from all rows rather than concatenating per page.
df = pd.DataFrame(records, columns=['ID', 'name', 'symbol', 'slug'])
df.to_excel(r"bitID.xlsx", index=False)
第二部分:获取每个数字货币在区间内的价格信息
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import random
# Accumulator for the price rows of every coin.
df = pd.DataFrame()

# Coin list written by part 1; its `slug` column drives the crawl below.
path = "bitID.xlsx"
name = pd.read_excel(path)
def get_message(datas, title):
    """Convert historical-data table rows into a DataFrame.

    Args:
        datas: Iterable of row objects. Each must provide ``find_all('td')``
            returning its cells, and each cell must provide ``getText()``.
        title: Label copied into the ``title`` column of every output row.

    Returns:
        A DataFrame with columns ``title, date, open, max, min, close, cnt,
        value`` (date, open, high, low, close, volume, market cap), one row
        per input row that has at least seven cells. Cell text is kept as
        the raw string.
    """
    fields = ['date', 'open', 'max', 'min', 'close', 'cnt', 'value']
    records = []
    for row in datas:
        # Look the cells up once per row instead of once per column.
        cells = row.find_all('td')
        # Rows without all seven cells (e.g. an empty-table placeholder) are
        # skipped rather than aborting the whole crawl with IndexError.
        if len(cells) < len(fields):
            continue
        records.append([title] + [cell.getText() for cell in cells[:len(fields)]])
    return pd.DataFrame(records, columns=['title'] + fields)
# Part 2 driver: download the historical-data page of every coin listed in
# `name`, parse its price table and write all rows to bitPrice.csv.
frames = []
for i, slug in enumerate(name.slug):
    print(i, ':', slug)
    url = r'https://coinmarketcap.com/currencies/' + slug + '/historical-data/?start=20190101&end=20200701'

    # One retry after a pause. requests raises its own exception hierarchy
    # (requests.exceptions.RequestException); the builtin ConnectionError is
    # not part of it, and a bare `except:` would also trap KeyboardInterrupt.
    response = None
    for attempt in range(2):
        try:
            response = requests.get(url, timeout=30)
            break
        except requests.exceptions.RequestException as err:
            print('request failed for', slug, ':', err)
            time.sleep(10 + random.random())

    # Pause between coins to keep the request rate low.
    time.sleep(10 + random.random())
    if response is None:
        # Both attempts failed; skip this coin instead of aborting the crawl.
        continue

    soup = BeautifulSoup(response.content, 'lxml')
    section = soup.find('div', class_='sc-1oio33t-0 kQXqPh cmc-tab-historical-data')
    heading = section.find('h2') if section is not None else None
    if heading is None:
        # The expected container or its heading is missing (error page or
        # changed markup); report and move on.
        print('historical data section not found for', slug, '- skipped')
        continue

    title = heading.getText()
    datas = section.find_all('tr', class_='cmc-table-row')
    frames.append(get_message(datas, title))

# Concatenate once at the end; concatenating inside the loop copies all
# accumulated rows on every iteration.
if frames:
    df = pd.concat(frames)
df.to_csv(r"bitPrice.csv", index=False)
有不懂的可以留言,看到会回复,码字不易,喜欢请点赞,谢谢!!