The scraped data consists mainly of the home team name, the away team name, the game date, and each team's score in every quarter. The data is obtained from http://www.stat-nba.com.
import re

import requests
import pandas as pd
from bs4 import BeautifulSoup

def get_url_content(url):
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.content, 'html.parser')
    # Game date: the page marks it only with an inline style, so match on that style string
    div_time = soup.find_all('div', attrs={"style": "float: left;margin-top: 25px;margin-left: 10px;font-size: 16px;font-weight: bold;color: #009CFF"})
    # re.findall returns a list; keep the first (and only) date match
    date = re.findall(r"(\d{4}-\d{1,2}-\d{1,2})", str(div_time[0]))[0]
    # Home and away teams: the first "title" div holds the season, the next two hold the team links
    div_title = soup.find_all('div', attrs={"class": "title"})
    season = div_title[0].string
    home_team, visit_team = div_title[1].find('a').string, div_title[2].find('a').string
    # Per-quarter scores: the first four "number" cells of each team's score table
    div_table = soup.find_all('div', attrs={"class": "table"})
    home_score = div_table[0].find_all('td', class_='number')
    visit_score = div_table[1].find_all('td', class_='number')
    home_everystage_score = [home_score[i].string for i in range(4)]
    visit_everystage_score = [visit_score[i].string for i in range(4)]
    return season, date, home_team, visit_team, home_everystage_score, visit_everystage_score
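As a quick check, the function can be called on a single game page; the sketch below assumes the page layout described above, and the game id 43977 is only an example taken from the id range scraped in the next step:

# Minimal usage sketch: fetch one game page and inspect the parsed fields.
# The game id 43977 is illustrative, not a specific game chosen by the original script.
fields = get_url_content('http://www.stat-nba.com/game/43977.html')
season, date, home_team, visit_team, home_scores, visit_scores = fields
print(season, date, home_team, visit_team, home_scores, visit_scores)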
Iterate over every game of the 2019~2020 season and collect the data for each one into a DataFrame:
def run():
    basketball_frame = []
    # Game pages for the 2019~2020 season are numbered consecutively on stat-nba.com
    for game_id in range(43977, 45362):
        url = 'http://www.stat-nba.com/game/' + str(game_id) + '.html'
        print(url)
        try:
            basketball_frame.append(get_url_content(url))
        except Exception:
            # Skip pages that fail to download or parse
            continue
    bf = pd.DataFrame(basketball_frame)
    return bf
bf = run()
bf.to_csv(r'存储路径')  # replace '存储路径' with the path where the CSV should be saved
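Because the DataFrame is built from a list of tuples, its columns are numbered 0-5 by default. If named columns are preferred, a small post-processing step like the one below can be added before saving; the column names here are my own choice and not part of the original script:

# Optional sketch: give the columns descriptive names before writing the CSV.
# The names are illustrative; adjust them to fit the downstream analysis.
bf.columns = ['season', 'date', 'home_team', 'visit_team',
              'home_quarter_scores', 'visit_quarter_scores']
bf.to_csv(r'存储路径', index=False)  # again, replace with your own storage path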