1. 分析新浪网站中球员数据的获取方式(F12 开发者模式,调试网页):
一般网站保存数据的方式分为2种:1. 静态网页存储;2. 动态请求;
对于静态网页存储来说,就是打开浏览器中查看源码,就可以从源码中获取所需要的数据;
对于动态请求来说,需要打开F12的开发者模式,才能从服务器返回的response中查看到data数据。
查看该网址的源码:https://slamdunk.sports.sina.com.cn/player/query?pid=8ec91366-faea-4196-bbfd-b8fab7434795,发现无法从网页中获取数据,故该网站采用的是动态请求。
对于动态请求有2种方式:1. Get方式;2. Post方式;其中Post方式需要url以及提交参数。
举例说明,采用F12分析上述网址,得到如下内容,其中该网址采用POST方式,用python请求时,需要将URL和Form Data一起提交给服务器:
1.General:
Request URL:https://slamdunk.sports.sina.com.cn/api?p=radar&s=summary&a=search&limit=10?callback=ijax_1512828812920_72693455
Request Method:POST
Status Code:200 OK
Remote Address:111.13.87.205:443
2.Form Data:
page:1
pid:8ec91366-faea-4196-bbfd-b8fab7434795
type:REG
tid:583ec825-fb46-11e1-82cb-f4ce4684ea4c
season1:2017
season2:2017
items:[{"key":"points","op":">=","value":"0"}]
callback:ijax_1512828812920_72693455
2. 写爬虫的python 代码前准备:
1. 学会使用urllib2的request请求;
2. 学会python处理json数据;举例:result = json.loads(html),该代码的作用是将json数据变为python内部数据格式之“字典”
3. 学会使用xlwt库,操作excel表格;
4. 学会python的字典数据结构;
5. 学会python的循环语句;
3. 爬虫编码如下:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'wangzhen'
import urllib,urllib2
import json
#操作excel表格
import xlwt
# HTTP headers attached to the API request: a desktop-browser User-Agent and
# the player page as Referer.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    ),
    'Referer': (
        'https://slamdunk.sports.sina.com.cn/player/query'
        '?pid=8ec91366-faea-4196-bbfd-b8fab7434795'
    ),
}
def getPlayerData(season):
    """Fetch one season of per-game records for the hard-coded player.

    POSTs a search form to the Sina "slamdunk" API and returns the list found
    under ``result -> data -> players`` in the decoded JSON reply.

    Args:
        season: Season year (e.g. ``2016``). It is formatted with ``%s`` and
            sent as both the ``season1`` and ``season2`` form fields.

    Returns:
        The object stored at ``reply['result']['data']['players']``.

    Raises:
        urllib2.URLError: If the HTTP request fails.
        ValueError: If the response body is not valid JSON.
        KeyError: If the reply lacks the ``result``/``data``/``players`` keys.
    """
    # NOTE(review): the query string is kept exactly as in the original
    # script ("limit=" followed by "?callback=..."); it looks malformed but
    # mirrors the request captured from the browser -- confirm before changing.
    url = 'https://slamdunk.sports.sina.com.cn/api?p=radar&s=summary&a=search&limit=?callback=ijax_1512224580290_80173252'
    postdata = {
        'page': '1',
        'pid': '8ec91366-faea-4196-bbfd-b8fab7434795',
        'type': 'REG',
        'tid': '583ec825-fb46-11e1-82cb-f4ce4684ea4c',
        'season1': '%s' % (season),
        'season2': '%s' % (season),
        'items': '[{"key":"points","op":">","value":"0"},{"key":"points","op":">","value":"0"},{"key":"points","op":">","value":"0"},{"key":"points","op":">","value":"0"}]'
    }
    form_body = urllib.urlencode(postdata)

    # Hand the headers to the constructor instead of overwriting
    # ``req.headers``: the constructor registers them through ``add_header``,
    # which normalises the names, so the custom User-Agent is recognised
    # rather than competing with urllib2's default one. It also avoids
    # aliasing the module-level ``headers`` dict.
    req = urllib2.Request(url, form_body, headers)

    res = urllib2.urlopen(req)
    try:
        html = res.read()
    finally:
        # Always release the connection, even if reading fails.
        res.close()

    reply = json.loads(html)
    return reply[u'result'][u'data'][u'players']
# Build a cell style
def set_style(name, height, bold=False):
    """Create an ``xlwt.XFStyle`` whose font uses the given settings.

    Args:
        name: Font family name, e.g. ``'Times New Roman'``.
        height: Value assigned to the font's ``height`` attribute.
        bold: Whether the font is bold. Defaults to ``False``.

    Returns:
        A new ``xlwt.XFStyle`` carrying the configured font; the font's
        ``color_index`` is always set to 4.
    """
    # Configure the font first, then attach it to a fresh style object.
    cell_font = xlwt.Font()
    cell_font.name = name
    cell_font.bold = bold
    cell_font.color_index = 4
    cell_font.height = height

    cell_style = xlwt.XFStyle()
    cell_style.font = cell_font
    return cell_style
def _game_result(player):
    """Return the win/loss label for one game record.

    The label is derived by comparing the record's ``score`` with its
    ``opp_score``. An empty string is returned when the scores are equal or
    cannot be converted to integers.
    """
    try:
        own_score = int(player[u'score'])
        opp_score = int(player[u'opp_score'])
    except (TypeError, ValueError):
        return u''
    if own_score > opp_score:
        return u'胜'
    if own_score < opp_score:
        return u'负'
    return u''


def _row_values(player):
    """Return the 22 cell texts for one game record, in header order."""
    return [
        '%s-%s' % (player['first_name'], player[u'last_name']),
        '%s' % (player[u'date']),
        '%s%s-%s%s' % (player[u'team_name'], player[u'score'],
                       player[u'opp_score'], player[u'opp_team_name']),
        '%s' % (player[u'type']),
        # The original wrote the literal text '%s' here. No result field is
        # visible in this file, so the result is derived from the two scores.
        _game_result(player),
        '%s' % (player[u'starter']),
        '%s' % (player[u'minutes']),
        '%s' % (player[u'points']),
        '%s-%s' % (player[u'field_goals_made'], player[u'field_goals_att']),
        '%s' % (player[u'field_goals_pct']),
        '%s-%s' % (player[u'three_points_made'], player[u'three_points_att']),
        '%s' % (player[u'three_points_pct']),
        '%s-%s' % (player[u'free_throws_made'], player[u'free_throws_att']),
        '%s' % (player[u'free_throws_pct']),
        '%s' % (player[u'offensive_rebounds']),
        '%s' % (player[u'defensive_rebounds']),
        '%s' % (player[u'rebounds']),
        '%s' % (player[u'assists']),
        '%s' % (player[u'turnovers']),
        '%s' % (player[u'steals']),
        '%s' % (player[u'blocks']),
        '%s' % (player[u'personal_fouls']),
    ]


if __name__ == '__main__':
    # Export each season's regular-season games to its own worksheet.
    f = xlwt.Workbook()  # create the workbook

    # Column titles for the first row of every sheet.
    row0 = [u'球员', u'日期', u'比赛', u'类型', u'赛果', u'首发', u'时间', u'得分', u'投篮', u'投篮%', u'三分',
            u'三分%', u'罚球', u'罚球%', u'前场板', u'后场板', u'总篮板', u'助攻', u'失误', u'抢断', u'盖帽', u'犯规']
    # The header style is identical for every cell, so build it once.
    header_style = set_style('Times New Roman', 220, True)

    for season in range(2016, 2018):
        players = getPlayerData(season)
        # One sheet per season, named "<year>-<year + 1>".
        sheet = f.add_sheet(u'%s-%s' % (season, season + 1), cell_overwrite_ok=True)

        # Write the header row.
        for col, title in enumerate(row0):
            sheet.write(0, col, title, header_style)

        # Data rows start right below the header.
        row = 1
        for player in players:
            # Keep regular-season games only.
            if player[u'type'] != u'常规赛':
                continue
            for col, value in enumerate(_row_values(player)):
                sheet.write(row, col, value)
            row += 1

    f.save(u'nbaplayer.xls')  # save the workbook
4. 导出的数据:
|