利用数据分析星际争霸2选手

1、利用爬虫爬取http://aligulac.com/网站上的职业选手数据

爬虫与数据集在我的github上https://github.com/wuchangsheng951/kaggle

1)网站概况

image.png

2)选手数据
image.png

3)利用bs4爬取数据,代码和获取到的数据集都放在了我的github里。
https://github.com/wuchangsheng951/kaggle
csv数据如下
image.png

2、数据清洗

# Load the scraped match dataset
df = pd.read_csv('/home/kesci/input/temp4946/星际争霸数据集.csv')

# Keep only matches whose player_2 also appears somewhere in the player_1 column
ranked_players = df['player_1'].unique()
df = df[df['player_2'].isin(ranked_players)]

# Encode player 1's result as 1 (winner) / 0 (loser)
status_codes = {"['winner']": 1, "['loser']": 0}
df['player_1_match_status'] = df['player_1_match_status'].map(status_codes)

# Encode the tournament type as 1 (offline) / 0 (online)
df['tournament_type'] = df['tournament_type'].map({'offline': 1, 'online': 0})

# Remove the 'addon' and 'player_2_match_status' columns
df = df.drop(columns=['addon', 'player_2_match_status'])
image.png

类型转换以及分割字符串

# Normalise the score text. NOTE(review): assumes the scraped score looks like
# "<player 1 wins>–<player 2 wins>" with an en dash separator — confirm
# against the CSV.
df.score = df.score.str.replace('–', ' ')
df.score = df.score.str.strip()
# Split on whitespace instead of slicing fixed character positions, so that
# multi-digit results such as "10–8" are parsed correctly.
score_parts = df.score.str.split(expand=True)
df['player_1_win'] = score_parts[0].astype(int)
df['player_2_win'] = score_parts[1].astype(int)
# match_date is deliberately left unparsed at this point.
df = df.rename(columns={'player_1_match_status': 'win'})
# Per-race win counters and the games-per-match total, filled in per row by k()
for counter_column in ('P_win', 'Z_win', 'T_win', 'total'):
    df[counter_column] = 0
def k(row):
    """Fill the per-race win counters and the game total for one match row.

    Args:
        row: Mapping-like match record (e.g. a DataFrame row) with the keys
            'player_1_race', 'player_2_race', 'player_1_win' and
            'player_2_win'.

    Returns:
        The same ``row`` object, with 'P_win' / 'T_win' / 'Z_win' set for the
        races that took part and 'total' set to the number of games played.
        Races other than P, T and Z leave the counters untouched.
    """
    race_columns = {'P': 'P_win', 'T': 'T_win', 'Z': 'Z_win'}
    first_column = race_columns.get(row['player_1_race'])
    second_column = race_columns.get(row['player_2_race'])
    games_played = row['player_1_win'] + row['player_2_win']

    if first_column is not None:
        row[first_column] = row['player_1_win']
    if second_column is not None:
        if second_column == first_column:
            # Mirror matchup: every game was won by this race, so the counter
            # must hold both players' wins instead of being overwritten by
            # player 2's wins alone.
            row[second_column] = games_played
        else:
            row[second_column] = row['player_2_win']

    row['total'] = games_played
    return row
# Run k() over every match row, then discard the raw score column
df = df.apply(k, axis=1).drop(columns=['score'])
image.png

然后就可以计算啦

# Number of professional players per race
# Use an ordered list rather than a set: a set has no guaranteed iteration
# order, so the labels could be attached to the wrong slices. The order here
# matches `sizes` below: P, Z, T.
labels = ['P', 'IMBAZ', 'IMBAT']  # slice labels
plt.rcParams['font.sans-serif'] = ['SimHei']
sizes = [race.count('P'), race.count('Z'), race.count('T')]  # slice sizes
explode = [0, 0, 0]  # no slice is pulled out of the pie
plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%')
plt.axis('equal')  # equal axis scaling

image.png
# Exclude matches that involve a Random ('R') player on either side
has_random = (df['player_1_race'] == 'R') | (df['player_2_race'] == 'R')
df = df[~has_random]
# Thirteen timestamps spaced three months apart, used as period boundaries
date = pd.date_range(start='1/1/2015', periods=13, freq='3M')
# Parse match_date into datetimes so it can be compared against `date`
df['match_date'] = pd.to_datetime(df['match_date'])

计算各种族对抗(PvT、PvZ、ZvT)的胜率

PVT=[]
PVZ=[]
ZVT=[]
for i in range(12):
    df_temp=df[(df.match_date>date[i]) &(df.match_date
image.png
image.png

种族胜率排名

# Restrict the data to matches dated after the start of 2015
after_2015 = df['match_date'] > '2015'
df = df[after_2015]
# Sum the aggregatable columns per (player, own race, opponent race)
matchup_keys = ['player_1', 'player_1_race', 'player_2_race']
rate = df.pivot_table(index=matchup_keys, aggfunc=np.sum).reset_index()
# Win-rate ranking per race matchup
def mm(row):
    """Store player 1's win rate (player_1_win / total) in row['win'].

    The row is modified in place and also returned.
    """
    games_won = row['player_1_win']
    games_played = row['total']
    row['win'] = games_won / games_played
    return row

# Only rate matchups with more than 100 games in total
frequent_matchups = rate[rate['total'] > 100]
rating = frequent_matchups.apply(mm, axis=1)
# Average per (player, own race, opponent race), ordered by ascending win rate
by_matchup = rating.groupby(['player_1', 'player_1_race', 'player_2_race'])
lala = by_matchup.mean().sort_values('win').reset_index()
# Change the two race letters below to look at a different matchup
selected_matchup = (lala['player_1_race'] == 'T') & (lala['player_2_race'] == 'P')
lala[selected_matchup][-30:].plot('player_1', 'win', kind='barh')
image.png

选手胜率排名

# Per-player win-rate ranking: the player whose results against each
# opponent are ranked and plotted below
player='ByuN'
def func(row):
    """Write player 1's win ratio into row['win'] and return the same row."""
    games_won = row['player_1_win']
    games_played = row['total']
    row['win'] = games_won / games_played
    return row
# Adjust the date bound here to change the period being analysed
in_scope = (df['player_1'] == player) & (df['match_date'] >= '2017-4')
per_opponent = df[in_scope].groupby('player_2').sum().reset_index()
temp1 = per_opponent.apply(func, axis=1).sort_values('win')
# Keep only opponents with more than 10 games in total
temp1 = temp1[temp1['total'] > 10].reset_index()
tick_positions = [step * 0.1 for step in range(10)]
temp1.plot('player_2', 'win', kind='barh', title=player, xticks=tick_positions)

image.png

线上线下胜率

# Offline / online win rate, restricted to matches dated after the start of 2017
recent_matches = df['match_date'] > '2017'
df = df[recent_matches]
# tournament_type was encoded as 1 = offline, 0 = online, so this keeps the
# online matches
online_matches = df['tournament_type'] == 0
if_on_line = df[online_matches]
def lala(row):
    """Set row['win'] to player_1_win divided by total; return the same row."""
    games_won = row['player_1_win']
    games_played = row['total']
    row['win'] = games_won / games_played
    return row

# NOTE(review): despite its name, `offline` is computed from `if_on_line`
# (rows with tournament_type == 0) — confirm which subset was intended.
per_player = if_on_line.groupby(['player_1']).sum().reset_index()
# Compute each player's win rate and keep the 30 highest
offline = per_player.apply(lala, axis=1).sort_values('win')[-30:]
offline.plot('player_1', 'win', kind='barh')
image.png

你可能感兴趣的:(利用数据分析星际争霸2选手)