第三次尝试

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
df = pd.read_csv('HRSalaries.csv')
df.head()
len(df)
30284
df.Department.value_counts()
POLICE                   12461
FIRE                      4798
SANITATION                2092
WATER MGMT                1796
AVIATION                  1252
TRANSPORTATION            1196
EMERGENCY MGMT            1182
GENERAL SERVICES           922
PUBLIC LIBRARY             874
FAMILY & SUPPORT           719
HEALTH                     568
FINANCE                    533
LAW                        455
CITY COUNCIL               265
BUILDINGS                  261
COMMUNITY DEVELOPMENT      216
BUSINESS AFFAIRS           177
DoIT                        97
MAYOR'S OFFICE              96
PROCUREMENT                 77
CULTURAL AFFAIRS            76
HUMAN RESOURCES             61
ANIMAL CONTRL               57
DISABILITIES                29
TREASURER                   24
Name: Department, dtype: int64
len(df.Department.unique())
25
salary = df.Annual_Salary
salary.sum() / len(salary)
60836.98560295866
salary.mean()
60836.98560295866
np.mean(salary)
60836.98560295866
df.groupby('Department').Annual_Salary.mean().sort_values(ascending=False)
Department
DoIT                     73831.979381
BUILDINGS                72137.885057
FIRE                     69383.989996
MAYOR'S OFFICE           68953.677083
WATER MGMT               64760.186526
COMMUNITY DEVELOPMENT    64262.597222
GENERAL SERVICES         63747.808026
TREASURER                63497.500000
POLICE                   63127.904984
TRANSPORTATION           62947.504181
PROCUREMENT              61452.584416
HEALTH                   61213.503521
CULTURAL AFFAIRS         61181.894737
DISABILITIES             58058.586207
BUSINESS AFFAIRS         57216.067797
HUMAN RESOURCES          57108.163934
LAW                      55917.958242
AVIATION                 55816.200479
SANITATION               55555.813576
FINANCE                  54286.375235
ANIMAL CONTRL            47604.473684
PUBLIC LIBRARY           44241.731121
EMERGENCY MGMT           42845.754653
CITY COUNCIL             38046.547170
FAMILY & SUPPORT         31193.307371
Name: Annual_Salary, dtype: float64
len(salary)
30284
sorted_salary = salary.sort_values()
sorted_salary
16629      3128
2247       3132
20961      3133
13423      3135
25422      3135
16451      3136
1732       3136
2215       3140
23454      3144
12291      3149
25959      3151
19186      3153
9905       3156
1752       3182
25170      3188
4739       3188
4858       3190
3688       3193
6947       3209
20065      3215
17837      3219
24322      3230
5840       3232
25971      3236
8175       3244
22631      3253
22602      3257
15958      3263
5194       3265
18713      3266
          ...  
23452    128174
27353    130023
14122    130788
18022    131070
17173    131461
7333     132249
20450    132283
11322    135289
14042    135623
14386    136798
6491     136806
17347    137121
7565     137153
13217    137300
12748    137305
10283    137574
16910    137583
4165     137584
29829    137669
1260     138506
14746    138546
15059    138760
2866     138826
8904     144240
3246     144914
20222    146247
11657    146776
3486     157054
27879    167858
6724     201448
Name: Annual_Salary, Length: 30284, dtype: int64
# Median by hand. The series has an even number of values (30284), so the
# median is the average of the two middle entries of the sorted data. The
# positions are derived from the length instead of being hard-coded
# (30284 // 2 == 15142, so this still reads positions 15141 and 15142).
# NOTE: this formula is only valid for an even-length series.
mid = len(sorted_salary) // 2
(sorted_salary.iloc[mid - 1] + sorted_salary.iloc[mid]) / 2
61836.0
salary.median()
61836.0
plt.hist(salary, bins=50, rwidth=0.9)
plt.show()
第三次尝试_第1张图片
output_14_0.png
plt.hist(salary, bins=25, rwidth=0.9, range=(100000, 210000))
plt.show()
第三次尝试_第2张图片
output_15_0.png
salary.mean() > salary.median()
False
# Salary distribution of the FIRE department only.
is_fire = df['Department'] == 'FIRE'
fire_salary = df.loc[is_fire, 'Annual_Salary']
plt.hist(fire_salary, bins=50, rwidth=0.9)
plt.show()
第三次尝试_第3张图片
output_17_0.png
fire_salary.mean()

69383.9899958316
fire_salary.median()
66260.0
fire_salary.mean() > fire_salary.median()
True
salary.max() 
201448
salary.min()
3128
salary.max() - salary.min()
198320
Q1 = salary.quantile(0.25)
Q3 = salary.quantile(0.75)
Q3
68558.5
IQR = Q3 - Q1
IQR
12886.75
salary.quantile(0.5)
61836.0
salary.median()
61836.0
salary.plot(kind='box', vert=False, figsize=(15, 5))
plt.show()
第三次尝试_第4张图片
output_29_0.png
doit_salary = df[df.Department == 'DoIT'].Annual_Salary.tolist()
plt.boxplot(doit_salary)
plt.show()
第三次尝试_第5张图片
output_30_0.png
build_salary = df[df.Department == 'BUILDINGS'].Annual_Salary.tolist()
plt.boxplot([build_salary, doit_salary], labels=['BUILDINGS', 'DoIT'])
plt.show()
第三次尝试_第6张图片
output_31_0.png
import seaborn as sns

sns.boxplot(data=df, y='Department', x='Annual_Salary')
plt.show()
第三次尝试_第7张图片
output_32_0.png
mean = salary.mean()
np.sum((salary - mean)**2) / (len(salary) - 1)
271490393.4177519
var = salary.var()
var
271490393.4177519
np.sqrt(np.sum((salary - mean)**2) / (len(salary) - 1))
16476.965540346071
std = salary.std()
std
16476.96554034607
# Fraction of all salaries that lie within one standard deviation of the mean.
len(salary[salary.between(mean - std, mean + std)]) / len(salary)
0.7666094307224938
# Fraction of all salaries that lie within two standard deviations of the mean.
len(salary[salary.between(mean - 2*std, mean + 2*std)]) / len(salary)
0.933364152687888
score = df.Review_Score
# Sample covariance of salary and review score by hand: sum of the products of
# each value's deviation from its own mean, divided by n - 1.
np.sum((salary - salary.mean()) * (score - score.mean())) / (len(salary)-1)
7.747599921809748
np.cov(salary, score)
array([[  2.71490393e+08,   7.74759992e+00],
       [  7.74759992e+00,   1.06173362e+00]])
cov = np.cov(salary, score)[0,1]
cov
7.7475999218100222
np.cov(salary, score)[0,1] / (salary.std() * score.std())
0.00045633330757004046
np.corrcoef(salary, score)[0,1]
0.00045633330757003586
plt.scatter(score, salary, alpha=0.3)
plt.show()
第三次尝试_第8张图片
output_45_0.png
# Correlation between review score and salary within a single job title
# (FIREFIGHTER), followed by a scatter plot of the same two columns.
position = df.loc[df['Position_Title'] == 'FIREFIGHTER']
ff_salary = position['Annual_Salary']
ff_score = position['Review_Score']
print(np.corrcoef(ff_salary, ff_score)[1, 0])
plt.scatter(ff_score, ff_salary)
plt.show()
0.0571267765462
第三次尝试_第9张图片
output_46_1.png
#第三课作业
#1、计算 HRSalaries 数据中评分Review_Score 的均值和中位数,并判断其偏度是左偏还是右偏?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
df = pd.read_csv('HRSalaries.csv')
df.head()
salary = df.Review_Score
salary.sum() / len(salary)
6.4558908994849205
df.groupby('Department').Review_Score.mean().sort_values(ascending=False)
Department
MAYOR'S OFFICE           6.711458
ANIMAL CONTRL            6.575439
BUSINESS AFFAIRS         6.509605
AVIATION                 6.504073
COMMUNITY DEVELOPMENT    6.493519
HEALTH                   6.493486
HUMAN RESOURCES          6.480328
BUILDINGS                6.474330
GENERAL SERVICES         6.472234
WATER MGMT               6.464310
POLICE                   6.463117
FIRE                     6.461171
EMERGENCY MGMT           6.452369
DISABILITIES             6.441379
LAW                      6.439121
SANITATION               6.433987
FINANCE                  6.420450
FAMILY & SUPPORT         6.413908
PROCUREMENT              6.406494
PUBLIC LIBRARY           6.399542
TRANSPORTATION           6.395234
CITY COUNCIL             6.383019
DoIT                     6.342268
CULTURAL AFFAIRS         6.261842
TREASURER                6.258333
Name: Review_Score, dtype: float64
len(score)
30284
# NOTE: `salary` was rebound to df.Review_Score above, so this compares the
# mean and median of the review scores, not of Annual_Salary. The result shown
# below is False (the mean does not exceed the median), which suggests the
# Review_Score distribution is left-skewed (negative skew).
salary.mean() > salary.median()
False
# NOTE(review): fire_salary is the FIRE department's Annual_Salary from the
# lesson part of this notebook, so this comparison says nothing about
# Review_Score; it looks like a leftover cell -- confirm before relying on it.
fire_salary.mean() > fire_salary.median()
True
#2、 Review_Score 的IQR是多少?并绘制该数据的box图。
Q1 = salary.quantile(0.25)
Q1
5.8
Q3 = salary.quantile(0.75)
Q3
7.2
IQR = Q3 - Q1
IQR
1.4000000000000004
salary.plot(kind='box', vert=False, figsize=(15, 5))
plt.show()
第三次尝试_第10张图片
output_61_0.png
#3、Review_Score的标准差是多少?
# Sample standard deviation of Review_Score by hand (divides by n - 1).
# `salary` holds df.Review_Score in this section, so the deviations must be
# taken from its own mean. The global `mean` still holds the Annual_Salary
# mean from the lesson part, which previously produced a bogus value of ~60831.
np.sqrt(np.sum((salary - salary.mean())**2) / (len(salary) - 1))
1.030404588021642
std = salary.std()
std
1.030404588021642
#4、在Review_Score中,求落在两个标准差内的数据占总数的百分比。
# 这一课感觉基本看不懂,只能大致理解最前面的一些内容,比如导入哪些库、如何读取文件。
# 这些公式暂时还看不懂,下次再来研究。

你可能感兴趣的:(第三次尝试)