# median
from numpy import median
# `values` is not created in this snippet; it has to exist already when this runs.
values_median = median(values )
# Variance tells us how concentrated the data is around the mean:
# it measures how far the average data point is from the mean.
# Calculate variance by subtracting the mean from every value, squaring the
# results, and then averaging them.
# Mean of a list:          sum(values) / len(values)
# Mean of a pandas Series: series.mean()
# Mean of a NumPy array:   array.mean()
# variance
import matplotlib.pyplot as plt
import pandas as pd
# Variance of the 'pf' column computed by hand: the average squared
# distance of each value from the column mean.
pf_mean = nba_stats['pf'].mean()
variance = 0
for p in nba_stats['pf']:
    # Squared distance of this value from the mean.
    difference = p - pf_mean
    square_difference = difference ** 2
    variance += square_difference
# Divide the accumulated sum by the number of values (population variance).
variance = variance / len(nba_stats['pf'])
# standard deviation
# the square root of variance
import numpy as np
def calc_column_deviation(column):
    """Return the population standard deviation of ``column``.

    The result is the square root of the mean squared distance between
    each value and the column mean (the sum is divided by ``len(column)``).

    Args:
        column: A sized iterable of numbers that also provides ``.mean()``,
            such as a pandas Series or a NumPy array.

    Returns:
        The standard deviation as a float.

    Raises:
        ValueError: If ``column`` is empty.
    """
    if len(column) == 0:
        raise ValueError("cannot compute the standard deviation of an empty column")
    mean = column.mean()
    squared_total = sum((value - mean) ** 2 for value in column)
    return (squared_total / len(column)) ** .5
# Call the helper by its defined name, calc_column_deviation.
mp_dev = calc_column_deviation(nba_stats['mp'])
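# pandas method: call std() on a Series.
# Note: Series.std() defaults to ddof=1 (sample standard deviation), so its
# result differs slightly from the divide-by-N calculation above; pass ddof=0
# to get the same value.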
mp_dev = nba_stats['mp'].std()

# Standard deviation distance compares data density.
import matplotlib.pyplot as plt

# Summary statistics of the "pf" column; std() on a pandas Series gives
# the standard deviation.
mean = nba_stats["pf"].mean()
std_dev = nba_stats["pf"].std()

# Histogram of the column with the mean marked in red.
plt.hist(nba_stats["pf"])
plt.axvline(x=mean, color="r")

# Green lines one standard deviation below and above the mean.
# The more data points that fall between them, the denser the data is.
plt.axvline(x=mean - std_dev, color="g")
plt.axvline(x=mean + std_dev, color="g")
plt.show()
# We can calculate how many standard deviations a data point is from the
# mean with some subtraction and division.
# First, find the total distance by subtracting the mean.
# .iloc selects by position, so this is the first row regardless of how
# the Series index is labelled.
total_distance = nba_stats["pf"].iloc[0] - mean
# Then divide by the standard deviation to find how many standard
# deviations away the point is.
standard_deviation_distance = total_distance / std_dev

# Same calculation for the 10th and 100th data points (positions 9 and 99).
point_10 = nba_stats["pf"].iloc[9]
point_100 = nba_stats["pf"].iloc[99]
point_10_std = (point_10 - mean) / std_dev
point_100_std = (point_100 - mean) / std_dev
# Normal distribution
# Make a normal distribution across the range that starts at -10,
# ends at 10, and has a step of 0.1.
import numpy as np
import matplotlib.pyplot as plt
# The norm module has a pdf function
# (pdf - probability density function).
from scipy.stats import norm

# Range from -10 up to (but not including) 10 in steps of 0.1.
points = np.arange(-10, 10, .1)

# The arange function generates a NumPy vector.
# The vector below will start at -1,
# and go up to, but not including, 1.
# It will proceed in "steps" of .01.
# So the first element will be -1,
# the second -.99, the third -.98,
# all the way up to .99.
points = np.arange(-1, 1, 0.01)
# The norm.pdf function will take the points vector
# and convert it into a probability vector.
# Each element in the vector will correspond
# to the normal distribution
# (earlier and later elements are smaller, with the peak in the center).
# The distribution will be centered on 0
# and will have a standard deviation of .3.
probabilities = norm.pdf(points, 0, .3)
# Plot the points values on the x-axis
# and the corresponding probabilities on the y-axis.
# See the bell curve?
plt.plot(points, probabilities)
plt.show()
# Housefly wing lengths in millimeters
wing_lengths = [
    36, 37, 38, 38, 39, 39, 40, 40, 40,
    40, 41, 41, 41, 41, 41, 41, 42, 42,
    42, 42, 42, 42, 42, 43, 43, 43, 43,
    43, 43, 43, 43, 44, 44, 44, 44, 44,
    44, 44, 44, 44, 45, 45, 45, 45, 45,
    45, 45, 45, 45, 45, 46, 46, 46, 46,
    46, 46, 46, 46, 46, 46, 47, 47, 47,
    47, 47, 47, 47, 47, 47, 48, 48, 48,
    48, 48, 48, 48, 48, 49, 49, 49, 49,
    49, 49, 49, 50, 50, 50, 50, 50, 50,
    51, 51, 51, 51, 52, 52, 53, 53, 54,
    55,
]

# Average wing length.
mean = sum(wing_lengths) / len(wing_lengths)

# Squared distance of every measurement from the mean, and their average.
variances = [(length - mean) ** 2 for length in wing_lengths]
variance = sum(variances) / len(variances)

# Standard deviation is the square root of the variance.
standard_deviation = variance ** 0.5

# Each measurement expressed as a number of standard deviations from the mean.
standard_deviations = [
    (length - mean) / standard_deviation for length in wing_lengths
]
def within_percentage(deviations, count):
    """Return the fraction of ``deviations`` that lie within ``count`` of zero.

    A value ``d`` is counted when ``-count <= d <= count`` (both ends
    inclusive).

    Args:
        deviations: A sized iterable of numbers.
        count: The symmetric cut-off.

    Returns:
        A float between 0 and 1.

    Raises:
        ValueError: If ``deviations`` is empty.
    """
    total = len(deviations)
    if total == 0:
        raise ValueError("deviations must not be empty")
    # Keep the tally in its own name instead of rebinding the `count` parameter.
    inside = sum(1 for deviation in deviations if -count <= deviation <= count)
    return inside / total
# Share of measurements within one, two and three standard deviations
# of the mean.
within_one_percentage, within_two_percentage, within_three_percentage = (
    within_percentage(standard_deviations, limit) for limit in (1, 2, 3)
)
# Using Scatterplots to Plot Correlations
import matplotlib.pyplot as plt

# Plot field goals attempted (the number of shots someone takes in a
# season) against points scored in a season.
# Field goals attempted is on the x-axis, and points is on the y-axis.
# The two are very strongly correlated: the plot is close to a straight line.
# The plot also slopes upward, which means that as field goal attempts
# go up, so do points. In other words, they are positively correlated.
plt.scatter(x=nba_stats["fga"], y=nba_stats["pts"])
plt.show()
# Measuring Correlation with Pearson's r
# The most common way to measure correlation is to use Pearson's r,
# which we also call an r-value.
# An r-value ranges from -1 to 1 and indicates how strongly two variables
# are correlated.
# We can use a function from SciPy to calculate Pearson's r.
# Import from the public scipy.stats namespace; scipy.stats.stats is a
# deprecated private module.
from scipy.stats import pearsonr
# The pearsonr function will find the correlation
# between two columns of data.
# It returns the r value and the p value.
r, p_value = pearsonr(nba_stats["fga"], nba_stats["pts"])
# As we can see, this is a very high positive r value
# - it's close to 1.
print(r)
# Covariance
# Another way to think of correlation is in terms of variance.
# Covariance refers to how different numbers vary jointly.
# For each element in the vectors x and y, we:
# - Take the value at each position from 1 to the length of the vectors.
# - Subtract the mean of the vector from those values.
# - Multiply them together at each position, add all of the resulting
#   values together, and divide by the length of the vectors.
def covariance(x, y):
    """Return the population covariance of two equal-length sequences.

    Each value has its own sequence's mean subtracted, the paired
    differences are multiplied, and the products are averaged (divided by
    the number of pairs).

    Args:
        x: A sized iterable of numbers.
        y: A sized iterable of numbers with the same length as ``x``.

    Returns:
        The covariance as a float.

    Raises:
        ValueError: If the inputs differ in length or are empty.
    """
    n = len(x)
    # A longer y would otherwise be silently truncated while its mean was
    # still computed over every element, giving a wrong answer.
    if n != len(y):
        raise ValueError("x and y must have the same length")
    if n == 0:
        raise ValueError("x and y must not be empty")
    x_mean = sum(x) / n
    y_mean = sum(y) / n
    codeviates = [(a - x_mean) * (b - y_mean) for a, b in zip(x, y)]
    return sum(codeviates) / n
# NumPy ships a covariance function as well.
from numpy import cov
# Usage sketch: nd_array_a and nd_array_b are placeholders that are not defined here.
# NOTE(review): np.cov appears to normalize by N - 1 by default, unlike the
# divide-by-N covariance() above - confirm against the NumPy documentation.
cov(nd_array_a, nd_array_b)
# Calculate Correlation With the std() Method
# We can use the std method on any pandas DataFrame or Series to calculate
# the standard deviation.
# We can use the cov function from NumPy to compute covariance,
# and then combine the two to get the correlation coefficient:
#     r = \frac{cov(\mathbf{x}, \mathbf{y})}{\sigma_{x}\sigma_{y}}
from numpy import cov

# Correlation of 'fta' and 'blk': the off-diagonal entry of the covariance
# matrix divided by the square root of the product of the two variances.
r_fta_blk = (
    cov(nba_stats['fta'], nba_stats["blk"])[0, 1]
    / (nba_stats['fta'].var() * nba_stats["blk"].var()) ** 0.5
)
# Visualize the dataset
import matplotlib.pyplot as plt
import pandas as pd

# Load the review-score comparison data.
movie_reviews = pd.read_csv("fandango_score_comparison.csv")

# Four stacked axes (4 rows x 1 column) in a 5 x 12 figure.
fig = plt.figure(figsize=(5, 12))
ax1, ax2, ax3, ax4 = (
    fig.add_subplot(4, 1, position) for position in range(1, 5)
)

# Give every histogram the same 0-5 x-axis range so they can be compared.
ax1.set_xlim(left=0, right=5.0)
ax2.set_xlim(left=0, right=5.0)
ax3.set_xlim(left=0, right=5.0)
ax4.set_xlim(left=0, right=5.0)

# One histogram per review column.
movie_reviews["RT_user_norm"].hist(ax=ax1)
movie_reviews["Metacritic_user_nom"].hist(ax=ax2)
movie_reviews["Fandango_Ratingvalue"].hist(ax=ax3)
movie_reviews["IMDB_norm"].hist(ax=ax4)
plt.show()
# Recall that you can return the values in a Series using the values attribute.
#Write a function, named calc_mean,
# that returns the mean
# for the values in a Series object.
def calc_mean(series):
    """Exercise placeholder: always returns ``None``.

    This is the empty starting point for the exercise described in the
    comment above; a later definition of ``calc_mean`` in this file
    replaces it.

    Args:
        series: Ignored by this placeholder.

    Returns:
        None.
    """
    return None
# Recall that you can return the values
# in a Series using the values attribute.
def calc_mean(series):
    """Return the arithmetic mean of the values in ``series``.

    Args:
        series: An object exposing its data through a ``values`` attribute,
            such as a pandas Series.

    Returns:
        The sum of the values divided by how many there are.

    Raises:
        ValueError: If ``series`` holds no values.
    """
    vals = series.values
    if len(vals) == 0:
        raise ValueError("cannot compute the mean of an empty Series")
    return sum(vals) / len(vals)
# Select just the columns containing normalized user reviews
# and assign them to a separate DataFrame named user_reviews.
columns = [
    "RT_user_norm",
    "Metacritic_user_nom",
    "Fandango_Ratingvalue",
    "IMDB_norm",
]
user_reviews = movie_reviews[columns]

# Mean of every selected column, indexed by column name.
user_reviews_means = user_reviews.apply(calc_mean)

# Pull out the individual means in the same order as `columns`.
rt_mean, mc_mean, fg_mean, id_mean = (
    user_reviews_means[name] for name in columns
)

print("Rotten Tomatoes (mean):", rt_mean)
print("Metacritic (mean):", mc_mean)
print("Fandango (mean):", fg_mean)
print("IMDB (mean):", id_mean)
def calc_mean(series):
    """Return the arithmetic mean of the values in ``series``.

    Args:
        series: An object exposing its data through a ``values`` attribute,
            such as a pandas Series.

    Returns:
        The sum of the values divided by how many there are.

    Raises:
        ValueError: If ``series`` holds no values.
    """
    vals = series.values
    if len(vals) == 0:
        raise ValueError("cannot compute the mean of an empty Series")
    return sum(vals) / len(vals)
# To calculate the variance:
# write a function,
# named calc_variance, that returns
# the variance for the values in a Series object.
def calc_variance(series):
    """Return the variance of the values in ``series``.

    The variance is the mean of the squared differences between each value
    and the mean of the series; both means are computed with ``calc_mean``.

    Args:
        series: A pandas Series (it must support ``series - scalar`` and
            expose ``values``, which ``calc_mean`` reads).

    Returns:
        The mean squared deviation from the mean.
    """
    mean = calc_mean(series)
    squared_deviations = (series - mean) ** 2
    return calc_mean(squared_deviations)
cols = [
    "RT_user_norm",
    "Metacritic_user_nom",
    "Fandango_Ratingvalue",
    "IMDB_norm",
]
# Index with the list defined just above instead of relying on a
# `columns` name left over from an earlier section.
user_reviews = movie_reviews[cols]
user_reviews_variances = user_reviews.apply(calc_variance)

# Pull out the variance of each review column
# (rt_var holds the value for RT_user_norm, and so on).
rt_var = user_reviews_variances["RT_user_norm"]
mc_var = user_reviews_variances["Metacritic_user_nom"]
fg_var = user_reviews_variances["Fandango_Ratingvalue"]
id_var = user_reviews_variances["IMDB_norm"]

# Standard deviation is the square root of the variance.
rt_stdev = rt_var ** (1/2)
mc_stdev = mc_var ** (1/2)
fg_stdev = fg_var ** (1/2)
id_stdev = id_var ** (1/2)

print("Rotten Tomatoes (variance):", rt_var)
print("Metacritic (variance):", mc_var)
print("Fandango (variance):", fg_var)
print("IMDB (variance):", id_var)
print("Rotten Tomatoes (standard deviation):", rt_stdev)
print("Metacritic (standard deviation):", mc_stdev)
print("Fandango (standard deviation):", fg_stdev)
print("IMDB (standard deviation):", id_stdev)
# Create a matplotlib subplot grid with the following properties:
# 3 rows by 1 column,
# figsize of 4 (width) by 8 (height),
# each Axes instance should have an x-value range of 0.0 to 5.0.
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(4, 8))
ax1, ax2, ax3 = (fig.add_subplot(3, 1, position) for position in range(1, 4))

ax1.set_xlim(left=0.0, right=5.0)
ax2.set_xlim(left=0.0, right=5.0)
ax3.set_xlim(left=0.0, right=5.0)

# Each panel plots one review column (x) against the Fandango rating (y).
ax1.scatter(
    x=movie_reviews["RT_user_norm"],
    y=movie_reviews["Fandango_Ratingvalue"],
)
ax2.scatter(
    x=movie_reviews["Metacritic_user_nom"],
    y=movie_reviews["Fandango_Ratingvalue"],
)
ax3.scatter(
    x=movie_reviews["IMDB_norm"],
    y=movie_reviews["Fandango_Ratingvalue"],
)
plt.show()
def calc_mean(series):
    """Return the arithmetic mean of the values in ``series``.

    Args:
        series: An object exposing its data through a ``values`` attribute,
            such as a pandas Series.

    Returns:
        The sum of the values divided by how many there are.

    Raises:
        ValueError: If ``series`` holds no values.
    """
    vals = series.values
    if len(vals) == 0:
        raise ValueError("cannot compute the mean of an empty Series")
    return sum(vals) / len(vals)
def calc_variance(series):
    """Return the variance of the values in ``series``.

    The variance is the mean of the squared differences between each value
    and the mean of the series; both means are computed with ``calc_mean``.

    Args:
        series: A pandas Series (it must support ``series - scalar`` and
            expose ``values``, which ``calc_mean`` reads).

    Returns:
        The mean squared deviation from the mean.
    """
    mean = calc_mean(series)
    squared_deviations = (series - mean) ** 2
    return calc_mean(squared_deviations)
def calc_covariance(series_one, series_two):
    """Return the covariance between the values of two Series.

    Each value has its own series' mean (from ``calc_mean``) subtracted,
    the paired differences are multiplied, and the products are averaged
    (divided by the number of pairs).

    Args:
        series_one: First pandas Series.
        series_two: Second pandas Series, the same length as the first.

    Returns:
        The mean of the paired products of deviations.

    Raises:
        ValueError: If the two series differ in length.
    """
    x = series_one.values
    y = series_two.values
    # A longer second series would otherwise be silently truncated while
    # its mean was still computed over every value.
    if len(x) != len(y):
        raise ValueError("series_one and series_two must have the same length")
    x_mean = calc_mean(series_one)
    y_mean = calc_mean(series_two)
    codeviates = [(a - x_mean) * (b - y_mean) for a, b in zip(x, y)]
    return sum(codeviates) / len(codeviates)
# Covariance of each site's normalized user score with the Fandango rating,
# in the order Rotten Tomatoes, Metacritic, IMDB.
rt_fg_covar, mc_fg_covar, id_fg_covar = (
    calc_covariance(movie_reviews[site], movie_reviews["Fandango_Ratingvalue"])
    for site in ("RT_user_norm", "Metacritic_user_nom", "IMDB_norm")
)
def calc_correlation(series_one, series_two):
    """Return the correlation coefficient between two Series.

    Computed as the covariance of the two series divided by the product of
    their standard deviations, where each standard deviation is the square
    root of ``calc_variance``.

    Args:
        series_one: First pandas Series.
        series_two: Second pandas Series.

    Returns:
        The covariance divided by the product of the standard deviations.
    """
    # Named `covar` so the local does not shadow the `cov` function that
    # this file imports from NumPy.
    covar = calc_covariance(series_one, series_two)
    stde_one = calc_variance(series_one) ** (1/2)
    stde_two = calc_variance(series_two) ** (1/2)
    return covar / (stde_one * stde_two)
# Correlation of each site's normalized user score with the Fandango rating,
# in the order Rotten Tomatoes, Metacritic, IMDB.
rt_fg_corr, mc_fg_corr, id_fg_corr = (
    calc_correlation(movie_reviews[site], movie_reviews['Fandango_Ratingvalue'])
    for site in ('RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm')
)

print("Correlation between Rotten Tomatoes and Fandango", rt_fg_corr)
print("Correlation between Metacritic and Fandango", mc_fg_corr)
print("Correlation between IMDB and Fandango", id_fg_corr)