5.17 ② summary statistics

2018-05-18 本文已影响0人钊钖

# Median: computed with numpy's median function.
from numpy import median

values_median = median(values)

Variance tells us how concentrated the data is around the mean,
and measures how far the average data point is from the mean.
We calculate variance by subtracting the mean from every value, squaring the results, and then averaging them.

Mean of a list: sum(values) / len(values)
Mean of a pandas Series: series.mean()
Mean of a NumPy array: array.mean()

# variance 
import  matplotlib.pyplot  as plt
import pandas as pd

# Population variance of the 'pf' column: accumulate the squared
# deviation of every value from the column mean, then average.
pf_mean = nba_stats['pf'].mean()
variance = 0
for p in nba_stats['pf']:
    variance += (p - pf_mean) ** 2
variance /= len(nba_stats['pf'])

# standard deviation 
# the square root of variance 
import  numpy as np

def calc_column_deviation(column):
    """Return the population standard deviation of a column.

    The squared deviations from ``column.mean()`` are summed one value
    at a time, divided by ``len(column)`` (not ``len(column) - 1``),
    and the square root of that variance is returned.
    """
    center = column.mean()

    squared_total = 0
    for value in column:
        squared_total += (value - center) ** 2

    return (squared_total / len(column)) ** 0.5

# Population standard deviation of the 'mp' column, using the
# calc_column_deviation helper defined above.
mp_dev = calc_column_deviation(nba_stats['mp'])

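pandas provides the std() method on a Series. Note that by default it computes the sample standard deviation (ddof=1, dividing by n - 1), so its result differs slightly from the population version computed above.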

# Standard deviation of the 'mp' column via the built-in Series.std().
mp_dev = nba_stats['mp'].std()

# standard deviation distance compares data density.
import matplotlib.pyplot as plt

# Histogram of the 'pf' column with the mean marked in red and one
# standard deviation on either side of it marked in green.
plt.hist(nba_stats["pf"])

mean = nba_stats["pf"].mean()
# The std() method on a pandas Series gives the standard deviation.
std_dev = nba_stats["pf"].std()

plt.axvline(mean, color="r")
# Lower boundary first, then upper boundary.
for boundary in (mean - std_dev, mean + std_dev):
    plt.axvline(boundary, color="g")

# The more data points that fall between the two green lines,
# the more dense the data is around the mean.
plt.show()
 
# We can calculate how many standard deviations a data point
# is from the mean with a subtraction and a division.
# First, find the signed distance of the value from the mean.
total_distance = nba_stats["pf"][0] - mean
# Then divide by the standard deviation to express that distance
# in units of standard deviation.
standard_deviation_distance = total_distance / std_dev

# Same calculation for the entries labelled 9 and 99.
# NOTE(review): these are label lookups on the Series; they are the
# 10th and 100th rows only if the index is the default 0..n-1 — confirm.
point_10 = nba_stats["pf"][9]
point_100 = nba_stats["pf"][99]
point_10_std = (point_10 - mean) / std_dev
point_100_std = (point_100 - mean) / std_dev

Normal distribution
Make a normal distribution across the range that starts at -10, ends at 10, and has a step of 0.1.
points = np.arange(-10,10,.1)

import numpy as np
import matplotlib.pyplot as plt
# The norm module has a pdf function (
# pdf -  probability density function)
from scipy.stats import norm

# The arange function generates a numpy vector.
# The vector below will start at -1
# and go up to, but not including, 1.
# It will proceed in "steps" of .01,
# so the first element will be -1,
# the second -.99, the third -.98,
# all the way up to .99.
points = np.arange(-1, 1, 0.01)

# The norm.pdf function takes the points vector
# and converts it into a vector of probability densities.
# Each element in the vector corresponds
# to the normal distribution
# (earlier and later elements are
# smaller, with the peak in the center).
# The distribution is centered on 0
# and has a standard deviation of .3.
probabilities = norm.pdf(points, 0, .3)

# Plot the points values on the x-axis
# and the corresponding probabilities on the y-axis.
# See the bell curve?
plt.plot(points, probabilities)
plt.show()

12.png

# Housefly wing lengths in millimeters
wing_lengths = [36, 37, 38, 38, 39, 39, 40, 40, 40,
                40, 41, 41, 41, 41, 41, 41, 42, 42, 
                42, 42, 42, 42, 42, 43, 43, 43, 43, 
                43, 43, 43, 43, 44, 44, 44, 44, 44, 
                44, 44, 44, 44, 45, 45, 45, 45, 45, 
                45, 45, 45, 45, 45, 46, 46, 46, 46,
                46, 46, 46, 46, 46, 46, 47, 47, 47,
                47, 47, 47, 47, 47, 47, 48, 48, 48, 
                48, 48, 48, 48, 48, 49, 49, 49, 49, 
                49, 49, 49, 50, 50, 50, 50, 50, 50,
                51, 51, 51, 51, 52, 52, 53, 53, 54,
                55]


# Population mean, variance and standard deviation of the wing lengths.
mean = sum(wing_lengths) / len(wing_lengths)
variances = [(length - mean) ** 2 for length in wing_lengths]
variance = sum(variances) / len(variances)
standard_deviation = variance ** 0.5

# Distance of every wing length from the mean, in standard deviations.
standard_deviations = [
    (length - mean) / standard_deviation for length in wing_lengths
]
def within_percentage(deviations, count):
    """Return the fraction of deviations lying in [-count, count].

    Both boundaries are inclusive. The result is a fraction between
    0 and 1, not a percentage scaled to 100.
    """
    inside = sum(1 for deviation in deviations if -count <= deviation <= count)
    return inside / len(deviations)

# Fraction of wing lengths lying within one, two and three standard
# deviations of the mean.
within_one_percentage = within_percentage(standard_deviations, 1)
within_two_percentage = within_percentage(standard_deviations, 2)
within_three_percentage = within_percentage(standard_deviations, 3)

Using Scatterplots to Plot Correlations

import matplotlib.pyplot as plt

# Plot field goals attempted (number of
# shots someone takes in a season) vs. points
# scored in a season.
# Field goals attempted is on the x-axis,
# and points is on the y-axis.
# As you can tell, they are very strongly correlated:
# the plot is close to a straight line.
# The plot also slopes upward,
# which means that as field goal attempts
# go up, so do points.
# That means that the two variables are positively correlated.
plt.scatter(nba_stats["fga"], nba_stats["pts"])
plt.show()

Measuring Correlation with Pearson's r
The most common way to measure correlation is to use Pearson's r, which we also call an r-value.
An r-value ranges from -1 to 1, and indicates how strongly two variables are correlated.
We can use a function from scipy to calculate Pearson's r.

# Import pearsonr from the public scipy.stats namespace; the
# scipy.stats.stats module path is deprecated.
from scipy.stats import pearsonr

# The pearsonr function finds the correlation
# between two columns of data.
# It returns the r value and the p value.
r, p_value = pearsonr(nba_stats["fga"], nba_stats["pts"])
# As we can see, this is a very high positive r value
# - it's close to 1.
print(r)

Covariance
Another way to think of correlation is in terms of variance.
Covariance refers to how different numbers vary jointly.
For each element in the vectors x and y, we:

Take the value at each position from 1 to the length of the vectors.

Subtract the mean of the vector from those values.

Multiply them together at each position, and add all of the resulting values together.

def covariance(x, y):
    """Return the population covariance of two equal-length sequences.

    The covariance is the mean of the products of each pair's
    deviations from its own sequence mean. The sum is divided by the
    number of pairs (n), not n - 1.

    Raises:
        ValueError: if x and y have different lengths. Without this
            check a longer y would be silently truncated while its
            mean was still computed over every element.
        ZeroDivisionError: if the sequences are empty.
    """
    if len(x) != len(y):
        raise ValueError("x and y must have the same length")

    x_mean = sum(x) / len(x)
    y_mean = sum(y) / len(y)

    # Pair the values positionally instead of indexing by range(len(x)).
    codeviates = [(a - x_mean) * (b - y_mean) for a, b in zip(x, y)]

    return sum(codeviates) / len(codeviates)

from numpy import cov
# cov returns a covariance matrix; for two 1-D inputs the covariance
# between them is the off-diagonal element, e.g. result[0, 1].
# NOTE: numpy normalises by N - 1 by default, whereas the hand-written
# covariance function above divides by N.
cov(nd_array_a, nd_array_b)

Calculate Correlation With the std() Method
We can use the std method on any pandas DataFrame or Series to calculate the standard deviation.
We can use the cov function from NumPy to compute covariance, and then compute the correlation coefficient as:

$$r = \frac{cov(\mathbf{x}, \mathbf{y})}{\sigma_{x}\sigma_{y}}$$

from numpy import cov

# Correlation of 'fta' and 'blk': their covariance divided by the
# square root of the product of their variances (i.e. the product of
# their standard deviations).
r_fta_blk = cov(nba_stats['fta'], nba_stats["blk"])[0, 1] / (
    nba_stats['fta'].var() * nba_stats["blk"].var()
) ** 0.5

visualize dataset

import matplotlib.pyplot as plt
import pandas as pd
movie_reviews = pd.read_csv("fandango_score_comparison.csv")

# Four stacked histograms, one per review column, all sharing the
# same 0-5 x-axis range so they can be compared directly.
fig = plt.figure(figsize=(5, 12))
ax1, ax2, ax3, ax4 = [fig.add_subplot(4, 1, row) for row in range(1, 5)]

for ax, column in zip(
    (ax1, ax2, ax3, ax4),
    ("RT_user_norm", "Metacritic_user_nom", "Fandango_Ratingvalue", "IMDB_norm"),
):
    ax.set_xlim(0, 5.0)
    movie_reviews[column].hist(ax=ax)

plt.show()

Recall that you can return the values in a Series using the values attribute.

#Write a function, named calc_mean, 
# that returns the mean
# for the values in a Series object.
def calc_mean(series):
    """Exercise placeholder: always returns None.

    A working definition with the same name follows and replaces it.
    """
    return None
# Recall that you can return the values 
# in a Series using the values attribute.
def calc_mean(series):
    """Return the arithmetic mean of the values in a Series."""
    data = series.values
    return sum(data) / len(data)

# Select just the columns containing normalized user 
# reviews and assign to a separate Dataframe named user_reviews.
columns = [
    "RT_user_norm",
    "Metacritic_user_nom",
    "Fandango_Ratingvalue",
    "IMDB_norm",
]
user_reviews = movie_reviews[columns]
# apply() calls calc_mean once per column and returns a Series of means
# keyed by column name.
user_reviews_means = user_reviews.apply(calc_mean)

# Pull the per-site means out in the same order as `columns`.
rt_mean, mc_mean, fg_mean, id_mean = (
    user_reviews_means[name] for name in columns
)

print("Rotten Tomatoes (mean):", rt_mean)
print("Metacritic (mean):", mc_mean)
print("Fandango (mean):", fg_mean)
print("IMDB (mean):", id_mean)

def calc_mean(series):
    """Return the sum of a Series' values divided by their count."""
    vals = series.values
    total = sum(vals)
    count = len(vals)
    return total / count
# To calculate the variance:
# write a function, 
# named calc_variance, that returns 
# the variance for the values in a Series object.

def calc_variance(series):
    """Return the population variance of the values in a Series.

    This is the mean of the squared deviations from the mean, with
    both means computed by calc_mean.
    """
    center = calc_mean(series)
    return calc_mean((series - center) ** 2)


cols = ["RT_user_norm",
        "Metacritic_user_nom",
        "Fandango_Ratingvalue",
        "IMDB_norm"]
# Select with the list defined right here, so this snippet does not
# depend on a name from an earlier snippet and `cols` is actually used.
user_reviews = movie_reviews[cols]
# apply() calls calc_variance once per column.
user_reviews_variances = user_reviews.apply(calc_variance)

# Calculate the variance and  standard deviation
# for the RT_user_norm column and 
# assign to rt_var and rt_stdev respectively.

# Per-site variances, looked up by column name.
rt_var, mc_var, fg_var, id_var = (
    user_reviews_variances[name]
    for name in (
        "RT_user_norm",
        "Metacritic_user_nom",
        "Fandango_Ratingvalue",
        "IMDB_norm",
    )
)

# Standard deviation is the square root of the variance.
rt_stdev, mc_stdev, fg_stdev, id_stdev = (
    var ** 0.5 for var in (rt_var, mc_var, fg_var, id_var)
)

print("Rotten Tomatoes (variance):", rt_var)
print("Metacritic (variance):", mc_var)
print("Fandango (variance):", fg_var)
print("IMDB (variance):", id_var)

print("Rotten Tomatoes (standard deviation):", rt_stdev)
print("Metacritic (standard deviation):", mc_stdev)
print("Fandango (standard deviation):", fg_stdev)
print("IMDB (standard deviation):", id_stdev)

# Create a matplotlib subplot grid with the following properties:

#3 rows by 1 column,
# figsize of 4 (width) by 8 (height),
# each Axes instance should have
# an x-value range of 0.0 to 5.0.
import matplotlib.pyplot as plt

# Three stacked scatter plots: each other site's normalized user score
# (x-axis) against the Fandango rating (y-axis), all on a 0-5 x range.
fig = plt.figure(figsize=(4, 8))
ax1, ax2, ax3 = [fig.add_subplot(3, 1, row) for row in (1, 2, 3)]

for ax, column in zip(
    (ax1, ax2, ax3),
    ("RT_user_norm", "Metacritic_user_nom", "IMDB_norm"),
):
    ax.set_xlim(0.0, 5.0)
    ax.scatter(movie_reviews[column], movie_reviews["Fandango_Ratingvalue"])

plt.show()

def calc_mean(series):
    """Return the arithmetic mean of a Series, computed from its values."""
    values = series.values
    n = len(values)
    return sum(values) / n

def calc_variance(series):
    """Return the mean squared deviation of a Series from its mean.

    Both averages are taken with calc_mean, so the result is divided
    by the number of values (population variance).
    """
    deviations = series - calc_mean(series)
    return calc_mean(deviations ** 2)

def calc_covariance(series_one, series_two):
    """Return the population covariance of two equal-length Series.

    The covariance is the mean of the products of each pair's
    deviations from its own Series mean (as computed by calc_mean).
    Values are paired by position, not by index label.

    Raises:
        ValueError: if the two Series have different lengths. Without
            this check a longer series_two would be silently truncated
            while its mean was still computed over every value.
    """
    x = series_one.values
    y = series_two.values
    if len(x) != len(y):
        raise ValueError("series_one and series_two must have the same length")

    x_mean = calc_mean(series_one)
    y_mean = calc_mean(series_two)

    # Pair the values positionally instead of indexing by range(len(x)).
    codeviates = [(a - x_mean) * (b - y_mean) for a, b in zip(x, y)]
    return sum(codeviates) / len(codeviates)

# Covariance of each other site's normalized user score with the
# Fandango rating, in the order Rotten Tomatoes, Metacritic, IMDB.
rt_fg_covar, mc_fg_covar, id_fg_covar = (
    calc_covariance(movie_reviews[column],
                    movie_reviews["Fandango_Ratingvalue"])
    for column in ("RT_user_norm", "Metacritic_user_nom", "IMDB_norm")
)

def calc_correlation(series_one, series_two):
    """Return the correlation coefficient of two Series.

    Computed as the covariance of the two Series divided by the
    product of their standard deviations, where each standard
    deviation is the square root of calc_variance.
    """
    covar = calc_covariance(series_one, series_two)
    spread_one = calc_variance(series_one) ** 0.5
    spread_two = calc_variance(series_two) ** 0.5
    return covar / (spread_one * spread_two)

# Correlation of each other site's normalized user score with the
# Fandango rating, in the order Rotten Tomatoes, Metacritic, IMDB.
rt_fg_corr, mc_fg_corr, id_fg_corr = (
    calc_correlation(movie_reviews[column],
                     movie_reviews['Fandango_Ratingvalue'])
    for column in ('RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm')
)

print("Correlation between Rotten Tomatoes and Fandango", rt_fg_corr)
print("Correlation between Metacritic and Fandango", mc_fg_corr)
print("Correlation between IMDB and Fandango", id_fg_corr)

5.17 ② summary statistics

猜你喜欢

热点阅读