pyspark学习

案例3-NBA

2018-01-13  本文已影响69人  7125messi

Apache Spark已经成为大规模数据分析的常用工具,本文我们将展示如何使用Spark来分析NBA数据。具体来说,我们将使用1979年到2016年的赛季数据以及投篮图数据来展示NBA如何继续朝着越来越多的三分投篮的方向发展。(主要是勇士队掀起的小球打法以及小学生“库里”的变态准)。
本文使用Python 3,并利用Spark的Python API(PySpark)来创建和分析Spark DataFrame。

导入各种模块

%matplotlib inline
import os

import numpy as np
import pandas as pd
import seaborn as sns

from nba_utils import draw_3pt_piechart,plot_shot_chart

from IPython.core.display import display, HTML
from IPython.core.magic import register_cell_magic, register_line_cell_magic, register_line_magic
from matplotlib import pyplot as plt
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import array, col, count, mean, sum, udf, when
from pyspark.sql.types import DoubleType, IntegerType, StringType, Row
from pyspark.sql.functions import sum, col, udf

import warnings
# Silence all Python warnings so they do not clutter the notebook output.
warnings.filterwarnings("ignore")

# Use seaborn's plain "white" theme and enable seaborn's colour shorthands.
sns.set_style("white")
sns.set_color_codes()

自定义可视化样式

# Custom plot look: start from the fivethirtyeight style, then apply the
# figure/font/label overrides below on top of it.
plt.style.use('fivethirtyeight')
plt.rcParams.update({
    'figure.figsize': (9, 5),
    'font.family': 'serif',
    'font.serif': 'Ubuntu',
    'font.monospace': 'Ubuntu Mono',
    'font.size': 9,
    'axes.labelsize': 11,
    'axes.labelweight': 'bold',
    'xtick.labelsize': 11,
    'ytick.labelsize': 11,
    'legend.fontsize': 14,
    'figure.titlesize': 18,
})

# Widen the notebook container to 80% of the page width.
display(HTML('<style>.container {width:80% !important;}</style>'))
# JavaScript snippet that sets the browser document title.
update_title = 'document.title = "Using Python and Apache Spark to Analyze the NBA and the 3-point Shot";'
# NOTE(review): a bare HTML(...) expression is only rendered when it is the
# last expression of its notebook cell — confirm the cell boundary.
HTML('<script>{}</script>'.format(update_title))
# Read the season totals CSV, treating the first row as a header and letting
# Spark infer the column types.
season_reader = spark.read.option('header', 'true').option('inferSchema', 'true')
df = season_reader.csv('data/season_totals.csv')
# Cache the data for the queries that follow.
df.cache()
DataFrame[_c0: int, player: string, pos: string, age: int, team_id: string, g: int, gs: int, mp: int, fg: int, fga: int, fg_pct: double, fg3: int, fg3a: int, fg3_pct: double, fg2: int, fg2a: int, fg2_pct: double, efg_pct: double, ft: int, fta: int, ft_pct: double, orb: int, drb: int, trb: int, ast: int, stl: int, blk: int, tov: int, pf: int, pts: int, yr: int]
# Ten highest single-season point totals (sorted by `pts`, descending),
# displayed with a handful of key columns.
top10_by_points = df.orderBy('pts', ascending=False).limit(10).toPandas()
top10_by_points[['yr', 'player', 'age', 'pts', 'fg3']]

yr  player  age pts fg3
0   1987    Jordan,Michael  23  3041    12
1   1988    Jordan,Michael  24  2868    7
2   2006    Bryant,Kobe 27  2832    180
3   1990    Jordan,Michael  26  2753    92
4   1989    Jordan,Michael  25  2633    27
5   2014    Durant,Kevin    25  2593    192
6   1980    Gervin,George   27  2585    32
7   1991    Jordan,Michael  27  2580    29
8   1982    Gervin,George   29  2551    10
9   1993    Jordan,Michael  29  2541    81
# List every column name of the season-totals DataFrame.
print(df.columns)
['_c0', 'player', 'pos', 'age', 'team_id', 'g', 'gs', 'mp', 'fg', 'fga', 'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'fg2', 'fg2a', 'fg2_pct', 'efg_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'yr']
# 3-point attempts per 36 minutes, computed per season from the summed
# attempts (fg3a) and summed minutes played (mp) of all players.
fga_py = (
    df.groupBy('yr')
      .agg({'mp': 'sum', 'fg3a': 'sum'})
      .select(col('yr'), (36 * col('sum(fg3a)') / col('sum(mp)')).alias('fg3a_p36m'))
      .orderBy('yr')
)

# The same aggregation expressed in SQL. The DataFrame is exposed as the
# temporary view `df` through the SparkSession API; the SQLContext helpers
# (registerDataFrameAsTable / sqlContext.sql) are deprecated.
df.createOrReplaceTempView('df')
fga_py = spark.sql('''SELECT yr,
                                  sum(fg3a)/sum(mp)*36 fg3a_p36m
                           FROM df GROUP BY yr
                           ORDER BY yr''')

可视化

# Plot the per-season 3-point attempt rate and annotate three events on the
# curve, then save the figure.
_df = fga_py.toPandas()
plt.plot(_df.yr, _df.fg3a_p36m, color='#00a79c')
plt.xlabel('Year')
plt.ylabel('Number of attempts')
_ = plt.title('Player average 3-point attempts (per 36 minutes)')

# Each entry: (annotation text, arrow tip, text position).
for note, tip, text_pos in [
    ('3 pointer introduced', (1980.5, .5), (1981, 1.1)),
    ('NBA moved in\n3-point line', (1993.7, 1.5), (1987, 1.79)),
    ('NBA moved back\n3-point line', (1998, 2.), (1998.5, 2.4)),
]:
    _ = plt.annotate(
        note,
        xy=tip,
        xytext=text_pos,
        fontsize=12,
        arrowprops=dict(facecolor='grey', shrink=0.05, linewidth=2),
    )

plt.tight_layout()
plt.savefig('results/3_point_trend.png')
image.png

我们可以看到,自1979-80赛季引入三分球以来,三分球出手次数稳步上升。值得注意且合乎逻辑的是,90年代中期NBA将三分线内移几英尺的那段时间,出手次数出现了一个明显的小高峰。另外,过去5年的出手次数也出现了更为突然的增长。

建立线性模型

# Build the training set: pack `yr` into the `features` vector column and
# copy the attempt rate into `label`.
t = VectorAssembler(inputCols=['yr'], outputCol='features')
assembled = t.transform(fga_py)
training = (
    assembled
    .withColumn('yr', fga_py.yr)
    .withColumn('label', fga_py.fg3a_p36m)
)
training.toPandas().head()
    yr  fg3a_p36m   features    label
0   1980    0.410089    [1980.0]    0.410089
1   1981    0.309376    [1981.0]    0.309376
2   1982    0.341511    [1982.0]    0.341511
3   1983    0.331479    [1983.0]    0.331479
4   1984    0.357110    [1984.0]    0.357110
# Fit a linear regression on the assembled training data.
lr = LinearRegression(maxIter=10)
model = lr.fit(training)

# Apply the model to every training season plus the 2017-2021 seasons.
# Both columns are fetched with a single collect(): one Spark job instead of
# two, and each year is guaranteed to stay paired with its own value.
observed = training.select('yr', 'fg3a_p36m').collect()
training_yrs = [r[0] for r in observed]
training_y = [r[1] for r in observed]
prediction_yrs = [2017, 2018, 2019, 2020, 2021]
all_yrs = training_yrs + prediction_yrs

# Build the testing DataFrame: one `yr` row per season, run through the same
# VectorAssembler that produced the training features.
test_rdd = sc.parallelize(all_yrs)
row = Row('yr')
all_years_features = t.transform(test_rdd.map(row).toDF())

# Apply the linear regression model and bring the predictions to pandas.
df_results = model.transform(all_years_features).toPandas()
# Draw the fitted line (dashed) over the observed per-season values and save
# the figure.
fit_style = dict(linewidth=2, linestyle='--', color='#fc4f30', label='L2 Fit')
plt.plot(df_results.yr, df_results.prediction, **fit_style)
plt.plot(training_yrs, training_y, color='#00a79c', label=None)

plt.xlabel('Year')
plt.ylabel('Number of attempts')
plt.legend(loc=4)
_ = plt.title('Player average 3-point attempts (per 36 minutes)')

plt.tight_layout()
plt.savefig('results/model_prediction.png')
image.png

投篮数据

除赛季总计数据外,我们还处理和分析NBA投篮图(shot chart)数据,以观察三分球革命对投篮选择的影响。投篮图数据来自NBAsavant(https://www.nbasavant.com),该网站的数据则来源于NBA.com和ESPN。

投篮图数据包含单个球员每次投篮出手的xy坐标、比赛日期、投篮时间、投篮距离、是否命中的标志等字段。我们汇总了2010-11赛季到2015-16赛季之间,球员单赛季出手至少1000次的所有个人赛季数据。

如前所述,我们可以将CSV数据读入Spark DataFrame。

# Reset to the default matplotlib style (white seaborn theme) so the shot
# charts render cleanly.
plt.style.use('default')
sns.set_style("white")

# Load the shot chart CSV with a header row and inferred column types.
shot_reader = spark.read.option('header', 'true').option('inferSchema', 'true')
df = shot_reader.csv('data/shot_charts_top_10/1000_plus_shot_charts_2011_2016.csv')
df.cache()  # speeds up the later calls that reuse this DataFrame
print(df.count())

# Preview the ten earliest shots by game date.
preview_cols = ['yr', 'name', 'game_date', 'shot_distance', 'x', 'y', 'shot_made_flag']
df.orderBy('game_date').limit(10).toPandas()[preview_cols]

243719

yr  name    game_date   shot_distance   x   y   shot_made_flag
0   2011    LaMarcus Aldridge   2010-10-26  1   4   11  0
1   2011    Paul Pierce 2010-10-26  25  67  246 1
2   2011    Paul Pierce 2010-10-26  18  165 83  0
3   2011    Paul Pierce 2010-10-26  24  159 186 0
4   2011    Paul Pierce 2010-10-26  24  198 148 1
5   2011    Paul Pierce 2010-10-26  23  231 4   1
6   2011    Paul Pierce 2010-10-26  1   -7  9   0
7   2011    Paul Pierce 2010-10-26  0   -2  -5  1
8   2011    LaMarcus Aldridge   2010-10-26  21  39  211 0
9   2011    LaMarcus Aldridge   2010-10-26  8   -82 23  0

我们可以查询个人球员和赛季,并可视化他们的投篮位置。

以Steph Curry的2015-2016历史性的投射赛季为例。


# Select one player's shots for one season, keeping only rows with y < 400.
player = 'Stephen Curry'
yr = '2016'
df_steph = df.filter('''name == "{player}"
                        and yr == {yr}
                        and y < 400'''.format(player = player,yr = yr))

# Fetch both coordinates with a single collect() so every x stays paired with
# the y of the same shot; two independent collects on an unordered DataFrame
# carry no such guarantee and run the query twice.
shot_rows = df_steph.select('x', 'y').collect()
x = np.array([r[0] for r in shot_rows])
y = np.array([r[1] for r in shot_rows])

# Draw the hexbin shot chart and save it.
p = plot_shot_chart(x, y, gridsize=30,
                    kind='hex',
                    label='Steph Curry\n2016')
p.savefig('results/steph_curry_2016_shotchart.png')
image.png
# Expected points per attempt: accuracy times 3 when is_a_3 == 1, times 2
# otherwise.
pps_expr = (
    when(col('is_a_3') == 1, col('shot_accuracy') * 3)
    .otherwise(col('shot_accuracy') * 2)
)

# Attempts and accuracy per (distance, shot-type flags) group, keeping only
# groups with more than 5 attempts, ordered by distance, as a pandas frame.
shot_acc = (
    df.groupBy('shot_distance', 'corner_3', 'normal_3', 'is_a_3')
      .agg(count('*').alias('num_attempts'),
           mean(df.shot_made_flag).alias('shot_accuracy'))
      .withColumn('points_per_shot', pps_expr)
      .filter('num_attempts > 5')
      .orderBy('shot_distance')
      .toPandas()
)
# Switch back to the fivethirtyeight style for the plots that follow.
plt.style.use('fivethirtyeight')

def plot_acc_vs_dist(df, kwargs=None):
    """Plot points per shot against shot distance on the current axes.

    Args:
        df: Object exposing ``shot_distance`` and ``points_per_shot``
            attributes (e.g. a pandas DataFrame with those columns).
        kwargs: Optional dict of keyword arguments forwarded to ``plt.plot``.
            Defaults to no extra arguments.
    """
    # A None default avoids sharing one mutable dict across calls.
    plt.plot(df.shot_distance, df.points_per_shot, **(kwargs or {}))
    
# Plot 2-point and 3-point groups as two separate lines in the same colour.
two_point_groups = shot_acc.query('is_a_3 == False')
three_point_groups = shot_acc.query('is_a_3 == True')
plot_acc_vs_dist(two_point_groups, {'color' : '#008fd5'})
plot_acc_vs_dist(three_point_groups, {'color' : '#008fd5'})

plt.title('Shot value vs. shot distance, 2011-2016 seasons\n Players with 1000+ attempts in a season', size=14)
plt.xlim(0, 30)
plt.xlabel('Shot Distance (ft)')
plt.ylabel('Points per shot')

# Each entry: (annotation text, arrow tip, text position).
for note, tip, text_pos in [
    ('high efficiency 2s', (2., 1.15), (4.5, 1.28)),
    ('high efficiency 3s', (22, 1.15), (13.5, 1.15)),
]:
    plt.annotate(note, xy=tip, xytext=text_pos,
                 arrowprops=dict(facecolor='grey', shrink=0.05))
plt.text(22, 1.28, 'corner 3s', fontsize=12)

plt.tight_layout()
plt.savefig('results/pps.png')
image.png
上一篇下一篇

猜你喜欢

热点阅读