案例3-NBA
2018-01-13 本文已影响69人
7125messi
Apache Spark已经成为大规模数据分析的常用工具,本文我们将展示如何使用Spark来分析NBA数据。具体来说,我们将使用1979年到2016年的赛季数据以及投篮图数据来展示NBA如何继续朝着越来越多的三分投篮的方向发展。(主要是勇士队掀起的小球打法以及小学生“库里”的变态准)。
使用Python 3,我们利用Spark Python API(PySpark)来创建和分析Spark DataFrame。
导入各种模块
%matplotlib inline
import os
import numpy as np
import pandas as pd
import seaborn as sns
from nba_utils import draw_3pt_piechart,plot_shot_chart
from IPython.core.display import display, HTML
from IPython.core.magic import register_cell_magic, register_line_cell_magic, register_line_magic
from matplotlib import pyplot as plt
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import array, col, count, mean, sum, udf, when
from pyspark.sql.types import DoubleType, IntegerType, StringType, Row
from pyspark.sql.functions import sum, col, udf
import warnings
# Suppress every warning so library noise does not clutter the notebook output.
warnings.filterwarnings("ignore")
# Apply seaborn's "white" style to subsequent figures.
sns.set_style("white")
# Let matplotlib's one-letter color codes resolve to seaborn's palette.
sns.set_color_codes()
自定义可视化样式
# Start from the FiveThirtyEight style sheet, then override figure size,
# fonts and label sizes in a single rcParams update.
plt.style.use('fivethirtyeight')
plt.rcParams.update({
    'figure.figsize': (9, 5),
    'font.family': 'serif',
    'font.serif': 'Ubuntu',
    'font.monospace': 'Ubuntu Mono',
    'font.size': 9,
    'axes.labelsize': 11,
    'axes.labelweight': 'bold',
    'xtick.labelsize': 11,
    'ytick.labelsize': 11,
    'legend.fontsize': 14,
    'figure.titlesize': 18,
})

# Inject CSS that sets elements with class "container" to 80% width.
display(HTML('<style>.container {width:80% !important;}</style>'))

# JavaScript snippet that sets the browser document title; the HTML object is
# the final expression of the cell, so the notebook renders it.
update_title = 'document.title = "Using Python and Apache Spark to Analyze the NBA and the 3-point Shot";'
HTML('<script>' + update_title + '</script>')
# Read the season totals CSV: the first row is the header and column types
# are inferred from the data.
df = spark.read.csv('data/season_totals.csv', header=True, inferSchema=True)
# Cache the DataFrame so later queries reuse it instead of re-reading the file.
df.cache()
DataFrame[_c0: int, player: string, pos: string, age: int, team_id: string, g: int, gs: int, mp: int, fg: int, fga: int, fg_pct: double, fg3: int, fg3a: int, fg3_pct: double, fg2: int, fg2a: int, fg2_pct: double, efg_pct: double, ft: int, fta: int, ft_pct: double, orb: int, drb: int, trb: int, ast: int, stl: int, blk: int, tov: int, pf: int, pts: int, yr: int]
# Show the ten highest single-season point totals in `df`, keeping only the
# columns of interest before converting to pandas.
(df.orderBy(col('pts').desc())
   .limit(10)
   .select('yr', 'player', 'age', 'pts', 'fg3')
   .toPandas())
yr player age pts fg3
0 1987 Jordan,Michael 23 3041 12
1 1988 Jordan,Michael 24 2868 7
2 2006 Bryant,Kobe 27 2832 180
3 1990 Jordan,Michael 26 2753 92
4 1989 Jordan,Michael 25 2633 27
5 2014 Durant,Kevin 25 2593 192
6 1980 Gervin,George 27 2585 32
7 1991 Jordan,Michael 27 2580 29
8 1982 Gervin,George 29 2551 10
9 1993 Jordan,Michael 29 2541 81
print(df.columns)
['_c0', 'player', 'pos', 'age', 'team_id', 'g', 'gs', 'mp', 'fg', 'fga', 'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'fg2', 'fg2a', 'fg2_pct', 'efg_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'yr']
# 3-point attempts per 36 minutes, by season:
#   fg3a_p36m = 36 * sum(fg3a) / sum(mp), grouped and ordered by `yr`.
fga_py = (
    df.groupBy('yr')
    .agg({'mp': 'sum', 'fg3a': 'sum'})
    .select(col('yr'),
            (36 * col('sum(fg3a)') / col('sum(mp)')).alias('fg3a_p36m'))
    .orderBy('yr')
)

# The same query expressed in SQL. The DataFrame is exposed as a temporary
# view through the SparkSession (`spark`), the same entry point used to load
# the data, instead of the legacy `sqlContext.registerDataFrameAsTable` API
# that has been deprecated since Spark 2.0.
df.createOrReplaceTempView('df')
fga_py = spark.sql('''SELECT yr,
sum(fg3a)/sum(mp)*36 fg3a_p36m
FROM df GROUP BY yr
ORDER BY yr''')
可视化
# Bring the per-season aggregate to the driver and plot it.
_df = fga_py.toPandas()
plt.plot(_df['yr'], _df['fg3a_p36m'], color='#00a79c')
plt.xlabel('Year')
plt.ylabel('Number of attempts')
_ = plt.title('Player average 3-point attempts (per 36 minutes)')

# Each entry: (label, point the arrow targets, position of the label text).
for _label, _target, _text_pos in (
    ('3 pointer introduced', (1980.5, .5), (1981, 1.1)),
    ('NBA moved in\n3-point line', (1993.7, 1.5), (1987, 1.79)),
    ('NBA moved back\n3-point line', (1998, 2.), (1998.5, 2.4)),
):
    _ = plt.annotate(
        _label,
        xy=_target,
        xytext=_text_pos,
        fontsize=12,
        arrowprops=dict(facecolor='grey', shrink=0.05, linewidth=2),
    )

plt.tight_layout()
plt.savefig('results/3_point_trend.png')

我们可以看到,自1979-80赛季引入三分球以来,三分球出手次数稳步上升。值得注意且合乎逻辑的是,在90年代中期NBA将三分线内移几英尺的那段时间里,出手次数出现了一个小高峰。另外,过去5年的出手次数也出现了更为突然的增长。
建立线性模型
# Build the training set for the linear model: `yr` is assembled into the
# single-element `features` vector and `fg3a_p36m` is copied to `label`.
# (`yr` is already a column of `fga_py`, so it does not need to be re-added.)
t = VectorAssembler(inputCols=['yr'], outputCol='features')
training = t.transform(fga_py).withColumn('label', col('fg3a_p36m'))
# Preview the first rows.
training.toPandas().head()
yr fg3a_p36m features label
0 1980 0.410089 [1980.0] 0.410089
1 1981 0.309376 [1981.0] 0.309376
2 1982 0.341511 [1982.0] 0.341511
3 1983 0.331479 [1983.0] 0.331479
4 1984 0.357110 [1984.0] 0.357110
# Fit a linear regression of `label` on `features` (the season year).
lr = LinearRegression(maxIter=10)
model = lr.fit(training)

# apply model for the 1979-80 season thru 2020-21 season
# Collect both columns in ONE job: `training` is not cached, so separate
# collects would recompute the aggregation twice, and fetching the columns
# together guarantees each year stays paired with its own value.
training_rows = training.select('yr', 'fg3a_p36m').collect()
training_yrs = [r['yr'] for r in training_rows]
training_y = [r['fg3a_p36m'] for r in training_rows]
prediction_yrs = [2017, 2018, 2019, 2020, 2021]
all_yrs = training_yrs + prediction_yrs

# build the DataFrame to score: one `yr` row per observed or future season
test_rdd = sc.parallelize(all_yrs)
row = Row('yr')
all_years_features = t.transform(test_rdd.map(row).toDF())

# apply linear regression model
df_results = model.transform(all_years_features).toPandas()

# Dashed line: model prediction; solid line: observed values.
plt.plot(df_results.yr, df_results.prediction, linewidth=2, linestyle='--',
         color='#fc4f30', label='L2 Fit')
plt.plot(training_yrs, training_y, color='#00a79c', label=None)
plt.xlabel('Year')
plt.ylabel('Number of attempts')
plt.legend(loc=4)
_ = plt.title('Player average 3-point attempts (per 36 minutes)')
plt.tight_layout()
plt.savefig('results/model_prediction.png')

投篮数据
除赛季总数据外,我们还处理和分析NBA投篮图数据,以观察三分球革命对投篮选择的影响。 投篮图数据来自 NBAsavant(https://www.nbasavant.com),该网站的数据源自NBA.com和ESPN。
投篮图数据包含球员每次出手的xy坐标、比赛日期、出手时间、出手距离、是否命中的标志等字段。 我们整理了2010-11赛季至2015-16赛季中,所有单赛季出手至少1000次的球员的赛季数据。
如前所述,我们可以将CSV数据读入Spark DataFrame。
# Return to matplotlib's default style (plus seaborn "white") so the shot
# charts are drawn without the FiveThirtyEight theme.
plt.style.use('default')
sns.set_style("white")

# Read the shot chart CSV with a header row and inferred column types.
df = spark.read.csv(
    'data/shot_charts_top_10/1000_plus_shot_charts_2011_2016.csv',
    header=True,
    inferSchema=True,
)
df.cache()  # keep the data in memory for the repeated queries below

# Total number of rows, then the ten earliest shots by game date.
print(df.count())
(df.orderBy('game_date')
   .limit(10)
   .select('yr', 'name', 'game_date', 'shot_distance', 'x', 'y', 'shot_made_flag')
   .toPandas())
243719
yr name game_date shot_distance x y shot_made_flag
0 2011 LaMarcus Aldridge 2010-10-26 1 4 11 0
1 2011 Paul Pierce 2010-10-26 25 67 246 1
2 2011 Paul Pierce 2010-10-26 18 165 83 0
3 2011 Paul Pierce 2010-10-26 24 159 186 0
4 2011 Paul Pierce 2010-10-26 24 198 148 1
5 2011 Paul Pierce 2010-10-26 23 231 4 1
6 2011 Paul Pierce 2010-10-26 1 -7 9 0
7 2011 Paul Pierce 2010-10-26 0 -2 -5 1
8 2011 LaMarcus Aldridge 2010-10-26 21 39 211 0
9 2011 LaMarcus Aldridge 2010-10-26 8 -82 23 0
我们可以查询个人球员和赛季,并可视化他们的投篮位置。
以Steph Curry的2015-2016历史性的投射赛季为例。
# Select one player's shots for one season; rows with y >= 400 are dropped.
player = 'Stephen Curry'
yr = '2016'
df_steph = df.filter('''name == "{player}"
and yr == {yr}
and y < 400'''.format(player = player,yr = yr))

# Collect x and y together in a single job. Fetching them with two separate
# collects runs the filter twice and relies on both jobs returning rows in
# the same order; one collect keeps every (x, y) pair intact.
shots = df_steph.select('x', 'y').collect()
x = np.array([shot['x'] for shot in shots])
y = np.array([shot['y'] for shot in shots])

# Draw the hexbin shot chart and save it.
p = plot_shot_chart(x, y, gridsize=30,
                    kind='hex',
                    label='Steph Curry\n2016')
p.savefig('results/steph_curry_2016_shotchart.png')

# Per (distance, shot type) group: number of attempts, fraction made, and
# expected points per shot (accuracy x 3 when is_a_3 == 1, otherwise x 2).
# Groups with 5 or fewer attempts are dropped; the result is a pandas
# DataFrame sorted by shot distance.
shot_acc = (
    df.groupBy('shot_distance', 'corner_3', 'normal_3', 'is_a_3')
    .agg(
        count('*').alias('num_attempts'),
        mean('shot_made_flag').alias('shot_accuracy'),
    )
    .withColumn(
        'points_per_shot',
        when(col('is_a_3') == 1, 3 * col('shot_accuracy'))
        .otherwise(2 * col('shot_accuracy')),
    )
    .where(col('num_attempts') > 5)
    .sort('shot_distance')
    .toPandas()
)
plt.style.use('fivethirtyeight')


def plot_acc_vs_dist(df, kwargs=None):
    """Plot points per shot against shot distance on the current axes.

    Parameters
    ----------
    df : object exposing ``shot_distance`` and ``points_per_shot`` attributes
        (e.g. a pandas DataFrame with those columns).
    kwargs : dict, optional
        Keyword arguments forwarded to ``plt.plot``. Defaults to none.
        ``None`` is used as the default instead of a shared mutable ``{}``.
    """
    plt.plot(df.shot_distance, df.points_per_shot, **(kwargs or {}))
# Draw 2-point and 3-point shots as two separate line segments in the same
# color, so the curve is not joined across the 2/3 boundary.
for _subset in ('is_a_3 == False', 'is_a_3 == True'):
    plot_acc_vs_dist(shot_acc.query(_subset), {'color': '#008fd5'})

plt.title(
    'Shot value vs. shot distance, 2011-2016 seasons\n Players with 1000+ attempts in a season',
    size=14,
)
plt.xlabel('Shot Distance (ft)')
plt.ylabel('Points per shot')
plt.xlim(0, 30)

# Arrows pointing at the two high-value regions: (label, target, text position).
for _label, _target, _text_pos in (
    ('high efficiency 2s', (2., 1.15), (4.5, 1.28)),
    ('high efficiency 3s', (22, 1.15), (13.5, 1.15)),
):
    plt.annotate(
        _label,
        xy=_target,
        xytext=_text_pos,
        arrowprops=dict(facecolor='grey', shrink=0.05),
    )
plt.text(22, 1.28, 'corner 3s', fontsize=12)

plt.tight_layout()
plt.savefig('results/pps.png')
