谷歌应用商店APP数据集的分析
谷歌应用商店APP的分析报告
Google play store analysis
数据集来自kaggle,爬取的谷歌应用商店的APP数据
我们今天来探索一下数据,并且看下哪些因素可以影响顾客评分Rating
环境 python 3.6, windows 10, jupyter notebook
首先导入相关分析包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load the dataset
data =pd.read_csv('googleplaystore.csv')
探索数据
# 首先看下数据头
data.head()
image.png
#看下总体情况
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
App 10841 non-null object
Category 10841 non-null object
Rating 9367 non-null float64
Reviews 10841 non-null object
Size 10841 non-null object
Installs 10841 non-null object
Type 10840 non-null object
Price 10841 non-null object
Content Rating 10840 non-null object
Genres 10841 non-null object
Last Updated 10841 non-null object
Current Ver 10833 non-null object
Android Ver 10838 non-null object
dtypes: float64(1), object(12)
memory usage: 1.1+ MB
这份数据有10841行,13个字段包括APP名,分类,打分,下载量,评论量,是否付费,价格,最新更新日期,版本
首先要把数据转换成我们需要的格式:Reviews、Size、Price要转换成数值型,Last Updated要转换成时间序列
# Try to convert Reviews to a numeric type.
# On the raw data this raises ValueError (see the traceback below) because one row holds "3.0M".
#data.Reviews.value_counts()
pd.to_numeric(data['Reviews'])
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
pandas\src\inference.pyx in pandas.lib.maybe_convert_numeric (pandas\lib.c:55708)()
ValueError: Unable to parse string "3.0M"
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-5-e509e4352e56> in <module>()
1 #改变为数值型
2 #data.Reviews.value_counts()
----> 3 pd.to_numeric(data['Reviews'])
C:\Users\renhl1\Anaconda3\lib\site-packages\pandas\tools\util.py in to_numeric(arg, errors, downcast)
193 coerce_numeric = False if errors in ('ignore', 'raise') else True
194 values = lib.maybe_convert_numeric(values, set(),
--> 195 coerce_numeric=coerce_numeric)
196
197 except Exception:
pandas\src\inference.pyx in pandas.lib.maybe_convert_numeric (pandas\lib.c:56097)()
ValueError: Unable to parse string "3.0M" at position 10472
# Row 10472 is the one that failed to parse; inspect it to find out why.
data.loc[10472,]
App Life Made WI-Fi Touchscreen Photo Frame
Category 1.9
Rating 19
Reviews 3.0M
Size 1,000+
Installs Free
Type 0
Price Everyone
Content Rating NaN
Genres February 11, 2018
Last Updated 1.0.19
Current Ver 4.0 and up
Android Ver NaN
Name: 10472, dtype: object
# The row is malformed (its values sit under the wrong columns), so drop it.
data.drop(10472,inplace=True)
# Reviews can now be cast to an integer type.
data['Reviews']=data['Reviews'].astype(int)
#更改Size为数值型
data.Size.unique()
array(['19M', '14M', '8.7M', '25M', '2.8M', '5.6M', '29M', '33M', '3.1M',
'28M', '12M', '20M', '21M', '37M', '2.7M', '5.5M', '17M', '39M',
'31M', '4.2M', '7.0M', '23M', '6.0M', '6.1M', '4.6M', '9.2M',
'5.2M', '11M', '24M', 'Varies with device', '9.4M', '15M', '10M',
'1.2M', '26M', '8.0M', '7.9M', '56M', '57M', '35M', '54M', '201k',
'3.6M', '5.7M', '8.6M', '2.4M', '27M', '2.5M', '16M', '3.4M',
'8.9M', '3.9M', '2.9M', '38M', '32M', '5.4M', '18M', '1.1M', '2.2M',
'4.5M', '9.8M', '52M', '9.0M', '6.7M', '30M', '2.6M',
……
'892k', '154k', '860k', '364k', '387k', '626k', '161k', '879k',
'39k', '970k', '170k', '141k', '160k', '144k', '143k', '190k',
'376k', '193k', '246k', '73k', '658k', '992k', '253k', '420k',
'404k', '470k', '226k', '240k', '89k', '234k', '257k', '861k',
'467k', '157k', '44k', '676k', '67k', '552k', '885k', '1020k',
'582k', '619k'], dtype=object)
# 'Varies with device' is a placeholder rather than a size, so treat it as missing.
# The result is assigned back instead of calling replace(inplace=True) on the
# column selection, which may operate on a temporary copy and leave `data` unchanged.
data['Size'] = data['Size'].replace('Varies with device', np.nan)
data['Size'].isnull().sum()  # number of missing sizes
1695
#由于size里有‘kM’字符,为了转换成数值型,我们需要用正则表达式进行匹配
import re #导入正则表达式包
#定义一个函数来,k改为1000,M改为1000,000
# Multiplier for each size suffix: k -> 1,000 and M -> 1,000,000.
_SIZE_UNITS = {'k': 1000, 'M': 1000000}
# A size is a decimal number immediately followed by one unit letter, e.g. '8.7M'.
_SIZE_PATTERN = re.compile(r'([0-9.]+)([kM])')


def change(i):
    """Convert a size string such as '19M' or '201k' into a plain number.

    The numeric part is scaled by 1,000 for a 'k' suffix and by 1,000,000
    for an 'M' suffix.

    Parameters:
        i: the raw size value; either a string like '8.7M' or a missing
           value (NaN / None).

    Returns:
        The size as a float, or NaN when the input is missing.

    Raises:
        ValueError: if the string is not a number followed by 'k' or 'M'.
    """
    # pd.isnull recognises every NaN object and None, unlike an identity
    # comparison against the np.nan singleton.
    if pd.isnull(i):
        return np.nan
    match = _SIZE_PATTERN.fullmatch(i)
    if match is None:
        raise ValueError('Unrecognised size value: {!r}'.format(i))
    number, unit = match.groups()
    return float(number) * _SIZE_UNITS[unit]
# Convert the Size column to numbers.
data['Size'] = data['Size'].apply(change)
# Fill the missing sizes with the mean size of the app's category.
# The filled column is assigned back rather than using fillna(inplace=True) on the
# column selection, which may act on a temporary copy and leave `data` unchanged.
category_mean_size = data.groupby('Category')['Size'].transform('mean')
data['Size'] = data['Size'].fillna(category_mean_size)
def _parse_price(raw):
    """Return 0 for the literal '0', otherwise the number after the first character."""
    if raw == '0':
        return 0
    return float(raw[1:])


# Convert Price from text to a numeric column.
data['Price'] = data['Price'].map(_parse_price)
# Count how many distinct app names there are.
data['App'].unique().size
9659
#比data行数少,说明有重复项,看下具体是哪些APP
data.App.value_counts()
ROBLOX 9
CBS Sports App - Scores, News, Stats & Watch Live 8
Candy Crush Saga 7
ESPN 7
Duolingo: Learn Languages Free 7
……
#选择第一个APP看下内容
data[data['App']=='ROBLOX']
image.png
# The duplicated rows differ in their Reviews value, so de-duplicate on the app name.
# Apps listed under several categories keep only one of them (the author notes 100+ such apps).
data=data.drop_duplicates(subset=['App'])
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9659 entries, 0 to 10840
Data columns (total 13 columns):
App 9659 non-null object
Category 9659 non-null object
Rating 8196 non-null float64
Reviews 9659 non-null int32
Size 9659 non-null float64
Installs 9659 non-null object
Type 9658 non-null object
Price 9659 non-null float64
Content Rating 9659 non-null object
Genres 9659 non-null object
Last Updated 9659 non-null object
Current Ver 9651 non-null object
Android Ver 9657 non-null object
dtypes: float64(3), int32(1), object(9)
memory usage: 1018.7+ KB
具体分析每个字段
# Category analysis: number of apps per category, largest first.
cate = data.groupby('Category')['Category'].count().sort_values(ascending=False)
plt.figure(figsize=(15, 10))
sns.barplot(x=cate.index, y=cate.values)
plt.xticks(rotation=90)
plt.ylabel('App qty')
plt.xlabel('Category')
plt.title("App qty by category")
<matplotlib.text.Text at 0x1b76369a2e8>
image.png
# Pie chart of each category's share of all apps.
category_counts = data['Category'].value_counts()
labels = category_counts.index
sizes = category_counts.values
plt.figure(figsize=(10, 10))
plt.pie(sizes, labels=labels, autopct='%1.1f%%')
plt.title('App qty by category', color='blue', fontsize=15)
<matplotlib.text.Text at 0x1b7638f8128>
image.png
结论:按APP数量排名,前3的分类是family 19.6%、game 9.9%、tools 8.5%,而且显著高于之后各分类的APP数量
#分析Genres
len(data.Genres.value_counts())
118
# Genres has 118 distinct values (the count printed above); rank them by number of apps.
genr = data.groupby('Genres')['Genres'].count().sort_values(ascending=False)
genr.index[:15]  # the 15 most common genres
Index(['Tools', 'Entertainment', 'Education', 'Business', 'Medical',
'Personalization', 'Productivity', 'Lifestyle', 'Finance', 'Sports',
'Communication', 'Action', 'Health & Fitness', 'Photography',
'News & Magazines'],
dtype='object', name='Genres')
# Bar chart of the 15 genres with the most apps.
top_genres = genr.iloc[:15]
plt.figure(figsize=(15, 10))
sns.barplot(x=top_genres.index, y=top_genres.values)
plt.xticks(rotation=90)
plt.ylabel('App qty')
plt.xlabel('Genres')
plt.title("App qty by Genres")
<matplotlib.text.Text at 0x1b764a7c278>
image.png
data.describe()
image.png
# Distribution of Rating: violin plot, density estimate and box plot side by side.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 6))
sns.violinplot(y=data['Rating'], data=data, ax=ax1)
sns.kdeplot(data['Rating'], ax=ax2, shade=True)
sns.boxplot(y=data['Rating'], ax=ax3)
image.png
结论:50% app 评分在4-4.5之间,均值4.17分
#看下reivews数据
#data['Reviews'].value_counts()
fig=plt.figure(figsize=(12,8))
sns.kdeplot(data.Reviews,shade=True) #Reviews 的密度分布
image.png
绝大部分APP的评论少于10个
# Zoom in on apps with up to 200 reviews, using bins of width 5.
# The edges run from 0 to 200 inclusive so the last bin (195-200) is not dropped,
# and the ticks cover the same range as the bins instead of stopping at 100.
bin_edges = np.arange(0, 201, 5)
fig = plt.figure(figsize=(15, 8))
plt.hist(data['Reviews'], bin_edges, histtype="bar", rwidth=0.8, alpha=0.4)
plt.xticks(np.arange(0, 201, step=10))
image.png
# Find the highest review counts: count how often each Reviews value occurs,
# then list those values from largest to smallest.
b=data['Reviews'].value_counts()
b.sort_index(ascending=False)
78158306 1
69119316 1
66577313 1
56642847 1
44891723 1
42916526 1
27722264 1
25655305 1
24900999 1
23133508 1
22426677 1
……
16 35
15 30
14 41
13 49
12 58
11 52
10 62
9 64
8 72
7 88
6 945
4 137
3 170
2 213
1 272
0 593
Name: Reviews, dtype: int64
data[data['Reviews']>20000000]
image.png
看下评论最高的APP除了4个游戏类,竟然主要是facebook系,谷歌系的只有youtube上榜,最后两个是猎豹移动的
接下来分析下价格的影响,包括type和price两个字段
# Share of each app Type, drawn as a pie chart.
a = data['Type'].value_counts()
labels = a.index
sizes = a.values
explode = [0.2, 0]  # how far each wedge is pulled away from the centre
plt.figure(figsize=(9, 9))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', explode=explode)
plt.rcParams.update({'font.size': 10})
plt.title('App qty by type', color='blue', fontsize=20)
<matplotlib.text.Text at 0x1b765455208>
image.png
可以看到92.2%的APP免费,付费APP占比7.8%
#分析下price
data['Price'].value_counts()
0.00 8903
0.99 145
2.99 124
1.99 73
4.99 70
3.99 57
1.49 46
5.99 26
2.49 25
9.99 19
399.99 12
6.99 11
14.99 9
4.49 9
...
Name: Price, dtype: int64
# Number of apps at each price, without the free (price 0) entry, largest first.
price = data['Price'].value_counts().drop(0).sort_values(ascending=False)
fig = plt.figure(figsize=(15, 10))
paid_prices = data.loc[data['Price'] != 0, 'Price']
sns.kdeplot(paid_prices)  # density of the prices of paid apps
image.png
可以看到绝大部分APP价格低于30美元,但是看到400美元价位有一个凸起,把这类选中看下什么情况
data[data['Price']==399.99]
image.png
在网上查了后发现这是一个恶搞软件,没有任何用处。看了play确实有几千评论,10万下载,不过没明白为什么有这么多下载量,有人知道的话可以告诉我
可以在之后的价格分析中把这些异常值删除
# Look at the individual price points in more detail.
#num = str(a.tolist()).count("1")
#num
# Most apps are priced at 0.99, 1.99, 2.99 and so on. For a clearer picture, drop the
# price points used by only one app (the original note mentions 63 values in total;
# whether that is the number removed or kept is not stated).
price =price[price>1]
#a=data['Price'].value_counts().values
fig = plt.figure(figsize=(12,10))
# Density of the per-price app counts that remain.
sns.kdeplot(price.values,shade=True)
C:\Users\renhl1\Anaconda3\lib\site-packages\statsmodels\nonparametric\kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
<matplotlib.axes._subplots.AxesSubplot at 0x1b765bcbb38>
image.png
# Bar chart of the number of apps at each remaining price point.
fig = plt.figure(figsize=(18, 10))
# x and y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.barplot(x=price.index, y=price.values)
image.png
付费的绝大部分在10美元以下,排名top5 依次是0.99,2.99,1.99,4.99,3.99美元
# Convert Last Updated to a datetime column
data['Last Updated']=pd.to_datetime(data['Last Updated'])
fig = plt.figure(figsize=(10,7))
# One dot per app, showing its last-update date.
plt.plot(data['Last Updated'],'.')
image.png
#看 installs情况
data['Installs'].value_counts()
1,000,000+ 1417
100,000+ 1112
10,000+ 1031
10,000,000+ 937
1,000+ 888
100+ 710
5,000,000+ 607
500,000+ 505
50,000+ 469
5,000+ 468
10+ 385
500+ 328
50+ 204
50,000,000+ 202
100,000,000+ 188
5+ 82
1+ 67
500,000,000+ 24
1,000,000,000+ 20
0+ 14
0 1
Name: Installs, dtype: int64
# Number of apps in each install tier, largest first, as a horizontal bar chart.
install = data.groupby('Installs')['Installs'].count().sort_values(ascending=False)
fig = plt.figure(figsize=(9, 12))
sns.barplot(x=install.values, y=install.index)
plt.xlabel('App qty')
plt.ylabel('installed times')
plt.title("App qty by installed times")
image.png
可以看到APP数量最多的是1M次下载的,另外还有一个好玩的地方,5开头的下载量显著的少于10开头的下载量
# 下载量超过10亿次的APP情况
data[data['Installs']=='1,000,000,000+']
image.png
image.png
超过10亿下载量的大多数是google的产品
# Is the install count related to the number of reviews?
# Average Reviews per install tier, drawn as horizontal bars.
reviews=data['Reviews'].groupby(data['Installs']).mean()
fig = plt.figure(figsize=(15,9))
sns.barplot(x=reviews.values,y=reviews.index)
plt.ylabel('installed times')
plt.xlabel('reviews')
plt.title("avg.reivew by installed times")
plt.xscale('log') # use a log scale for the review axis
image.png
可以看到下载量确实和评论数呈正相关
探索评分可能跟哪些参数有关
首先清理评分缺失(为空)的数据并赋值到一个新数据集
# Keep only the rows that have a rating. The explicit copy makes `newdata` an
# independent DataFrame, so adding columns to it later neither triggers
# SettingWithCopyWarning nor depends on `data`.
newdata = data[data['Rating'].notnull()].copy()
newdata.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8196 entries, 0 to 10840
Data columns (total 13 columns):
App 8196 non-null object
Category 8196 non-null object
Rating 8196 non-null float64
Reviews 8196 non-null int32
Size 8196 non-null float64
Installs 8196 non-null object
Type 8196 non-null object
Price 8196 non-null float64
Content Rating 8196 non-null object
Genres 8196 non-null object
Last Updated 8196 non-null datetime64[ns]
Current Ver 8192 non-null object
Android Ver 8194 non-null object
dtypes: datetime64[ns](1), float64(3), int32(1), object(8)
memory usage: 864.4+ KB
#看下last update 和 rating 有没有关系
fig = plt.figure(figsize=(10,7))
plt.plot(newdata['Last Updated'],newdata['Rating'],'.')
[<matplotlib.lines.Line2D at 0x1b7670ec5f8>]
image.png
# Extract the year of the last update into its own column.
from datetime import datetime
# assign() returns a new DataFrame, so this is safe even when `newdata` was created
# as a filtered selection of `data` (no SettingWithCopyWarning).
newdata = newdata.assign(updated_year=newdata['Last Updated'].dt.year)
fig = plt.figure(figsize=(15, 9))
# x and y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.boxplot(x=newdata['updated_year'], y=newdata['Rating'])
plt.xlabel('updated year')
plt.ylabel('rating')
plt.title('rating with different updated year')
image.png
可以得出结论随着时间APP的中位数打分在越来越高,到了2018年首次超过75%的APP分数超过4分,说明随着移动应用的完善,低质的APP基本没有了市场
# Rating distribution for each app Type.
plt.figure(figsize=(12, 9))
sns.boxplot(x='Type', y='Rating', data=newdata)
image.png
可以看到付费APP的评分比免费APP的评分高
# Is there a correlation between Reviews and Rating?
# Pearson's r lies between -1 and +1: +1 is a perfect positive correlation,
# -1 a perfect negative one, and 0 means no linear correlation.
# jointplot creates its own figure, so no plt.figure() call precedes it
# (that call only produced an extra, empty figure).
reviews_grid = sns.jointplot(x='Reviews', y='Rating', data=newdata, kind='reg')
# Set the 7x7 inch size on the figure itself; the keyword for it differs between
# seaborn versions (`size` in older releases, `height` in newer ones).
reviews_grid.fig.set_size_inches(7, 7)
image.png
# Is there a correlation between Size and Rating?
# jointplot creates its own figure, so no plt.figure() call precedes it
# (that call only produced an extra, empty figure).
size_grid = sns.jointplot(x='Size', y='Rating', data=newdata, kind='reg')
# Set the 7x7 inch size on the figure itself; the keyword for it differs between
# seaborn versions (`size` in older releases, `height` in newer ones).
size_grid.fig.set_size_inches(7, 7)
image.png
结论:Rating跟Reviews 和 Size 没有相关性
# Rating distribution within each category, one horizontal box per category.
fig = plt.figure(figsize=(15, 12))
sns.boxplot(x='Rating', y='Category', data=newdata)
plt.xlabel('rating')
plt.ylabel('category')
plt.title('rating distribution by category')
<matplotlib.text.Text at 0x1b768aabb00>
image.png
可以看到评分最低的是dating :),评分比较高的分类有art and design, events,personalization,parenting
# Relationship between Installs and Rating: first count the rated apps in each install tier.
installrate =newdata['Rating'].groupby(newdata['Installs']).count()
installrate
Installs
1+ 3
1,000+ 697
1,000,000+ 1415
1,000,000,000+ 20
10+ 69
10,000+ 987
10,000,000+ 937
100+ 303
100,000+ 1094
100,000,000+ 188
5+ 9
5,000+ 425
5,000,000+ 607
50+ 56
50,000+ 457
50,000,000+ 202
500+ 199
500,000+ 504
500,000,000+ 24
Name: Rating, dtype: int64
# Leave out the install tiers below 100 downloads; too few users stand behind those ratings.
low_install_tiers = ['1+', '5+', '10+', '50+']
slected = newdata.loc[~newdata['Installs'].isin(low_install_tiers)]
# The tier labels are strings such as '1,000+', so sort them by their numeric value
# to get the boxes in ascending order of installs.
install_order = sorted(
    slected['Installs'].unique(),
    key=lambda tier: int(tier.rstrip('+').replace(',', '')),
)
# Relationship between Installs and Rating.
fig = plt.figure(figsize=(15, 9))
sns.boxplot(x=slected['Installs'], y=slected['Rating'], order=install_order)
plt.xticks(rotation=45)
plt.xlabel('Installed qty')
plt.ylabel('rating')
# The chart groups by install tier, not by category, so the title says so.
plt.title('rating distribution by installs')
image.png
#分数集中在4-4.5,rating跟installs 没有很强的相关性
# Relationship between Price and Rating. Type already compared free with paid apps;
# here the paid apps are broken down by how much they cost.
# Exclude free apps and the extreme price outliers (the 399.99 apps examined earlier).
selected = newdata.loc[(newdata['Price'] != 0) & (newdata['Price'] < 200)]
# jointplot builds its own figure, so no plt.figure() is created beforehand. Labels and
# title are set through the returned grid: plt.xlabel()/plt.title() act on the current
# axes, which after jointplot is a marginal axes rather than the joint plot.
price_grid = sns.jointplot(x=selected['Price'], y=selected['Rating'], kind='reg')
price_grid.set_axis_labels('Price', 'rating')
price_grid.fig.suptitle('rating distribution vs. price')
image.png
分值-0.029,price和rating 缺乏相关性
# How Category and Genres relate: number of apps for each (Category, Genres) pair.
data['App'].groupby([data['Category'],data['Genres']]).count()
Category Genres
ART_AND_DESIGN Art & Design 57
Art & Design;Action & Adventure 1
Art & Design;Creativity 5
Art & Design;Pretend Play 1
AUTO_AND_VEHICLES Auto & Vehicles 85
BEAUTY Beauty 53
BOOKS_AND_REFERENCE Books & Reference 222
BUSINESS Business 420
COMICS Comics 55
Comics;Creativity 1
COMMUNICATION Communication 315
DATING Dating 171
EDUCATION Education 99
Education;Action & Adventure 1
Education;Brain Games 3
Education;Creativity 3
Education;Education 8
Education;Music & Video 1
Education;Pretend Play 4
ENTERTAINMENT Entertainment 92
Entertainment;Brain Games 2
Entertainment;Creativity 1
Entertainment;Music & Video 7
EVENTS Events 64
FAMILY Action;Action & Adventure 9
Adventure;Action & Adventure 4
Adventure;Brain Games 1
Adventure;Education 1
Arcade;Action & Adventure 14
Arcade;Pretend Play 1
...
GAME Simulation;Education 1
Sports 6
Strategy 17
Trivia 38
Word 23
HEALTH_AND_FITNESS Health & Fitness 288
HOUSE_AND_HOME House & Home 74
LIBRARIES_AND_DEMO Libraries & Demo 84
LIFESTYLE Lifestyle 368
Lifestyle;Pretend Play 1
MAPS_AND_NAVIGATION Maps & Navigation 131
MEDICAL Medical 395
NEWS_AND_MAGAZINES News & Magazines 254
PARENTING Parenting 46
Parenting;Brain Games 1
Parenting;Education 7
Parenting;Music & Video 6
PERSONALIZATION Personalization 376
PHOTOGRAPHY Photography 281
PRODUCTIVITY Productivity 374
SHOPPING Shopping 202
SOCIAL Social 239
SPORTS Sports 325
TOOLS Tools 826
Tools;Education 1
TRAVEL_AND_LOCAL Travel & Local 218
Travel & Local;Action & Adventure 1
VIDEO_PLAYERS Video Players & Editors 162
Video Players & Editors;Music & Video 1
WEATHER Weather 79
Name: App, dtype: int64
# Number of free and paid apps in each category.
a = data['App'].groupby([data['Category'], data['Type']]).count()
# Flatten the (Category, Type) index into columns; the counts go into 'values'.
typedata = a.reset_index(name='values')
paid = typedata[typedata['Type'] == 'Paid']
free = typedata[typedata['Type'] == 'Free']
# Both layers share one category order. Each barplot call otherwise lays out only the
# categories present in its own subset, so a category without paid apps would shift
# the paid bars out of line with the free bars and the axis labels.
category_order = sorted(typedata['Category'].unique())
fig = plt.figure(figsize=(15, 12))
sns.barplot(y=paid['Category'], x=paid['values'], order=category_order,
            color='yellow', alpha=0.8, label='Paid')
sns.barplot(y=free['Category'], x=free['values'], order=category_order,
            color='green', alpha=0.2, label='Free')
plt.legend()  # the Paid/Free labels only become visible once a legend is drawn
<matplotlib.axes._subplots.AxesSubplot at 0x1b76968add8>
image.png
可以看出付费APP占比最高的分类是ENTERTAINMENT、LIBRARIES_AND_DEMO、BEAUTY、SHOPPING
# Relationship between the supported Android version and Rating: start with the value counts.
newdata['Android Ver'].value_counts()
4.1 and up 1811
4.0.3 and up 1141
4.0 and up 1042
Varies with device 947
4.4 and up 713
2.3 and up 547
5.0 and up 447
4.2 and up 316
2.3.3 and up 232
2.2 and up 203
3.0 and up 201
4.3 and up 185
2.1 and up 112
1.6 and up 87
6.0 and up 42
7.0 and up 41
3.2 and up 31
2.0 and up 27
5.1 and up 16
1.5 and up 16
3.1 and up 8
2.0.1 and up 7
4.4W and up 5
8.0 and up 5
7.1 and up 3
4.0.3 - 7.1.1 2
1.0 and up 2
5.0 - 8.0 2
4.1 - 7.1.1 1
7.0 - 7.1.1 1
5.0 - 6.0 1
Name: Android Ver, dtype: int64
# Rating distribution for each Android Ver value, one horizontal box per value.
fig = plt.figure(figsize=(15, 9))
sns.boxplot(x='Rating', y='Android Ver', data=newdata)
plt.xlabel('rating')
plt.ylabel('android ver')
<matplotlib.text.Text at 0x1b76a0e3240>
image.png
支持安卓版本和rating没有特别相关性
# Relationship between Content Rating and Rating: start with the value counts.
data['Content Rating'].value_counts()
Everyone 7903
Teen 1036
Mature 17+ 393
Everyone 10+ 322
Adults only 18+ 3
Unrated 2
Name: Content Rating, dtype: int64
# Rating distribution for each Content Rating group.
fig = plt.figure(figsize=(15, 9))
sns.boxplot(x='Content Rating', y='Rating', data=newdata)
plt.xlabel('content rating')
plt.ylabel('rating')
<matplotlib.text.Text at 0x1b76b76c198>
image.png
conclusion
本篇共分析了谷歌应用商店APP数据集,共9659个APPs
评分rating的均值是4.17, 50%的APP分值在4-4.5分
按APP数量排名,前3的分类是family 19.6%、game 9.9%、tools 8.5%(占总APP数量的比例)
付费APP占比7.8%,其中ENTERTAINMENT、LIBRARIES_AND_DEMO、BEAUTY、SHOPPING等分类的付费APP占比最高;付费的价格绝大部分在10美元以下,排名top5依次是0.99、2.99、1.99、4.99、3.99美元
大多数APP支持安卓4.0以上版本,还支持安卓2.0,3.0的APP已经很少了
超过10亿下载量的大多数是google系的产品,但是评论量最高的是facebook系产品
影响Rating分值的因子有Type,Category,updated year