MovieLens-1M数据分析
1 首先导入 pandas 包,然后读入数据。数据集可以从 https://github.com/wesm/pydata-book 下载。
import pandas as pd
# Load the three '::'-delimited MovieLens-1M tables. Each file is read with
# header=None, so the column names are supplied explicitly via `names`.
# engine='python' is passed explicitly together with the multi-character
# separator.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('/movielens/users.dat', sep='::', header=None, names=unames, engine='python')

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('/movielens/ratings.dat', sep='::', header=None, names=rnames, engine='python')

# The first column of movies.dat must be called 'movie_id' (not 'user_id'):
# a merge without an explicit key joins on the shared column names, so this
# is the name that links each movie to its rows in `ratings`.
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('/movielens/movies.dat', sep='::', header=None, names=mnames, engine='python')
可以用 users[:5] 等切片查看前几行,确认数据集是否加载成功
2 用pandas的merge函数将ratings跟users合并到一起,然后将movies也合并进去
# Combine the three tables into one frame: ratings joined with users first,
# then with movies. No key is given, so each join uses the column names the
# two frames have in common.
data = ratings.merge(users).merge(movies)
data.loc[0]  # inspect the row labelled 0 of the merged frame
计算每部电影按性别划分的平均分
# Mean rating of every title, with one column per gender value ('F' / 'M').
mean_ratings = data.pivot_table(values='rating', index='title', columns='gender', aggfunc='mean')

# Number of rating rows per title.
ratings_by_title = data.groupby('title').size()

# Titles that received at least 250 ratings.
active_titles = ratings_by_title.index[ratings_by_title >= 250]

# Restrict the per-gender means to those titles, then display the result
# (bare expression, notebook style). These were fused onto one line before,
# which is a syntax error.
mean_ratings = mean_ratings.loc[active_titles]
mean_ratings

# Shorten one overly long title in the index.
mean_ratings = mean_ratings.rename(index={'Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)': 'Seven Samurai (Shichinin no samurai) (1954)'})

# Titles ordered by the female mean rating, highest first.
# sort_values replaces the removed sort_index(by=...) spelling.
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)

# Rating disagreement between genders: male mean minus female mean.
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
# Ascending order, so the most negative 'diff' values (higher female mean) come first.
sorted_by_diff = mean_ratings.sort_values(by='diff')

# Standard deviation of the ratings for each title ...
rating_std_by_title = data.groupby('title')['rating'].std()
# ... restricted to the titles in active_titles.
rating_std_by_title = rating_std_by_title.loc[active_titles]