from fastai.collab import *
from fastai.tabular.all import *

path = untar_data(URLs.ML_100k)
df = pd.read_csv(path/'u.data', delimiter='\t', header=None, names=['user', 'movie', 'rating', 'timestamp'])
df = df.drop(columns='timestamp')
df.head()
user movie rating
0 196 242 3
1 186 302 3
2 22 377 1
3 244 51 2
4 166 346 1

Generating a new df only with most frequently reviewing users

user_ids = df.user.value_counts().index.tolist()[:10]
df_users = df[df.user.isin(user_ids)]
df_users.head()
user movie rating
15 303 785 3
19 234 1184 2
38 276 796 1
43 276 564 3
63 13 526 3

Generating a new df only with most frequently reviewed movies

movie_ids = df.movie.value_counts().index.tolist()[:20]
df_movies = df[df.movie.isin(movie_ids)]
df_movies.head()
user movie rating
12 200 222 5
24 308 1 4
31 301 98 4
50 251 100 4
53 25 181 5

Combine both dfs to generate an crosstable aligning on the above '(index, user)' and '(index, movie)'

pd.crosstab(df_users.user, df_movies.movie, values=df.rating, aggfunc='mean').fillna('-')
movie 1 7 50 56 98 100 117 121 127 172 174 181 204 222 237 258 286 288 294 300
user
13 3.0 2.0 5.0 5.0 4.0 5.0 3.0 5.0 5.0 5.0 4.0 5.0 5.0 3.0 5.0 4.0 3.0 1.0 2.0 1.0
234 3.0 2.0 4.0 3.0 4.0 4.0 2.0 - 4.0 3.0 3.0 3.0 2.0 3.0 3.0 2.0 3.0 3.0 3.0 3.0
276 5.0 5.0 5.0 5.0 5.0 5.0 4.0 4.0 5.0 5.0 5.0 5.0 5.0 4.0 5.0 5.0 - 4.0 4.0 4.0
303 5.0 4.0 5.0 5.0 5.0 5.0 3.0 3.0 5.0 5.0 5.0 5.0 4.0 3.0 5.0 4.0 5.0 4.0 4.0 1.0
393 3.0 4.0 5.0 2.0 - 1.0 4.0 4.0 - 5.0 - 4.0 4.0 4.0 4.0 4.0 - 3.0 4.0 -
405 - - 5.0 4.0 4.0 - - - 5.0 5.0 5.0 5.0 5.0 - - - - 5.0 - -
416 5.0 4.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 5.0 - 3.0 5.0 5.0 5.0 4.0 4.0
450 4.0 4.0 5.0 4.0 4.0 4.0 4.0 3.0 5.0 4.0 5.0 4.0 4.0 3.0 5.0 4.0 4.0 3.0 4.0 4.0
537 2.0 4.0 4.0 5.0 3.0 4.0 2.0 1.0 5.0 3.0 3.0 2.0 3.0 2.0 3.0 4.0 3.0 2.0 1.0 1.0
655 2.0 3.0 4.0 3.0 4.0 3.0 2.0 3.0 5.0 4.0 3.0 3.0 3.0 2.0 3.0 2.0 3.0 3.0 3.0 3.0