from fastai.collab import *
from fastai.tabular.all import *

path = untar_data(URLs.ML_100k)
df = pd.read_csv(path/'u.data', delimiter='\t', header=None, names=['user', 'movie', 'rating', 'timestamp'])
df = df.drop(columns='timestamp')
df.head()

Generating a new df only with most frequently reviewing users

user_ids = df.user.value_counts().index.tolist()[:10]
df_users = df[df.user.isin(user_ids)]
df_users.head()

Generating a new df only with most frequently reviewed movies

movie_ids = df.movie.value_counts().index.tolist()[:20]
df_movies = df[df.movie.isin(movie_ids)]
df_movies.head()

Combine both dfs to generate an crosstable aligning on the above '(index, user)' and '(index, movie)'

pd.crosstab(df_users.user, df_movies.movie, values=df.rating, aggfunc='mean').fillna('-')

	user	movie	rating
0	196	242	3
1	186	302	3
2	22	377	1
3	244	51	2
4	166	346	1

	user	movie	rating
15	303	785	3
19	234	1184	2
38	276	796	1
43	276	564	3
63	13	526	3

	user	movie	rating
12	200	222	5
24	308	1	4
31	301	98	4
50	251	100	4
53	25	181	5

movie	1	7	50	56	98	100	117	121	127	172	174	181	204	222	237	258	286	288	294	300
user
13	3.0	2.0	5.0	5.0	4.0	5.0	3.0	5.0	5.0	5.0	4.0	5.0	5.0	3.0	5.0	4.0	3.0	1.0	2.0	1.0
234	3.0	2.0	4.0	3.0	4.0	4.0	2.0	-	4.0	3.0	3.0	3.0	2.0	3.0	3.0	2.0	3.0	3.0	3.0	3.0
276	5.0	5.0	5.0	5.0	5.0	5.0	4.0	4.0	5.0	5.0	5.0	5.0	5.0	4.0	5.0	5.0	-	4.0	4.0	4.0
303	5.0	4.0	5.0	5.0	5.0	5.0	3.0	3.0	5.0	5.0	5.0	5.0	4.0	3.0	5.0	4.0	5.0	4.0	4.0	1.0
393	3.0	4.0	5.0	2.0	-	1.0	4.0	4.0	-	5.0	-	4.0	4.0	4.0	4.0	4.0	-	3.0	4.0	-
405	-	-	5.0	4.0	4.0	-	-	-	5.0	5.0	5.0	5.0	5.0	-	-	-	-	5.0	-	-
416	5.0	4.0	5.0	5.0	5.0	5.0	5.0	5.0	5.0	5.0	5.0	5.0	5.0	-	3.0	5.0	5.0	5.0	4.0	4.0
450	4.0	4.0	5.0	4.0	4.0	4.0	4.0	3.0	5.0	4.0	5.0	4.0	4.0	3.0	5.0	4.0	4.0	3.0	4.0	4.0
537	2.0	4.0	4.0	5.0	3.0	4.0	2.0	1.0	5.0	3.0	3.0	2.0	3.0	2.0	3.0	4.0	3.0	2.0	1.0	1.0
655	2.0	3.0	4.0	3.0	4.0	3.0	2.0	3.0	5.0	4.0	3.0	3.0	3.0	2.0	3.0	2.0	3.0	3.0	3.0	3.0