In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import copy
import surprise

from surprise import Dataset, Reader

In [None]:
# Mount Google Drive at /content/drive; the ratings CSV is read from that mount point.
# NOTE(review): google.colab looks Colab-specific — confirm before running this cell elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the ratings CSV on the mounted Google Drive.
ratings_csv_path = (
    '/content/drive/MyDrive/Bahan Ajar/Materi Bootcamp dibimbing.id'
    '/Day 24/HandsOn Day 24/data/ratings.csv'
)

# Load the ratings table and show it as the cell output.
df = pd.read_csv(ratings_csv_path)
df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [None]:
# Remove the timestamp column; no later cell reads it.
# errors='ignore' keeps this cell idempotent: running it a second time (when
# 'timestamp' is already gone) is a no-op instead of raising KeyError.
# `columns=` already identifies the axis, so no separate `axis` argument is needed.
df = df.drop(columns=['timestamp'], errors='ignore')
df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [None]:
# Take the rating scale from the data instead of assuming (1, 5): predictions
# are clipped into this scale, so a scale narrower than the observed ratings
# would make the lowest/highest true ratings impossible to predict.
# NOTE(review): MovieLens-style files often include half-star ratings
# (minimum 0.5) — confirm with df['rating'].min().
rating_min = float(df['rating'].min())
rating_max = float(df['rating'].max())

reader = Reader(rating_scale=(rating_min, rating_max))
reader

<surprise.reader.Reader at 0x220d51071c0>

In [None]:
# Surprise interprets the frame positionally as (user id, item id, rating),
# so pick the three columns explicitly instead of relying on whatever layout
# `df` currently has.
utility_data = Dataset.load_from_df(
    df=df[['userId', 'movieId', 'rating']],
    reader=reader,
)

In [None]:
def train_test_split(utility_data, test_size=0.2, random_state=42):
    """
    Split a Surprise dataset into a train part and a test part.
    ref: https://surprise.readthedocs.io/en/stable/FAQ.html#split-data-for-unbiased-estimation-py

    The input is deep-copied, so ``utility_data`` itself is not modified.
    As a side effect, NumPy's global random generator is seeded with
    ``random_state``.

    Parameters
    ----------
    utility_data : Surprise utility data
        The sample of whole data set

    test_size : float, default=0.2
        Fraction of the ratings held out for testing; must be strictly
        between 0 and 1

    random_state : int, default=42
        Seed for the shuffle, for reproducibility

    Returns
    -------
    full_data : Surprise utility data
        A copy of the input whose ``raw_ratings`` contain only the
        training ratings

    train_data : Surprise format
        The train data, built from ``full_data``

    test_data : Surprise format
        The test data, built from the held-out ratings

    Raises
    ------
    ValueError
        If ``test_size`` is not strictly between 0 and 1.
    """
    # A fraction outside (0, 1) would yield a negative or overlong slice
    # index below and silently produce a wrong split.
    if not 0 < test_size < 1:
        raise ValueError(
            f"test_size must be strictly between 0 and 1, got {test_size!r}"
        )

    # Work on a deep copy so the caller's dataset keeps all of its ratings
    full_data = copy.deepcopy(utility_data)

    # Seed the (global) NumPy generator so the shuffle is reproducible
    np.random.seed(random_state)

    # Shuffle in place so the split is random rather than file-ordered
    raw_ratings = full_data.raw_ratings
    np.random.shuffle(raw_ratings)

    # Index of the first rating that belongs to the test part
    threshold = int((1 - test_size) * len(raw_ratings))

    # Split the data
    train_raw_ratings = raw_ratings[:threshold]
    test_raw_ratings = raw_ratings[threshold:]

    # Restrict the copy to the training ratings before building the trainset
    full_data.raw_ratings = train_raw_ratings
    train_data = full_data.build_full_trainset()
    test_data = full_data.construct_testset(test_raw_ratings)

    return full_data, train_data, test_data

In [None]:
# Hold out 20% of the ratings as the test part; the seed fixes the shuffle.
full_data, train_data, test_data = train_test_split(
    utility_data, test_size=0.2, random_state=42
)

In [None]:
# Import the cross validation module
from surprise.model_selection import cross_validate
from surprise.model_selection.search import RandomizedSearchCV

from surprise import KNNBasic

In [None]:
# Search space for the neighbourhood model.
# range() yields plain Python ints, so the chosen `k` appears in best_params
# as an ordinary int rather than a NumPy scalar.
params = {
    'k': list(range(5, 40, 5)),
    'sim_options': {
        'name': ['cosine', 'pearson_baseline'],
        'user_based': [True, False],
    },
}

# Randomized search with 5-fold cross validation.
# random_state pins which parameter combinations are sampled, so a re-run
# evaluates the same candidates regardless of ambient RNG state.
tuning = RandomizedSearchCV(
    algo_class=KNNBasic,
    param_distributions=params,
    cv=5,
    random_state=42,
)

In [None]:
# Run the search; full_data carries only the training ratings after the split.
tuning.fit(full_data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity mat

In [None]:
# Summarise the models: the baseline row has no configuration, the
# neighbourhood row shows the parameters with the best cross-validated RMSE.
# (Column header spelling corrected to 'Model Configuration'.)
summary_df = pd.DataFrame({
    'Model': ['Baseline', 'Neighborhood Collaborative Filtering'],
    'Model Configuration': ['N/A', str(tuning.best_params['rmse'])],
})

summary_df

Unnamed: 0,Model,Model Condiguration
0,Baseline,
1,Neighborhood Collaborative Filtering,"{'k': 35, 'sim_options': {'name': 'pearson_bas..."


In [None]:
# Parameters of the candidate with the best cross-validated RMSE
best_params = tuning.best_params['rmse']

# Build the model with those parameters and retrain it on the whole train
# dataset; fit() hands back the fitted model, which is shown as the cell output.
model_best = KNNBasic(**best_params).fit(train_data)
model_best

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x220d517d340>

In [None]:
# import performance library
from surprise import accuracy

In [None]:
# Predict every held-out rating with the tuned model.
test_pred = model_best.test(testset=test_data)

# accuracy.rmse prints the score and also returns it.
test_rmse = accuracy.rmse(predictions=test_pred, verbose=True)
test_rmse

RMSE: 0.9236


0.9235585854419559

In [None]:
# Derive the row label from the tuned similarity settings instead of
# hard-coding it: the search space also contains user_based=False, in which
# case the winning model is item-to-item rather than user-to-user.
tuned_sim_options = tuning.best_params['rmse'].get('sim_options', {})
model_label = (
    'User to User CF' if tuned_sim_options.get('user_based', True)
    else 'Item to Item CF'
)

# Cross-validated RMSE from tuning next to the RMSE on the held-out test set.
summary_test_df = pd.DataFrame({
    'Model': [model_label],
    'RMSE-Tuning': [tuning.best_score['rmse']],
    'RMSE-Test': [test_rmse],
})

summary_test_df

Unnamed: 0,Model,RMSE-Tuning,RMSE-Test
0,User to User CF,0.930686,0.923559


In [None]:
# Print the documentation of predict() to check its arguments before calling it.
help(model_best.predict)

Help on method predict in module surprise.prediction_algorithms.algo_base:

predict(uid, iid, r_ui=None, clip=True, verbose=False) method of surprise.prediction_algorithms.knns.KNNBasic instance
    Compute the rating prediction for given user and item.
    
    The ``predict`` method converts raw ids to inner ids and then calls the
    ``estimate`` method which is defined in every derived class. If the
    prediction is impossible (e.g. because the user and/or the item is
    unknown), the prediction is set according to
    :meth:`default_prediction()
    <surprise.prediction_algorithms.algo_base.AlgoBase.default_prediction>`.
    
    Args:
        uid: (Raw) id of the user. See :ref:`this note<raw_inner_note>`.
        iid: (Raw) id of the item. See :ref:`this note<raw_inner_note>`.
        r_ui(float): The true rating :math:`r_{ui}`. Optional, default is
            ``None``.
        clip(bool): Whether to clip the estimation into the rating scale.
            For example, if :math

In [None]:
# Estimate the rating of raw user id 100 for raw item id 1.
sample_prediction = model_best.predict(100, 1)
sample_prediction

Prediction(uid=100, iid=1, r_ui=None, est=3.6972330407092566, details={'actual_k': 35, 'was_impossible': False})