Commit e7016544 authored by Winnie Uyen Nguyen

Upload Tuning techniques for the algorithm

parent b80760ea
import pandas as pd
import numpy as np
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, ClassifierMixin

names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(
    '/Users/winnie/Desktop/CS488/incremental-approsvd/ml-100k/u.data', sep='\t', names=names)
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print("n_users", n_users)
print("n_items", n_items)
# Create r_{ui}, our ratings matrix
ratings = np.zeros((n_users, n_items))
# Assign the rating for each element in the ratings matrix
for row in df.itertuples():
    if row[1]-1 < n_users and row[2]-1 < n_items:
        ratings[row[1]-1, row[2]-1] = row[3]
        # ratings[user_id, item_id] = rating
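# A quick sanity check of the matrix we just built (illustrative, not part of the
# original script): for the MovieLens 100k u.data file this should show a
# 943 x 1682 matrix with 100,000 nonzero entries, i.e. about 6.3% of cells filled.
print("ratings shape:", ratings.shape)
print("sparsity: %.2f%%" % (100.0 * np.count_nonzero(ratings) / ratings.size))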
# Split into training and test sets.
# Remove 10 ratings for each user from the training set and assign them to the test set.
def train_test_split(ratings):
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        test_ratings = np.random.choice(ratings[user, :].nonzero()[0],
                                        size=10,
                                        replace=False)
        # Zero out the held-out ratings in the training set
        train[user, test_ratings] = 0.
        test[user, test_ratings] = ratings[user, test_ratings]
    # Test and training sets are truly disjoint
    assert np.all((train * test) == 0)
    return train, test
train, test = train_test_split(ratings)
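# Illustrative check of the split (an assumption worth stating: every user needs at
# least 10 ratings for np.random.choice above to succeed, which MovieLens 100k
# guarantees since each user rated at least 20 movies): every row of `test` should
# now hold exactly the 10 ratings that were removed from `train`.
assert np.all((test != 0).sum(axis=1) == 10)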
# Indicator matrix built from the train set: 1 where a rating exists, 0 otherwise
indicating_mat = np.vectorize(lambda x: 0 if x == 0 else 1)(train)
# Turn the indicator matrix into boolean values
mask = indicating_mat == 1
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

def get_rmse(actual, pred):
    # Argument order follows make_scorer's (y_true, y_pred) convention.
    # Only evaluate entries where the actual rating is nonzero.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    rmse = np.sqrt(mean_squared_error(actual, pred))
    print("rmse", rmse)
    return rmse

def get_mae(actual, pred):
    # Only evaluate entries where the actual rating is nonzero.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    mae = mean_absolute_error(actual, pred)
    print("mae", mae)
    return mae
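# A tiny worked example of the masking behaviour (toy numbers, not from the dataset):
# only positions where `actual` is nonzero contribute, so the errors below are
# (5-4), (3-3) and (4-5), giving RMSE = sqrt(2/3) ~ 0.816 and MAE = 2/3 ~ 0.667.
_toy_actual = np.array([[5., 0., 3.], [0., 4., 0.]])
_toy_pred = np.array([[4., 1., 3.], [2., 5., 2.]])
get_rmse(_toy_actual, _toy_pred)
get_mae(_toy_actual, _toy_pred)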
class MyClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, df, k=1, lrate=0):
        self.df = df
        self.n_users = df.user_id.unique().shape[0]
        self.n_items = df.item_id.unique().shape[0]
        # Create r_{ui}, our ratings matrix
        self.ratings = np.zeros((self.n_users, self.n_items))
        for row in df.itertuples():
            self.ratings[row[1]-1, row[2]-1] = row[3]
        self.k = k
        self.lrate = lrate
        self._lambda = 0.001
    def get_item_item(self, u, mu, b_i, b_u, x_j):
        # Explicit-feedback part of the Asymmetric SVD estimate for r_{ui}
        # mu:  average of all ratings in the matrix
        # b_u: user bias
        # b_i: item bias
        # x_j: item factors weighted by the user's rating residuals
        temp = np.zeros(self.k)
        for j in indicating_mat[u, :].nonzero()[0]:
            temp += (self.ratings[u, j] - mu - b_u[u] - b_i[j]) * x_j[j, :]
        return temp * self.R[u]
    # Asymmetric SVD rating estimate equation (predict)
    def predict_one(self, u, i):
        # q_i:       item factor vector
        # item_item: explicit-feedback contribution to the predicted rating
        # N[u]:      inverse norm over the items the user has interacted with
        # y_j:       implicit-feedback item factors
        item_item = self.get_item_item(u, self.global_bias, self.b_i, self.b_u, self.x_j)
        return self.global_bias + self.b_i[i] + self.b_u[u] + \
            self.q_i[i, :].T.dot(item_item + self.N[u] * self.y_j[self.mask[u, :], :].sum(axis=0))
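    # For reference, predict_one follows the Asymmetric-SVD form of the rating
    # estimate (Koren, 2008), sketched here as a comment so the file stays runnable:
    #
    #   r_hat(u, i) = mu + b_u + b_i
    #                 + q_i^T * ( R_norm(u) * sum_{j in R(u)} (r_uj - mu - b_u - b_j) * x_j
    #                           + N_norm(u) * sum_{j in N(u)} y_j )
    #
    # R(u)/N(u) are the items the user rated/interacted with. Here N_norm(u) is
    # 1/||indicator row|| = |N(u)|^{-1/2}, as in the paper, while R_norm(u) is taken
    # as the inverse norm of the user's training-rating row (self.R[u]).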
    def svdasy_step(self):
        rows = np.random.permutation(len(self.non_zeros[0]))
        size = len(self.non_zeros[0])
        c = 0
        for i in rows:
            c += 1
            if (c % 0x1000) == 0:
                print(c, size)
            user = self.non_zeros[0][i]
            item = self.non_zeros[1][i]
            pred = self.predict_one(user, item)
            # Watch out when tuning the learning rate separately; this needs to be calculated separately
            error = self.train[user][item] - pred
            # Update the bias terms
            self.b_u[user] += self.lrate * (error - self._lambda * self.b_u[user])
            self.b_i[item] += self.lrate * (error - self._lambda * self.b_i[item])
            item_item = self.get_item_item(user, self.global_bias, self.b_i, self.b_u, self.x_j)
            # Update for q_i (item vector)
            self.q_i[item, :] += self.lrate * (error * (item_item + self.N[user] * self.y_j[self.mask[user, :], :].sum(axis=0)) - self._lambda * self.q_i[item, :])
            # Update for x_j (explicit-feedback item factors); restrict to the items the user rated
            j = indicating_mat[user, :].nonzero()[0]
            self.x_j[j, :] += self.lrate * (error * item_item * self.q_i[item, :] - self._lambda * self.x_j[j, :])
            # Update for each y_j (implicit-feedback item factors)
            temp = error * self.N[user] * self.q_i[item, :]
            self.y_j[j, :] += self.lrate * (temp - self._lambda * self.y_j[j, :])
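    # Each update above follows the usual regularised SGD pattern (noted here for
    # reference, not part of the original code):
    #
    #   theta <- theta + lrate * (error * grad_theta - lambda * theta)
    #
    # with grad_theta = 1 for the biases b_u and b_i, the bracketed factor vector for
    # q_i, and terms propagated back through q_i for x_j and y_j.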
    def fit(self, train, test):
        self.test = test
        self.train = train
        self.mask = indicating_mat == 1
        # Average of all nonzero ratings in the training matrix
        self.global_bias = np.mean(self.train[np.where(self.train != 0)])
        self.non_zeros = self.train.nonzero()
        # The norm of a 0/1 row equals the square root of the number of rated items,
        # so this is equivalent to counting ratings and taking the inverse square root.
        # N = np.power(indicating_mat.sum(1), -0.5)
        self.N = 1. / np.linalg.norm(indicating_mat, axis=1)
        self.R = 1. / np.linalg.norm(self.train, axis=1)
        self.n_user, self.n_item = self.train.shape
        # Parameters: arrays sized by the number of users/items, initialised to zero
        self.b_u = np.zeros(self.n_user)
        self.b_i = np.zeros(self.n_item)
        # q_i: item factors
        self.q_i = np.random.normal(scale=1./self.k, size=(self.n_item, self.k))
        # x_j: explicit-feedback item factors
        self.x_j = np.random.normal(scale=1./self.k, size=(self.n_item, self.k))
        # y_j: implicit-feedback item factors
        self.y_j = np.random.normal(scale=1./self.k, size=(self.n_item, self.k))
        for i in range(15):
            self.svdasy_step()
        return self
    def predict(self, X):
        n_user, n_item = X.shape
        predictions = np.copy(X)
        for user in range(n_user):
            for item in range(n_item):
                if X[user, item] != 0:
                    predictions[user, item] = self.predict_one(user, item)
        return predictions
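# A minimal standalone run, useful as a smoke test before launching the grid search
# (illustrative; k=50 and lrate=0.002 are example values, not the tuned result):
# fit a single model on the global split and score its held-out predictions.
model = MyClassifier(df, k=50, lrate=0.002)
model.fit(train, test)
preds = model.predict(test)
get_rmse(test, preds)
get_mae(test, preds)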
params = {
    "k": [50, 100, 150],
    "lrate": [0.001, 0.002],
}
rmse_score = make_scorer(get_rmse, greater_is_better=False)
mae_score = make_scorer(get_mae, greater_is_better=False)
score_dictionary = {'rmse': rmse_score, 'mae': mae_score}
gs = GridSearchCV(MyClassifier(df), param_grid=params, scoring=score_dictionary, cv=5, n_jobs=20, refit=False)
gs.fit(train, test)
# With refit=False and multiple metrics, best_params_ is not populated;
# look up the best RMSE candidate in cv_results_ instead.
best_idx = np.argmin(gs.cv_results_['rank_test_rmse'])
print(gs.cv_results_['params'][best_idx])
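# Optionally tabulate the mean cross-validated scores for every candidate; the values
# appear negated because both scorers were created with greater_is_better=False.
results = pd.DataFrame(gs.cv_results_)
print(results[['params', 'mean_test_rmse', 'mean_test_mae']])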