# Tuned Asymmetric SVD model
#
# Source: GitLab commit 964b6727, authored by Winnie Uyen Nguyen
# (parent e7016544, branch main).
import pandas as pd
import numpy as np
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
# Column layout used to parse the tab-separated, header-less `u.data` file.
names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(
    '/Users/winnie/Desktop/CS488/incremental-approsvd/ml-100k/u.data',
    sep='\t',
    names=names,
)

# Number of distinct users and items present in the file.
n_users = len(df.user_id.unique())
n_items = len(df.item_id.unique())
print("n_users", n_users)
print("n_items", n_items)

# Build r_{ui}, the dense user x item ratings matrix; 0 means "no rating".
ratings = np.zeros((n_users, n_items))
for record in df.itertuples():
    # The file's IDs are shifted down by one to obtain matrix indices.
    u_idx = record.user_id - 1
    i_idx = record.item_id - 1
    # Skip any record whose shifted ID does not fit inside the matrix.
    if u_idx >= n_users or i_idx >= n_items:
        continue
    # ratings[user_id, item_id] = rating
    ratings[u_idx, i_idx] = record.rating
# Split into training and test sets.
# Remove 10 ratings for each user
# and assign them to the test set
def train_test_split(ratings, n_test=10):
    """Split a ratings matrix into disjoint training and test matrices.

    For every user (row), ``n_test`` of that user's non-zero ratings are
    drawn at random without replacement (``np.random.choice``). Those
    entries are zeroed in the training copy and copied into the test matrix.

    Args:
        ratings: 2-D array indexed as ``[user, item]``; 0 means "no rating".
        n_test: Number of ratings to hold out per user. Defaults to 10,
            which is the value the original code hard-coded.

    Returns:
        A ``(train, test)`` tuple of arrays with the same shape as
        ``ratings``. No entry is non-zero in both.

    Raises:
        ValueError: If some user has fewer than ``n_test`` non-zero ratings.
    """
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        rated_items = ratings[user, :].nonzero()[0]
        # Fail with a message that names the offending user instead of the
        # generic "larger sample than population" error from NumPy.
        if rated_items.size < n_test:
            raise ValueError(
                "user %d has only %d ratings; cannot hold out %d"
                % (user, rated_items.size, n_test)
            )
        held_out = np.random.choice(rated_items, size=n_test, replace=False)
        train[user, held_out] = 0.
        test[user, held_out] = ratings[user, held_out]
    # Sanity check: test and training are truly disjoint.
    assert np.all((train * test) == 0)
    return train, test
# Hold out test ratings for every user; everything below is fitted on `train`.
train, test = train_test_split(ratings)
# 0/1 indicator of which (user, item) pairs carry a rating in the training
# set. A vectorized comparison yields the same values as the former
# per-element np.vectorize lambda, without the Python-level loop and without
# failing on an empty matrix.
indicating_mat = (train != 0).astype(int)
# Boolean form of the same indicator, used for row-wise boolean indexing.
mask = indicating_mat == 1
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
def get_rmse(pred, actual):
    """Return the root-mean-squared error of ``pred`` against ``actual``.

    Only positions where ``actual`` is non-zero are scored; zero entries of
    ``actual`` are ignored. The value is also printed with an "rmse" label.

    Args:
        pred: Array of predictions, same shape as ``actual``.
        actual: Array of reference values; 0 marks entries to skip.

    Returns:
        The RMSE over the non-zero entries of ``actual``.
    """
    # Score only the positions where `actual` holds a (non-zero) value.
    rated = actual.nonzero()
    # Take the square root explicitly: the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and later
    # removed, so relying on it breaks on current releases.
    rmse = np.sqrt(mean_squared_error(actual[rated], pred[rated]))
    print("rmse", rmse)
    return rmse
def get_mae(pred, actual):
    """Return the mean absolute error of ``pred`` against ``actual``.

    Only positions where ``actual`` is non-zero are scored; zero entries of
    ``actual`` are ignored. The value is also printed with a "mae" label.
    """
    # Select the scored positions once and reuse them for both arrays.
    rated = actual.nonzero()
    truth = actual[rated]
    estimate = pred[rated]
    mae = mean_absolute_error(truth, estimate)
    print("mae", mae)
    return mae
def get_item_item(u, mu, b_i, b_u, x_j):
    """Return the residual-weighted sum of item factors for user ``u``.

    Computes ``R[u] * sum_j (ratings[u, j] - mu - b_u[u] - b_i[j]) * x_j[j, :]``
    over the items ``j`` flagged in ``indicating_mat[u, :]``. Reads the
    module-level ``indicating_mat``, ``ratings`` and ``R``.

    Args:
        u: User (row) index.
        mu: Global bias subtracted from every rating.
        b_i: Per-item bias vector.
        b_u: Per-user bias vector.
        x_j: Item factor matrix, one row per item.

    Returns:
        1-D array with one entry per column of ``x_j``; all zeros when the
        user has no flagged items.
    """
    rated = indicating_mat[u, :].nonzero()[0]
    # Residual of each rated item after removing the baseline estimate.
    residuals = ratings[u, rated] - mu - b_u[u] - b_i[rated]
    # One vector-matrix product replaces the per-item Python loop; this
    # function sits on the hot path of both training and prediction.
    return residuals.dot(x_j[rated, :]) * R[u]
def predict_one(u, i, b_i, b_u, q_i, x_j, y_j):
    """Predict the rating of item ``i`` by user ``u``.

    The prediction is the baseline (``global_bias + b_i[i] + b_u[u]``) plus
    the dot product of ``q_i[i, :]`` with a user vector built from two parts:
    the residual-weighted ``x_j`` sum from ``get_item_item`` and the
    ``N[u]``-scaled sum of the ``y_j`` rows selected by ``mask[u, :]``.
    Reads the module-level ``global_bias``, ``N`` and ``mask``.
    """
    residual_part = get_item_item(u, global_bias, b_i, b_u, x_j)
    rated_part = N[u] * y_j[mask[u, :], :].sum(axis=0)
    user_vector = residual_part + rated_part
    baseline = global_bias + b_i[i] + b_u[u]
    return baseline + q_i[i, :].dot(user_vector)
def svdasy_step():
    """Run one stochastic-gradient pass over all training ratings.

    Visits every non-zero entry of ``train`` once, in random order, and
    updates the module-level parameters ``b_u``, ``b_i``, ``q_i``, ``x_j``
    and ``y_j`` in place. Reads the module-level ``non_zeros``, ``train``,
    ``indicating_mat``, ``N``, ``R``, ``global_bias``, ``lrate`` and
    ``_lambda``. Prints a progress line every 0x1000 samples.
    """
    size = len(non_zeros[0])
    order = np.random.permutation(size)
    for count, idx in enumerate(order, start=1):
        if count % 0x1000 == 0:
            print(count, size)
        user = non_zeros[0][idx]
        item = non_zeros[1][idx]

        pred = predict_one(user, item, b_i, b_u, q_i, x_j, y_j)
        # NOTE (from the original author): the learning rate may need to be
        # tuned separately for each parameter group.
        error = train[user, item] - pred

        b_u[user] += lrate * (error - _lambda * b_u[user])
        b_i[item] += lrate * (error - _lambda * b_i[item])

        # Items this user rated in the training set, as a plain index array.
        # The former `nonzero()` tuple index added a spurious leading axis.
        rated = indicating_mat[user, :].nonzero()[0]
        item_item = get_item_item(user, global_bias, b_i, b_u, x_j)
        rated_sum = N[user] * y_j[rated, :].sum(axis=0)

        # Update for q_i (item vector).
        q_i[item, :] += lrate * (
            error * (item_item + rated_sum) - _lambda * q_i[item, :]
        )

        # Update for every x_j the user rated. The gradient of the prediction
        # with respect to x_j[j] is R[user] * residual_j * q_i[item], so each
        # row gets its own residual weight. The previous code applied the
        # elementwise product item_item * q_i to all rows, which is not the
        # gradient of the model defined in predict_one/get_item_item.
        residuals = train[user, rated] - global_bias - b_u[user] - b_i[rated]
        x_j[rated, :] += lrate * (
            error * R[user] * np.outer(residuals, q_i[item, :])
            - _lambda * x_j[rated, :]
        )

        # Update for every y_j the user rated (same gradient for each row).
        y_grad = error * N[user] * q_i[item, :]
        y_j[rated, :] += lrate * (y_grad - _lambda * y_j[rated, :])
# Evaluation history: one [train_score, test_score] pair per predict() call.
data_rmse = []
data_mae = []


def predict():
    """Score the current model on the training and test matrices.

    Fills a dense prediction matrix by calling ``predict_one`` for every
    (user, item) pair, appends the [train, test] RMSE and MAE pairs to the
    module-level ``data_rmse`` and ``data_mae`` lists, and prints both lists.
    """
    predictions = np.zeros([n_user, n_item])
    for user, item in np.ndindex(n_users, n_items):
        predictions[user, item] = predict_one(
            user, item, b_i, b_u, q_i, x_j, y_j)

    rmse_train = get_rmse(predictions, train)
    rmse_test = get_rmse(predictions, test)
    data_rmse.append([rmse_train, rmse_test])

    mae_train = get_mae(predictions, train)
    mae_test = get_mae(predictions, test)
    data_mae.append([mae_train, mae_test])

    print(data_rmse, data_mae)
    # A commented-out sketch for appending the scores to './asym.csv' used to
    # follow here; it called a `get_mse` helper that this file does not define.
# Matrix dimensions as seen by the model.
n_user, n_item = train.shape

# Hyperparameters:
k = 50            # number of latent factors per item vector
steps = 200       # number of passes over the training ratings
lrate = 0.001     # SGD learning rate
_lambda = 0.001   # regularization weight used in every update

# Parameters:
b_u = np.zeros(n_user)   # per-user bias
b_i = np.zeros(n_item)   # per-item bias
factor_shape = (n_item, k)
init_scale = 1. / k
# q_i: item vectors dotted with the user vector in predict_one.
q_i = np.random.normal(scale=init_scale, size=factor_shape)
# x_j: item vectors weighted by rating residuals in get_item_item.
x_j = np.random.normal(scale=init_scale, size=factor_shape)
# y_j: item vectors summed over a user's rated items in predict_one.
y_j = np.random.normal(scale=init_scale, size=factor_shape)

# Mean of the observed (non-zero) training ratings.
global_bias = train[train != 0].mean()
non_zeros = train.nonzero()

# indicating_mat is 0/1, so its row norm is the square root of the number of
# rated items; N is therefore count ** -0.5 per user.
# N = np.power(indicating_mat.sum(1), -0.5)
N = 1. / np.linalg.norm(indicating_mat, axis=1)
# NOTE(review): R divides by the L2 norm of the rating *values*, whereas N
# uses the rated-item count - confirm this difference is intended.
R = 1. / np.linalg.norm(train, axis=1)

for i in range(steps):
    print(i)
    svdasy_step()
    # Evaluate on every 10th pass, starting with the first.
    if not i % 10:
        predict()
# (End of file. The GitLab page footer captured with this file is not part of the program.)