Bandit algorithms are learning algorithms that act in uncertain environments. Their objective is to maximize cumulative reward over the long run, and they are commonly used to illustrate the exploration–exploitation tradeoff.
We will explore three different bandit algorithms: Epsilon-Greedy, UCB, and EXP3.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
In this part we implement the three algorithms on one random realisation and plot the cumulative reward against time for each algorithm with its best hyper-parameter.
# setting parameters
K_arms = 5 # number of arms
T = 100000 # length of horizon
np.random.seed(100)
mean_reward = np.array([0.1, 0.3, 0.5, 0.88, 0.9])
reward_realization = np.zeros([K_arms, T])
for k in range(K_arms):
### the k-th row and t-th column of the matrix reward realization represents
### the reward of the k-th arm at time t
reward_realization[k,:] = np.random.binomial(1, mean_reward[k], [T])
# linspace for epsilon
np.linspace(0.01, 0.11, 11)
# we iterate over all the epsilon options and save the cumulative reward
reward1 = []
for i in np.linspace(0.01, 0.11, 11):
def eps_greedy(reward_realization, K, T, eps=i):
# record the mean estimators
reward_est = np.zeros([K])
# record the counts
counts = np.zeros([K])
# record the collected reward
rewards = np.zeros([T])
# Phase I: play each arm once
for t in range(K):
# play t-th arm
rewards[t] = reward_realization[t,t]
reward_est[t] = reward_realization[t,t]
counts[t] += 1
# Phase II
for t in range(K,T):
if np.random.random()>eps:
# choose the arm with largest mean estimate
action = np.argmax(reward_est)
else:
# choose a random arm
action = np.random.choice(K,1)[0]
# Update the estimate and count
rewards[t] = reward_realization[action,t]
reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
counts[action]+=1
return rewards
reward = eps_greedy(reward_realization, K_arms, T, eps=i)
reward1.append(np.sum(reward))
#reward1 = eps_greedy(reward_realization, K_arms, T)
print("The cumulative reward of epsilon greedy algorithm for np.linspace(0.01, 0.11, 11) is", reward1)
We can see from the list that the highest cumulative reward is 88797 and it happens with an epsilon of 0.03.
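As a quick sanity check, the best epsilon can also be picked programmatically from the list above (assuming reward1 and the epsilon grid from the previous cells):
eps_grid = np.linspace(0.01, 0.11, 11)
best_idx = int(np.argmax(reward1))  # position of the largest cumulative reward
print("Best epsilon:", eps_grid[best_idx], "with cumulative reward", reward1[best_idx])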
Now we will get the cumulative reward for the best hyper-parameter (eps = 0.03)
def eps_greedy(reward_realization, K, T, eps=0.03):
# record the mean estimators
reward_est = np.zeros([K])
# record the counts
counts = np.zeros([K])
# record the collected reward
rewards = np.zeros([T])
# Phase I: play each arm once
for t in range(K):
# play t-th arm
rewards[t] = reward_realization[t,t]
reward_est[t] = reward_realization[t,t]
counts[t] += 1
# Phase II
for t in range(K,T):
if np.random.random()>eps:
# choose the arm with largest mean estimate
action = np.argmax(reward_est)
else:
# choose a random arm
action = np.random.choice(K,1)[0]
# Update the estimate and count
rewards[t] = reward_realization[action,t]
reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
counts[action]+=1
return rewards
#reward1 = eps_greedy(reward_realization, K_arms, T)
#print("The cumulative reward of epsilon greedy algorithm is ", np.sum(reward1))
# get cumulative sum to plot after
reward = eps_greedy(reward_realization, K_arms, T)
np.cumsum(reward)
For the UCB algorithm, this version has no hyper-parameter to tune, so we simply run it.
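For reference, after playing each arm once, at every round $t$ the algorithm plays the arm with the largest index
$$\hat{\mu}_k + \sqrt{\frac{2\ln(t-1)}{N_k}},$$
where $\hat{\mu}_k$ is the current mean estimate of arm $k$ and $N_k$ is the number of times it has been played; this matches the UCB_tmp computation in the code below.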
def UCB(reward_realization, K, T):
# record the mean estimators
reward_est = np.zeros([K])
# record the counts
counts = np.zeros([K])
# record the collected reward
rewards = np.zeros([T])
# Phase I: play each arm once
for t in range(K):
# play t-th arm
rewards[t] = reward_realization[t,t]
reward_est[t] = reward_realization[t,t]
counts[t] += 1
# Phase II
for t in range(K,T):
# construct upper confidence bound
UCB_tmp = np.zeros([K])
for k in range(K):
UCB_tmp[k] = reward_est[k] + np.sqrt(2*np.log(t-1)/counts[k])
action = np.argmax(UCB_tmp)
# Update the estimate and count
rewards[t] = reward_realization[action,t]
reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
counts[action]+=1
return rewards
# check cumulative reward
reward2 = UCB(reward_realization, K_arms, T)
print("The cumulative reward of UCB algorithm is ", np.sum(reward2))
np.cumsum(reward2)
# check logspace for eta value
np.set_printoptions(suppress=True)
np.logspace(-6,-3, 10)
reward3 = []
for i in np.logspace(-6,-3, 10):
def EXP3(reward_realization, K, T, eta=i):
# Initialize the past reward monitor
S_vec = np.zeros([K])
# record the collected reward
rewards = np.zeros([T])
for t in range(T):
# compute the sampling probability
exp_S_vec = np.exp(eta*S_vec)
prob_vec = exp_S_vec/np.sum(exp_S_vec)
# sample an arm
action = np.random.choice(K,1,p=prob_vec)[0]
# Update the estimate and count
rewards[t] = reward_realization[action,t]
S_vec[action] += 1-(1-reward_realization[action,t])/prob_vec[action]
return rewards
reward = EXP3(reward_realization, K_arms, T, eta=i)
reward3.append(np.sum(reward))
#reward1 = eps_greedy(reward_realization, K_arms, T)
print("The cumulative reward of EXP3 algorithm for np.logspace(-6,-3, 10) is", reward3)
We can see from the list that the highest cumulative reward is 88109 and it happens with an eta of 0.00001.
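The same programmatic check can be applied to eta (assuming reward3 and the eta grid from the previous cells):
eta_grid = np.logspace(-6, -3, 10)
best_idx = int(np.argmax(reward3))  # position of the largest cumulative reward
print("Best eta:", eta_grid[best_idx], "with cumulative reward", reward3[best_idx])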
Now we will get the cumulative reward for the best hyper-parameter (eta = 0.00001)
def EXP3(reward_realization, K, T, eta=0.00001):
# Initialize the past reward monitor
S_vec = np.zeros([K])
# record the collected reward
rewards = np.zeros([T])
for t in range(T):
# compute the sampling probability
exp_S_vec = np.exp(eta*S_vec)
prob_vec = exp_S_vec/np.sum(exp_S_vec)
# sample an arm
action = np.random.choice(K,1,p=prob_vec)[0]
# Update the estimate and count
rewards[t] = reward_realization[action,t]
S_vec[action] += 1-(1-reward_realization[action,t])/prob_vec[action]
return rewards
# check cumulative reward
reward3 = EXP3(reward_realization, K_arms, T)
np.cumsum(reward3)
After finding the best hyper-parameters for the Epsilon-Greedy and EXP3 algorithms, and saving the cumulative rewards of the three algorithms, we plot them together.
plt.figure(figsize=(12,8))
plt.plot(np.cumsum(reward), label=r"Greedy ($\epsilon=0.03$)")
plt.plot(np.cumsum(reward2), label="UCB")
plt.plot(np.cumsum(reward3), label=r"EXP3 ($\eta=0.00001$)")
plt.legend(bbox_to_anchor=(1.3, 0.5))
plt.xlabel("Iterations")
plt.ylabel("Cumulative Reward")
plt.show()
From the plot, we can see that the Greedy and UCB algorithms performed well, while EXP3 lagged somewhat behind.
The UCB algorithm performs best in this example. This is likely because UCB computes a confidence interval for each arm's reward and picks the arm with the largest upper confidence bound; since the confidence interval concentrates around the true expected reward, there is a good chance of selecting a good arm.
The Epsilon-Greedy algorithm comes second, quite close to EXP3. Nevertheless, we should keep in mind that this algorithm is not fully efficient, since the exploration rate epsilon stays fixed over time.
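As an illustration of this point (not used in the experiments above), a common refinement is to let epsilon decay over time, for example $\epsilon_t = \min(1, c/t)$; a minimal sketch, modelled on the eps_greedy function above with a hypothetical constant c:
def eps_greedy_decay(reward_realization, K, T, c=5.0):
    reward_est = np.zeros(K)   # mean estimates
    counts = np.zeros(K)       # play counts
    rewards = np.zeros(T)      # collected rewards
    # Phase I: play each arm once
    for t in range(K):
        rewards[t] = reward_realization[t, t]
        reward_est[t] = reward_realization[t, t]
        counts[t] += 1
    # Phase II: explore with a shrinking probability
    for t in range(K, T):
        eps_t = min(1.0, c / (t + 1))
        if np.random.random() > eps_t:
            action = np.argmax(reward_est)
        else:
            action = np.random.choice(K)
        rewards[t] = reward_realization[action, t]
        reward_est[action] = (reward_est[action] * counts[action] + reward_realization[action, t]) / (counts[action] + 1)
        counts[action] += 1
    return rewards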
Finally, EXP3 is a robust algorithm, but it tends to be conservative; in particular, EXP3 is designed to perform well in adversarial environments.
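For reference, the EXP3 implementation above samples arm $k$ at round $t$ with probability
$$p_k(t) = \frac{\exp(\eta S_k)}{\sum_{j} \exp(\eta S_j)},$$
and after observing the reward $r$ of the chosen arm $a$ it updates the importance-weighted score $S_a \leftarrow S_a + 1 - (1 - r)/p_a(t)$. Dividing by $p_a(t)$ keeps the reward estimates unbiased even though only one arm is observed per round, which is what makes EXP3 robust to adversarial rewards but conservative in a benign stochastic setting like this one.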
In this part we repeat Part I, but now run each algorithm for m = 30 random trials. For each algorithm we use only the best hyper-parameter found in the previous part. We then plot the average cumulative reward against time, where the average is taken over the m = 30 random trials.
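The three trial loops below all follow the same pattern (seed, resample the reward table, run the algorithm, store the rewards). As a sketch of how this could be factored out, here is a hypothetical helper; run_trials and its argument names are illustrative and not part of the original code:
def run_trials(algo, m, K, T, mean_reward, **kwargs):
    # run `algo` on m freshly sampled reward realisations and average the per-step rewards
    per_trial = np.zeros((m, T))
    for i in range(m):
        np.random.seed(i + 1)  # one seed per trial, as in the loops below
        realization = np.zeros((K, T))
        for k in range(K):
            realization[k, :] = np.random.binomial(1, mean_reward[k], T)
        per_trial[i, :] = algo(realization, K, T, **kwargs)
    return per_trial.mean(axis=0)
# e.g. greedy_average_30 = run_trials(eps_greedy, 30, K_arms, T, mean_reward, eps=0.03)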
# setting parameters
K_arms = 5 # number of arms
T = 100000 # length of horizon
m = 30 # number of random trials
#np.random.seed(100)
mean_reward = np.array([0.1, 0.3, 0.5, 0.88, 0.9])
reward_realization = np.zeros([K_arms, T])
We select the best hyper-parameter found in Part I, which is eps = 0.03.
greedy_list = [None] * m
for i in range(m):
np.random.seed(i+1)
for k in range(K_arms):
### the k-th row and t-th column of the matrix reward realization represents
### the reward of the k-th arm at time t
reward_realization[k,:] = np.random.binomial(1, mean_reward[k], [T])
def eps_greedy(reward_realization, K, T, eps=0.03):
# record the mean estimators
reward_est = np.zeros([K])
# record the counts
counts = np.zeros([K])
# record the collected reward
rewards = np.zeros([T])
# Phase I: play each arm once
for t in range(K):
# play t-th arm
rewards[t] = reward_realization[t,t]
reward_est[t] = reward_realization[t,t]
counts[t] += 1
# Phase II
for t in range(K,T):
if np.random.random()>eps:
# choose the arm with largest mean estimate
action = np.argmax(reward_est)
else:
# choose a random arm
action = np.random.choice(K,1)[0]
# Update the estimate and count
rewards[t] = reward_realization[action,t]
reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
counts[action]+=1
return rewards
greedy_list[i] = eps_greedy(reward_realization, K_arms, T)
print("The cumulative reward of epsilon greedy algorithm on trial number " + str(i+1) + " is: ", np.sum(greedy_list[i]))
# get average reward per time step of the 30 trials
greedy_average_30 = np.mean(greedy_list, axis=0)
ucb_list = [None] * m
for i in range(m):
np.random.seed(i+1)
for k in range(K_arms):
### the k-th row and t-th column of the matrix reward realization represents
### the reward of the k-th arm at time t
reward_realization[k,:] = np.random.binomial(1, mean_reward[k], [T])
def UCB(reward_realization, K, T):
# record the mean estimators
reward_est = np.zeros([K])
# record the counts
counts = np.zeros([K])
# record the collected reward
rewards = np.zeros([T])
# Phase I: play each arm once
for t in range(K):
# play t-th arm
rewards[t] = reward_realization[t,t]
reward_est[t] = reward_realization[t,t]
counts[t] += 1
# Phase II
for t in range(K,T):
# construct upper confidence bound
UCB_tmp = np.zeros([K])
for k in range(K):
UCB_tmp[k] = reward_est[k] + np.sqrt(2*np.log(t-1)/counts[k])
action = np.argmax(UCB_tmp)
# Update the estimate and count
rewards[t] = reward_realization[action,t]
reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
counts[action]+=1
return rewards
ucb_list[i] = UCB(reward_realization, K_arms, T)
print("The cumulative reward of UCB algorithm on trial number " + str(i+1) + " is: ", np.sum(ucb_list[i]))
# get average reward per time step of the 30 trials
ucb_average_30 = np.mean(ucb_list, axis=0)
We select the best hyper-parameter found in Part I, which is eta = 0.00001.
exp3_list = [None] * m
for i in range(m):
np.random.seed(i+1)
for k in range(K_arms):
### the k-th row and t-th column of the matrix reward realization represents
### the reward of the k-th arm at time t
reward_realization[k,:] = np.random.binomial(1, mean_reward[k], [T])
def EXP3(reward_realization, K, T, eta=0.00001):
# Initialize the past reward monitor
S_vec = np.zeros([K])
# record the collected reward
rewards = np.zeros([T])
for t in range(T):
# compute the sampling probability
exp_S_vec = np.exp(eta*S_vec)
prob_vec = exp_S_vec/np.sum(exp_S_vec)
# sample an arm
action = np.random.choice(K,1,p=prob_vec)[0]
# Update the estimate and count
rewards[t] = reward_realization[action,t]
S_vec[action] += 1-(1-reward_realization[action,t])/prob_vec[action]
return rewards
exp3_list[i] = EXP3(reward_realization, K_arms, T)
print("The cumulative reward of EXP3 algorithm on trial number " + str(i+1) + " is: ", np.sum(exp3_list[i]))
# get average reward per time step of the 30 trials
exp3_average_30 = np.mean(exp3_list, axis=0)
plt.figure(figsize=(12,8))
plt.plot(np.cumsum(greedy_average_30), label=r"Greedy ($\epsilon=0.03$)")
plt.plot(np.cumsum(ucb_average_30), label="UCB")
plt.plot(np.cumsum(exp3_average_30), label=r"EXP3 ($\eta=0.00001$)")
plt.legend(bbox_to_anchor=(1.3, 0.5))
plt.xlabel("Iterations")
plt.ylabel("Cumulative Reward")
plt.show()
From the plot, we see a similar ordering to Part I, with UCB as the best algorithm and Epsilon-Greedy following close behind. We note that the gap between the cumulative rewards of the Epsilon-Greedy and UCB algorithms is smaller than in Part I. By running 30 random trials and averaging the reward per time step, we get a better picture of each algorithm's typical performance and reduce the bias that comes from looking at a single trial.
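To make the run-to-run variability explicit, a short check (assuming greedy_list, ucb_list, and exp3_list still hold the 30 per-trial reward sequences from the loops above) can compare the mean and standard deviation of the total reward across trials:
for name, runs in [("Greedy", greedy_list), ("UCB", ucb_list), ("EXP3", exp3_list)]:
    totals = np.array([np.sum(r) for r in runs])  # total reward of each of the 30 trials
    print(name, "| mean total reward:", totals.mean(), "| std across trials:", totals.std())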
We can also generate other types of visualizations:
# running average of the (trial-averaged) reward, computed with cumulative sums
steps = np.arange(1, T + 1)
average_reward1 = np.cumsum(greedy_average_30) / steps
average_reward2 = np.cumsum(ucb_average_30) / steps
average_reward3 = np.cumsum(exp3_average_30) / steps
plt.figure(figsize=(12,8))
plt.plot(average_reward1, label=r"Greedy ($\epsilon=0.03$)")
plt.plot(average_reward2, label="UCB")
plt.plot(average_reward3, label=r"EXP3 ($\eta=0.00001$)")
plt.legend(bbox_to_anchor=(1.3, 0.5))
plt.xlabel("Iterations")
plt.ylabel("Average Cumulative Reward")
plt.show()
What is interesting about this Average Cumulative Reward plot is that it lets us see the learning behaviour of the different algorithms, in particular the comparison between the Epsilon-Greedy and UCB algorithms.
UCB achieves a higher average cumulative reward, but looking more closely, it is a slower learner than Epsilon-Greedy. At the beginning all arms have wide confidence intervals, so UCB leans more toward exploration than exploitation. Because Epsilon-Greedy spends more time exploiting, it rises faster toward its peak average value. UCB, with its stronger early exploration, improves more slowly but ultimately outperforms Epsilon-Greedy; from the plot below, the crossover happens around iteration 6000.
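As a quick check (assuming average_reward1 and average_reward2 are the running averages computed above), the crossover point can be located programmatically; note that np.argmax returns the first index where the condition holds:
avg_greedy = np.asarray(average_reward1)
avg_ucb = np.asarray(average_reward2)
start = 100  # skip the very noisy initial iterations
crossover = start + int(np.argmax(avg_ucb[start:] > avg_greedy[start:]))
print("UCB's running average first exceeds Epsilon-Greedy's at iteration", crossover)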
plt.figure(figsize=(12,8))
plt.plot(average_reward1, label=r"Greedy ($\epsilon=0.03$)")
plt.plot(average_reward2, label="UCB")
plt.plot(average_reward3, label=r"EXP3 ($\eta=0.00001$)")
plt.axis((0,20000, 0,1))
plt.legend(bbox_to_anchor=(1.3, 0.5))
plt.xlabel("Iterations")
plt.ylabel("Average Cumulative Reward")
plt.show()
In this part we redo Part II, but with larger reward variances (sigma = 4 and sigma = 25). At the end of the section we explain the effect of the larger variance on the algorithms' performance.
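Concretely, the code below draws rewards from a Binomial(n, p) distribution with n = 4 and n = 25 instead of Bernoulli(p), which scales both the mean (n p) and the variance (n p (1 - p)) of each arm. A quick check of the per-arm means and variances, using the mean_reward vector defined above:
for n in [1, 4, 25]:
    print("n =", n,
          "| means:", n * mean_reward,
          "| variances:", np.round(n * mean_reward * (1 - mean_reward), 3))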
# setting parameters
K_arms = 5 # number of arms
T = 100000 # length of horizon
np.random.seed(100)
mean_reward = np.array([0.1, 0.3, 0.5, 0.88, 0.9])
reward_realization = np.zeros([K_arms, T])
for k in range(K_arms):
### the k-th row and t-th column of the matrix reward realization represents
### the reward of the k-th arm at time t
reward_realization[k,:] = np.random.binomial(4, mean_reward[k], [T])
np.linspace(0.01, 0.11, 11)
### check for best hyperparameter
reward1 = []
for i in np.linspace(0.01, 0.11, 11):
def eps_greedy(reward_realization, K, T, eps=i):
# record the mean estimators
reward_est = np.zeros([K])
# record the counts
counts = np.zeros([K])
# record the collected reward
rewards = np.zeros([T])
# Phase I: play each arm once
for t in range(K):
# play t-th arm
rewards[t] = reward_realization[t,t]
reward_est[t] = reward_realization[t,t]
counts[t] += 1
# Phase II
for t in range(K,T):
if np.random.random()>eps:
# choose the arm with largest mean estimate
action = np.argmax(reward_est)
else:
# choose a random arm
action = np.random.choice(K,1)[0]
# Update the estimate and count
rewards[t] = reward_realization[action,t]
reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
counts[action]+=1
return rewards
reward = eps_greedy(reward_realization, K_arms, T, eps=i)
reward1.append(np.sum(reward))
#reward1 = eps_greedy(reward_realization, K_arms, T)
print("The cumulative reward of epsilon greedy algorithm for np.linspace(0.01, 0.11, 11) is", reward1)
We take eps = 0.01 as the best hyper-parameter.
# setting parameters
K_arms = 5 # number of arms
T = 100000 # length of horizon
m = 30 # number of random trials
#np.random.seed(100)
mean_reward = np.array([0.1, 0.3, 0.5, 0.88, 0.9])
reward_realization = np.zeros([K_arms, T])
greedy_list = [None] * m
for i in range(m):
np.random.seed(i+1)
for k in range(K_arms):
### the k-th row and t-th column of the matrix reward realization represents
### the reward of the k-th arm at time t
reward_realization[k,:] = np.random.binomial(4, mean_reward[k], [T])
def eps_greedy(reward_realization, K, T, eps=0.01):
# record the mean estimators
reward_est = np.zeros([K])
# record the counts
counts = np.zeros([K])
# record the collected reward
rewards = np.zeros([T])
# Phase I: play each arm once
for t in range(K):
# play t-th arm
rewards[t] = reward_realization[t,t]
reward_est[t] = reward_realization[t,t]
counts[t] += 1
# Phase II
for t in range(K,T):
if np.random.random()>eps:
# choose the arm with largest mean estimate
action = np.argmax(reward_est)
else:
# choose a random arm
action = np.random.choice(K,1)[0]
# Update the estimate and count
rewards[t] = reward_realization[action,t]
reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
counts[action]+=1
return rewards
greedy_list[i] = eps_greedy(reward_realization, K_arms, T)
print("The cumulative reward of epsilon greedy algorithm on trial number " + str(i+1) + " is: ", np.sum(greedy_list[i]))
# get average reward per time step of the 30 trials
greedy_average_30 = np.mean(greedy_list, axis=0)
ucb_list = [None] * m
for i in range(m):
np.random.seed(i+1)
for k in range(K_arms):
### the k-th row and t-th column of the matrix reward realization represents
### the reward of the k-th arm at time t
reward_realization[k,:] = np.random.binomial(4, mean_reward[k], [T])
def UCB(reward_realization, K, T):
# record the mean estimators
reward_est = np.zeros([K])
# record the counts
counts = np.zeros([K])
# record the collected reward
rewards = np.zeros([T])
# Phase I: play each arm once
for t in range(K):
# play t-th arm
rewards[t] = reward_realization[t,t]
reward_est[t] = reward_realization[t,t]
counts[t] += 1
# Phase II
for t in range(K,T):
# construct upper confidence bound
UCB_tmp = np.zeros([K])
for k in range(K):
UCB_tmp[k] = reward_est[k] + np.sqrt(2*np.log(t-1)/counts[k])
action = np.argmax(UCB_tmp)
# Update the estimate and count
rewards[t] = reward_realization[action,t]
reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
counts[action]+=1
return rewards
ucb_list[i] = UCB(reward_realization, K_arms, T)
print("The cumulative reward of UCB algorithm on trial number " + str(i+1) + " is: ", np.sum(ucb_list[i]))
# get average reward per time step of the 30 trials
ucb_average_30 = np.mean(ucb_list, axis=0)
np.logspace(-6,-3, 10)
### get best hyper-parameter
reward3 = []
for i in np.logspace(-6,-3, 10):
def EXP3(reward_realization, K, T, eta=i):
# Initialize the past reward monitor
S_vec = np.zeros([K])
# record the collected reward
rewards = np.zeros([T])
for t in range(T):
# compute the sampling probability
exp_S_vec = np.exp(eta*S_vec)
prob_vec = exp_S_vec/np.sum(exp_S_vec)
# sample an arm
action = np.random.choice(K,1,p=prob_vec)[0]
# Update the estimate and count
rewards[t] = reward_realization[action,t]
S_vec[action] += 1-(1-reward_realization[action,t])/prob_vec[action]
return rewards
reward = EXP3(reward_realization, K_arms, T, eta=i)
reward3.append(np.sum(reward))
#reward1 = eps_greedy(reward_realization, K_arms, T)
print("The cumulative reward of EXP3 algorithm for np.logspace(-6,-3, 10) is", reward3)
Best hyper-parameter is eta = 0.001
exp3_list = [None] * m
for i in range(m):
np.random.seed(i+1)
for k in range(K_arms):
### the k-th row and t-th column of the matrix reward realization represents
### the reward of the k-th arm at time t
reward_realization[k,:] = np.random.binomial(4, mean_reward[k], [T])
def EXP3(reward_realization, K, T, eta=0.001):
# Initialize the past reward monitor
S_vec = np.zeros([K])
# record the collected reward
rewards = np.zeros([T])
for t in range(T):
# compute the sampling probability
exp_S_vec = np.exp(eta*S_vec)
prob_vec = exp_S_vec/np.sum(exp_S_vec)
# sample an arm
action = np.random.choice(K,1,p=prob_vec)[0]
# Update the estimate and count
rewards[t] = reward_realization[action,t]
S_vec[action] += 1-(1-reward_realization[action,t])/prob_vec[action]
return rewards
exp3_list[i] = EXP3(reward_realization, K_arms, T)
print("The cumulative reward of EXP3 algorithm on trial number " + str(i+1) + " is: ", np.sum(exp3_list[i]))
# get average reward per time step of the 30 trials
exp3_average_30 = np.mean(exp3_list, axis=0)
plt.figure(figsize=(12,8))
plt.plot(greedy_average_30, label=r"Greedy ($\epsilon=0.01$)")
plt.plot(ucb_average_30, label="UCB")
plt.plot(exp3_average_30, label=r"EXP3 ($\eta=0.001$)")
plt.legend(bbox_to_anchor=(1.3, 0.5))
plt.xlabel("Iterations")
plt.ylabel("Average Reward per Time Step")
plt.show()
np.linspace(0.01, 0.11, 11)
### check for best hyperparameter
reward1 = []
for i in np.linspace(0.01, 0.11, 11):
def eps_greedy(reward_realization, K, T, eps=i):
# record the mean estimators
reward_est = np.zeros([K])
# record the counts
counts = np.zeros([K])
# record the collected reward
rewards = np.zeros([T])
# Phase I: play each arm once
for t in range(K):
# play t-th arm
rewards[t] = reward_realization[t,t]
reward_est[t] = reward_realization[t,t]
counts[t] += 1
# Phase II
for t in range(K,T):
if np.random.random()>eps:
# choose the arm with largest mean estimate
action = np.argmax(reward_est)
else:
# choose a random arm
action = np.random.choice(K,1)[0]
# Update the estimate and count
rewards[t] = reward_realization[action,t]
reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
counts[action]+=1
return rewards
reward = eps_greedy(reward_realization, K_arms, T, eps=i)
reward1.append(np.sum(reward))
#reward1 = eps_greedy(reward_realization, K_arms, T)
print("The cumulative reward of epsilon greedy algorithm for np.linspace(0.01, 0.11, 11) is", reward1)
Best eps = 0.01
greedy_list = [None] * m
for i in range(m):
np.random.seed(i+1)
for k in range(K_arms):
### the k-th row and t-th column of the matrix reward realization represents
### the reward of the k-th arm at time t
reward_realization[k,:] = np.random.binomial(25, mean_reward[k], [T])
def eps_greedy(reward_realization, K, T, eps=0.01):
# record the mean estimators
reward_est = np.zeros([K])
# record the counts
counts = np.zeros([K])
# record the collected reward
rewards = np.zeros([T])
# Phase I: play each arm once
for t in range(K):
# play t-th arm
rewards[t] = reward_realization[t,t]
reward_est[t] = reward_realization[t,t]
counts[t] += 1
# Phase II
for t in range(K,T):
if np.random.random()>eps:
# choose the arm with largest mean estimate
action = np.argmax(reward_est)
else:
# choose a random arm
action = np.random.choice(K,1)[0]
# Update the estimate and count
rewards[t] = reward_realization[action,t]
reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
counts[action]+=1
return rewards
greedy_list[i] = eps_greedy(reward_realization, K_arms, T)
print("The cumulative reward of epsilon greedy algorithm on trial number " + str(i+1) + " is: ", np.sum(greedy_list[i]))
# get average reward per time step of the 30 trials
greedy_average_30 = np.mean(greedy_list, axis=0)
ucb_list = [None] * m
for i in range(m):
np.random.seed(i+1)
for k in range(K_arms):
### the k-th row and t-th column of the matrix reward realization represents
### the reward of the k-th arm at time t
reward_realization[k,:] = np.random.binomial(25, mean_reward[k], [T])
def UCB(reward_realization, K, T):
# record the mean estimators
reward_est = np.zeros([K])
# record the counts
counts = np.zeros([K])
# record the collected reward
rewards = np.zeros([T])
# Phase I: play each arm once
for t in range(K):
# play t-th arm
rewards[t] = reward_realization[t,t]
reward_est[t] = reward_realization[t,t]
counts[t] += 1
# Phase II
for t in range(K,T):
# construct upper confidence bound
UCB_tmp = np.zeros([K])
for k in range(K):
UCB_tmp[k] = reward_est[k] + np.sqrt(2*np.log(t-1)/counts[k])
action = np.argmax(UCB_tmp)
# Update the estimate and count
rewards[t] = reward_realization[action,t]
reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
counts[action]+=1
return rewards
ucb_list[i] = UCB(reward_realization, K_arms, T)
print("The cumulative reward of UCB algorithm on trial number " + str(i+1) + " is: ", np.sum(ucb_list[i]))
# get average reward per time step of the 30 trials
ucb_average_30 = np.mean(ucb_list, axis=0)
np.logspace(-6,-3, 10)
### get best hyper-parameter
reward3 = []
for i in np.logspace(-6,-3, 10):
def EXP3(reward_realization, K, T, eta=i):
# Initialize the past reward monitor
S_vec = np.zeros([K])
# record the collected reward
rewards = np.zeros([T])
for t in range(T):
# compute the sampling probability
exp_S_vec = np.exp(eta*S_vec)
prob_vec = exp_S_vec/np.sum(exp_S_vec)
# sample an arm
action = np.random.choice(K,1,p=prob_vec)[0]
# Update the estimate and count
rewards[t] = reward_realization[action,t]
S_vec[action] += 1-(1-reward_realization[action,t])/prob_vec[action]
return rewards
reward = EXP3(reward_realization, K_arms, T, eta=i)
reward3.append(np.sum(reward))
#reward1 = eps_greedy(reward_realization, K_arms, T)
print("The cumulative reward of EXP3 algorithm for np.logspace(-6,-3, 10) is", reward3)
Best eta = 0.001
exp3_list = [None] * m
for i in range(m):
np.random.seed(i+1)
for k in range(K_arms):
### the k-th row and t-th column of the matrix reward realization represents
### the reward of the k-th arm at time t
reward_realization[k,:] = np.random.binomial(25, mean_reward[k], [T])
def EXP3(reward_realization, K, T, eta=0.001):
# Initialize the past reward monitor
S_vec = np.zeros([K])
# record the collected reward
rewards = np.zeros([T])
for t in range(T):
# compute the sampling probability
exp_S_vec = np.exp(eta*S_vec)
prob_vec = exp_S_vec/np.sum(exp_S_vec)
# sample an arm
action = np.random.choice(K,1,p=prob_vec)[0]
# Update the estimate and count
rewards[t] = reward_realization[action,t]
S_vec[action] += 1-(1-reward_realization[action,t])/prob_vec[action]
return rewards
exp3_list[i] = EXP3(reward_realization, K_arms, T)
print("The cumulative reward of EXP3 algorithm on trial number " + str(i+1) + " is: ", np.sum(exp3_list[i]))
plt.figure(figsize=(12,8))
plt.plot(greedy_average_30, label=r"Greedy ($\epsilon=0.01$)")
plt.plot(ucb_average_30, label="UCB")
#plt.plot(exp3_average_30, label=r"EXP3 ($\eta=0.001$)")
plt.legend(bbox_to_anchor=(1.3, 0.5))
plt.xlabel("Iterations")
plt.ylabel("Average Reward per Time Step")
plt.show()
After plotting the different scenarios with larger variances (sigma = 4 and sigma = 25) for the reward of each arm, and updating the hyper-parameters accordingly, we observe that as the variance grows the average reward per time step settles around a higher value. On the other hand, we also see wider confidence intervals and higher uncertainty at every iteration.
Also, in the sigma = 25 case the EXP3 algorithm sometimes fails: the sampling probabilities become invalid (the exponential weights overflow), and the algorithm breaks down.
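One common remedy (not part of the experiments above) is to rescale rewards into [0, 1] before the EXP3 update, since the update assumes bounded rewards; a minimal sketch, modelled on the EXP3 function above and assuming the maximum possible reward r_max is known (here the binomial parameter, e.g. 25):
def EXP3_scaled(reward_realization, K, T, eta=0.001, r_max=25.0):
    # same structure as EXP3 above, but rewards are divided by r_max so they lie in [0, 1]
    S_vec = np.zeros(K)
    rewards = np.zeros(T)
    for t in range(T):
        exp_S_vec = np.exp(eta * S_vec)
        prob_vec = exp_S_vec / np.sum(exp_S_vec)
        action = np.random.choice(K, 1, p=prob_vec)[0]
        rewards[t] = reward_realization[action, t]
        r_scaled = reward_realization[action, t] / r_max  # reward mapped into [0, 1]
        S_vec[action] += 1 - (1 - r_scaled) / prob_vec[action]
    return rewards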