Bandits Algorithms Exercise¶

Bandits algorithms are a type of learning algorithm that can perform in uncertain environments. The objective is to maximize its cumulative reward in the long term, and they are commonly used to portray the exploration–exploitation tradeoff dilemma

We will be exploring three different bandits algorithms:

Epsilon-greedy: a simple method to balance exploration and exploitation by choosing between exploration and exploitation randomly
Upper Confidence Bound (UCB): this algorithm changes its exploration-exploitation balance as it gathers more knowledge of the environment
EXP3: popular algorithm for adversarial multiarmed bandits, in each round of the game EXP3 picks an arm according to a Gibbs distribution based on upper confidence bounds for each arm, and plays it

import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 
%matplotlib inline

Part I¶

For this part we will implement the three algorithms for one random realisation, and plot the cumulative reward against time for each algorithm with its best hyper-parameter

# setting parameters
K_arms = 5 # number of arms
T = 100000 # length of horizon
np.random.seed(100)
mean_reward = np.array([0.1, 0.3, 0.5, 0.88, 0.9])
reward_realization = np.zeros([K_arms, T])
for k in range(K_arms):
    ### the k-th row and t-th column of the matrix reward realization represents
    ### the reward of the k-th arm at time t
    reward_realization[k,:] = np.random.binomial(1, mean_reward[k], [T])

Epsilon-greedy algorithm¶

# linspace for epsilon
np.linspace(0.01, 0.11, 11)

array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11])

# we iterate over all the epsilon options and save the cumulative reward

reward1 = []

for i in np.linspace(0.01, 0.11, 11):
    
    def eps_greedy(reward_realization, K, T, eps=i):
        # record the mean estimators
        reward_est = np.zeros([K])
        # record the counts
        counts = np.zeros([K])

        # record the collected reward
        rewards = np.zeros([T])

        # Phase I: play each arm once
        for t in range(K):
            # play t-th arm
            rewards[t] = reward_realization[t,t]
            reward_est[t] = reward_realization[t,t]
            counts[t] += 1     

        # Phase II
        for t in range(K,T):
            if np.random.random()>eps:
                # choose the arm with largest mean estimate
                action = np.argmax(reward_est)
            else:
                # choose a random arm
                action = np.random.choice(K,1)[0]

            # Update the estimate and count
            rewards[t] = reward_realization[action,t]
            reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
            counts[action]+=1

        
        return rewards
    reward = eps_greedy(reward_realization, K_arms, T)
    reward1.append(np.sum(reward))
#reward1 = eps_greedy(reward_realization, K_arms, T)
print("The cumulative reward of epsilon greedy algorithm for np.linspace(0.01, 0.11, 11) is", reward1)

The cumulative reward of epsilon greedy algorithm for np.linspace(0.01, 0.11, 11) is [87591.0, 87518.0, 88797.0, 88320.0, 88178.0, 87148.0, 86230.0, 86931.0, 86730.0, 86060.0, 85948.0]

We can see from the list that the highest cumulative reward is 88797 and it happens with an epsilon of 0.03.

Now we will get the cumulative reward for the best hyper-parameter (eps = 0.03)

def eps_greedy(reward_realization, K, T, eps=0.03):
    # record the mean estimators
    reward_est = np.zeros([K])
    # record the counts
    counts = np.zeros([K])
    
    # record the collected reward
    rewards = np.zeros([T])
    
    # Phase I: play each arm once
    for t in range(K):
        # play t-th arm
        rewards[t] = reward_realization[t,t]
        reward_est[t] = reward_realization[t,t]
        counts[t] += 1     
    
    # Phase II
    for t in range(K,T):
        if np.random.random()>eps:
            # choose the arm with largest mean estimate
            action = np.argmax(reward_est)
        else:
            # choose a random arm
            action = np.random.choice(K,1)[0]
        
        # Update the estimate and count
        rewards[t] = reward_realization[action,t]
        reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
        counts[action]+=1
    
    return rewards



#reward1 = eps_greedy(reward_realization, K_arms, T)
#print("The cumulative reward of epsilon greedy algorithm is ", np.sum(reward1))

# get cumulative sum to plot after

reward = eps_greedy(reward_realization, K_arms, T)
np.cumsum(reward)

array([0.0000e+00, 1.0000e+00, 2.0000e+00, ..., 8.7163e+04, 8.7164e+04,
       8.7165e+04])

UCB algorithm¶

In the case of the UCB algorithm, in this version we do not have any hyper-parameter to set, so we just run the algorithm

def UCB(reward_realization, K, T):
    # record the mean estimators
    reward_est = np.zeros([K])
    # record the counts
    counts = np.zeros([K])
    
    # record the collected reward
    rewards = np.zeros([T])
    
    # Phase I: play each arm once
    for t in range(K):
        # play t-th arm
        rewards[t] = reward_realization[t,t]
        reward_est[t] = reward_realization[t,t]
        counts[t] += 1     
    
    # Phase II
    for t in range(K,T):
        # construct upper confidence bound
        UCB_tmp = np.zeros([K])
        for k in range(K):
            UCB_tmp[k] = reward_est[k] + np.sqrt(2*np.log(t-1)/counts[k])
        action = np.argmax(UCB_tmp)
        
        # Update the estimate and count
        rewards[t] = reward_realization[action,t]
        reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
        counts[action]+=1
    
    return rewards

# check cumulative reward

reward2 = UCB(reward_realization, K_arms, T)


print("The cumulative reward of UCB algorithm is ", np.sum(reward2))

The cumulative reward of UCB algorithm is  89525.0

np.cumsum(reward2)

array([0.0000e+00, 1.0000e+00, 2.0000e+00, ..., 8.9523e+04, 8.9524e+04,
       8.9525e+04])

EXP3 algorithm¶

# check logspace for eta value
np.set_printoptions(suppress=True)
np.logspace(-6,-3, 10)

array([0.000001  , 0.00000215, 0.00000464, 0.00001   , 0.00002154,
       0.00004642, 0.0001    , 0.00021544, 0.00046416, 0.001     ])

reward3 = []

for i in np.logspace(-6,-3, 10):
    def EXP3(reward_realization, K, T, eta=i):
        # Initialize the past reward monitor
        S_vec = np.zeros([K])

        # record the collected reward
        rewards = np.zeros([T])

        for t in range(T):

            # compute the sampling probability
            exp_S_vec = np.exp(eta*S_vec)
            prob_vec = exp_S_vec/np.sum(exp_S_vec)

            # sample an arm
            action = np.random.choice(K,1,p=prob_vec)[0] 

            # Update the estimate and count
            rewards[t] = reward_realization[action,t]
            S_vec[action] += 1-(1-reward_realization[action,t])/prob_vec[action]

        return rewards
    reward = eps_greedy(reward_realization, K_arms, T)
    reward3.append(np.sum(reward))
#reward1 = eps_greedy(reward_realization, K_arms, T)
print("The cumulative reward of EXP3 algorithm for np.logspace(-6,-3, 10) is", reward3)

The cumulative reward of EXP3 algorithm for np.logspace(-6,-3, 10) is [87622.0, 88065.0, 87837.0, 88109.0, 87872.0, 87624.0, 87914.0, 87566.0, 86241.0, 87739.0]

We can see from the list that the highest cumulative reward is 88109 and it happens with an eta of 0.00001.

Now we will get the cumulative reward for the best hyper-parameter (eta = 0.00001)

def EXP3(reward_realization, K, T, eta=0.00001):
        # Initialize the past reward monitor
        S_vec = np.zeros([K])

        # record the collected reward
        rewards = np.zeros([T])

        for t in range(T):

            # compute the sampling probability
            exp_S_vec = np.exp(eta*S_vec)
            prob_vec = exp_S_vec/np.sum(exp_S_vec)

            # sample an arm
            action = np.random.choice(K,1,p=prob_vec)[0] 

            # Update the estimate and count
            rewards[t] = reward_realization[action,t]
            S_vec[action] += 1-(1-reward_realization[action,t])/prob_vec[action]

        return rewards

# check cumulative reward
reward3 = EXP3(reward_realization, K_arms, T)
np.cumsum(reward3)

array([    0.,     1.,     2., ..., 58957., 58957., 58958.])

Plotting cumulative rewards of the three algorithms¶

After getting best hyper-parameter for the Greedy and EXP3 models, and saving the cumulative rewards of the three models, we will proceed to plot them

plt.figure(figsize=(12,8))
plt.plot(np.cumsum(reward), label="Greedy ($\epsilon=0.03$)")
plt.plot(np.cumsum(reward2), label="UCB")
plt.plot(np.cumsum(reward3), label="EXP3 ($\eta=0.00001$)")
plt.legend(bbox_to_anchor=(1.3, 0.5))
plt.xlabel("Iterations")
plt.ylabel("Cumulative Reward")

plt.show()

From the plot, we can see that the Greedy and UCB algorithms performed well, with a lower relative performance of algorithm EXP3.

UCB algorithm performs the best in this example, and it could be related to that UCB algorithm computes a confidence interval for each arm's reward and picks the arm with largest upper confidence bound. The confidence interval stays around the true expected reward, so there is a large chance that we pick a good arm.

Epsilon-Greedy algorithm is the second best, really close to EXP3. Nevertheless, we should take into account that this algorithm is not completely efficient, as the exploration rate epsilon is fixed over time.

Finally, EXP3 algorithm is a robust algorithm but it may tend to be conservative. In particular, EXP3 algorithm is designed to perform better in an adversarial environment.

Part II¶

For this part we will do the same as part I but implement each algorithm for m = 30 random trials. For each algorithm, we will only implement its best hyper-parameter found in the previous part. We will also plot the average cumulative reward against time, where the average is taken with respect to m = 30 random trials.

# setting parameters
K_arms = 5 # number of arms
T = 100000 # length of horizon
m = 30 # number or random trials
#np.random.seed(100)
mean_reward = np.array([0.1, 0.3, 0.5, 0.88, 0.9])
reward_realization = np.zeros([K_arms, T])

Epsilon-greedy algorithm¶

We select the best parameter of section (a) which is eps = 0.03

greedy_list = []

for i in range (m):
    
    greedy_list.append(str(i))

for i in range(m):
    
    np.random.seed(i+1)
    for k in range(K_arms):
    ### the k-th row and t-th column of the matrix reward realization represents
    ### the reward of the k-th arm at time t
        reward_realization[k,:] = np.random.binomial(1, mean_reward[k], [T])
    
    
    def eps_greedy(reward_realization, K, T, eps=0.03):
        # record the mean estimators
        reward_est = np.zeros([K])
        # record the counts
        counts = np.zeros([K])

        # record the collected reward
        rewards = np.zeros([T])

        # Phase I: play each arm once
        for t in range(K):
            # play t-th arm
            rewards[t] = reward_realization[t,t]
            reward_est[t] = reward_realization[t,t]
            counts[t] += 1     

        # Phase II
        for t in range(K,T):
            if np.random.random()>eps:
                # choose the arm with largest mean estimate
                action = np.argmax(reward_est)
            else:
                # choose a random arm
                action = np.random.choice(K,1)[0]

            # Update the estimate and count
            rewards[t] = reward_realization[action,t]
            reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
            counts[action]+=1

        return rewards



    greedy_list[i] = eps_greedy(reward_realization, K_arms, T)
    print("The cumulative reward of epsilon greedy algorithm on trial number " + str(i+1) + " is: ", np.sum(greedy_list[i]))

The cumulative reward of epsilon greedy algorithm on trial number 1 is:  88542.0
The cumulative reward of epsilon greedy algorithm on trial number 2 is:  88925.0
The cumulative reward of epsilon greedy algorithm on trial number 3 is:  87045.0
The cumulative reward of epsilon greedy algorithm on trial number 4 is:  88931.0
The cumulative reward of epsilon greedy algorithm on trial number 5 is:  88900.0
The cumulative reward of epsilon greedy algorithm on trial number 6 is:  88816.0
The cumulative reward of epsilon greedy algorithm on trial number 7 is:  88942.0
The cumulative reward of epsilon greedy algorithm on trial number 8 is:  88810.0
The cumulative reward of epsilon greedy algorithm on trial number 9 is:  88995.0
The cumulative reward of epsilon greedy algorithm on trial number 10 is:  88570.0
The cumulative reward of epsilon greedy algorithm on trial number 11 is:  88805.0
The cumulative reward of epsilon greedy algorithm on trial number 12 is:  88971.0
The cumulative reward of epsilon greedy algorithm on trial number 13 is:  88794.0
The cumulative reward of epsilon greedy algorithm on trial number 14 is:  88606.0
The cumulative reward of epsilon greedy algorithm on trial number 15 is:  88797.0
The cumulative reward of epsilon greedy algorithm on trial number 16 is:  88464.0
The cumulative reward of epsilon greedy algorithm on trial number 17 is:  89010.0
The cumulative reward of epsilon greedy algorithm on trial number 18 is:  88465.0
The cumulative reward of epsilon greedy algorithm on trial number 19 is:  88861.0
The cumulative reward of epsilon greedy algorithm on trial number 20 is:  89088.0
The cumulative reward of epsilon greedy algorithm on trial number 21 is:  88922.0
The cumulative reward of epsilon greedy algorithm on trial number 22 is:  88508.0
The cumulative reward of epsilon greedy algorithm on trial number 23 is:  88782.0
The cumulative reward of epsilon greedy algorithm on trial number 24 is:  88750.0
The cumulative reward of epsilon greedy algorithm on trial number 25 is:  88893.0
The cumulative reward of epsilon greedy algorithm on trial number 26 is:  88880.0
The cumulative reward of epsilon greedy algorithm on trial number 27 is:  88795.0
The cumulative reward of epsilon greedy algorithm on trial number 28 is:  86936.0
The cumulative reward of epsilon greedy algorithm on trial number 29 is:  88945.0
The cumulative reward of epsilon greedy algorithm on trial number 30 is:  88960.0

# get average reward per time step of the 30 trials 

greedy_average_30 = np.mean(greedy_list, axis=0)

UCB algorithm¶

ucb_list = []

for i in range (m):
    
    ucb_list.append(str(i))

for i in range(m):
    
    np.random.seed(i+1)
    for k in range(K_arms):
    ### the k-th row and t-th column of the matrix reward realization represents
    ### the reward of the k-th arm at time t
        reward_realization[k,:] = np.random.binomial(1, mean_reward[k], [T])
    
    
    def UCB(reward_realization, K, T):
        # record the mean estimators
        reward_est = np.zeros([K])
        # record the counts
        counts = np.zeros([K])

        # record the collected reward
        rewards = np.zeros([T])

        # Phase I: play each arm once
        for t in range(K):
            # play t-th arm
            rewards[t] = reward_realization[t,t]
            reward_est[t] = reward_realization[t,t]
            counts[t] += 1     

        # Phase II
        for t in range(K,T):
            # construct upper confidence bound
            UCB_tmp = np.zeros([K])
            for k in range(K):
                UCB_tmp[k] = reward_est[k] + np.sqrt(2*np.log(t-1)/counts[k])
            action = np.argmax(UCB_tmp)

            # Update the estimate and count
            rewards[t] = reward_realization[action,t]
            reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
            counts[action]+=1

        return rewards   

    ucb_list[i] = UCB(reward_realization, K_arms, T)


    print("The cumulative reward of UCB algorithm on trial number " + str(i+1) + " is: ", np.sum(ucb_list[i]))

The cumulative reward of UCB algorithm on trial number 1 is:  89732.0
The cumulative reward of UCB algorithm on trial number 2 is:  89540.0
The cumulative reward of UCB algorithm on trial number 3 is:  89600.0
The cumulative reward of UCB algorithm on trial number 4 is:  89597.0
The cumulative reward of UCB algorithm on trial number 5 is:  89486.0
The cumulative reward of UCB algorithm on trial number 6 is:  89525.0
The cumulative reward of UCB algorithm on trial number 7 is:  89565.0
The cumulative reward of UCB algorithm on trial number 8 is:  89424.0
The cumulative reward of UCB algorithm on trial number 9 is:  89678.0
The cumulative reward of UCB algorithm on trial number 10 is:  89604.0
The cumulative reward of UCB algorithm on trial number 11 is:  89580.0
The cumulative reward of UCB algorithm on trial number 12 is:  89587.0
The cumulative reward of UCB algorithm on trial number 13 is:  89521.0
The cumulative reward of UCB algorithm on trial number 14 is:  89561.0
The cumulative reward of UCB algorithm on trial number 15 is:  89476.0
The cumulative reward of UCB algorithm on trial number 16 is:  89577.0
The cumulative reward of UCB algorithm on trial number 17 is:  89648.0
The cumulative reward of UCB algorithm on trial number 18 is:  89610.0
The cumulative reward of UCB algorithm on trial number 19 is:  89511.0
The cumulative reward of UCB algorithm on trial number 20 is:  89856.0
The cumulative reward of UCB algorithm on trial number 21 is:  89512.0
The cumulative reward of UCB algorithm on trial number 22 is:  89572.0
The cumulative reward of UCB algorithm on trial number 23 is:  89485.0
The cumulative reward of UCB algorithm on trial number 24 is:  89618.0
The cumulative reward of UCB algorithm on trial number 25 is:  89576.0
The cumulative reward of UCB algorithm on trial number 26 is:  89630.0
The cumulative reward of UCB algorithm on trial number 27 is:  89468.0
The cumulative reward of UCB algorithm on trial number 28 is:  89481.0
The cumulative reward of UCB algorithm on trial number 29 is:  89583.0
The cumulative reward of UCB algorithm on trial number 30 is:  89524.0

# get average reward per time step of the 30 trials 
ucb_average_30 = np.mean(ucb_list, axis=0)

EXP3 algorithm¶

We select the best parameter of section (a) which is eta=0.00001

exp3_list = []

for i in range (m):
    
    exp3_list.append(str(i))

for i in range(m):
    
    np.random.seed(i+1)
    for k in range(K_arms):
    ### the k-th row and t-th column of the matrix reward realization represents
    ### the reward of the k-th arm at time t
        reward_realization[k,:] = np.random.binomial(1, mean_reward[k], [T])
    

    def EXP3(reward_realization, K, T, eta=0.00001):
            # Initialize the past reward monitor
            S_vec = np.zeros([K])

            # record the collected reward
            rewards = np.zeros([T])

            for t in range(T):

                # compute the sampling probability
                exp_S_vec = np.exp(eta*S_vec)
                prob_vec = exp_S_vec/np.sum(exp_S_vec)

                # sample an arm
                action = np.random.choice(K,1,p=prob_vec)[0] 

                # Update the estimate and count
                rewards[t] = reward_realization[action,t]
                S_vec[action] += 1-(1-reward_realization[action,t])/prob_vec[action]

            return rewards

    exp3_list[i] = EXP3(reward_realization, K_arms, T)

    print("The cumulative reward of EXP3 algorithm on trial number " + str(i+1) + " is: ", np.sum(exp3_list[i]))

The cumulative reward of EXP3 algorithm on trial number 1 is:  58898.0
The cumulative reward of EXP3 algorithm on trial number 2 is:  59020.0
The cumulative reward of EXP3 algorithm on trial number 3 is:  58996.0
The cumulative reward of EXP3 algorithm on trial number 4 is:  58836.0
The cumulative reward of EXP3 algorithm on trial number 5 is:  58587.0
The cumulative reward of EXP3 algorithm on trial number 6 is:  58879.0
The cumulative reward of EXP3 algorithm on trial number 7 is:  58758.0
The cumulative reward of EXP3 algorithm on trial number 8 is:  58858.0
The cumulative reward of EXP3 algorithm on trial number 9 is:  58883.0
The cumulative reward of EXP3 algorithm on trial number 10 is:  58718.0
The cumulative reward of EXP3 algorithm on trial number 11 is:  58739.0
The cumulative reward of EXP3 algorithm on trial number 12 is:  58920.0
The cumulative reward of EXP3 algorithm on trial number 13 is:  58862.0
The cumulative reward of EXP3 algorithm on trial number 14 is:  58594.0
The cumulative reward of EXP3 algorithm on trial number 15 is:  58912.0
The cumulative reward of EXP3 algorithm on trial number 16 is:  58644.0
The cumulative reward of EXP3 algorithm on trial number 17 is:  58881.0
The cumulative reward of EXP3 algorithm on trial number 18 is:  58911.0
The cumulative reward of EXP3 algorithm on trial number 19 is:  58844.0
The cumulative reward of EXP3 algorithm on trial number 20 is:  59146.0
The cumulative reward of EXP3 algorithm on trial number 21 is:  58845.0
The cumulative reward of EXP3 algorithm on trial number 22 is:  58916.0
The cumulative reward of EXP3 algorithm on trial number 23 is:  58915.0
The cumulative reward of EXP3 algorithm on trial number 24 is:  59019.0
The cumulative reward of EXP3 algorithm on trial number 25 is:  58952.0
The cumulative reward of EXP3 algorithm on trial number 26 is:  58844.0
The cumulative reward of EXP3 algorithm on trial number 27 is:  58952.0
The cumulative reward of EXP3 algorithm on trial number 28 is:  58935.0
The cumulative reward of EXP3 algorithm on trial number 29 is:  58984.0
The cumulative reward of EXP3 algorithm on trial number 30 is:  58661.0

# get average reward per time step of the 30 trials 
exp3_average_30 = np.mean(exp3_list, axis=0)

Plot Average Cumulative Reward for 30 trials¶

plt.figure(figsize=(12,8))
plt.plot(np.cumsum(greedy_average_30), label="Greedy ($\epsilon=0.03$)")
plt.plot(np.cumsum(ucb_average_30), label="UCB")
plt.plot(np.cumsum(exp3_average_30), label="EXP3 ($\eta=0.00001$)")
plt.legend(bbox_to_anchor=(1.3, 0.5))
plt.xlabel("Iterations")
plt.ylabel("Cumulative Reward")

plt.show()

From the plot, we see similar relative positions from part I, with UCB as the best algorithm and Epsilon-greedy algorithm following close to it. We note that the distance between the cumulative reward of the Epsilon-Greedy and the UCB algorithms is closer that in part I. By doing 30 different random trial tests and getting the average reward per time step, we try to get the real performance of each algorithm and eliminate any of bias that could happen by only seeing only one trial.

We can algo generate other types of visualizations:

average_reward1 = []
average_reward2 = []
average_reward3 = []

for i in range(100000):
    new_average = (greedy_average_30)[:1+i].sum()/(i+1)
    average_reward1.append(new_average)
    new_average2 = (ucb_average_30)[:1+i].sum()/(i+1)
    average_reward2.append(new_average2)
    new_average3 = (exp3_average_30)[:1+i].sum()/(i+1)
    average_reward3.append(new_average3)

plt.figure(figsize=(12,8))
plt.plot(average_reward1, label="Greedy ($\epsilon=0.03$)")
plt.plot(average_reward2, label="UCB")
plt.plot(average_reward3, label="EXP3 ($\eta=0.00001$)")
plt.legend(bbox_to_anchor=(1.3, 0.5))
plt.xlabel("Iterations")
plt.ylabel("Average Cumulative Reward")

plt.show()

What is interesting to note from this Average Cumulative Reward plot is being able to see the learning behaviour of the different algorithms, in particular the comparison between Epsilon-greedy and UCB algorithms.

We see UCB algorithm to achieve a higher average cumulative reward, but if we see closer, compared to Epsilon-greedy algorithm, UCB is a slower learner. At the beginning, all arms have a high confidence interval, so they will have a higher tendency toward exploration over explotation. As Epsilon-greedy spends more time exploiting, its raise is faster than the rest until its average peak value. On the other hand, UCB algorithm, with a higher tendency to exploration, its performance increases slower than Epsilon-greedy, but it ultimately outperforms it. We can see from the plot below that this happens around iteration number 6000.

plt.figure(figsize=(12,8))
plt.plot(average_reward1, label="Greedy ($\epsilon=0.03$)")
plt.plot(average_reward2, label="UCB")
plt.plot(average_reward3, label="EXP3 ($\eta=0.00001$)")
plt.axis((0,20000, 0,1))
plt.legend(bbox_to_anchor=(1.3, 0.5))
plt.xlabel("Iterations")
plt.ylabel("Average Cumulative Reward")

plt.show()

Part III¶

For this part we will redo part II but with larger variances (Sigma = 4, and Sigma = 25). At the end of the section we will explain the effect of larger variance on the algorithm performance

Sigma = 4¶

# setting parameters
K_arms = 5 # number of arms
T = 100000 # length of horizon
np.random.seed(100)
mean_reward = np.array([0.1, 0.3, 0.5, 0.88, 0.9])
reward_realization = np.zeros([K_arms, T])
for k in range(K_arms):
    ### the k-th row and t-th column of the matrix reward realization represents
    ### the reward of the k-th arm at time t
    reward_realization[k,:] = np.random.binomial(4, mean_reward[k], [T])

Epsilon-greedy algorithm¶

np.linspace(0.01, 0.11, 11)

array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11])

### check for best hyperparameter

reward1 = []

for i in np.linspace(0.01, 0.11, 11):
    
    def eps_greedy(reward_realization, K, T, eps=i):
        # record the mean estimators
        reward_est = np.zeros([K])
        # record the counts
        counts = np.zeros([K])

        # record the collected reward
        rewards = np.zeros([T])

        # Phase I: play each arm once
        for t in range(K):
            # play t-th arm
            rewards[t] = reward_realization[t,t]
            reward_est[t] = reward_realization[t,t]
            counts[t] += 1     

        # Phase II
        for t in range(K,T):
            if np.random.random()>eps:
                # choose the arm with largest mean estimate
                action = np.argmax(reward_est)
            else:
                # choose a random arm
                action = np.random.choice(K,1)[0]

            # Update the estimate and count
            rewards[t] = reward_realization[action,t]
            reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
            counts[action]+=1

        
        return rewards
    reward = eps_greedy(reward_realization, K_arms, T)
    reward1.append(np.sum(reward))
#reward1 = eps_greedy(reward_realization, K_arms, T)
print("The cumulative reward of epsilon greedy algorithm for np.linspace(0.01, 0.11, 11) is", reward1)

The cumulative reward of epsilon greedy algorithm for np.linspace(0.01, 0.11, 11) is [358750.0, 357258.0, 355779.0, 354371.0, 352661.0, 351488.0, 350066.0, 348644.0, 347440.0, 345793.0, 344430.0]

Define best hyper-parameter as eps = 0.01

# setting parameters
K_arms = 5 # number of arms
T = 100000 # length of horizon
m = 30 # number or random trials
#np.random.seed(100)
mean_reward = np.array([0.1, 0.3, 0.5, 0.88, 0.9])
reward_realization = np.zeros([K_arms, T])

greedy_list = []

for i in range (m):
    
    greedy_list.append(str(i))

for i in range(m):
    
    np.random.seed(i+1)
    for k in range(K_arms):
    ### the k-th row and t-th column of the matrix reward realization represents
    ### the reward of the k-th arm at time t
        reward_realization[k,:] = np.random.binomial(4, mean_reward[k], [T])
    
    
    def eps_greedy(reward_realization, K, T, eps=0.01):
        # record the mean estimators
        reward_est = np.zeros([K])
        # record the counts
        counts = np.zeros([K])

        # record the collected reward
        rewards = np.zeros([T])

        # Phase I: play each arm once
        for t in range(K):
            # play t-th arm
            rewards[t] = reward_realization[t,t]
            reward_est[t] = reward_realization[t,t]
            counts[t] += 1     

        # Phase II
        for t in range(K,T):
            if np.random.random()>eps:
                # choose the arm with largest mean estimate
                action = np.argmax(reward_est)
            else:
                # choose a random arm
                action = np.random.choice(K,1)[0]

            # Update the estimate and count
            rewards[t] = reward_realization[action,t]
            reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
            counts[action]+=1

        return rewards



    greedy_list[i] = eps_greedy(reward_realization, K_arms, T)
    print("The cumulative reward of epsilon greedy algorithm on trial number " + str(i+1) + " is: ", np.sum(greedy_list[i]))

The cumulative reward of epsilon greedy algorithm on trial number 1 is:  358691.0
The cumulative reward of epsilon greedy algorithm on trial number 2 is:  358630.0
The cumulative reward of epsilon greedy algorithm on trial number 3 is:  357616.0
The cumulative reward of epsilon greedy algorithm on trial number 4 is:  358794.0
The cumulative reward of epsilon greedy algorithm on trial number 5 is:  358518.0
The cumulative reward of epsilon greedy algorithm on trial number 6 is:  358515.0
The cumulative reward of epsilon greedy algorithm on trial number 7 is:  358370.0
The cumulative reward of epsilon greedy algorithm on trial number 8 is:  358571.0
The cumulative reward of epsilon greedy algorithm on trial number 9 is:  358704.0
The cumulative reward of epsilon greedy algorithm on trial number 10 is:  358258.0
The cumulative reward of epsilon greedy algorithm on trial number 11 is:  358422.0
The cumulative reward of epsilon greedy algorithm on trial number 12 is:  358390.0
The cumulative reward of epsilon greedy algorithm on trial number 13 is:  350852.0
The cumulative reward of epsilon greedy algorithm on trial number 14 is:  358366.0
The cumulative reward of epsilon greedy algorithm on trial number 15 is:  358333.0
The cumulative reward of epsilon greedy algorithm on trial number 16 is:  358292.0
The cumulative reward of epsilon greedy algorithm on trial number 17 is:  358723.0
The cumulative reward of epsilon greedy algorithm on trial number 18 is:  358446.0
The cumulative reward of epsilon greedy algorithm on trial number 19 is:  358642.0
The cumulative reward of epsilon greedy algorithm on trial number 20 is:  358810.0
The cumulative reward of epsilon greedy algorithm on trial number 21 is:  358736.0
The cumulative reward of epsilon greedy algorithm on trial number 22 is:  358203.0
The cumulative reward of epsilon greedy algorithm on trial number 23 is:  357985.0
The cumulative reward of epsilon greedy algorithm on trial number 24 is:  358404.0
The cumulative reward of epsilon greedy algorithm on trial number 25 is:  358157.0
The cumulative reward of epsilon greedy algorithm on trial number 26 is:  358848.0
The cumulative reward of epsilon greedy algorithm on trial number 27 is:  358377.0
The cumulative reward of epsilon greedy algorithm on trial number 28 is:  357947.0
The cumulative reward of epsilon greedy algorithm on trial number 29 is:  357757.0
The cumulative reward of epsilon greedy algorithm on trial number 30 is:  358213.0

# get average reward per time step of the 30 trials 

greedy_average_30 = np.mean(greedy_list, axis=0)

UCB algorithm¶

ucb_list = []

for i in range (m):
    
    ucb_list.append(str(i))

for i in range(m):
    
    np.random.seed(i+1)
    for k in range(K_arms):
    ### the k-th row and t-th column of the matrix reward realization represents
    ### the reward of the k-th arm at time t
        reward_realization[k,:] = np.random.binomial(4, mean_reward[k], [T])
    
    
    def UCB(reward_realization, K, T):
        # record the mean estimators
        reward_est = np.zeros([K])
        # record the counts
        counts = np.zeros([K])

        # record the collected reward
        rewards = np.zeros([T])

        # Phase I: play each arm once
        for t in range(K):
            # play t-th arm
            rewards[t] = reward_realization[t,t]
            reward_est[t] = reward_realization[t,t]
            counts[t] += 1     

        # Phase II
        for t in range(K,T):
            # construct upper confidence bound
            UCB_tmp = np.zeros([K])
            for k in range(K):
                UCB_tmp[k] = reward_est[k] + np.sqrt(2*np.log(t-1)/counts[k])
            action = np.argmax(UCB_tmp)

            # Update the estimate and count
            rewards[t] = reward_realization[action,t]
            reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
            counts[action]+=1

        return rewards   

    ucb_list[i] = UCB(reward_realization, K_arms, T)


    print("The cumulative reward of UCB algorithm on trial number " + str(i+1) + " is: ", np.sum(ucb_list[i]))

The cumulative reward of UCB algorithm on trial number 1 is:  359998.0
The cumulative reward of UCB algorithm on trial number 2 is:  359956.0
The cumulative reward of UCB algorithm on trial number 3 is:  359665.0
The cumulative reward of UCB algorithm on trial number 4 is:  359964.0
The cumulative reward of UCB algorithm on trial number 5 is:  359764.0
The cumulative reward of UCB algorithm on trial number 6 is:  359727.0
The cumulative reward of UCB algorithm on trial number 7 is:  359598.0
The cumulative reward of UCB algorithm on trial number 8 is:  359707.0
The cumulative reward of UCB algorithm on trial number 9 is:  359994.0
The cumulative reward of UCB algorithm on trial number 10 is:  359518.0
The cumulative reward of UCB algorithm on trial number 11 is:  359548.0
The cumulative reward of UCB algorithm on trial number 12 is:  359631.0
The cumulative reward of UCB algorithm on trial number 13 is:  359600.0
The cumulative reward of UCB algorithm on trial number 14 is:  359624.0
The cumulative reward of UCB algorithm on trial number 15 is:  359709.0
The cumulative reward of UCB algorithm on trial number 16 is:  359591.0
The cumulative reward of UCB algorithm on trial number 17 is:  359945.0
The cumulative reward of UCB algorithm on trial number 18 is:  360032.0
The cumulative reward of UCB algorithm on trial number 19 is:  359793.0
The cumulative reward of UCB algorithm on trial number 20 is:  360039.0
The cumulative reward of UCB algorithm on trial number 21 is:  359885.0
The cumulative reward of UCB algorithm on trial number 22 is:  359489.0
The cumulative reward of UCB algorithm on trial number 23 is:  359555.0
The cumulative reward of UCB algorithm on trial number 24 is:  359632.0
The cumulative reward of UCB algorithm on trial number 25 is:  359897.0
The cumulative reward of UCB algorithm on trial number 26 is:  360010.0
The cumulative reward of UCB algorithm on trial number 27 is:  359607.0
The cumulative reward of UCB algorithm on trial number 28 is:  359546.0
The cumulative reward of UCB algorithm on trial number 29 is:  359439.0
The cumulative reward of UCB algorithm on trial number 30 is:  359966.0

# get average reward per time step of the 30 trials 

ucb_average_30 = np.mean(ucb_list, axis=0)

EXP3 algorithm¶

np.logspace(-6,-3, 10)

array([0.000001  , 0.00000215, 0.00000464, 0.00001   , 0.00002154,
       0.00004642, 0.0001    , 0.00021544, 0.00046416, 0.001     ])

### get best hyper-parameter

reward3 = []

for i in np.logspace(-6,-3, 10):
    def EXP3(reward_realization, K, T, eta=i):
        # Initialize the past reward monitor
        S_vec = np.zeros([K])

        # record the collected reward
        rewards = np.zeros([T])

        for t in range(T):

            # compute the sampling probability
            exp_S_vec = np.exp(eta*S_vec)
            prob_vec = exp_S_vec/np.sum(exp_S_vec)

            # sample an arm
            action = np.random.choice(K,1,p=prob_vec)[0] 

            # Update the estimate and count
            rewards[t] = reward_realization[action,t]
            S_vec[action] += 1-(1-reward_realization[action,t])/prob_vec[action]

        return rewards
    reward = eps_greedy(reward_realization, K_arms, T)
    reward3.append(np.sum(reward))
#reward1 = eps_greedy(reward_realization, K_arms, T)
print("The cumulative reward of EXP3 algorithm for np.logspace(-6,-3, 10) is", reward3)

The cumulative reward of EXP3 algorithm for np.logspace(-6,-3, 10) is [358630.0, 358758.0, 358798.0, 358741.0, 358666.0, 358644.0, 358789.0, 358744.0, 358763.0, 358820.0]

Best hyper-parameter is eta = 0.001

exp3_list = []

for i in range (m):
    
    exp3_list.append(str(i))

for i in range(m):
    
    np.random.seed(i+1)
    for k in range(K_arms):
    ### the k-th row and t-th column of the matrix reward realization represents
    ### the reward of the k-th arm at time t
        reward_realization[k,:] = np.random.binomial(4, mean_reward[k], [T])
    

    def EXP3(reward_realization, K, T, eta=0.001):
            # Initialize the past reward monitor
            S_vec = np.zeros([K])

            # record the collected reward
            rewards = np.zeros([T])

            for t in range(T):

                # compute the sampling probability
                exp_S_vec = np.exp(eta*S_vec)
                prob_vec = exp_S_vec/np.sum(exp_S_vec)

                # sample an arm
                action = np.random.choice(K,1,p=prob_vec)[0] 

                # Update the estimate and count
                rewards[t] = reward_realization[action,t]
                S_vec[action] += 1-(1-reward_realization[action,t])/prob_vec[action]

            return rewards

    exp3_list[i] = EXP3(reward_realization, K_arms, T)

    print("The cumulative reward of EXP3 algorithm on trial number " + str(i+1) + " is: ", np.sum(exp3_list[i]))

The cumulative reward of EXP3 algorithm on trial number 1 is:  356385.0
The cumulative reward of EXP3 algorithm on trial number 2 is:  359348.0
The cumulative reward of EXP3 algorithm on trial number 3 is:  359131.0
The cumulative reward of EXP3 algorithm on trial number 4 is:  359341.0
The cumulative reward of EXP3 algorithm on trial number 5 is:  350985.0
The cumulative reward of EXP3 algorithm on trial number 6 is:  358743.0
The cumulative reward of EXP3 algorithm on trial number 7 is:  358810.0
The cumulative reward of EXP3 algorithm on trial number 8 is:  205788.0
The cumulative reward of EXP3 algorithm on trial number 9 is:  359289.0
The cumulative reward of EXP3 algorithm on trial number 10 is:  358644.0
The cumulative reward of EXP3 algorithm on trial number 11 is:  358811.0
The cumulative reward of EXP3 algorithm on trial number 12 is:  358828.0
The cumulative reward of EXP3 algorithm on trial number 13 is:  358746.0
The cumulative reward of EXP3 algorithm on trial number 14 is:  358585.0
The cumulative reward of EXP3 algorithm on trial number 15 is:  359031.0
The cumulative reward of EXP3 algorithm on trial number 16 is:  351398.0
The cumulative reward of EXP3 algorithm on trial number 17 is:  357113.0
The cumulative reward of EXP3 algorithm on trial number 18 is:  359239.0
The cumulative reward of EXP3 algorithm on trial number 19 is:  350916.0
The cumulative reward of EXP3 algorithm on trial number 20 is:  346761.0
The cumulative reward of EXP3 algorithm on trial number 21 is:  359124.0
The cumulative reward of EXP3 algorithm on trial number 22 is:  351654.0
The cumulative reward of EXP3 algorithm on trial number 23 is:  358909.0
The cumulative reward of EXP3 algorithm on trial number 24 is:  205887.0
The cumulative reward of EXP3 algorithm on trial number 25 is:  351693.0
The cumulative reward of EXP3 algorithm on trial number 26 is:  359388.0
The cumulative reward of EXP3 algorithm on trial number 27 is:  358805.0
The cumulative reward of EXP3 algorithm on trial number 28 is:  358781.0
The cumulative reward of EXP3 algorithm on trial number 29 is:  351265.0
The cumulative reward of EXP3 algorithm on trial number 30 is:  359234.0

# get average reward per time step of the 30 trials 

exp3_average_30 = np.mean(exp3_list, axis=0)

Plot Average Cumulative Reward¶

plt.figure(figsize=(12,8))
plt.plot(greedy_average_30, label="Greedy ($\epsilon=0.01$)")
plt.plot(ucb_average_30, label="UCB")
plt.plot(exp3_average_30, label="EXP3 ($\eta=0.001$)")
plt.legend(bbox_to_anchor=(1.3, 0.5))
plt.xlabel("Iterations")
plt.ylabel("Average Cumulative Reward")

plt.show()

Sigma = 25¶

Epsilon-greedy algorithm¶

np.linspace(0.01, 0.11, 11)

array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11])

### check for best hyperparameter

reward1 = []

for i in np.linspace(0.01, 0.11, 11):
    
    def eps_greedy(reward_realization, K, T, eps=i):
        # record the mean estimators
        reward_est = np.zeros([K])
        # record the counts
        counts = np.zeros([K])

        # record the collected reward
        rewards = np.zeros([T])

        # Phase I: play each arm once
        for t in range(K):
            # play t-th arm
            rewards[t] = reward_realization[t,t]
            reward_est[t] = reward_realization[t,t]
            counts[t] += 1     

        # Phase II
        for t in range(K,T):
            if np.random.random()>eps:
                # choose the arm with largest mean estimate
                action = np.argmax(reward_est)
            else:
                # choose a random arm
                action = np.random.choice(K,1)[0]

            # Update the estimate and count
            rewards[t] = reward_realization[action,t]
            reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
            counts[action]+=1

        
        return rewards
    reward = eps_greedy(reward_realization, K_arms, T)
    reward1.append(np.sum(reward))
#reward1 = eps_greedy(reward_realization, K_arms, T)
print("The cumulative reward of epsilon greedy algorithm for np.linspace(0.01, 0.11, 11) is", reward1)

The cumulative reward of epsilon greedy algorithm for np.linspace(0.01, 0.11, 11) is [2238191.0, 2231936.0, 2222132.0, 2212179.0, 2203659.0, 2196094.0, 2187430.0, 2178172.0, 2170619.0, 2160542.0, 2152095.0]

Best eps = 0.01

greedy_list = []

for i in range (m):
    
    greedy_list.append(str(i))

for i in range(m):
    
    np.random.seed(i+1)
    for k in range(K_arms):
    ### the k-th row and t-th column of the matrix reward realization represents
    ### the reward of the k-th arm at time t
        reward_realization[k,:] = np.random.binomial(25, mean_reward[k], [T])
    
    
    def eps_greedy(reward_realization, K, T, eps=0.01):
        # record the mean estimators
        reward_est = np.zeros([K])
        # record the counts
        counts = np.zeros([K])

        # record the collected reward
        rewards = np.zeros([T])

        # Phase I: play each arm once
        for t in range(K):
            # play t-th arm
            rewards[t] = reward_realization[t,t]
            reward_est[t] = reward_realization[t,t]
            counts[t] += 1     

        # Phase II
        for t in range(K,T):
            if np.random.random()>eps:
                # choose the arm with largest mean estimate
                action = np.argmax(reward_est)
            else:
                # choose a random arm
                action = np.random.choice(K,1)[0]

            # Update the estimate and count
            rewards[t] = reward_realization[action,t]
            reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
            counts[action]+=1

        return rewards



    greedy_list[i] = eps_greedy(reward_realization, K_arms, T)
    print("The cumulative reward of epsilon greedy algorithm on trial number " + str(i+1) + " is: ", np.sum(greedy_list[i]))

The cumulative reward of epsilon greedy algorithm on trial number 1 is:  2241622.0
The cumulative reward of epsilon greedy algorithm on trial number 2 is:  2239856.0
The cumulative reward of epsilon greedy algorithm on trial number 3 is:  2238137.0
The cumulative reward of epsilon greedy algorithm on trial number 4 is:  2241409.0
The cumulative reward of epsilon greedy algorithm on trial number 5 is:  2240502.0
The cumulative reward of epsilon greedy algorithm on trial number 6 is:  2241553.0
The cumulative reward of epsilon greedy algorithm on trial number 7 is:  2240556.0
The cumulative reward of epsilon greedy algorithm on trial number 8 is:  2241154.0
The cumulative reward of epsilon greedy algorithm on trial number 9 is:  2240798.0
The cumulative reward of epsilon greedy algorithm on trial number 10 is:  2239975.0
The cumulative reward of epsilon greedy algorithm on trial number 11 is:  2241269.0
The cumulative reward of epsilon greedy algorithm on trial number 12 is:  2239950.0
The cumulative reward of epsilon greedy algorithm on trial number 13 is:  2240378.0
The cumulative reward of epsilon greedy algorithm on trial number 14 is:  2240424.0
The cumulative reward of epsilon greedy algorithm on trial number 15 is:  2239523.0
The cumulative reward of epsilon greedy algorithm on trial number 16 is:  2240908.0
The cumulative reward of epsilon greedy algorithm on trial number 17 is:  2241648.0
The cumulative reward of epsilon greedy algorithm on trial number 18 is:  2238787.0
The cumulative reward of epsilon greedy algorithm on trial number 19 is:  2240942.0
The cumulative reward of epsilon greedy algorithm on trial number 20 is:  2241781.0
The cumulative reward of epsilon greedy algorithm on trial number 21 is:  2241318.0
The cumulative reward of epsilon greedy algorithm on trial number 22 is:  2240255.0
The cumulative reward of epsilon greedy algorithm on trial number 23 is:  2240657.0
The cumulative reward of epsilon greedy algorithm on trial number 24 is:  2240821.0
The cumulative reward of epsilon greedy algorithm on trial number 25 is:  2241666.0
The cumulative reward of epsilon greedy algorithm on trial number 26 is:  2241877.0
The cumulative reward of epsilon greedy algorithm on trial number 27 is:  2240420.0
The cumulative reward of epsilon greedy algorithm on trial number 28 is:  2238754.0
The cumulative reward of epsilon greedy algorithm on trial number 29 is:  2240458.0
The cumulative reward of epsilon greedy algorithm on trial number 30 is:  2240820.0

# get average reward per time step of the 30 trials 

greedy_average_30 = np.mean(greedy_list, axis=0)

UCB algorithm¶

ucb_list = []

for i in range (m):
    
    ucb_list.append(str(i))

for i in range(m):
    
    np.random.seed(i+1)
    for k in range(K_arms):
    ### the k-th row and t-th column of the matrix reward realization represents
    ### the reward of the k-th arm at time t
        reward_realization[k,:] = np.random.binomial(25, mean_reward[k], [T])
    
    
    def UCB(reward_realization, K, T):
        # record the mean estimators
        reward_est = np.zeros([K])
        # record the counts
        counts = np.zeros([K])

        # record the collected reward
        rewards = np.zeros([T])

        # Phase I: play each arm once
        for t in range(K):
            # play t-th arm
            rewards[t] = reward_realization[t,t]
            reward_est[t] = reward_realization[t,t]
            counts[t] += 1     

        # Phase II
        for t in range(K,T):
            # construct upper confidence bound
            UCB_tmp = np.zeros([K])
            for k in range(K):
                UCB_tmp[k] = reward_est[k] + np.sqrt(2*np.log(t-1)/counts[k])
            action = np.argmax(UCB_tmp)

            # Update the estimate and count
            rewards[t] = reward_realization[action,t]
            reward_est[action] = (reward_est[action]*counts[action] + reward_realization[action,t])/(counts[action] + 1)
            counts[action]+=1

        return rewards   

    ucb_list[i] = UCB(reward_realization, K_arms, T)


    print("The cumulative reward of UCB algorithm on trial number " + str(i+1) + " is: ", np.sum(ucb_list[i]))

The cumulative reward of UCB algorithm on trial number 1 is:  2250416.0
The cumulative reward of UCB algorithm on trial number 2 is:  2250823.0
The cumulative reward of UCB algorithm on trial number 3 is:  2250213.0
The cumulative reward of UCB algorithm on trial number 4 is:  2250182.0
The cumulative reward of UCB algorithm on trial number 5 is:  2249828.0
The cumulative reward of UCB algorithm on trial number 6 is:  2249896.0
The cumulative reward of UCB algorithm on trial number 7 is:  2249663.0
The cumulative reward of UCB algorithm on trial number 8 is:  2249828.0
The cumulative reward of UCB algorithm on trial number 9 is:  2250138.0
The cumulative reward of UCB algorithm on trial number 10 is:  2249026.0
The cumulative reward of UCB algorithm on trial number 11 is:  2249669.0
The cumulative reward of UCB algorithm on trial number 12 is:  2249528.0
The cumulative reward of UCB algorithm on trial number 13 is:  2249806.0
The cumulative reward of UCB algorithm on trial number 14 is:  2249633.0
The cumulative reward of UCB algorithm on trial number 15 is:  2249884.0
The cumulative reward of UCB algorithm on trial number 16 is:  2249775.0
The cumulative reward of UCB algorithm on trial number 17 is:  2250205.0
The cumulative reward of UCB algorithm on trial number 18 is:  2249388.0
The cumulative reward of UCB algorithm on trial number 19 is:  2249759.0
The cumulative reward of UCB algorithm on trial number 20 is:  2250458.0
The cumulative reward of UCB algorithm on trial number 21 is:  2250217.0
The cumulative reward of UCB algorithm on trial number 22 is:  2249745.0
The cumulative reward of UCB algorithm on trial number 23 is:  2250065.0
The cumulative reward of UCB algorithm on trial number 24 is:  2249969.0
The cumulative reward of UCB algorithm on trial number 25 is:  2250563.0
The cumulative reward of UCB algorithm on trial number 26 is:  2250514.0
The cumulative reward of UCB algorithm on trial number 27 is:  2249453.0
The cumulative reward of UCB algorithm on trial number 28 is:  2249505.0
The cumulative reward of UCB algorithm on trial number 29 is:  2249828.0
The cumulative reward of UCB algorithm on trial number 30 is:  2250184.0

# get average reward per time step of the 30 trials 

ucb_average_30 = np.mean(ucb_list, axis=0)

EXP3 algorithm¶

np.logspace(-6,-3, 10)

array([0.000001  , 0.00000215, 0.00000464, 0.00001   , 0.00002154,
       0.00004642, 0.0001    , 0.00021544, 0.00046416, 0.001     ])

### get best hyper-parameter

reward3 = []

for i in np.logspace(-6,-3, 10):
    def EXP3(reward_realization, K, T, eta=i):
        # Initialize the past reward monitor
        S_vec = np.zeros([K])

        # record the collected reward
        rewards = np.zeros([T])

        for t in range(T):

            # compute the sampling probability
            exp_S_vec = np.exp(eta*S_vec)
            prob_vec = exp_S_vec/np.sum(exp_S_vec)

            # sample an arm
            action = np.random.choice(K,1,p=prob_vec)[0] 

            # Update the estimate and count
            rewards[t] = reward_realization[action,t]
            S_vec[action] += 1-(1-reward_realization[action,t])/prob_vec[action]

        return rewards
    reward = eps_greedy(reward_realization, K_arms, T)
    reward3.append(np.sum(reward))
#reward1 = eps_greedy(reward_realization, K_arms, T)
print("The cumulative reward of EXP3 algorithm for np.logspace(-6,-3, 10) is", reward3)

The cumulative reward of EXP3 algorithm for np.logspace(-6,-3, 10) is [2238804.0, 2240987.0, 2241011.0, 2241300.0, 2241063.0, 2240888.0, 2241736.0, 2239797.0, 2239786.0, 2241827.0]

Best eta = 0.001

exp3_list = []

for i in range (m):
    
    exp3_list.append(str(i))

for i in range(m):
    
    np.random.seed(i+1)
    for k in range(K_arms):
    ### the k-th row and t-th column of the matrix reward realization represents
    ### the reward of the k-th arm at time t
        reward_realization[k,:] = np.random.binomial(25, mean_reward[k], [T])
    

    def EXP3(reward_realization, K, T, eta=0.001):
            # Initialize the past reward monitor
            S_vec = np.zeros([K])

            # record the collected reward
            rewards = np.zeros([T])

            for t in range(T):

                # compute the sampling probability
                exp_S_vec = np.exp(eta*S_vec)
                prob_vec = exp_S_vec/np.sum(exp_S_vec)

                # sample an arm
                action = np.random.choice(K,1,p=prob_vec)[0] 

                # Update the estimate and count
                rewards[t] = reward_realization[action,t]
                S_vec[action] += 1-(1-reward_realization[action,t])/prob_vec[action]

            return rewards

    exp3_list[i] = EXP3(reward_realization, K_arms, T)

    print("The cumulative reward of EXP3 algorithm on trial number " + str(i+1) + " is: ", np.sum(exp3_list[i]))

C:\Users\hernan\AppData\Local\Temp/ipykernel_18876/3517171080.py:20: RuntimeWarning: overflow encountered in exp
  exp_S_vec = np.exp(eta*S_vec)
C:\Users\hernan\AppData\Local\Temp/ipykernel_18876/3517171080.py:21: RuntimeWarning: invalid value encountered in true_divide
  prob_vec = exp_S_vec/np.sum(exp_S_vec)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_18876/3517171080.py in <module>
     30             return rewards
     31 
---> 32     exp3_list[i] = EXP3(reward_realization, K_arms, T)
     33 
     34     print("The cumulative reward of EXP3 algorithm on trial number " + str(i+1) + " is: ", np.sum(exp3_list[i]))

~\AppData\Local\Temp/ipykernel_18876/3517171080.py in EXP3(reward_realization, K, T, eta)
     22 
     23                 # sample an arm
---> 24                 action = np.random.choice(K,1,p=prob_vec)[0]
     25 
     26                 # Update the estimate and count

mtrand.pyx in numpy.random.mtrand.RandomState.choice()

ValueError: probabilities contain NaN

Plot Average Cumulative Reward¶

plt.figure(figsize=(12,8))
plt.plot(greedy_average_30, label="Greedy ($\epsilon=0.01$)")
plt.plot(ucb_average_30, label="UCB")
#plt.plot(average_reward3, label="EXP3 ($\eta=0.001$)")
plt.legend(bbox_to_anchor=(1.3, 0.5))
plt.xlabel("Iterations")
plt.ylabel("Average Cumulative Reward")

plt.show()

Conclusions part (c)¶

After plotting the different scenarios with larger variances (sigma = 4 and sigma = 25) related to the reward of each arm and updating the hyper-parameters, we observe that as the variance get higher the average cumulative reward revolves around a higher value. On the other hand, we see higher confidence intervals and higher uncertainty every iteration.

Also, in the case of sigma = 25 we see that in the EXP3 algorithm, in some cases the probabilites of the model are out of bound, and the algorithm fails.