The following is a Python program that learns to play a simple game of chance in somewhat the same way that the AlphaZero program learned to play chess, Go, and other games.
The game is a variant of the card game “Blackjack”: two players alternately roll dice and keep track of their totals across turns. Each player is trying to reach a sum that lies in a specified target range, between a fixed low value and a fixed high value. If a player reaches a score in the target range, they immediately win. If they exceed the high value, they immediately lose. On each turn a player chooses how many dice to roll, between 1 and a fixed maximum. The game thus has four parameters:
 • NSides, the number of sides of the die. The die is numbered 1 to NSides and all outcomes are equally likely.
 • LTarget, the lowest winning value.
 • UTarget, the highest winning value.
 • NDice, the maximum number of dice a player may roll.
The output is two LTarget × LTarget arrays: the correct number of dice to roll in each state ⟨X, Y⟩, and the probability of winning if you roll the correct number of dice. The probability of winning if you roll J dice in state ⟨X, Y⟩ is estimated as WinCount[X,Y,J]/(WinCount[X,Y,J] + LoseCount[X,Y,J]). (Output −1 if the denominator is 0.)

To run this program:
python main.py NDice NSides LTarget UTarget M NGames
For example, to play the game with NDice=2, NSides=2, LTarget=4, UTarget=5 and M=100 for 10000 games, type:
python main.py 2 2 4 5 100 10000
Thank you!

import random,sys
def playGame(NDice,Nsides,LTarget,UTarget,LoseCount,WinCount,M):
    """Play one self-play game and record its outcome in the count tables.

    Players A and B alternate turns, A moving first.  On each turn the mover
    picks a dice count with chooseDice, rolls with rollDice and adds the roll
    to their own total.  A total inside [LTarget, UTarget] wins the game for
    the mover; a total above UTarget loses it for the mover.

    Every turn is remembered as a triple
    (mover's score before the roll, opponent's score, dice rolled).  Once the
    game is decided, the winner's triples and the loser's triples are handed
    to incrementCount together with WinCount and LoseCount.

    NDice, M, LoseCount and WinCount are forwarded to chooseDice; Nsides is
    forwarded to rollDice.  Returns None.
    """
    # Both dictionaries are keyed by "is this player A?".
    totals = {True: 0, False: 0}
    history = {True: [], False: []}
    a_to_move = True

    while True:
        mine = totals[a_to_move]
        theirs = totals[not a_to_move]

        # Choose first, then roll, so random numbers are consumed in a
        # fixed order on every turn.
        dice = chooseDice((mine, theirs), LoseCount, WinCount, NDice, M)
        rolled = rollDice(dice, Nsides)

        history[a_to_move].append((mine, theirs, dice))
        new_total = mine + rolled
        totals[a_to_move] = new_total

        if LTarget <= new_total <= UTarget:
            # The mover landed in the target range and wins.
            a_wins = a_to_move
        elif new_total > UTarget:
            # The mover overshot and loses.
            a_wins = not a_to_move
        else:
            # Nothing decided yet: hand the turn to the other player.
            a_to_move = not a_to_move
            continue

        incrementCount(WinCount, LoseCount, history[a_wins], history[not a_wins])
        return
            
def rollDice(NDice,Nsides):
    """Return the total of NDice rolls of a die numbered 1..Nsides.

    Each roll is drawn with random.randint(1, Nsides); rolling no dice
    gives 0.
    """
    total = 0
    for _ in range(NDice):
        total += random.randint(1, Nsides)
    return total
    
#get the winning probability of the state/move
def get_fj(WinCount,LoseCount,x,y,j):
    """Return the observed win rate of rolling j dice in state <x, y>.

    The rate is WinCount[x][y][j] divided by the number of recorded wins
    plus losses for that entry.  When nothing has been recorded for the
    entry the result is 0.
    """
    wins = WinCount[x][y][j]
    games = wins + LoseCount[x][y][j]
    return wins / games if games > 0 else 0
def chooseDice(Score,LoseCount,WinCount,NDice,M):
    """Randomly choose how many dice (1..NDice) to roll in state Score.

    Score is the pair (x, y) used to index the count tables.  Let f_j be
    the win rate reported by get_fj for rolling j dice in that state, B the
    first j with the largest f_j, T the number of moves recorded for the
    state, and g the sum of f_j over every j other than B.  The dice count
    is then sampled with weights

        p_B = (T*f_B + M) / (T*f_B + NDice*M)
        p_j = (1 - p_B) * (T*f_j + M) / (g*T + (NDice - 1)*M)   for j != B

    so a larger M pulls the choice towards uniform, while a larger T pulls
    it towards the moves with the best record.

    When M is 0 the denominators above can be exactly zero.  Those cases
    use the limit of the formulas as M approaches 0 from above instead of
    raising ZeroDivisionError: a uniform choice when the best move has no
    recorded win, and weight 0 for the other moves when none of them has a
    recorded win.

    Returns the chosen number of dice as an int.
    """
    x, y = Score
    dice_options = list(range(1, NDice + 1))

    # T = number of recorded moves that were made from state <x, y>
    T = sum(LoseCount[x][y][j] + WinCount[x][y][j] for j in dice_options)
    fjs = [get_fj(WinCount, LoseCount, x, y, j) for j in dice_options]
    best_fj = max(fjs)
    best_index = fjs.index(best_fj)  # ties resolve to the fewest dice

    best_denominator = T * best_fj + M * NDice
    if best_denominator == 0:
        # M == 0 and nothing favours any move: every p_j tends to 1/NDice.
        return random.choice(dice_options)
    pb = (T * best_fj + M) / best_denominator

    # g = summed win rates of every move except the best one
    g = sum(fjs) - best_fj
    rest_denominator = g * T + (NDice - 1) * M

    weights = []
    for index, fj in enumerate(fjs):
        if index == best_index:
            weights.append(pb)
        elif rest_denominator == 0:
            # Only reachable with M == 0, where pb is 1 and nothing is left
            # to share among the other moves.
            weights.append(0)
        else:
            weights.append((1 - pb) * (T * fj + M) / rest_denominator)
    return random.choices(dice_options, weights)[0]
    
    
    #///debug till here
    
    
            
        
def incrementCount(WinCount,LoseCount,winner_Moves,loser_Moves):
    """Add one to the count-table entry of every move played in a game.

    Each move is an (x, y, j) triple.  Entries named by winner_Moves are
    incremented in WinCount, then entries named by loser_Moves are
    incremented in LoseCount.  Both tables are modified in place.
    """
    for table, moves in ((WinCount, winner_Moves), (LoseCount, loser_Moves)):
        for x, y, j in moves:
            table[x][y][j] += 1
    
def extractAnswer(WinCount,LoseCount):
    """Summarise the count tables as a best-move grid and a win-rate grid.

    For every state <x, y>, with both indices running over len(WinCount),
    the win rate from get_fj is computed for each dice count from 1 up to
    len(WinCount[0][0]) - 1.  The probability grid stores the largest of
    those rates.  The move grid stores the first dice count that achieves
    it, or 0 when the largest rate is not positive.

    Returns the tuple (moves_matrix, probability_matrix), each a list of
    rows.
    """
    size = len(WinCount)
    dice_counts = range(1, len(WinCount[0][0]))

    moves_matrix = []
    probability_matrix = []
    for x in range(size):
        move_row = []
        probability_row = []
        for y in range(size):
            rates = [get_fj(WinCount, LoseCount, x, y, j) for j in dice_counts]
            top_rate = max(rates)
            # A rate of 0 means no recorded win, so no move is recommended.
            move_row.append(rates.index(top_rate) + 1 if top_rate > 0 else 0)
            probability_row.append(top_rate)
        moves_matrix.append(move_row)
        probability_matrix.append(probability_row)
    return moves_matrix, probability_matrix
            
            
            
if __name__=="__main__":
    # Command line: python main.py NDice NSides LTarget UTarget M NGames
    if len(sys.argv) < 7:
        print("Input the following parameters, in this order:")
        print("""
        NDice, the maximum number of dice that can be rolled per turn;
        NSides, the number of sides of the dice;
        LTarget, the lowest winning score;
        UTarget, the highest winning score;
        M, the hyperparameter used when choosing how many dice to roll;
        NGAMES, the number of games to play;
        M may be a floating point number; the rest are non-negative integers
        """)
        # sys.exit rather than the interactive-only exit() helper, which is
        # not guaranteed to exist in every interpreter configuration.
        sys.exit(1)
    NDice = int(sys.argv[1])
    NSides = int(sys.argv[2])
    LTarget = int(sys.argv[3])
    UTarget = int(sys.argv[4])
    try:
        # Keep integer input as an int so it is echoed back unchanged.
        M = int(sys.argv[5])
    except ValueError:
        # The usage text allows a fractional M such as 0.5.
        M = float(sys.argv[5])
    NGames = int(sys.argv[6])
    print("Reinforcement learning experiment with M =",M,"NGames =",NGames)
    for run in range(3):
        # Fresh LTarget x LTarget x (NDice+1) count tables for every run,
        # indexed [x][y][j]; slot j=0 is never used because at least one
        # die is always rolled.
        LoseCount = [[[0 for j in range(NDice+1)] for x in range(LTarget)] for y in range(LTarget)]
        WinCount = [[[0 for j in range(NDice+1)] for x in range(LTarget)] for y in range(LTarget)]

        # Learn from NGames games of self-play.
        for i in range(NGames):
            playGame(NDice, NSides, LTarget, UTarget, LoseCount, WinCount, M)
        moves, probability = extractAnswer(WinCount, LoseCount)
        print('\nRun',run+1,end='\n\n')
        print('Play =')
        for m in moves:
            print('\t'.join('%5d' % v for v in m))
        print('\nProb =')
        for p in probability:
            print('\t'.join('%1.4f' % v for v in p))
    
Back to Top