The following is a Python program that learns to play a simple game of chance in somewhat the same way that the AlphaZero program learned to play chess, Go, and other games.
The game is a variant of the card game “Blackjack”: two players alternately roll dice and keep track of their totals across turns. Each is trying to reach a sum that lies in a specified target range, between a fixed low value and a fixed high value. If a player reaches a score in the target range, they immediately win. If they exceed the high value, they immediately lose. On each turn a player chooses how many dice to roll, between 1 and a fixed maximum. The game thus has four parameters:
 • NSides, the number of sides of the die. The die is numbered 1 to NSides and all outcomes are equally likely.
 • LTarget, the lowest winning value.
 • UTarget, the highest winning value.
 • NDice, the maximum number of dice a player may roll.
The output is two LTarget × LTarget arrays: the correct number of dice to roll in state ⟨X, Y⟩, and the probability of winning if you roll that number of dice. The probability of winning if you roll J dice in state ⟨X, Y⟩ is estimated as WinCount[X,Y,J] / (WinCount[X,Y,J] + LoseCount[X,Y,J]). (Output −1 if the denominator is 0.)

To run this program:
python <script>.py NDice NSides LTarget UTarget M NGames
For example, to play the game with NDice=2, NSides=2, LTarget=4, UTarget=5 and M=100 for 10000 games, type:
python <script>.py 2 2 4 5 100 10000
Thank you!

import random,sys
def playGame(NDice,Nsides,LTarget,UTarget,LoseCount,WinCount,M):
    # Plays one game between two players, A and B, who take turns until the
    # current player's score lands in [LTarget, UTarget] (a win) or goes above
    # UTarget (a loss).
    # NOTE(review): the body below is incomplete as shown. Several `if`/`elif`
    # headers are followed only by comments, and winnerDecided and
    # currentPlayerScore are read but never assigned in the visible lines, so
    # the function cannot parse or run until the missing statements are
    # restored.
    # NOTE(review): NDice, Nsides, LoseCount, WinCount and M are not referenced
    # in the visible lines; presumably they feed the dice choice, the roll and
    # the count update that the comments describe -- TODO confirm.
    while(not winnerDecided):
        #create a tuple
        # NOTE(review): body of this branch is missing.
        if Aturn:
        # the first is the current player's score
        # decide how many dice to roll
        # roll dice and add to sum and record current state into <X, Y, J>
        #if current player: A
        # NOTE(review): body of this branch is missing.
        if Aturn:
        #if current player: B
        # A score inside the inclusive target range ends the game with a win.
        if LTarget<=currentPlayerScore<=UTarget:
            #current player the winner
        # A score above the upper bound ends the game with a loss.
        elif currentPlayerScore>UTarget:
            #current player loses
            # The player who did NOT just move is the winner.
            Awin=not Aturn
        #after we have a winner
        if winnerDecided:
            #store the winner and loser's moves
            # NOTE(review): body of this branch is missing.
            if Awin:
        #if no winner has yet been decided
        #switch A and B as current player
        #then begin next turn from while loop
            Aturn=not Aturn
def rollDice(NDice,Nsides):
    """Return the total of ``NDice`` rolls of a die numbered 1 to ``Nsides``.

    Each roll is drawn with ``random.randint(1, Nsides)``, so both ends are
    inclusive. Rolling zero dice yields 0.
    """
    total = 0
    for _ in range(NDice):
        total += random.randint(1, Nsides)
    return total
#get the winning probability of the state/move
def get_fj(WinCount,LoseCount,x,y,j):
    """Return the observed win rate for rolling ``j`` dice in state ``<x, y>``.

    The rate is ``WinCount[x][y][j] / (WinCount[x][y][j] + LoseCount[x][y][j])``.

    Args:
        WinCount: nested sequence indexed as ``[x][y][j]`` holding win counts.
        LoseCount: nested sequence indexed as ``[x][y][j]`` holding loss counts.
        x: first state index.
        y: second state index.
        j: number of dice rolled.

    Returns:
        The win fraction as a float, or 0 when the state/move has no recorded
        games (the denominator would be zero).
    """
    wins = WinCount[x][y][j]
    total = wins + LoseCount[x][y][j]
    if total > 0:
        return wins / total
    # No data yet for this state/move. Previously this fallback sat after the
    # return inside the `if`, so it was unreachable and the function
    # returned None instead.
    return 0
def chooseDice(Score,LoseCount,WinCount,NDice,M):
    # Picks how many dice (1..NDice) to roll by drawing one value from
    # DiceChoice with random.choices, weighted by probabilityOfChoice.
    # NOTE(review): incomplete as shown. x, y and B_bestMove are never assigned
    # in this function (presumably x, y are unpacked from Score and B_bestMove
    # is the dice count with the largest fj -- TODO confirm), and Score and M
    # are not referenced in the visible lines.
    #T=the # of times the state <x,y> appeared
    T=sum(LoseCount[x][y][j]+WinCount[x][y][j] for j in range(1,NDice+1))
    # fjs[j-1] is the observed win rate for rolling j dice in state <x,y>.
    fjs=[get_fj(WinCount,LoseCount,x,y,j) for j in range(1,NDice+1)]
    #g is the probabilty of not rolling bestMove num of dice
    #aka the probability of winning by not taking the best move
    # NOTE(review): no assignment to g is visible, and T is computed but not
    # used in the visible lines.
    probabilityOfChoice=[None for _ in range(NDice)]
    DiceChoice=[i for i in range(1,NDice+1)]
    for j in range(1,NDice+1):
        if j !=B_bestMove:
            # NOTE(review): fj is read here but never stored into
            # probabilityOfChoice in the visible lines, so every weight is
            # still None when passed to random.choices below.
            fj= fjs[j-1]
    return random.choices(DiceChoice,probabilityOfChoice)[0]
    #///debug till here
def incrementCount(WinCount,LoseCount,winner_Moves,loser_Moves):
    """Add one to the count table entry for every recorded move.

    Each move is an ``(x, y, j)`` triple. Every triple in ``winner_Moves``
    bumps ``WinCount[x][y][j]`` and every triple in ``loser_Moves`` bumps
    ``LoseCount[x][y][j]``. Both tables are updated in place; nothing is
    returned.
    """
    # Winner moves are applied first, then loser moves.
    for table, moves in ((WinCount, winner_Moves), (LoseCount, loser_Moves)):
        for x, y, j in moves:
            table[x][y][j] += 1
def extractAnswer(WinCount,LoseCount):
    # Builds and returns two numAll x numAll matrices, (moves_matrix,
    # probability_matrix), both initialised to 0.
    # NOTE(review): incomplete as shown. numAll, numThis and fb_bestFj are not
    # assigned in this function, and the `if` below has no body, so the
    # matrices are never filled in by the visible lines. Presumably numAll is
    # the number of states per player and numThis is NDice+1 -- TODO confirm.
    moves_matrix=[[0 for y in range(numAll)]for x in range(numAll)]
    probability_matrix=[[0 for y in range(numAll)]for x in range(numAll)]
    for x in range(numAll):
        for y in range(numAll):
            #store all fj for each dice number
            # fjs[j-1] holds the win rate for rolling j dice, for j in 1..numThis-1.
            fjs=[get_fj(WinCount,LoseCount,x,y,j)for j in range(1,numThis)]
            #get the biggest fj
            # NOTE(review): body of this branch is missing.
            if fb_bestFj>0:
            #deduce the best move from the best fj
            #pass the matrix
    return moves_matrix,probability_matrix
if __name__=="__main__":
    # Script entry point: obtain the game parameters, then run the learning
    # experiment and print the resulting tables.
    # NOTE(review): no assignment to NDice, NSides, LTarget, UTarget, M or
    # NGames is visible in this block; the statements that read them appear to
    # be missing -- TODO restore.
    #getting input if not already done so  
        print("Input the folloing parameters which are all non-negative integers")
        # NOTE(review): the next seven lines are bare prose, not valid Python;
        # presumably they were part of the prompt text shown to the user --
        # TODO confirm and restore as string literals.
        NSides, the number of sides of the dice;
        LTarget, the lowest winning score;
        UTarget, the highest winning score;
        NDice the maximum number of dice that can be rolled per turn;
        M, the hyperparameter described above;
        NGAMES, the number of games to play;
        M is a floating point number; (the rest are integers)
    print("Reinforcement learning experiment with M =",M,"NGames =",NGames)
    # The whole experiment is repeated three times, each from empty tables.
    for run in range(3):
        # Fresh count tables of shape LTarget x LTarget x (NDice+1), all zero.
        # The last axis is indexed by the number of dice rolled.
        LoseCount=[[[0 for j in range(NDice+1)] for x in range(LTarget)] for y in range(LTarget)]
        WinCount=[[[0 for j in range(NDice+1)] for x in range(LTarget)] for y in range(LTarget)]
        #play the game "NGames" times
        for i in range(NGames):
            playGame(NDice, NSides,LTarget,UTarget,LoseCount,WinCount,M)
        # Turn the accumulated counts into the two result matrices.
        moves,probability = extractAnswer(WinCount,LoseCount)
        # Print each matrix one row per line, tab-separated.
        print('Play =')
        for m in moves:
            print('\t'.join('%5d' % v for v in m))
        print('\nProb =')
        for p in probability:
            print('\t'.join('%1.4f' % v for v in p))
Back to Top