import numpy as np
import random

# Velocity is clipped to the range [-VELOCITY_BOUND, VELOCITY_BOUND].
VELOCITY_BOUND = 5
# Position is clipped to [-POSITION_BOUND, POSITION_BOUND]; both extremes are
# treated as failure (terminal) positions by is_bad().
POSITION_BOUND = 50
# The light component of a state is a non-zero counter in
# [-LIGHT_BOUND, LIGHT_BOUND]; negative values mean red. It steps toward 0 and
# flips to the opposite extreme instead of reaching 0.
LIGHT_BOUND = 5
# Goal position: a state is good when the car is here with velocity 0.
TARGET_POSITION = 23
# NOTE(review): not referenced anywhere in the visible code; presumably meant
# for the value computation inside get_policy() - confirm.
DISCOUNT_FACTOR = 0.99

def clip(a, mi, ma):
  """Clamp `a` into the closed interval [mi, ma].

  Returns `mi` when `a` is below it, `ma` when the result would reach or
  exceed it, and `a` itself otherwise. The upper bound wins if mi > ma.
  """
  # Apply the lower bound first, then cap with the upper bound.
  lower_bounded = mi if mi > a else a
  if lower_bounded < ma:
    return lower_bounded
  return ma

def get_next_state_fixed_action(state, action):
  """Return the successor of `state` when exactly `action` is executed.

  Args:
    state: tuple (position, velocity, light).
    action: value added to the velocity before moving.

  Returns:
    Tuple (new_position, new_velocity, new_light).
  """
  position, velocity, light = state

  # The action changes the velocity; the (clipped) new velocity moves the car.
  velocity = clip(velocity + action, -VELOCITY_BOUND, VELOCITY_BOUND)
  new_position = clip(position + velocity, -POSITION_BOUND, POSITION_BOUND)
  # The product is <= 0 when the old and new positions lie on opposite sides
  # of 0, or when either of them is exactly 0.
  if position * new_position <= 0 and light < 0:
    # We went through on red -> sink (failure) state. Use the shared constant
    # so this always matches what is_bad() checks.
    new_position = POSITION_BOUND

  # Advance the light counter one step toward 0; instead of reaching 0 it
  # flips to the opposite extreme (red -> +LIGHT_BOUND, else -LIGHT_BOUND).
  if light < 0:
    light += 1
    if light == 0:
      light = LIGHT_BOUND
  else:
    light -= 1
    if light == 0:
      light = -LIGHT_BOUND

  return new_position, velocity, light

# Returns distribution for next state (transition probability)
def get_next_state_distrib(state, action):
  """Return the transition distribution for taking `action` in `state`.

  The requested action is executed with probability 0.9; each of the other
  two actions is executed instead with probability 0.05.

  Args:
    state: tuple (position, velocity, light).
    action: requested action, one of -1, 0, 1.

  Returns:
    List of three (next_state, probability) pairs, one per executed action in
    the order -1, 0, 1. Entries are not merged, so the same next state can
    appear more than once (e.g. when velocity clipping hides the difference).

  Raises:
    AssertionError: if `action` is not -1, 0 or 1.
  """
  assert action in (-1, 0, 1)
  # Each outcome must be simulated with the action actually executed (`a`),
  # not the requested one; otherwise all three outcomes would be identical.
  return [
    (get_next_state_fixed_action(state, a), 0.9 if a == action else 0.05)
    for a in (-1, 0, 1)
  ]

# Returns next state
def get_next_state(state, action):
  """Sample one successor state for taking `action` in `state`.

  The successor is drawn at random according to the probabilities given by
  get_next_state_distrib(state, action).
  """
  # Split the (next_state, probability) pairs into two parallel sequences.
  outcomes, weights = zip(*get_next_state_distrib(state, action))
  picked = np.random.choice(range(len(outcomes)), p=list(weights))
  return outcomes[picked]

def is_good(state):
  """Tell whether `state` is the goal: at TARGET_POSITION with velocity 0.

  The light component of the state is ignored.
  """
  position, velocity, _light = state
  at_target = position == TARGET_POSITION
  stopped = velocity == 0
  return at_target and stopped

def is_bad(state):
  """Tell whether `state` is a failure state.

  A state is bad when its position sits on either edge of the track, i.e.
  equals POSITION_BOUND or -POSITION_BOUND. Velocity and light are ignored.
  """
  position, _velocity, _light = state
  return position in (POSITION_BOUND, -POSITION_BOUND)

def is_final(state):
  """Tell whether `state` ends an episode (it is either good or bad)."""
  reached_goal = is_good(state)
  if reached_goal:
    return reached_goal
  return is_bad(state)
    
def get_reward(state):
  """Return the reward collected for being in `state`.

  100.0 for a good state, -100.0 for a bad state, and -0.1 for every other
  state.
  """
  reward = -0.1
  if is_good(state):
    reward = 100.0
  elif is_bad(state):
    reward = -100.0
  return reward

# Fill out this function
def get_policy():
  """Build and return the policy as a dict mapping state -> action.

  For each state, calculate the policy and return it. The current
  placeholder assigns action 1 to every (position, velocity, light) state
  with position in [-POSITION_BOUND, POSITION_BOUND], velocity in
  [-VELOCITY_BOUND, VELOCITY_BOUND] and non-zero light in
  [-LIGHT_BOUND, LIGHT_BOUND].
  """
  # A light value of 0 is not a valid state, so it is left out.
  light_values = [l for l in range(-LIGHT_BOUND, LIGHT_BOUND + 1) if l != 0]
  return {
    (position, velocity, light): 1
    for position in range(-POSITION_BOUND, POSITION_BOUND + 1)
    for velocity in range(-VELOCITY_BOUND, VELOCITY_BOUND + 1)
    for light in light_values
  }

def simulate(policy, state=None):
  """Run one episode under `policy`, printing every step and the outcome.

  Args:
    policy: mapping from state tuple (position, velocity, light) to action.
    state: optional start state. When omitted, the episode starts at a
      random non-boundary position with velocity 0 and light 1.

  The accumulated reward includes the reward of every visited state,
  including the final one. Prints "Success" if the episode ends in a good
  state and "Fail" otherwise.
  """
  if state is None:
    # random.randint includes BOTH endpoints, so the upper limit must be
    # POSITION_BOUND - 1; otherwise the episode could start directly in the
    # failure state at POSITION_BOUND.
    position = random.randint(-POSITION_BOUND + 1, POSITION_BOUND - 1)
    velocity = 0
    light = 1

    state = position, velocity, light

  total_reward = 0
  while not is_final(state):
    total_reward += get_reward(state)
    print("current_state: ", state, "total_reward:", total_reward)
    state = get_next_state(state, policy[state])
  # The terminal state's reward is counted as well.
  total_reward += get_reward(state)
  print ("final_state: ", state, "total_reward:", total_reward)
  if is_good(state):
    print ("Success")
  else:
    print ("Fail")

# Build the policy once; it is reused for every episode simulated below.
policy = get_policy()

# Run 20 episodes from random start states, with a blank line after each.
for i in range(20):
  simulate(policy)
  print()

# One more episode from a fixed start state: position -1, velocity 0 and
# light -5 (a negative light value means red).
simulate(policy, (-1, 0, -5))
