# Copyright 2019 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Hexner's one-step game for T-dt implemented in Python.

This is a simple demonstration of implementing a game in Python, featuring
chance and imperfect information.

Python games are significantly slower than C++, but they may still be
suitable for prototyping or for small games.

It is possible to run C++ algorithms on Python-implemented games. This is
likely to have good performance if the algorithm simply extracts a game tree
and then works with that. It is likely to be poor if the algorithm relies on
processing and updating states as it goes, e.g. MCTS.
"""
from itertools import product

import numpy as np
import pyspiel
# Discretize each player's 2-D acceleration control into an n x n grid and
# build dictionaries mapping flat action indices to control vectors.
ux_max = 12
uy_max = 12
dx_max = 12
dy_max = 12
n = 4
_uxs = np.linspace(-ux_max, ux_max, n)
_uys = np.linspace(-uy_max, uy_max, n)
_dxs = np.linspace(-dx_max, dx_max, n)
_dys = np.linspace(-dy_max, dy_max, n)
_us = list(product(_uxs, _uys))
_ds = list(product(_dxs, _dys))
_umap = dict(enumerate(_us))
_dmap = dict(enumerate(_ds))
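# For example, with n = 4 each control grid is [-12, -4, 4, 12], so action 0
# maps to the control vector (-12.0, -12.0) and action n**2 - 1 = 15 maps to
# (12.0, 12.0).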
_NUM_PLAYERS = 2
# p1's two possible types (goal assignments); a tuple keeps the iteration
# order deterministic when zipped with probabilities in chance_outcomes().
_TYPE = (0, 1)
_dt = 0.25
_R1 = np.array([[0.05, 0], [0, 0.025]])  # p1's control-effort cost weights
_R2 = np.array([[0.05, 0], [0, 0.1]])  # p2's control-effort cost weights
_GAME_TYPE = pyspiel.GameType(
    short_name="python_hexner_full",
    long_name="Python Hexner's Game",
    dynamics=pyspiel.GameType.Dynamics.SIMULTANEOUS,
    chance_mode=pyspiel.GameType.ChanceMode.EXPLICIT_STOCHASTIC,
    information=pyspiel.GameType.Information.IMPERFECT_INFORMATION,
    utility=pyspiel.GameType.Utility.ZERO_SUM,
    reward_model=pyspiel.GameType.RewardModel.REWARDS,
    max_num_players=_NUM_PLAYERS,
    min_num_players=_NUM_PLAYERS,
    provides_information_state_string=True,
    provides_information_state_tensor=True,
    provides_observation_string=True,
    provides_observation_tensor=True,
    provides_factored_observation_string=True,
)
_GAME_INFO = pyspiel.GameInfo(
    num_distinct_actions=n * n,
    max_chance_outcomes=len(_TYPE),  # p1's types
    num_players=_NUM_PLAYERS,
    min_utility=-1e6,
    max_utility=1e6,
    utility_sum=0.0,
    max_game_length=4)  # total number of simultaneous time steps
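# With n = 4 this gives each player 16 distinct actions per step and a
# horizon of 4 simultaneous moves.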


class HexnerGame(pyspiel.Game):
  """A Python version of Hexner's game."""

  def __init__(self, params=None):
    super().__init__(_GAME_TYPE, _GAME_INFO, params or dict())

  def new_initial_state(self):
    """Returns a state corresponding to the start of a game."""
    return HexnerState(self)

  def make_py_observer(self, iig_obs_type=None, params=None):
    """Returns an object used for observing game state."""
    return HexnerObserver(
        iig_obs_type or pyspiel.IIGObservationType(perfect_recall=False),
        params)


def _go_forward(states, actions):
  """Applies point-mass dynamics to advance each player's state."""
  u = _umap[actions[0]]
  d = _dmap[actions[1]]
  ux, uy = u
  dx, dy = d
  x1, y1, vx1, vy1, x2, y2, vx2, vy2 = states
  # Discrete-time double integrator: constant acceleration over one step.
  x1_next = x1 + vx1 * _dt + 0.5 * ux * _dt ** 2
  y1_next = y1 + vy1 * _dt + 0.5 * uy * _dt ** 2
  vx1_next = vx1 + ux * _dt
  vy1_next = vy1 + uy * _dt
  x2_next = x2 + vx2 * _dt + 0.5 * dx * _dt ** 2
  y2_next = y2 + vy2 * _dt + 0.5 * dy * _dt ** 2
  vx2_next = vx2 + dx * _dt
  vy2_next = vy2 + dy * _dt
  # Instantaneous control-effort cost for p1 minus that for p2 (zero-sum).
  ins_reward = _dt * (np.sum((np.array(u) ** 2) * np.diag(_R1)) -
                      np.sum((np.array(d) ** 2) * np.diag(_R2)))
  # Keep positions inside the unit square.
  x1_next = np.clip(x1_next, -1, 1)
  y1_next = np.clip(y1_next, -1, 1)
  x2_next = np.clip(x2_next, -1, 1)
  y2_next = np.clip(y2_next, -1, 1)
  return ([x1_next, y1_next, vx1_next, vy1_next,
           x2_next, y2_next, vx2_next, vy2_next],
          np.array([ins_reward, -ins_reward]))
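
# A quick sanity check of the dynamics (illustrative only, not used by the
# game): starting from rest, one step of maximal thrust for p1 (action
# n**2 - 1 = 15, i.e. control (12, 12)) moves each position coordinate by
# 0.5 * 12 * _dt**2 = 0.375 and raises each velocity component to
# 12 * _dt = 3.0:
#
#   next_state, _ = _go_forward([0.0] * 8, [15, 0])
#   next_state[:4]  # -> [0.375, 0.375, 3.0, 3.0]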


class HexnerState(pyspiel.State):
  """A Python version of the Hexner game state."""

  def __init__(self, game):
    """Constructor; should only be called by Game.new_initial_state."""
    super().__init__(game)
    initial_state = [np.zeros(8)]  # positions and velocities of both players
    # initial_state[0][0] = -0.5  # fix p1 to left of the center
    # initial_state[0][4] = 0.5
    # Sample initial positions uniformly in the unit square.
    x1 = np.random.uniform(-1, 1)
    y1 = np.random.uniform(-1, 1)
    x2 = np.random.uniform(-1, 1)
    y2 = np.random.uniform(-1, 1)
    initial_state[0][0] = x1
    initial_state[0][1] = y1
    initial_state[0][4] = x2
    initial_state[0][5] = y2
    self.states = np.array(initial_state).reshape(1, -1)
    self.actions = []
    self._game_over = False
    self._next_player = 0
    # The first move is the chance node that draws p1's type.
    self._is_chance = True
    self.t_step = 0  # initial time step is 0
    self._rewards = np.zeros(_NUM_PLAYERS)
    self._returns = np.zeros(_NUM_PLAYERS)
    # p1's type one-hot: [0, 1] -> goal 1 (up), [1, 0] -> goal 2 (down).
    self.p1type = [0, 0]
    self.p = 0.5  # prior probability of p1's type 0

  # OpenSpiel (PySpiel) API functions are below. This is the standard set
  # that should be implemented by every simultaneous-move game with chance.

  def current_player(self):
    """Returns id of the next player to move, or TERMINAL if game is over."""
    if self._game_over:
      return pyspiel.PlayerId.TERMINAL
    elif self._is_chance:
      return pyspiel.PlayerId.CHANCE
    else:
      return pyspiel.PlayerId.SIMULTANEOUS

  def _legal_actions(self, player):
    """Returns a list of legal actions, sorted in ascending order."""
    assert player >= 0
    return list(_umap.keys()) if player == 0 else list(_dmap.keys())

  def chance_outcomes(self):
    """Returns the possible chance outcomes and their probabilities."""
    assert self._is_chance
    outcomes = _TYPE
    p = [self.p, 1 - self.p]
    return list(zip(outcomes, p))

  def _apply_action(self, action):
    """Applies the specified chance action to the state."""
    # This is not called at simultaneous-move states.
    assert self._is_chance and not self._game_over
    self._is_chance = False
    self.p1type[action] = 1

  def _apply_actions(self, actions):
    """Applies the specified actions (per player) to the state."""
    assert not self._is_chance and not self._game_over
    self.actions.append(actions)
    _next_states, _ins_costs = _go_forward(self.states[-1], actions)
    self.states = np.vstack((self.states, _next_states))
    self._rewards = _ins_costs
    self.t_step += 1
    self._game_over = self.t_step >= self.get_game().max_game_length()
    # Accumulate the running cost each step (returns are negated costs) and
    # add the terminal cost once the horizon is reached.
    self._returns -= self._rewards
    if self._game_over:
      self._returns -= self._terminal_cost(_next_states)

  def _terminal_cost(self, states):
    """Computes the terminal cost of the game based on p1's type."""
    goal_1 = np.array([0, 1])
    goal_2 = np.array([0, -1])
    x1 = states[:2]
    x2 = states[4:6]
    # Relative squared distance to each goal: p1 wants to end up closer to
    # its assigned goal than p2 does.
    dist_to_goal_1 = (np.linalg.norm(x1 - goal_1) ** 2 -
                      np.linalg.norm(x2 - goal_1) ** 2)
    dist_to_goal_2 = (np.linalg.norm(x1 - goal_2) ** 2 -
                      np.linalg.norm(x2 - goal_2) ** 2)
    if self.p1type == [0, 1]:
      terminal = dist_to_goal_1
    else:
      terminal = dist_to_goal_2
    return np.array([terminal, -terminal])

  def _action_to_string(self, player, action):
    """Action -> string."""
    if player == pyspiel.PlayerId.CHANCE:
      return f"Goal:{'2' if action == 0 else '1'}"
    return f"{action}"  # just return the action index for either player

  def is_terminal(self):
    """Returns True if the game is over."""
    return self._game_over

  def rewards(self):
    """Reward at the previous step."""
    return self._rewards

  def returns(self):
    """Total reward for each player over the course of the game."""
    return self._returns

  def __str__(self):
    """String for debug purposes. No particular semantics are required."""
    return (f"p1:{self.action_history_string(0)} "
            f"p2:{self.action_history_string(1)}")

  def action_history_string(self, player):
    return "".join(
        self._action_to_string(pa.player, pa.action)[0]
        for pa in self.full_history()
        if pa.player == player)


class HexnerObserver:
  """Observer, conforming to the PyObserver interface (see observation.py)."""

  def __init__(self, iig_obs_type, params):
    """Initializes an empty observation tensor."""
    assert not bool(params)
    self.iig_obs_type = iig_obs_type
    # Determine which observation pieces we want to include.
    pieces = []
    if iig_obs_type.private_info == pyspiel.PrivateInfoType.SINGLE_PLAYER:
      pieces.append(("player_type", 2, (2,)))
    if iig_obs_type.public_info:
      if iig_obs_type.perfect_recall:
        pieces.append(("state", 8, (8,)))
        pieces.append(("actions", 2 * 4 * n**2, (4, 2 * n**2)))
    # Build the single flat tensor.
    total_size = sum(size for name, size, shape in pieces)
    self.tensor = np.zeros(total_size, np.float32)
    # Build the named & reshaped views of the bits of the flat tensor.
    self.dict = {}
    index = 0
    for name, size, shape in pieces:
      self.dict[name] = self.tensor[index:index + size].reshape(shape)
      index += size

  def set_from(self, state, player):
    """Updates `tensor` and `dict` to reflect `state` from PoV of `player`."""
    self.tensor.fill(0)
    # Write into the views in place so the flat `tensor` stays in sync.
    if "player_type" in self.dict and player == 0:
      self.dict["player_type"][:] = state.p1type
    if "state" in self.dict:
      self.dict["state"][:] = state.states[-1]
    if "actions" in self.dict:
      for stage, actions in enumerate(state.actions):
        # One-hot encode p1's action ...
        self.dict["actions"][stage, actions[0]] = 1
        # ... and p2's action, offset by the size of p1's action space.
        self.dict["actions"][stage, n**2 + actions[1]] = 1

  def string_from(self, state, player):
    """Observation of `state` from the PoV of `player`, as a string."""
    pieces = []
    if "player_type" in self.dict and player == 0:
      pieces.append(f"type:{state.p1type}")
    pieces.append(f"us:{state.action_history_string(player)} "
                  f"op:{state.action_history_string(1 - player)}")
    return " ".join(pieces)


# Register the game with the OpenSpiel library.
pyspiel.register_game(_GAME_TYPE, HexnerGame)
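

# Example usage (a minimal sketch, not part of the game definition): load the
# registered game and play out one episode with uniformly random actions.
if __name__ == "__main__":
  game = pyspiel.load_game("python_hexner_full")
  state = game.new_initial_state()
  while not state.is_terminal():
    if state.is_chance_node():
      # Resolve the chance node that assigns p1's type.
      outcomes, probs = zip(*state.chance_outcomes())
      state.apply_action(int(np.random.choice(outcomes, p=probs)))
    else:
      # Both players move simultaneously.
      state.apply_actions([
          int(np.random.choice(state.legal_actions(pid)))
          for pid in range(_NUM_PLAYERS)
      ])
  print("Returns:", state.returns())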