Description
Training was interrupted on a server at around epoch 189.
The same config seems to run fine on my own computer.
```
Exception in thread Thread-4:
Traceback (most recent call last):
File "/home/alex2/workspace/miniconda3/envs/torch/lib/python3.7/threading.py", line 926, in _bootstrap_inner
self.run()
File "/home/alex2/workspace/miniconda3/envs/torch/lib/python3.7/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/home/alex2/hx_workspare/HandyRL/handyrl/connection.py", line 175, in _sender
conn.send(next(self.send_generator))
File "/home/alex2/workspace/miniconda3/envs/torch/lib/python3.7/multiprocessing/connection.py", line 206, in send
self._send_bytes(_ForkingPickler.dumps(obj))
File "/home/alex2/workspace/miniconda3/envs/torch/lib/python3.7/multiprocessing/connection.py", line 398, in _send_bytes
self._send(buf)
File "/home/alex2/workspace/miniconda3/envs/torch/lib/python3.7/multiprocessing/connection.py", line 368, in _send
n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe
Exception in thread Thread-6:
Traceback (most recent call last):
File "/home/alex2/workspace/miniconda3/envs/torch/lib/python3.7/threading.py", line 926, in _bootstrap_inner
self.run()
File "/home/alex2/workspace/miniconda3/envs/torch/lib/python3.7/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/home/alex2/hx_workspare/HandyRL/handyrl/connection.py", line 190, in _receiver
data, cnt = conn.recv()
File "/home/alex2/workspace/miniconda3/envs/torch/lib/python3.7/multiprocessing/connection.py", line 250, in recv
buf = self._recv_bytes()
File "/home/alex2/workspace/miniconda3/envs/torch/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/home/alex2/workspace/miniconda3/envs/torch/lib/python3.7/multiprocessing/connection.py", line 383, in _recv
raise EOFError
EOFError
Exception in thread Thread-5:
Traceback (most recent call last):
File "/home/alex2/workspace/miniconda3/envs/torch/lib/python3.7/threading.py", line 926, in _bootstrap_inner
self.run()
File "/home/alex2/workspace/miniconda3/envs/torch/lib/python3.7/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/home/alex2/hx_workspare/HandyRL/handyrl/connection.py", line 190, in _receiver
data, cnt = conn.recv()
File "/home/alex2/workspace/miniconda3/envs/torch/lib/python3.7/multiprocessing/connection.py", line 251, in recv
return _ForkingPickler.loads(buf.getbuffer())
File "/home/alex2/workspace/miniconda3/envs/torch/lib/python3.7/site-packages/torch/multiprocessing/reductions.py", line 282, in rebuild_storage_fd
fd = df.detach()
File "/home/alex2/workspace/miniconda3/envs/torch/lib/python3.7/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/home/alex2/workspace/miniconda3/envs/torch/lib/python3.7/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/home/alex2/workspace/miniconda3/envs/torch/lib/python3.7/multiprocessing/connection.py", line 492, in Client
c = SocketClient(address)
File "/home/alex2/workspace/miniconda3/envs/torch/lib/python3.7/multiprocessing/connection.py", line 620, in SocketClient
s.connect(address)
ConnectionRefusedError: [Errno 111] Connection refused
```
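All three tracebacks are consistent with the process on the other end of the multiprocessing connection having already exited: the sender thread then hits `BrokenPipeError`, a blocking `recv()` in a receiver thread hits `EOFError`, and unpickling a shared tensor fails with `ConnectionRefusedError` because the dead process's resource-sharer listener is no longer accepting connections. A minimal sketch (illustrative only, not HandyRL code) that reproduces the first two symptoms:

```python
# Minimal repro sketch (assumption: the peer process of a multiprocessing Pipe
# exits while the other side keeps sending/receiving, as in the traces above).
import multiprocessing as mp


def worker(conn):
    conn.recv()   # handle one message...
    conn.close()  # ...then exit, leaving the other end orphaned


if __name__ == '__main__':
    parent_conn, child_conn = mp.Pipe()
    p = mp.Process(target=worker, args=(child_conn,))
    p.start()
    child_conn.close()         # drop the parent's copy of the child end
    parent_conn.send('first')  # delivered normally
    p.join()                   # the worker process has now exited

    try:
        parent_conn.send('second')   # peer is gone -> BrokenPipeError (Errno 32)
    except BrokenPipeError as e:
        print('sender side:', e)

    try:
        parent_conn.recv()           # nothing left to read -> EOFError
    except EOFError:
        print('receiver side: EOFError')
```

So the question is probably less about the send/recv code itself and more about why one of the batcher/worker processes on the server died (e.g. it was killed or crashed) around epoch 189.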
```yaml
train_args:
    turn_based_training: False
    observation: True
    gamma: 0.8
    forward_steps: 32
    compress_steps: 4
    entropy_regularization: 2.0e-3
    entropy_regularization_decay: 0.3
    update_episodes: 300
    batch_size: 400
    minimum_episodes: 10000
    maximum_episodes: 250000
    num_batchers: 7
    eval_rate: 0.1
    worker:
        num_parallel: 6
    lambda: 0.7
    policy_target: 'UPGO' # 'UPGO' 'VTRACE' 'TD' 'MC'
    value_target: 'TD' # 'VTRACE' 'TD' 'MC'
    seed: 0
    restart_epoch: 0

worker_args:
    server_address: ''
    num_parallel: 6
```