# stack.py (forked from kduxin/firelang)
from __future__ import annotations
from typing import List, Tuple, Mapping, Set, Iterable, Any, Union
from copy import deepcopy
from collections import OrderedDict, defaultdict
import inspect
import numpy as np
import torch
from torch import Tensor
from torch.nn import Module, ModuleList, ModuleDict
from .utils.index import IndexLike
from .utils.shape import parse_shape

__all__ = [
    "clear_cache",
    "current_cache_sizes",
    "StackingSlicing",
]

# Cache of restacked module instances, keyed by a tag derived from the class
# name and its parameter shapes; see `StackingSlicing.restack`.
_cache = defaultdict(OrderedDict)


def clear_cache():
    """Empty the cache of restacked modules."""
    _cache.clear()


def current_cache_sizes():
    """Return the number of cached copies stored under each cache tag."""
    return {key: len(dct) for key, dct in _cache.items()}


class StackingSlicing(Module):
    """Base class for modules whose parameters carry leading stacking
    dimensions given by `shape`. Instances can be sliced along those
    dimensions with `__getitem__` and rebuilt with a new stacking shape
    via `restack`.
    """

    init_locals: Mapping[str, Any]
    unsliceable_params: Set[str]

    # Constructor arguments that `restack` supplies itself rather than
    # replaying from `init_locals`.
    skip_keys = ["shape"]

    def __init__(
        self,
        locals_: Mapping[str, Any],
        unsliceable_params: Iterable[str] = (),
    ):
        """`locals_` is expected to be the `locals()` captured at the top of
        the subclass `__init__` (it must contain `shape`), so that the
        constructor arguments can be replayed by `restack`."""
        Module.__init__(self)
        self.init_locals = locals_
        self._sanity_check()
        self.shape = locals_["shape"]
        self.unsliceable_params = set(unsliceable_params)

    def register_extra_init_kwargs(self, **kwargs):
        """Record additional keyword arguments to replay when restacking."""
        for key, val in kwargs.items():
            self.init_locals[key] = val

    def register_extra_unsliceable_params(self, *names):
        """Mark parameters that are copied whole instead of being sliced."""
        for name in names:
            self.unsliceable_params.add(name)

def _sanity_check(self):
assert (
type(self) != StackingSlicing
), "StackingSlicing must be initialized from a subclass"
assert "shape" in self.init_locals, (
"A `StackingSlicing` subclass must accept `shape` as an "
"initialization argument."
)

    def __getitem__(self, index: IndexLike):
        # An empty probe tensor is used only to work out the shape that
        # `index` selects out of `self.shape`; no parameter data is touched.
        new_shape = tuple(
            torch.empty(self.shape, device=self.detect_device())[index].shape
        )
        to: StackingSlicing = self.restack(new_shape)
        # A parameter not listed in `unsliceable_params` should be
        # sliced and copied. Otherwise, the whole parameter is copied.
        for name, param in self.named_parameters(recurse=False):
            param_to = to.get_parameter(name)
            param_to.requires_grad_(False)
            if name in self.unsliceable_params:
                param_to.copy_(param)
            else:
                param_to.copy_(param[index])
# A submodule that is a `StackingSlicing` should be sliced
# and copied. Otherwise, the whole submodule is copied.
for name, module in self.named_children():
submod_to: Module = to.get_submodule(name)
submod_to.requires_grad_(False)
if isinstance(module, StackingSlicing):
submod_from: Module = module.__getitem__(index)
elif isinstance(module, ModuleList):
submod_from = ModuleList(
[
entry[index] if isinstance(entry, StackingSlicing) else entry
for entry in module
]
)
elif isinstance(module, ModuleDict):
submod_from = ModuleDict(
{
key: entry[index]
if isinstance(entry, StackingSlicing)
else entry
for key, entry in module.items()
}
)
else:
submod_from: Module = module
submod_from.shape = new_shape
setattr(to, name, submod_from)
to.shape = new_shape
return to

    def detect_device(self):
        """Return the device of the first parameter found among
        `self.modules()` (including replica `_former_parameters`)."""
for m in self.modules():
try:
return next(m.parameters()).device
except StopIteration:
if hasattr(m, "_former_parameters"):
# `self` is an instance from torch.nn.parallel.replicate
fp = m._former_parameters
if len(fp):
return next(iter(fp.values())).device
raise ValueError("Failed to detect the device.")

    def detect_dtype(self):
        """Return the dtype of the first parameter found among
        `self.modules()` (including replica `_former_parameters`)."""
for m in self.modules():
try:
return next(m.parameters()).dtype
except StopIteration:
if hasattr(m, "_former_parameters"):
# `self` is an instance from torch.nn.parallel.replicate
fp = m._former_parameters
if len(fp):
return next(iter(fp.values())).dtype
raise ValueError("Failed to detect the dtype.")

    def _parameter_shape_hash(self):
        """Hash of the names and shapes of all parameters; part of the
        cache tag used by `restack`."""
        name_shapes = [(name, p.shape) for name, p in self.named_parameters()]
        return hash(tuple(name_shapes))

    def restack(
        self,
        *shape: int | Tuple[int, ...],
        use_cached: bool = True,
        max_cached_copies: int = 100,
    ):
        """Build a fresh instance of the same class stacked to `shape`,
        replaying the constructor arguments stored in `init_locals`.
        Freshly built instances are cached and deep-copied on reuse."""
if len(shape) == 1 and isinstance(shape[0], Iterable):
shape = tuple(shape[0])
tag = f"stacked/{self.__class__.__name__}-{self._parameter_shape_hash()}"
if use_cached and shape in _cache[tag]:
new = deepcopy(_cache[tag][shape])
else:
positional, keywords = self._recover_args_from_locals(
locals_=self.init_locals,
)
new = self.__class__(
*positional,
**keywords,
shape=shape,
)
new = new.to(self.detect_device())
_cache[tag][shape] = new
while len(_cache[tag]) > max_cached_copies:
_cache[tag].popitem(last=False) # pop the earliest
new.shape = shape
return new.to(self.detect_device())
stack = restack

    def _recover_args_from_locals(
        self,
        locals_: Mapping[str, Any],
    ):
        """Reconstruct positional and keyword arguments for `__init__` from
        the stored `locals()` snapshot, skipping the keys in `skip_keys`."""
signature = inspect.signature(self.__init__)
positional = []
keywords = {}
for key, sign in signature.parameters.items():
if key in self.skip_keys:
continue
value = locals_[key]
if sign.kind not in [
inspect.Parameter.VAR_POSITIONAL,
inspect.Parameter.VAR_KEYWORD,
]:
keywords[key] = value
elif sign.kind == inspect.Parameter.VAR_POSITIONAL:
positional.extend(value)
elif sign.kind == inspect.Parameter.VAR_KEYWORD:
keywords = {**keywords, **value}
return positional, keywords

    @property
    def ndim(self) -> int:
        """Number of stacking dimensions, i.e., `len(self.shape)`."""
        return len(self.shape)
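

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). `StackedAffine` below is a
# hypothetical subclass written for this demo and is not part of the library;
# it shows the pattern implied by `__init__` and `_sanity_check`: pass
# `locals()` (which must contain `shape`) to `StackingSlicing.__init__`, then
# slice or restack the module. Because this file uses relative imports, run
# the demo as a module, e.g. `python -m <package>.stack`.
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    class StackedAffine(StackingSlicing):
        """A stack of independent element-wise affine maps."""

        def __init__(self, dim: int, shape: Tuple[int, ...] = (1,)):
            StackingSlicing.__init__(self, locals())
            self.weight = torch.nn.Parameter(torch.randn(*shape, dim))
            self.bias = torch.nn.Parameter(torch.zeros(*shape, dim))

        def forward(self, x: Tensor) -> Tensor:
            return x * self.weight + self.bias

    stacked = StackedAffine(dim=8, shape=(10,))
    sub = stacked[2:5]          # slice along the stack: sub.shape == (3,)
    fresh = stacked.restack(4)  # fresh instance with stack shape (4,)
    print(sub.shape, fresh.shape)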