-
Notifications
You must be signed in to change notification settings - Fork 416
/
c3d.py
187 lines (147 loc) · 6.23 KB
/
c3d.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# C3D, AlexNet type model with 3D convolutions (for video processing).
# From "Learning Spatiotemporal Features with 3D Convolutional Networks"
#
# Pretrained weights from https://data.vision.ee.ethz.ch/gyglim/C3D/c3d_model.pkl
# and the snipplet mean from
# https://data.vision.ee.ethz.ch/gyglim/C3D/snipplet_mean.npy
#
# License: Not specified
# Author: Michael Gygli, https://github.com/gyglim
#
import lasagne
from lasagne.layers.shape import PadLayer
from lasagne.layers import InputLayer, DenseLayer, NonlinearityLayer
from lasagne.layers.dnn import Conv3DDNNLayer, MaxPool3DDNNLayer
from lasagne.nonlinearities import softmax
import theano
import numpy as np
import skimage.transform
from skimage import color
import pickle
def build_model():
    '''
    Assemble the C3D network as a dictionary of Lasagne layers.

    The network takes clips of 16 RGB frames of 112x112 pixels, i.e. an input
    of shape (batch, 3, 16, 112, 112), runs them through five groups of 3D
    convolutions / max-poolings and three dense layers, and ends in a softmax
    over 487 classes.

    Returns
    -------
    dict
        Maps layer names to layers; the output (softmax) layer is stored
        under the key 'prob'
    '''
    relu = lasagne.nonlinearities.rectify

    def conv(incoming, num_filters, **extra):
        # Every convolution uses a 3x3x3 kernel, a padding of 1 and a ReLU
        return Conv3DDNNLayer(incoming, num_filters, (3, 3, 3), pad=1,
                              nonlinearity=relu, **extra)

    def pool(incoming, size):
        # Non-overlapping max-pooling: the stride equals the pooling window
        return MaxPool3DDNNLayer(incoming, pool_size=size, stride=size)

    layers = {}
    layers['input'] = InputLayer((None, 3, 16, 112, 112))

    # ----------- 1st layer group ---------------
    layers['conv1a'] = conv(layers['input'], 64, flip_filters=False)
    # The first pooling only reduces the spatial axes, not the temporal one
    layers['pool1'] = pool(layers['conv1a'], (1, 2, 2))

    # ------------- 2nd layer group --------------
    layers['conv2a'] = conv(layers['pool1'], 128)
    layers['pool2'] = pool(layers['conv2a'], (2, 2, 2))

    # ----------------- 3rd layer group --------------
    layers['conv3a'] = conv(layers['pool2'], 256)
    layers['conv3b'] = conv(layers['conv3a'], 256)
    layers['pool3'] = pool(layers['conv3b'], (2, 2, 2))

    # ----------------- 4th layer group --------------
    layers['conv4a'] = conv(layers['pool3'], 512)
    layers['conv4b'] = conv(layers['conv4a'], 512)
    layers['pool4'] = pool(layers['conv4b'], (2, 2, 2))

    # ----------------- 5th layer group --------------
    layers['conv5a'] = conv(layers['pool4'], 512)
    layers['conv5b'] = conv(layers['conv5a'], 512)
    # C3D pads only on the right before the last pooling, which a theano
    # pooling layer cannot do, so an explicit padding layer adds one element
    # at the end of each of the last two axes.
    layers['pad'] = PadLayer(layers['conv5b'], width=[(0, 1), (0, 1)],
                             batch_ndim=3)
    layers['pool5'] = MaxPool3DDNNLayer(layers['pad'], pool_size=(2, 2, 2),
                                        pad=(0, 0, 0), stride=(2, 2, 2))

    # ----------------- classifier --------------
    layers['fc6-1'] = DenseLayer(layers['pool5'], num_units=4096,
                                 nonlinearity=relu)
    layers['fc7-1'] = DenseLayer(layers['fc6-1'], num_units=4096,
                                 nonlinearity=relu)
    # The last dense layer is linear; the softmax is applied separately
    layers['fc8-1'] = DenseLayer(layers['fc7-1'], num_units=487,
                                 nonlinearity=None)
    layers['prob'] = NonlinearityLayer(layers['fc8-1'], softmax)

    return layers
def set_weights(net, model_file):
    '''
    Sets the parameters of the model using the weights stored in model_file

    Parameters
    ----------
    net: a Lasagne layer
        the layer whose (trainable) parameters, together with those of all
        layers below it, are overwritten
    model_file: string
        path to the pickle file that contains the weights

    Returns
    -------
    None
    '''
    # Pickles are binary data: the file has to be opened in binary mode,
    # otherwise loading fails on Python 3 and the data may get corrupted by
    # newline translation on Windows.
    with open(model_file, 'rb') as f:
        print('Load pretrained weights from %s...' % model_file)
        # NOTE: unpickling can execute arbitrary code; only load weight files
        # that come from a trusted source.
        # NOTE(review): a pickle written by Python 2 that contains numpy
        # arrays may additionally need encoding='latin1' on Python 3 - confirm
        # against the actual weight file.
        model = pickle.load(f)
    print('Set the weights...')
    lasagne.layers.set_all_param_values(net, model, trainable=True)
######## Below, there are several helper functions to transform (lists of) images into the right format ######
def get_snips(images, image_mean, start=0, with_mirrored=False):
    '''
    Converts a list of images to a 5d tensor that serves as input to C3D

    Parameters
    ----------
    images: 4d numpy array or list of 3d numpy arrays
        RGB images
    image_mean: 4d numpy array
        snipplet mean (given by C3D); it is subtracted from an array of shape
        (3, 16, 128, 171) and therefore has to broadcast against that shape
    start: int
        first frame to use from the list of images
    with_mirrored: bool
        return the snipplet and its mirrored version (horizontal flip)

    Returns
    -------
    caffe format 5D numpy array (serves as input to C3D) of shape
    (1, 3, 16, 112, 112), or (2, 3, 16, 112, 112) if with_mirrored is set
    '''
    assert len(images) >= start+16, "Not enough frames to fill a snipplet of 16 frames"

    # Convert images to caffe format and stack them. A list (rather than
    # map(), which is a lazy iterator on Python 3) is needed because
    # np.vstack requires a sequence of arrays.
    caffe_imgs = [rgb2caffe(frame).reshape(1, 3, 128, 171)
                  for frame in images[start:start+16]]
    # (16, 3, 128, 171) -> (3, 16, 128, 171)
    snip = np.vstack(caffe_imgs).swapaxes(0, 1)

    # Remove the mean
    snip -= image_mean

    # Get the center crop of 112x112 pixels
    snip = snip[:, :, 8:120, 29:141]
    snip = snip.reshape(1, 3, 16, 112, 112)

    if with_mirrored:  # Return normal and flipped version
        return np.vstack((snip, snip[:, :, :, :, ::-1]))
    return snip
def rgb2caffe(im, out_size=(128, 171)):
    '''
    Converts an RGB image to caffe format and downscales it as needed by C3D

    Parameters
    ----------
    im: numpy array
        an RGB image (height, width, channel) or a grayscale image
        (height, width)
    out_size: tuple of int
        (height, width) the image is resized to

    Returns
    -------
    a caffe image (channel, height, width) in BGR format, with dtype
    theano.config.floatX

    Raises
    ------
    ValueError
        if the image is neither 2- nor 3-dimensional
    '''
    im = np.copy(im)
    if im.ndim == 2:  # Make sure the image has 3 channels
        im = color.gray2rgb(im)
    if im.ndim != 3:
        raise ValueError('Expected a 2d or 3d image, got %d dimensions' % im.ndim)

    # preserve_range keeps the original value range instead of rescaling it
    im = skimage.transform.resize(im, out_size, preserve_range=True)

    # (height, width, channel) -> (channel, height, width)
    im = np.swapaxes(np.swapaxes(im, 1, 2), 0, 1)

    # Convert to BGR
    im = im[::-1, :, :]
    return np.array(im, theano.config.floatX)
def convert_back(raw_im, image_mean=None, idx=0):
    '''
    Converts a Caffe format image back to the standard format, so that it can be plotted.

    Parameters
    ----------
    raw_im: numpy array
        a bgr caffe image; format (channel, height, width). It is not
        modified, the conversion works on a copy.
    image_mean: numpy array or None (default: None)
        the C3D mean. If given, image_mean[idx, :, 8:120, 29:141] is added
        back to the image before the conversion.
        NOTE(review): get_snips subtracts the mean from an array laid out as
        (channel, frame, height, width) - confirm that the axis order of the
        array passed here matches this indexing.
    idx: integer (default: 0)
        position in the snipplet (used for mean addition, but differences are very small)

    Returns
    -------
    a RGB image of dtype uint8; format (height, width, channel)
    '''
    raw_im = np.copy(raw_im)
    if image_mean is not None:
        # Same crop window as the center crop taken when building a snipplet
        raw_im += image_mean[idx, :, 8:120, 29:141].squeeze()

    # Convert to RGB
    raw_im = raw_im[::-1, :, :]

    # Back in (height, width, channel) order
    im = np.swapaxes(np.swapaxes(raw_im, 1, 0), 2, 1)

    # Clip to the valid 8-bit range first: casting out-of-range floats to
    # uint8 would wrap around or give platform-dependent values.
    return np.array(np.clip(im, 0, 255), np.uint8)