In [1]:
# By Justin Johnson https://github.com/jcjohnson/pytorch-examples/blob/master/nn/dynamic_net.py

import random
import torch
from torch.autograd import Variable

"""
To showcase the power of PyTorch dynamic graphs, we will implement a very strange
model: a fully-connected ReLU network that on each forward pass randomly chooses
a number between 1 and 4 and has that many hidden layers, reusing the same
weights multiple times to compute the innermost hidden layers.
"""

class DynamicNet(torch.nn.Module):
  def __init__(self, D_in, H, D_out):
    """
    In the constructor we construct three nn.Linear instances that we will use
    in the forward pass.
    """
    super(DynamicNet, self).__init__()
    self.input_linear = torch.nn.Linear(D_in, H)
    self.middle_linear = torch.nn.Linear(H, H)
    self.output_linear = torch.nn.Linear(H, D_out)

  def forward(self, x, verbose = False):
    """
    For the forward pass of the model, we randomly choose either 0, 1, 2, or 3
    and reuse the middle_linear Module that many times to compute hidden layer
    representations.
    Since each forward pass builds a dynamic computation graph, we can use normal
    Python control-flow operators like loops or conditional statements when
    defining the forward pass of the model.
    Here we also see that it is perfectly safe to reuse the same Module many
    times when defining a computational graph. This is a big improvement from Lua
    Torch, where each Module could be used only once.
    """
    h_relu = self.input_linear(x).clamp(min=0)
    n_layers = random.randint(0, 3)
    if verbose:
        print("The number of layers for this run is", n_layers)
        # print(h_relu)
    for _ in range(n_layers):
        h_relu = self.middle_linear(h_relu).clamp(min=0)
        if verbose:
            pass
            # print(h_relu)
    y_pred = self.output_linear(h_relu)
    return y_pred




# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 10, 1

# Create random Tensors to hold inputs and outputs, and wrap them in Variables
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Construct our model by instantiating the class defined above
model = DynamicNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum
criterion = torch.nn.MSELoss(size_average=False)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
  # Forward pass: Compute predicted y by passing x to the model
  y_pred = model(x)

  # Compute and print loss
  loss = criterion(y_pred, y)
  print(t, loss.data[0])

  # Zero gradients, perform a backward pass, and update the weights.
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

0 66.81979370117188
1 62.70124053955078
2 60.11635971069336
3 59.01012420654297
4 58.9544677734375
5 59.42093276977539
6 58.79389190673828
7 58.585784912109375
8 58.195953369140625
9 58.74108123779297
10 37.2628173828125
11 58.01183319091797
12 56.33346176147461
13 58.70774841308594
14 57.56559371948242
15 31.000629425048828
16 58.64924621582031
17 57.091064453125
18 53.6650505065918
19 56.62508773803711
20 52.271949768066406
21 51.258689880371094
22 58.11512756347656
23 48.78624725341797
24 47.30036163330078
25 57.446319580078125
26 27.24831771850586
27 56.91810607910156
28 26.6308650970459
29 56.387474060058594
30 39.59881591796875
31 38.3515739440918
32 24.11873435974121
33 49.98106002807617
34 49.13752746582031
35 48.09789276123047
36 53.18637466430664
37 21.792524337768555
38 44.621768951416016
39 20.74881362915039
40 29.225149154663086
41 28.5273380279541
42 40.31317901611328
43 26.860628128051758
44 18.201810836791992
45 44.756046295166016
46 35.72251510620117
47 17.067745208740

In [2]:
model(x)[1:5]

Variable containing:
-1.2218
-0.4829
-0.2787
 0.1339
[torch.FloatTensor of size 4x1]

In [3]:
model(x)[1:5] # another run

Variable containing:
-1.2218
-0.4829
-0.2787
 0.1339
[torch.FloatTensor of size 4x1]

In [4]:
model(x)[1:5]

Variable containing:
-1.3516
-0.5395
-0.2080
 0.1541
[torch.FloatTensor of size 4x1]

Looks consistent! Let's now try to see what's happening inside

In [5]:
model(x, verbose = True)[1:5]

The number of layers for this run is 3


Variable containing:
-1.3516
-0.5395
-0.2080
 0.1541
[torch.FloatTensor of size 4x1]

In [6]:
model(x, verbose = True)[1:5]

The number of layers for this run is 1


Variable containing:
-1.3114
-0.5785
-0.2852
 0.0950
[torch.FloatTensor of size 4x1]

In [7]:
model(x, verbose = True)[1:5]

The number of layers for this run is 0


Variable containing:
-1.2218
-0.4829
-0.2787
 0.1339
[torch.FloatTensor of size 4x1]

In [8]:
model(x, verbose = True)[1:5]

The number of layers for this run is 0


Variable containing:
-1.2218
-0.4829
-0.2787
 0.1339
[torch.FloatTensor of size 4x1]

In [9]:
model(x, verbose = True)[1:5]

The number of layers for this run is 2


Variable containing:
-1.3534
-0.5795
-0.2706
 0.1067
[torch.FloatTensor of size 4x1]

So what's the target?

In [10]:
y[1:5]

Variable containing:
-1.3277
-0.6187
-0.2690
 0.1163
[torch.FloatTensor of size 4x1]