
cs231n - Lecture 6. Hardware and Software

Deep Learning Software

  • The point of deep learning frameworks
    (1) Quick to develop and test new ideas
    (2) Automatically compute gradients
    (3) Run it all efficiently on GPU (wrap cuDNN, cuBLAS, OpenCL, etc)

Computational graph example

import numpy as np
np.random.seed(0)

N, D = 3, 4

x = np.random.randn(N, D)
y = np.random.randn(N, D)
z = np.random.randn(N, D)

a = x * y							# forward pass
b = a + z
c = np.sum(b)

grad_c = 1.0						# backward pass: gradients computed by hand
grad_b = grad_c * np.ones((N, D))
grad_a = grad_b.copy()
grad_z = grad_b.copy()
grad_x = grad_a * y
grad_y = grad_a * x
  • in NumPy
    Good: clean API, easy to write numeric code
    Bad: we have to compute our own gradients by hand, and it can't run on the GPU
import torch

device = 'cuda:0'
N, D = 3, 4
x = torch.randn(N, D, requires_grad=True,
				device=device)
y = torch.randn(N, D, device=device)
z = torch.randn(N, D, device=device)

a = x * y
b = a + z
c = torch.sum(b)

c.backward()
print(x.grad)
  • in PyTorch
    PyTorch computes the gradients for us
    Can run on the GPU (a device-selection sketch follows below)
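
A minimal device-selection sketch, assuming nothing beyond the stock torch API; the tensor names are illustrative and the code falls back to the CPU when no GPU is visible:

import torch

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

N, D = 3, 4
x = torch.randn(N, D, requires_grad=True, device=device)
y = torch.randn(N, D, device=device)

c = (x * y).sum()
c.backward()
print(x.grad)		# gradients live on the same device as x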

PyTorch: Fundamental Concepts

  • torch.Tensor: Like a numpy array, but can run on GPU
  • torch.autograd: Package for building computational graphs out of Tensors, and automatically computing gradients
  • torch.nn.Module: A neural network layer; may store state or learnable weights
  • These examples use PyTorch version 1.7.

PyTorch: Autograd

# Running example: Train a two-layer ReLU network on random data with L2 loss
import torch

N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
w1 = torch.randn(D_in, H, requires_grad=True)	# enables autograd
w2 = torch.randn(H, D_out, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
	h = x.mm(w1)							# Forward pass
	h_relu = h.clamp(min=0)					# no need to track intermediate values
	y_pred = h_relu.mm(w2)					# = x.mm(w1).clamp(min=0).mm(w2)
	loss = (y_pred - y).pow(2).sum()
	
	loss.backward()							# Compute gradients of the loss w.r.t. w1 and w2

	with torch.no_grad():					# Gradient descent
		w1 -= learning_rate * w1.grad			
		w2 -= learning_rate * w2.grad
		w1.grad.zero_()
		w2.grad.zero_()

Or you can define your own autograd functions:

class MyReLU(torch.autograd.Function):
	@staticmethod
	def forward(ctx, x):	#Use ctx object to “cache” values for the backward pass
		ctx.save_for_backward(x)
		return x.clamp(min=0)
	
	@staticmethod
	def backward(ctx, grad_y):
		x, = ctx.saved_tensors
		grad_input = grad_y.clone()
		grad_input[x < 0] = 0
		return grad_input
		
def my_relu(x):		# a helper function to make it easy to use the new function
	return MyReLU.apply(x)
  • Now we can replace y_pred = x.mm(w1).clamp(min=0).mm(w2) with y_pred = my_relu(x.mm(w1)).mm(w2). In practice, define a custom Function only when you actually need a custom backward pass (a gradient-check sketch follows below).
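
One way to sanity-check a custom backward pass is torch.autograd.gradcheck, which compares the analytic gradient against a numerical estimate. A hedged sketch using the MyReLU defined above (gradcheck wants double-precision inputs, and ReLU's kink at 0 is essentially never hit with random data):

import torch
from torch.autograd import gradcheck

x_test = torch.randn(8, 8, dtype=torch.double, requires_grad=True)
print(gradcheck(MyReLU.apply, (x_test,)))	# True if analytic and numeric gradients agree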

PyTorch: nn

# Higher-level wrapper for working with neural nets
import torch

N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
			torch.nn.Linear(D_in, H),
			torch.nn.ReLU(),
			torch.nn.Linear(H, D_out))

learning_rate = 1e-2
for t in range(500):
	y_pred = model(x)
	loss = torch.nn.functional.mse_loss(y_pred, y)
	
	loss.backward()
	
	with torch.no_grad():
		for param in model.parameters():
			param -= learning_rate * param.grad
	model.zero_grad()

PyTorch: optim

import torch

N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
			torch.nn.Linear(D_in, H),
			torch.nn.ReLU(),
			torch.nn.Linear(H, D_out))

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(),
				lr=learning_rate)	# different update rules

for t in range(500):
	y_pred = model(x)
	loss = torch.nn.functional.mse_loss(y_pred, y)
	
	loss.backward()
	optimizer.step()
	optimizer.zero_grad()
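
Optimizers are interchangeable. A hedged sketch that reuses the model, x, and y above but swaps in SGD with momentum plus a step-decay learning-rate schedule; the momentum, step_size, and gamma values are arbitrary choices, not the lecture's:

optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)

for t in range(500):
	loss = torch.nn.functional.mse_loss(model(x), y)
	loss.backward()
	optimizer.step()
	optimizer.zero_grad()
	scheduler.step()						# decay the learning rate by 10x every 100 steps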

PyTorch: Define new Modules

import torch

class TwoLayerNet(torch.nn.Module):
	def __init__(self, D_in, H, D_out):		# init sets up two children
		super(TwoLayerNet, self).__init__()
		self.linear1 = torch.nn.Linear(D_in, H)
		self.linear2 = torch.nn.Linear(H, D_out)
	
	def forward(self, x):
		h_relu = self.linear1(x).clamp(min=0)
		y_pred = self.linear2(h_relu)
		return y_pred

N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = TwoLayerNet(D_in, H, D_out)

optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
	y_pred = model(x)
	loss = torch.nn.functional.mse_loss(y_pred, y)
	
	loss.backward()
	optimizer.step()
	optimizer.zero_grad()
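
Custom Modules compose with the built-in ones. A hedged sketch that stacks the TwoLayerNet above inside nn.Sequential; the extra ReLU and Linear layers are illustrative, not part of the lecture's example:

stacked = torch.nn.Sequential(
			TwoLayerNet(D_in, H, H),
			torch.nn.ReLU(),
			torch.nn.Linear(H, D_out))

y_pred = stacked(x)							# parameters of all children are registered automatically
print(sum(p.numel() for p in stacked.parameters()))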

PyTorch: Pretrained Models

import torch
import torchvision

alexnet = torchvision.models.alexnet(pretrained=True)
vgg16 = torchvision.models.vgg16(pretrained=True)
resnet101 = torchvision.models.resnet101(pretrained=True)
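
A common next step is feature extraction: freeze the pretrained weights and replace the final classifier for a new task. A hedged sketch building on the resnet101 above; the 10-class head and the Adam learning rate are arbitrary examples:

for param in resnet101.parameters():		# freeze all pretrained weights
	param.requires_grad = False

resnet101.fc = torch.nn.Linear(resnet101.fc.in_features, 10)		# new 10-class head (illustrative)
optimizer = torch.optim.Adam(resnet101.fc.parameters(), lr=1e-3)	# train only the new head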

TensorFlow 2.4

  • Dynamic graphs by default; static graphs are optional (via @tf.function, below).
import numpy as np
import tensorflow as tf

N, D, H = 64, 1000, 100

x = tf.convert_to_tensor(np.random.randn(N, D), np.float32)
y = tf.convert_to_tensor(np.random.randn(N, D), np.float32)
w1 = tf.Variable(tf.random.uniform((D, H))) # weights
w2 = tf.Variable(tf.random.uniform((H, D))) # weights

learning_rate = 1e-6
for t in range(50):
	with tf.GradientTape() as tape:					# build dynamic graph
		h = tf.maximum(tf.matmul(x, w1), 0)			# forward pass
		y_pred = tf.matmul(h, w2)
		diff = y_pred - y
		loss = tf.reduce_mean(tf.reduce_sum(diff ** 2, axis=1))
	gradients = tape.gradient(loss, [w1, w2])		# backward pass
	w1.assign(w1 - learning_rate * gradients[0])	# gradient descent
	w2.assign(w2 - learning_rate * gradients[1])
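
One detail worth knowing: GradientTape automatically watches tf.Variables, but to differentiate with respect to a plain tensor you have to watch it explicitly. A minimal sketch reusing N, D, and w1 from above:

x0 = tf.convert_to_tensor(np.random.randn(N, D), np.float32)
with tf.GradientTape() as tape:
	tape.watch(x0)							# x0 is not a Variable, so watch it explicitly
	out = tf.reduce_sum(tf.matmul(x0, w1))
grad_x0 = tape.gradient(out, x0)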

Keras: High-level Wrapper

import numpy as np
import tensorflow as tf

N, D, H = 64, 1000, 100

x = tf.convert_to_tensor(np.random.randn(N, D), np.float32)
y = tf.convert_to_tensor(np.random.randn(N, D), np.float32)
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(H, input_shape=(D,),
			activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(D))
optimizer = tf.optimizers.SGD(1e-1)

losses = []
for t in range(50):
	with tf.GradientTape() as tape:
		y_pred = model(x)
		loss = tf.losses.MeanSquaredError()(y_pred, y)
	gradients = tape.gradient(
		loss, model.trainable_variables)
	optimizer.apply_gradients(
		zip(gradients, model.trainable_variables))
  • We can swap in different update rules from tf.optimizers and use predefined loss functions from tf.losses as well (a short sketch follows below).
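
A minimal sketch, reusing the model, x, and y from the Keras example above; the Adam learning rate and the Huber loss are arbitrary choices:

optimizer = tf.optimizers.Adam(1e-3)		# a different update rule
loss_fn = tf.keras.losses.Huber()			# a different predefined loss

with tf.GradientTape() as tape:
	loss = loss_fn(y, model(x))
gradients = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(gradients, model.trainable_variables))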

  • Keras can also handle the training loop for us:

N, D, H = 64, 1000, 100

x = tf.convert_to_tensor(np.random.randn(N, D), np.float32)
y = tf.convert_to_tensor(np.random.randn(N, D), np.float32)
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(H, input_shape=(D,),
			activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(D))
optimizer = tf.optimizers.SGD(1e-1)
model.compile(loss=tf.keras.losses.MeanSquaredError(),
			 optimizer=optimizer)
history = model.fit(x, y, epochs=50, batch_size=N)

TensorFlow: compile static graph

	...
model.add(tf.keras.layers.Dense(D))
optimizer = tf.optimizers.SGD(1e-1)

@tf.function
def model_func(x, y):
	y_pred = model(x)
	loss = tf.losses.MeanSquaredError()(y_pred, y)
	return y_pred, loss

for t in range(50):
	...
  • The @tf.function decorator compiles a Python function into a static graph for better performance (a fuller sketch follows below).
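
A fuller hedged sketch of the same idea, compiling one complete training step; it reuses the model, optimizer, x, and y defined earlier and is an assumption about how the elided loop might look, not the lecture's exact code:

@tf.function								# traced once, then executed as a static graph
def train_step(x, y):
	with tf.GradientTape() as tape:
		y_pred = model(x)
		loss = tf.losses.MeanSquaredError()(y, y_pred)
	gradients = tape.gradient(loss, model.trainable_variables)
	optimizer.apply_gradients(zip(gradients, model.trainable_variables))
	return loss

for t in range(50):
	loss = train_step(x, y)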

Dynamic vs. Static

  • Dynamic Computation Graphs: building the graph and computing on it happen at the same time.
    Graph building and execution are intertwined, so the code that builds the graph always has to be kept around.
    This can be inefficient, especially when we build the same graph over and over again.

  • Static Computation Graphs:
    Build a computational graph describing the whole computation (including the paths needed for backprop)
    Reuse the same graph on every iteration
    Once the graph is built, it can be serialized and run without the code that built it
    The framework can optimize the graph before it runs

PyTorch vs. TensorFlow

  • PyTorch
    Dynamic graphs by default
    Static: ONNX, TorchScript (sketched below)

  • TensorFlow
    Dynamic: eager execution (the default)
    Static: @tf.function
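
A hedged TorchScript sketch, reusing the TwoLayerNet and sizes from earlier; the file name is arbitrary. Tracing records a static graph that can be serialized and later loaded without the Python class definition:

import torch

N, D_in, H, D_out = 64, 1000, 100, 10
model = TwoLayerNet(D_in, H, D_out)
example = torch.randn(N, D_in)

traced = torch.jit.trace(model, example)	# record a static graph from one example run
traced.save('two_layer_net.pt')				# serialize graph + weights

loaded = torch.jit.load('two_layer_net.pt')	# no Python source needed to run this
y_pred = loaded(example)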

Model Parallel vs. Data Parallel

  • Model parallelism:
    split computation graph into parts and distribute to GPUs/nodes

  • Data parallelism:
    split the minibatch into chunks and distribute them to GPUs/nodes
    PyTorch: nn.DataParallel, nn.DistributedDataParallel (see the sketch below)
    TensorFlow: tf.distribute.Strategy
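
A minimal data-parallel sketch in PyTorch, reusing the TwoLayerNet from earlier; it assumes at least one visible GPU and simply skips the wrapper otherwise. In practice nn.DistributedDataParallel is usually preferred, but nn.DataParallel is the shortest thing to show:

import torch

N, D_in, H, D_out = 64, 1000, 100, 10
model = TwoLayerNet(D_in, H, D_out).to('cuda')
if torch.cuda.device_count() > 1:
	model = torch.nn.DataParallel(model)	# splits each minibatch across the visible GPUs

x = torch.randn(N, D_in, device='cuda')
y_pred = model(x)							# chunks of x are processed on different GPUs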