PyTorch CookBook

待整理

参考: https://zhuanlan.zhihu.com/p/59205847

本文代码基于PyTorch 1.0版本,需要用到以下包

import collections
import os
import shutil
import tqdm

import numpy as np
import PIL.Image
import torch
import torchvision

  1. 基础配置
    检查PyTorch版本

torch.__version__ # PyTorch version
torch.version.cuda # Corresponding CUDA version
torch.backends.cudnn.version() # Corresponding cuDNN version
torch.cuda.get_device_name(0) # GPU type
更新PyTorch

PyTorch将被安装在anaconda3/lib/python3.7/site-packages/torch/目录下。

conda update pytorch torchvision -c pytorch
固定随机种子

torch.manual_seed(0)
torch.cuda.manual_seed_all(0)
指定程序运行在特定GPU卡上

在命令行指定环境变量

CUDA_VISIBLE_DEVICES=0,1 python train.py
或在代码中指定

os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
判断是否有CUDA支持

torch.cuda.is_available()
设置为cuDNN benchmark模式

Benchmark模式会提升计算速度,但是由于计算中有随机性,每次网络前馈结果略有差异。

torch.backends.cudnn.benchmark = True
如果想要避免这种结果波动,设置

torch.backends.cudnn.deterministic = True
清除GPU存储

有时Control-C中止运行后GPU存储没有及时释放,需要手动清空。在PyTorch内部可以

torch.cuda.empty_cache()
或在命令行可以先使用ps找到程序的PID,再使用kill结束该进程

ps aux | grep python
kill -9 [pid]
或者直接重置没有被清空的GPU

nvidia-smi --gpu-reset -i [gpu_id]

  2. 张量处理
    张量基本信息

tensor.type() # Data type
tensor.size() # Shape of the tensor. It is a subclass of Python tuple
tensor.dim() # Number of dimensions.
数据类型转换

Set default tensor type. Float in PyTorch is much faster than double.

torch.set_default_tensor_type(torch.FloatTensor)

Type conversions.

tensor = tensor.cuda()
tensor = tensor.cpu()
tensor = tensor.float()
tensor = tensor.long()
torch.Tensor与np.ndarray转换

torch.Tensor -> np.ndarray.

ndarray = tensor.cpu().numpy()

np.ndarray -> torch.Tensor.

tensor = torch.from_numpy(ndarray).float()
tensor = torch.from_numpy(ndarray.copy()).float() # If ndarray has negative stride
torch.Tensor与PIL.Image转换

PyTorch中的张量默认采用N×D×H×W的顺序,并且数据范围在[0, 1],需要进行转置和规范化。

torch.Tensor -> PIL.Image.

image = PIL.Image.fromarray(torch.clamp(tensor * 255, min=0, max=255
).byte().permute(1, 2, 0).cpu().numpy())
image = torchvision.transforms.functional.to_pil_image(tensor) # Equivalently way

PIL.Image -> torch.Tensor.

tensor = torch.from_numpy(np.asarray(PIL.Image.open(path))
).permute(2, 0, 1).float() / 255
tensor = torchvision.transforms.functional.to_tensor(PIL.Image.open(path)) # Equivalently way
np.ndarray与PIL.Image转换

np.ndarray -> PIL.Image.

image = PIL.Image.fromarray(ndarray.astype(np.uint8))

PIL.Image -> np.ndarray.

ndarray = np.asarray(PIL.Image.open(path))
从只包含一个元素的张量中提取值

这在训练时统计loss的变化过程中特别有用。否则这将累积计算图,使GPU存储占用量越来越大。

value = tensor.item()
张量形变

张量形变常常需要用于将卷积层特征输入全连接层的情形。相比torch.view,torch.reshape可以自动处理输入张量不连续的情况。

tensor = torch.reshape(tensor, shape)
打乱顺序

tensor = tensor[torch.randperm(tensor.size(0))] # Shuffle the first dimension
水平翻转

PyTorch不支持tensor[::-1]这样的负步长操作,水平翻转可以用张量索引实现。

Assume tensor has shape N*D*H*W.

tensor = tensor[:, :, :, torch.arange(tensor.size(3) - 1, -1, -1).long()]
复制张量

有三种复制的方式,对应不同的需求。

Operation | New/Shared memory | Still in computation graph |

tensor.clone() # | New | Yes |
tensor.detach() # | Shared | No |
tensor.detach().clone() # | New | No |
拼接张量

注意torch.cat和torch.stack的区别在于torch.cat沿着给定的维度拼接,而torch.stack会新增一维。例如当参数是3个10×5的张量,torch.cat的结果是30×5的张量,而torch.stack的结果是3×10×5的张量。

tensor = torch.cat(list_of_tensors, dim=0)
tensor = torch.stack(list_of_tensors, dim=0)
将整数标记转换成独热(one-hot)编码

PyTorch中的标记默认从0开始。

N = tensor.size(0)
one_hot = torch.zeros(N, num_classes).long()
one_hot.scatter_(dim=1, index=torch.unsqueeze(tensor, dim=1), src=torch.ones(N, num_classes).long())
得到非零/零元素

torch.nonzero(tensor) # Index of non-zero elements
torch.nonzero(tensor == 0) # Index of zero elements
torch.nonzero(tensor).size(0) # Number of non-zero elements
torch.nonzero(tensor == 0).size(0) # Number of zero elements
判断两个张量相等

torch.allclose(tensor1, tensor2) # float tensor
torch.equal(tensor1, tensor2) # int tensor
张量扩展

Expand tensor of shape 64*512 to shape 64*512*7*7.

torch.reshape(tensor, (64, 512, 1, 1)).expand(64, 512, 7, 7)
矩阵乘法

Matrix multiplication: (m*n) * (n*p) -> (m*p).

result = torch.mm(tensor1, tensor2)

Batch matrix multiplication: (b*m*n) * (b*n*p) -> (b*m*p).

result = torch.bmm(tensor1, tensor2)

Element-wise multiplication.

result = tensor1 * tensor2
计算两组数据之间的两两欧式距离

X1 is of shape m*d.

X1 = torch.unsqueeze(X1, dim=1).expand(m, n, d)

X2 is of shape n*d.

X2 = torch.unsqueeze(X2, dim=0).expand(m, n, d)

dist is of shape m*n, where dist[i][j] = sqrt(|X1[i, :] - X2[j, :]|^2)

dist = torch.sqrt(torch.sum((X1 - X2) ** 2, dim=2))

  3. 模型定义
    卷积层

最常用的卷积层配置是

conv = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=True)
conv = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=True)
如果卷积层配置比较复杂,不方便计算输出大小时,可以利用如下可视化工具辅助

Convolution Visualizer

ezyang.github.io
GAP(Global average pooling)层

gap = torch.nn.AdaptiveAvgPool2d(output_size=1)
双线性汇合(bilinear pooling)[1]

X = torch.reshape(X, (N, D, H * W)) # Assume X has shape N*D*H*W
X = torch.bmm(X, torch.transpose(X, 1, 2)) / (H * W) # Bilinear pooling
assert X.size() == (N, D, D)
X = torch.reshape(X, (N, D * D))
X = torch.sign(X) * torch.sqrt(torch.abs(X) + 1e-5) # Signed-sqrt normalization
X = torch.nn.functional.normalize(X) # L2 normalization
多卡同步BN(Batch normalization)

当使用torch.nn.DataParallel将代码运行在多张GPU卡上时,PyTorch的BN层默认操作是各卡上数据独立地计算均值和标准差,同步BN使用所有卡上的数据一起计算BN层的均值和标准差,缓解了当批量大小(batch size)比较小时对均值和标准差估计不准的情况,是在目标检测等任务中一个有效的提升性能的技巧。

vacancy/Synchronized-BatchNorm-PyTorch

github.com
图标
现在PyTorch官方已经支持同步BN操作

sync_bn = torch.nn.SyncBatchNorm(num_features, eps=1e-05, momentum=0.1, affine=True,
track_running_stats=True)
将已有网络的所有BN层改为同步BN层

def convertBNtoSyncBN(module, process_group=None):
    """Recursively replace all BN layers with SyncBN layers.

    Args:
        module (torch.nn.Module): Network (or a single layer) to convert.
        process_group: Passed through unchanged to torch.nn.SyncBatchNorm.

    Returns:
        A new torch.nn.SyncBatchNorm when `module` itself is a BN layer;
        otherwise `module`, with its BN descendants replaced in place.
    """
    if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
        sync_bn = torch.nn.SyncBatchNorm(module.num_features, module.eps, module.momentum,
                                         module.affine, module.track_running_stats, process_group)
        # Buffers may be re-bound directly to the existing tensors.
        sync_bn.running_mean = module.running_mean
        sync_bn.running_var = module.running_var
        if hasattr(module, 'num_batches_tracked'):
            sync_bn.num_batches_tracked = module.num_batches_tracked
        if module.affine:
            # A Parameter slot only accepts a Parameter (or None), so the
            # detached copies have to be wrapped before assignment.
            sync_bn.weight = torch.nn.Parameter(
                module.weight.clone().detach(), requires_grad=module.weight.requires_grad)
            sync_bn.bias = torch.nn.Parameter(
                module.bias.clone().detach(), requires_grad=module.bias.requires_grad)
        return sync_bn

    # Not a BN layer: convert every child and re-attach it under the same name.
    for name, child_module in module.named_children():
        setattr(module, name, convertBNtoSyncBN(child_module, process_group=process_group))
    return module

类似BN滑动平均

如果要实现类似BN滑动平均的操作,在forward函数中要使用原地(inplace)操作给滑动平均赋值。

class BN(torch.nn.Module):
    def __init__(self):
        ...
        self.register_buffer('running_mean', torch.zeros(num_features))

    def forward(self, X):
        ...
        self.running_mean += momentum * (current - self.running_mean)

计算模型整体参数量

num_parameters = sum(torch.numel(parameter) for parameter in model.parameters())
类似Keras的model.summary()输出模型信息

sksq96/pytorch-summary

github.com
图标
模型权值初始化

注意model.modules()和model.children()的区别:model.modules()会迭代地遍历模型的所有子层,而model.children()只会遍历模型下的一层。

Common practise for initialization.

for layer in model.modules():
    if isinstance(layer, torch.nn.Conv2d):
        torch.nn.init.kaiming_normal_(layer.weight, mode='fan_out',
                                      nonlinearity='relu')
        if layer.bias is not None:
            torch.nn.init.constant_(layer.bias, val=0.0)
    elif isinstance(layer, torch.nn.BatchNorm2d):
        torch.nn.init.constant_(layer.weight, val=1.0)
        torch.nn.init.constant_(layer.bias, val=0.0)
    elif isinstance(layer, torch.nn.Linear):
        torch.nn.init.xavier_normal_(layer.weight)
        if layer.bias is not None:
            torch.nn.init.constant_(layer.bias, val=0.0)

Initialization with given tensor.

layer.weight = torch.nn.Parameter(tensor)
部分层使用预训练模型

注意如果保存的模型是torch.nn.DataParallel,则当前的模型也需要是torch.nn.DataParallel。torch.nn.DataParallel(model).module == model。

model.load_state_dict(torch.load('model.pth'), strict=False)
将在GPU保存的模型加载到CPU

model.load_state_dict(torch.load('model.pth', map_location='cpu'))

  4. 数据准备、特征提取与微调
    图像分块打散(image shuffle)/区域混淆机制(region confusion mechanism,RCM)[2]

X is torch.Tensor of size N*D*H*W.

Shuffle rows

Q = (torch.unsqueeze(torch.arange(num_blocks), dim=1) * torch.ones(1, num_blocks).long()

 + torch.randint(low=-neighbour, high=neighbour, size=(num_blocks, num_blocks)))

Q = torch.argsort(Q, dim=0)
assert Q.size() == (num_blocks, num_blocks)

X = [torch.chunk(row, chunks=num_blocks, dim=2)
for row in torch.chunk(X, chunks=num_blocks, dim=1)]
X = [[X[Q[i, j].item()][j] for j in range(num_blocks)]
for i in range(num_blocks)]

Shuffle columns.

Q = (torch.ones(num_blocks, 1).long() * torch.unsqueeze(torch.arange(num_blocks), dim=0)

 + torch.randint(low=-neighbour, high=neighbour, size=(num_blocks, num_blocks)))

Q = torch.argsort(Q, dim=1)
assert Q.size() == (num_blocks, num_blocks)
X = [[X[i][Q[i, j].item()] for j in range(num_blocks)]
for i in range(num_blocks)]

Y = torch.cat([torch.cat(row, dim=2) for row in X], dim=1)
得到视频数据基本信息

import cv2
video = cv2.VideoCapture(mp4_path)
height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
fps = int(video.get(cv2.CAP_PROP_FPS))
video.release()
TSN每段(segment)采样一帧视频[3]

K = self._num_segments
if is_train:
if num_frames > K:

    # Random index for each segment.
    frame_indices = torch.randint(
        high=num_frames // K, size=(K,), dtype=torch.long)
    frame_indices += num_frames // K * torch.arange(K)
else:
    frame_indices = torch.randint(
        high=num_frames, size=(K - num_frames,), dtype=torch.long)
    frame_indices = torch.sort(torch.cat((
        torch.arange(num_frames), frame_indices)))[0]

else:
if num_frames > K:

    # Middle index for each segment.
    frame_indices = num_frames / K // 2
    frame_indices += num_frames // K * torch.arange(K)
else:
    frame_indices = torch.sort(torch.cat((                              
        torch.arange(num_frames), torch.arange(K - num_frames))))[0]

assert frame_indices.size() == (K,)
return [frame_indices[i] for i in range(K)]
提取ImageNet预训练模型某层的卷积特征

VGG-16 relu5-3 feature.

model = torchvision.models.vgg16(pretrained=True).features[:-1]

VGG-16 pool5 feature.

model = torchvision.models.vgg16(pretrained=True).features

VGG-16 fc7 feature.

model = torchvision.models.vgg16(pretrained=True)
model.classifier = torch.nn.Sequential(*list(model.classifier.children())[:-3])

ResNet GAP feature.

model = torchvision.models.resnet18(pretrained=True)
model = torch.nn.Sequential(collections.OrderedDict(
list(model.named_children())[:-1]))

with torch.no_grad():
model.eval()
conv_representation = model(image)
提取ImageNet预训练模型多层的卷积特征

class FeatureExtractor(torch.nn.Module):
    """Helper class to extract several convolution features from the given
    pre-trained model.

    The wrapped model is put into eval mode on construction, and the forward
    pass runs under torch.no_grad(). Only the direct children of the model
    are visited, in registration order, each one fed the previous output.

    Attributes:
        _model, torch.nn.Module.
        _layers_to_extract, set<str>. Names of the direct children whose
            outputs are collected.

    Example:
        >>> model = torchvision.models.resnet152(pretrained=True)
        >>> model = torch.nn.Sequential(collections.OrderedDict(
                list(model.named_children())[:-1]))
        >>> conv_representation = FeatureExtractor(
                pretrained_model=model,
                layers_to_extract={'layer1', 'layer2', 'layer3', 'layer4'})(image)
    """
    def __init__(self, pretrained_model, layers_to_extract):
        torch.nn.Module.__init__(self)
        self._model = pretrained_model
        self._model.eval()
        self._layers_to_extract = set(layers_to_extract)

    def forward(self, x):
        """Return the outputs of the selected children, in traversal order."""
        with torch.no_grad():
            conv_representation = []
            for name, layer in self._model.named_children():
                x = layer(x)
                if name in self._layers_to_extract:
                    conv_representation.append(x)
            return conv_representation

其他预训练模型

Cadene/pretrained-models.pytorch

github.com
图标
微调全连接层

model = torchvision.models.resnet18(pretrained=True)
for param in model.parameters():
    param.requires_grad = False
model.fc = torch.nn.Linear(512, 100) # Replace the last fc layer
optimizer = torch.optim.SGD(model.fc.parameters(), lr=1e-2, momentum=0.9, weight_decay=1e-4)
以较大学习率微调全连接层,较小学习率微调卷积层

model = torchvision.models.resnet18(pretrained=True)
finetuned_parameters = list(map(id, model.fc.parameters()))
conv_parameters = (p for p in model.parameters() if id(p) not in finetuned_parameters)
parameters = [{‘params’: conv_parameters, ‘lr’: 1e-3},
{‘params’: model.fc.parameters()}]
optimizer = torch.optim.SGD(parameters, lr=1e-2, momentum=0.9, weight_decay=1e-4)

  5. 模型训练
    常用训练和验证数据预处理

其中ToTensor操作会将PIL.Image或形状为H×W×D,数值范围为[0, 255]的np.ndarray转换为形状为D×H×W,数值范围为[0.0, 1.0]的torch.Tensor。

train_transform = torchvision.transforms.Compose([
torchvision.transforms.RandomResizedCrop(size=224,
scale=(0.08, 1.0)),
torchvision.transforms.RandomHorizontalFlip(),
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize(mean=(0.485, 0.456, 0.406),
std=(0.229, 0.224, 0.225)),
])
val_transform = torchvision.transforms.Compose([
torchvision.transforms.Resize(256),
torchvision.transforms.CenterCrop(224),
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize(mean=(0.485, 0.456, 0.406),
std=(0.229, 0.224, 0.225)),
])
训练基本代码框架

for t in range(80):
    for images, labels in tqdm.tqdm(train_loader, desc='Epoch %3d' % (t + 1)):
        images, labels = images.cuda(), labels.cuda()
        scores = model(images)
        loss = loss_function(scores, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
标记平滑(label smoothing)[4]

for images, labels in train_loader:
images, labels = images.cuda(), labels.cuda()
N = labels.size(0)

# C is the number of classes.
smoothed_labels = torch.full(size=(N, C), fill_value=0.1 / (C - 1)).cuda()
smoothed_labels.scatter_(dim=1, index=torch.unsqueeze(labels, dim=1), value=0.9)

score = model(images)
log_prob = torch.nn.functional.log_softmax(score, dim=1)
loss = -torch.sum(log_prob * smoothed_labels) / N
optimizer.zero_grad()
loss.backward()
optimizer.step()

Mixup[5]

beta_distribution = torch.distributions.beta.Beta(alpha, alpha)
for images, labels in train_loader:
images, labels = images.cuda(), labels.cuda()

# Mixup images.
lambda_ = beta_distribution.sample([]).item()
index = torch.randperm(images.size(0)).cuda()
mixed_images = lambda_ * images + (1 - lambda_) * images[index, :]

# Mixup loss.    
scores = model(mixed_images)
loss = (lambda_ * loss_function(scores, labels)
        + (1 - lambda_) * loss_function(scores, labels[index]))

optimizer.zero_grad()
loss.backward()
optimizer.step()

L1正则化

l1_regularization = torch.nn.L1Loss(reduction=’sum’)
loss = … # Standard cross-entropy loss
for param in model.parameters():
loss += lambda_ * torch.sum(torch.abs(param))
loss.backward()
不对偏置项进行L2正则化/权值衰减(weight decay)

bias_list = (param for name, param in model.named_parameters() if name[-4:] == 'bias')
others_list = (param for name, param in model.named_parameters() if name[-4:] != 'bias')
parameters = [{'params': bias_list, 'weight_decay': 0},
              {'params': others_list}]
optimizer = torch.optim.SGD(parameters, lr=1e-2, momentum=0.9, weight_decay=1e-4)
梯度裁剪(gradient clipping)

torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=20)
计算Softmax输出的准确率

score = model(images)
prediction = torch.argmax(score, dim=1)
num_correct = torch.sum(prediction == labels).item()
accuracy = num_correct / labels.size(0)
可视化模型前馈的计算图

szagoruyko/pytorchviz

github.com
图标
可视化学习曲线

有Facebook自己开发的Visdom和Tensorboard(仍处于实验阶段)两个选择。

facebookresearch/visdom

github.com
图标
torch.utils.tensorboard - PyTorch master documentation

pytorch.org

Example using Visdom.

vis = visdom.Visdom(env='Learning curve', use_incoming_socket=False)
assert vis.check_connection()
vis.close()
options = collections.namedtuple(‘Options’, [‘loss’, ‘acc’, ‘lr’])(
loss={‘xlabel’: ‘Epoch’, ‘ylabel’: ‘Loss’, ‘showlegend’: True},
acc={‘xlabel’: ‘Epoch’, ‘ylabel’: ‘Accuracy’, ‘showlegend’: True},
lr={‘xlabel’: ‘Epoch’, ‘ylabel’: ‘Learning rate’, ‘showlegend’: True})

for t in range(80):
    train(...)
    val(...)
    vis.line(X=torch.Tensor([t + 1]), Y=torch.Tensor([train_loss]),
             name='train', win='Loss', update='append', opts=options.loss)
    vis.line(X=torch.Tensor([t + 1]), Y=torch.Tensor([val_loss]),
             name='val', win='Loss', update='append', opts=options.loss)
    vis.line(X=torch.Tensor([t + 1]), Y=torch.Tensor([train_acc]),
             name='train', win='Accuracy', update='append', opts=options.acc)
    vis.line(X=torch.Tensor([t + 1]), Y=torch.Tensor([val_acc]),
             name='val', win='Accuracy', update='append', opts=options.acc)
    vis.line(X=torch.Tensor([t + 1]), Y=torch.Tensor([lr]),
             win='Learning rate', update='append', opts=options.lr)
得到当前学习率

If there is one global learning rate (which is the common case).

lr = next(iter(optimizer.param_groups))[‘lr’]

If there are multiple learning rates for different layers.

all_lr = []
for param_group in optimizer.param_groups:
all_lr.append(param_group[‘lr’])
学习率衰减

Reduce learning rate when validation accuracy plateaus.

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode=’max’, patience=5, verbose=True)
for t in range(0, 80):
train(…); val(…)
scheduler.step(val_acc)

Cosine annealing learning rate.

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=80)

Reduce learning rate by 10 at given epochs.

scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[50, 70], gamma=0.1)
for t in range(0, 80):
scheduler.step()
train(…); val(…)

Learning rate warmup by 10 epochs.

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda t: t / 10)
for t in range(0, 10):
scheduler.step()
train(…); val(…)
保存与加载断点

注意为了能够恢复训练,我们需要同时保存模型和优化器的状态,以及当前的训练轮数。

Save checkpoint.

is_best = current_acc > best_acc
best_acc = max(best_acc, current_acc)
checkpoint = {
‘best_acc’: best_acc,
‘epoch’: t + 1,
‘model’: model.state_dict(),
‘optimizer’: optimizer.state_dict(),
}
model_path = os.path.join(‘model’, ‘checkpoint.pth.tar’)
torch.save(checkpoint, model_path)
if is_best:
    shutil.copy(model_path, os.path.join('model', 'best_checkpoint.pth.tar'))

Load checkpoint.

if resume:
model_path = os.path.join(‘model’, ‘checkpoint.pth.tar’)
assert os.path.isfile(model_path)
checkpoint = torch.load(model_path)
best_acc = checkpoint[‘best_acc’]
start_epoch = checkpoint[‘epoch’]
model.load_state_dict(checkpoint[‘model’])
optimizer.load_state_dict(checkpoint[‘optimizer’])
print(‘Load checkpoint at epoch %d.’ % start_epoch)
计算准确率、查准率(precision)、查全率(recall)

data[‘label’] and data[‘prediction’] are groundtruth label and prediction

for each image, respectively.

accuracy = np.mean(data[‘label’] == data[‘prediction’]) * 100

Compute precision and recall for each class.

for c in range(num_classes):
    tp = np.dot((data['label'] == c).astype(int),
                (data['prediction'] == c).astype(int))
    tp_fp = np.sum(data['prediction'] == c)
    tp_fn = np.sum(data['label'] == c)
    precision = tp / tp_fp * 100
    recall = tp / tp_fn * 100

  6. 模型测试
    计算每个类别的查准率(precision)、查全率(recall)、F1和总体指标

import sklearn.metrics

all_label = []
all_prediction = []
for images, labels in tqdm.tqdm(data_loader):

 # Data.
 images, labels = images.cuda(), labels.cuda()

 # Forward pass.
 score = model(images)

 # Save label and predictions.
 prediction = torch.argmax(score, dim=1)
 all_label.append(labels.cpu().numpy())
 all_prediction.append(prediction.cpu().numpy())

Compute RP and confusion matrix.

all_label = np.concatenate(all_label)
assert len(all_label.shape) == 1
all_prediction = np.concatenate(all_prediction)
assert all_label.shape == all_prediction.shape
micro_p, micro_r, micro_f1, _ = sklearn.metrics.precision_recall_fscore_support(
all_label, all_prediction, average=’micro’, labels=range(num_classes))
class_p, class_r, class_f1, class_occurence = sklearn.metrics.precision_recall_fscore_support(
all_label, all_prediction, average=None, labels=range(num_classes))

Ci,j = #{y=i and hat_y=j}

confusion_mat = sklearn.metrics.confusion_matrix(
all_label, all_prediction, labels=range(num_classes))
assert confusion_mat.shape == (num_classes, num_classes)
将各类结果写入电子表格

import csv

Write results onto disk.

with open(os.path.join(path, filename), ‘wt’, encoding=’utf-8’) as f:
f = csv.writer(f)
f.writerow([‘Class’, ‘Label’, ‘# occurence’, ‘Precision’, ‘Recall’, ‘F1’,
‘Confused class 1’, ‘Confused class 2’, ‘Confused class 3’,
‘Confused 4’, ‘Confused class 5’])
for c in range(num_classes):
index = np.argsort(confusion_mat[:, c])[::-1][:5]
f.writerow([
label2class[c], c, class_occurence[c], ‘%4.3f’ % class_p[c],
‘%4.3f’ % class_r[c], ‘%4.3f’ % class_f1[c],
‘%s:%d’ % (label2class[index[0]], confusion_mat[index[0], c]),
‘%s:%d’ % (label2class[index[1]], confusion_mat[index[1], c]),
‘%s:%d’ % (label2class[index[2]], confusion_mat[index[2], c]),
‘%s:%d’ % (label2class[index[3]], confusion_mat[index[3], c]),
‘%s:%d’ % (label2class[index[4]], confusion_mat[index[4], c])])
f.writerow([‘All’, ‘’, np.sum(class_occurence), micro_p, micro_r, micro_f1,
‘’, ‘’, ‘’, ‘’, ‘’])

  7. PyTorch其他注意事项
    模型定义

建议有参数的层和汇合(pooling)层使用torch.nn模块定义,激活函数直接使用torch.nn.functional。torch.nn模块和torch.nn.functional的区别在于,torch.nn模块在计算时底层调用了torch.nn.functional,但torch.nn模块包括该层参数,还可以应对训练和测试两种网络状态。使用torch.nn.functional时要注意网络状态,如
def forward(self, x):

x = torch.nn.functional.dropout(x, p=0.5, training=self.training)
model(x)前用model.train()和model.eval()切换网络状态。
不需要计算梯度的代码块用with torch.no_grad()包含起来。model.eval()和torch.no_grad()的区别在于,model.eval()是将网络切换为测试状态,例如BN和随机失活(dropout)在训练和测试阶段使用不同的计算方法。torch.no_grad()是关闭PyTorch张量的自动求导机制,以减少存储使用和加速计算,得到的结果无法进行loss.backward()。
torch.nn.CrossEntropyLoss的输入不需要经过Softmax。torch.nn.CrossEntropyLoss等价于torch.nn.functional.log_softmax + torch.nn.NLLLoss。
loss.backward()前用optimizer.zero_grad()清除累积梯度。optimizer.zero_grad()和model.zero_grad()效果一样。
PyTorch性能与调试

torch.utils.data.DataLoader中尽量设置pin_memory=True,对特别小的数据集如MNIST设置pin_memory=False反而更快一些。num_workers的设置需要在实验中找到最快的取值。
用del及时删除不用的中间变量,节约GPU存储。
使用inplace操作可节约GPU存储,如
x = torch.nn.functional.relu(x, inplace=True)
此外,还可以通过torch.utils.checkpoint前向传播时只保留一部分中间结果来节约GPU存储使用,在反向传播时需要的内容从最近中间结果中计算得到。

减少CPU和GPU之间的数据传输。例如如果你想知道一个epoch中每个mini-batch的loss和准确率,先将它们累积在GPU中等一个epoch结束之后一起传输回CPU会比每个mini-batch都进行一次GPU到CPU的传输更快。
使用半精度浮点数half()会有一定的速度提升,具体效率依赖于GPU型号。需要小心数值精度过低带来的稳定性问题。
时常使用assert tensor.size() == (N, D, H, W)作为调试手段,确保张量维度和你设想中一致。
除了标记y外,尽量少使用一维张量,使用n*1的二维张量代替,可以避免一些意想不到的一维张量计算结果。
统计代码各部分耗时
with torch.autograd.profiler.profile(enabled=True, use_cuda=False) as profile:

print(profile)
或者在命令行运行

python -m torch.utils.bottleneck main.py

实用工具

从网上各种资料加上自己实践的可用工具。

主要包括:

模型层数:print_layers_num

模型参数总量:print_model_parm_nums

模型的计算图:def print_autograd_graph():或者参见tensorboard

模型滤波器可视化:show_save_tensor

模型在具体的输入下的尺寸信息summary以及参数量:show_summary

模型计算量:print_model_parm_flops

格式较混乱,但上述代码均可用,后续会继续整理。

#coding:utf8
import torch
import torchvision

import torch.nn as nn
from torch.autograd import Variable
import torchvision.models as models

import numpy as np

def test():
    """Print a few attributes of a ResNet-18 conv layer and of a random tensor.

    Uses print() calls throughout so the function parses on Python 3 as well
    as Python 2 (each call passes a single argument, so the output is the same).
    """
    model = models.resnet18()
    first_conv = model.layer1[0].conv1
    print(first_conv.weight.data)

    print(first_conv.__class__)  # <class 'torch.nn.modules.conv.Conv2d'>
    print(first_conv.kernel_size)

    # Named `sample` rather than `input` to avoid shadowing the builtin.
    sample = torch.autograd.Variable(torch.randn(20, 16, 50, 100))
    print(sample.size())
    print(np.prod(sample.size()))


def print_model_parm_nums(model=None):
    """Print the total number of parameters of a model, in millions.

    Args:
        model: torch.nn.Module to inspect. When omitted, a freshly built
            torchvision AlexNet is used (the previously hard-coded model).
    """
    if model is None:
        model = models.alexnet()
    total = sum(param.nelement() for param in model.parameters())
    print(' + Number of params: %.2fM' % (total / 1e6))



def print_model_parm_flops(model=None, input_size=(3, 224, 224), multiply_adds=False):
    """Print an estimate of the FLOPs of one forward pass, in GFLOPs.

    Forward hooks are attached to every leaf Conv2d, Linear, BatchNorm2d,
    ReLU, MaxPool2d and AvgPool2d layer; one random input of batch size 1 is
    pushed through the model and the per-layer counts are summed. Layers of
    any other type contribute nothing to the total.

    Args:
        model: torch.nn.Module to measure. When omitted, a freshly built
            torchvision AlexNet is used (the previously hard-coded model).
        input_size: Shape of one input sample, without the batch dimension.
        multiply_adds: If True, count a multiply-accumulate as two operations
            instead of one.
    """
    if model is None:
        model = models.alexnet()

    ops_per_mac = 2 if multiply_adds else 1
    flops = {'conv': [], 'linear': [], 'bn': [], 'relu': [], 'pooling': []}

    def conv_hook(self, inputs, output):
        batch_size, _, _, _ = inputs[0].size()
        # output[0] is the first sample, so its size is (C, H, W).
        output_channels, output_height, output_width = output[0].size()
        kernel_ops = (self.kernel_size[0] * self.kernel_size[1]
                      * (self.in_channels // self.groups) * ops_per_mac)
        bias_ops = 1 if self.bias is not None else 0
        params = output_channels * (kernel_ops + bias_ops)
        flops['conv'].append(batch_size * params * output_height * output_width)

    def linear_hook(self, inputs, output):
        batch_size = inputs[0].size(0) if inputs[0].dim() == 2 else 1
        weight_ops = self.weight.nelement() * ops_per_mac
        # A Linear layer built with bias=False has self.bias set to None.
        bias_ops = self.bias.nelement() if self.bias is not None else 0
        flops['linear'].append(batch_size * (weight_ops + bias_ops))

    def bn_hook(self, inputs, output):
        flops['bn'].append(inputs[0].nelement())

    def relu_hook(self, inputs, output):
        flops['relu'].append(inputs[0].nelement())

    def pooling_hook(self, inputs, output):
        batch_size, _, _, _ = inputs[0].size()
        output_channels, output_height, output_width = output[0].size()
        # kernel_size is kept as given to the layer: an int or an (h, w) pair.
        kernel = self.kernel_size
        if isinstance(kernel, (tuple, list)):
            kernel_ops = kernel[0] * kernel[1]
        else:
            kernel_ops = kernel * kernel
        params = output_channels * kernel_ops
        flops['pooling'].append(batch_size * params * output_height * output_width)

    hook_table = (
        (torch.nn.Conv2d, conv_hook),
        (torch.nn.Linear, linear_hook),
        (torch.nn.BatchNorm2d, bn_hook),
        (torch.nn.ReLU, relu_hook),
        ((torch.nn.MaxPool2d, torch.nn.AvgPool2d), pooling_hook),
    )

    handles = []
    for layer in model.modules():
        # Only leaf modules (those without children) are instrumented.
        if next(layer.children(), None) is not None:
            continue
        for layer_types, hook in hook_table:
            if isinstance(layer, layer_types):
                handles.append(layer.register_forward_hook(hook))

    sample = torch.rand(1, *input_size)
    first_param = next(model.parameters(), None)
    if first_param is not None:
        sample = sample.to(first_param.device)

    # Run in eval mode so the random probe does not update BN running
    # statistics of a caller-supplied model; restore the mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            model(sample)
    finally:
        for handle in handles:
            handle.remove()
        model.train(was_training)

    total_flops = sum(sum(counts) for counts in flops.values())
    print(' + Number of FLOPs: %.2fG' % (total_flops / 1e9))



def print_forward():
    """Print the forward-pass input of one ResNet-18 conv layer.

    A forward hook on `layer1[0].conv1` stores the input tuple it receives
    during a single forward pass on a random 1x3x224x224 tensor.
    """
    model = torchvision.models.resnet18()
    select_layer = model.layer1[0].conv1

    # Holds forward inputs (not gradients), keyed by the name given to the hook.
    captured = {}

    def save_input(name):
        def hook(self, inputs, output):
            captured[name] = inputs
        return hook

    select_layer.register_forward_hook(save_input('select_layer'))

    sample = torch.rand(3, 224, 224).unsqueeze(0).requires_grad_(True)
    model(sample)
    print(captured)


def print_value():
    """Demonstrate tensor hooks by capturing and printing intermediate gradients.

    Builds z = (3 * x) ** 2 for a random 1x1 tensor x, registers hooks on y
    and z that store the gradients they receive, runs backward() and prints
    the captured values.
    """
    grads = {}

    def save_grad(name):
        # Returns a hook (a function) that keeps `name` and files the
        # gradient it is called with under that key.
        def hook(grad):
            grads[name] = grad
        return hook

    x = torch.randn(1, 1, requires_grad=True)
    y = 3 * x
    z = y ** 2

    y.register_hook(save_grad('y'))
    z.register_hook(save_grad('z'))
    z.backward()
    print('HW')
    print("grads['y']: {}".format(grads['y']))
    print(grads['z'])

def print_layers_num(model=None):
    """Print the number of leaf modules (modules without children) in a model.

    Args:
        model: torch.nn.Module to inspect. When omitted, a freshly built
            torchvision ResNet-18 is used (the previously hard-coded model).
    """
    if model is None:
        model = models.resnet18()

    def count_leaves(net):
        children = list(net.children())
        if not children:
            if isinstance(net, torch.nn.Conv2d):
                # Placeholder output kept from the original; this branch is
                # the place to tally a specific layer type.
                print(' ')
            return 1
        return sum(count_leaves(child) for child in children)

    print(count_leaves(model))


def check_summary(model=None):
    """Print a nested text summary of a model: layers, weight shapes, parameter counts.

    Args:
        model: torch.nn.Module to summarize. When omitted, a freshly built
            torchvision AlexNet is used (the previously hard-coded model).
    """
    from torch.nn.modules.module import _addindent

    # nn.Container is a legacy class; look it up defensively so the summary
    # still works on a torch build that does not provide it.
    container_types = tuple(
        cls for cls in (
            getattr(torch.nn.modules.container, 'Container', None),
            torch.nn.modules.container.Sequential,
        ) if cls is not None)

    def torch_summarize(model, show_weights=True, show_parameters=True):
        """Summarizes torch model by showing its parameters and weights."""
        pieces = [model.__class__.__name__ + ' (\n']
        for key, module in model._modules.items():
            # Exact-type match: only plain containers are expanded
            # recursively, everything else is shown through its own repr.
            if type(module) in container_types:
                modstr = torch_summarize(module, show_weights, show_parameters)
            else:
                modstr = module.__repr__()
            modstr = _addindent(modstr, 2)

            params = sum(p.numel() for p in module.parameters())
            weights = tuple(tuple(p.size()) for p in module.parameters())

            pieces.append(' (' + key + '): ' + modstr)
            if show_weights:
                pieces.append(', weights={}'.format(weights))
            if show_parameters:
                pieces.append(', parameters={}'.format(params))
            pieces.append('\n')

        pieces.append(')')
        return ''.join(pieces)

    if model is None:
        import torchvision.models as models
        model = models.alexnet()
    print(torch_summarize(model))

#https://gist.github.com/wassname/0fb8f95e4272e6bdd27bd7df386716b7
#summarize a torch model like in keras, showing parameters and output shape
def show_summary(model=None, input_size=(3, 224, 224)):
    """Print a Keras-style per-layer table with names, shapes and parameter counts.

    Adapted from wassname's gist (MIT license):
    https://gist.github.com/wassname/0fb8f95e4272e6bdd27bd7df386716b7
    which was itself modified from
    https://github.com/pytorch/pytorch/issues/2001#issuecomment-313735757

    Args:
        model: torch.nn.Module to summarize. When omitted, a freshly built
            torchvision AlexNet is used (the previously hard-coded model).
        input_size: Shape of one input sample without the batch dimension,
            or a list/tuple of such shapes for a model taking several inputs.
    """
    from collections import OrderedDict
    import pandas as pd

    import torch
    from torch import nn

    def get_names_dict(model):
        """Recursive walk returning {dotted path name: sub-module}."""
        names = {}

        def _get_names(module, parent_name=''):
            for key, child in module.named_children():
                name = parent_name + '.' + key if parent_name else key
                names[name] = child
                _get_names(child, parent_name=name)

        _get_names(model)
        return names

    def _shape(tensor):
        # Replace the batch dimension by -1, as in Keras summaries.
        return (-1, ) + tuple(tensor.size())[1:]

    def torch_summarize_df(input_size, model, weights=False, input_shape=True, nb_trainable=False):
        """Run one forward pass and return a DataFrame with one row per hooked layer.

        Usage:
            import torchvision.models as models
            model = models.alexnet()
            df = torch_summarize_df(input_size=(3, 224, 224), model=model)
            print(df)

            #           name class_name        input_shape       output_shape  nb_params
            # 1  features=>0     Conv2d  (-1, 3, 224, 224)   (-1, 64, 55, 55)      23296  # (3*11*11+1)*64
            # 2  features=>1       ReLU   (-1, 64, 55, 55)   (-1, 64, 55, 55)          0
            # ...
        """
        # Names are stored in the parent; only path+name is unique.
        names = get_names_dict(model)
        summary = OrderedDict()
        hooks = []

        def register_hook(module):
            def hook(module, inputs, output):
                name = ''
                for key, item in names.items():
                    if item is module:
                        name = key
                row = OrderedDict()
                row['name'] = name
                row['class_name'] = module.__class__.__name__
                if input_shape:
                    row['input_shape'] = _shape(inputs[0])
                # Some layers return several tensors; record every shape.
                if isinstance(output, (list, tuple)):
                    row['output_shape'] = [_shape(o) for o in output]
                else:
                    row['output_shape'] = _shape(output)
                if weights:
                    row['weights'] = [tuple(p.size()) for p in module.parameters()]
                if nb_trainable:
                    row['nb_trainable'] = sum(
                        p.numel() for p in module.parameters() if p.requires_grad)
                # Plain ints, so the table prints numbers rather than tensors.
                row['nb_params'] = sum(p.numel() for p in module.parameters())
                summary[len(summary) + 1] = row

            # Pure containers and the root model are skipped.
            if (not isinstance(module, (nn.Sequential, nn.ModuleList))
                    and module is not model):
                hooks.append(module.register_forward_hook(hook))

        # A list/tuple of shapes means the network takes multiple inputs.
        multiple_inputs = isinstance(input_size[0], (list, tuple))
        if multiple_inputs:
            x = [torch.rand(1, *in_size) for in_size in input_size]
        else:
            x = torch.rand(1, *input_size)

        # Follow the model onto the GPU; tolerate models without parameters.
        first_param = next(model.parameters(), None)
        if first_param is not None and first_param.is_cuda:
            x = [t.cuda() for t in x] if multiple_inputs else x.cuda()

        model.apply(register_hook)
        try:
            model(x)
        finally:
            # Always detach the hooks, even if the forward pass fails.
            for h in hooks:
                h.remove()

        return pd.DataFrame.from_dict(summary, orient='index')

    if model is None:
        import torchvision.models as models
        model = models.alexnet()
    df = torch_summarize_df(input_size=input_size, model=model)
    print(df)

# # Output
#     name          class_name  input_shape        output_shape       nb_params
# 1   features.0    Conv2d      (-1, 3, 224, 224)  (-1, 64, 55, 55)   23296  # nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2)
# 2   features.1    ReLU        (-1, 64, 55, 55)   (-1, 64, 55, 55)   0
# 3   features.2    MaxPool2d   (-1, 64, 55, 55)   (-1, 64, 27, 27)   0
# 4   features.3    Conv2d      (-1, 64, 27, 27)   (-1, 192, 27, 27)  307392
# 5   features.4    ReLU        (-1, 192, 27, 27)  (-1, 192, 27, 27)  0
# 6   features.5    MaxPool2d   (-1, 192, 27, 27)  (-1, 192, 13, 13)  0
# 7   features.6    Conv2d      (-1, 192, 13, 13)  (-1, 384, 13, 13)  663936
# 8   features.7    ReLU        (-1, 384, 13, 13)  (-1, 384, 13, 13)  0
# 9   features.8    Conv2d      (-1, 384, 13, 13)  (-1, 256, 13, 13)  884992
# 10  features.9    ReLU        (-1, 256, 13, 13)  (-1, 256, 13, 13)  0
# 11  features.10   Conv2d      (-1, 256, 13, 13)  (-1, 256, 13, 13)  590080
# 12  features.11   ReLU        (-1, 256, 13, 13)  (-1, 256, 13, 13)  0
# 13  features.12   MaxPool2d   (-1, 256, 13, 13)  (-1, 256, 6, 6)    0
# 14  classifier.0  Dropout     (-1, 9216)         (-1, 9216)         0
# 15  classifier.1  Linear      (-1, 9216)         (-1, 4096)         37752832
# 16  classifier.2  ReLU        (-1, 4096)         (-1, 4096)         0
# 17  classifier.3  Dropout     (-1, 4096)         (-1, 4096)         0
# 18  classifier.4  Linear      (-1, 4096)         (-1, 4096)         16781312
# 19  classifier.5  ReLU        (-1, 4096)         (-1, 4096)         0
# 20  classifier.6  Linear      (-1, 4096)         (-1, 1000)         4097000


def show_save_tensor():
    """Display and save the first-layer convolution kernels of ResNet-18.

    Loads torchvision's pretrained ResNet-18, takes the weight tensor of its
    first child module, shows it as an image grid with matplotlib and writes
    the same grid to ``test.png``.
    """
    from torchvision import utils
    import torchvision.models as models
    from matplotlib import pyplot as plt

    def _select_kernels(tensor, ch, all_kernels):
        # Shared by vis_tensor/save_tensor: pick what to draw from an
        # (n, c, h, w) weight tensor.
        n, c, h, w = tensor.shape
        if all_kernels:
            # One single-channel image per (filter, channel) pair. Height comes
            # before width so non-square kernels keep their layout.
            return tensor.reshape(n * c, -1, h, w)
        if c != 3:
            # Not drawable as RGB: show only the requested channel.
            return tensor[:, ch, :, :].unsqueeze(dim=1)
        return tensor

    def vis_tensor(tensor, ch=0, all_kernels=False, nrow=8, padding=2):
        """Show a 4-D kernel tensor as an image grid on the current figure.

        ch: channel to visualize when the tensor does not have 3 channels
        all_kernels: visualize every channel of every kernel separately
        nrow, padding: forwarded to ``torchvision.utils.make_grid``
        """
        kernels = _select_kernels(tensor, ch, all_kernels)
        grid = utils.make_grid(kernels, nrow=nrow, normalize=True, padding=padding)
        # CHW -> HWC for matplotlib; detach/cpu so GPU or grad-tracking tensors work.
        plt.imshow(grid.detach().cpu().numpy().transpose((1, 2, 0)))

    def save_tensor(tensor, filename, ch=0, all_kernels=False, nrow=8, padding=2):
        """Write the same grid that ``vis_tensor`` shows to ``filename``."""
        kernels = _select_kernels(tensor, ch, all_kernels)
        utils.save_image(kernels, filename, nrow=nrow, normalize=True, padding=padding)

    resnet = models.resnet18(pretrained=True).double()
    first_layer = next(resnet.children())
    kernels = first_layer.weight.data.clone()
    vis_tensor(kernels)
    save_tensor(kernels, 'test.png')

    plt.axis('off')
    plt.ioff()
    plt.show()

def print_autograd_graph():
    """Render the autograd graph of a ResNet-18 forward pass with Graphviz.

    Builds a randomly initialised ResNet-18, runs one random 1x3x224x224
    input through it and opens the resulting graph in the default viewer.
    """
    from graphviz import Digraph
    import torch
    from torchvision import models

    def make_dot(var, params=None):
        """Produce a Graphviz representation of a PyTorch autograd graph.

        Blue nodes are the tensors that require grad, orange are tensors
        saved for backward in torch.autograd.Function.

        Args:
            var: output tensor whose ``grad_fn`` graph is drawn
            params: optional dict of (name, tensor) used to label the nodes of
                tensors that require grad, e.g. ``model.state_dict()`` or
                ``dict(model.named_parameters())``

        Returns:
            graphviz.Digraph
        """
        # Index the names both by object identity (named_parameters) and by
        # storage address (state_dict values are detached views of the
        # parameters, so they are different Python objects sharing memory).
        names_by_id = {}
        names_by_ptr = {}
        if params is not None:
            for name, value in params.items():
                names_by_id[id(value)] = name
                if torch.is_tensor(value) and value.numel() > 0:
                    names_by_ptr[value.data_ptr()] = name

        def param_name(tensor):
            name = names_by_id.get(id(tensor))
            if name is None and tensor.numel() > 0:
                name = names_by_ptr.get(tensor.data_ptr())
            return name

        node_attr = dict(style='filled',
                         shape='box',
                         align='left',
                         fontsize='12',
                         ranksep='0.1',
                         height='0.2')
        dot = Digraph(node_attr=node_attr, graph_attr=dict(size="12,12"))
        seen = set()

        def size_to_str(size):
            return '(' + ', '.join('%d' % v for v in size) + ')'

        def add_nodes(node):
            if node in seen:
                return
            if torch.is_tensor(node):
                dot.node(str(id(node)), size_to_str(node.size()), fillcolor='orange')
            elif hasattr(node, 'variable'):
                # Node that accumulates into a tensor requiring grad.
                leaf = node.variable
                name = param_name(leaf)
                shape = size_to_str(leaf.size())
                label = '%s\n %s' % (name, shape) if name else shape
                dot.node(str(id(node)), label, fillcolor='lightblue')
            else:
                dot.node(str(id(node)), str(type(node).__name__))
            seen.add(node)
            if hasattr(node, 'next_functions'):
                for parent in node.next_functions:
                    if parent[0] is not None:
                        dot.edge(str(id(parent[0])), str(id(node)))
                        add_nodes(parent[0])
            if hasattr(node, 'saved_tensors'):
                for saved in node.saved_tensors:
                    dot.edge(str(id(saved)), str(id(node)))
                    add_nodes(saved)

        add_nodes(var.grad_fn)
        return dot

    torch.manual_seed(1)
    inputs = torch.randn(1, 3, 224, 224)
    model = models.resnet18(pretrained=False)
    y = model(inputs)

    g = make_dot(y, params=model.state_dict())
    g.view()

if __name__ == '__main__':
    # Expose this module's functions as command-line sub-commands via python-fire.
    import fire

    fire.Fire()