pytorch学习 | 月源

pytorch

python

发布日期: 2020-11-02

文章字数: 30k

阅读次数:

Pytorch

handbook：https://github.com/zergtant/pytorch-handbook

pytorch API:https://pytorch.org/docs/stable/index.html

类似例程：https://github.com/yunjey/pytorch-tutorial

Docs：https://github.com/fendouai/PyTorchDocs

基本知识

torch运用就和np一样

一个简单的网络最基本的步骤就是预处理，前向，损失，反向，更新

torch.tensor

torch.tensor(3.14)这是标量 torch.tensor([3.14])这是向量，判断是几维张量主要是看有几个中括号

不是基本数据类型如int，float，string等，而是引用数据类型

是在类中封装好的。所以肯定相应操作比如运算符等人家已经给你重载了，所以不用想的太多

两个tensor相加如果是同维度的话，就直接对应元素相加

pytorch通道顺序及索引

NCHW

很棒的索引教程

a = torch.randn((4,3,28,28))
#基本索引
print(a[0].shape)
#torch.Size([3,28,28])
print(a[0,0].shape)
#torch.Size([28,28])
print(a[0,0,2,4])
#tensor(0.8082)
#连续选取
print(a[:2].shape)
#torch.Size([2,3,28,28])
#由于是两张图片，所以第一维变为2
print(a[:2,:1,:,:].shape)
print(a[:2,:1].shape)
#torch.Size([2,1,28,28])
# ...作用
# …代替了切片操作中前面所有的:， 即a[:, :, None] 和a[…, None]等价
# None作用
[a, b]–>[:, None, :]–>[a, 1, b] # None的作用就相当于在对应维度增加了一个维度
# pos[..., (1, 0)] y, x -> x, y

基本函数

x.view(-1,8)
#返回一个新的与原张量数据相同但形状不同的张量,-1是指从其他维度推断！
y.add_(x)
#"_"结尾的函数,会用结果替换原变量
torch.Tensor.item()   eg:x.item()
#返回这个张量的值作为一个标准的Python数。这只适用于只有一个元素的张量。不可微操作
torch.Tensor.tolist() eg:a.tolist() a[0,0].tolist()
#返回张量作为一个(嵌套的)列表。对于标量，返回一个标准的Python数字，就像item()一样。如果需要，张量会首先自动移动到CPU。
a.size()  a.shape
#返回维度 eg:torch.Size([4, 4])
numpy_a=a.numpy()#tensor转numpy
torch_a=torch.from_numpy(numpy_a)#numpy转tensor
#Tensor和numpy对象共享内存，转换很快，但这也意味着，如果其中一个变了，另一个也会变
x.type(torch.FloatTensor)
#如果没有提供dtype返回类型，否则将该对象强制转换为指定的类型,并返回该对象。
torch.log(a)
# 返回自然对数的新张量
with torch.no_grad():
#禁止梯度计算的上下文管理器，当您确定不会调用张量.backward()时，禁用梯度计算对于推断是很有用的。
#它将减少计算的内存消耗，否则需要require_grad =True。
torch.squeeze()
#eg:输入shape为(A·1·B·1·C·1·D),输出张量的shape就是(A·B·C·D)
#如果指定维度的话，那只对该维度去1。注意：返回的张量与输入张量共享存储空间，因此改变一个张量的内容将改变另一个张量的内容。
#另外如果对批次batch为1也去掉的话，可能会引发错误。
torch.unsqueeze()
# 增加一个1维度
torch.max()
#_, predicted = torch.max(outputs, 1)
#outputs是数据Tensor，1表示求第一维度上的最大值
#_是不要了  torch.max（）的返回值分两部分，分别是values和indices
torch.max(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor)
# 返回一个命名元组(values, indices)，其中values是给定维度dim中输入张量的每一行的最大值。indices是找到的每个最大值(argmax)的索引位置。
# 返回一个命名元组(values,indices)
# 其中values是给定维度dim中输入张量的每一行的最大值
# indices是找到的每个最大值(argmax)的索引位置。
# 如果keepdim为True，则输出张量与输入张量的大小相同，除了dim维度的大小为1。
# 否则，dim被压缩(参见torch.squeeze())，导致输出张量比输入少1维。
>>> import torch
>>> a=torch.rand(2,3,4)
>>> b=torch.rand(2,3,4)
>>> indices=torch.max(a,1,keepdim=True)[1]
>>> b_max = torch.take_along_dim(b,indices,dim=1)
>>> a=torch.rand(2,3)
>>> a
tensor([[0.0163, 0.0711, 0.5564],
        [0.4507, 0.8675, 0.5974]])
>>> b=torch.rand(2,3)
>>> b
tensor([[0.7542, 0.1793, 0.5399],
        [0.2292, 0.5329, 0.2084]])
>>> indices=torch.max(a,1,keepdim=True)[1]
>>> torch.take_along_dim(b,indices,dim=1)
tensor([[0.5399],
        [0.5329]])
#rand从(0,1)的均匀分布中随机抽样
torch.rand()
#randn从标准正态分布随机抽样
#torch.normal(mean,std) 正态分布随机抽样
#torch.linspace()线性间距向量  
#torch.ones()初始化为1   torch.zeros()初始化为0  torch.eye()初始化为单位矩阵
torch.complex(real, imag, *, out=None) → Tensor
#real为实部，imag为虚部，real和imag必须位数相同，如果real和imag同为float32那么生成的complex就为complex64。
>>> real = torch.tensor([1, 2], dtype=torch.float32)
>>> imag = torch.tensor([3, 4], dtype=torch.float32)
>>> z = torch.complex(real, imag)
>>> z
tensor([(1.+3.j), (2.+4.j)])
>>> z.dtype
torch.complex64
torch.__version__
#查看torch版本
torch.randint(low=0,#最小
              high,#最大 
              size,#维度 
              *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) → Tensor #均匀分布取样
torch.mm(input, mat2, *, out=None)-> Tensor
#矩阵乘
torch.bmm(input, mat2, out=None) → Tensor
# 对一个batch的矩阵进行矩阵乘积,(bxnxm)x(bxmxp)=(bxnxp)
torch.permute(input, dims) → Tensor # 调整通道顺序
torch.Tensor.expand_as(other)->Tensor
# 和其它张量具有相同的维度。就从现有的值复制扩充。
torch.cat(tensors,dim=0,*,out=None)
# 将一个序列里的张量拼接在一起，按维数拼接
torch.clone(input,*, memory_format=torch.preserve_format) → Tensor
# return a copy of input
torch.mean(input, dim, keepdim=False, *, out=None) → Tensor

loss=[]#这是一个列表,和tensor不通用
b = list(a)#a维度是(4,4,4)  b维度是(4,)
b = [a] #a维度是(4,4,4)  b维度是(1 ,) #上面的会把batch信息保留

torch.numel(input)->int
# 返回input tensor中元素的总数,eg:
a = torch.randn(1, 2, 3, 4, 5)
torch.numel(a)
# 120
torch.Tensor.nelement()->int
# numel()的别名
torch.split(tensor, [split, split], dim)
#
torch.Tensor.sigmoid()
>>> t = torch.tensor([[[1, 2],[3, 4]],[[5, 6],[7, 8]]])
# 使变平
>>> torch.flatten(t)
tensor([1, 2, 3, 4, 5, 6, 7, 8])
>>> torch.flatten(t, start_dim=1)
tensor([[1, 2, 3, 4],
        [5, 6, 7, 8]])
Tensor.masked_fill_(mask, value)
# 将tensor中和mask为1的位置相对应的替换为value
mask = torch.zeros(5,6,dtype=torch.float)
print(mask)
>>>tensor([[0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1., 1.],
        [0., 0., 0., 1., 1., 1.],
        [0., 0., 1., 1., 1., 1.],
        [0., 1., 1., 1., 1., 1.]])
a.data.masked_fill_(mask.byte(),-float('inf'))
print(a)
>>>tensor([[-0.1053, -0.0352,  1.4759,  0.8849, -0.7233,    -inf],
        [-0.0529,  0.6663, -0.1082, -0.7243,    -inf,    -inf],
        [-0.0364, -1.0657,  0.8359,    -inf,    -inf,    -inf],
        [ 1.4160,  1.1594,    -inf,    -inf,    -inf,    -inf],
        [ 0.4163,    -inf,    -inf,    -inf,    -inf,    -inf]])
torch.roll(input, shifts, dims=None) → Tensor
# shifts:元素移动的位数,如果该参数是一个元组（例如shifts=(x,y)）
# dims必须是一个相同大小的元组（例如dims=(a,b)），相当于在第a维度移x位，在b维度移y位
x = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]).view(4, 2)
torch.roll(x, shifts=(2, 1), dims=(0, 1))
>>>tensor([[6, 5],
           [8, 7],
           [2, 1],
           [4, 3]])
>>> torch.linspace(3, 10, steps=5)
tensor([  3.0000,   4.7500,   6.5000,   8.2500,  10.0000])
# (end - start)/(steps - 1)
# 设置梯度计算为开或关的上下文管理器,可以用作上下文管理器或一个函数,这个上下文管理器是线程本地的,他不会影响其他线程。
torch.set_grad_enabled(mode)
with torch.set_grad_enabled(is_train):
...   y = x * 2
>>> y.requires_grad
False
>>> torch.set_grad_enabled(True)
>>> y = x * 2
>>> y.requires_grad
True
# 沿给定的dim计算张量输入中非零值的数量。 如果没有指定dim，则计算张量中的所有非零值。 
torch.count_nonzero(a)
torch.clamp(input, min=None, max=None, *, out=None) → Tensor
# 将input中的所有元素限制在[min,max]范围内,操作定义如下
#      | min, if x_i < min
#y_i = | x_i, if min <= x_i <= max
#      | max, if x_i > max
# 和np.where用法相同
torch.where(a>0,a,5*a)
# https://zhuanlan.zhihu.com/p/352877584
torch.gather()
# 返回排序后的值所对应原a的下标，即torch.sort()返回的indices
torch.argsort()
#可以理解为填充或修改:https://blog.csdn.net/weixin_45547563/article/details/105311543 
scatter_(input, dim, index, src)
# 网格函数
torch.meshgrid(*tensors)
#tensors: 两个一维向量，如果是0维，当作1维处理
# 返回：两个矩阵
# 第一个矩阵行相同，列是第一个向量的各个元素
# 第二个矩阵列相同，行是第二个向量的各个元素
x = torch.tensor([1, 2, 3])
y = torch.tensor([4, 5, 6, 7])
grid_x, grid_y = torch.meshgrid(x, y)
'''
grid_x:  
tensor([[1, 1, 1, 1],
        [2, 2, 2, 2],
        [3, 3, 3, 3]])
grid_y:  
tensor([[4, 5, 6, 7],
        [4, 5, 6, 7],
        [4, 5, 6, 7]])
'''
# torch.stack 不会在现有的维度上加
A = torch.tensor([[1, 2, 3],
                  [4, 5, 6],
              [7, 8, 9]])
B = torch.tensor([[12, 22, 33],
              [44, 55, 66],
                  [77, 88,99]])
result1 = torch.stack((A,B),dim=0)
# torch.Size([2, 3, 3])
# torch.chunk(tensor,chunk数,维度)
# torch.tensor() takes a single data argument, so the three rows must be
# wrapped in one outer list to form a 3x3 tensor.
a = torch.tensor([[4, 5, 7], [3, 9, 8], [9, 6, 7]])
torch.chunk(a, 3, dim=0)
>>>(tensor([[4,5,7]]),tensor([[3,9,8]]),tensor([[9,6,7]]))

求导和网络相关知识

grad属性保存梯度值，grad_fn保存梯度函数

nn.functional函数的特点是不具有可学习的参数，net.parameters()返回网络可学习的参数

forward函数的输入和输出都是Tensor ,在反向传播前，先要将所有参数的梯度清零,如果不清0，计算得到的梯度值会进行累加

torch.nn只支持mini-batches，不支持一次只输入一个样本，即一次必须是一个batch。

经典报错

一:int和torch

def count_parameters(m, x, y):
    """Sum the element counts of ``m``'s parameters and store the sum in ``m.total_params[0]``.

    ``x`` and ``y`` are accepted but not used in this body.
    """
    # Starts as a plain Python int; it only becomes a tensor once the loop
    # below has added at least one DoubleTensor to it.
    total_params = 0
    for p in m.parameters():
        # p.numel() is wrapped in a one-element DoubleTensor before accumulating.
        total_params += torch.DoubleTensor([p.numel()])
    print("m.total_params", m.total_params, "total_params", total_params)
    # If m has no parameters the loop never runs, so an int (0) is assigned here.
    m.total_params[0] = total_params

m.total_params[0] += total_params,如果m.parameters为空,那么total_params就是0,类型为int,m.total_params[0]的类型为torch.Size([]),相当于一个空tensor,要是m.total_params=total_params,就相当于把int赋值给tensor,触发TypeError

二：RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation

import torch
x = torch.FloatTensor([[1., 2.]])
w1 = torch.FloatTensor([[2.], [1.]])
w2 = torch.FloatTensor([3.])
w1.requires_grad = True
w2.requires_grad = True

d = torch.matmul(x, w1)
f = torch.matmul(d, w2)
d[:] = 1 # 因为这句, 代码报错了 RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation

f.backward()

因为f的求导需要用到d，但是这个时候d已经被改变了，所以报错

这种报错会显示在这一行batch_loss.backward(),多用b=...a...之类的形式，少用a=...a...之类的形式

似乎对于tensor变量的整体运算覆盖原变量，比如x=2*x不会导致以上问题；但是逐元素操作覆盖原位置元素就会引起这个问题，举例也是这个情况，在循环中，每次访问feature[i,:,:]，如果写成feature[i,:,:]=...feature[i,:,:]...，这涉及的是python的变量赋值规则了

三：view size is not compatible with input tensor‘s size and stride

tensor不是contiguous连续引起的错误,查看targets.is_contiguous()为False

两种解决办法：1）按照提示使用reshape代替；2）将变量先转为contiguous ，再进行view:

targets.contiguous().view(targets.size(0)*targets.size(1),-1)

如何判断张量是否连续?

nD 张量底层实现是使用一块连续内存的一维数组，由于 PyTorch 底层实现是 C 语言 (C/C++ 使用行优先的存储方式)，所以 PyTorch 中的 nD 张量也按照行优先的顺序进行存储的。

下图为一个形状为$(2×3)$的2D张量，为了方便将其命名为$A$。

张量 $A$ 在内存中实际以一维数组的形式进行存储，并且使用行优先的顺序进行存储,其中一维数组的形式存储比较好理解,而行优先指的就是存储顺序按照张量$A$的行依次存储。张量$A$在内存中的实际存储形式如下所示。

张量$A$通常称为存储的逻辑结构，而实际存储的一维数组形式称为存储的物理结构。

如果元素在存储的逻辑结构上相邻，在存储的物理结构中也相邻，则称为连续存储的张量；
如果元素在存储的逻辑结构上相邻，但是在存储的物理结构中不相邻，则称为不连续存储的张量；

交换维度的操作能够将连续存储的张量转变成不连续存储的张量。

nD 张量，如果对于任意一个维度$i\ (i=0,\dots,n-2)$都满足下面的等式，则说明 nD 张量连续；否则说明 nD 张量不连续。
$$
stride[i]=stride[i+1]\times size[i+1]
$$
$stride[i]$表示逻辑结构中第 $i$个维度上相邻的元素在物理结构中间隔的元素个数.

$size[i]$表示逻辑结构中第$i$个维度的元素个数。

对于$A$,$stride[0]=3,stride[1]=1,size[1]=3$,所以是连续的

假设将$A$转置得到$A^T$,如下图：

在 PyTorch 中交换维度的操作并没有改变其实际的存储，换句话说，交换维度后的张量与原始张量共享同一块内存，因此交换维度后的张量 $A^T$ 底层存储和原始张量 $A$ 都是相同的一维数组。

对于$A^T$,$stride[0]=1,stride[1]=3,size[1]=2$,不连续

view只能用于数据连续存储的张量，而reshape则不需要考虑张量中的数据是否连续存储

原始张量的视图简单来说就是和原始张量共享数据，因此如果改变使用 view 方法返回的新张量，原始张量也会发生相对应的改变。

reshape 方法可能返回的是原始张量的视图或者拷贝，当处理连续存储的张量 reshape 返回的是原始张量的视图，而当处理不连续存储的张量 reshape 返回的是原始张量的拷贝

四:Input type (torch.cuda.DoubleTensor) and weight type (torch.cuda.FloatTensor) should be the same

整理数据集的时候,使用了np.array,然后默认保存成float64,但是pytorch中默认是float32。

# 解决办法,dtype="float32"
image_n = np.array(image_n, dtype="float32")/255.0
image_d = np.array(image_d, dtype="float32")/255.0

五:RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.

比如整体网络是G+S,有一个G_loss,有一个S_loss,必须loss=G_loss+S_loss,loss.backward(),

而不能G_loss.backward(),S_loss.backward(),因为为了节省空间,G_loss.backward()时已经把G的中间结果删除了

To reduce memory usage, during the .backward() call, all the intermediary results are deleted when they are not needed anymore. Hence if you try to call .backward() again, the intermediary results don’t exist and the backward pass cannot be performed (and you get the error you see).You can call .backward(retain_graph=True) to make a backward pass that will not delete intermediary results, and so you will be able to call .backward() again. All but the last call to backward should have the retain_graph=True option.

六:TypeError: can’t convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

所有在CPU上的Tensor（除了CharTensor）都支持与NumPy数组相互转换。

此外上面提到还有一个常用的方法就是直接用torch.tensor()将NumPy数组转换成Tensor，需要注意的是该方法总是会进行数据拷贝，返回的Tensor和原来的数据不再共享内存。

RuntimeError: Can’t call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.

requires_grad = True的也不能转换成numpy

七:错误归一化

对生成的图片,错误地使用了归一化

# 比如输出图片,只要确保图片在0-1之间就好
temp1=np.clip(temp1,0,1)
temp2=np.clip(temp2,0,1)
# 如果进行归一化,如下图
temp1=temp1/(np.max(temp1)-np.min(temp1))
temp2=temp2/(np.max(temp2)-np.min(temp2))
# 相当于对图片进行了放缩

想象一下本来图片的像素是0.5 1 放缩完之后就成了1 2了怪不得那么亮~

经典示例

1.resnet.conv1

if layers == 50:
    resnet = models.resnet50(pretrained=pretrained)
elif layers == 101:
    resnet = models.resnet101(pretrained=pretrained)
else:
    resnet = models.resnet152(pretrained=pretrained)
self.layer0 = nn.Sequential(resnet.conv1, resnet.bn1, resnet.relu, resnet.conv2, resnet.bn2, resnet.relu, resnet.conv3, resnet.bn3, resnet.relu, resnet.maxpool)
self.layer1, self.layer2, self.layer3, self.layer4 = resnet.layer1, resnet.layer2, resnet.layer3, resnet.layer4

resnet是你定义的一个类,那么类中的属性如resnet.conv1自然也可以调用啊。

使用GPU

CUDA\cuDNN是什么

CUDA:NVIDIA推出的用于自家GPU的并行计算框架，也就是说CUDA只能在NVIDIA的GPU上运行，而且只有当要解决的计算问题是可以大量并行计算的时候才能发挥CUDA的作用。

在 CUDA 的架构下，一个程序分为两个部份：host 端和 device 端。Host 端是指在 CPU 上执行的部份，而 device 端则是在显示芯片上执行的部份。Device 端的程序又称为 “kernel”。通常 host 端程序会将数据准备好后，复制到显卡的内存中，再由显示芯片执行 device 端程序，完成后再由 host 端程序将结果从显卡的内存中取回。

cuDNN:是NVIDIA打造的针对深度神经网络的加速库，是一个用于深层神经网络的GPU加速库。

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")#这一步是设置我们使用的GPU
# 确认我们的电脑支持CUDA，然后显示CUDA信息：
print(device)
#然后这些方法将递归遍历所有模块并将模块的参数和缓冲区 转换成CUDA张量：
net.to(device)
#记住：inputs, targets 和 images 也要转换。
inputs, labels = inputs.to(device), labels.to(device)

多GPU

教程

模型保存与加载

ResNet的调用

if pretrained:
# model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
    model_path = './initmodel/resnet152_v2.pth'
    model.load_state_dict(torch.load(model_path), strict=False)

torch.save

将对象从内存中保存到磁盘文件中。

torch.save(obj,#被保存的对象
           f: Union[str, os.PathLike, BinaryIO],#
           pickle_module=<module 'pickle' from '/opt/conda/lib/python3.6/pickle.py'>,
           pickle_protocol=2, _use_new_zipfile_serialization=True) → None
#常见的PyTorch约定是使用.pt文件扩展名保存张量。
# Save to file
>>> torch.save(model.state_dict(), PATH)

torch.load

加载用torch.save()保存的磁盘文件到内存中。

torch.load(f,#文件路径
           map_location=None,#加载到的位置
           pickle_module=<module 'pickle' from '/opt/conda/lib/python3.6/pickle.py'>,
           **pickle_load_args)
#eg:torch.load(args.load, map_location=device)

保存和加载,torch.save()将参数由内存或显存保存到硬盘,torch.load()再由硬盘加载到内存或显存,

然后再调用model.load_state_dict()将参数加载到模型中

# 只保存模型参数
# 保存
torch.save(model.state_dict(), '\parameter.pkl')
# 加载
model = TheModelClass(...)
model.load_state_dict(torch.load('\parameter.pkl'))

# 保存完整模型
# 保存
torch.save(model, '\model.pkl')
# 加载
model = torch.load('\model.pkl')
# 但是这样直接加载有时候会出问题，比如分割,本来的模型是2类输出,但是假设我们现在要新训练一个模型,同时利用之前模型的预训练信息，但是这个时候最后的那个prediction部分模型大小就不匹配了

Pytorch断点续接

# 模型参数的加载 优化器参数的加载 epoch的恢复
# Bundle everything needed to resume training: model weights, optimizer
# state and the epoch that was just finished.
checkpoint = {
    "net": model.state_dict(),
    'optimizer': optimizer.state_dict(),
    "epoch": epoch,
}
# makedirs(exist_ok=True) also creates the missing "./models" parent and
# does not fail when the directory already exists.
os.makedirs("./models/checkpoint", exist_ok=True)
torch.save(checkpoint, './models/checkpoint/ckpt_best_%s.pth' % (str(epoch)))


if RESUME:
    path_checkpoint = "./models/checkpoint/ckpt_best_1.pth"  # 断点路径
    checkpoint = torch.load(path_checkpoint)  # 加载断点

    model.load_state_dict(checkpoint['net'])  # 加载模型可学习参数

    optimizer.load_state_dict(checkpoint['optimizer'])  # 加载优化器参数
    start_epoch = checkpoint['epoch']  # 设置开始的epoch

学习率的调节会用到epoch

torch.Tensor

# tensor类型转换 如long int 转换成torch.float 直接利用type函数
mask = torch.where(mask<threshold,0,1).type(torch.float)
# .cuda non_blocking经常与DataLoader的pin_memory搭配使用
src_input = src_input.cuda(non_blocking=True)
# 1. x = x.cuda(non_blocking=True)
# 2. 进行一些和x无关的操作
# 3. 执行和x有关的操作
# 在non_blocking=true下，1不会阻塞2，1和2并行。这样将数据从CPU移动到GPU的时候，它是异步的。在它传输的时候，CPU还可以干其他的事情（不依赖于数据的事情）

Tensor Attributes

Each torch.Tensor has a torch.dtype, torch.device, and torch.layout.

torch.dtype

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

CUDA_VISIBLE_DEVICES is the mask used by CUDA to determine what devices it exposes to the user program, which is pytorch in this case. There is no way pytorch can know about that reliably.

Within your application, gpu numbers will always start at 0 and grow up from there.
When you use CUDA_VISIBLE_DEVICES, you hide some devices so they won’t be numbered.
If you have 4 gpus: 0, 1, 2, 3.
And run CUDA_VISIBLE_DEVICES=1,2 python some_code.py. Then the devices that you will see within python are device 0, 1. Using device 0 in your code will use device 1 from global numbering. Using device 1 in your code will use 2 outside.
So in your case if you always set CUDA_VISIBLE_DEVICES to a single device, in your code, the device id will always be 0, that is expected. Unfortunately, there is no way to know what is the global numbering.

A torch.dtype is an object that represents the data type of a torch.Tensor. PyTorch has 12 different data types.

torch.device

A torch.device is an object representing the device on which a torch.Tensor is or will be allocated.

The torch.device contains a device type ('cpu' or 'cuda') and optional device ordinal for the device type. If the device ordinal is not present, this object will always represent the current device for the device type, even after torch.cuda.set_device() is called; e.g., a torch.Tensor constructed with device 'cuda' is equivalent to 'cuda:X' where X is the result of torch.cuda.current_device().

A torch.Tensor’s device can be accessed via the Tensor.device property.

A torch.device can be constructed via a string or via a string and device ordinal

# via a string 
>>> torch.device('cuda:0')
device(type='cuda', index=0)
>>> torch.device('cpu')
device(type='cpu')
>>> torch.device('cuda')  # current cuda device
device(type='cuda')
# via a string device ordinal
>>> torch.device('cuda', 0)
device(type='cuda', index=0)
>>> torch.device('cpu', 0)
device(type='cpu', index=0)
# torch.cuda.set_device(device)
# Sets the current device. 不鼓励使用此函数。在大多数情况下，最好使用 CUDA_VISIBLE_DEVICES 环境变量。

The torch.device argument in functions can generally be substituted with a string. This allows for fast prototyping of code.

>>> # Example of a function that takes in a torch.device
>>> cuda1 = torch.device('cuda:1')
>>> torch.randn((2,3), device=cuda1)
>>> # You can substitute the torch.device with a string
>>> torch.randn((2,3), device='cuda:1')

For legacy reasons, a device can be constructed via a single device ordinal, which is treated as a cuda device. This matches Tensor.get_device(), which returns an ordinal for cuda tensors and is not supported for cpu tensors.

>>> torch.device(1)
device(type='cuda', index=1)

Methods which take a device will generally accept a (properly formatted) string or (legacy) integer device ordinal, i.e. the following are all equivalent:

>>> torch.randn((2,3), device=torch.device('cuda:1'))
>>> torch.randn((2,3), device='cuda:1')
>>> torch.randn((2,3), device=1)  # legacy

torch.layout

in beta

torch.autograd

torch.autograd提供了实现任意标量值函数的自动微分的类和函数。它只需要对现有代码进行最小的更改——只需要声明张量s，对于这些张量，计算梯度时应带有requires_grad=True关键字。到目前为止，我们只支持浮点张量类型(half、float、double和bfloat16)和复数张量类型(cfloat、cdouble)的自动求导。

Variable

Variable API已被弃用:在使用autograd时，不再需要Variable。Autograd自动支持将requires_grad设置为True的张量。下面是一些变化的快速指南:

Variable(tensor)和Variable(tensor, requires_grad)仍然按预期工作，但它们返回的是张量而不是变量。

var.data和tensor.data是一样的。

print(i)
#tensor([[-0.4404]], requires_grad=True)
print(i.data)
#tensor([[-0.4404]])

像var.backward()、var.detach()、var.register_hook()这样的方法现在可以在具有相同方法名的张量上工作。

此外,现在可以使用工厂方法创建requires_grad=True的张量,如torch.randn()、torch.zeros()、torch.ones()和其他类似如下的方法:

autograd_tensor =torch.randn((2, 3, 4),requires_grad=True)

具体来说，在pytorch中的Variable就是一个存放会变化值的地理位置，里面的值会不停发生变化，就像一个装鸡蛋的篮子，鸡蛋数会不断发生变化。那谁是里面的鸡蛋呢，自然就是pytorch中的tensor了。（也就是说，pytorch都是有tensor计算的，而tensor里面的参数都是Variable的形式）。如果用Variable计算的话，那返回的也是一个同类型的Variable。

也就是说现在requires_grad=True的tensor就相当于以前的Variable,也就是进行反向传播的变量。

detach() data() detach_()

教程

返回一个新的Variable，从当前计算图中分离下来的，但是仍指向原变量的存放位置,不同之处只是requires_grad为false，得到的这个Variable永远不需要计算其梯度，不具有grad。**即使之后重新将它的requires_grad置为true,它也不会具有梯度grad.**这样我们就会继续使用这个新的Variable进行计算，后面当我们进行反向传播时，到该调用detach()的Variable就会停止，不能再继续向前进行传播

# detach与data的区别
# 相同 1.都和x共享同一块数据 2.都和x的计算历史无关 3.requires_grad = False
# 不同
a = torch.tensor([1, 2, 3.], requires_grad=True)
out = a.sigmoid()
# detach() returns a tensor that shares storage with `out` but is cut from
# the autograd graph.
b = out.detach()
# `.data` is an attribute, not a method: calling out.data() raises
# "TypeError: 'Tensor' object is not callable".
c = out.data
# b、c与out共享同一块数据,修改b、c均会同时改变out(而不是a)
# 不修改b,out可以反向传播；修改b,out反向传播会报错
# 修改c,out可以反向传播,但会生成错误的梯度值。
# 所以out.data在某些情况下不安全

上面内容实现的原理是:In-place 正确性检查(In-place操作是指函数最后带”_”的操作)

所有的Variable都会记录用在他们身上的 in-place operations。如果pytorch检测到variable在一个Function中已经被保存用来backward，但是之后它又被in-place operations修改。当这种情况发生时，在backward的时候，pytorch就会报错。这种机制保证了，如果你用了in-place operations，但是在backward过程中没有报错，那么梯度的计算就是正确的。

pred_fake = self.netD(fake_AB.detach()) # G is held fixed here: detach() cuts the back-propagation flow so no gradient reaches G

detach()会截断反向传播

Function

知乎教程 CSDN教程

对Function的直观理解

虽然pytorch可以自动求导，但是有时候一些操作是不可导的，这时候你需要自定义求导方式。也就是所谓的 Extending torch.autograd

Function与Module的差异与应用场景

Function一般只定义一个操作，因为其无法保存参数，因此适用于激活函数、pooling等操作;Module是保存了参数，因此适合于定义一层，如线性层，卷积层，也适用于定义一个网络
Function需要定义三个方法：__init__, forward, backward(需要自己写求导公式);Module：只需定义__init__和forward，而backward的计算由自动求导机制构成
可以不严谨的认为，Module是由一系列Function组成，因此其在forward的过程中，Function和Variable组成了计算图，在backward时，只需调用Function的backward就得到结果，因此Module不需要再定义backward。
Module不仅包括了Function，还包括了对应的参数，以及其他函数与变量，这是Function所不具备的

#属性（成员变量）
#saved_tensors: 传给forward()的参数，在backward()中会用到。
#needs_input_grad:长度为 :attr:num_inputs的bool元组，表示对应的输入是否需要梯度。可以用于优化反向过程的缓存。
#num_inputs: 传给函数 :func:forward的参数的数量。
#num_outputs: 函数 :func:forward返回的值的数目。
#requires_grad: 布尔值，表示函数 :func:backward 是否有可能需要被调用。

#成员函数
#forward()
#forward()可以有任意多个输入、任意多个输出，但是输入和输出必须是Variable。(官方给的例子中有只传入tensor作为参数的例子)
#backward()
#backward()的输入和输出的个数就是forward()函数的输出和输入的个数。其中，backward()输入表示关于forward()输出的梯度(计算图中上一节点的梯度)，#backward()的输出表示关于forward()的输入的梯度。在输入不需要梯度时（通过查看needs_input_grad参数）或者不可导时，可以返回None。

Anomaly detection

上下文管理器，为autograd引擎启用异常检测。做了两件事:

Running the forward pass with detection enabled will allow the backward pass to print the traceback of the forward operation that created the failing backward function. 任何生成“nan”值的反向计算都将引发错误。

此模式应仅用于调试，因为不同的测试将降低程序执行速度。

torch.autograd.detect_anomaly
>>> with autograd.detect_anomaly():
...     inp = torch.rand(10, 10, requires_grad=True)
...     out = run_fn(inp)
...     out.backward()
torch.autograd.set_detect_anomaly(mode:True or False)
# 上下文管理器，设置自动grad引擎的异常检测开关。
# Set_detect_anomaly将根据参数模式启用或禁用自grad异常检测。它可以用作上下文管理器或函数。

在模型正常训练阶段不建议打开**autograd.detect_anomaly**，会使训练速度大大减慢，以笔者这里的测试，打开后，原本4个小时的训练被减慢至7.5个小时；打开后可以辅助找到出现Nan值的位置

assert torch.isnan(src_n_feat).int().sum() ==0 判断张量中是否有值为nan

torch.cuda

torch.cuda.device_count() → int # 返回可用的GPU数量
torch.cuda.get_device_name() → str # 返回GPU的名字,如NVIDIA GeForce RTX 3080 Ti

memory management

pytorch显存机制分析

torch.cuda.memory_reserved(device=None)

torch.cuda.amp

基本使用

Pytorch自动混合精度教程

Automatic mixed precision package自动混合精度包

torch.cuda.amp提供了方便的混合精度方法，在某些操作中需要使用torch.float32 (float)数据类型而有些操作使用torch.float16(half)。
有些操作，比如线性层和卷积，在float16中要快得多。
其他操作，比如归约(reduction)操作，通常需要float32的动态范围。
混合精度尝试将每个op匹配到其适当的数据类型。

一般来说，自动混合精度训练同时使用torch.cuda.amp.autocast和torch.cuda.amp.GradScaler,当然如果需要也可以单独使用。

from torch.cuda.amp import autocast, GradScaler
# 用户使用混合精度训练基本操作：
# amp依赖Tensor core架构，所以model参数必须是cuda tensor类型
model = Net().cuda()
optimizer = optim.SGD(model.parameters(), ...)
# GradScaler对象用来自动做梯度缩放
scaler = GradScaler()

for epoch in epochs:
    for input, target in data:
        optimizer.zero_grad()
        # 在autocast enable 区域运行forward
        with autocast():
            # model做一个FP16的副本，forward
            output = model(input)
            loss = loss_fn(output, target)
        # 用scaler，scale loss(FP16)，backward得到scaled的梯度(FP16)
        scaler.scale(loss).backward()
        # scaler 更新参数，会先自动unscale梯度
        # 如果有nan或inf，自动跳过
        scaler.step(optimizer)
        # scaler factor更新
        scaler.update()

autocast自定义函数

对于用户自定义的autograd函数，需要用@torch.cuda.amp.custom_fwd装饰forward函数,@torch.cuda.amp.custom_bwd装饰backward函数:

class MyMM(torch.autograd.Function):
    """Matrix multiply as a custom autograd Function that cooperates with autocast.

    The decorators are referenced through ``torch.cuda.amp`` so the class only
    needs ``import torch`` to be defined.
    """

    @staticmethod
    @torch.cuda.amp.custom_fwd
    def forward(ctx, a, b):
        """Return ``a @ b`` and stash both operands for the backward pass."""
        ctx.save_for_backward(a, b)
        return a.mm(b)

    @staticmethod
    @torch.cuda.amp.custom_bwd
    def backward(ctx, grad):
        """Return the gradients w.r.t. ``a`` and ``b`` given the output gradient."""
        a, b = ctx.saved_tensors
        # d(a@b)/da = grad @ b^T ; d(a@b)/db = a^T @ grad
        return grad.mm(b.t()), a.t().mm(grad)

调用时再使用autocast

mymm = MyMM.apply

with autocast():
    output = mymm(input1, input2)

torch.backends

torch.backends控制PyTorch支持的各种后端的行为。这些后端包括:

torch.backends.cuda
torch.backends.cudnn
torch.backends.mkl
torch.backends.mkldnn
torch.backends.openmp

torch.cudnn

torch.backends.cudnn.benchmark详解

torch.backends.cudnn.benchmark
#一个bool值，如果为真，将导致cuDNN对多个卷积算法进行基准测试并选择最快的。
#耗费一些预处理时间，选择最好的卷积算法，大大减少之后的训练时间，网络结构不能变，输入输出不能变等
torch.backends.cudnn.deterministic
#如果该bool为真，则导致cuDNN只使用确定性卷积算法。参见torch.is_deterministic()和torch.set_deterministic()。
torch.backends.cudnn.enabled
#一个控制是否启用cuDNN的bool值,默认为True，启用cudnn

torch.distributed

DDP系列第一篇：入门教程

import torch.distributed as dist

DDP与DP模式的不同

DP模式是很早就出现的、单机多卡的、参数服务器架构的多卡训练模式，在PyTorch，即是:

# Wrap the model for single-process multi-GPU (DP) training.
model = torch.nn.DataParallel(model)

在DP模式中，总共只有一个进程(受到GIL很强限制)。master节点相当于参数服务器，其会向其他卡广播其参数；在梯度反向传播后，各卡将梯度集中到master节点，master节点对搜集来的梯度进行平均后更新参数，再将参数统一发送到其他卡上。这种参数更新方式，会导致master节点的计算任务、通讯量很重，从而导致网络阻塞，降低训练速度。

但是DP也有优点,优点就是代码实现简单。要速度还是要方便，看官可以自行选用哟。

DDP

教程1

基本概念

教程2

ddp运行代码:

python3 -m torch.distributed.launch --nproc_per_node=8 DDP.py

其中python3 -m的意思是指run library module as a script（将模块当作脚本运行）

# 当文件作为脚本直接运行时，这段代码会产生副作用，输出字符串“模块直接运行”；
# 当文件作为模块被导入时，不会产生副作用，不输出字符串“模块直接运行”；
if __name__ == '__main__':
    print('模块直接运行');
# 当我们知道一个模块的名字，但不知道它的路径时，我们可以通过 -m 参数，在 shell 中将该模块当作脚本运行，例如：
python -m module_name
# 如果我们知道模块的完整路径（此处假设为"/path/to/module.py"），上述命令的效果，以下面的命令等同
python /path/to/module.py

rank：用于表示进程的编号/序号（在一些结构图中rank指的是软节点，rank可以看成一个计算单位），每一个进程对应了一个rank的进程，整个分布式由许多rank完成

node：物理节点，可以是一台机器也可以是一个容器，节点内部可以有多个GPU。

rank与local_rank： rank是指在整个分布式任务中进程的序号；local_rank是指在一个node上进程的相对序号，local_rank在node之间相互独立。

nnodes、node_rank与nproc_per_node： nnodes是指物理节点数量，node_rank是物理节点的序号；nproc_per_node是指每个物理节点上面进程的数量。world size：全局（一个分布式任务）中，rank的数量。

上一个运算题：每个node包含16个GPU，且nproc_per_node=8，nnodes=3，机器的node_rank=5，请问world_size是多少？
答案：world_size = 3*8 = 24

比如分布式中有三台机器，每台机器起4个进程，每个进程占用1个GPU，如下图所示：

Group：进程组，一个分布式任务对应了一个进程组。只有用户需要创立多个进程组时才会用到group来管理，默认情况下只有一个group。

Groups

By default collectives operate on the default group (also called the world) and require all processes to enter the distributed function call. However, some workloads can benefit from more fine-grained communication. This is where distributed groups come into play. new_group() function can be used to create new groups, with arbitrary subsets of all processes. It returns an opaque group handle that can be given as a group argument to all collectives (collectives are distributed functions to exchange information in certain well-known programming patterns).

pg1 = torch.distributed.new_group(range(torch.distributed.get_world_size()))
batch_size = int(cfg.SOLVER.BATCH_SIZE / torch.distributed.get_world_size())
feature_extractor = torch.nn.parallel.DistributedDataParallel(
    feature_extractor, device_ids=[local_rank], output_device=local_rank,
    find_unused_parameters=True, process_group=pg1
)
pg2 = torch.distributed.new_group(range(torch.distributed.get_world_size()))
classifier = torch.nn.parallel.DistributedDataParallel(
    classifier, device_ids=[local_rank], output_device=local_rank,
    find_unused_parameters=True, process_group=pg2
)
# 开启求导的异常侦测
torch.autograd.set_detect_anomaly(True)
# 保持两个进程同步
torch.distributed.barrier()

Collective functions

`torch.distributed.barrier(group=None, async_op=False, device_ids=None)`

同步所有进程。如果async_op为False，或者在wait()上调用async工作句柄，则该集合将阻塞进程，直到整个组进入此函数。

Launch utility

torch.distributed.launch是一个在每个训练节点上生成多个分布式训练过程的模块。

该实用程序可用于单节点分布式训练，其中每个节点将生成一个或多个进程。该实用程序可以用于CPU训练或GPU训练。如果该实用程序用于GPU训练，则每个分布式进程将在单个GPU上运行。这可以明显提升单节点训练性能。它还可以用于多节点分布式训练，通过在每个节点上生成多个进程，也可以很好地提高多节点分布式训练的性能。这对于具有多个支持GPU直连(direct-GPU)的Infiniband接口的系统尤其有利，因为所有这些接口都可以用于聚合的通信带宽。

在单节点分布式训练或多节点分布式训练的两种情况下，该实用程序将在每个节点上启动给定数量的进程(--nproc_per_node)。如果用于GPU训练，这个数字需要小于或等于当前系统上的GPU数量，并且每个进程将运行在从GPU 0到GPU (nproc_per_node - 1)的单个GPU上。

How to use this module:

1.Single-Node multi-process distributed training

python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other arguments of your training script)

2.Multi-Node multi-process distributed training: (e.g. two nodes)

Node 1: (IP: 192.168.1.1, and has a free port: 1234)
>>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
           --nnodes=2 --node_rank=0 --master_addr="192.168.1.1"
           --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other arguments of your training script)
Node 2:
>>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
           --nnodes=2 --node_rank=1 --master_addr="192.168.1.1"
           --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other arguments of your training script)

3.To look up what optional arguments this module offers:

python -m torch.distributed.launch --help

Important Notices:

1.GPU训练目前使用NCCL后端达到最佳性能

2.在你的训练程序中，你必须解析命令行参数:–local_rank=LOCAL_PROCESS_RANK，它将由这个模块提供。如果你的training programs使用GPU，你应该确保你的代码只运行在LOCAL_PROCESS_RANK的GPU设备上。这可以通过以下方式实现:

import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int)
args = parser.parse_args()
# Set your device to local rank using either
torch.cuda.set_device(args.local_rank)  # before your code runs
or:
with torch.cuda.device(args.local_rank):
   # your code to run

3.In your training program, you are supposed to call the following function at the beginning to start the distributed backend. It is strongly recommended that init_method=env://. Other init methods (e.g. tcp://) may work, but env:// is the one that is officially supported by this module.

torch.distributed.init_process_group(backend='YOUR BACKEND',init_method='env://')

4.加载数据集

多卡训练加载数据:

Dataset的设计上与单gpu一致，但是DataLoader上不一样。

首先解释下原因：多gpu训练是，我们希望同一时刻在每个gpu上的数据是不一样的，这样相当于batch size扩大了N倍，因此起到了加速训练的作用。在DataLoader时，如何做到每个gpu上的数据是不一样的，且gpu1上训练过的数据如何确保接下来不被别的gpu再次训练。这时候就得需要DistributedSampler。

Dataloader设置方式如下，注意shuffle与sampler是冲突的，并行训练需要设置sampler，此时务必要把shuffle设为False。但是这里shuffle=False并不意味着数据就不会乱序了，而是乱序的方式交给sampler来控制，实质上数据仍是乱序的。

# The sampler and the DataLoader must be built from the same dataset object;
# the sampler decides which indices of that dataset this process sees.
# Call train_sampler.set_epoch(epoch) at the start of every epoch so the
# shuffling order differs between epochs.
train_sampler = torch.utils.data.distributed.DistributedSampler(My_Dataset)
dataloader = torch.utils.data.DataLoader(My_Dataset,
                                         batch_size=batch_size,
                                         shuffle=False,  # mutually exclusive with sampler
                                         num_workers=16,
                                         pin_memory=True,
                                         drop_last=True,
                                         sampler=train_sampler)

5.加载模型

多卡训练的模型设置：

最主要的是find_unused_parameters和broadcast_buffers参数；

find_unused_parameters：如果模型的输出有不需要进行反传的(比如部分参数被冻结/或者网络前传是动态的)，设置此参数为True;如果你的代码运行后卡住某个地方不动，基本上就是该参数的问题。

broadcast_buffers：设置为True时，在模型执行forward之前，gpu0会把buffer中的参数值全部覆盖到别的gpu上。注意这和同步BN并不一样，同步BN应该使用SyncBatchNorm。

My_model = My_model.cuda(args.local_rank)  # 将模型拷贝到每个gpu上.直接.cuda()也行，因为多进程时每个进程的device号是不一样的
My_model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(My_model) # 设置多个gpu的BN同步
My_model = torch.nn.parallel.DistributedDataParallel(My_model, 
                                                     device_ids=[args.local_rank], 
                                                     output_device=args.local_rank, 
                                                     find_unused_parameters=False, 
                                                     broadcast_buffers=False)

torch.nn

网络结构图的基本构建模块import torch.nn as nn

卷积层和线性层在__init__里面，而激活和池化在forward函数里面。

torch.nn只支持小批量输入。整个torch.nn包都只支持小批量样本,而不支持单个样本。例如,nn.Conv2d接受一个4维的张量,每一维分别是$Samples \times nChannels\times Height\times Width$(样本数x通道数x高x宽)。如果你有单个样本,只需使用 input.unsqueeze(0) 来添加其它的维数.

Parameter

torch.nn.parameter.Parameter

A kind of Tensor that is to be considered as a module parameter.

Parameters are Tensor subclasses, that have a very special property when used with Module s - when they’re assigned as Module attributes they are automatically added to the list of its parameters, and will appear e.g. in parameters() iterator. Assigning a Tensor doesn’t have such effect. This is because one might want to cache some temporary state, like last hidden state of the RNN, in the model. If there was no such class as Parameter, these temporaries would get registered too.

Containers

nn.Module

所有神经网络模块的基类。你的模型也应该子类化这个类。

模块还可以包含其他模块，允许将它们嵌套在树结构中。

Variables training 和 train() eval()

net.training = True 
# Boolean flag: True means training mode, False means evaluation mode. Assigning the attribute directly only affects this module; sub-modules are not changed.
net.train() # sets training=True on this module and all sub-modules; per the torch.nn docs, Dropout is active and BatchNorm uses batch statistics
net.eval() # sets training=False on this module and all sub-modules; per the torch.nn docs, Dropout is disabled and BatchNorm switches to its running statistics (it is still applied)

add_module

在自定义网络的时候，由于自定义变量不是Module类型（例如，我们用List封装了几个网络），所以pytorch不会自动注册网络模块。add_module函数用来为网络添加模块的，所以我们可以使用这个函数手动添加自定义的网络模块。当然，这种情况，我们也可以使用ModuleList来封装自定义模块，pytorch就会自动注册了。

self.layers = nn.Linear(28*28,28*28)
# self.add_module('layers',nn.Linear(28*28,28*28)) 跟上面的方式等价

buffers()和parameters()的区别

buffers()

指那些不需要参与反向传播的参数,反向传播不需要被optimizer更新

buffers(recurse: bool = True) → Iterator[torch.Tensor]
>>> for buf in model.buffers():
>>>     print(type(buf), buf.size())
<class 'torch.Tensor'> (20L,)
<class 'torch.Tensor'> (20L, 1L, 5L, 5L)

parameters()

是nn.parameter.Parameter，也就是组成Module的参数。例如一个nn.Linear通常由weight和bias参数组成。它的特点是默认requires_grad=True,也就是说训练过程中需要反向传播的，反向传播需要被optimizer更新的。

parameters(recurse: bool = True)-> Iterator[torch.nn.parameter.Parameter]
#recurse (bool)如果为True，则生成此模块和所有子模块的参数。否则，只生成此模块的直接成员参数。
>>> for param in model.parameters():
>>>     print(type(param), param.size())#输出的是一个w,一个b！别忘了b！
<class 'torch.Tensor'> (20L,)
<class 'torch.Tensor'> (20L, 1L, 5L, 5L)

named_parameters()

返回模块参数的迭代器，生成参数名称和参数本身

named_parameters(prefix: str = '',#作为所有参数名称的前缀
                 recurse: bool = True)→ Iterator[Tuple[str, torch.Tensor]]
                #如果为真，则生成该模块和所有子模块的参数。否则，只会产生作为该模块直接成员的参数。
for name,parameters in net.named_parameters():#可同时返回名字和参数
    print(name,':',parameters.size())

#conv1.weight : torch.Size([6, 1, 3, 3])
#conv1.bias : torch.Size([6])
#fc1.weight : torch.Size([10, 1350])
#fc1.bias : torch.Size([10])

modules()

返回一个可以遍历网络所有模块的迭代器。

>>> l = nn.Linear(2, 2)
>>> net = nn.Sequential(l, l)
>>> for idx, m in enumerate(net.modules()):
        print(idx, '->', m)

0 -> Sequential(
  (0): Linear(in_features=2, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=2, bias=True)
)
1 -> Linear(in_features=2, out_features=2, bias=True)

named_modules()

返回一个可以遍历网络所有模块的迭代器,产生模块的名字和模块本身。

>>> l = nn.Linear(2, 2)
>>> net = nn.Sequential(l, l)
>>> for idx, m in enumerate(net.named_modules()):
        print(idx, '->', m)

0 -> ('', Sequential(
  (0): Linear(in_features=2, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=2, bias=True)
))
1 -> ('0', Linear(in_features=2, out_features=2, bias=True))

# PSPNet的一段实现
# layer3 gets dilation/padding 2 and layer4 gets 4; both have their stride reset to 1.
for stage, rate in ((self.layer3, 2), (self.layer4, 4)):
    for n, m in stage.named_modules():
        if 'conv2' in n:
            # Modules whose name contains 'conv2': set dilation, matching padding, stride 1.
            m.dilation, m.padding, m.stride = (rate, rate), (rate, rate), (1, 1)
        elif 'downsample.0' in n:
            # Modules whose name contains 'downsample.0': only the stride is reset.
            m.stride = (1, 1)
# 注意要点有:
# Sequential是没有name的,所有在sequential里的都按顺序从0开始编号
# 遍历是按照深度优先遍历DFS,名字是不断叠加的，如0,0.conv1，0.conv2,之类的
# PSP这段代码的意思就是block中的那个conv2加上空洞卷积，然后取消下采样
# state_dict() maps each parameter/buffer name to its tensor.
params = segmodel.state_dict()
for k in params:
    print(k)  # print only the key names, not the tensor values
"""
conv1.weight
bn1.weight
bn1.bias
bn1.running_mean
bn1.running_var
bn1.num_batches_tracked
conv2.weight
bn2.weight
bn2.bias
bn2.running_mean
bn2.running_var
bn2.num_batches_tracked
layer1.0.conv1.weight
layer1.0.bn1.weight
layer1.0.bn1.bias
layer1.0.bn1.running_mean
layer1.0.bn1.running_var
layer1.0.bn1.num_batches_tracked
layer1.0.conv2.weight
layer1.0.bn2.weight
layer1.0.bn2.bias
layer1.0.bn2.running_mean
layer1.0.bn2.running_var
layer1.0.bn2.num_batches_tracked
"""
# The same Linear module is registered twice, under the names "0" and "1".
l = nn.Linear(2, 2)
net = nn.Sequential(l, l)
params = net.state_dict()
for k in params:
    print(k)  # key names only
# Output:
# 0.weight
# 0.bias
# 1.weight
# 1.bias
# 所有放在Sequential里面的都是按0,1,2,3...序号排列的
# 如果中间少了几层，后面的序号就对不上了

state_dict()

state_dict和load_state_dict源码详解

返回一个字典，其中包含模块的整个状态,存储了网络结构的名字和对应的参数。parameters和buffers(如运行平均值)都包括在内。键是对应的parameter和buffer名称。

torch.nn.Module模块中的state_dict只包含卷积层和全连接层的参数，当网络中存在batchnorm时，例如vgg网络结构，torch.nn.Module模块中的state_dict也会存放batchnorm的running_mean。

torch.optim模块中的Optimizer优化器对象也存在一个state_dict对象，此处的state_dict字典对象包含两个条目，key 分别为state和param_groups：state对应的value是一个字典；param_groups对应的value是一个列表，列表中的每个元素是一个字典对象(对应一个参数组)，由学习率，动量等参数组成。

对于module

state_dict(destination=None, prefix='', keep_vars=False)->dict
>>> module.state_dict().keys()
['bias', 'weight']
# torch.nn.modules.module.py

class Module(object):
    """Excerpt of ``torch.nn.Module`` showing how ``state_dict`` is assembled."""

    def state_dict(self, destination=None, prefix='', keep_vars=False):
        """Collect this module's parameters and buffers, then recurse into submodules.

        Args:
            destination: mapping to fill; a new ``OrderedDict`` (with a
                ``_metadata`` attribute) is created when it is ``None``.
            prefix: string prepended to every key written by this call.
            keep_vars: when true, store the parameter/buffer object itself;
                otherwise store its ``.data``.

        Returns:
            The filled mapping, or whatever the last hook that returned a
            non-``None`` value handed back.
        """
        if destination is None:
            destination = OrderedDict()
            destination._metadata = OrderedDict()
        # prefix[:-1] drops the trailing '.' that the recursive call below appends.
        destination._metadata[prefix[:-1]] = local_metadata = dict(version=self._version)
        # params: entries that are None are skipped
        for name, param in self._parameters.items():
            if param is not None:
                destination[prefix + name] = param if keep_vars else param.data
        # buffers: entries that are None are skipped
        for name, buf in self._buffers.items():
            if buf is not None:
                destination[prefix + name] = buf if keep_vars else buf.data
        # modules: recurse with the child's name and a '.' added to the prefix
        for name, module in self._modules.items():
            if module is not None:
                module.state_dict(destination, prefix + name + '.', keep_vars=keep_vars)
        # hooks: each may return a replacement for destination
        for hook in self._state_dict_hooks.values():
            hook_result = hook(self, destination, prefix, local_metadata)
            if hook_result is not None:
                destination = hook_result
        return destination

通过_modules递归所有子模块,再通过_parameters和_buffers获得所有parameters和buffers,注意之前的parameters()等函数也是利用他们获取相应的值。而_state_dict_hooks就是在读取state_dict时希望执行的操作,一般为空，所以不做考虑。另外有一点需要注意的是，在读取Module时采用的递归的读取方式，并且名字间使用.做分割，以方便后面load_state_dict读取参数。

class MyModel(nn.Module):
    """Toy module used to inspect which attributes appear in ``state_dict()``."""

    def __init__(self):
        super().__init__()
        # Plain tensor stored directly as an attribute.
        self.my_tensor = torch.randn(1)
        # Tensor registered as a buffer.
        self.register_buffer('my_buffer', torch.randn(1))
        # Tensor wrapped as a parameter.
        self.my_param = nn.Parameter(torch.randn(1))
        self.fc = nn.Linear(2, 2, bias=False)
        self.conv = nn.Conv2d(2, 1, 1)
        self.fc2 = nn.Linear(2, 2, bias=False)
        # A second attribute name bound to the same module object as ``fc``.
        self.f3 = self.fc

    def forward(self, x):
        """Return the input unchanged."""
        return x

model = MyModel()
print(model.state_dict())
>>>OrderedDict([('my_param', tensor([-0.3052])),
                ('my_buffer', tensor([0.5583])),
                ('fc.weight', tensor([[ 0.6322, -0.0255],[-0.4747, -0.0530]])),
                ('conv.weight', tensor([[[[ 0.3346]],[[-0.2962]]]])),
                ('conv.bias', tensor([0.5205])),
                ('fc2.weight', tensor([[-0.4949,  0.2815],[ 0.3006,  0.0768]])),
                ('f3.weight', tensor([[ 0.6322, -0.0255],[-0.4747, -0.0530]]))])

对于optim

  def state_dict(self):
        r"""Returns the state of the optimizer as a :class:`dict`.

        It contains two entries:

        * state - a dict holding current optimization state. Its content
            differs between optimizer classes. Tensor keys are replaced by
            integer indices.
        * param_groups - a list with one dict per parameter group; each
            group's ``'params'`` entry holds integer indices instead of tensors.
        """
        # Save order indices instead of Tensors
        param_mappings = {}
        start_index = 0

        def pack_group(group):
            # Copy every option except 'params', then replace the tensors by
            # indices, numbering each tensor the first time it is seen.
            nonlocal start_index
            packed = {k: v for k, v in group.items() if k != 'params'}
            param_mappings.update({id(p): i for i, p in enumerate(group['params'], start_index)
                                   if id(p) not in param_mappings})
            packed['params'] = [param_mappings[id(p)] for p in group['params']]
            # The next group continues numbering after this group's params.
            start_index += len(packed['params'])
            return packed
        param_groups = [pack_group(g) for g in self.param_groups]
        # Remap state to use order indices as keys
        # (keys that are not tensors are kept unchanged)
        packed_state = {(param_mappings[id(k)] if isinstance(k, torch.Tensor) else k): v
                        for k, v in self.state.items()}
        return {
            'state': packed_state,
            'param_groups': param_groups,
        }

load_state_dict()

将参数和缓冲区从state_dict复制到这个模块及其子模块中。

load_state_dict(state_dict: Dict[str, torch.Tensor],#传入一个state_dict
                strict: bool = True)
#state_dict就是你之前保存的模型参数序列，而_load_from_state_dict中的local_state表示你的代码中定义的模型的结构。
#判断上面参数拷贝过程中是否有unexpected_keys或者missing_keys,如果有就报错，代码不能继续执行。当然，如果strict=False，则会忽略这些细节。
#missing_keys is a list of str containing the missing keys
#unexpected_keys is a list of str containing the unexpected keys

cuda与to

cuda = torch.device('cuda')  # CUDA device without an explicit index

with torch.cuda.device(1):  # the three forms below behave the same here
    # allocates a tensor on GPU 1
    a = torch.tensor([1., 2.], device=cuda)

    # transfers a tensor from CPU to GPU 1
    b = torch.tensor([1., 2.]).cuda()
    # a.device and b.device are device(type='cuda', index=1)

    # You can also use ``Tensor.to`` to transfer a tensor:
    b2 = torch.tensor([1., 2.]).to(device=cuda)
    # b.device and b2.device are device(type='cuda', index=1)

# .to(device) can target either the CPU or a GPU
# Single GPU, or CPU when CUDA is unavailable
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
# For a model, write: model.to(device)
# For a tensor, rebind the result: img = img.to(device)
# Multiple GPUs (the listed device_ids must exist on this machine)
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model, device_ids=[0, 1, 2])
model.to(device)

# .cuda() can only target GPUs
# Select one specific GPU (the variable name is CUDA_VISIBLE_DEVICES, plural)
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
model.cuda()
# Multiple GPUs
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
device_ids = [0, 1, 2, 3]
net = torch.nn.DataParallel(net, device_ids=device_ids)
# Alternative to the line above (use one or the other, not both):
#   net = torch.nn.DataParallel(net)   # defaults to every visible device
net = net.cuda()

apply(fn)

将fn递归应用于每个子模块(由.children()返回)以及自身self。典型用法包括初始化模型的参数(参见torch.nn.init()).

fn(Module -> None):将被应用到每个子模块的函数。

Returns:self Return type:Module

register_buffer

向模块添加一个buffer.

这通常用于注册不应被视为模型参数的缓冲区。例如,BatchNorm的running_mean不是参数,而是模块状态的一部分。默认情况下，缓冲区是==持久性==的，并将与参数一起保存。可以通过将persistent设置为False来更改此行为。

==持久缓冲区和非持久缓冲区之间的唯一区别是，后者不会成为该模块的state_dict的一部分。==

register_buffer(name,
                tensor,
                persistent=True)#是否为持久缓冲区

children

返回immediate 子模块的迭代器。

Yields:Module - a child module

register_forward_hook(hook)

在模块上注册一个前向钩子。

相当于插件。可以实现一些额外的功能，而又不用修改主体代码。把这些额外功能实现了挂在主代码上，所以叫钩子，很形象。

每当forward()计算output后，该钩子都会被调用。该钩子应该具有以下签名:

hook(module, input, output) -> None or modified output

输入仅包含提供给模块的positional arguments。 Keyword arguments不会传递给钩子,而只会传递给forward。钩子可以修改输出。它可以就地修改输入,但不会对前向计算产生影响,因为钩子是在调用forward()之后才被调用的。

Returns:a handle that can be used to remove the added hook by calling handle.remove()

ReturnType:torch.utils.hooks.RemovableHandle

nn.ModuleList

PyTorch 中的 ModuleList 和 Sequential: 区别和使用场景

可以把任意 nn.Module 的子类 (比如 nn.Conv2d, nn.Linear 之类的) 加到这个 list 里面，方法和 Python 自带的 list 一样，无非是 extend，append 等操作。但不同于一般的 list，加入到 nn.ModuleList 里面的 module 是会自动注册到整个网络上的，同时 module 的 parameters 也会自动添加到整个网络中。

例子1:使用 nn.ModuleList 来构建一个小型网络,包括2个全连接层

class net1(nn.Module):
    """Two 10->10 linear layers kept in an ``nn.ModuleList``."""

    def __init__(self):
        super().__init__()
        self.linears = nn.ModuleList(nn.Linear(10, 10) for _ in range(2))

    def forward(self, x):
        """Pass ``x`` through the stored layers in list order."""
        out = x
        for layer in self.linears:
            out = layer(out)
        return out

net = net1()
print(net)
# net1(
#   (linears): ModuleList(
#     (0): Linear(in_features=10, out_features=10, bias=True)
#     (1): Linear(in_features=10, out_features=10, bias=True)
#   )
# )

for param in net.parameters():
    print(type(param.data), param.size())
# <class 'torch.Tensor'> torch.Size([10, 10])
# <class 'torch.Tensor'> torch.Size([10])
# <class 'torch.Tensor'> torch.Size([10, 10])
# <class 'torch.Tensor'> torch.Size([10])

网络包含两个全连接层，他们的权重 (weights) 和偏置 (bias) 都在这个网络之内。

例子2:使用Python自带的list

class net2(nn.Module):
    """Same layers as a ModuleList version, but held in a plain Python list."""

    def __init__(self):
        super().__init__()
        # Built-in list, deliberately not an nn.ModuleList.
        self.linears = []
        for _ in range(2):
            self.linears.append(nn.Linear(10, 10))

    def forward(self, x):
        """Pass ``x`` through the listed layers in order."""
        out = x
        for layer in self.linears:
            out = layer(out)
        return out

net = net2()
print(net)
# net2()
print(list(net.parameters()))
# []

使用 Python 的 list 添加的全连接层和它们的 parameters 并没有自动注册到我们的网络中。当然，我们还是可以使用 forward 来计算输出结果。但是如果用 net2 实例化的网络进行训练的时候，因为这些层的 parameters 不在整个网络之中，所以其网络参数也不会被更新，也就是无法训练。

好,看到这里,我们大致明白了 nn.ModuleList 是干什么的了:它是一个储存不同 module,并自动将每个module的parameters添加到网络之中的容器。但是,我们需要注意到**,nn.ModuleList 并没有定义一个网络.它只是将不同的模块储存在一起,这些模块之间并没有什么先后顺序**可言,比如:

class net3(nn.Module):
    """ModuleList whose storage order differs from the order used in ``forward``."""

    def __init__(self):
        super().__init__()
        sizes = ((10, 20), (20, 30), (5, 10))
        self.linears = nn.ModuleList([nn.Linear(n_in, n_out) for n_in, n_out in sizes])

    def forward(self, x):
        """Apply the layers in index order 2, 0, 1 (5 -> 10 -> 20 -> 30 features)."""
        for idx in (2, 0, 1):
            x = self.linears[idx](x)
        return x

net = net3()
print(net)
# net3(
#   (linears): ModuleList(
#     (0): Linear(in_features=10, out_features=20, bias=True)
#     (1): Linear(in_features=20, out_features=30, bias=True)
#     (2): Linear(in_features=5, out_features=10, bias=True)
#   )
# )
input = torch.randn(32, 5)
print(net(input).shape)
# torch.Size([32, 30])

根据 net3 的结果,我们可以看出来这个 ModuleList 里面的顺序并不能决定什么,网络的执行顺序是根据 forward 函数来决定的。如果你非要 ModuleList 和 forward 中的顺序不一样,PyTorch 表示它无所谓，但以后 review 你代码的人可能会意见比较大。

我们再考虑另外一种情况,既然这个 ModuleList 可以根据序号来调用，那么一个模块是否可以在 forward 函数中被调用多次呢？答案当然是可以的，但是，被调用多次的模块，是使用同一组 parameters 的，也就是它们的参数是共享的,无论之后怎么更新。

nn.Sequential

顺序容器。模块将按照它们在构造函数中传递的顺序被添加到它。另外，也可以传入模块的有序dict。

不同于 nn.ModuleList,它已经实现了内部的 forward 函数，而且里面的模块必须是按照顺序进行排列的，所以我们必须确保前一个模块的输出大小和下一个模块的输入大小是一致的。

from collections import OrderedDict  # needed by the second example

# Example of using Sequential: children are named "0", "1", ...
model = nn.Sequential(
          nn.Conv2d(1,20,5),
          nn.ReLU(),
          nn.Conv2d(20,64,5),
          nn.ReLU()
        )

# Example of using Sequential with OrderedDict: children keep the given names
model = nn.Sequential(OrderedDict([
          ('conv1', nn.Conv2d(1,20,5)),
          ('relu1', nn.ReLU()),
          ('conv2', nn.Conv2d(20,64,5)),
          ('relu2', nn.ReLU())
        ]))

线性层

nn.Linear全连接层

torch.nn.Linear(in_features: int, out_features: int, bias: bool = True)
#在全连接层之前通过view函数将其改为一维向量

Dropout

nn.Dropout

在训练过程中,使用伯努利分布样本，以概率p随机地将输入张量中的一些元素置零。

torch.nn.Dropout(p=0.5,#元素置0的概率
                 inplace=False)#是否原地(in-place)操作
# 对所有元素中每个元素按照概率0.5置为0，对点执行

Furthermore, the outputs are scaled by a factor of $\frac{1}{1-p}$ during training. This means that during evaluation the module simply computes an identity function.

nn.Dropout2d

适用于有多个channel输出的,常用于图像处理。

torch.nn.Dropout2d(p=0.5,inplace=False)
# 对每个通道按照概率0.5置为0，对平面执行,直接将整个channel置为0.psp中dropout值设为0.1,在最后一个预测卷积之前使用的。

Conv

nn.Conv2d()卷积核是二维的

torch.nn.Conv2d(in_channels: int, 
                out_channels: int,
                kernel_size: Union[T, Tuple[T, T]], 
                stride: Union[T, Tuple[T, T]] = 1,
                padding: Union[T, Tuple[T, T]] = 0, 
                dilation: Union[T, Tuple[T, T]] = 1, 
                groups: int = 1, 
                bias: bool = True,#是否要添加偏置参数作为可学习参数的一个，默认为True。
                padding_mode: str = 'zeros')

教程：https://www.jianshu.com/p/45a26d278473

接受$(N,C_{in},H,W)$,输出$(N,C_{out},H_{out},W_{out}$)

卷积核的规模就是kernel_size x input_channel x output_channel

$out(N_i,C_{out_j})=bias(C_{out_j})+\sum_{k=0}^{C_{in}-1}weight(C_{out_j},k)\star input(N_i,k)$

对于depthwise conv,groups这个参数是关键。当$groups=1$时,说明所有通道为一组;当$groups=in\_channels$时,说明分了$in\_channels$个组，即每个通道一组。然后分别对其卷积，每组的输出通道数为k。最后再将每组的输出串联，最后通道数为$in\_channels*k$。

要实现depthwise conv,就将groups设为in_channels,同时out_channels也设为与in_channels相同。

Variables

~Conv2d.weight(Tensor):维度为(out_channels,in_channels/groups,kernel_size[0],kernel_size[1])，权重值取样于$u(-\sqrt k,\sqrt k)$,$k=\frac{groups}{C_{in}\cdot\prod_{i=0}^{1}kernel\_size[i]}$

~Conv2d.bias(Tensor):维度为(out_channels)，值取样于$u(-\sqrt k,\sqrt k)$,$k=\frac{groups}{C_{in}\cdot\prod_{i=0}^{1}kernel\_size[i]}$

nn.ConvTranspose2d

在由几个输入平面组成的输入图像上应用一个二维转置卷积运算符。

torch.nn.ConvTranspose2d(in_channels: int,
                         out_channels: int,
                         kernel_size: Union[T, Tuple[T, T]],
                         stride: Union[T, Tuple[T, T]] = 1,
                         padding: Union[T, Tuple[T, T]] = 0,
                         output_padding: Union[T, Tuple[T, T]] = 0,
                         groups: int = 1, bias: bool = True,
                         dilation: int = 1,
                         padding_mode: str = 'zeros')

池化层

Pytorch池化操作的步长默认与池化卷积核的大小一样，池化一般不考虑overlap

nn.MaxPool2d()

torch.nn.MaxPool2d(kernel_size: Union[T, Tuple[T, ...]], 
                   stride: Optional[Union[T, Tuple[T, ...]]] = None, #default value是kernel_size
                   padding: Union[T, Tuple[T, ...]] = 0, 
                   dilation: Union[T, Tuple[T, ...]] = 1, 
                   return_indices: bool = False, #如果等于True，会返回输出最大值的序号，对于上采样操作会有帮助
                   ceil_mode: bool = False)#如果true，向上取整而不是floor向下取整

nn.AvgPool2d()

torch.nn.AvgPool2d(kernel_size,
                   stride=None,
                   padding=0, 
                   ceil_mode=False,#when True, will use ceil instead of floor to compute the output shape
                   count_include_pad=True,
                   divisor_override=None)

torch.nn.AdaptiveMaxPool2d(output_size, return_indices=False)

特殊性在于，输出张量的大小都是给定的output_size，可以利用这个实现全局最大池化，将output_size设为(1,1)(全局平均池化则对应使用AdaptiveAvgPool2d)。

>>> # target output size of 5x7
>>> m = nn.AdaptiveMaxPool2d((5,7))
>>> input = torch.randn(1, 64, 8, 9)
>>> output = m(input)
>>> output.size()
torch.Size([1, 64, 5, 7])

>>> # target output size of 7x7 (square)
>>> m = nn.AdaptiveMaxPool2d(7)
>>> input = torch.randn(1, 64, 10, 9)
>>> output = m(input)
>>> output.size()
torch.Size([1, 64, 7, 7])

自适应池化详解1

自适应池化详解2

Padding Layers

torch.nn.ReflectionPad2d(padding)

>>> m = nn.ReflectionPad2d(2)
>>> input = torch.arange(9, dtype=torch.float).reshape(1, 1, 3, 3)
>>> input
tensor([[[[0., 1., 2.],
          [3., 4., 5.],
          [6., 7., 8.]]]])
>>> m(input)
tensor([[[[8., 7., 6., 7., 8., 7., 6.],
          [5., 4., 3., 4., 5., 4., 3.],
          [2., 1., 0., 1., 2., 1., 0.],
          [5., 4., 3., 4., 5., 4., 3.],
          [8., 7., 6., 7., 8., 7., 6.],
          [5., 4., 3., 4., 5., 4., 3.],
          [2., 1., 0., 1., 2., 1., 0.]]]])

归一化层

nn.BatchNorm2d()

torch.nn.BatchNorm2d(num_features,#Channel数
                    eps=1e-05,#为数值稳定性而加在分母上的值。
                    momentum=0.1,#指数加权平均的参数
                    affine=True, #是否有可学习参数
                    track_running_stats=True)#这一层不用到测试

nn.SyncBatchNorm

nn.LayerNorm

torch.nn.LayerNorm(normalized_shape,
                   eps=1e-05,
                   elementwise_affine=True, device=None, dtype=None)
# NLP和CV的应用是不同的
>>> # NLP Example
>>> batch, sentence_length, embedding_dim = 20, 5, 10
>>> embedding = torch.randn(batch, sentence_length, embedding_dim)
>>> layer_norm = nn.LayerNorm(embedding_dim)
>>> # Activate module
>>> layer_norm(embedding)
>>>
>>> # Image Example
>>> N, C, H, W = 20, 5, 10, 10
>>> input = torch.randn(N, C, H, W)
>>> # Normalize over the last three dimensions (i.e. the channel and spatial dimensions)
>>> # as shown in the image below
>>> layer_norm = nn.LayerNorm([C, H, W])
>>> output = layer_norm(input)

非线性激活函数

nn.ReLU()

torch.nn.ReLU(inplace: bool = False)#inplace-选择是否进行覆盖运算 x=x+1 还是 y=x+1 x=y 节省内存

nn.Softmax(dim=None)

对指定维度应用Softmax

input = torch.Tensor([[1,2,3], [4,5,6], [7,8,9]])
m0 = nn.Softmax(dim=0)
m1 = nn.Softmax(dim=1)
output0 = m0(input)
output1 = m1(input)

print("input: ", input)
print("output0: ", output0)
print("output1: ", output1)
'''
input:  tensor([[1., 2., 3.],
        [4., 5., 6.],
        [7., 8., 9.]])
dim=0,就是归一化的元素的维度1都相同,只遍历维度0
output0:  tensor([[0.0024, 0.0024, 0.0024],
        [0.0473, 0.0473, 0.0473],
        [0.9503, 0.9503, 0.9503]])
output1:  tensor([[0.0900, 0.2447, 0.6652],
        [0.0900, 0.2447, 0.6652],
        [0.0900, 0.2447, 0.6652]])'''

nn.Softmax2d()

输入为(N,C,H,W),输出为(N,C,H,W) 就是你想的那样

nn.LogSoftmax(dim=None)
$$
LogSoftmax(x_i)=log(\frac{exp(x_i)}{\sum_jexp(x_j)})
$$

距离函数

nn.CosineSimilarity
$$
similarity=\frac{x_1\cdot x_2}{max(||x_1||_2\cdot||x_2||_2,eps)}
$$

torch.nn.CosineSimilarity(dim=1,# 计算余弦相似性的维度
                          eps=1e-08)# 避免除0的小数

Input1:(*,D,*),D是要计算的维度

Input2:(*,D,*)

Output:(*,*)

损失函数

常见损失函数总结,讲得好啊

nn.BCELoss

计算二元交叉熵

$l(x,y)=L=\{l_1,\cdots,l_N\}^T,\ l_n=-w_n[y_n\cdot \log x_n+(1-y_n)\cdot \log(1-x_n)]$

我们的解决方案是BCELoss将其对数函数输出固定为大于或等于-100。这样，我们总是可以有一个有限的损失值和一个线性的反向方法。

torch.nn.BCELoss(weight: Optional[torch.Tensor] = None,#手动给的权重
                 size_average=None,#
                 reduce=None,
                 reduction: str = 'mean')
#指定要应用于输出的reduction操作:' none ' | 'mean' | ' sum '。none输出向量,其他输出标量
#“none”:表示不进行任何reduction，“mean”:输出的和除以输出中的元素数，即求平均值，“sum”:输出求和。
#注意:size_average和reduce正在被弃用，与此同时，指定这两个arg中的任何一个都将覆盖reduction参数。默认值:“mean”

nn.BCEWithLogitsLoss

这种损失结合了Sigmoid层和BCELoss在一个类里。这个版本通过将操作合并到一个层比使用一个简单的Sigmoid后面跟着一个BCELoss在数值上更稳定，我们利用log-sum-exp技巧的数值稳定性。

$l(x,y)=L=\{l_1,\cdots,l_N\}^T,\ l_n=-w_n[y_n\cdot \log\sigma(x_n)+(1-y_n)\cdot \log(1-\sigma(x_n))]$

torch.nn.BCEWithLogitsLoss(weight: Optional[torch.Tensor] = None,
                           size_average=None,
                           reduce=None,
                           reduction: str = 'mean',
                           pos_weight: Optional[torch.Tensor] = None)
#正值例子的权重，必须是有着与分类数目相同的长度的向量.可以通过增加正值示例的权重来权衡召回率和准确性。

nn.NLLLoss

负对数似然损失。用C类来训练分类问题是有用的。

==负对数似然损失函数，就是对似然函数先取log再取负==

如果提供了可选参数weight，它应该是一个一维张量，为每一类赋权。当你有一个不平衡的训练集时，这是特别有用的。

$l(x,y)=L=\{l_1,\cdots,l_N\}^T,\ l_n=-w_{y_n}x_{n,y_n},\ w_c=weight[c]\cdot 1\{c\ne ignore\_index\}$只选择第n个数据的实际$y_n$类别作为loss

torch.nn.NLLLoss(weight: Optional[torch.Tensor] = None,
                 # 一个手动标给每个类别的权重，如果给定，必须是一个C大小张量,否则默认所有的权重全是1
                 size_average=None,
                 ignore_index: int = -100,#指定一个被忽略的目标值，该目标值不影响输入梯度。
                 # 当size_average为真时，对非忽略目标的损失进行平均。
                 reduce=None,
                 # 默认情况下为True，根据size_average，通过每个小批的观察来对损失进行平均或求和。
                 # 当reduce为False时，返回每个批处理元素的损失值，并忽略size_average
                 reduction: str = 'mean')

==也就是说没有特殊处理的情况下，返回的是每个mini-bacth的平均cross-entropy loss，因为loss就是一个标量嘛，再之后还需要对所有的mini-batch的cross-entropy loss求平均。==

输入(N,C), C代表类别的数量；或者在计算高维损失函数例子中输入大小为(N,C,d1,d2,...,dK),k>=1

#低维示例:
m = nn.LogSoftmax(dim=1)
loss = nn.NLLLoss()
# input is of size N x C = 3 x 5
input = torch.randn(3,5,requires_grad=True)
input
tensor([[-0.8676,  1.5017,  0.2963, -0.9431, -0.0929],
        [ 0.3540,  1.0994, -1.1085, -0.4001,  0.0102],
        [ 1.3653, -0.3828,  0.6257, -2.4996,  0.1928]], requires_grad=True)
m(input)
tensor([[-2.8899, -0.5205, -1.7259, -2.9653, -2.1152],#1->0.5205
        [-1.5082, -0.7628, -2.9707, -2.2623, -1.8520],#0->1.5082
        [-0.6841, -2.4323, -1.4237, -4.5490, -1.8566]],#4->1.8566
       grad_fn=<LogSoftmaxBackward>)
#each element in target has to have 0 <= value < C
target = torch.tensor([1,0,4])
output = loss(m(input), target)
output
tensor(1.2951, grad_fn=<NllLossBackward>)
#高维示例:就是逐像素返回呗,对每一个像素来说都是一个低维示例
# 2D loss example (used, for example, with image inputs)
N, C = 5,4
loss = nn.NLLLoss()
# input is of size N x channel x height x width
data = torch.randn(N, 16, 10, 10)
conv = nn.Conv2d(16, C, (3, 3))#输出为5*4*8*8
m = nn.LogSoftmax(dim=1)
# each element in target has to have 0 <= value < C
target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C) #target.size()=torch.Size([5, 8, 8])
output = loss(m(conv(data)), target)
output
tensor(1.5501, grad_fn=<NllLoss2DBackward>)

nn.CrossEntropyLoss

将nn.LogSoftmax()和nn.NLLLoss()方法结合到一个类中

当用C类训练分类问题时，它是有用的。如果提供了，可选的参数weight权重应该是一个一维张量，为每个类分配权重。当你有一个不平衡的训练集时，这是特别有用的。

损失函数:
$$
loss(x,class)=-\log\left(\frac{\exp(x[class])}{\sum_j\exp(x[j])}\right)=-x[class]+\log\left(\sum_j\exp(x[j])\right)
$$
加上weight:
$$
loss(x,class)=weight[class]\left(-x[class]+\log\left(\sum_j\exp(x[j])\right)\right)
$$
对于ignore_index:

该ignore_index标签的样本损失不考虑

import torch
import torch.nn.functional as F
# 假设两类{0:背景，1：前景}
pred = torch.Tensor(
    [
        [0.9, 0.1],
        [0.8, 0.2],
        [0.7, 0.3]
    ]
)  # shape=(N,C)=(3,2)，N为样本数，C为类数
label = torch.LongTensor([1, 0, 1])  # shape=(N)=(3)，3个样本的label分别为1，0，1
out = F.cross_entropy(pred, label, ignore_index=0)  # 忽略0类
print(out)
# tensor(1.0421)

$$
loss=\frac{1}{2}\left\{\left[-0.1+\ln(e^{0.9}+e^{0.1})\right]+\left[-0.3+\ln(e^{0.7}+e^{0.3})\right]\right\}=1.0421
$$

x选的只是class那个

取平均是对每个batch维度取平均

reduction=none的话，说明minibatch之间不做平均，就是个向量，否则就是个标量；

其实整个思路都是明白的，就看如何返回了，上面取得了形式的一致性；

$(N,d_1,d_2,…,d_K)$的逐元素损失是如何变成一个标量的：reduction='mean'时对所有未被忽略的元素(包括N和后面的各个维度)一起求平均，reduction='sum'时对所有元素求和。

视觉层

nn.Upsample

torch.nn.Upsample(size: Optional[Union[T, Tuple[T, ...]]] = None,#指定目标输出的大小
                  scale_factor: Optional[Union[T, Tuple[T, ...]]] = None,#输出为输入的倍数，和size只能指定一个
                  mode: str = 'nearest',#上采样算法，包括最近邻、线性、双线性、双三次、三线性插值算法
                  align_corners: Optional[bool] = None)#如果为True，输入的角像素将与输出张量对齐，
#因此将保存下来这些像素的值。仅当使用的算法为'linear', 'bilinear'or 'trilinear'时可以使用。默认设置为False
#语义分割设置为true
tor

数据并行层

nn.DataParallel

在module level实现数据并行.

该容器在batch维度上拆分,将输入分到指定的设备上,以此实现并行化(其他object每个设备都复制一次)。

前向传播过程中,module在每个设备上,每个副本处理一部分输入;

反向传播时,每个副本的梯度被累加到原始模块中。

batch size应该大于使用的GPU数量。

推荐使用DistributedDataParallel

pytorch多GPU并行训练

我一般在使用多GPU的时候, 会喜欢使用os.environ['CUDA_VISIBLE_DEVICES']来限制使用的GPU个数, 例如我要使用第0和第3编号的GPU, 那么只需要在程序中设置:

os.environ['CUDA_VISIBLE_DEVICES'] = '0,3'

模型加载到多GPU:

model = nn.DataParallel(model)
model = model.cuda()

对于数据:

inputs = inputs.cuda()
labels = labels.cuda()

如果我们不设定好要使用的device_ids的话, 程序会自动找到这个机器上面可以用的所有的显卡, 然后用于训练.

但是因为我们前面使用os.environ['CUDA_VISIBLE_DEVICES']限定了这个程序可以使用的显卡, 所以这个地方程序如果自己获取的话, 获取到的其实就是我们上面设定的那几个显卡.

我没有进行深入的考究, 但是我感觉使用os.environ['CUDA_VISIBLE_DEVICES']对可以使用的显卡进行限定之后, 显卡的实际编号和程序看到的编号应该是不一样的, 例如上面我们设定的是os.environ['CUDA_VISIBLE_DEVICES']='0,3', 但是程序看到的显卡编号应该被改成了'0,1', 也就是说程序所使用的显卡编号实际上是经过了一次映射之后才会映射到真正的显卡编号上面的, 例如这里的程序看到的1对应实际的3

torch.nn.parallel.DistributedDataParallel

在module level 实现基于torch.distributed包的分布式数据并行。这个容器通过在批处理维度上的分块，在指定的设备上分割输入，使给定模块的应用并行化。该模块被复制到每台机器和每个设备上，每个这样的副本处理一部分输入。在反向传播过程中，每个节点的梯度被平均化。

batch_size应该大于本地使用的GPU数量。

创建这个类需要调用torch.distributed.init_process_group()使得torch.distributed已经被初始化了。

事实证明,在单节点多GPU的数据并行训练中,DistributedDataParallel明显比torch.nn.DataParallel快。

要在有N个GPU的主机上使用DistributedDataParallel，你应该启动(spawn)N个进程，确保每个进程只在0到N-1中的单个GPU上工作。这可以通过为每个进程设置CUDA_VISIBLE_DEVICES或通过调用torch.cuda.set_device(i)来实现,这里的i是从0到N-1。在每个进程中，你应该参考以下内容来构建这个模块：

torch.distributed.init_process_group(
    backend='nccl', world_size=N, init_method='...'
)
model = DistributedDataParallel(model, device_ids=[i], output_device=i)

torch.nn.parallel.DistributedDataParallel(module,
                                          device_ids=None,# 
                                          output_device=None,
                                          dim=0,
                                          broadcast_buffers=True,#设置为True时，在模型执行forward之前，gpu0会把buffer中的参数值全部覆盖
# 到别的gpu上。注意这和同步BN并不一样，同步BN应该使用SyncBatchNorm。
                                          process_group=None,#用于分布式数据
                                          bucket_cap_mb=25, 
                                          find_unused_parameters=False,# 如果模型的输出有不需要进行反传的(比如部分参数被冻结/
                                          # 或者网络前传是动态的)，设置此参数为True;如果你的代码运行后卡住某个地方不动，基本上就是该参数的问题。
                                          check_reduction=False, 
                                          gradient_as_bucket_view=False, 
                                          static_graph=False)

torch.nn.utils

nn.utils.clip_grad_value_

深度炼丹之梯度裁剪

backward之后,step之前

torch.nn.utils.clip_grad_value_(parameters: Union[torch.Tensor,Iterable[torch.Tensor]],
                                 #将梯度归一化的张量的可迭代或单个张量 net.parameters()
                                clip_value: float) → None#梯度被裁剪到[-clip_value,clip_value]
eg:nn.utils.clip_grad_value_(net.parameters(), 0.1)

nn.utils.clip_grad_norm_

这个函数的主要目的是对$parameters$里的所有参数的梯度进行规范化。

torch.nn.utils.clip_grad_norm_(parameters,# 要规范的参数
                               max_norm,# 所有参数的梯度的范数的上界
                               norm_type=2.0,# 范数的类型,"inf"为无穷范数
                               error_if_nonfinite=False)

设$parameters$里所有参数的梯度的范数为$total_norm$，
若$max_norm>total_norm$, $parameters$里面的参数的梯度不做改变;
若$max_norm<total_norm$, $parameters$里面的参数的梯度都要乘以一个系数$clip_coef$:
$$
clip_coef=max_norm/(total_norm+10^{-6})
$$

自定义模块

class DoubleConv(nn.Module):
    """Two consecutive Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU stages.

    ``mid_channels`` is the channel count between the two stages; when it is
    falsy (``None`` or 0), ``out_channels`` is used instead.
    """

    def __init__(self, in_channels, out_channels, mid_channels=None):
        super().__init__()
        hidden = mid_channels if mid_channels else out_channels
        stages = []
        for c_in, c_out in ((in_channels, hidden), (hidden, out_channels)):
            stages += [
                nn.Conv2d(c_in, c_out, kernel_size=3, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(inplace=True),
            ]
        self.double_conv = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through both stages and return the result."""
        return self.double_conv(x)

forward函数

定义每次调用时执行的计算。应该被所有子类重写。

#主要是使用了__call__特殊方法,使得forward被自动调用。
class A:
    """Minimal stand-in showing how ``__call__`` forwards to ``forward``."""

    def __init__(self, init_age):
        super().__init__()
        self.age = init_age
        print('我年龄是:', self.age)

    def __call__(self, added_age):
        # Calling the instance is the same as calling forward() on it.
        return self.forward(added_age)

    def forward(self, input_):
        """Announce the call, then return ``input_`` plus the stored age."""
        print('forward 函数被调用了')
        total = input_ + self.age
        return total


print('对象初始化。。。。')
a = A(10)#初始化
input_param = a(2)#__call__起作用
print("我现在的年龄是：", input_param)
#对象初始化。。。。
#我年龄是: 10
#forward 函数被调用了
#我现在的年龄是： 12

关于 __call__ 方法，不得不先提到一个概念，就是可调用对象(callable)，我们平时自定义的函数、内置函数和类都属于可调用对象，但凡是可以把一对括号()应用到某个对象身上都可称之为可调用对象，判断对象是否为可调用对象可以用函数 callable。如果在类中实现了 __call__ 方法，那么实例对象也将成为一个可调用对象。

利用这种特性，可以实现基于类的装饰器。

class Counter:
    """Class-based decorator that counts calls to the wrapped function.

    Attributes:
        func: the wrapped callable.
        count: number of times the instance has been called.

    The wrapped callable's ``__name__``, ``__doc__`` and similar metadata are
    copied onto the instance, and the original is reachable as ``__wrapped__``.
    """

    def __init__(self, func):
        import functools  # local import keeps the snippet self-contained

        # Copy the metadata first, so the attributes assigned below cannot be
        # overwritten by entries from func.__dict__.
        functools.update_wrapper(self, func)
        self.func = func
        self.count = 0

    def __call__(self, *args, **kwargs):
        """Increment the counter, then delegate to the wrapped callable."""
        self.count += 1
        return self.func(*args, **kwargs)

@Counter
def foo():
    pass

for i in range(10):
    foo()

print(foo.count)

pytorch结构介绍

forward函数使用的具体流程:

调用module的call方法
module的call里面调用module的forward方法
forward里面如果碰到Module的子类，回到第1步，如果碰到的是Function的子类，继续往下
调用Function的call方法
Function的call方法调用了Function的forward方法。
Function的forward返回值
module的forward返回值
在module的call进行forward_hook操作，然后返回值。

torch.hub

Pytorch Hub is a pre-trained model repository designed to facilitate(促进) research reproducibility.

torch.nn.init

Pytorch 默认参数初始化

pytorch中的各种参数层(Linear、Conv2d、BatchNorm等)在__init__方法中定义后，不需要手动初始化就可以直接使用，这是因为Pytorch对这些层都会进行默认初始化。但是有时候我们需要自定义参数的初始化，就需要用到torch.nn.init。具体的不同初始化，可以查看pytorch官方文档。

初始化函数

初始化方式

torch.nn.init.normal_(tensor, mean=0.0, std=1.0)
#从正态分布N(mean,std^2)中取值填充张量
#eg:w=torch.empty(3,5)
#nn.init.normal_(w)

kaiming_uniform_

Also known as He initialization.

def kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):
    """Fill ``tensor`` in place from U(-bound, bound) and return it.

    ``bound`` is ``sqrt(3) * gain / sqrt(fan)``, where ``gain`` comes from
    ``calculate_gain(nonlinearity, a)`` and ``fan`` is selected by ``mode``.
    A tensor with any zero-sized dimension is returned untouched after a warning.
    """
    if 0 not in tensor.shape:
        fan = _calculate_correct_fan(tensor, mode)
        # ``a`` is passed to calculate_gain as the nonlinearity's parameter.
        gain = calculate_gain(nonlinearity, a)
        std = gain / math.sqrt(fan)
        # Limit of the uniform range, derived from the standard deviation.
        limit = math.sqrt(3.0) * std
        with torch.no_grad():
            return tensor.uniform_(-limit, limit)

    warnings.warn("Initializing zero-element tensors is a no-op")
    return tensor

def _calculate_correct_fan(tensor, mode):
    """Return fan_in or fan_out of ``tensor`` according to ``mode`` (case-insensitive).

    Raises:
        ValueError: if ``mode`` is neither 'fan_in' nor 'fan_out'.
    """
    mode = mode.lower()
    valid_modes = ['fan_in', 'fan_out']
    if mode in valid_modes:
        fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
        return {'fan_in': fan_in, 'fan_out': fan_out}[mode]
    raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes))

def _calculate_fan_in_and_fan_out(tensor):
    """Return ``(fan_in, fan_out)`` for a weight tensor.

    Dimension 0 is counted as output feature maps and dimension 1 as input
    feature maps; both are multiplied by the product of all remaining dimensions.

    Raises:
        ValueError: if the tensor has fewer than 2 dimensions.
    """
    ndim = tensor.dim()
    if ndim < 2:
        raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions")

    # Product of every dimension after the first two (stays 1 for a 2-D tensor).
    kernel_elems = 1
    for axis in range(2, ndim):
        kernel_elems *= tensor.size(axis)

    out_maps, in_maps = tensor.size(0), tensor.size(1)
    return in_maps * kernel_elems, out_maps * kernel_elems

Xavier在tanh中表现的很好,但在Relu激活函数中表现的很差,所以何凯明提出了针对于relu的初始化方法。

kaiming_uniform_从$u(-bound,bound)$中采样值填充张量,$bound=gain\cdot \sqrt{3/fan_{mode}}=\sqrt{\frac{6}{(1+a^2)fan_in}}$

$f_out=out_channels×kernel_size^2,f_in=in_channels×kernel_size^2$

$fan_in$可以保持前向传播的权重方差的数量级,$fan_out$可以保持反向传播的权重方差的数量级。

理论分析

calculate_gain(nonlinearity, param=None)

输入激活函数的名字,返回对应的$gain$值,增益值$gain$是一个比例,来调控输入数量级和输出数量级之间的关系。

gain = nn.init.calculate_gain('leaky_relu', 0.2)  # leaky_relu with

$nonlinearity\to gain$:

$Linear/identity=1,Conv{1,2,3}D=1,Sigmoid=1,tanh=5/3,Relu=\sqrt{2}$

$Leaky Relu=\sqrt{2/(1+negative_slope^2)},SELU=3/4$

kaiming_normal_

def kaiming_normal_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):
    """Fill ``tensor`` in place from N(0, std^2) and return it.

    ``std`` is ``gain / sqrt(fan)``, where ``gain`` comes from
    ``calculate_gain(nonlinearity, a)`` and ``fan`` is selected by ``mode``.
    A tensor with any zero-sized dimension is returned untouched after a warning.
    """
    if 0 not in tensor.shape:
        fan = _calculate_correct_fan(tensor, mode)
        gain = calculate_gain(nonlinearity, a)
        sigma = gain / math.sqrt(fan)
        with torch.no_grad():
            return tensor.normal_(0, sigma)

    warnings.warn("Initializing zero-element tensors is a no-op")
    return tensor

从$N(0,std^2)$中采样值填充张量,$std=\frac{gain}{\sqrt{fan_{mode}}}$。

参数层初始化

Linear

def __init__(self, in_features: int, out_features: int, bias: bool = True,
                 device=None, dtype=None) -> None:
    """Allocate the layer's weight (and optional bias) and initialise them.

    Args:
        in_features: size of the second weight dimension.
        out_features: size of the first weight dimension and of the bias.
        bias: when False, ``bias`` is registered as ``None`` instead of
            being allocated.
        device: forwarded to ``torch.empty`` for every parameter.
        dtype: forwarded to ``torch.empty`` for every parameter.
    """
    # Shared keyword arguments for every tensor allocated below.
    factory_kwargs = {'device': device, 'dtype': dtype}
    super(Linear, self).__init__()
    self.in_features = in_features
    self.out_features = out_features
    # Weight is stored as (out_features, in_features); torch.empty leaves it
    # uninitialised until reset_parameters() runs.
    self.weight = Parameter(torch.empty((out_features, in_features), **factory_kwargs))
    if bias:
        self.bias = Parameter(torch.empty(out_features, **factory_kwargs))
    else:
        # Keep the attribute name defined even when there is no bias.
        self.register_parameter('bias', None)
    # Fill the freshly allocated parameters with their initial values.
    self.reset_parameters()

def reset_parameters(self) -> None:
    """Re-initialise ``self.weight`` and, when present, ``self.bias`` in place.

    The weight is filled by ``kaiming_uniform_`` with ``a = sqrt(5)``. The
    bias is drawn from U(-limit, limit) with ``limit = 1 / sqrt(fan_in)``,
    or 0 when ``fan_in`` is not positive.
    """
    init.kaiming_uniform_(self.weight, a=math.sqrt(5))
    if self.bias is None:
        return

    fan_in = init._calculate_fan_in_and_fan_out(self.weight)[0]
    # Guard the division: a non-positive fan_in yields a zero-width range.
    limit = 0 if fan_in <= 0 else 1 / math.sqrt(fan_in)
    init.uniform_(self.bias, -limit, limit)

$a=\sqrt{5}$,所以$gain=\sqrt{2/(1+a^2)}=\sqrt{1/3}$,$bound=gain\cdot\sqrt{3/fan_{in}}=\sqrt{1/fan_{in}}$,所以$w,b\sim U(-bound,bound)$

Conv

比如一个输入channel为3,输出channel为64,kernel size=3的卷积层,其权值即为一个64×3×3×3(out_channels×in_channels×kH×kW)的张量,它会这样进行初始化：

def reset_parameters(self):
    """Re-initialise ``self.weight`` and, when present, ``self.bias`` in place.

    The weight is filled by ``kaiming_uniform_`` with ``a = sqrt(5)``. The
    bias is drawn from U(-bound, bound) with ``bound = 1 / sqrt(fan_in)``.
    """
    init.kaiming_uniform_(self.weight, a=math.sqrt(5))
    if self.bias is not None:
        fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
        # A weight with a zero-sized input dimension has fan_in == 0; guard
        # the division (as the Linear layer does) instead of raising
        # ZeroDivisionError.
        bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
        init.uniform_(self.bias, -bound, bound)

BatchNorm

def reset_parameters(self):
    """Reset the running statistics and, for affine layers, weight and bias.

    When ``self.affine`` is true the weight is re-drawn by ``init.uniform_``
    with its default range and the bias is zeroed; otherwise only the running
    statistics are reset.
    """
    self.reset_running_stats()
    if not self.affine:
        return
    init.uniform_(self.weight)
    init.zeros_(self.bias)

$w\sim U(0,1),bias=0$

网络初始化

在各种内置的网络模型中,初始化的方法也有不同。

ResNet

resnet在定义各层之后，pytorch官方代码的__init__方法会对不同的层进行手动的初始化.

for m in self.modules():
    if isinstance(m, nn.Conv2d):
        # 卷积层使用kaiming_normal_
        nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
    elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
        # BatchNorm、GroupNorm使用常数1和0
        nn.init.constant_(m.weight, 1)
        nn.init.constant_(m.bias, 0)

VGG

VGG的pytorch官方初始化如下:

def _initialize_weights(self):
    """Initialise every Conv2d, BatchNorm2d and Linear sub-module in place.

    * Conv2d: ``kaiming_normal_`` weight (fan_out, relu); bias set to 0 if present.
    * BatchNorm2d: weight set to 1, bias set to 0.
    * Linear: weight drawn from a normal with mean 0 and std 0.01; bias set to 0.

    Modules of any other type are left untouched.
    """
    for layer in self.modules():
        if isinstance(layer, nn.Conv2d):
            nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
            if layer.bias is not None:
                nn.init.constant_(layer.bias, 0)
            continue
        if isinstance(layer, nn.BatchNorm2d):
            nn.init.constant_(layer.weight, 1)
            nn.init.constant_(layer.bias, 0)
            continue
        if isinstance(layer, nn.Linear):
            nn.init.normal_(layer.weight, 0, 0.01)
            nn.init.constant_(layer.bias, 0)

设置随机数种子

torch.manual_seed(seed) → torch._C.Generator

在神经网络中，参数默认是进行随机初始化的。如果不设置随机数种子的话每次训练时的初始化都是随机的，导致结果不确定。如果设置了随机数种子，则每次初始化都是固定的。pytorch默认使用何恺明的初始化

if args.seed is not None:
    random.seed(args.seed)  # needed when Python's built-in `random` is used
    np.random.seed(args.seed)  # needed when `np.random` is used
    torch.manual_seed(args.seed)  # seed the CPU random number generator
    torch.cuda.manual_seed(args.seed)  # seed the current GPU
    torch.cuda.manual_seed_all(args.seed)  # seed every GPU (multi-GPU runs)
    cudnn.deterministic = True  # restrict cuDNN to deterministic convolution algorithms
    # Benchmark mode may pick a different algorithm on each run, which defeats
    # the purpose of fixing the seed, so it is switched off here.
    cudnn.benchmark = False
    # With the seeds fixed and deterministic cuDNN algorithms selected, the
    # same input should give the same output on every run.

torch.nn.functional

nn与nn.functional的区别

interpolate

下/上采样输入到给定的大小或给定的 scale_factor。

torch.nn.functional.interpolate(input,
                                size=None,
                                scale_factor=None,
                                mode='nearest',
                                align_corners=None,# 
                                recompute_scale_factor=None)

upsample

将输入上采样到给定的大小或给定的scale_factor

torch.nn.functional.upsample(input,#输入张量
                             size=None,#输出大小
                             scale_factor=None,#变换因子
                             mode='nearest',#模式
                             align_corners=None)

avg_pool2d

torch.nn.functional.avg_pool2d(input,#操作的Tensor
                               kernel_size, stride=None, padding=0, ceil_mode=False, count_include_pad=True, divisor_override=None) → Tensor

adaptive_max_pool2d

torch.nn.functional.adaptive_max_pool2d(*args, **kwargs)

在由几个输入平面组成的输入信号上应用2D自适应最大池化。

cross_entropy

See CrossEntropyLoss

grid_sample

grid_sample(input, grid, mode='bilinear', padding_mode='zeros', align_corners=None)
# input:input参数是输入特征图tensor，也就是特征图，可以是四维或者五维张量
# 以四维形式为例(N,C,Hin,Win)，N可以理解为Batch_size,C可以理解为通道数，Hin和Win也就是特征图高和宽。
# grid:包含输出特征图的格网大小以及每个格网对应到输入特征图的采样点位，对应四维input，其张量形式为(N,Hout,Wout,2)，其中最后一维大小必须为2
# 如果输入为五维张量，那么最后一维大小必须为3。为什么最后一维必须为2或者3？因为grid的最后一个维度实际上代表一个坐标(x,y)或者(x,y,z)，
# 对应到输入特征图的二维或三维特征图的坐标维度，x、y取值范围一般为[-1,1]，该范围映射到输入特征图的全图。
# mode:选择采样方法，有三种内插算法可选，分别是'bilinear'双线性插值、'nearest'最邻近插值、'bicubic'双三次插值.
# padding_mode:为填充模式，即当(x,y)取值超过输入特征图采样范围，返回一个特定值，有'zeros' 、 'border' 、 'reflection'三种可选，一般用'zeros'。
# align_corners:为bool类型，指设定grid坐标与像素的对应方式：为True时，-1和1对应输入角点像素的中心；为False时，-1和1对应输入角点像素的外侧边角。

经典的应用:pytorch光流函数warp

关键点:图片的坐标加上光流即为在输出点的坐标

def warp(x, flo):
    """
    Warp an image/tensor (im2) back to im1 according to the optical flow.

    Args:
        x: [B, C, H, W] tensor (im2).
        flo: [B, 2, H, W] flow in pixels. Channel 0 is added to the x
            (column) coordinate and channel 1 to the y (row) coordinate.

    Returns:
        [B, C, H, W] tensor on ``flo``'s device. Positions whose sampling
        location is not fully inside ``x`` are set to 0.
    """
    B, C, H, W = x.size()

    # Work on the device the flow lives on rather than forcing CUDA, so the
    # function also runs on CPU-only machines.
    device = flo.device
    x = x.to(device)

    # Pixel-coordinate mesh grid: channel 0 holds x (column) indices and
    # channel 1 holds y (row) indices, matching the flow layout.
    xx = torch.arange(0, W, device=device).view(1, -1).repeat(H, 1)
    yy = torch.arange(0, H, device=device).view(-1, 1).repeat(1, W)
    xx = xx.view(1, 1, H, W).repeat(B, 1, 1, 1)
    yy = yy.view(1, 1, H, W).repeat(B, 1, 1, 1)
    grid = torch.cat((xx, yy), 1).float()

    # Each pixel coordinate plus its flow is the location to sample from in x.
    vgrid = grid + flo  # B,2,H,W

    # Scale the sampling grid to [-1, 1]: 0..W-1 -> 0..1 -> 0..2 -> -1..1.
    # max(..., 1) avoids dividing by zero for a one-pixel-wide/high input.
    grid_x = 2.0 * vgrid[:, 0, :, :] / max(W - 1, 1) - 1.0  # horizontal component
    grid_y = 2.0 * vgrid[:, 1, :, :] / max(H - 1, 1) - 1.0  # vertical component

    # grid_sample expects the coordinate pair in the last dimension: B,H,W,2.
    vgrid = torch.stack((grid_x, grid_y), dim=3)

    output = nn.functional.grid_sample(x, vgrid, align_corners=True)

    # Sampling an all-ones image gives < 1 wherever part of the interpolation
    # footprint falls outside the input; binarise that into a validity mask.
    mask = nn.functional.grid_sample(torch.ones_like(x), vgrid, align_corners=True)
    mask[mask < 0.9999] = 0
    mask[mask > 0] = 1

    return output * mask

Vision functions

pad

torch.nn.functional.pad(input,
                        pad,
                        mode='constant',
                        value=0.0)

从最后一个维度开始pad:只pad最后一个维度时,pad的形式是(padding_left, padding_right);

pad最后两个维度时,pad的形式是(padding_left, padding_right, padding_top, padding_bottom);

pad最后三个维度时,pad的形式是(padding_left, padding_right, padding_top, padding_bottom, padding_front, padding_back)。

假设原始是(2,3,4)

padding_left,padding_right是pad最后一个维度,即4这个维度

padding_top,padding_bottom是pad倒数第二个维度,即3这个维度

padding_front,padding_back是pad倒数第三个维度,即2这个维度

torch.optim

optim教程

是一个实现各种优化算法的包。已经支持了最常用的方法，接口也足够通用，因此将来还可以轻松地集成更复杂的方法。
$$
\begin{split}\begin{aligned} w_1 &\leftarrow \left(1- \eta\lambda \right)w_1 - \frac{\eta}{|\mathcal{B}|} \sum_{i \in \mathcal{B}}x_1^{(i)} \left(x_1^{(i)} w_1 + x_2^{(i)} w_2 + b - y^{(i)}\right),\\ w_2 &\leftarrow \left(1- \eta\lambda \right)w_2 - \frac{\eta}{|\mathcal{B}|} \sum_{i \in \mathcal{B}}x_2^{(i)} \left(x_1^{(i)} w_1 + x_2^{(i)} w_2 + b - y^{(i)}\right). \end{aligned}\end{split}
$$
整个包就是解决这一步的,就是参数优化

To use torch.optim you have to construct an optimizer object, that will hold the current state and will update the parameters based on the computed gradients.

创建optim对象时，要给它一个包含模型参数的可迭代对象(所有元素都应该是Tensor/Parameter，旧版本中为Variable)，然后指定learning rate,weight decay等参数.

optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
optimizer = optim.Adam([var1, var2], lr=0.0001)

注意：由于要把模型参数传给 optim ，所以如果要使用GPU时，要在把模型参数传给 optim之前写 model.cuda()，因为调用 .cuda() 前后不是一个参数对象，在此optimize期间，要保证 optimized parameters 在同一位置，不要 .cpu()或 .cuda() 乱用，注意下顺序 ！！！有待验证！！！

也支持为每个参数单独设置选项。若想这么做，不要直接传入Variable的iterable，而是传入dict的iterable。这种方法在对每层分别指定learning rate时很有用:

optim.SGD([
                {'params': model.base.parameters()},
                {'params': model.classifier.parameters(), 'lr': 1e-3}
            ], lr=1e-2, momentum=0.9)

上面这样写，表示 $model.base$ 的 $lr$ 是 1e-2，$model.classifier$ 的 lr 是 $1e-3$，$momentum=0.9$ 同时用于这两个参数

class Optimizer(object):
    """Optimizer base class (constructor excerpt).

    The constructor stores ``defaults``, normalises ``params`` into a list of
    parameter-group dicts and registers each group via ``add_param_group``.
    """

    def __init__(self, params, defaults):
        """
        Args:
            params: an iterable of Tensors or of dicts; a single bare Tensor
                is rejected.
            defaults: stored unchanged on ``self.defaults``.

        Raises:
            TypeError: if ``params`` is a single ``torch.Tensor``.
            ValueError: if ``params`` yields no elements.
        """
        torch._C._log_api_usage_once("python.optimizer")
        self.defaults = defaults

        self._hook_for_profile()

        if isinstance(params, torch.Tensor):
            raise TypeError("params argument given to the optimizer should be "
                            "an iterable of Tensors or dicts, but got " +
                            torch.typename(params))

        self.state = defaultdict(dict)
        self.param_groups = []
        # The local ``param_groups`` holds the materialised ``params``.
        # Author's note (not verified here): for resnet18, ``params`` is a
        # generator and the first element of the resulting list has shape
        # torch.Size([64, 3, 7, 7]).
        param_groups = list(params)
        if len(param_groups) == 0:
            raise ValueError("optimizer got an empty parameter list")
        # A plain list of tensors is wrapped into a single group dict.
        if not isinstance(param_groups[0], dict):
            param_groups = [{'params': param_groups}]

        for param_group in param_groups:
            self.add_param_group(param_group)

torch.optim.Optimizer

所有优化器的基类。

zero_grad(set_to_none: bool = False)

zero_grad教程

设置被优化的张量的梯度为0,显然，我们进行下一次batch梯度计算的时候，前一个batch的梯度计算结果，没有保留的必要了。所以在下一次梯度更新的时候，先使用optimizer.zero_grad把梯度信息设置为0。

唯一一个参数意思是不设为0而设置为None

def zero_grad(self, set_to_none: bool = False):
        r"""Sets the gradients of all optimized :class:`torch.Tensor` s to zero.

        Args:
            set_to_none (bool): instead of setting to zero, set the grads to None.
                This will in general have lower memory footprint, and can modestly improve performance.
                However, it changes certain behaviors. For example:
                1. When the user tries to access a gradient and perform manual ops on it,
                a None attribute or a Tensor full of 0s will behave differently.
                2. If the user requests ``zero_grad(set_to_none=True)`` followed by a backward pass, ``.grad``\ s
                are guaranteed to be None for params that did not receive a gradient.
                3. ``torch.optim`` optimizers have a different behavior if the gradient is 0 or None
                (in one case it does the step with a gradient of 0 and in the other it skips
                the step altogether).
        """
        # Create the profiler label lazily if this instance does not have it yet.
        if not hasattr(self, "_zero_grad_profile_name"):
            self._hook_for_profile()
        with torch.autograd.profiler.record_function(self._zero_grad_profile_name):
            for group in self.param_groups:
                for p in group['params']:
                    # p.grad exposes the gradient directly; only a non-None
                    # gradient is reset, a None gradient is left as None.
                    # (analogy: a = [None, None]; a[0] = 1  =>  a == [1, None])
                    if p.grad is not None:
                        if set_to_none:
                            p.grad = None
                        else:
                            # A gradient that has a grad_fn is detached in place;
                            # otherwise its requires_grad flag is cleared. Either
                            # way it is then zeroed in place.
                            if p.grad.grad_fn is not None:
                                p.grad.detach_()
                            else:
                                p.grad.requires_grad_(False)
                            p.grad.zero_()

optimizer执行的两种方式:optimizer.step()和optimizer.step(closure)，

所有的optim 都实现了前一种方法，第一种方法会更新所有参数，这是大多数 optim 都支持的方法，只要损失反向传播后就可以调用此函数:

for input, target in dataset:
    optimizer.zero_grad()
    output = model(input)
    loss = loss_fn(output, target)
    # 求出每一阶段的损失
    loss.backward()
    # 更新参数
    optimizer.step()

关于第二种：optimizer.step(closure) 一些优化算法，例如 Conjugate Gradient 和 LBFGS 需要重复多次计算，因此你需要传入一个 closure 去允许它们重新计算你的模型。这个closure 应当清空梯度，计算损失，然后返回损失

for input, target in dataset:
    def closure():
        optimizer.zero_grad()
        output = model(input)
        loss = loss_fn(output, target)
        loss.backward()
        return loss
    optimizer.step(closure)

torch.optim.RMSprop

torch.optim.RMSprop(params,#用于优化的参数iterable或定义参数组的dicts
                    lr=0.01,
                    alpha=0.99,
                    eps=1e-08,
                    weight_decay=0,#权重衰减
                    momentum=0,#动量
                    centered=False)

torch.optim.SGD

实现随机梯度下降(可选动量)。

torch.optim.SGD(params,# 待优化参数的可迭代对象或者是定义了参数组的dict
                lr=<required parameter>,#学习率
                momentum=0,#动量
                dampening=0,#
                weight_decay=0,#权重衰减
                nesterov=False)#
>>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
>>> optimizer.zero_grad()
>>> loss_fn(model(input), target).backward()
>>> optimizer.step()
# 随机梯度下降,就比较依赖于数据的随机程度,如果不对数据进行打乱处理,可能异常值集中在数据某一块,会对算法收敛拟合造成干扰。

torch.optim.Adam

torch.optim.Adam(params,
                 lr=0.001,
                 betas=(0.9, 0.999),#
                 eps=1e-08,
                 weight_decay=0,
                 amsgrad=False)

How to adjust learning rate?

torch.optim.lr_scheduler provides several methods to adjust the learning rate based on the number of epochs. torch.optim.lr_scheduler.ReduceLROnPlateau allows dynamic learning rate reducing based on some validation measurements.

# Learning rate scheduling should be applied after optimizer’s update:
model = [Parameter(torch.randn(2, 2, requires_grad=True))]
optimizer = SGD(model, 0.1)
scheduler = ExponentialLR(optimizer, gamma=0.9)

for epoch in range(20):
    for input, target in dataset:
        optimizer.zero_grad()
        output = model(input)
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()
    scheduler.step()
# Most learning rate schedulers can be called back-to-back (also referred to as chaining schedulers). 
# The result is that each scheduler is applied one after the other on the learning rate obtained by the one preceding it.
model = [Parameter(torch.randn(2, 2, requires_grad=True))]
optimizer = SGD(model, 0.1)
scheduler1 = ExponentialLR(optimizer, gamma=0.9)
scheduler2 = MultiStepLR(optimizer, milestones=[30,80], gamma=0.1)

for epoch in range(20):
    for input, target in dataset:
        optimizer.zero_grad()
        output = model(input)
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()
    scheduler1.step()
    scheduler2.step()
# In many places in the documentation, we will use the following template to refer to schedulers algorithms.
scheduler = ...
for epoch in range(100):
    train(...)
    validate(...)
    scheduler.step()

lr_scheduler.LambdaLR

将每个参数组的学习率设置为初始 lr 乘以给定函数。

# Assuming optimizer has two groups.
lambda1 = lambda epoch: epoch // 30
lambda2 = lambda epoch: 0.95 ** epoch
scheduler = LambdaLR(optimizer, lr_lambda=[lambda1, lambda2])
for epoch in range(100):
    train(...)
    validate(...)
    scheduler.step()

torch.utils.tensorboard

一旦你安装了TensorBoard，这些工具可以让你将PyTorch模型和指标记录到一个目录中，以便在TensorBoard UI中可视化。标量、图像、直方图、图形和嵌入可视化都支持PyTorch模型和张量以及Caffe2 网络和blobs。

安装pip install tensorboardX 还需要安装tensorflow pip install tensorflow

除此之外还需要设置隧道映射,多换几个端口试试

tensorboardX介绍

SummaryWriter

CLASS torch.utils.tensorboard.writer.SummaryWriter

直接将条目写入log_dir中的事件文件中，以供TensorBoard使用。

SummaryWriter类提供了一个高级API，用于在给定目录中创建事件文件并向其添加摘要和事件。该类异步更新文件内容。这允许训练程序调用方法直接从训练循环向文件添加数据，而不会减慢训练速度。

__init__创建一个SummaryWriter，将事件和摘要写入事件文件中。

from torch.utils.tensorboard import SummaryWriter
__init__(log_dir=None,#保存目录位置，默认值是'runs/CURRENT_DATETIME_HOSTNAME',所以每次运行后都会更改，日期时间肯定会变
         comment='',#注解添加到默认log_dir的后缀，若log_dir被指定，则此参数不起作用。
         purge_step=None,#
         max_queue=10,#在一个'add'调用强制刷新磁盘之前，挂起事件和汇总的队列的大小。默认为10项。
         flush_secs=120,#将挂起事件和摘要刷新到磁盘的频率(以秒为单位)。默认每120s一次
         filename_suffix='')#添加到log_dir目录中所有事件文件名的后缀。

SummaryWriter类是您记录TensorBoard使用和可视化数据的主要入口。例如:

import torch
import torchvision
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms

# Writer will output to ./runs/ directory by default
writer = SummaryWriter()

transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
trainset = datasets.MNIST('mnist_train', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)
model = torchvision.models.resnet50(False)
# Have ResNet model take in grayscale rather than RGB
model.conv1 = torch.nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
images, labels = next(iter(trainloader))

grid = torchvision.utils.make_grid(images)
writer.add_image('images', grid, 0)
writer.add_graph(model, images)
writer.close()
#这可以通过TensorBoard进行可视化，TensorBoard安装和运行:
pip install tensorboard
tensorboard --logdir=#包含记录文件的文件夹路径
#记录文件名类似如下:events.out.tfevents.1616293286.vision806-desktop
# 默认的端口号是6006，要是一台服务器多人使用，很麻烦，可以指定端口，输入
tensorboard --logdir=logs --port=6007

一个实验可以记录很多信息。为了避免UI的混乱和更好的结果聚类，我们可以通过分级命名来对图进行分组。例如， Loss/train 和Loss/test 被分组在一起，Accuracy/train和Accuracy/test被分组在一起。

add_scalar

add_scalar(tag:string,# 数据标识,train/loss,分成了两级
           scalar_value,# 数字常量值,一定要是float类型
           # 如果是 Pytorch scalar tensor,则需要调用.item()方法获取其数值。
           global_step=None, # 训练的step,作为横坐标
           walltime=None)# 记录发生的时间，默认为time.time()

添加标量,一般使用其记录训练过程的 loss、accuracy、learning rate 等数值的变化，直观地监控训练过程。

如果tag相同标量会被放在一个图里，如writer.add_scalar('y=2x', i * 2, i)和writer.add_scalar('y=2x', i * i, i)

add_image

add_image(self, tag,#就是保存图片的名称
          img_tensor,#图形:torch.Tensor numpy.array or string
          global_step=None,# 训练的step 作为横坐标
          walltime=None,# 记录发生的时间，默认为time.time()
          dataformats='CHW')#默认为CHW tensor是CHW numpy是HWC

也可以画特征图的变化

visdom

必须提前开启:

python -m visdom.server -port 10086

torch.utils.data

PyTorch数据加载工具的核心是torch.utils.data.DataLoader类。它表示一个数据集上的Python可迭代对象。

pytorch输入数据PipeLine一般遵循一个“三步走”的策略，一般pytorch 的数据加载到模型的操作顺序是这样的：
① 创建一个 Dataset 对象。必须实现__len__()、__getitem__()这两个方法，这里面会用到transform对数据集进行扩充。
② 创建一个 DataLoader 对象。它是对DataSet对象进行迭代的，一般不需要实现里面的其他方法了。
③ 循环遍历这个 DataLoader 对象。将img, label加载到模型中进行训练

dataset = MyDataset()           # 第一步：构造Dataset对象
dataloader = DataLoader(dataset)# 第二步：通过DataLoader来构造迭代对象

num_epoches = 100
for epoch in range(num_epoches):# 第三步：逐步迭代数据
    for img, label in dataloader:
        # 训练代码

Map-style datasets

map风格的数据集是一个实现__getitem__()和__len__()协议的数据集，并表示从(可能非整数)索引/键到数据样本的映射。

例如，当使用dataset[idx]访问这样的数据集时，可以从磁盘上的文件夹中读取第idx张图像及其对应的标签。

查看Dataset了解更多细节。

Iterable-style datasets

iterable风格的数据集是IterableDataset的一个子类的实例，它实现了__iter__()协议，并表示数据样本上的一个iterable。这种类型的数据集特别适合于这样的情况:随机读取代价很高，甚至不太可能，而且批大小取决于所获取的数据。

例如，当调用iter(dataset)时，这样的数据集可以返回从数据库、远程服务器甚至实时生成的日志读取的数据流。

查看IterableDataset了解更多细节。

Data Loading Order and Sampler

Memory Pinning

pin_memory就是锁页内存，创建DataLoader时，设置pin_memory=True，则意味着生成的Tensor数据最开始是属于内存中的锁页内存，这样将内存的Tensor转移到GPU的显存就会更快一些。

主机中的内存，有两种存在方式，一是锁页，二是不锁页，锁页内存存放的内容在任何情况下都不会与主机的虚拟内存进行交换（注：虚拟内存就是硬盘），而不锁页内存在主机内存不足时，数据会存放在虚拟内存中。

而显卡中的显存全部是锁页内存！

当计算机的内存充足的时候，可以设置pin_memory=True。当系统卡住，或者交换内存使用过多的时候，设置pin_memory=False。因为pin_memory与电脑硬件性能有关，pytorch开发者不能确保每一个炼丹玩家都有高端设备，因此pin_memory默认为False。

torch.utils.data.DataLoader

数据加载程序。组合数据集dataset和采样器sampler,并提供给定数据集上的迭代。

在训练模型时使用到此函数，用来把训练数据分成多个小组，此函数每次抛出一组数据。直至把所有的数据都抛出。就是做一个数据的初始化。

DataLoader支持map-style和iterable-style的数据集，具有单进程或多进程加载、自定义加载顺序和可选的自动批处理(整理)和内存固定.

DataLoader本质是一个可迭代对象，所以:

可以使用for inputs, labels in dataloaders进行可迭代对象的访问
先使用iter对dataloader进行第一步包装，使用iter(dataloader)返回的是一个迭代器，然后就可以使用next访问了。Dataloader的__iter__()根据num_workers的数量返回单进程或多进程的迭代器
我们一般不需要再自己去实现DataLoader的方法了，只需要在构造函数中指定相应的参数即可，比如常见的batch_size，shuffle等等参数。所以使用DataLoader十分简洁方便。
DataLoader实际上一个较为高层的封装类，它的功能都是通过更底层的_DataLoader来完成的，但是_DataLoader类较为低层，这里就不再展开叙述了。DataLoaderIter就是_DataLoaderIter的一个框架, 用来传给_DataLoaderIter 一堆参数, 并把自己装进DataLoaderIter 里。

torch.utils.data.DataLoader(dataset: torch.utils.data.dataset.Dataset[T_co],
                            batch_size: Optional[int] = 1,
                            shuffle: bool = False,
                    #表示每一个epoch之后是否对样本进行随机打乱,所有的先打乱,再取batch.具体解析见下文
                            sampler: Optional[torch.utils.data.sampler.Sampler[int]] = None,
                            #自定义从数据集中抽取样本的策略，如果指定这个参数，那么shuffle必须为False。
                            batch_sampler: Optional[torch.utils.data.sampler.Sampler[Sequence[int]]] = None,
                            num_workers: int = 0,#要使用多少子进程装载数据。0表示数据将在主进程中加载。
                            collate_fn: Callable[List[T], Any] = None,
                   # 合并一个list的samples以形成mini-batch的Tensors
                   # collate_fn这个函数的输入就是一个list，list的长度是一个batch size，list中的每个元素都是__getitem__得到的结果。
                            pin_memory: bool = False,#是否锁页内存
                            drop_last: bool = False,
#如果数据集大小不能被批大小整除，则设置为True可删除最后一个不完整的批处理。
#如果为False，并且dataset的大小不能被batch-size整除，那么最后一批将变小。(默认值:False)
                            timeout: float = 0,
           #timeout (numeric, optional): 如果是正数，表明等待从worker进程中收集一个batch等待的时间，若超出设定的时间还没有收集到，那就不收集这个内容了。这个numeric应总是大于等于0。默认为0
                            worker_init_fn: Callable[int, None] = None,
                            multiprocessing_context=None,
                            generator=None, *,
                            prefetch_factor: int = 2,
                            persistent_workers: bool = False)

"""
    批训练，把数据变成一小批一小批数据进行训练。
    DataLoader就是用来包装所使用的数据，每次抛出一批数据
"""
import torch
import torch.utils.data as Data

BATCH_SIZE = 5

x = torch.linspace(1, 10, 10)
y = torch.linspace(10, 1, 10)
# 把数据放在数据库中
torch_dataset = Data.TensorDataset(x, y)
loader = Data.DataLoader(
    # 从数据库中每次抽出batch size个样本
    dataset=torch_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
)


def show_batch():
    """Iterate over the module-level ``loader`` for three epochs and print each mini-batch."""
    for _ in range(3):
        for step, (batch_x, batch_y) in enumerate(loader):
            # Training code would go here.
            print("step:{}, batch_x:{}, batch_y:{}".format(step, batch_x, batch_y))


if __name__ == '__main__':
    show_batch()
#loader在这里就是迭代器,在for里面一旦取用自动更新
#for data in testloader: 这里的data就是一个batch的数据,是一个tuple,既有train_X,也有train_Y.

pytorch数据预处理三剑客之Dataset，DataLoader，Transform

上面这篇讲的非常好

num_worker的作用

为什么机器学习需要打乱数据?

防止数据按一定规律排列,这样神经网络学习时会把这种规律当做一种特征学习,从而过拟合。这样做使得数据更接近于真实分布。

比如随机梯度下降,就比较依赖于数据的随机程度,如果不对数据进行打乱处理,可能异常值集中在数据某一块,会对算法收敛拟合造成干扰。

torch.utils.data.Dataset

表示一个Dataset的抽象类。

所有表示从键到数据样本映射的数据集都应该子类化它。所有子类都应该覆盖__getitem__()，从而支持获取一个给定键的数据样本。子类也可以选择性地覆盖__len__()，许多Sampler实现和DataLoader默认选项都希望它返回数据集的大小。

#dataset的抽象父类定义如下
def __getitem__(self, index) -> T_co:
        # Abstract hook: the base class has no implementation and always
        # raises, so a concrete dataset has to override this method.
        raise NotImplementedError
def __add__(self, other: 'Dataset[T_co]') -> 'ConcatDataset[T_co]':
        # ``a + b`` wraps both operands in a single ConcatDataset.
        return ConcatDataset([self, other])

__init__(self):主要是数据的获取，比如从某个文件中获取

__len__(self):整个数据集的长度

__getitem__(self,index):这个是最重要的，一般情况下它会包含以下几个业务需要处理

比如如果我们需要在读取数据的同时对图像进行增强的话，当然，图像增强的方法可以使用Pytorch内置的图像增强方式，也可以使用自定义或者其他的图像增强库,这个很灵活。
在Pytorch中得到的图像必须是tensor，也就是说我们必须要将数据格式转化成pytorch的tensor格式才行。

# coding: utf-8

import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import cv2
import numpy as np

from torchvision.transforms import ToTensor
from torchvision import datasets, transforms

import random

class LaneDataSet(Dataset):
    """Map-style dataset yielding (image, binary label, instance label) triples.

    The index file lists one sample per line as three whitespace-separated
    paths: the image, its binary label image and its instance label image.
    """

    def __init__(self, dataset, transform):
        """
        Args:
            dataset: path of the index text file (e.g. the tusimple
                train.txt / val.txt / test.txt).
            transform: callable applied to the image and to both label images
                in ``__getitem__``; pass a falsy value to skip it.
        """
        self._gt_img_list = []
        self._gt_label_binary_list = []
        self._gt_label_instance_list = []
        self.transform = transform

        with open(dataset, 'r') as file:
            for _info in file:
                info_tmp = _info.split()
                if not info_tmp:
                    # Blank lines (e.g. a trailing newline) carry no sample.
                    continue

                # The three lists are always appended together, so they stay
                # the same length and index-aligned.
                self._gt_img_list.append(info_tmp[0])
                self._gt_label_binary_list.append(info_tmp[1])
                self._gt_label_instance_list.append(info_tmp[2])

        self._shuffle()

    def _shuffle(self):
        """Shuffle the three path lists together so that triples stay aligned."""
        samples = list(zip(self._gt_img_list, self._gt_label_binary_list, self._gt_label_instance_list))
        if not samples:
            # zip(*[]) yields nothing to unpack; an empty index stays empty.
            return
        random.shuffle(samples)
        images, binaries, instances = zip(*samples)
        self._gt_img_list = list(images)
        self._gt_label_binary_list = list(binaries)
        self._gt_label_instance_list = list(instances)

    def __len__(self):
        return len(self._gt_img_list)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(img, label_binary_img, label_instance_img)``.

        Raises:
            FileNotFoundError: if OpenCV cannot read one of the three files.
        """
        # Colour image; (720, 1280, 3) for tusimple according to the original notes.
        img = cv2.imread(self._gt_img_list[idx], cv2.IMREAD_COLOR)
        # Instance label read unchanged; (720, 1280) per the original notes.
        label_instance_img = cv2.imread(self._gt_label_instance_list[idx], cv2.IMREAD_UNCHANGED)
        # Binary label read as grayscale; (720, 1280) per the original notes.
        label_binary_img = cv2.imread(self._gt_label_binary_list[idx], cv2.IMREAD_GRAYSCALE)

        # cv2.imread returns None instead of raising when a file is missing or
        # unreadable; fail here with the offending path.
        for path, loaded in ((self._gt_img_list[idx], img),
                             (self._gt_label_instance_list[idx], label_instance_img),
                             (self._gt_label_binary_list[idx], label_binary_img)):
            if loaded is None:
                raise FileNotFoundError("cannot read image: {}".format(path))

        # Optional transformations (e.g. resizing to (256, 512)).
        if self.transform:
            img = self.transform(img)
            label_binary_img = self.transform(label_binary_img)
            label_instance_img = self.transform(label_instance_img)

        # HWC -> CHW has to permute the axes; reshape() only reinterprets the
        # buffer and would scramble the pixels.
        # NOTE(review): assumes ``img`` is still channel-last after the
        # transform -- confirm against the transform actually used.
        if isinstance(img, torch.Tensor):
            img = img.permute(2, 0, 1)
        else:
            img = np.transpose(img, (2, 0, 1))
        return (img, label_binary_img, label_instance_img)

torch.utils.data.TensorDataset(*tensors)

TensorDataset 可以用来对 tensor 进行打包，就好像 python 中的 zip 功能。该类通过每一个 tensor 的第一个维度进行索引。因此，该类中的 tensor 第一维度必须相等。

举个例子,六张图片,六个label,维度分别是(6,H,W,C)和(6,)

torch.utils.data.ConcatDataset(datasets)

连接多个数据集产生一个新的数据集,该类用于组装不同的现有数据集。

datasets:(sequence) – List of datasets to be concatenated

torch.utils.data.Subset

torch.utils.data.random_split

torch.utils.data.random_split(dataset: torch.utils.data.dataset.Dataset[T],#要被划分的数据集
                              lengths: Sequence[int],#要生成的切片长度
                              generator: Optional[torch._C.Generator] = <torch._C.Generator object>) → List[torch.utils.data.dataset.Subset[T]]#用于随机排列的生成器。
#eg:random_split(range(10), [3, 7], generator=torch.Generator().manual_seed(42))
#eg:train, val = random_split(dataset, [n_train, n_val])

torch.utils.data.Sampler

所有采样器的基类。

每个采样器子类都必须提供一个__iter__()方法，提供一种遍历数据集元素索引的方法，以及一个__len__()方法，该方法返回返回的迭代器的长度。

#pytorch采样器
class Sampler(object):  
class SequentialSampler(Sampler):
class RandomSampler(Sampler):
class SubsetRandomSampler(Sampler):
class WeightedRandomSampler(Sampler):
class BatchSampler(Sampler):
#Dataloader中的采样器
if sampler is None:  # give default samplers
    if self._dataset_kind == _DatasetKind.Iterable:
        # See NOTE [ Custom Samplers and IterableDataset ]
        sampler = _InfiniteConstantSampler()
    else:  # map-style
        if shuffle:
            sampler = RandomSampler(dataset)
        else:
            sampler = SequentialSampler(dataset)

torchvision

torchvision简介

该库是Pytorch项目的一部分。安装pytorch时，torchvision独立于torch。torchvision包由流行的数据集（torchvision.datasets）、模型架构(torchvision.models)和用于计算机视觉的常见图像转换(torchvision.transforms)组成。

torchvision.datasets

import torchvision
mnist = torchvision.datasets.MNIST("path/to/mnist/", train=True, transform=transforms, target_transform=None, download=False)
torchvision.datasets.MNIST(root: str,#存放training.pt和test.pt的root directory
                           train: bool = True,#从training.pt创建数据集,否则从test.pt
                           transform: Union[Callable, NoneType] = None,#转换操作
                           target_transform: Union[Callable, NoneType] = None,
                           download: bool = False) → None#下载到本地并存放到root directory

torchvision.models

import torchvision
vgg16 = torchvision.models.vgg16(pretrained=True)
#pretrained=True加载别人预训练好的模型,否则就是权重随机初始化的模型

torchvision.transforms

transforms的二十二个方法

第三篇-pytorch数据预处理三剑客

变换是常见的图像变换。它们可以使用Compose链接在一起。此外，还有torchvision.transforms.functional模块。Functional transforms可以对转换进行细粒度控制。如果您必须构建更复杂的转换管道（例如，在分割任务的情况下），这将非常有用。

所有的转换都接受PIL Image，Tensor Image或batch of tensor Image作为输入。Tensor Image是一个具有(C, H, W)形状的张量，其中C是通道，H和W是图像的高度和宽度。Batch of Tensor Images是(B, C, H, W) 形状的张量，其中B是一个Batch中图像的个数。对Batch of Tensor Image应用确定或随机变换，就能对这批图像的所有图像进行相同的变换。

注意事项：

（1）transforms中的数据增强操作针对的是pillow的Image图像格式，而我们很多时候在使用opencv读进去的又是ndarray格式，所以需要第一步先将ndarray转化成Image格式，即transforms.ToPILImage().

（2）但是我们后需要的数据又是需要ndarray格式或者是tensor格式，故而有需要将Image转换回来，即transforms.ToTensor()。

自从v0.8.0以来，所有的随机转换都使用torch默认的随机生成器来采样随机参数。这是一个破坏后向兼容的更改（后向兼容是指向低版本兼容），用户应该设置随机状态如下:

# Previous versions
# import random
# random.seed(12)

# Now
import torch
torch.manual_seed(17)
#请记住，pytorch随机数生成器和Python随机数生成器的相同种子不会产生相同的结果。

torchvision.transforms.Compose(transforms)
将几个变换组合在一起

class Compose(object):
    """Chain several transforms into one callable.

    ``transforms`` is any iterable container of callables (a list is enough).
    Calling the instance feeds a single image through them in order and
    returns the single resulting image.
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img):
        # The output of each transform is the input of the next one.
        result = img
        for transform in self.transforms:
            result = transform(result)
        return result

    def __repr__(self):
        # One indented line per transform between "ClassName(" and ")".
        body = ''.join('\n    {0}'.format(t) for t in self.transforms)
        return self.__class__.__name__ + '(' + body + '\n)'

>>> transforms.Compose([
>>>     transforms.CenterCrop(10),
>>>     transforms.ToTensor(),
>>> ])

裁剪(Crop)

翻转和旋转(Flip and Rotation)

图像变换(resize)

torchvision.transforms.ToTensor()
# 1.转换通道顺序:HWC->CHW
# 2.将PIL Image或者 ndarray 转换为tensor float
# 3.归一化至[0-1] 注意事项：归一化至[0-1]是直接除以255，若自己的ndarray数据尺度有变化，则需要自行修改。
torchvision.transforms.Normalize(mean, std)
#用平均值和标准偏差归一化张量图像。给定mean：(M1,…,Mn)和std：(S1,…,Sn)对于n通道，此变换将标准化输入的每个通道，torch.*Tensor即 input[channel] = (input[channel] - mean[channel]) / std[channel]
#mean(sequence)-每个通道的均值序列。
#std(sequence)-每个通道的标准偏差序列。
torchvision.transforms.ToPILImage(mode=None)
# Converts a torch.*Tensor of shape C x H x W or a numpy ndarray of shape H x W x C to a PIL Image while preserving the value range.

对transforms操作，使数据增强更灵活

Normalize参数解惑https://blog.csdn.net/xys430381_1/article/details/85724668

functional

torchvision.transforms.functional.adjust_gamma(img: torch.Tensor, gamma: float, gain: float = 1) → torch.Tensor

对图片进行gamma校正
$$
I_{out}=255\cdot gain \cdot (\frac{I_{in}}{255})^{\gamma}
$$
img:PIL Image或Tensor

torchvision.utils

torchvision.utils.make_grid()

制作图像网格

torchvision.utils.make_grid(tensor: Union[torch.Tensor, List[torch.Tensor]],
              #4D mini-batch Tensor of shape (B x C x H x W) or a list of images all of the same size.
                            nrow: int = 8,
              #网格中每一行中显示的图像数,最终尺寸为(B/nrow, nrow)
                            padding: int = 2,
              #子图像与子图像之间的pad有多宽。
                            normalize: bool = False,
              #If True, 归一化图像到(0, 1)区间
                            value_range: Union[Tuple[int, int], NoneType] = None,
              # 用来normalize
                            scale_each: bool = False,
                            pad_value: int = 0,
                            **kwargs) → torch.Tensor

torchvision.utils.save_image()

Docs

Features for large-scale deployments

API usage logging

When running in a broader ecosystem, for example in managed job scheduler, it’s often useful to track which binaries invoke particular PyTorch APIs. There exists simple instrumentation injected at several important API points that triggers a given callback. Because usually PyTorch is invoked in one-off 一次性的 python scripts, the callback fires only once for a given process for each of the APIs.

Note for developers: new API trigger points can be added in code with C10_LOG_API_USAGE_ONCE("my_api") in C++ or torch._C._log_api_usage_once("my.api") in Python.

eg:torch._C._log_api_usage_once("python.optimizer")

回调函数:被中间函数回调的函数

作者：no.body
链接：https://www.zhihu.com/question/19801131/answer/27459821
来源：知乎
著作权归作者所有。商业转载请联系作者获得授权，非商业转载请注明出处。

什么是回调函数？

我们绕点远路来回答这个问题。

编程分为两类：系统编程（system programming）和应用编程（application programming）。所谓系统编程，简单来说，就是编写库；而应用编程就是利用写好的各种库来编写具某种功用的程序，也就是应用。系统程序员会给自己写的库留下一些接口，即API（application programming interface，应用编程接口），以供应用程序员使用。所以在抽象层的图示里，库位于应用的底下。

当程序跑起来时，一般情况下，应用程序（application program）会时常通过API调用库里所预先备好的函数。但是有些库函数（library function）却要求应用先传给它一个函数，好在合适的时候调用，以完成目标任务。这个被传入的、后又被调用的函数就称为回调函数（callback function）。

打个比方，有一家旅馆提供叫醒服务，但是要求旅客自己决定叫醒的方法。可以是打客房电话，也可以是派服务员去敲门，睡得死怕耽误事的，还可以要求往自己头上浇盆水。这里，“叫醒”这个行为是旅馆提供的，相当于库函数，但是叫醒的方式是由旅客决定并告诉旅馆的，也就是回调函数。而旅客告诉旅馆怎么叫醒自己的动作，也就是把回调函数传入库函数的动作，称为登记回调函数（to register a callback function）。如下图所示（图片来源：维基百科）：

可以看到，回调函数通常和应用处于同一抽象层（因为传入什么样的回调函数是在应用级别决定的）。而回调就成了一个高层调用底层，底层再回过头来调用高层的过程。（我认为）这应该是回调最早的应用之处，也是其得名如此的原因。

回调机制的优势

从上面的例子可以看出，回调机制提供了非常大的灵活性。请注意，从现在开始，我们把图中的库函数改称为中间函数了，这是因为回调并不仅仅用在应用和库之间。任何时候，只要想获得类似于上面情况的灵活性，都可以利用回调。

这种灵活性是怎么实现的呢？乍看起来，回调似乎只是函数间的调用，但仔细一琢磨，可以发现两者之间的一个关键的不同：在回调中，我们利用某种方式，把回调函数像参数一样传入中间函数。可以这么理解，在传入一个回调函数之前，中间函数是不完整的。换句话说，程序可以在运行时，通过登记不同的回调函数，来决定、改变中间函数的行为。这就比简单的函数调用要灵活太多了。请看下面这段Python写成的回调的简单示例：

even.py
#回调函数1
#生成一个2k形式的偶数
def double(x):
    """Callback 1: turn ``x`` into the even number ``2 * x``.

    Args:
        x: The value to scale (``k`` in the ``2k`` form).

    Returns:
        ``x`` multiplied by 2.
    """
    doubled = x * 2
    return doubled

#回调函数2
#生成一个4k形式的偶数
def quadruple(x):
    """Callback 2: turn ``x`` into the even number ``4 * x``.

    Args:
        x: The value to scale (``k`` in the ``4k`` form).

    Returns:
        ``x`` multiplied by 4.
    """
    quadrupled = x * 4
    return quadrupled
callback_demo.py
from even import *

#中间函数
#接受一个生成偶数的函数作为参数
#返回一个奇数
def getOddNumber(k, getEvenNumber):
    """Intermediate function: build an odd number from a caller-supplied callback.

    Args:
        k: The value handed to the callback.
        getEvenNumber: Callback that receives ``k`` and returns an even number.

    Returns:
        The callback's result plus 1.
    """
    even_number = getEvenNumber(k)
    return 1 + even_number

#起始函数，这里是程序的主函数
def main():
    """Call getOddNumber with three different callbacks and print each result."""
    k = 1
    callbacks = (
        double,           # produces an odd number of the form 2k + 1
        quadruple,        # produces an odd number of the form 4k + 1
        lambda x: x * 8,  # anonymous callback: odd number of the form 8k + 1
    )
    for make_even in callbacks:
        print(getOddNumber(k, make_even))

# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

# 最终输出: 3 5 9
上面的代码里，给getOddNumber传入不同的回调函数，它的表现也不同，这就是回调机制的优势所在。值得一提的是，上面的第三个回调函数是一个匿名函数。

易被忽略的第三方

通过上面的论述可知，中间函数和回调函数是回调的两个必要部分，不过人们往往忽略了回调里的第三位要角，就是中间函数的调用者。绝大多数情况下，这个调用者可以和程序的主函数等同起来，但为了表示区别，我这里把它称为起始函数（如上面的代码中注释所示）。

之所以特意强调这个第三方，是因为我在网上读相关文章时得到一种印象，很多人把它简单地理解为两个个体之间的来回调用。譬如，很多中文网页在解释“回调”（callback）时，都会提到这么一句话：“If you call me, I will call you back.”我没有查到这句英文的出处。我个人揣测，很多人把起始函数和回调函数看作为一体，大概有两个原因：第一，可能是“回调”这一名字的误导；第二，给中间函数传入什么样的回调函数，是在起始函数里决定的。实际上，回调并不是“你我”两方的互动，而是ABC的三方联动。有了这个清楚的概念，在自己的代码里实现回调时才不容易混淆出错。

另外，回调实际上有两种：阻塞式回调和延迟式回调。两者的区别在于：阻塞式回调里，回调函数的调用一定发生在起始函数返回之前；而延迟式回调里，回调函数的调用有可能是在起始函数返回之后。这里不打算对这两个概念做更深入的讨论，之所以把它们提出来，也是为了说明强调起始函数的重要性。网上的很多文章，提到这两个概念时，只是笼统地说阻塞式回调发生在主调函数返回之前，却没有明确这个主调函数到底是起始函数还是中间函数，不免让人糊涂，所以这里特意说明一下。另外还请注意，本文中所举的示例均为阻塞式回调。延迟式回调通常牵扯到多线程，我自己还没有完全搞明白，所以这里就不多说了。

REPRODUCIBILITY

终归无法一模一样!

torch.manual_seed() :https://blog.csdn.net/X_singing/article/details/104447052?utm_medium=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-1.channel_param&depth_1-utm_source=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-1.channel_param

为了保证结果能够复现，需要固定随机数种子，这样初始的随机权重等才是固定的

随机数种子

# Global RNG / cuDNN setup.
# With a seed: seed every RNG in use and switch cuDNN to deterministic mode,
# because benchmark autotuning may choose different convolution algorithms
# from run to run and would undo the effect of seeding.
# Without a seed: keep the original speed-oriented settings.
cudnn.enabled = True
if args.seed is not None:
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # Seed all visible GPUs, not only the current device.
    torch.cuda.manual_seed_all(args.seed)
    cudnn.benchmark = False
    cudnn.deterministic = True
else:
    cudnn.benchmark = True
    cudnn.deterministic = False

Data Loader

def seed_worker(worker_id):
    """Re-seed NumPy and ``random`` from torch's initial seed.

    Args:
        worker_id: Worker index supplied by the caller; not used here.
    """
    # Reduce torch's seed modulo 2**32 before handing it to the other RNGs.
    derived_seed = torch.initial_seed() % (1 << 32)
    for reseed in (numpy.random.seed, random.seed):
        reseed(derived_seed)

# Register seed_worker as worker_init_fn so the loader's workers re-seed
# NumPy and `random` (see seed_worker's body) instead of sharing RNG state.
DataLoader(
    train_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    worker_init_fn=seed_worker
)

奇技淫巧

pytorch常见的坑汇总

GPU利用率不高+gpu显存占用浪费

1 主函数前面加(这个会牺牲一点点显存来提高运行速度):

# cuDNN flags favouring throughput over determinism. All three are set
# through torch.backends.cudnn so the snippet needs only `import torch`.
torch.backends.cudnn.enabled = True
# Let cuDNN search for the fastest convolution algorithm up front; this pays
# off when the network structure and input size stay fixed.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False

设置torch.backends.cudnn.benchmark

通过预先搜索最适合的卷积算法的实现，因为卷积算法有很多种不同的实现，最简单的实现方式就是使用多层循环嵌套，对于每张输入图像，对于每个要输出的通道，对于每个输入的通道，选取一个区域，同指定卷积核进行卷积操作，然后逐行滑动，直到整张图像都处理完毕，这个方法一般被称为 direct 法，这个方法虽然简单，但是看到这么多循环，我们就知道效率在一般情况下不会很高了。除此之外，实现卷积层的算法还有基于 GEMM (General Matrix Multiply) 的，基于 FFT 的，基于 Winograd 算法的等等，而且每个算法还有自己的一些变体。在一个开源的 C++ 库 triNNity 中，就实现了接近 80 种的卷积前向传播算法！

要求有：

网络结构固定
输入尺寸固定

2 训练时,epoch前面加:（定期清空显存缓存，效果感觉不明显）

torch.cuda.empty_cache()

3 无用变量前面加:(同上，效果某些操作上还挺明显的）

del xxx(变量名)

4 dataloader的长度__len__设置:(dataloader会间歇式出现卡顿,设置成这样会避免不少）

def __len__(self):
    """Return the size of the first axis of ``self.images``."""
    image_shape = self.images.shape
    return image_shape[0]

5 dataloader的预加载设置:(会在模型训练的时候加载数据，提高一点点gpu利用率）

# pin_memory=True asks the loader to place batches in pinned (page-locked)
# host memory before returning them.
train_loader = torch.utils.data.DataLoader(train_dataset, pin_memory=True)

6 loss的item()

累加或记录loss时如果没有item()，保存的仍是带着计算图的tensor，会导致显存占用变高

7 混合精度计算

加速Pytorch训练

加速pytorch训练

prefetch_generator

使用 prefetch_generator 库在后台加载下一个 batch 的数据。PyTorch 默认的 DataLoader 会创建一些 worker 进程来预读取新的数据，但只有当这些 worker 已读取的数据全部被取走之后，它们才会去读下一批数据。使用 prefetch_generator，我们可以保证 worker 不会等待，每个 worker 都总有至少一个数据在加载。

#使用
from torch.utils.data import DataLoader
from prefetch_generator import BackgroundGenerator

class DataLoaderX(DataLoader):
    """DataLoader whose iterator is wrapped in a ``BackgroundGenerator``."""

    def __iter__(self):
        """Return the parent iterator wrapped in a ``BackgroundGenerator``."""
        base_iterator = super().__iter__()
        return BackgroundGenerator(base_iterator)
#然后用DataLoaderX替换原本的DataLoader

常用函数

获取参数量

def print_network(net):
    """Print a network followed by its total number of parameter elements.

    Args:
        net: Object exposing ``parameters()``, each item of which provides
            ``numel()``.
    """
    total = sum(tensor.numel() for tensor in net.parameters())
    print(net)
    print('Total number of parameters: %d' % total)