当前位置：首页 > news > 正文

关于电商网站规划方案，做网站软件定制开发

news 2025/11/14 13:04:56

关于电商网站规划方案,做网站软件定制开发,网站域名跳转,怎么做的网站怎么放上网经典语义分割(一)利用pytorch复现全卷积神经网络FCN 这里选择B站up主[霹雳吧啦Wz]根据pytorch官方torchvision模块中实现的FCN源码。 Github连接#xff1a;FCN源码 1 FCN模型搭建 1.1 FCN网络图 pytorch官方实现的FCN网络图#xff0c;如下所示。 1.2 backbone FCN原…经典语义分割(一)利用pytorch复现全卷积神经网络FCN 这里选择B站up主[霹雳吧啦Wz]根据pytorch官方torchvision模块中实现的FCN源码。 Github连接FCN源码 1 FCN模型搭建 1.1 FCN网络图 pytorch官方实现的FCN网络图如下所示。 1.2 backbone FCN原文中的backbone是VGG这里pytorch官方采用了resnet作为FCN的backbone。 ResNet的前两层跟GoogLeNet中的⼀样在输出通道数为64、步幅为2的7 × 7卷积层后接步幅为2的3 × 3的最大汇聚层。不同之处在于ResNet每个卷积层后增加了批量规范化层。 GoogLeNet在后面接了4个由Inception块组成的模块。ResNet后接4个由残差块。 ResNet则使用4个由残差块组成的模块每个模块使用若干个同样输出通道数的残差块。第1个模块(layer1)由于之前已经使用了步幅为2的最大汇聚层所以无须减小高和宽。原生的ResNet在之后的每个模块(layer2、layer3、layer4)在第⼀个残差块里将上一个模块的通道数翻倍并将高和宽减半。不过在这里和原生的ResNet不同的是layer3和layer4使用了空洞卷积并且高宽不减半。 # /fcn/src/backbone.py import torch import torch.nn as nn from torchinfo import summarydef conv3x3(in_planes, out_planes, stride1, groups1, dilation1):3x3 convolution with paddingreturn nn.Conv2d(in_planes, out_planes, kernel_size3, stridestride,paddingdilation, groupsgroups, biasFalse, dilationdilation)def conv1x1(in_planes, out_planes, stride1):1x1 convolutionreturn nn.Conv2d(in_planes, out_planes, kernel_size1, stridestride, biasFalse)class Bottleneck(nn.Module):# Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)# while original implementation places the stride at the first 1x1 convolution(self.conv1)# according to Deep residual learning for image recognitionhttps://arxiv.org/abs/1512.03385.# This variant is also known as ResNet V1.5 and improves accuracy according to# https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.expansion 4def __init__(self, inplanes, planes, stride1, downsampleNone, groups1,base_width64, dilation1, norm_layerNone):super(Bottleneck, self).__init__()if norm_layer is None:norm_layer nn.BatchNorm2dwidth int(planes * (base_width / 64.)) * groups# Both self.conv2 and self.downsample layers downsample the input when stride ! 
1self.conv1 conv1x1(inplanes, width)self.bn1 norm_layer(width)self.conv2 conv3x3(width, width, stride, groups, dilation)self.bn2 norm_layer(width)self.conv3 conv1x1(width, planes * self.expansion)self.bn3 norm_layer(planes * self.expansion)self.relu nn.ReLU(inplaceTrue)self.downsample downsampleself.stride stridedef forward(self, x):identity xout self.conv1(x)out self.bn1(out)out self.relu(out)out self.conv2(out)out self.bn2(out)out self.relu(out)out self.conv3(out)out self.bn3(out)if self.downsample is not None:identity self.downsample(x)out identityout self.relu(out)return outclass ResNet(nn.Module):def __init__(self, block, layers, num_classes1000, zero_init_residualFalse,groups1, width_per_group64, replace_stride_with_dilationNone,norm_layerNone):super(ResNet, self).__init__()if norm_layer is None:norm_layer nn.BatchNorm2dself._norm_layer norm_layerself.inplanes 64self.dilation 1if replace_stride_with_dilation is None:# each element in the tuple indicates if we should replace# the 2x2 stride with a dilated convolution insteadreplace_stride_with_dilation [False, False, False]if len(replace_stride_with_dilation) ! 
3:raise ValueError(replace_stride_with_dilation should be None or a 3-element tuple, got {}.format(replace_stride_with_dilation))self.groups groupsself.base_width width_per_group1、ResNet的前两层ResNet的前两层跟GoogLeNet中的⼀样在输出通道数为64、步幅为2的7 × 7卷积层后接步幅为2的3 × 3的最⼤汇聚层。不同之处在于ResNet每个卷积层后增加了批量规范化层。self.conv1 nn.Conv2d(in_channels3, out_channelsself.inplanes, kernel_size7, stride2, padding3,biasFalse)self.bn1 norm_layer(self.inplanes)self.relu nn.ReLU(inplaceTrue)self.maxpool nn.MaxPool2d(kernel_size3, stride2, padding1)2、ResNet后接4个由残差块GoogLeNet在后⾯接了4个由Inception块组成的模块。ResNet则使⽤4个由残差块组成的模块每个模块使⽤若⼲个同样输出通道数的残差块。第⼀个模块(layer1)由于之前已经使⽤了步幅为2的最⼤汇聚层所以⽆须减⼩⾼和宽。之后的每个模块(layer2、layer3、layer4)在第⼀个残差块⾥将上⼀个模块的通道数翻倍并将⾼和宽减半。不过在这里和原生的ResNet不同的是layer3和layer4使用了空洞卷积并且高宽不减半。self.layer1 self._make_layer(block, 64, layers[0])self.layer2 self._make_layer(block, 128, layers[1], stride2, dilatereplace_stride_with_dilation[0])self.layer3 self._make_layer(block, 256, layers[2], stride2, dilatereplace_stride_with_dilation[1])self.layer4 self._make_layer(block, 512, layers[3], stride2, dilatereplace_stride_with_dilation[2])self.avgpool nn.AdaptiveAvgPool2d((1, 1))self.fc nn.Linear(512 * block.expansion, num_classes)for m in self.modules():if isinstance(m, nn.Conv2d):nn.init.kaiming_normal_(m.weight, modefan_out, nonlinearityrelu)elif isinstance(m, nn.BatchNorm2d):nn.init.constant_(m.weight, 1)nn.init.constant_(m.bias, 0)# Zero-initialize the last BN in each residual branch,# so that the residual branch starts with zeros, and each residual block behaves like an identity.# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677if zero_init_residual:for m in self.modules():if isinstance(m, Bottleneck):nn.init.constant_(m.bn3.weight, 0)def _make_layer(self, block, planes, blocks, stride1, dilateFalse):norm_layer self._norm_layerdownsample Noneprevious_dilation self.dilationif dilate:# layer3和layer4使用了空洞卷积高宽不减半因此设置stride 1self.dilation * stridestride 1# layer2、layer3和layer4的stride2满足# 
layer1的stride1但是inplanes(64) planes * block.expansion64×4因此也满足if stride ! 1 or self.inplanes ! planes * block.expansion:downsample nn.Sequential(conv1x1(self.inplanes, planes * block.expansion, stride),norm_layer(planes * block.expansion),)# 对于每个layer只有第1个Bottleneck需要downsamplelayers []layers.append(block(self.inplanes, planes, stride, downsample, self.groups,self.base_width, previous_dilation, norm_layer))self.inplanes planes * block.expansion# 对于每个layer从第2个Bottleneck开始就不需要downsamplefor _ in range(1, blocks):layers.append(block(self.inplanes, planes, groupsself.groups,base_widthself.base_width, dilationself.dilation,norm_layernorm_layer))return nn.Sequential(*layers)def _forward_impl(self, x):# See note [TorchScript super()]x self.conv1(x)x self.bn1(x)x self.relu(x)x self.maxpool(x)x self.layer1(x)x self.layer2(x)x self.layer3(x)x self.layer4(x)x self.avgpool(x)x torch.flatten(x, 1)x self.fc(x)return xdef forward(self, x):return self._forward_impl(x)def _resnet(block, layers, **kwargs):model ResNet(block, layers, **kwargs)return modeldef resnet50(**kwargs):rResNet-50 model fromDeep Residual Learning for Image Recognition https://arxiv.org/pdf/1512.03385.pdf_Args:pretrained (bool): If True, returns a model pre-trained on ImageNetprogress (bool): If True, displays a progress bar of the download to stderrreturn _resnet(Bottleneck, [3, 4, 6, 3], **kwargs)def resnet101(**kwargs):rResNet-101 model fromDeep Residual Learning for Image Recognition https://arxiv.org/pdf/1512.03385.pdf_Args:pretrained (bool): If True, returns a model pre-trained on ImageNetprogress (bool): If True, displays a progress bar of the download to stderrreturn _resnet(Bottleneck, [3, 4, 23, 3], **kwargs)if __name__ __main__:net resnet50(replace_stride_with_dilation[False, True, True])print(net)# pip install torchinfo# 可以看到网络每一层的输出shape以及网络参数信息summary(net, input_size(1, 3, 480, 480))1.3 FCN Head 经过backbone后再通过FCN 
Head模块。通过3×3卷积层缩小通道为原来的1/4【2048-512】再通过一个dropout和一个1×1卷积层这里1×1卷积层调整特征层的channel为分割类别中的类别个数。layer3中引出的一条FCN Head辅助分类器是为了防止误差梯度无法传递到网络浅层。训练的时候是可以使用辅助分类器件的。最后去预测或者部署到正式环境的时候只用主干的output不用aux output。最后经过双线性插值还原特征图大小到原图。 # /fcn/src/fcn_model.py from collections import OrderedDictfrom typing import Dictimport torch from torch import nn, Tensor from torch.nn import functional as F try:from .backbone import resnet50, resnet101 except:from backbone import resnet50, resnet101class IntermediateLayerGetter(nn.ModuleDict):_version 2__annotations__ {return_layers: Dict[str, str],}def __init__(self, model: nn.Module, return_layers: Dict[str, str]) - None:if not set(return_layers).issubset([name for name, _ in model.named_children()]):raise ValueError(return_layers are not present in model)orig_return_layers return_layersreturn_layers {str(k): str(v) for k, v in return_layers.items()}# 重新构建backbone将没有使用到的模块全部删掉layers OrderedDict()for name, module in model.named_children():layers[name] moduleif name in return_layers:del return_layers[name]if not return_layers:breaksuper(IntermediateLayerGetter, self).__init__(layers)self.return_layers orig_return_layersdef forward(self, x: Tensor) - Dict[str, Tensor]:out OrderedDict()for name, module in self.items():x module(x)# self.return_layers {layer4: out, layer3: aux}if name in self.return_layers:out_name self.return_layers[name]out[out_name] xreturn outclass FCN(nn.Module):__constants__ [aux_classifier]def __init__(self, backbone, classifier, aux_classifierNone):super(FCN, self).__init__()self.backbone backboneself.classifier classifierself.aux_classifier aux_classifierdef forward(self, x: Tensor) - Dict[str, Tensor]:input_shape x.shape[-2:]# contract: features is a dict of tensorsfeatures self.backbone(x)result OrderedDict()x features[out]x self.classifier(x)# 原论文中虽然使用的是ConvTranspose2d但权重是冻结的所以就是一个bilinear插值x F.interpolate(x, sizeinput_shape, modebilinear, align_cornersFalse)result[out] x# FCN Head辅助分类器是为了防止误差梯度无法传递到网络浅层if self.aux_classifier 
is not None:x features[aux]x self.aux_classifier(x)# 原论文中虽然使用的是ConvTranspose2d但权重是冻结的所以就是一个bilinear插值x F.interpolate(x, sizeinput_shape, modebilinear, align_cornersFalse)result[aux] xreturn resultclass FCNHead(nn.Sequential):def __init__(self, in_channels, channels):# 通过3×3卷积层缩小通道为原来的1/4【2048-512】再通过一个dropout和一个1×1卷积层inter_channels in_channels // 4layers [nn.Conv2d(in_channels, inter_channels, 3, padding1, biasFalse),nn.BatchNorm2d(inter_channels),nn.ReLU(),nn.Dropout(0.1),nn.Conv2d(inter_channels, channels, 1) # 这里1×1卷积层调整特征层的channel为分割类别中的类别个数]super(FCNHead, self).__init__(*layers)def fcn_resnet50(aux, num_classes21, pretrain_backboneFalse):# resnet50_imagenet: https://download.pytorch.org/models/resnet50-0676ba61.pth# fcn_resnet50_coco: https://download.pytorch.org/models/fcn_resnet50_coco-1167a1af.pthbackbone resnet50(replace_stride_with_dilation[False, True, True])if pretrain_backbone:# 载入resnet50 backbone预训练权重backbone.load_state_dict(torch.load(resnet50.pth, map_locationcpu))out_inplanes 2048aux_inplanes 1024return_layers {layer4: out}if aux:return_layers[layer3] aux# backbone经过前向传播的结果为OrderedDict()backbone IntermediateLayerGetter(backbone, return_layersreturn_layers)aux_classifier None# why using aux: https://github.com/pytorch/vision/issues/4292if aux:aux_classifier FCNHead(aux_inplanes, num_classes)classifier FCNHead(out_inplanes, num_classes)model FCN(backbone, classifier, aux_classifier)return modeldef fcn_resnet101(aux, num_classes21, pretrain_backboneFalse):# resnet101_imagenet: https://download.pytorch.org/models/resnet101-63fe2227.pth# fcn_resnet101_coco: https://download.pytorch.org/models/fcn_resnet101_coco-7ecb50ca.pthbackbone resnet101(replace_stride_with_dilation[False, True, True])if pretrain_backbone:# 载入resnet101 backbone预训练权重backbone.load_state_dict(torch.load(resnet101.pth, map_locationcpu))out_inplanes 2048aux_inplanes 1024return_layers {layer4: out}if aux:return_layers[layer3] auxbackbone IntermediateLayerGetter(backbone, 
return_layersreturn_layers)aux_classifier None# why using aux: https://github.com/pytorch/vision/issues/4292if aux:aux_classifier FCNHead(aux_inplanes, num_classes)classifier FCNHead(out_inplanes, num_classes)model FCN(backbone, classifier, aux_classifier)return modelif __name__ __main__:model fcn_resnet50(auxTrue, num_classes21)print(model)x torch.randn(size(1, 3, 480, 480))print(model(x)[out].shape)print(model(x)[aux].shape)2 损失函数的计算 2.1 VOC的标注详解这张图片大致可以分为四部分一部分是黑色背景一部分是粉红色的人一部分是大红色的飞机还有一部分是白色的神秘物体。图片的背景它是黑色的背景类别为0因此在调色板中0所对应的RGB值为[0,0,0]为黑色。 pascal_voc_classes.json中person: 15可知人用数字15表示而在palette.json中15: [192, 128, 128]可知15对应的RGB为粉红色因此粉红色的是人。同理可知飞机aeroplane: 1在调色板中对应的颜色为大红色。这个白色的神秘物体其实也是一个小飞机但很难分辨故标注时用白色像素给隐藏起来了最后白色对应的像素也不会参与损失计算。如果你足够细心的话你会发现在人和飞机的边缘其实都是存在一圈白色的像素的这是为了更好的区分不同类别对应的像素。同样这里的白色也不会参与损失计算。我们可以用程序来看看标注图像中是否有白色像素。 from PIL import Image import numpy as np img Image.open(D:\\VOCdevkit\\VOC2007\\SegmentationClass\\2007_000032.png) img_np np.array(img)可以看到底下的像素是1表示飞机大红色上面的像素为0表示背景黑色中间的像素为255这就对应着飞机周围的白色像素。我们可以看一下255对应的RGB值 [224,224,192]表示的RGB颜色为白色。这里的255需要注意后面计算损失时白色部分不计算正是通过忽略这个值实现的。 2.2 交叉熵损失cross_entropy $$loss(x,class)=-\log\left(\frac{e^{x[class]}}{\sum\limits_{j} e^{x[j]}}\right)=-x[class]+\log\left(\sum\limits_{j} e^{x[j]}\right)$$ 举个例子：假设输入 $x=[0.1,0.2,0.3]$，标签 $class=1$，则 $$loss(x,class)=-x[class]+\log\left(\sum\limits_{j} e^{x[j]}\right)=-0.2+\log\left(e^{x[0]}+e^{x[1]}+e^{x[2]}\right)=-0.2+\log\left(e^{0.1}+e^{0.2}+e^{0.3}\right)$$ 我们可以用程序进行验证 import torch import numpy as np import math# 官方实现 input torch.tensor([[0.1, 0.2, 0.3],[0.1, 0.2, 0.3],[0.1, 0.2, 
0.3]]) target torch.tensor([0, 1, 2]) loss torch.nn.functional.cross_entropy(input, target) print(官方计算 loss , loss.numpy())# 自己计算 res0 -0.1 np.log(math.exp(0.1) math.exp(0.2) math.exp(0.3)) res1 -0.2 np.log(math.exp(0.1) math.exp(0.2) math.exp(0.3)) res2 -0.3 np.log(math.exp(0.1) math.exp(0.2) math.exp(0.3)) res (res0 res1 res2) / 3 print(自己计算 loss %.7f % res)# 仅精度有差别所以这证明了我们的计算方式是没有错的。官方计算 loss 1.1019429 自己计算 loss 1.1019428 FCN在计算损失是会忽略白色的像素其就对应着标签中的255。忽略白色像素的损失其实很简单只要在函数调用时传入ignore_index并指定对应的值即可。如对本例来说现我打算忽略target中标签为2的数据即不让其参与损失计算我们来看看如何使用cross_entropy函数来实现。 import torch import numpy as np import math# 官方实现 input torch.tensor([[0.1, 0.2, 0.3],[0.1, 0.2, 0.3],[0.1, 0.2, 0.3]]) target torch.tensor([0, 1, 2]) loss torch.nn.functional.cross_entropy(input, target, ignore_index2) print(官方计算 loss , loss.numpy())# 自己计算 res0 -0.1 np.log(math.exp(0.1) math.exp(0.2) math.exp(0.3)) res1 -0.2 np.log(math.exp(0.1) math.exp(0.2) math.exp(0.3)) res (res0 res1 ) / 2 print(自己计算 loss %.6f % res)官方计算 loss 1.151943 自己计算 loss 1.151943 2.3 FCN中损失计算过程程序中输入cross_entropy函数中的x通常是4维的tensor即[NCHW]这时候训练损失是怎么计算的呢我们以x的维度为[1222]为例讲解我们手动计算时候会将数据按通道方向展开然后分别计算cross_entropy最后求平均如下图所示 import torch import numpy as np import math# 1、官方计算 input torch.tensor([[[[0.1, 0.2],[0.3, 0.4]],[[0.5, 0.6],[0.7, 0.8]]]]) #shape(1 2 2 2 )target torch.tensor([[[0, 1],[0, 1]]])loss torch.nn.functional.cross_entropy(input, target) print(官方计算 loss , loss.numpy())# 2、自己计算 res0 -0.1 np.log(math.exp(0.1) math.exp(0.5)) res1 -0.6 np.log(math.exp(0.2) math.exp(0.6)) res2 -0.3 np.log(math.exp(0.3) math.exp(0.7)) res3 -0.8 np.log(math.exp(0.4) math.exp(0.8)) res (res0 res1 res2 res3)/4 print(自己计算 loss %.8f % res)官方计算 loss 0.71301526 自己计算 loss 0.71301525 如果我们此时忽略target0 import torch import numpy as np import math# 1、官方计算 input torch.tensor([[[[0.1, 0.2],[0.3, 0.4]],[[0.5, 0.6],[0.7, 0.8]]]]) #shape(1 2 2 2 )target torch.tensor([[[0, 1],[0, 1]]])loss torch.nn.functional.cross_entropy(input, target , ignore_index0) print(官方计算 
loss , loss.numpy())# 2、自己计算 res1 -0.6 np.log(math.exp(0.2) math.exp(0.6)) res3 -0.8 np.log(math.exp(0.4) math.exp(0.8)) res ( res1 res3)/2 print(自己计算 loss %.7f % res)官方计算 loss 0.5130153 自己计算 loss 0.5130153 2.4 FCN中损失代码通过上面讲解我们就很容易理解FCN的损失计算了。这里忽略了255像素不让其参与到损失的计算中。如果辅助分类器存在给予较小的损失权重。 # fcn/train_utils/train_and_eval.py def criterion(inputs, target):losses {}for name, x in inputs.items():# 忽略target中值为255的像素255的像素是目标边缘或者padding填充losses[name] nn.functional.cross_entropy(x, target, ignore_index255)if len(losses) 1:return losses[out]return losses[out] 0.5 * losses[aux]3 VOC数据集的读取及数据预处理我们自定义VOCSegmentation类继承pytorch提供的torch.utils.data.Dataset类主要实现__getitem__函数。再利用pytorch提供的Dataloader就可以通过调用__getitem__函数来批量读取VOC数据集图片和标签了。 VOCSegmentation类的初始化部分如下方的代码所示: # fcn/my_dataset.py class VOCSegmentation(data.Dataset):def __init__(self, voc_root, year2007, transformsNone, txt_name: str train.txt):super(VOCSegmentation, self).__init__()assert year in [2007, 2012], year must be in [2007, 2012]root os.path.join(voc_root, VOCdevkit, fVOC{year})assert os.path.exists(root), path {} does not exist..format(root)image_dir os.path.join(root, JPEGImages)mask_dir os.path.join(root, SegmentationClass)txt_path os.path.join(root, ImageSets, Segmentation, txt_name)assert os.path.exists(txt_path), file {} does not exist..format(txt_path)with open(os.path.join(txt_path), r) as f:file_names [x.strip() for x in f.readlines() if len(x.strip()) 0]self.images [os.path.join(image_dir, x .jpg) for x in file_names]self.masks [os.path.join(mask_dir, x .png) for x in file_names]assert (len(self.images) len(self.masks))self.transforms transforms首先我们需要获取输入(image)和标签(target)的路径。 voc_root是我们应该传入VOCdevkit所在的文件夹。最终self.image和self.masks里存储的就是我们输入和标签的路径了。接着我们对输入图片和标签进行transformer预处理(代码如下) 训练集采用了随机缩放、水平翻转、随机裁剪、toTensor和Normalize。验证集仅使用了随机缩放、toTensor和Normalize。crop_size设置为480即训练图片都会裁剪到480*480大小而验证时没有使用随机裁剪方法因此验证集的图片尺寸是不一致的, 需要进行进一步的处理。 # fcn/train.py class SegmentationPresetTrain:def __init__(self, base_size, 
crop_size, hflip_prob0.5, mean(0.485, 0.456, 0.406), std(0.229, 0.224, 0.225)):min_size int(0.5 * base_size)max_size int(2.0 * base_size)trans [T.RandomResize(min_size, max_size)]if hflip_prob 0:trans.append(T.RandomHorizontalFlip(hflip_prob))trans.extend([T.RandomCrop(crop_size),T.ToTensor(),T.Normalize(meanmean, stdstd),])self.transforms T.Compose(trans)def __call__(self, img, target):return self.transforms(img, target)class SegmentationPresetEval:def __init__(self, base_size, mean(0.485, 0.456, 0.406), std(0.229, 0.224, 0.225)):self.transforms T.Compose([T.RandomResize(base_size, base_size),T.ToTensor(),T.Normalize(meanmean, stdstd),])def __call__(self, img, target):return self.transforms(img, target)def get_transform(train):base_size 520crop_size 480return SegmentationPresetTrain(base_size, crop_size) if train else SegmentationPresetEval(base_size)预处理代码完成后就可以实现__getitem__以及__len__方法。 # fcn/my_dataset.pydef __getitem__(self, index):Args:index (int): IndexReturns:tuple: (image, target) where target is the image segmentation.img Image.open(self.images[index]).convert(RGB)target Image.open(self.masks[index])if self.transforms is not None:img, target self.transforms(img, target)return img, targetdef __len__(self):return len(self.images)staticmethoddef collate_fn(batch):images, targets list(zip(*batch))batched_imgs cat_list(images, fill_value0)batched_targets cat_list(targets, fill_value255)return batched_imgs, batched_targets在VOCSegmentation类中还实现了DataLoader中需要的collate_fn。在collate_fn中接受一个List类型数据其中每个元素是一个Tuple2类型包括了image和target。在collate_fn中调用cat_list方法对验证集图片尺寸是不一致进行处理。 # fcn/my_dataset.py def cat_list(images, fill_value0):# 计算该batch数据中channel, h, w的最大值max_size tuple(max(s) for s in zip(*[img.shape for img in images]))batch_shape (len(images),) max_sizebatched_imgs images[0].new(*batch_shape).fill_(fill_value)for img, pad_img in zip(images, batched_imgs):pad_img[..., :img.shape[-2], :img.shape[-1]].copy_(img)return batched_imgs最后就可以调用Dataloader批量获取数据了。 # fcn/train.py 
# VOCdevkit - VOC2007 - ImageSets - Segmentation - train.txttrain_dataset VOCSegmentation(args.data_path,year2007,transformsget_transform(trainTrue),txt_nametrain.txt)# VOCdevkit - VOC2007 - ImageSets - Segmentation - val.txtval_dataset VOCSegmentation(args.data_path,year2007,transformsget_transform(trainFalse),txt_nameval.txt)num_workers min([os.cpu_count(), batch_size if batch_size 1 else 0, 8])train_loader torch.utils.data.DataLoader(train_dataset,batch_sizebatch_size,num_workersnum_workers,shuffleTrue,pin_memoryTrue,collate_fntrain_dataset.collate_fn)val_loader torch.utils.data.DataLoader(val_dataset,batch_size1,num_workersnum_workers,pin_memoryTrue,collate_fnval_dataset.collate_fn)4 模型训练及测试 4.1 模型训练代码在 fcn/train.py 中。先利用Dataset和DataLoader批量获取数据。然后创建FCN网络模型可以加载在COCO数据集上的预训练权重。 def create_model(aux, num_classes, pretrainTrue):model fcn_resnet50(auxaux, num_classesnum_classes)if pretrain:weights_dict torch.load(./fcn_resnet50_coco.pth, map_locationcpu)if num_classes ! 21:# 官方提供的预训练权重是21类(包括背景)# 如果训练自己的数据集将和类别相关的权重删除防止权重shape不一致报错for k in list(weights_dict.keys()):if classifier.4 in k:del weights_dict[k]missing_keys, unexpected_keys model.load_state_dict(weights_dict, strictFalse)if len(missing_keys) ! 0 or len(unexpected_keys) ! 
0:print(missing_keys: , missing_keys)print(unexpected_keys: , unexpected_keys)return model设置SGD优化器 # 设置优化器 optimizer torch.optim.SGD(params_to_optimize,lrargs.lr, momentumargs.momentum, weight_decayargs.weight_decay)设置学习率更新策略。 # 创建学习率更新策略这里是每个step更新一次(不是每个epoch)lr_scheduler create_lr_scheduler(optimizer, len(train_loader), args.epochs, warmupTrue)# fcn/train_utils/train_and_eval.py def create_lr_scheduler(optimizer,num_step: int,epochs: int,warmupTrue,warmup_epochs1,warmup_factor1e-3):assert num_step 0 and epochs 0if warmup is False:warmup_epochs 0def f(x):根据step数返回一个学习率倍率因子注意在训练开始之前pytorch会提前调用一次lr_scheduler.step()方法if warmup is True and x (warmup_epochs * num_step):alpha float(x) / (warmup_epochs * num_step)# warmup过程中lr倍率因子从warmup_factor - 1return warmup_factor * (1 - alpha) alphaelse:# warmup后lr倍率因子从1 - 0# 参考deeplab_v2: Learning rate policyreturn (1 - (x - warmup_epochs * num_step) / ((epochs - warmup_epochs) * num_step)) ** 0.9return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambdaf)训练代码如下可以代码调试。 for epoch in range(args.start_epoch, args.epochs):mean_loss, lr train_one_epoch(model, optimizer, train_loader, device, epoch,lr_schedulerlr_scheduler, print_freqargs.print_freq, scalerscaler)# 测试confmat evaluate(model, val_loader, devicedevice, num_classesnum_classes)val_info str(confmat)print(val_info)# write into txtwith open(results_file, a) as f:# 记录每个epoch对应的train_loss、lr以及验证集各指标train_info f[epoch: {epoch}]\n \ftrain_loss: {mean_loss:.4f}\n \flr: {lr:.6f}\nf.write(train_info val_info \n\n)save_file {model: model.state_dict(),optimizer: optimizer.state_dict(),lr_scheduler: lr_scheduler.state_dict(),epoch: epoch,args: args}if args.amp:save_file[scaler] scaler.state_dict()torch.save(save_file, save_weights/model_{}.pth.format(epoch))4.2 模型测试在 train_and_val.py 文件中的 evaluate 函数代码如下创建 ConfusionMatrix 混淆矩阵使用 for 循环遍历 data_loader 得到 image 和 target 信息并将其指给对应的设备当中再将 image 图像输入到 model 模型中进行预测得到 output 输出只使用主分支上的输出调用 update 方法时在计算每一批数据预测结果与真实结果对比的过程中将 target 和 
output.argmax(1) 进行 flatten 处理 output.argmax(1) 中的 1 是指在 channel 维度而 argmax 方法用于将每个像素预测值最大的类别作为其预测类别如下图所示。 # fcn/train_utils/train_and_eval.py def evaluate(model, data_loader, device, num_classes):model.eval()confmat utils.ConfusionMatrix(num_classes)metric_logger utils.MetricLogger(delimiter )header Test:with torch.no_grad():for image, target in metric_logger.log_every(data_loader, 100, header):image, target image.to(device), target.to(device)output model(image)output output[out]confmat.update(target.flatten(), output.argmax(1).flatten())confmat.reduce_from_all_processes()return confmatConfusionMatrix 类代码如下 ConfusionMatrix 类中的 update 函数传入了真实标签 a 和预测标签 b 等参数代码的具体解析这里的 num_classes 是指包含了背景的类别个数。如果 self.mat 是 None 就使用 torch.zeros 创建一个全零矩阵作为混淆矩阵大小为 n x n 用于记录真实标签和预测标签之间的关系。通过检查真实标签 a 中的元素是否属于有效类别范围 [ 0 , N ) 来寻找属于目标类别的像素索引。根据像素的真实类别 a [ k ] 和预测类别 b [ k ] 计算类别索引 inds 用于统计真实类别为 a [ k ] 被预测成 b [ k ] 的像素个数。使用 torch.bincount 统计类别索引 inds 在 [ 0 , n**2 ) 内的出现次数并将结果重塑成 ( n , n ) 的矩阵形状统计数据累加到混淆矩阵中。 ConfusionMatrix 类中的 compute 函数计算常见的语义分割评价指标。语义分割评价指标主要包括 Pixel Accuracy ( Global Accuracy )、mean Accuracy、mean IoU 等 Pixel Accuracy 类别预测正确的像素个数总和 ÷ 图片的总像素个数mean Accuracy 对每个类别的 Accuracy 求平均值mean IoU 对每个类别的 IoU 求平均值 class ConfusionMatrix(object):def __init__(self, num_classes):self.num_classes num_classesself.mat Nonedef update(self, a, b):n self.num_classesif self.mat is None:# 创建混淆矩阵self.mat torch.zeros((n, n), dtypetorch.int64, devicea.device)with torch.no_grad():# 寻找GT中为目标的像素索引例如255就不是目标的像素索引k (a 0) (a n)# 统计像素真实类别a[k]被预测成类别b[k]的个数(这里的做法很巧妙)inds n * a[k].to(torch.int64) b[k]self.mat torch.bincount(inds, minlengthn**2).reshape(n, n)def reset(self):if self.mat is not None:self.mat.zero_()def compute(self):h self.mat.float()# 计算全局预测准确率(混淆矩阵的对角线为预测正确的个数)acc_global torch.diag(h).sum() / h.sum()# 计算每个类别的准确率acc torch.diag(h) / h.sum(1)# 计算每个类别预测与真实目标的iouiu torch.diag(h) / (h.sum(1) h.sum(0) - torch.diag(h))return acc_global, acc, iudef reduce_from_all_processes(self):if not 
torch.distributed.is_available():returnif not torch.distributed.is_initialized():returntorch.distributed.barrier()torch.distributed.all_reduce(self.mat)def __str__(self):acc_global, acc, iu self.compute()return (global correct: {:.1f}\naverage row correct: {}\nIoU: {}\nmean IoU: {:.1f}).format(acc_global.item() * 100,[{:.1f}.format(i) for i in (acc * 100).tolist()],[{:.1f}.format(i) for i in (iu * 100).tolist()],iu.mean().item() * 100)4.3 模型预测模型输出为1×c×h×w因为这是预测故batch1这里使用的是VOC数据故这里的cnum_class21。【包含一个背景类】首先我们会取输出中每个像素在21个通道中的最大值如第一个像素在21个通道的最大值在通道0上取得。这个通道对应的索引是0在VOC中是背景类故这个像素所属类别为背景。其它像素同理。 # fcn/predict.pymodel.eval() # 进入验证模式with torch.no_grad():# init modelimg_height, img_width img.shape[-2:]init_img torch.zeros((1, 3, img_height, img_width), devicedevice)model(init_img)t_start time_synchronized()output model(img.to(device))t_end time_synchronized()print(inference time: {}.format(t_end - t_start))# 在输出中的chanel维度求最大值对应的类别索引prediction output[out].argmax(1).squeeze(0)prediction prediction.to(cpu).numpy().astype(np.uint8)mask Image.fromarray(prediction)mask.putpalette(pallette)mask.save(test_result.png)

查看全文

http://www.zqtcl.cn/news/261515/