1. 前言
很多论文都提到了这篇文章 NCNet,以代码为主结合论文补一下方法。
2. 方法
2.1 流程
Fig 1:用 CNN 提取图像 $I_A, I_B$ 的特征图 $f^A, f^B$。所有的特征匹配 $f^A_{ij}, f^B_{kl}$ 都在 4D 的匹配空间 $(i,j,k,l)$ 中表示,匹配得分保存在 4D 张量 $c$ 中。这些匹配经过 soft-nearest neighbour filtering 和 neighbourhood consensus network 处理后得到最终的对应集。
方法分为5个部分:
(1)dense feature extraction and matching 特征提取与匹配
(2)the neighbourhood consensus network
(3)a soft mutual nearest neighbour filtering 滤波
(4)extraction of correspondences from the output 4D filtered match tensor 提取对应关系
(5)weakly supervised training loss 弱监督损失
class ImMatchNet(nn.Module):
    # Used only for the forward pass at eval time and for training with strong supervision.
    def forward(self, tnf_batch):
        """Produce the filtered 4D correlation for one batch of image pairs.

        Args:
            tnf_batch: mapping whose 'source_image' and 'target_image' entries
                are each passed through ``self.FeatureExtraction``.

        Returns:
            The filtered correlation tensor. When
            ``self.relocalization_k_size > 1`` the result is instead the tuple
            ``(corr4d, (max_i, max_j, max_k, max_l))``, where the second item
            holds the extra values returned by ``maxpool4d``.
        """
        # Feature extraction for both images.
        source_features = self.FeatureExtraction(tnf_batch['source_image'])
        target_features = self.FeatureExtraction(tnf_batch['target_image'])
        if self.half_precision:
            source_features = source_features.half()
            target_features = target_features.half()

        # Feature correlation between the two feature maps.
        scores = self.FeatureCorrelation(source_features, target_features)

        # Optional 4D max-pooling for relocalization
        # (the original listing notes "default 0" for this setting).
        use_relocalization = self.relocalization_k_size > 1
        if use_relocalization:
            scores, max_i, max_j, max_k, max_l = maxpool4d(
                scores, k_size=self.relocalization_k_size
            )

        # Match processing: mutual filtering before and after the consensus network.
        scores = MutualMatching(scores)
        scores = self.NeighConsensus(scores)
        scores = MutualMatching(scores)

        if not use_relocalization:
            return scores
        return (scores, (max_i, max_j, max_k, max_l))
直接看网络的 forward 部分:
(1)feature extraction 特征提取,特征最后做了 L2Norm,毕竟后面肯定要算相似度,常规操作
(2)feature correlation 估计是计算特征的相似度,self.FeatureCorrelation(feature_A,feature_B)
(3)run match processing model MutualMatching(corr4d), self.NeighConsensus(corr4d)
2.2 FeatureCorrelation
# Correlation module that ImMatchNet.forward invokes as self.FeatureCorrelation(feature_A, feature_B).
# shape='4D' selects the 4D branch of FeatureCorrelation.forward; normalization=False skips the
# ReLU + featureL2Norm step there. Presumably this line lives in ImMatchNet.__init__ (not shown).
self.FeatureCorrelation = FeatureCorrelation(shape='4D',normalization=False)
class FeatureCorrelation(torch.nn.Module):
# Correlates every spatial location of feature_A with every spatial location of feature_B
# through a batched matrix product (a dot product over the channel dimension).
# NOTE: indentation was lost in this listing, and the body of the '3D' branch is not shown.
def __init__(self,shape='3D',normalization=True):
# shape: selects the branch taken in forward() ('3D' or '4D').
# normalization: when true, forward() applies ReLU and then featureL2Norm to the result.
super(FeatureCorrelation, self).__init__()
self.normalization = normalization
self.shape=shape
self.ReLU = nn.ReLU()
def forward(self, feature_A, feature_B):
if self.shape=='3D':
# (the body of the 3D branch is omitted in this excerpt)
elif self.shape=='4D':
b,c,hA,wA = feature_A.size()
b,c,hB,wB = feature_B.size() # b and c are re-bound from feature_B here
# reshape features for matrix multiplication
feature_A = feature_A.view(b,c,hA*wA).transpose(1,2) # size [b,hA*wA,c]
feature_B = feature_B.view(b,c,hB*wB) # size [b,c,hB*wB]
# perform matrix mult.
feature_mul = torch.bmm(feature_A,feature_B) # size [b,hA*wA,hB*wB]
# indexed [batch,1,row_A,col_A,row_B,col_B] once the singleton channel dim is inserted
correlation_tensor = feature_mul.view(b,hA,wA,hB,wB).unsqueeze(1)
if self.normalization:
# featureL2Norm is defined elsewhere (not shown in this excerpt)
correlation_tensor = featureL2Norm(self.ReLU(correlation_tensor))
return correlation_tensor
很显然就是对两张图像的所有特征两两计算相似度,也就是特征向量的点积,最终输出一个大小为 $(b,1,h_A,w_A,h_B,w_B)$ 的 tensor。
2.3 MutualMatching
def MutualMatching(corr4d):
    """Apply soft mutual-nearest-neighbour filtering to a 4D correlation tensor.

    Every score is multiplied by two ratios: the score divided by the best
    score of the same A location (maximum over all B locations), and the score
    divided by the best score of the same B location (maximum over all A
    locations).

    Args:
        corr4d: tensor of size [batch, ch, fs1, fs2, fs3, fs4]. The ``view``
            calls below only succeed when ``ch`` is 1.

    Returns:
        A tensor of size [batch, 1, fs1, fs2, fs3, fs4] holding the re-weighted
        scores.
    """
    batch_size, _, fs1, fs2, fs3, fs4 = corr4d.size()

    # Flatten the A locations -> [batch_idx, k_A, i_B, j_B]
    corr4d_B = corr4d.view(batch_size, fs1 * fs2, fs3, fs4)
    # Flatten the B locations -> [batch_idx, i_A, j_A, k_B]
    corr4d_A = corr4d.view(batch_size, fs1, fs2, fs3 * fs4)

    # Best score per B location (over all A) and per A location (over all B).
    corr4d_B_max, _ = torch.max(corr4d_B, dim=1, keepdim=True)
    corr4d_A_max, _ = torch.max(corr4d_A, dim=3, keepdim=True)

    eps = 1e-5  # keeps the division finite when a maximum is zero
    corr4d_B = corr4d_B / (corr4d_B_max + eps)
    corr4d_A = corr4d_A / (corr4d_A_max + eps)

    corr4d_B = corr4d_B.view(batch_size, 1, fs1, fs2, fs3, fs4)
    corr4d_A = corr4d_A.view(batch_size, 1, fs1, fs2, fs3, fs4)

    # Parentheses are important for a symmetric output.
    return corr4d * (corr4d_A * corr4d_B)
对应论文中的 soft mutual nearest neighbour filtering。输入为两幅图像所有特征之间的相似度张量 $(b,1,i,j,k,l)$,对于每一对特征 $(f^A_{ij},f^B_{kl})$ 有一个相似度值 $c_{ijkl}$。这里 MutualMatching 的作用就是将相似度乘上两个系数:$c_{ijkl} \times s_A \times s_B$。其中 $s_A=c_{ijkl}/c_{ij_{max}}$,$c_{ij_{max}}$ 为 $f^A_{ij}$ 与所有 $f^B$ 的相似度中的最大值;$s_B$ 同理,为 $c_{ijkl}$ 除以 $f^B_{kl}$ 与所有 $f^A$ 的相似度中的最大值。这套操作不复杂但是感觉很难简单描述,通俗点讲就是每个匹配都乘上它在所有候选匹配中所占的比例。在匹配中,通常认为两个点 $p_A,p_B$ 仅当 B 中与 $p_A$ 匹配度最高的点为 $p_B$,且 A 中与 $p_B$ 匹配度最高的点为 $p_A$ 时,才算匹配成功。这里有点像把这个操作做了个可求导的近似。
2.4 NeighConsensus
# Consensus network that ImMatchNet.forward applies between the two MutualMatching calls.
# Presumably this statement lives in ImMatchNet.__init__ (not shown); the trailing notes give
# example values, which coincide with the NeighConsensus defaults.
self.NeighConsensus = NeighConsensus(use_cuda=self.use_cuda,
kernel_sizes=ncons_kernel_sizes, # e.g. [3,3,3]
channels=ncons_channels) # e.g. [10,10,1]
class NeighConsensus(torch.nn.Module):
    """Stack of Conv4d + ReLU layers applied to a 4D correlation tensor.

    ``Conv4d`` is defined elsewhere in the project (not shown here).
    """

    def __init__(self, use_cuda=True, kernel_sizes=[3,3,3], channels=[10,10,1], symmetric_mode=True):
        """Build the convolution stack.

        Args:
            use_cuda: when true, the convolution stack is moved to the GPU.
            kernel_sizes: kernel size of each Conv4d layer; its length sets
                the number of layers.
            channels: output channels of each layer; layer ``i`` reads
                ``channels[i-1]`` input channels (1 for the first layer).
            symmetric_mode: when true, ``forward`` also runs the stack on the
                A/B-swapped input and sums both results.
        """
        super(NeighConsensus, self).__init__()
        self.symmetric_mode = symmetric_mode
        # Store copies so the instance never aliases the shared default lists
        # (or a list the caller may mutate later).
        self.kernel_sizes = list(kernel_sizes)
        self.channels = list(channels)

        layers = []
        ch_in = 1  # the correlation tensor enters with a single channel
        for i, k_size in enumerate(self.kernel_sizes):
            ch_out = self.channels[i]
            layers.append(Conv4d(in_channels=ch_in, out_channels=ch_out, kernel_size=k_size, bias=True))
            layers.append(nn.ReLU(inplace=True))
            ch_in = ch_out
        self.conv = nn.Sequential(*layers)
        if use_cuda:
            self.conv.cuda()

    def forward(self, x):
        """Filter the correlation tensor ``x`` with the convolution stack.

        In symmetric mode the stack is also applied to the "transposed" input
        (A-B ordering swapped to B-A); that second result is transposed back
        to A-B ordering and added to the first.
        """
        if not self.symmetric_mode:
            return self.conv(x)
        # Because of the ReLU layers between the linear layers, this differs
        # from convolving once with filters + filters^T, so it is worth doing.
        swapped = self.conv(x.permute(0, 1, 4, 5, 2, 3)).permute(0, 1, 4, 5, 2, 3)
        return self.conv(x) + swapped
对应论文中的 neighbourhood consensus network,也就是 NC-Net 部分。网络结构为 Conv4d(1,10,3) + ReLU + Conv4d(10,10,3) + ReLU + Conv4d(10,1,3) + ReLU,这里的 4D 卷积是自定义的卷积操作。
先看下论文对这边的说法。NC-Net 的作用是对得到的相关性图(correlation map,也就是相似度得分)做进一步处理和过滤。从这个相关性图中找到正确匹配的难点在于:正确的匹配只有 $hw$ 个,却要从 $(hw)^2$ 个得分中找出来,所以大部分信息对应的是不正确的匹配。文中只解释了网络设计的原因:为什么用卷积、每一层的大致作用、两张图像交换顺序后得到的匹配应当一致等。虽然没说这个 4D 卷积是怎么计算的,但应该就是和 2D 卷积一样的模式。
NC-Net 的作用是根据局部信息过滤匹配,soft mutual nearest neighbour filtering 则是根据全局信息过滤。其实这个过滤就是在原有的匹配得分上乘上权重,这个权重是基于全局信息或是局部信息。
2.5 损失
def _weak_match_score(corr4d, normalize):
    """Return the mean best normalized match score over both matching directions."""
    batch_size = corr4d.size(0)
    # Read the four spatial sizes separately so the A and B feature maps
    # may differ in size.
    fs1, fs2, fs3, fs4 = corr4d.shape[-4:]

    # [batch_idx, k_A, i_B, j_B]: all A locations stacked along dim 1
    nc_B_Avec = corr4d.view(batch_size, fs1 * fs2, fs3, fs4)
    # [batch_idx, k_B, i_A, j_A]: all B locations stacked along dim 1
    nc_A_Bvec = corr4d.view(batch_size, fs1, fs2, fs3 * fs4).permute(0, 3, 1, 2)

    # Best (normalized) score for every B location and for every A location.
    scores_B, _ = torch.max(normalize(nc_B_Avec), dim=1)
    scores_A, _ = torch.max(normalize(nc_A_Bvec), dim=1)

    # Averaging each direction separately equals mean(scores_A + scores_B)
    # when both maps have the same size, and stays valid when they do not.
    return (torch.mean(scores_A) + torch.mean(scores_B)) / 2


def weak_loss(model, batch, normalization="softmax", alpha=30):
    """Weakly supervised matching loss: negative-pair score minus positive-pair score.

    The positive score comes from ``model(batch)``. The negative score comes
    from the same batch with the source images rolled by one position along
    the batch dimension, so every source is paired with another sample's
    target.

    Args:
        model: callable that takes the batch mapping and returns the 4D
            correlation tensor ([batch, 1, fs1, fs2, fs3, fs4]).
        batch: mapping with at least a "source_image" tensor; it is passed to
            ``model`` and is not modified.
        normalization: ``None`` (use raw scores), ``"softmax"`` or ``"l1"``;
            applied along dim 1 of the reshaped scores.
        alpha: unused; kept so existing callers keep working.

    Returns:
        Scalar tensor ``score_neg - score_pos``.

    Raises:
        ValueError: if ``normalization`` is not one of the supported values.
    """
    if normalization is None:
        def normalize(x):
            return x
    elif normalization == "softmax":
        def normalize(x):
            return torch.nn.functional.softmax(x, 1)
    elif normalization == "l1":
        def normalize(x):
            return x / (torch.sum(x, dim=1, keepdim=True) + 0.0001)
    else:
        # Fail before running the model instead of hitting an unbound name later.
        raise ValueError("unknown normalization: {!r}".format(normalization))

    # positive pairs
    score_pos = _weak_match_score(model(batch), normalize)

    # negative pairs: roll the sources on a shallow copy so the caller's
    # batch keeps its original source images
    negative_batch = dict(batch)
    negative_batch["source_image"] = torch.roll(batch["source_image"], shifts=-1, dims=0)
    score_neg = _weak_match_score(model(negative_batch), normalize)

    return score_neg - score_pos
这边是作者设计的一种不需要标签的弱监督,当一对图像可以匹配的时候,利用 softmax 可以得到每个特征与之匹配的分类得分,并最大化这个得分(也就是可以很好的分类,有明确的匹配);当一对图像不可以匹配的时候,最小化得分(也就是不能分类,没有匹配)
文章评论