balanced_positive_negative_sampler.py

在 rpn/loss.py 文件中的 class RPNLossComputation(object) 类使用了 class BalancedPositiveNegativeSampler(object) 类的实例对象作为函数参数, 下面我们就来看看该类的具体实现:

# ../maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py

class BalancedPositiveNegativeSampler(object):
    # 该类用于生成采样 batch, 使得 batch 中的正负样本比例维持一个固定的数

    def __init__(self, batch_size_per_image, positive_fraction):
        # Store the sampling configuration; no other work is done here.
        #
        # batch_size_per_image (int): number of samples selected per image.
        # positive_fraction: target share of positive samples in each batch.
        #   NOTE(review): the parameter name and the class-level comment indicate
        #   a ratio (presumably a float in [0, 1]), not an absolute count --
        #   confirm against the sampling code in __call__.

        self.batch_size_per_image = batch_size_per_image
        self.positive_fraction = positive_fraction


    def __call__(self, matched_idxs):
        # matched idxs: 一个元素类型为 tensor 的列表,
        # tensor 包含值 -1, 0, 1. 每个 tensor 都对应这一个具体的图片
        # -1 代表忽略图片中的该样本, 0 代表该样本为负, 1 代表该样本为正

        # 返回值:
        # pos_idx (list[tensor])
        # neg_idx (list[tensor])
        # 每张图片都返回两个二值掩膜列表, 其中一个指示了哪些正样本被采样, 另一个指示了负样本

        pos_idx = []
        neg_idx = []

        for matched_idxs_per_image in matched_idxs:

box_coder.py

在 ./maskrcnn_benchmark/modeling/rpn/rpn.py 文件中, 使用了下面的语句创建 class BoxCoder(object) 实例:

rpn_box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))

这个类位于 ./maskrcnn_benchmark/modeling/box_coder.py 文件中, 在对这个类的定义展开详细解析前, 我们首先对 R-CNN 中 bounding box 的回归任务的坐标编码方式进行介绍. 假设我们具有一个候选区域框, 用 $P=(P_x, P_y, P_w, P_h)$ 表示 (其中 $(P_x, P_y)$ 为框的中心坐标, $P_w, P_h$ 分别为框的宽和高), 它对应的真实框用 $G=(G_x, G_y, G_w, G_h)$ 表示, 那么, 我们的目标是希望回归器能够学习到一个从 $P$ 到 $G$ 的转化(transformation), 假设此时模型从 $P$ 中预测得到的框为 $\hat G = (\hat G_x, \hat G_y, \hat G_w, \hat G_h)$, 那么我们可以定义出如下的从 $P$ 到 $\hat G$ 的转化:

$$
\begin{aligned}
\hat G_x &= P_w \, d_x(P) + P_x \\
\hat G_y &= P_h \, d_y(P) + P_y \\
\hat G_w &= P_w \exp(d_w(P)) \\
\hat G_h &= P_h \exp(d_h(P))
\end{aligned}
$$

上式中的 $d_x(P), d_y(P), d_w(P), d_h(P)$ 就是我们要学习的参数, 那么我们的学习目标就是使得这些参数可以满足 $\hat G = G$, 也就是说, 我们的学习目标就是令参数 $d_x(P), d_y(P), d_w(P), d_h(P)$ 无限近似于下面的 $(t_x, t_y, t_w, t_h)$:

$$
\begin{aligned}
t_x &= (G_x - P_x) / P_w \\
t_y &= (G_y - P_y) / P_h \\
t_w &= \log(G_w / P_w) \\
t_h &= \log(G_h / P_h)
\end{aligned}
$$

BoxCoder 类的代码解析如下所示:

# ./maskrcnn_benchmark/modeling/box_coder.py

class BoxCoder(object):
    # Converts between absolute (x1, y1, x2, y2) boxes and the relative
    # (dx, dy, dw, dh) deltas that a bounding-box regressor is trained on.

    def __init__(self, weights, bbox_xform_clip=math.log(1000. / 16)):
        # weights (4-element tuple): scale factors (wx, wy, ww, wh) applied to
        #   the four deltas when encoding and divided out again when decoding.
        # bbox_xform_clip (float): upper bound applied to dw and dh in decode().

        self.weights = weights
        self.bbox_xform_clip = bbox_xform_clip

    @staticmethod
    def _to_center_form(boxes, offset=1):
        # Turn (x1, y1, x2, y2) rows into (ctr_x, ctr_y, width, height) columns.
        # `offset` is the +1 added to x2 - x1 and y2 - y1 when measuring the
        # width and height.
        box_w = boxes[:, 2] - boxes[:, 0] + offset
        box_h = boxes[:, 3] - boxes[:, 1] + offset
        center_x = boxes[:, 0] + 0.5 * box_w
        center_y = boxes[:, 1] + 0.5 * box_h
        return center_x, center_y, box_w, box_h

    def encode(self, reference_boxes, proposals):
        # Compute the regression targets that map each proposal onto its
        # reference box.
        # reference_boxes (Tensor): target boxes, one (x1, y1, x2, y2) row each.
        # proposals (Tensor): boxes to be encoded, same row layout.
        # Returns a tensor of shape [n, 4] holding (dx, dy, dw, dh) per proposal.

        src_cx, src_cy, src_w, src_h = self._to_center_form(proposals)
        gt_cx, gt_cy, gt_w, gt_h = self._to_center_form(reference_boxes)

        wx, wy, ww, wh = self.weights
        # Centre offsets are normalised by the proposal size; size changes are
        # expressed as log-ratios.
        deltas = (
            wx * (gt_cx - src_cx) / src_w,
            wy * (gt_cy - src_cy) / src_h,
            ww * torch.log(gt_w / src_w),
            wh * torch.log(gt_h / src_h),
        )
        return torch.stack(deltas, dim=1)

    def decode(self, rel_codes, boxes):
        # Apply encoded deltas to reference boxes and return the resulting
        # boxes in (x1, y1, x2, y2) form.
        # rel_codes (Tensor): encoded deltas; every group of 4 consecutive
        #   columns is one (dx, dy, dw, dh) tuple.
        # boxes (Tensor): reference boxes the deltas are relative to.
        # Returns a tensor with the same shape as rel_codes.

        boxes = boxes.to(rel_codes.dtype)
        ctr_x, ctr_y, widths, heights = self._to_center_form(boxes)

        # Undo the per-component weighting (stride 4 picks one component out
        # of every delta tuple).
        wx, wy, ww, wh = self.weights
        dx = rel_codes[:, 0::4] / wx
        dy = rel_codes[:, 1::4] / wy
        # Clamp dw / dh so that torch.exp() never receives an overly large value.
        dw = torch.clamp(rel_codes[:, 2::4] / ww, max=self.bbox_xform_clip)
        dh = torch.clamp(rel_codes[:, 3::4] / wh, max=self.bbox_xform_clip)

        # Predicted centre and size; [:, None] broadcasts the per-box values
        # over all delta tuples of that row.
        pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
        pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
        half_w = 0.5 * (torch.exp(dw) * widths[:, None])
        half_h = 0.5 * (torch.exp(dh) * heights[:, None])

        # Back to corner form; the -1 on x2 / y2 mirrors the +1 used when
        # measuring widths and heights.
        pred_boxes = torch.zeros_like(rel_codes)
        pred_boxes[:, 0::4] = pred_ctr_x - half_w      # x1
        pred_boxes[:, 1::4] = pred_ctr_y - half_h      # y1
        pred_boxes[:, 2::4] = pred_ctr_x + half_w - 1  # x2
        pred_boxes[:, 3::4] = pred_ctr_y + half_h - 1  # y2

        return pred_boxes

matcher.py

在 rpn/loss.py 文件中的 make_rpn_loss_evaluator() 函数中利用如下语句创建了一个 class Matcher(object) 实例:

# ./maskrcnn_benchmark/modeling/rpn/loss.py

matcher = Matcher(
    cfg.MODEL.RPN.FG_IOU_THRESHOLD,
    cfg.MODEL.RPN.BG_IOU_THRESHOLD,
    allow_low_quality_matches=True,
)

下面我们就来看看该类的具体实现:

# ./maskrcnn_benchmark/modeling/matcher.py

import torch

class Matcher(object):
    # Assigns to every predicted "element" (e.g. a box) at most one
    # ground-truth element; a ground-truth element may be assigned to several
    # predictions.
    # Matching is driven by an M x N match-quality matrix (e.g. IoU) whose
    # rows are ground-truth elements and whose columns are predictions.
    # Calling the matcher returns a length-N tensor: entry i is the index
    # (0 .. M-1) of the ground-truth element matched to prediction i, or one
    # of the negative sentinels below when prediction i has no match.

    BELOW_LOW_THRESHOLD = -1
    BETWEEN_THRESHOLDS = -2

    def __init__(self, high_threshold, low_threshold, allow_low_quality_matches=False):
        # high_threshold: predictions whose best quality is >= this value keep
        #   their match (e.g. 0.7).
        # low_threshold: predictions whose best quality lies in
        #   [low_threshold, high_threshold) are marked BETWEEN_THRESHOLDS;
        #   those below low_threshold are marked BELOW_LOW_THRESHOLD.
        # allow_low_quality_matches: when True, set_low_quality_matches_() is
        #   run afterwards and can restore matches that the thresholds removed.

        # The high threshold must not be smaller than the low threshold.
        assert low_threshold <= high_threshold

        self.high_threshold = high_threshold
        self.low_threshold = low_threshold
        self.allow_low_quality_matches = allow_low_quality_matches

    def __call__(self, match_quality_matrix):
        # match_quality_matrix (Tensor[float]): M x N tensor holding the match
        #   quality between M ground-truth elements and N predictions.
        # Returns matches (Tensor[int64]): length-N tensor; matches[i] is the
        #   matched ground-truth index for prediction i, or a negative sentinel.
        # Raises ValueError when the matrix is empty (no ground truth or no
        #   predictions).

        if match_quality_matrix.numel() == 0:
            # An empty matrix cannot be matched; report which side is missing.
            if match_quality_matrix.shape[0] == 0:
                raise ValueError(
                    "No ground-truth boxes available for one of the images "
                    "during training"
                )
            else:
                raise ValueError(
                    "No proposal boxes available for one of the images "
                    "during training"
                )

        # For every prediction (column) pick the ground-truth row with the
        # highest quality.
        matched_vals, matches = match_quality_matrix.max(dim=0)

        # Keep the unthresholded assignment so low-quality matches can be
        # restored later.
        all_matches = matches.clone() if self.allow_low_quality_matches else None

        # Classify predictions by their best quality value.
        below_low_threshold = matched_vals < self.low_threshold
        between_thresholds = (matched_vals >= self.low_threshold) & (
            matched_vals < self.high_threshold
        )

        # Overwrite the rejected predictions with the corresponding sentinel.
        matches[below_low_threshold] = Matcher.BELOW_LOW_THRESHOLD
        matches[between_thresholds] = Matcher.BETWEEN_THRESHOLDS

        if self.allow_low_quality_matches:
            self.set_low_quality_matches_(matches, all_matches, match_quality_matrix)

        return matches

    def set_low_quality_matches_(self, matches, all_matches, match_quality_matrix):
        # Restore, in place, matches for predictions that are the best
        # available candidate for some ground-truth element.
        # For every ground-truth row, all predictions that attain that row's
        # maximum quality (ties included) get their entry in `matches` reset to
        # the value stored in `all_matches`, i.e. the assignment from before
        # thresholding.

        # Best quality achieved by any prediction, per ground-truth row.
        highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)

        # (z x 2) tensor of (gt_index, prediction_index) pairs whose quality
        # equals the row maximum; z is the number of such pairs.
        gt_pred_pairs_of_highest_quality = torch.nonzero(
            match_quality_matrix == highest_quality_foreach_gt[:, None]
        )

        pred_inds_to_update = gt_pred_pairs_of_highest_quality[:, 1]
        matches[pred_inds_to_update] = all_matches[pred_inds_to_update]

MaskrcnnBenchmark 源码解析-模型定义(modeling)之辅助文件解析

balanced_positive_negative_sampler.py

box_coder.py

matcher.py

poolers.py

registry.py

utils.py