Snorkel Visual Relationship Detection (VRD) Example

Open In Colab

13. Visual Relationship Detection

  • This example is adapted from snorkel-tutorials and works with the Visual Relationship Detection (VRD) dataset, focusing on the task of classifying relationships between objects in an image.
  • Objects in an image usually relate to one another; a relationship is described as a subject <predicate> object triplet.
    • For example, in person riding bicycle, "person" and "bicycle" are the subject and object, and "riding" is the predicate.
  • In the figure below, the red box marks the subject and the green box marks the object. The predicate (e.g., kick) denotes the relationship connecting the subject and the object.
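
Since the figure itself is not reproduced here, the following minimal sketch (not part of the original tutorial; it assumes matplotlib is available and uses the [ymin, ymax, xmin, xmax] bbox convention described later in this notebook) shows one way to draw such boxes:

import matplotlib.pyplot as plt
import matplotlib.patches as patches

def show_relationship(img_arr, subject_bbox, object_bbox):
    # draw the subject box in red and the object box in green
    fig, ax = plt.subplots()
    ax.imshow(img_arr)
    for bbox, color in [(subject_bbox, "red"), (object_bbox, "green")]:
        ymin, ymax, xmin, xmax = bbox
        ax.add_patch(
            patches.Rectangle(
                (xmin, ymin), xmax - xmin, ymax - ymin,
                linewidth=2, edgecolor=color, facecolor="none",
            )
        )
    plt.show()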

Set Up the Project Environment

Clone the Project

!git clone https://github.com/snorkel-team/snorkel-tutorials.git > clone_log.txt
!pip3 install snorkel
%cd snorkel-tutorials/visual_relation

Patch the Project Files

  • The example was written against an older version of pandas, so the deprecated df.as_matrix() still appears in snorkel-tutorials/visual_relation/model.py. To run in a current environment, it is rewritten as df.values (around line 135 of model.py).
  • You can run the %%writefile cell below directly, or edit the file yourself at that path (a one-line sed alternative is sketched right after this list).
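
For example, an in-place substitution (a hedged alternative to the %%writefile cell below; it assumes the clone landed at /content/snorkel-tutorials) would be:

# replace the deprecated .as_matrix() call with .values in model.py
!sed -i 's/\.as_matrix()/.values/g' /content/snorkel-tutorials/visual_relation/model.py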
%%writefile /content/snorkel-tutorials/visual_relation/model.py

# fix _get_wordvec() df.as_matrix() to df.values (line:137)
import csv
from pathlib import Path
from typing import Tuple

import numpy as np
import pandas
import torch
import torch.nn as nn
from PIL import Image
from torchvision import transforms

from snorkel.analysis import Scorer
from snorkel.classification import DictDataset, MultitaskClassifier, Operation, Task
from snorkel.classification.data import XDict, YDict


def union(bbox1, bbox2):
    """Create the union of the two bboxes.

    Parameters
    ----------
    bbox1
        Coordinates of first bounding box
    bbox2
        Coordinates of second bounding box

    Returns
    -------
    [y0, y1, x0, x1]
        Coordinates of union of input bounding boxes

    """
    y0 = min(bbox1[0], bbox2[0])
    y1 = max(bbox1[1], bbox2[1])
    x0 = min(bbox1[2], bbox2[2])
    x1 = max(bbox1[3], bbox2[3])
    return [y0, y1, x0, x1]


def crop_img_arr(img_arr, bbox):
    """Crop bounding box from image.

    Parameters
    ----------
    img_arr
        Image in array format
    bbox
        Coordinates of bounding box to crop

    Returns
    -------
    img_arr
        Cropped image

    """
    return img_arr[bbox[0] : bbox[1], bbox[2] : bbox[3], :]


class SceneGraphDataset(DictDataset):
    """Dataloader for Scene Graph Dataset."""

    def __init__(
        self,
        name: str,
        split: str,
        image_dir: str,
        df: pandas.DataFrame,
        image_size=224,
    ) -> None:
        self.image_dir = Path(image_dir)
        X_dict = {
            "img_fn": df["source_img"].tolist(),
            "obj_bbox": df["object_bbox"].tolist(),
            "sub_bbox": df["subject_bbox"].tolist(),
            "obj_category": df["object_category"].tolist(),
            "sub_category": df["subject_category"].tolist(),
        }
        Y_dict = {
            "visual_relation_task": torch.LongTensor(df["label"].to_numpy())
        }  # change to take in the rounded train labels
        super(SceneGraphDataset, self).__init__(name, split, X_dict, Y_dict)

        # define standard set of transformations to apply to each image
        self.transform = transforms.Compose(
            [
                transforms.Resize((image_size, image_size)),
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                ),
            ]
        )

    def __getitem__(self, index: int) -> Tuple[XDict, YDict]:
        img_fn = self.X_dict["img_fn"][index]
        img_arr = np.array(Image.open(self.image_dir / img_fn))

        obj_bbox = self.X_dict["obj_bbox"][index]
        sub_bbox = self.X_dict["sub_bbox"][index]
        obj_category = self.X_dict["obj_category"][index]
        sub_category = self.X_dict["sub_category"][index]

        # compute crops
        obj_crop = crop_img_arr(img_arr, obj_bbox)
        sub_crop = crop_img_arr(img_arr, sub_bbox)
        union_crop = crop_img_arr(img_arr, union(obj_bbox, sub_bbox))

        # transform each crop
        x_dict = {
            "obj_crop": self.transform(Image.fromarray(obj_crop)),
            "sub_crop": self.transform(Image.fromarray(sub_crop)),
            "union_crop": self.transform(Image.fromarray(union_crop)),
            "obj_category": obj_category,
            "sub_category": sub_category,
        }

        y_dict = {name: label[index] for name, label in self.Y_dict.items()}
        return x_dict, y_dict

    def __len__(self):
        return len(self.X_dict["img_fn"])


class WordEmb(nn.Module):
    """Extract and concat word embeddings for obj and sub categories."""

    def __init__(self, glove_fn="data/glove/glove.6B.100d.txt"):
        super(WordEmb, self).__init__()

        self.word_embs = pandas.read_csv(
            glove_fn, sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE
        )

    def _get_wordvec(self, word):
        return self.word_embs.loc[word].values

    def forward(self, obj_category, sub_category):
        obj_emb = self._get_wordvec(obj_category)
        sub_emb = self._get_wordvec(sub_category)
        embs = np.concatenate([obj_emb, sub_emb], axis=1)
        return torch.FloatTensor(embs)


# Classes and helper functions for defining classifier
def init_fc(fc):
    torch.nn.init.xavier_uniform_(fc.weight)
    fc.bias.data.fill_(0.01)


class FlatConcat(nn.Module):
    """Module that flattens and concatenates features"""

    def forward(self, *inputs):
        return torch.cat([input.view(input.size(0), -1) for input in inputs], dim=1)


# Helper functions to generate operations
def get_op_sequence():
    # define feature extractors for each of the (union, subject, and object) image crops
    union_feat_op = Operation(
        name="union_feat_op",
        module_name="feat_extractor",
        inputs=[("_input_", "union_crop")],
    )

    sub_feat_op = Operation(
        name="sub_feat_op",
        module_name="feat_extractor",
        inputs=[("_input_", "sub_crop")],
    )

    obj_feat_op = Operation(
        name="obj_feat_op",
        module_name="feat_extractor",
        inputs=[("_input_", "obj_crop")],
    )

    # define an operation to extract word embeddings for subject and object categories
    word_emb_op = Operation(
        name="word_emb_op",
        module_name="word_emb",
        inputs=[("_input_", "sub_category"), ("_input_", "obj_category")],
    )

    # define an operation to concatenate image features and word embeddings
    concat_op = Operation(
        name="concat_op",
        module_name="feat_concat",
        inputs=["obj_feat_op", "sub_feat_op", "union_feat_op", "word_emb_op"],
    )

    # define an operation to make a prediction over all concatenated features
    prediction_op = Operation(
        name="head_op", module_name="prediction_head", inputs=["concat_op"]
    )

    return [
        sub_feat_op,
        obj_feat_op,
        union_feat_op,
        word_emb_op,
        concat_op,
        prediction_op,
    ]


# Create model from pre loaded resnet cnn.
def create_model(resnet_cnn):
    # freeze the resnet weights
    for param in resnet_cnn.parameters():
        param.requires_grad = False

    # define input features
    in_features = resnet_cnn.fc.in_features
    feature_extractor = nn.Sequential(*list(resnet_cnn.children())[:-1])

    # initialize FC layer: maps 3 sets of image features to class logits
    WEMB_SIZE = 100
    fc = nn.Linear(in_features * 3 + 2 * WEMB_SIZE, 3)
    init_fc(fc)

    # define layers
    module_pool = nn.ModuleDict(
        {
            "feat_extractor": feature_extractor,
            "prediction_head": fc,
            "feat_concat": FlatConcat(),
            "word_emb": WordEmb(),
        }
    )

    # define task flow through modules
    op_sequence = get_op_sequence()
    pred_cls_task = Task(
        name="visual_relation_task",
        module_pool=module_pool,
        op_sequence=op_sequence,
        scorer=Scorer(metrics=["f1_micro"]),
    )
    return MultitaskClassifier([pred_cls_task])

1. Load the Dataset

Download the VRD dataset and filter the images to those containing at least one action predicate, since these are harder to classify than geometric relationships such as above or next to.

  • The example loads the train, valid, and test sets as DataFrames with the following columns:
    • label: the relationship between the objects. 0: RIDE, 1: CARRY, 2: OTHER action predicate
    • object_bbox: [ymin, ymax, xmin, xmax]
    • object_category
    • source_img
    • subject_bbox: [ymin, ymax, xmin, xmax]
    • subject_category
  • The sampled version of the dataset uses the same 26 examples for the train, dev, and test sets. This setup is meant to quickly demonstrate how Snorkel handles the task, not to demonstrate performance. (A small sketch of the bbox convention follows this list.)
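
The following is a minimal sketch (toy values, not tutorial data) of how the [ymin, ymax, xmin, xmax] convention above plugs into the union and crop_img_arr helpers defined in model.py:

import numpy as np
from model import union, crop_img_arr

subject_bbox = [50, 200, 30, 120]    # [ymin, ymax, xmin, xmax]
object_bbox = [120, 260, 100, 220]

img = np.zeros((300, 300, 3), dtype=np.uint8)   # placeholder image array
print(union(subject_bbox, object_bbox))         # [50, 260, 30, 220]
print(crop_img_arr(img, subject_bbox).shape)    # (150, 90, 3)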
import os
from utils import load_vrd_data

# setting sample=False will take ~3 hours to run (downloads full VRD dataset)
sample = True
is_test = os.environ.get("TRAVIS") == "true" or os.environ.get("IS_TEST") == "true"
df_train, df_valid, df_test = load_vrd_data(sample, is_test)

print("Train Relationships: ", len(df_train))
print("Dev Relationships: ", len(df_valid))
print("Test Relationships: ", len(df_test))
df_train.head()

Note that the training DataFrame has a label column that is all -1. This indicates that labels are missing for that particular split. In this tutorial, we will assign probabilistic labels to the training set by writing labeling functions over attributes of the subjects and objects!
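
As a quick sanity check (an aside, not part of the original notebook), you can confirm the placeholder labels before moving on:

# the train split should carry only the -1 placeholder label,
# while the valid split carries real labels in {0, 1, 2}
assert (df_train["label"] == -1).all()
print(df_valid["label"].value_counts())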

2. Writing Labeling Functions (LFs)

We now write labeling functions to detect what relationship exists between a pair of bounding boxes. To do so, we can encode various intuitions into the labeling functions:

  • Categorical intuition: knowledge about the subject and object categories usually involved in these relationships (e.g., person is usually the subject of the verbs ride and carry)
  • Spatial intuition: knowledge about the relative positions of the subject and object (e.g., the subject is usually above the object for the predicate ride)
RIDE = 0
CARRY = 1
OTHER = 2
ABSTAIN = -1

We start with labeling functions that encode categorical intuition: we use knowledge about common subject-object category pairs for RIDE and CARRY, as well as knowledge about which subjects or objects are unlikely to be involved in either relationship.

from snorkel.labeling import labeling_function

# Category-based LFs
@labeling_function()
def lf_ride_object(x):
    if x.subject_category == "person":
        if x.object_category in [
            "bike",
            "snowboard",
            "motorcycle",
            "horse",
            "bus",
            "truck",
            "elephant",
        ]:
            return RIDE
    return ABSTAIN


@labeling_function()
def lf_carry_object(x):
    if x.subject_category == "person":
        if x.object_category in ["bag", "surfboard", "skis"]:
            return CARRY
    return ABSTAIN


@labeling_function()
def lf_carry_subject(x):
    if x.object_category == "person":
        if x.subject_category in ["chair", "bike", "snowboard", "motorcycle", "horse"]:
            return CARRY
    return ABSTAIN


@labeling_function()
def lf_not_person(x):
    if x.subject_category != "person":
        return OTHER
    return ABSTAIN

Now we encode spatial intuition, which includes measuring the distance between the bounding boxes and comparing their relative areas.

YMIN = 0
YMAX = 1
XMIN = 2
XMAX = 3
import numpy as np

# Distance-based LFs
@labeling_function()
def lf_ydist(x):
    if x.subject_bbox[XMAX] < x.object_bbox[XMAX]:
        return OTHER
    return ABSTAIN


@labeling_function()
def lf_dist(x):
    if np.linalg.norm(np.array(x.subject_bbox) - np.array(x.object_bbox)) <= 1000:
        return OTHER
    return ABSTAIN


def area(bbox):
    return (bbox[YMAX] - bbox[YMIN]) * (bbox[XMAX] - bbox[XMIN])


# Size-based LF
@labeling_function()
def lf_area(x):
    if area(x.subject_bbox) / area(x.object_bbox) <= 0.5:
        return OTHER
    return ABSTAIN

The labeling functions have varying empirical accuracies and coverages. Because of the class imbalance in our chosen relationships, the labeling functions that label the OTHER class have higher coverage than those that label RIDE or CARRY. This also reflects the distribution of classes in the dataset.

from snorkel.labeling import PandasLFApplier

lfs = [
    lf_ride_object,
    lf_carry_object,
    lf_carry_subject,
    lf_not_person,
    lf_ydist,
    lf_dist,
    lf_area,
]

applier = PandasLFApplier(lfs)
L_train = applier.apply(df_train)
L_valid = applier.apply(df_valid)
from snorkel.labeling import LFAnalysis

Y_valid = df_valid.label.values
LFAnalysis(L_valid, lfs).lf_summary(Y_valid)

3. Train the Label Model

Train a LabelModel to assign training labels to the unlabeled training set.

from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=3, verbose=True)
label_model.fit(
    L_train,
    seed=123,
    lr=0.01,
    log_freq=10,
    n_epochs=100
)
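
If you want the probabilistic training labels mentioned earlier rather than hard class predictions, the LabelModel also exposes predict_proba (a brief aside; the hard labels from predict() are what the classifier section below uses):

# per-class probabilities for each training example, shape (n_train, 3)
probs_train = label_model.predict_proba(L_train)
print(probs_train.shape)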

We evaluate the model with the F1 score: F1 = 2 * (precision * recall) / (precision + recall).

label_model.score(L_valid, Y_valid, metrics=["f1_micro"])
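
For reference, a minimal sketch (toy arrays, assuming scikit-learn is installed) of the micro-averaged F1 that score() reports:

from sklearn.metrics import f1_score

y_true = [0, 1, 2, 2, 1, 0]
y_pred = [0, 1, 2, 1, 1, 0]
# for single-label multi-class problems, micro-averaged F1 equals accuracy
print(f1_score(y_true, y_pred, average="micro"))   # 0.8333...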

4. Train a Classifier

You can now use these training labels to train any standard discriminative model, such as an off-the-shelf ResNet, which should learn to generalize beyond the LFs we developed!

Create DataLoaders for Classifier

from snorkel.classification import DictDataLoader
from model import SceneGraphDataset, create_model

df_train["labels"] = label_model.predict(L_train)

if sample:
    TRAIN_DIR = "data/VRD/sg_dataset/samples"
else:
    TRAIN_DIR = "data/VRD/sg_dataset/sg_train_images"

dl_train = DictDataLoader(
    SceneGraphDataset("train_dataset", "train", TRAIN_DIR, df_train),
    batch_size=16,
    shuffle=True,
)

dl_valid = DictDataLoader(
    SceneGraphDataset("valid_dataset", "valid", TRAIN_DIR, df_valid),
    batch_size=16,
    shuffle=False,
)
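
To check that the loaders produce what the model expects (a hedged sketch; batch contents follow the SceneGraphDataset definition above), you can peek at one batch:

# each batch is an (X_dict, Y_dict) pair; the image crops arrive as
# (batch, 3, image_size, image_size) tensors after the transforms
X_batch, Y_batch = next(iter(dl_train))
print(X_batch["union_crop"].shape)
print(Y_batch["visual_relation_task"][:5])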

Define the Model Architecture

import torchvision.models as models

# initialize pretrained feature extractor
cnn = models.resnet18(pretrained=True)
model = create_model(cnn)

Train and Evaluate the Model

from snorkel.classification import Trainer

trainer = Trainer(
    n_epochs=1,  # increase for improved performance
    lr=1e-3,
    checkpointing=True,
    checkpointer_config={"checkpoint_dir": "checkpoint"},
)
trainer.fit(model, [dl_train])
model.score([dl_valid])
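
To keep the trained weights around for later reuse (a minimal sketch; since MultitaskClassifier is a torch.nn.Module, plain torch serialization works):

import torch

# persist the trained classifier's weights alongside the trainer checkpoints
torch.save(model.state_dict(), "checkpoint/visual_relation_model.pth")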
  • We have successfully trained a visual relationship detection model! Using categorical and spatial intuitions about how the objects in a visual relationship interact, we were able to assign high-quality training labels to object pairs in the VRD dataset in a multi-class classification setting.

  • For more information on how Snorkel can be used for visual relationship tasks, see the ICCV 2019 paper.