I'm trying to implement a Siamese network in Caffe that is composed of two ImageNet networks that don't share weights. Basically, I want to give each network an image and, at the end, find the distance between them as a measure of similarity. Below is my prototxt. My main question is: what should I set "num_output" to? I have only 2 classes for training: 0 if the images are not alike, and 1 if they are similar.
name: "Siamese_ImageNet"
layers {
name: "data"
type: IMAGE_DATA
top: "data"
top: "label"
image_data_param {
source: "train1.txt"
batch_size: 32
new_height: 256
new_width: 256
}
include: { phase: TRAIN }
}
layers {
name: "data"
type: IMAGE_DATA
top: "data"
top: "label"
image_data_param {
source: "test1.txt"
batch_size: 32
new_height: 256
new_width: 256
}
include: { phase: TEST }
}
layers {
name: "data_p"
type: IMAGE_DATA
top: "data_p"
top: "label_p"
image_data_param {
source: "train2.txt"
batch_size: 32
new_height: 256
new_width: 256
}
include: { phase: TRAIN }
}
layers {
name: "data_p"
type: IMAGE_DATA
top: "data_p"
top: "label_p"
image_data_param {
source: "test2.txt"
batch_size: 32
new_height: 256
new_width: 256
}
include: { phase: TEST }
}
layers {
name: "conv1"
type: CONVOLUTION
bottom: "data"
top: "conv1"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 96
kernel_size: 11
stride: 4
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layers {
name: "relu1"
type: RELU
bottom: "conv1"
top: "conv1"
}
layers {
name: "pool1"
type: POOLING
bottom: "conv1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layers {
name: "norm1"
type: LRN
bottom: "pool1"
top: "norm1"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layers {
name: "conv2"
type: CONVOLUTION
bottom: "norm1"
top: "conv2"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 256
pad: 2
kernel_size: 5
group: 2
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu2"
type: RELU
bottom: "conv2"
top: "conv2"
}
layers {
name: "pool2"
type: POOLING
bottom: "conv2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layers {
name: "norm2"
type: LRN
bottom: "pool2"
top: "norm2"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layers {
name: "conv3"
type: CONVOLUTION
bottom: "norm2"
top: "conv3"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layers {
name: "relu3"
type: RELU
bottom: "conv3"
top: "conv3"
}
layers {
name: "conv4"
type: CONVOLUTION
bottom: "conv3"
top: "conv4"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
group: 2
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu4"
type: RELU
bottom: "conv4"
top: "conv4"
}
layers {
name: "conv5"
type: CONVOLUTION
bottom: "conv4"
top: "conv5"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
group: 2
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu5"
type: RELU
bottom: "conv5"
top: "conv5"
}
layers {
name: "pool5"
type: POOLING
bottom: "conv5"
top: "pool5"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layers {
name: "fc6"
type: INNER_PRODUCT
bottom: "pool5"
top: "fc6"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 4096
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu6"
type: RELU
bottom: "fc6"
top: "fc6"
}
layers {
name: "drop6"
type: DROPOUT
bottom: "fc6"
top: "fc6"
dropout_param {
dropout_ratio: 0.5
}
}
layers {
name: "fc7"
type: INNER_PRODUCT
bottom: "fc6"
top: "fc7"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 2
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu7"
type: RELU
bottom: "fc7"
top: "fc7"
}
layers {
name: "drop7"
type: DROPOUT
bottom: "fc7"
top: "fc7"
dropout_param {
dropout_ratio: 0.5
}
}
layers {
name: "conv1_p"
type: CONVOLUTION
bottom: "data_p"
top: "conv1_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 96
kernel_size: 11
stride: 4
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layers {
name: "relu1_p"
type: RELU
bottom: "conv1_p"
top: "conv1_p"
}
layers {
name: "pool1_p"
type: POOLING
bottom: "conv1_p"
top: "pool1_p"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layers {
name: "norm1_p"
type: LRN
bottom: "pool1_p"
top: "norm1_p"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layers {
name: "conv2_p"
type: CONVOLUTION
bottom: "norm1_p"
top: "conv2_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 256
pad: 2
kernel_size: 5
group: 2
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu2_p"
type: RELU
bottom: "conv2_p"
top: "conv2_p"
}
layers {
name: "pool2_p"
type: POOLING
bottom: "conv2_p"
top: "pool2_p"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layers {
name: "norm2_p"
type: LRN
bottom: "pool2_p"
top: "norm2_p"
lrn_param {
local_size: 5
alpha: 0.0001
beta: 0.75
}
}
layers {
name: "conv3_p"
type: CONVOLUTION
bottom: "norm2_p"
top: "conv3_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 0
}
}
}
layers {
name: "relu3_p"
type: RELU
bottom: "conv3_p"
top: "conv3_p"
}
layers {
name: "conv4_p"
type: CONVOLUTION
bottom: "conv3_p"
top: "conv4_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 384
pad: 1
kernel_size: 3
group: 2
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu4_p"
type: RELU
bottom: "conv4_p"
top: "conv4_p"
}
layers {
name: "conv5_p"
type: CONVOLUTION
bottom: "conv4_p"
top: "conv5_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
convolution_param {
num_output: 256
pad: 1
kernel_size: 3
group: 2
weight_filler {
type: "gaussian"
std: 0.01
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu5_p"
type: RELU
bottom: "conv5_p"
top: "conv5_p"
}
layers {
name: "pool5_p"
type: POOLING
bottom: "conv5_p"
top: "pool5_p"
pooling_param {
pool: MAX
kernel_size: 3
stride: 2
}
}
layers {
name: "fc6_p"
type: INNER_PRODUCT
bottom: "pool5_p"
top: "fc6_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 4096
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu6_p"
type: RELU
bottom: "fc6_p"
top: "fc6_p"
}
layers {
name: "drop6_p"
type: DROPOUT
bottom: "fc6_p"
top: "fc6_p"
dropout_param {
dropout_ratio: 0.5
}
}
layers {
name: "fc7_p"
type: INNER_PRODUCT
bottom: "fc6_p"
top: "fc7_p"
blobs_lr: 1
blobs_lr: 2
weight_decay: 1
weight_decay: 0
inner_product_param {
num_output: 2
weight_filler {
type: "gaussian"
std: 0.005
}
bias_filler {
type: "constant"
value: 1
}
}
}
layers {
name: "relu7_p"
type: RELU
bottom: "fc7_p"
top: "fc7_p"
}
layers {
name: "drop7_p"
type: DROPOUT
bottom: "fc7_p"
top: "fc7_p"
dropout_param {
dropout_ratio: 0.5
}
}
layers {
name: "loss"
type: CONTRASTIVE_LOSS
contrastive_loss_param {
margin: 1.0
}
bottom: "fc7"
bottom: "fc7_p"
bottom: "label"
top: "loss"
}
My training file structure: 0 is dissimilar, 1 is similar
train1.txt:
/aer/img1_1.jpg 0
/aer/img1_2.jpg 1
/aer/img1_3.jpg 1
train2.txt:
/tpd/img2_1.jpg 0
/tpd/img2_2.jpg 1
/tpd/img2_3.jpg 1
What should I set my "num_output" to?
Before deciding what to set num_output to, let's look at what it means. You can view the two sides of the Siamese network, data -> fc7 and data_p -> fc7_p, as two feature extractors. Each one extracts a feature vector, e.g. fc7 and fc7_p, from the images in the corresponding data layer. So num_output defines the dimension of the extracted feature vector.
During training, the ContrastiveLoss layer tries to minimize the distance between the two extracted feature vectors when the images they represent are similar (label == 1) and to push the distance beyond the margin when they are dissimilar (label == 0). In other words, the smaller the distance between the feature vectors, the more similar the images are.
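For reference, the default (non-legacy) form of the loss that Caffe's ContrastiveLoss layer computes is essentially the margin-based loss of Hadsell et al., where y_n is 1 for a similar pair, d_n is the Euclidean distance between the two feature vectors, and margin is the value set in contrastive_loss_param (1.0 in the prototxt above):

L = \frac{1}{2N} \sum_{n=1}^{N} \left[ y_n\, d_n^2 + (1 - y_n)\, \max(\text{margin} - d_n,\, 0)^2 \right], \qquad d_n = \lVert \mathrm{fc7}_n - \mathrm{fc7\_p}_n \rVert_2

Dissimilar pairs therefore only contribute to the loss while their distance is still inside the margin.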
So what is the optimal dimension of the feature vector to best capture the information indicating similarity? In other words, what should you set num_output to? There may not be an exact value; it depends on the encoding quality of the feature extractor (you may view the feature vector as a code for the image) and on how hard it is to recognize the similarity of the images. Basically, if the network (feature extractor) is deep and the similarity is not too hard to recognize, you can choose a relatively small num_output, e.g. 200, because a larger network may encode the feature well and make it more discriminative. If not, you can try a larger value, e.g. 500 or 1000, or try a more complex network.
If you want to try a MultinomialLogisticLoss instead of the ContrastiveLoss layer, you should first fuse the two feature vectors fc7 and fc7_p into one using a layer like CONCAT and then feed the result into a SOFTMAX_LOSS layer, like this:
...#original layers
layers {
name: "concat"
type: CONCAT
bottom: "fc7"
bottom: "fc7_p"
top: "fc_concat" # concatenate fc7 and fc7_p along channel axis
}
layer {
name: "fc_cls"
type: INNER_PRODUCT
bottom: "fc_concat"
top: "fc_cls"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 2 # a binary classification problem in this case
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "accuracy"
type: ACCURACY
bottom: "fc_cls"
bottom: "label"
top: "accuracy"
include {
phase: TEST
}
}
layer {
name: "loss"
type: SOFTMAX_LOSS
bottom: "fc_cls"
bottom: "label"
top: "loss"
}
Which is the better method to implement for comparing similarity and using at deploy time, Contrastive Loss or Softmax Loss?
Softmax Loss is simple and easy to deploy, but it can only give you a binary prediction, i.e. similar or dissimilar. The probability distribution over the two classes (similar, dissimilar) that it produces is often too peaked (non-uniform), e.g. [0.9*, 0.0*], [0.0*, 0.9*], ..., which in many cases will not reflect the true degree of similarity of the inputs well.
With Contrastive Loss you get a discriminative feature vector for each image. You can use the vectors to compute a probability of similarity, as the CVPR 2005 paper Learning a Similarity Metric Discriminatively, with Application to Face Verification does in Section 4.1 (the key point is to compute a multivariate normal density using the feature vectors generated from images belonging to the same subject). You can also put a threshold on the distance to control the false positive and false negative rates of the model, and plot an ROC curve to better evaluate it.
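As a concrete illustration of the Contrastive Loss deployment path, here is a minimal pycaffe sketch. It assumes you write a deploy prototxt that keeps both branches up to fc7/fc7_p and replaces the IMAGE_DATA layers with 1x3x256x256 input blobs named data and data_p; the file names, the 0.5 threshold, and the stripped-down preprocessing (no mean subtraction or channel reordering) are placeholders, not values from the question.

import numpy as np
import caffe

# Load the trained Siamese net in test mode (hypothetical file names).
net = caffe.Net('siamese_deploy.prototxt', 'siamese.caffemodel', caffe.TEST)

def preprocess(path):
    # Match the 256x256 resize done by the IMAGE_DATA layers; mean subtraction
    # and RGB->BGR reordering are omitted here for brevity.
    img = caffe.io.load_image(path)                 # HxWxC float image in [0, 1]
    img = caffe.io.resize_image(img, (256, 256))
    return img.transpose(2, 0, 1)[np.newaxis, ...]  # reshape to 1xCxHxW

net.blobs['data'].data[...] = preprocess('/aer/img1_1.jpg')
net.blobs['data_p'].data[...] = preprocess('/tpd/img2_1.jpg')
net.forward()

f1 = net.blobs['fc7'].data[0]
f2 = net.blobs['fc7_p'].data[0]
dist = np.linalg.norm(f1 - f2)   # Euclidean distance, the same metric ContrastiveLoss trains on
print('similar' if dist < 0.5 else 'dissimilar', dist)

The threshold should be chosen on a validation set, e.g. by sweeping it to trade off false positives against false negatives as described above.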
By the way, to explore more CNN architectures for predicting similarity, you can refer to the CVPR 2015 paper Learning to Compare Image Patches via Convolutional Neural Networks.
Just to correct Dale's great answer above for Caffe's uber-sensitive syntax, for noobs that get stuck like myself, here are a few corrections (layers to layer, quotes around types, removal of comments, and valid capitalization):
layer {
name: "concat"
type: "Concat"
bottom: "fc7"
bottom: "fc7_p"
top: "fc_concat"
}
layer {
name: "fc_cls"
type: "InnerProduct"
bottom: "fc_concat"
top: "fc_cls"
param {
lr_mult: 1
}
param {
lr_mult: 2
}
inner_product_param {
num_output: 2
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
layer {
name: "accuracy"
type: "Accuracy"
bottom: "fc_cls"
bottom: "label"
top: "accuracy"
include {
phase: TEST
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
bottom: "fc_cls"
bottom: "label"
top: "loss"
}