## @package crf # Module caffe2.python.crf import numpy as np from caffe2.python import brew, core, model_helper, recurrent """ Due to a limitation in ReccurentNetworkOp, this layer only supports batch_size=1 In order to support batch_size > 1, we will have to implement the CRFUnit and its gradient in C++ and handle the different batches there. """ class CRFWithLoss(object): def __init__(self, model, num_classes, transitions_blob=None): self.model = model self.num_classes = num_classes self.num_classes_padded = num_classes + 2 # After adding BOS and EOS if not transitions_blob: transitions_blob = self.model.param_init_net.UniformFill( [], [core.ScopedBlobReference("crf_transitions")], shape=[self.num_classes_padded, self.num_classes_padded], min=-1.0, max=1.0, ) self.transitions = transitions_blob self.model.params.append(self.transitions) def crf_loss(self, predictions, labels, seq_lengths=None): # Since the transitions matrix is a shared parameter, need to # take a snapshot of it at the beginning since it can be updated # in between the operators that uses it when doing parallel updates transitions_snapshot = self.model.net.Copy( self.transitions, core.ScopedBlobReference("transitions_snapshot") ) # Compute best path unary score from the logits path_unary_score = self._gather_entries_sum( predictions, labels, self.num_classes ) # Append BOS and EOS entries to the predictions and labels predictions = CRFWithLoss.pad_predictions( predictions, self.model.param_init_net, self.model.net, self.num_classes ) labels = CRFWithLoss.pad_labels( labels, self.model.param_init_net, self.model.net, self.num_classes ) # Compute best path binary scores from the transitions matrix path_binary_score = self._path_binary_scores( labels, transitions_snapshot, seq_lengths ) path_total_score = self.model.net.Add( [path_binary_score, path_unary_score], core.ScopedBlobReference("path_total"), ) # Compute all paths score zero_index = self.model.param_init_net.ConstantFill([], shape=[1], value=0) initial_state = self.model.net.Gather( [predictions, zero_index], core.ScopedBlobReference("rnn_initial"), dense_gradient=True, ) input_data, _ = self.model.net.RemovePadding( [predictions], padding_width=1, end_padding_width=0, outputs=2 ) input_data = self.model.net.ExpandDims( [input_data], core.ScopedBlobReference("rnn_input_data"), dims=[1] ) # Due to a bug in RecurrentNetworkGradientOp, we need to copy the # transitions blob before sending it to the recurrent network transitions_copy = self.model.net.Copy( transitions_snapshot, core.ScopedBlobReference("transitions_copy") ) all_paths_scores = self._crf_forward( input_data, initial_state, transitions_copy ) loss = self.model.net.Sub( [all_paths_scores, path_total_score], core.ScopedBlobReference("crf_loss") ) return loss def _path_binary_scores(self, labels, transitions, seq_lengths=None): column_ids, _ = self.model.net.RemovePadding( [labels], outputs=2, padding_width=1, end_padding_width=0 ) row_ids, _ = self.model.net.RemovePadding( [labels], outputs=2, padding_width=0, end_padding_width=1 ) # Since there is no multi-dimensional gather, I flatten the matrix to # a 1-d vector and transform the ids to (row_ids * num_columns + # column_ids) and do gather in 1-d num_columns_blob = self.model.net.ConstantFill( [row_ids], value=self.num_classes_padded ) flattened_ids = self.model.net.Mul([row_ids, num_columns_blob]) flattened_ids = self.model.net.Add([flattened_ids, column_ids]) flattened_transitions = self.model.net.FlattenToVec([transitions]) entries = self.model.net.Gather( [flattened_transitions, flattened_ids], dense_gradient=True ) return self.model.ReduceFrontSum(entries) def _gather_entries_sum(self, in_data, indices, index_size): indices = self.model.net.Cast([indices], to="int64") index_size_blob = self.model.param_init_net.ConstantFill( [], shape=[1], value=index_size ) query_one_hot = self.model.net.OneHot([indices, index_size_blob]) flattend_query = self.model.net.FlattenToVec(query_one_hot) flattend_data = self.model.net.FlattenToVec(in_data) query_scores = self.model.net.DotProduct([flattend_query, flattend_data]) final_sum = self.model.net.ReduceFrontSum([query_scores]) return final_sum def _crf_forward( self, input_blob, initial_state, transitions_copy, seq_lengths=None ): # Build the RNN net and get the last timestep output out_last = self.build_crf_net(input_blob, initial_state, transitions_copy) out_last, _ = self.model.net.Reshape( [out_last], outputs=2, shape=(self.num_classes_padded,) ) zero_segment_id = self.model.param_init_net.ConstantFill( [], value=0, shape=[self.num_classes_padded], dtype=core.DataType.INT32 ) # Compute the accumulated total score of all the paths accum_score = self.model.net.SortedSegmentRangeLogSumExp( [out_last, zero_segment_id] ) accum_score, _ = self.model.net.Reshape(accum_score, outputs=2, shape=()) return accum_score def build_crf_net(self, input_blob, initial_state, transitions): """ Adds the crf_net recurrent operator to the model. model: model_helper.ModelHelper object new operators would be added to input_blob: the input sequence in a format T x N x D where T is sequence size, N - batch size and D - input dimension ##Only supports batch-size 1## seq_lengths: blob containing sequence lengths (unused) """ scope = "crf_net" def s(name): "" # We have to manually scope due to our internal/external blob # relationships. return "{}/{}".format(str(scope), str(name)) step_model = model_helper.ModelHelper(name="crf_step", param_model=self.model) input_t, cell_t_prev, _ = step_model.net.AddExternalInputs( core.ScopedBlobReference("input_t"), core.ScopedBlobReference("cell_t_prev"), transitions, ) zero_segment_id = step_model.param_init_net.ConstantFill( [], [s("zero_segment_id")], value=0, shape=[self.num_classes_padded], dtype=core.DataType.INT32, ) # A hack to bypass model cloning for test step_model.param_init_net.AddExternalOutput(zero_segment_id) """ the CRF step """ # Do tile prev_transpose = brew.transpose( step_model, cell_t_prev, [s("prev_transpose")], axes=(0, 2, 1) ) prev_tiled = step_model.net.Tile( prev_transpose, [s("prev_tiled")], tiles=self.num_classes_padded, axis=2 ) input_t_tiled = step_model.net.Tile( input_t, [s("input_t_tiled")], tiles=self.num_classes_padded, axis=1 ) input_with_prev = step_model.net.Add( [prev_tiled, input_t_tiled], [s("input_with_prev")] ) all_with_transitions = step_model.net.Add( [input_with_prev, transitions], [s("prev_with_transitions")], broadcast=1, use_grad_hack=1, ) all_with_transitions_reshaped, _ = step_model.net.Reshape( all_with_transitions, [s("all_with_transitions_reshaped"), s("all_with_transitions_orig")], shape=(self.num_classes_padded, self.num_classes_padded), ) cell_t = step_model.net.SortedSegmentRangeLogSumExp( [all_with_transitions_reshaped, zero_segment_id], [s("cell_t")] ) step_model.net.AddExternalOutputs(cell_t) """ recurrent network """ cell_input_blob = initial_state out_all, out_last = recurrent.recurrent_net( net=self.model.net, cell_net=step_model.net, inputs=[(input_t, input_blob)], initial_cell_inputs=[(cell_t_prev, cell_input_blob)], links={cell_t_prev: cell_t}, scope=scope, outputs_with_grads=(1,), ) return out_last def update_predictions(self, classes): def crf_update_predictions_op(inputs, outputs): # This operator will compute the best path of classes by performing # Viterbi decoding and then updates the predictions to make the tag # On the best path has the highest score among the others predictions = inputs[0].data transitions = inputs[1].data predictions = inputs[0].data predictions_shape = inputs[0].shape outputs[0].reshape(predictions_shape) trellis = np.zeros(predictions_shape) backpointers = np.zeros(predictions_shape, dtype=np.int32) trellis[0] = predictions[0] for t in range(1, predictions_shape[0]): v = np.expand_dims(trellis[t - 1], 1) + transitions trellis[t] = predictions[t] + np.max(v, 0) backpointers[t] = np.argmax(v, 0) viterbi = [np.argmax(trellis[-1])] for bp in reversed(backpointers[1:]): viterbi.append(bp[viterbi[-1]]) viterbi.reverse() new_predictions = np.zeros(predictions_shape) old_bests = [] for i, w_predictions in enumerate(predictions): # Get the current tag with the maximum score new_predictions[i] = predictions[i] old_best = np.argmax(w_predictions) old_bests.append(old_best) # Swap the scores of the current best tag and the tag on the # Viterbi path w_predictions[viterbi[i]], w_predictions[old_best] = ( w_predictions[old_best], w_predictions[viterbi[i]], ) new_predictions[i] = w_predictions # Remove the BOS and EOS entries from the predictions matrix orig_predictions = new_predictions[1:-1, 0:-2] outputs[0].reshape(orig_predictions.shape) outputs[0].data[...] = orig_predictions padded_classes = CRFWithLoss.pad_predictions( classes, self.model.param_init_net, self.model.net, self.num_classes ) new_classes = self.model.net.Python(crf_update_predictions_op)( [padded_classes, self.transitions], core.ScopedBlobReference("post_crf_classes"), ) return new_classes @staticmethod def pad_labels(labels, init_net, net, num_classes): bos_i = num_classes eos_i = num_classes + 1 bos_i_b = init_net.ConstantFill([], shape=[1], value=bos_i) eos_i_b = init_net.ConstantFill([], shape=[1], value=eos_i) labels = net.Cast([labels], to="int64") padded_labels, _ = net.Concat([bos_i_b, labels, eos_i_b], axis=0, outputs=2) return padded_labels @staticmethod def pad_predictions(predictions, init_net, net, num_classes): # This function will introduce two labels for beginning of sequence # And end of sequence, it will make the necessary udpates to the # the predictions blob low_score = -1000.0 # An arbitray very low number b_scores = np.array([[low_score] * num_classes + [0, low_score]]).astype( np.float32 ) e_scores = np.array([[low_score] * num_classes + [low_score, 0]]).astype( np.float32 ) b_scores = init_net.GivenTensorFill( [], "b_scores", shape=[1, num_classes + 2], values=b_scores ) e_scores = init_net.GivenTensorFill( [], "e_scores", shape=[1, num_classes + 2], values=e_scores ) zero_index = net.ConstantFill([], shape=[1], value=0) length = net.Gather([net.Shape([predictions]), zero_index]) length = net.Cast(length, to="int32") t_range = net.LengthsRangeFill(length) padding = net.ConstantFill([t_range], value=low_score) padding = net.ExpandDims(padding, dims=[1]) padded_predictions, _ = net.Concat( [predictions, padding, padding], outputs=2, axis=1 ) padded_predictions_concat, _ = net.Concat( [b_scores, padded_predictions, e_scores], outputs=2, axis=0 ) return padded_predictions_concat