CRF head [WIP] #393

Open

wants to merge 8 commits into master
183 changes: 183 additions & 0 deletions src/nn/layers/crf.nim
@@ -0,0 +1,183 @@
# Copyright 2017 the Arraymancer contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import strformat
import options

import ../../tensor/tensor,
../../nn_primitives/nn_primitives,
../../nn/init,
../../autograd/autograd


type Idx* = SomeInteger

type CRFGate*[TT; Idx] {.final.} = ref object of Gate[TT]
## CRF (Linear) Gate for sequence prediction.
transitions: Variable[TT]
num_tags: Idx

# Special values for state transitions
bos_tag: Idx
eos_tag: Idx

dims: tuple[timesteps, batch_size, hidden_dim: Idx]


proc init_transitions_matrix*[T: SomeFloat](num_tags: Idx; range_val: T = T(
0.1)): Tensor[T] =
## Create a transitions matrix with values uniformly distributed within
## [-range_val, range_val]. The special transitions into the BOS tag and
## out of the EOS tag are set to an arbitrarily low value to forbid them.
##
## Input:
## The `num_tags` indicating how many real (non-special) tag values there are.
## The `range_val` giving the scale to initialize transition values.
##
## Returns
## The initialized transitions matrix of shape [num_tags + 2, num_tags + 2]

# TODO: In future, allow for rules prohibiting / mandating certain transitions.
let (bos_tag, eos_tag) = (num_tags, num_tags + 1)
result = xavier_uniform(num_tags + 2, num_tags + 2, T) * range_val

# Scale for a disallowed transition relative to the range value
const disallowed_transition_scale = 100_000

result[_, bos_tag] = T(-disallowed_transition_scale) * abs(range_val)
result[eos_tag, _] = T(-disallowed_transition_scale) * abs(range_val)
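
For illustration, a minimal usage sketch of the initializer above (the tag count here is a made-up example value):

# Sketch: transitions for 5 real tags -> matrix shape [7, 7].
# The BOS column (transitions into BOS) and the EOS row (transitions out of
# EOS) hold large negative values, so those moves are effectively forbidden.
let trans = init_transitions_matrix[float32](num_tags = 5)
doAssert trans.shape == [7, 7]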


proc crf_forward[TT, Idx](
result: var Variable[TT];
input: Variable[TT];
mask: Variable[TT];
transitions: Variable[TT];
tags: Tensor[Idx];
num_tags: int;
reduce = false
) =
## Compute the negative log likelihood for each input sequence.
## If `reduce` is true, the per-sequence losses are reduced to a single
## scalar over the batch (the flag is currently unused).
var gate: CRFGate[TT, Idx]
new gate

gate.transitions = transitions
gate.num_tags = num_tags

gate.bos_tag = Idx(num_tags)
gate.eos_tag = Idx(num_tags + 1)

let
timesteps = input.value.shape[0]
batch_size = input.value.shape[1]
hidden_dim = input.value.shape[2]

gate.dims = (timesteps: timesteps, batch_size: batch_size,
hidden_dim: hidden_dim)

crf_forward(
result.value,
input.value,
mask.value,
transitions.value,
tags,
gate.dims.timesteps, gate.dims.batch_size, gate.dims.hidden_dim,
gate.bos_tag, gate.eos_tag
)

proc crf_viterbi*[TT]() =
  ## TODO: Viterbi decoding for inference (stub).
  discard

proc crf*[TT](
input: Variable[TT];
mask: Variable[TT];
transitions: Variable[TT];
tags: Option[Tensor[Idx]];
num_tags: int;
reduce: bool = false
): Variable[TT] =
## Input:
## - An `input` Variable of shape [timesteps, batch_size, num_tags]
## - A `mask` Variable of shape [timesteps, batch_size] with
## `requires_grad` set to false.
## - A `transitions` matrix of size (num_tags + 2, num_tags + 2)
## The extra tags are for BOS / EOS tags.
## - A `tags` tensor of shape [timesteps, batch_size] - only needed
## during training; otherwise `none` can be passed.
##
## Returns:
## - The negative log likelihood as a Variable of shape [batch_size, ]
## - (planned) logits for tag prediction of shape
## [batch_size, sequence_length, num_tags], once inference is implemented
when compileOption("boundChecks"):
doAssert input.value.shape.len == 3, fmt"Expected input variable of rank 3" &
fmt", got shape of {input.value.shape}"
doAssert input.value.shape[2] == num_tags, fmt"Expected input variable to" &
fmt" emit {num_tags} tags, got {input.value.shape[2]}"
doAssert mask.value.shape[0..1] == input.value.shape[0..1],
fmt"Mask and input shapes do not match: " &
fmt"got {mask.value.shape[0..1]} and {input.value.shape[0..1]}"
doAssert transitions.value.shape == [num_tags + 2, num_tags + 2],
"Expected transitions matrix shape to " &
fmt"match ({num_tags+2}, {num_tags+2}), got {transitions.value.shape}"

assert mask.requires_grad == false, "Mask should not need a gradient"

new result
result.context = input.context

let doing_training = input.is_grad_needed() or transitions.is_grad_needed()

if doing_training:
if tags.isNone:
raise newException(ValueError, "Tags must be provided (some(...)) when training")
else:
let tags_tensor = tags.get()
result.crf_forward(input, mask, transitions, tags_tensor, num_tags)
else:
# TODO: Inference time
discard


when isMainModule:
import unittest

let ctx = newContext Tensor[float32]

let (timesteps, batch_size, num_tags) = (8, 30, 10)

let
input = ctx.variable(
randomTensor[float32](timesteps, batch_size, num_tags, max = 1.1),
requires_grad = true
)

mask = ctx.variable(ones[float32](timesteps, batch_size))

transitions = ctx.variable(
(randomTensor(num_tags + 2, num_tags + 2, max = 2.0'f32) .- 1.0'f32),
requires_grad = false
)

suite "Basic CRF tests":

test "When pass in some(Tensor[int]) can call CRF":
var tags = option(randomTensor(timesteps, batch_size, max = num_tags - 1))
let output = crf(input, mask, transitions, tags, num_tags)
assert output.value.shape == [batch_size, ],
fmt"Got output shape {output.value.shape}"

test "When pass in none(Tensor[int]) get ValueError":
expect ValueError:
let output2 = crf(input, mask, transitions, none(Tensor[int]), num_tags)
4 changes: 2 additions & 2 deletions src/nn/nn.nim
@@ -13,14 +13,14 @@
# limitations under the License.

import ./activation/[sigmoid, relu, tanh],
./layers/[linear, conv2D, maxpool2D, gru, embedding],
./layers/[linear, conv2D, maxpool2D, gru, embedding, crf],
./loss/cross_entropy_losses,
./loss/mean_square_error_loss,
./optimizers/optimizers,
./init

export sigmoid, relu, tanh,
linear, conv2D, maxpool2d, gru, embedding,
linear, conv2D, maxpool2d, gru, embedding, crf,
cross_entropy_losses, mean_square_error_loss,
optimizers,
init
6 changes: 4 additions & 2 deletions src/nn_primitives/nn_primitives.nim
@@ -21,7 +21,8 @@ import ./nnp_activation,
./nnp_softmax,
./nnp_numerical_gradient,
./nnp_gru,
./nnp_embedding.nim
./nnp_embedding,
./nnp_crf

export nnp_activation,
nnp_convolution,
@@ -32,7 +33,8 @@ export nnp_activation,
nnp_softmax,
nnp_numerical_gradient,
nnp_gru,
nnp_embedding
nnp_embedding,
nnp_crf

import private/p_nnp_types
export Size2D
127 changes: 127 additions & 0 deletions src/nn_primitives/nnp_crf.nim
@@ -0,0 +1,127 @@
# Copyright 2017 the Arraymancer contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import strformat

import ../tensor/tensor,
math

# Needed for the partition function
from private/p_logsumexp import logsumexp


type Idx = SomeInteger


proc compute_scores[T](
result: var Tensor[T], # (B, ) - not nil
input: Tensor[T], # (T, B, num_tags)
mask: Tensor[T], # (T, B)
transitions: Tensor[T], # (num_tags + 2, num_tags + 2)
tags: Tensor[Idx], # (T, B)
timesteps, batch_size, hidden_dim: int,
bos_tag, eos_tag: Idx
) =
## Computes the un-normalized log probabilities (the sum of emission and
## transition scores at each timestep) for each tagged sequence.
##
## Output:
## - `result` holds the un-normalized sequence scores, shape [batch_size, ]
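##
## The score accumulated for a tag sequence y given inputs x is
##   score(x, y) = trans[BOS, y_0] + emit[0, y_0]
##               + sum over t of mask-weighted (trans[y_(t-1), y_t] + emit[t, y_t])
##               + trans[y_last, EOS]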

# DEBUG
echo (timesteps, batch_size, hidden_dim)
echo input.shape

# Transitions from bos_tag -> tag at time = 0 for all batches
var transition_scores = index_select(transitions[bos_tag, _], axis = 1,
indices = tags[0, _].squeeze()).squeeze()

when compileOption("boundChecks"):
doAssert result.shape == [batch_size], "Result should be of shape" &
fmt" {batch_size} but got {result.shape}"
doAssert transition_scores.shape == [batch_size], "Transition scores" &
fmt" should be of shape {batch_size} but got {transition_scores.shape}"

# Emission scores for tag at t = 0 for all in batch
# Unoptimized - simple loop
var emission_scores = newTensorUninit[input.T](batch_size)

for i in 0 ..< batch_size:
emission_scores[i] = input[0, i, tags[0, i]]

when compileOption("boundChecks"):
doAssert emission_scores.shape == [batch_size], "Emission scores should" &
fmt" be of shape {batch_size} but got {emission_scores.shape}"

emission_scores .*= mask[0, _].squeeze()

result += transition_scores + emission_scores

# TODO: Optimize?
for i in 1 ..< timesteps - 1:
let
old_tags = tags[i - 1, _].squeeze(1)
new_tags = tags[i, _].squeeze(1)

old_mask = mask[i, _].squeeze()
new_mask = mask[i + 1, _].squeeze()

# New emission scores are the emission at time i for batch j to tag [i, j]
for j in 0 ..< batch_size:
emission_scores[j] = input[i, j, tags[i, j]]

# New transition scores
# This applies the transition from old -> new tag across the batch
# Unoptimized version:
# for j in 0 ..< batch_size:
# transition_scores[j] = transitions[old_tags[j], new_tags[j]]
transition_scores.apply3_inline(old_tags, new_tags):
transitions[y, z]

result += (transition_scores .* new_mask) + (emission_scores .* old_mask)

# TODO: Make sure that last transition handled correctly

# A timestep is assumed masked when mask == 0, so the index of the last
# real timestep for each batch element is:
let last_time_inds = (mask.sum(axis=0).squeeze() .- 1).astype(int)
var last_tags = newTensorUninit[tags.T](batch_size)

for i in 0 ..< batch_size:
last_tags[i] = tags[last_time_inds[i], i]

# Set transition scores to last_real_tag -> EOS_TAG across batch
transitions[_, eos_tag].squeeze().index_select(axis=0, indices=last_tags, result=transition_scores)

result += transition_scores

proc compute_log_partition_function[T](): Tensor[T] =
## Compute the partition function log Z(x) using the forward algorithm,
## avoiding the explicit calculation of probabilities for every possible
## sequence configuration.
discard
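
As a reference for eventually filling in this stub: a minimal sketch of the forward algorithm under the same (timesteps, batch_size, num_tags) layout as `compute_scores`. The proc name and loop structure are illustrative only, not part of this diff, and it assumes the `logsumexp` helper imported above:

proc log_partition_sketch[T: SomeFloat](
    input: Tensor[T],        # (timesteps, batch_size, num_tags)
    mask: Tensor[T],         # (timesteps, batch_size)
    transitions: Tensor[T],  # (num_tags + 2, num_tags + 2)
    timesteps, batch_size, num_tags: int,
    bos_tag, eos_tag: int
): Tensor[T] =
  # alpha[b, j] = log-sum of the scores of all prefixes ending in tag j
  var alpha = newTensorUninit[T](batch_size, num_tags)
  for b in 0 ..< batch_size:
    for j in 0 ..< num_tags:
      alpha[b, j] = transitions[bos_tag, j] + input[0, b, j]

  var scores = newTensorUninit[T](num_tags)
  for t in 1 ..< timesteps:
    var next_alpha = alpha.clone()
    for b in 0 ..< batch_size:
      if mask[t, b] != 0:  # padded timesteps keep their previous alpha
        for j in 0 ..< num_tags:
          for i in 0 ..< num_tags:
            scores[i] = alpha[b, i] + transitions[i, j] + input[t, b, j]
          next_alpha[b, j] = scores.logsumexp
    alpha = next_alpha

  # Close each sequence with the transition into EOS, then reduce over tags
  result = newTensorUninit[T](batch_size)
  for b in 0 ..< batch_size:
    for i in 0 ..< num_tags:
      scores[i] = alpha[b, i] + transitions[i, eos_tag]
    result[b] = scores.logsumexp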

proc crf_forward*[T: SomeFloat](
result: var Tensor[T],
input: Tensor[T],
mask: Tensor[T],
transitions: Tensor[T],
tags: Tensor[Idx],
timesteps, batch_size, hidden_dim: int,
bos_tag, eos_tag: Idx
) =
## Computes the log likelihood of the tag sequence given the input
## (emission scores) and the transitions matrix.
## The loss should be the *negative* log likelihood.
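## Note: the full negative log likelihood is log Z(x) - score(x, y); the
## partition term (compute_log_partition_function above) is still a TODO,
## so only the score term is accumulated here.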
result = zeros[T](batch_size)
result.compute_scores(input, mask, transitions, tags, timesteps, batch_size,
hidden_dim, bos_tag, eos_tag)
20 changes: 20 additions & 0 deletions src/tensor/shapeshifting.nim
@@ -318,3 +318,23 @@ func index_select*[T; Idx: byte or char or SomeNumber](t: Tensor[T], axis: int,
var r_slice = result.atAxisIndex(axis, i)
var t_slice = t.atAxisIndex(axis, int(index))
r_slice.copyFrom(t_slice)

proc index_select*[T; Idx: byte or char or SomeNumber](t: Tensor[T], axis: int, indices: Tensor[Idx], result: var Tensor[T]) =
## Same as the `index_select` function, but use a preallocated tensor for
## output.
doAssert indices.shape.len == 1

var select_shape = t.shape
select_shape[axis] = indices.shape[0]

if select_shape != result.shape:
# FIXME: find a better way of resizing the result when necessary
if select_shape.product() == result.size():
result = result.reshape(select_shape)
else:
result = newTensorUninit[T](select_shape)

for i, index in enumerate(indices):
var r_slice = result.atAxisIndex(axis, i)
var t_slice = t.atAxisIndex(axis, int(index))
r_slice.copyFrom(t_slice)
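
For illustration, a hypothetical snippet exercising the preallocated overload (names and values are made up for the example):

import arraymancer

let t = [[1, 2], [3, 4], [5, 6]].toTensor()  # shape [3, 2]
let order = [2, 0].toTensor()                # gather rows 2 and 0
var buffer = newTensorUninit[int](2, 2)      # preallocated destination
t.index_select(axis = 0, indices = order, result = buffer)
doAssert buffer == [[5, 6], [1, 2]].toTensor()
# A later call with a matching output shape reuses `buffer` instead of
# allocating a fresh tensor on every gather.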