CRF head [WIP] #393

Open

wants to merge 8 commits into master
183 changes: 183 additions & 0 deletions src/nn/layers/crf.nim
@@ -0,0 +1,183 @@
# Copyright 2017 the Arraymancer contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import strformat
import options

import ../../tensor/tensor,
../../nn_primitives/nn_primitives,
../../nn/init,
../../autograd/autograd


type Idx* = SomeInteger

type CRFGate*[TT; Idx] {.final.} = ref object of Gate[TT]
## CRF (Linear) Gate for sequence prediction.
transitions: Variable[TT]
num_tags: Idx

# Special values for state transitions
bos_tag: Idx
eos_tag: Idx

dims: tuple[timesteps, batch_size, hidden_dim: Idx]


proc init_transitions_matrix*[T: SomeFloat](num_tags: Idx; range_val: T = T(
0.1)): Tensor[T] =
## Create a transitions matrix with values uniformly distributed within
## [-range_val, range_val]. The special transitions into the BOS tag and
## out of the EOS tag are set to an arbitrarily low value to forbid them.
##
## Input:
## The `num_tags` indicating how many real (non-special) tag values there are.
## The `range_val` giving the scale to initialize transition values.
##
## Returns
## The initialized transitions matrix of shape [num_tags + 2, num_tags + 2]

# TODO: In future, allow for rules prohibiting / mandating certain transitions.
let (bos_tag, eos_tag) = (num_tags, num_tags + 1)
result = xavier_uniform(num_tags + 2, num_tags + 2, T) * range_val

# Scale for a disallowed transition relative to the range value
const disallowed_transition_scale = 100_000

result[_, bos_tag] = T(-disallowed_transition_scale) * abs(range_val)
result[eos_tag, _] = T(-disallowed_transition_scale) * abs(range_val)
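
For illustration, a minimal usage sketch of the initializer above (the tag count here is a made-up example value):

# Sketch: transitions for 5 real tags -> matrix shape [7, 7].
# The BOS column (transitions into BOS) and the EOS row (transitions out of
# EOS) hold large negative values, so those moves are effectively forbidden.
let trans = init_transitions_matrix[float32](num_tags = 5)
doAssert trans.shape == [7, 7]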


proc crf_forward[TT, Idx](
result: var Variable[TT];
input: Variable[TT];
mask: Variable[TT];
transitions: Variable[TT];
tags: Tensor[Idx];
num_tags: int;
reduce = false
) =
## Compute the negative log likelihood for each input sequence.
## If `reduce` is true, the per-sequence losses are reduced to a single
## scalar over the batch (the flag is currently unused).
var gate: CRFGate[TT, Idx]
new gate

gate.transitions = transitions
gate.num_tags = num_tags

gate.bos_tag = Idx(num_tags)
gate.eos_tag = Idx(num_tags + 1)

let
timesteps = input.value.shape[0]
batch_size = input.value.shape[1]
hidden_dim = input.value.shape[2]

gate.dims = (timesteps: timesteps, batch_size: batch_size,
hidden_dim: hidden_dim)

crf_forward(
result.value,
input.value,
mask.value,
transitions.value,
tags,
gate.dims.timesteps, gate.dims.batch_size, gate.dims.hidden_dim,
gate.bos_tag, gate.eos_tag
)

proc crf_viterbi*[TT]() =
  ## TODO: Viterbi decoding for inference (stub).
  discard

proc crf*[TT](
input: Variable[TT];
mask: Variable[TT];
transitions: Variable[TT];
tags: Option[Tensor[Idx]];
num_tags: int;
reduce: bool = false
): Variable[TT] =
## Input:
## - An `input` Variable of shape [timesteps, batch_size, num_tags]
## - A `mask` Variable of shape [timesteps, batch_size] with
## `requires_grad` set to false.
## - A `transitions` matrix of size (num_tags + 2, num_tags + 2)
## The extra tags are for BOS / EOS tags.
## - A `tags` tensor of shape [timesteps, batch_size] - only needed
## during training; otherwise `none` can be passed.
##
## Returns:
## - The negative log likelihood as a Variable of shape [batch_size, ]
## - (planned) logits for tag prediction of shape
## [batch_size, sequence_length, num_tags], once inference is implemented
when compileOption("boundChecks"):
doAssert input.value.shape.len == 3, fmt"Expected input variable of rank 3" &
fmt", got shape of {input.value.shape}"
doAssert input.value.shape[2] == num_tags, fmt"Expected input variable to" &
fmt" emit {num_tags} tags, got {input.value.shape[2]}"
doAssert mask.value.shape[0..1] == input.value.shape[0..1],
fmt"Mask and input shapes do not match: " &
fmt"got {mask.value.shape[0..1]} and {input.value.shape[0..1]}"
doAssert transitions.value.shape == [num_tags + 2, num_tags + 2],
"Expected transitions matrix shape to " &
fmt"match ({num_tags+2}, {num_tags+2}), got {transitions.value.shape}"

assert mask.requires_grad == false, "Mask should not need a gradient"

new result
result.context = input.context

let doing_training = input.is_grad_needed() or transitions.is_grad_needed()

if doing_training:
if tags.isNone:
raise newException(ValueError, "Tags must be provided (some(...)) when training")
else:
let tags_tensor = tags.get()
result.crf_forward(input, mask, transitions, tags_tensor, num_tags)
else:
# TODO: Inference time
discard


when isMainModule:
import unittest

let ctx = newContext Tensor[float32]

let (timesteps, batch_size, num_tags) = (8, 30, 10)

let
input = ctx.variable(
randomTensor[float32](timesteps, batch_size, num_tags, max = 1.1),
requires_grad = true
)

mask = ctx.variable(ones[float32](timesteps, batch_size))

transitions = ctx.variable(
(randomTensor(num_tags + 2, num_tags + 2, max = 2.0'f32) .- 1.0'f32),
requires_grad = false
)

suite "Basic CRF tests":

test "When pass in some(Tensor[int]) can call CRF":
var tags = option(randomTensor(timesteps, batch_size, max = num_tags - 1))
let output = crf(input, mask, transitions, tags, num_tags)
assert output.value.shape == [batch_size, ],
fmt"Got output shape {output.value.shape}"

test "When pass in none(Tensor[int]) get ValueError":
expect ValueError:
let output2 = crf(input, mask, transitions, none(Tensor[int]), num_tags)
4 changes: 2 additions & 2 deletions src/nn/nn.nim
@@ -13,14 +13,14 @@
# limitations under the License.

import ./activation/[sigmoid, relu, tanh],
./layers/[linear, conv2D, maxpool2D, gru, embedding],
./layers/[linear, conv2D, maxpool2D, gru, embedding, crf],
./loss/cross_entropy_losses,
./loss/mean_square_error_loss,
./optimizers/optimizers,
./init

export sigmoid, relu, tanh,
linear, conv2D, maxpool2d, gru, embedding,
linear, conv2D, maxpool2d, gru, embedding, crf,
cross_entropy_losses, mean_square_error_loss,
optimizers,
init
6 changes: 4 additions & 2 deletions src/nn_primitives/nn_primitives.nim
@@ -21,7 +21,8 @@ import ./nnp_activation,
./nnp_softmax,
./nnp_numerical_gradient,
./nnp_gru,
./nnp_embedding.nim
./nnp_embedding,
./nnp_crf

export nnp_activation,
nnp_convolution,
@@ -32,7 +33,8 @@ export nnp_activation,
nnp_softmax,
nnp_numerical_gradient,
nnp_gru,
nnp_embedding
nnp_embedding,
nnp_crf

import private/p_nnp_types
export Size2D
127 changes: 127 additions & 0 deletions src/nn_primitives/nnp_crf.nim
@@ -0,0 +1,127 @@
# Copyright 2017 the Arraymancer contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import strformat

import ../tensor/tensor,
math

# Needed for the partition function
from private/p_logsumexp import logsumexp


type Idx = SomeInteger


proc compute_scores[T](
result: var Tensor[T], # (B, ) - not nil
input: Tensor[T], # (T, B, num_tags)
mask: Tensor[T], # (T, B)
transitions: Tensor[T], # (num_tags + 2, num_tags + 2)
tags: Tensor[Idx], # (T, B)
timesteps, batch_size, hidden_dim: int,
bos_tag, eos_tag: Idx
) =
## Computes the un-normalized log probabilities (the sum of emission and
## transition scores at each timestep) for each tagged sequence.
##
## Output:
## - `result` holds the un-normalized sequence scores, shape [batch_size, ]
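##
## The score accumulated for a tag sequence y given inputs x is
##   score(x, y) = trans[BOS, y_0] + emit[0, y_0]
##               + sum over t of mask-weighted (trans[y_(t-1), y_t] + emit[t, y_t])
##               + trans[y_last, EOS]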

# DEBUG
echo (timesteps, batch_size, hidden_dim)
echo input.shape

# Transitions from bos_tag -> tag at time = 0 for all batches
var transition_scores = index_select(transitions[bos_tag, _], axis = 1,
indices = tags[0, _].squeeze()).squeeze()

when compileOption("boundChecks"):
doAssert result.shape == [batch_size], "Result should be of shape" &
fmt" {batch_size} but got {result.shape}"
doAssert transition_scores.shape == [batch_size], "Transition scores" &
fmt" should be of shape {batch_size} but got {transition_scores.shape}"

# Emission scores for tag at t = 0 for all in batch
# Unoptimized - simple loop
var emission_scores = newTensorUninit[input.T](batch_size)

for i in 0 ..< batch_size:
emission_scores[i] = input[0, i, tags[0, i]]

when compileOption("boundChecks"):
doAssert emission_scores.shape == [batch_size], "Emission scores should" &
fmt" be of shape {batch_size} but got {emission_scores.shape}"

emission_scores .*= mask[0, _].squeeze()

result += transition_scores + emission_scores

# TODO: Optimize?
for i in 1 ..< timesteps - 1:
let
old_tags = tags[i - 1, _].squeeze(1)
new_tags = tags[i, _].squeeze(1)

old_mask = mask[i, _].squeeze()
new_mask = mask[i + 1, _].squeeze()

# New emission scores are the emission at time i for batch j to tag [i, j]
for j in 0 ..< batch_size:
emission_scores[j] = input[i, j, tags[i, j]]

# New transition scores
# This applies the transition from old -> new tag across the batch
# Unoptimized version:
# for j in 0 ..< batch_size:
# transition_scores[j] = transitions[old_tags[j], new_tags[j]]
transition_scores.apply3_inline(old_tags, new_tags):
transitions[y, z]

result += (transition_scores .* new_mask) + (emission_scores .* old_mask)

# TODO: Make sure that last transition handled correctly

# A timestep is assumed masked when mask == 0, so the index of the last
# real timestep for each batch element is:
let last_time_inds = (mask.sum(axis=0).squeeze() .- 1).astype(int)
var last_tags = newTensorUninit[tags.T](batch_size)

for i in 0 ..< batch_size:
last_tags[i] = tags[last_time_inds[i], i]

# Set transition scores to last_real_tag -> EOS_TAG across batch
transitions[_, eos_tag].squeeze().index_select(axis=0, indices=last_tags, result=transition_scores)

result += transition_scores

proc compute_log_partition_function[T](): Tensor[T] =
## Compute the partition function log Z(x) using the forward algorithm,
## avoiding the explicit calculation of probabilities for every possible
## sequence configuration.
discard
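
As a reference for eventually filling in this stub: a minimal sketch of the forward algorithm under the same (timesteps, batch_size, num_tags) layout as `compute_scores`. The proc name and loop structure are illustrative only, not part of this diff, and it assumes the `logsumexp` helper imported above:

proc log_partition_sketch[T: SomeFloat](
    input: Tensor[T],        # (timesteps, batch_size, num_tags)
    mask: Tensor[T],         # (timesteps, batch_size)
    transitions: Tensor[T],  # (num_tags + 2, num_tags + 2)
    timesteps, batch_size, num_tags: int,
    bos_tag, eos_tag: int
): Tensor[T] =
  # alpha[b, j] = log-sum of the scores of all prefixes ending in tag j
  var alpha = newTensorUninit[T](batch_size, num_tags)
  for b in 0 ..< batch_size:
    for j in 0 ..< num_tags:
      alpha[b, j] = transitions[bos_tag, j] + input[0, b, j]

  var scores = newTensorUninit[T](num_tags)
  for t in 1 ..< timesteps:
    var next_alpha = alpha.clone()
    for b in 0 ..< batch_size:
      if mask[t, b] != 0:  # padded timesteps keep their previous alpha
        for j in 0 ..< num_tags:
          for i in 0 ..< num_tags:
            scores[i] = alpha[b, i] + transitions[i, j] + input[t, b, j]
          next_alpha[b, j] = scores.logsumexp
    alpha = next_alpha

  # Close each sequence with the transition into EOS, then reduce over tags
  result = newTensorUninit[T](batch_size)
  for b in 0 ..< batch_size:
    for i in 0 ..< num_tags:
      scores[i] = alpha[b, i] + transitions[i, eos_tag]
    result[b] = scores.logsumexp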

proc crf_forward*[T: SomeFloat](
result: var Tensor[T],
input: Tensor[T],
mask: Tensor[T],
transitions: Tensor[T],
tags: Tensor[Idx],
timesteps, batch_size, hidden_dim: int,
bos_tag, eos_tag: Idx
) =
## Computes the log likelihood of the tag sequence given the input
## (emission scores) and the transitions matrix.
## The loss should be the *negative* log likelihood.
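## Note: the full negative log likelihood is log Z(x) - score(x, y); the
## partition term (compute_log_partition_function above) is still a TODO,
## so only the score term is accumulated here.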
result = zeros[T](batch_size)
result.compute_scores(input, mask, transitions, tags, timesteps, batch_size,
hidden_dim, bos_tag, eos_tag)
20 changes: 20 additions & 0 deletions src/tensor/shapeshifting.nim
@@ -318,3 +318,23 @@ func index_select*[T; Idx: byte or char or SomeNumber](t: Tensor[T], axis: int,
var r_slice = result.atAxisIndex(axis, i)
var t_slice = t.atAxisIndex(axis, int(index))
r_slice.copyFrom(t_slice)

proc index_select*[T; Idx: byte or char or SomeNumber](t: Tensor[T], axis: int, indices: Tensor[Idx], result: var Tensor[T]) =
## Same as the `index_select` function, but use a preallocated tensor for
## output.
doAssert indices.shape.len == 1

var select_shape = t.shape
select_shape[axis] = indices.shape[0]

if select_shape != result.shape:
# FIXME: find a better way of resizing the result when necessary
if select_shape.product() == result.size():
result = result.reshape(select_shape)
else:
result = newTensorUninit[T](select_shape)

for i, index in enumerate(indices):
var r_slice = result.atAxisIndex(axis, i)
var t_slice = t.atAxisIndex(axis, int(index))
r_slice.copyFrom(t_slice)
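
For illustration, a hypothetical snippet exercising the preallocated overload (names and values are made up for the example):

import arraymancer

let t = [[1, 2], [3, 4], [5, 6]].toTensor()  # shape [3, 2]
let order = [2, 0].toTensor()                # gather rows 2 and 0
var buffer = newTensorUninit[int](2, 2)      # preallocated destination
t.index_select(axis = 0, indices = order, result = buffer)
doAssert buffer == [[5, 6], [1, 2]].toTensor()
# A later call with a matching output shape reuses `buffer` instead of
# allocating a fresh tensor on every gather.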