Posted to commits@mxnet.apache.org by jx...@apache.org on 2017/08/02 21:52:19 UTC

[incubator-mxnet] branch master updated: lstm crf example (#7253)

This is an automated email from the ASF dual-hosted git repository.

jxie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new e0639eb  lstm crf example (#7253)
e0639eb is described below

commit e0639eb16f9fae926151a1c8a244877a535becf5
Author: Sheng Zha <sz...@users.noreply.github.com>
AuthorDate: Wed Aug 2 14:52:17 2017 -0700

    lstm crf example (#7253)
---
 example/gluon/lstm_crf.py | 213 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 213 insertions(+)

diff --git a/example/gluon/lstm_crf.py b/example/gluon/lstm_crf.py
new file mode 100644
index 0000000..8344789
--- /dev/null
+++ b/example/gluon/lstm_crf.py
@@ -0,0 +1,213 @@
+import mxnet as mx
+from mxnet import autograd as ag, ndarray as nd, gluon
+from mxnet.gluon import Block, nn, rnn
+
+# This example demonstrates how an LSTM-CRF model can be implemented in Gluon to perform
+# noun-phrase chunking as a sequence labeling task.
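+# The tags follow the BIO scheme: "B" marks the first token of a noun phrase, "I" a
+# token inside one, and "O" a token outside any noun phrase.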
+
+mx.random.seed(1)
+
+# Helper functions to make the code more readable.
+def to_scalar(x):
+    return int(x.asscalar())
+
+def argmax(vec):
+    # return the argmax as a python int
+    idx = nd.argmax(vec, axis=1)
+    return to_scalar(idx)
+
+def prepare_sequence(seq, word2idx):
+    return nd.array([word2idx[w] for w in seq])
+
+# Computing log-sum-exp this way is numerically more stable than multiplying
+# probabilities directly.
+def log_sum_exp(vec):
+    max_score = nd.max(vec).asscalar()
+    return nd.log(nd.sum(nd.exp(vec - max_score))) + max_score
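+
+# Quick illustration (added here for exposition, not used by the model): with large
+# scores, exponentiating directly overflows float32, while the shifted form above
+# stays finite.
+_demo_scores = nd.array([1000., 1001., 1002.])
+assert nd.log(nd.sum(nd.exp(_demo_scores))).asscalar() == float('inf')
+assert abs(log_sum_exp(_demo_scores).asscalar() - 1002.4076) < 1e-3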
+
+# Model
+class BiLSTM_CRF(Block):
+    def __init__(self, vocab_size, tag2idx, embedding_dim, hidden_dim):
+        super(BiLSTM_CRF, self).__init__()
+        with self.name_scope():
+            self.embedding_dim = embedding_dim
+            self.hidden_dim = hidden_dim
+            self.vocab_size = vocab_size
+            self.tag2idx = tag2idx
+            self.tagset_size = len(tag2idx)
+
+            self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
+            self.lstm = rnn.LSTM(hidden_dim // 2, num_layers=1, bidirectional=True)
+
+            # Maps the output of the LSTM into tag space.
+            self.hidden2tag = nn.Dense(self.tagset_size)
+
+            # Matrix of transition parameters.  Entry i,j is the score of
+            # transitioning *to* i *from* j.  It is registered as a Gluon Parameter
+            # so that the Trainer updates it along with the network weights.
+            self.transitions = self.params.get(
+                "crf_transition_matrix", shape=(self.tagset_size, self.tagset_size))
+
+            self.hidden = self.init_hidden()
+
+    def init_hidden(self):
+        # Hidden and cell states for the bidirectional LSTM, each of shape
+        # (num_layers * num_directions, batch_size, hidden_dim // 2).
+        return [nd.random_normal(shape=(2, 1, self.hidden_dim // 2)),
+                nd.random_normal(shape=(2, 1, self.hidden_dim // 2))]
+
+    def _forward_alg(self, feats):
+        # Run the forward algorithm to compute the log of the CRF partition function
+        # Z(x), i.e. the log-sum-exp of the scores of all possible tag sequences.
+        alphas = [[-10000.] * self.tagset_size]
+        alphas[0][self.tag2idx[START_TAG]] = 0.
+        alphas = nd.array(alphas)
+
+        # Iterate through the sentence
+        for feat in feats:
+            alphas_t = []  # The forward variables at this timestep
+            for next_tag in range(self.tagset_size):
+                # broadcast the emission score: it is the same regardless of
+                # the previous tag
+                emit_score = feat[next_tag].reshape((1, -1))
+                # the ith entry of trans_score is the score of transitioning to
+                # next_tag from i
+                trans_score = self.transitions.data()[next_tag].reshape((1, -1))
+                # The ith entry of next_tag_var is the value for the
+                # edge (i -> next_tag) before we do log-sum-exp
+                next_tag_var = alphas + trans_score + emit_score
+                # The forward variable for this tag is log-sum-exp of all the
+                # scores.
+                alphas_t.append(log_sum_exp(next_tag_var))
+            alphas = nd.concat(*alphas_t, dim=0).reshape((1, -1))
+        terminal_var = alphas + self.transitions.data()[self.tag2idx[STOP_TAG]]
+        alpha = log_sum_exp(terminal_var)
+        return alpha
+
+    def _get_lstm_features(self, sentence):
+        # Run the BiLSTM over the sentence and return one emission-score vector
+        # (length tagset_size) per token.
+        self.hidden = self.init_hidden()
+        length = sentence.shape[0]
+        embeds = self.word_embeds(sentence).reshape((length, 1, -1))
+        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
+        lstm_out = lstm_out.reshape((length, self.hidden_dim))
+        lstm_feats = self.hidden2tag(lstm_out)
+        return nd.split(lstm_feats, num_outputs=length, axis=0, squeeze_axis=True)
+
+    def _score_sentence(self, feats, tags):
+        # Gives the score of a provided tag sequence
+        score = nd.array([0])
+        tags = nd.concat(nd.array([self.tag2idx[START_TAG]]), *tags, dim=0)
+        for i, feat in enumerate(feats):
+            score = score + \
+                self.transitions.data()[to_scalar(tags[i+1]), to_scalar(tags[i])] + \
+                feat[to_scalar(tags[i+1])]
+        score = score + self.transitions.data()[self.tag2idx[STOP_TAG],
+                                                to_scalar(tags[int(tags.shape[0]-1)])]
+        return score
+
+    def _viterbi_decode(self, feats):
+        backpointers = []
+
+        # Initialize the viterbi variables in log space
+        vvars = nd.full((1, self.tagset_size), -10000.)
+        vvars[0, self.tag2idx[START_TAG]] = 0
+
+        for feat in feats:
+            bptrs_t = []  # holds the backpointers for this step
+            viterbivars_t = []  # holds the viterbi variables for this step
+
+            for next_tag in range(self.tagset_size):
+                # next_tag_var[i] holds the viterbi variable for tag i at the
+                # previous step, plus the score of transitioning
+                # from tag i to next_tag.
+                # We don't include the emission scores here because the max
+                # does not depend on them (we add them in below)
+                next_tag_var = vvars + self.transitions.data()[next_tag]
+                best_tag_id = argmax(next_tag_var)
+                bptrs_t.append(best_tag_id)
+                viterbivars_t.append(next_tag_var[0, best_tag_id])
+            # Now add in the emission scores, and assign vvars to the set
+            # of viterbi variables we just computed
+            vvars = (nd.concat(*viterbivars_t, dim=0) + feat).reshape((1, -1))
+            backpointers.append(bptrs_t)
+
+        # Transition to STOP_TAG
+        terminal_var = vvars + self.transitions.data()[self.tag2idx[STOP_TAG]]
+        best_tag_id = argmax(terminal_var)
+        path_score = terminal_var[0, best_tag_id]
+
+        # Follow the back pointers to decode the best path.
+        best_path = [best_tag_id]
+        for bptrs_t in reversed(backpointers):
+            best_tag_id = bptrs_t[best_tag_id]
+            best_path.append(best_tag_id)
+        # Pop off the start tag (we don't want to return that to the caller)
+        start = best_path.pop()
+        assert start == self.tag2idx[START_TAG]  # Sanity check
+        best_path.reverse()
+        return path_score, best_path
+
+    def neg_log_likelihood(self, sentence, tags):
+        # CRF training loss: -log p(tags | sentence) = log Z(sentence) - score(sentence, tags),
+        # where log Z comes from _forward_alg and the gold path score from _score_sentence.
+        feats = self._get_lstm_features(sentence)
+        forward_score = self._forward_alg(feats)
+        gold_score = self._score_sentence(feats, tags)
+        return forward_score - gold_score
+
+    def forward(self, sentence):  # don't confuse this with _forward_alg above.
+        # Get the emission scores from the BiLSTM
+        lstm_feats = self._get_lstm_features(sentence)
+
+        # Find the best path, given the features.
+        score, tag_seq = self._viterbi_decode(lstm_feats)
+        return score, tag_seq
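+
+# Training minimizes neg_log_likelihood (log partition function minus gold path score);
+# prediction calls forward(), which Viterbi-decodes the highest-scoring tag sequence.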
+
+# Run training
+START_TAG = "<START>"
+STOP_TAG = "<STOP>"
+EMBEDDING_DIM = 5
+HIDDEN_DIM = 4
+
+# Make up some training data
+training_data = [(
+    "the wall street journal reported today that apple corporation made money".split(),
+    "B I I I O O O B I O O".split()
+), (
+    "georgia tech is a university in georgia".split(),
+    "B I O O O O B".split()
+)]
+
+word2idx = {}
+for sentence, tags in training_data:
+    for word in sentence:
+        if word not in word2idx:
+            word2idx[word] = len(word2idx)
+
+tag2idx = {"B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4}
+
+model = BiLSTM_CRF(len(word2idx), tag2idx, EMBEDDING_DIM, HIDDEN_DIM)
+model.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=mx.cpu())
+optimizer = gluon.Trainer(model.collect_params(), 'sgd', {'learning_rate': 0.01, 'wd': 1e-4})
+
+# Check predictions before training
+precheck_sent = prepare_sequence(training_data[0][0], word2idx)
+precheck_tags = nd.array([tag2idx[t] for t in training_data[0][1]])
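+# model() returns a (viterbi path score, list of predicted tag indices) pair; before
+# training these predictions are essentially arbitrary.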
+print(model(precheck_sent))
+
+# Train the model.
+for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
+    for sentence, tags in training_data:
+        # Step 1. Get our inputs ready for the network, that is,
+        # turn them into NDArrays of word indices.
+        # Remember to use autograd to record the calculation.
+        with ag.record():
+            sentence_in = prepare_sequence(sentence, word2idx)
+            targets = nd.array([tag2idx[t] for t in tags])
+
+            # Step 2. Run our forward pass.
+            neg_log_likelihood = model.neg_log_likelihood(sentence_in, targets)
+
+            # Step 3. Compute the gradients by calling backward(), then update
+            # the parameters with optimizer.step()
+            neg_log_likelihood.backward()
+        optimizer.step(1)
+
+# Check predictions after training
+precheck_sent = prepare_sequence(training_data[0][0], word2idx)
+print(model(precheck_sent))
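+
+# Illustrative additions, not part of the original example: idx2tag is a helper
+# introduced here to map predicted tag indices back to tag strings, and the commented
+# line shows one way to persist the trained weights.
+idx2tag = {idx: tag for tag, idx in tag2idx.items()}
+score, tag_seq = model(precheck_sent)
+print([idx2tag[i] for i in tag_seq])
+# model.collect_params().save('lstm_crf.params')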
+
+# Acknowledgement: this example is adapted from the PyTorch NLP tutorials.
