Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/06/30 20:31:12 UTC

[GitHub] szha closed pull request #10514: new NER example: MXNET-321

szha closed pull request #10514: new NER example: MXNET-321
URL: https://github.com/apache/incubator-mxnet/pull/10514
 
 
   

This is a PR merged from a forked repository. Because GitHub hides the
original diff of a foreign (forked) pull request once it is merged, the
diff is reproduced below for the sake of provenance:

diff --git a/example/named_entity_recognition/README.md b/example/named_entity_recognition/README.md
new file mode 100644
index 00000000000..260c19d5ffb
--- /dev/null
+++ b/example/named_entity_recognition/README.md
@@ -0,0 +1,19 @@
+## Goal
+
+- This repo contains an MXNet implementation of the state-of-the-art [entity recognition model](https://www.aclweb.org/anthology/Q16-1026).
+- You can find my blog post on the model [here](https://opringle.github.io/2018/02/06/CNNLSTM_entity_recognition.html).
+
+![](https://github.com/dmlc/web-data/blob/master/mxnet/example/ner/arch1.png?raw=true)
+
+## Running the code
+
+To reproduce the preprocessed training data:
+
+1. Download and unzip the data: https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/downloads/ner_dataset.csv
+2. Move ner_dataset.csv into `./data`
+3. Create the `./preprocessed_data` directory
+4. `$ cd src && python preprocess.py`
+
+To train the model:
+
+- `$ cd src && python ner.py`
\ No newline at end of file
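
As a quick, optional sanity check before step 4 of the README above, the snippet
below (not part of the example code) confirms the Kaggle csv from steps 1-2
landed where preprocess.py expects it, using the same encoding the script reads
with:

    import pandas as pd

    # assumes this is run from the example root, where ./data/ner_dataset.csv lives
    df = pd.read_csv("./data/ner_dataset.csv", encoding="ISO-8859-1", nrows=5)
    print(df.columns.tolist())   # expect 'Sentence #', 'Word', 'POS', 'Tag' among them
    print(df.head())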
diff --git a/example/named_entity_recognition/src/iterators.py b/example/named_entity_recognition/src/iterators.py
new file mode 100644
index 00000000000..a11c570ffd2
--- /dev/null
+++ b/example/named_entity_recognition/src/iterators.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# -*- coding: utf-8 -*-
+
+import bisect
+import random
+import numpy as np
+from mxnet.io import DataIter, DataBatch, DataDesc
+from mxnet import ndarray
+from sklearn.utils import shuffle
+
+class BucketNerIter(DataIter):
+    """
+    This iterator can handle variable length feature/label arrays for MXNet RNN classifiers.
+    This iterator can ingest 2d list of sentences, 2d list of entities and 3d list of characters.
+    """
+
+    def __init__(self, sentences, characters, label, max_token_chars, batch_size, buckets=None, data_pad=-1, label_pad=-1, data_names=['sentences', 'characters'],
+                 label_name='seq_label', dtype='float32'):
+
+        super(BucketNerIter, self).__init__()
+
+        # Create a bucket for every seq length where there are more examples than the batch size
+        if not buckets:
+            seq_counts = np.bincount([len(s) for s in sentences])
+            buckets = [i for i, j in enumerate(seq_counts) if j >= batch_size]
+        buckets.sort()
+        print("\nBuckets  created: ", buckets)
+        assert(len(buckets) > 0), "Not enough utterances to create any buckets."
+
+        ###########
+        # Sentences
+        ###########
+        nslice = 0
+        # Create empty nested lists for storing data that falls into each bucket
+        self.sentences = [[] for _ in buckets]
+        for i, sent in enumerate(sentences):
+            # Find the index of the smallest bucket that is at least as long as the sentence
+            buck_idx = bisect.bisect_left(buckets, len(sent))
+
+            if buck_idx == len(buckets): # If the sentence is larger than the largest bucket
+                buck_idx = buck_idx - 1
+                nslice += 1
+                sent = sent[:buckets[buck_idx]] #Slice sentence to largest bucket size
+
+            buff = np.full((buckets[buck_idx]), data_pad, dtype=dtype) # Create an array filled with 'data_pad'
+            buff[:len(sent)] = sent # Fill with actual values
+            self.sentences[buck_idx].append(buff) # Append array to index = bucket index
+        self.sentences = [np.asarray(i, dtype=dtype) for i in self.sentences] # Convert to list of array
+        print("Warning, {0} sentences sliced to largest bucket size.".format(nslice)) if nslice > 0 else None
+
+        ############
+        # Characters
+        ############
+        # Create empty nested lists for storing data that falls into each bucket
+        self.characters = [[] for _ in buckets]
+        for i, charsent in enumerate(characters):
+            # Find the index of the smallest bucket that is at least as long as the sentence
+            buck_idx = bisect.bisect_left(buckets, len(charsent))
+
+            if buck_idx == len(buckets): # If the sentence is larger than the largest bucket
+                buck_idx = buck_idx - 1
+                charsent = charsent[:buckets[buck_idx]] #Slice sentence to largest bucket size
+
+            charsent = [word[:max_token_chars] for word in charsent]  # Slice each word to max_token_chars
+            charsent = [word + [data_pad] * (max_token_chars - len(word)) for word in charsent]  # Pad each word to max_token_chars
+            charsent = np.array(charsent)
+            buff = np.full((buckets[buck_idx], max_token_chars), data_pad, dtype=dtype)
+            buff[:charsent.shape[0], :] = charsent # Fill with actual values
+            self.characters[buck_idx].append(buff) # Append array to index = bucket index
+        self.characters = [np.asarray(i, dtype=dtype) for i in self.characters] # Convert to list of array
+
+        ##########
+        # Entities
+        ##########
+        # Create empty nested lists for storing data that falls into each bucket
+        self.label = [[] for _ in buckets]
+        self.indices = [[] for _ in buckets]
+        for i, entities in enumerate(label):
+            # Find the index of the smallest bucket that is at least as long as the sentence
+            buck_idx = bisect.bisect_left(buckets, len(entities))
+
+            if buck_idx == len(buckets):  # If the sentence is larger than the largest bucket
+                buck_idx = buck_idx - 1
+                entities = entities[:buckets[buck_idx]]  # Slice sentence to largest bucket size
+
+            buff = np.full((buckets[buck_idx]), label_pad, dtype=dtype)  # Create an array filled with 'label_pad'
+            buff[:len(entities)] = entities  # Fill with actual values
+            self.label[buck_idx].append(buff)  # Append array to index = bucket index
+            self.indices[buck_idx].append(i)
+        self.label = [np.asarray(i, dtype=dtype) for i in self.label]  # Convert to list of array
+        self.indices = [np.asarray(i, dtype=dtype) for i in self.indices]  # Convert to list of array
+
+        self.data_names = data_names
+        self.label_name = label_name
+        self.batch_size = batch_size
+        self.max_token_chars = max_token_chars
+        self.buckets = buckets
+        self.dtype = dtype
+        self.data_pad = data_pad
+        self.label_pad = label_pad
+        self.default_bucket_key = max(buckets)
+        self.layout = 'NT'
+
+        self.provide_data = [DataDesc(name=self.data_names[0], shape=(self.batch_size, self.default_bucket_key), layout=self.layout),
+                             DataDesc(name=self.data_names[1], shape=(self.batch_size, self.default_bucket_key, self.max_token_chars), layout=self.layout)]
+        self.provide_label=[DataDesc(name=self.label_name, shape=(self.batch_size, self.default_bucket_key), layout=self.layout)]
+
+        #create empty list to store batch index values
+        self.idx = []
+        #for each bucketarray
+        for i, buck in enumerate(self.sentences):
+            #extend the batch index list, e.g. with batch size 5 and 20 training examples in bucket i this appends [(i, 0), (i, 5), (i, 10), (i, 15)]
+            self.idx.extend([(i, j) for j in range(0, len(buck) - batch_size + 1, batch_size)])
+        self.curr_idx = 0
+        self.reset()
+
+    def reset(self):
+        """Resets the iterator to the beginning of the data."""
+        self.curr_idx = 0
+        #shuffle the order in which batches are visited
+        random.shuffle(self.idx)
+        #shuffle the examples within each bucket, keeping sentences, characters and labels aligned
+        for i, buck in enumerate(self.sentences):
+            self.indices[i], self.sentences[i], self.characters[i], self.label[i] = shuffle(self.indices[i],
+                                                                                            self.sentences[i],
+                                                                                            self.characters[i],
+                                                                                            self.label[i])
+
+        self.ndindex = []
+        self.ndsent = []
+        self.ndchar = []
+        self.ndlabel = []
+
+        #for each bucket of data
+        for i, buck in enumerate(self.sentences):
+            #append the lists with an array
+            self.ndindex.append(ndarray.array(self.indices[i], dtype=self.dtype))
+            self.ndsent.append(ndarray.array(self.sentences[i], dtype=self.dtype))
+            self.ndchar.append(ndarray.array(self.characters[i], dtype=self.dtype))
+            self.ndlabel.append(ndarray.array(self.label[i], dtype=self.dtype))
+
+    def next(self):
+        """Returns the next batch of data."""
+        if self.curr_idx == len(self.idx):
+            raise StopIteration
+        #i = bucket index, j = index of the first record in the batch
+        i, j = self.idx[self.curr_idx]
+        self.curr_idx += 1
+
+        indices = self.ndindex[i][j:j + self.batch_size]
+        sentences = self.ndsent[i][j:j + self.batch_size]
+        characters = self.ndchar[i][j:j + self.batch_size]
+        label = self.ndlabel[i][j:j + self.batch_size]
+
+        return DataBatch([sentences, characters], [label], pad=0, index=indices, bucket_key=self.buckets[i],
+                         provide_data=[DataDesc(name=self.data_names[0], shape=sentences.shape, layout=self.layout),
+                                       DataDesc(name=self.data_names[1], shape=characters.shape, layout=self.layout)],
+                         provide_label=[DataDesc(name=self.label_name, shape=label.shape, layout=self.layout)])
\ No newline at end of file
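
A minimal sketch of driving BucketNerIter with toy data (not part of the PR; the
token/character/tag indices below are made up, and it assumes this is run from
the src directory so that iterators.py is importable):

    import numpy as np
    from iterators import BucketNerIter

    batch_size = 2
    # four toy "sentences" of token indices: two of length 3 and two of length 4,
    # so two buckets (3 and 4) each hold enough examples for one batch
    toy_sentences = [[1, 2, 3], [4, 5, 6], [1, 2, 3, 4], [5, 6, 7, 8]]
    # character indices, one list per token
    toy_chars = [[[1], [2, 3], [4]],
                 [[5], [6], [7, 8]],
                 [[1], [2], [3], [4, 5]],
                 [[6], [7], [8], [9]]]
    # entity-tag indices, one per token
    toy_labels = [[0, 1, 0], [0, 0, 1], [1, 0, 0, 0], [0, 0, 0, 1]]

    train_iter = BucketNerIter(sentences=toy_sentences, characters=toy_chars,
                               label=toy_labels, max_token_chars=5,
                               batch_size=batch_size)
    batch = next(train_iter)
    print(batch.bucket_key, batch.data[0].shape, batch.data[1].shape, batch.label[0].shape)

Each batch is padded to its bucket's length, so the shapes printed above should
be (batch_size, bucket_key) for sentences and labels, and
(batch_size, bucket_key, max_token_chars) for the characters.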
diff --git a/example/named_entity_recognition/src/metrics.py b/example/named_entity_recognition/src/metrics.py
new file mode 100644
index 00000000000..40c5015e81b
--- /dev/null
+++ b/example/named_entity_recognition/src/metrics.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# -*- coding: utf-8 -*-
+
+import mxnet as mx
+import numpy as np
+import pickle
+
+def load_obj(name):
+    with open(name + '.pkl', 'rb') as f:
+        return pickle.load(f)
+
+tag_dict = load_obj("../preprocessed_data/tag_to_index")
+not_entity_index = tag_dict["O"]
+
+def classifier_metrics(label, pred):
+    """
+    Computes precision, recall and F1 on the entity classes (all tags other than 'O')
+    """
+    prediction = np.argmax(pred, axis=1)
+    label = label.astype(int)
+
+    pred_is_entity = prediction != not_entity_index
+    label_is_entity = label != not_entity_index
+
+    corr_pred = (prediction == label) & pred_is_entity
+
+    #how many entities are there?
+    num_entities = np.sum(label_is_entity)
+    entity_preds = np.sum(pred_is_entity)
+
+    #how many times did we correctly predict an entity?
+    correct_entities = np.sum(corr_pred[pred_is_entity])
+
+    #precision: when we predict an entity, how often are we right?
+    precision = correct_entities / entity_preds if entity_preds > 0 else np.nan
+
+    #recall: of the tokens that were entities, how many did we catch?
+    recall = correct_entities / num_entities if num_entities > 0 else np.nan
+    f1 = 2 * precision * recall / (precision + recall)
+    return precision, recall, f1
+
+def entity_precision(label, pred):
+    return classifier_metrics(label, pred)[0]
+
+def entity_recall(label, pred):
+    return classifier_metrics(label, pred)[1]
+
+def entity_f1(label, pred):
+    return classifier_metrics(label, pred)[2]
+
+def composite_classifier_metrics():
+    metric1 = mx.metric.CustomMetric(feval=entity_precision, name='entity precision')
+    metric2 = mx.metric.CustomMetric(feval=entity_recall, name='entity recall')
+    metric3 = mx.metric.CustomMetric(feval=entity_f1, name='entity f1 score')
+    metric4 = mx.metric.Accuracy()
+
+    return mx.metric.CompositeEvalMetric([metric4, metric1, metric2, metric3])
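
A small self-contained illustration (not in the PR) of the entity
precision/recall arithmetic computed above; here the non-entity "O" tag is
assumed to map to index 0, whereas the real code reads the index from
tag_to_index.pkl:

    import numpy as np

    not_entity_index = 0
    label = np.array([0, 1, 2, 0, 1])        # true tag indices
    prediction = np.array([0, 1, 0, 2, 1])   # predicted tag indices

    pred_is_entity = prediction != not_entity_index
    label_is_entity = label != not_entity_index

    true_positives = np.sum((prediction == label) & pred_is_entity)  # 2
    precision = true_positives / np.sum(pred_is_entity)              # 2/3: predicted 3 entities, 2 correct
    recall = true_positives / np.sum(label_is_entity)                # 2/3: 3 true entities, 2 found
    f1 = 2 * precision * recall / (precision + recall)               # 2/3
    print(precision, recall, f1)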
diff --git a/example/named_entity_recognition/src/ner.py b/example/named_entity_recognition/src/ner.py
new file mode 100644
index 00000000000..561db4c43d9
--- /dev/null
+++ b/example/named_entity_recognition/src/ner.py
@@ -0,0 +1,236 @@
+#!/usr/bin/env python
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# -*- coding: utf-8 -*-
+
+from collections import Counter
+import itertools
+import iterators
+import os
+import numpy as np
+import pandas as pd
+import mxnet as mx
+import argparse
+import pickle
+import logging
+
+logging.basicConfig(level=logging.DEBUG)
+
+parser = argparse.ArgumentParser(description="Deep neural network for named entity recognition",
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--data-dir', type=str, default='../preprocessed_data',
+                    help='relative path to input data')
+parser.add_argument('--output-dir', type=str, default='../results',
+                    help='directory to save model files to')
+parser.add_argument('--max-records', type=int, default=None,
+                    help='total records before data split')
+parser.add_argument('--train_fraction', type=float, default=0.8,
+                    help='fraction of data to use for training. remainder used for testing.')
+parser.add_argument('--batch-size', type=int, default=128,
+                    help='the batch size.')
+parser.add_argument('--buckets', type=str, default="",
+                    help='unique bucket sizes')
+parser.add_argument('--char-embed', type=int, default=25,
+                    help='Embedding size for each unique character.')
+parser.add_argument('--char-filter-list', type=str, default="3,4,5",
+                    help='unique filter sizes for char level cnn')
+parser.add_argument('--char-filters', type=int, default=20,
+                    help='number of each filter size')
+parser.add_argument('--word-embed', type=int, default=500,
+                    help='Embedding size for each unique word.')
+parser.add_argument('--word-filter-list', type=str, default="3,4,5",
+                    help='unique filter sizes for word level cnn')
+parser.add_argument('--word-filters', type=int, default=200,
+                    help='number of each filter size')
+parser.add_argument('--lstm-state-size', type=int, default=100,
+                    help='number of hidden units in each unrolled recurrent cell')
+parser.add_argument('--lstm-layers', type=int, default=1,
+                    help='number of recurrent layers')
+parser.add_argument('--gpus', type=str, default='',
+                    help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu. ')
+parser.add_argument('--optimizer', type=str, default='adam',
+                    help='the optimizer type')
+parser.add_argument('--lr', type=float, default=0.001,
+                    help='initial learning rate')
+parser.add_argument('--dropout', type=float, default=0.2,
+                    help='dropout rate for network')
+parser.add_argument('--num-epochs', type=int, default=100,
+                    help='max num of epochs')
+parser.add_argument('--save-period', type=int, default=20,
+                    help='save checkpoint for every n epochs')
+parser.add_argument('--model_prefix', type=str, default='ner_model',
+                    help='prefix for saving model params')
+
+def save_obj(obj, name):
+    with open(name + '.pkl', 'wb') as f:
+        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
+
+def save_model():
+    if not os.path.exists(args.output_dir):
+        os.mkdir(args.output_dir)
+    return mx.callback.do_checkpoint(os.path.join(args.output_dir, "checkpoint"), args.save_period)
+
+def build_vocab(nested_list):
+    """
+    :param nested_list: list of list of string
+    :return: dictionary mapping from string to int, inverse of that dictionary
+    """
+    # Build vocabulary
+    word_counts = Counter(itertools.chain(*nested_list))
+
+    # Mapping from index to label
+    vocabulary_inv = [x[0] for x in word_counts.most_common()]
+
+    # Mapping from label to index
+    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
+    return vocabulary, vocabulary_inv
+
+def build_iters(data_dir, max_records, train_fraction, batch_size, buckets=None):
+    """
+    Reads a csv of sentences/tag sequences into a pandas dataframe.
+    Converts into X = array(list(int)) & Y = array(list(int))
+    Splits into training and test sets
+    Builds dictionaries mapping labels/features to indices and back
+    :param data_dir: directory to read in csv data from
+    :param max_records: total number of records to randomly select from input data
+    :param train_fraction: fraction of the data to use for training
+    :param batch_size: number of records per mini-batch during training
+    :param buckets: size of each bucket in the iterators
+    :return: train_iter, val_iter, word_to_index, index_to_word, pos_to_index, index_to_pos
+    """
+    # Read in data as numpy array
+    df = pd.read_pickle(os.path.join(data_dir, "ner_data.pkl"))[:max_records]
+
+    # Get feature lists
+    entities=[list(array) for array in df["BILOU_tag"].values]
+    sentences = [list(array) for array in df["token"].values]
+    chars=[[[c for c in word] for word in sentence] for sentence in sentences]
+
+    # Build vocabularies
+    entity_to_index, index_to_entity = build_vocab(entities)
+    word_to_index, index_to_word = build_vocab(sentences)
+    char_to_index, index_to_char = build_vocab([np.array([c for c in word]) for word in index_to_word])
+    save_obj(entity_to_index, os.path.join(data_dir, "tag_to_index"))
+
+    # Map strings to integer values
+    indexed_entities=[list(map(entity_to_index.get, l)) for l in entities]
+    indexed_tokens=[list(map(word_to_index.get, l)) for l in sentences]
+    indexed_chars=[[list(map(char_to_index.get, word)) for word in sentence] for sentence in chars]
+
+    # Split into training and testing data
+    idx=int(len(indexed_tokens)*train_fraction)
+    X_token_train, X_char_train, Y_train = indexed_tokens[:idx], indexed_chars[:idx], indexed_entities[:idx]
+    X_token_test, X_char_test, Y_test = indexed_tokens[idx:], indexed_chars[idx:], indexed_entities[idx:]
+
+    # build iterators to feed batches to network
+    train_iter = iterators.BucketNerIter(sentences=X_token_train, characters=X_char_train, label=Y_train,
+                                         max_token_chars=5, batch_size=batch_size, buckets=buckets)
+    val_iter = iterators.BucketNerIter(sentences=X_token_test, characters=X_char_test, label=Y_test,
+                                         max_token_chars=train_iter.max_token_chars, batch_size=batch_size, buckets=train_iter.buckets)
+    return train_iter, val_iter, word_to_index, char_to_index, entity_to_index
+
+def sym_gen(seq_len):
+    """
+    Build NN symbol depending on the length of the input sequence
+    """
+    sentence_shape = train_iter.provide_data[0][1]
+    char_sentence_shape = train_iter.provide_data[1][1]
+    entities_shape = train_iter.provide_label[0][1]
+
+    X_sent = mx.symbol.Variable(train_iter.provide_data[0].name)
+    X_char_sent = mx.symbol.Variable(train_iter.provide_data[1].name)
+    Y = mx.sym.Variable(train_iter.provide_label[0].name)
+
+    ###############################
+    # Character embedding component
+    ###############################
+    char_embeddings = mx.sym.Embedding(data=X_char_sent, input_dim=len(char_to_index), output_dim=args.char_embed, name='char_embed')
+    char_embeddings = mx.sym.reshape(data=char_embeddings, shape=(0,1,seq_len,-1,args.char_embed), name='char_embed2')
+
+    char_cnn_outputs = []
+    for i, filter_size in enumerate(args.char_filter_list):
+        # Kernel that slides over entire words resulting in a 1d output
+        convi = mx.sym.Convolution(data=char_embeddings, kernel=(1, filter_size, args.char_embed), stride=(1, 1, 1),
+                                   num_filter=args.char_filters, name="char_conv_layer_" + str(i))
+        acti = mx.sym.Activation(data=convi, act_type='tanh')
+        pooli = mx.sym.Pooling(data=acti, pool_type='max', kernel=(1, char_sentence_shape[2] - filter_size + 1, 1),
+                               stride=(1, 1, 1), name="char_pool_layer_" + str(i))
+        pooli = mx.sym.transpose(mx.sym.Reshape(pooli, shape=(0, 0, 0)), axes=(0, 2, 1), name="cchar_conv_layer_" + str(i))
+        char_cnn_outputs.append(pooli)
+
+    # combine features from all filters & apply dropout
+    cnn_char_features = mx.sym.Concat(*char_cnn_outputs, dim=2, name="cnn_char_features")
+    regularized_cnn_char_features = mx.sym.Dropout(data=cnn_char_features, p=args.dropout, mode='training',
+                                                   name='regularized_char_cnn_features')
+
+    ##################################
+    # Combine char and word embeddings
+    ##################################
+    word_embeddings = mx.sym.Embedding(data=X_sent, input_dim=len(word_to_index), output_dim=args.word_embed, name='word_embed')
+    rnn_features = mx.sym.Concat(*[word_embeddings, regularized_cnn_char_features], dim=2, name='rnn_input')
+
+    ##############################
+    # Bidirectional LSTM component
+    ##############################
+
+    # unroll the lstm cell in time, merging outputs
+    bi_cell.reset()
+    output, states = bi_cell.unroll(length=seq_len, inputs=rnn_features, merge_outputs=True)
+
+    # Map to num entity classes
+    rnn_output = mx.sym.Reshape(output, shape=(-1, args.lstm_state_size * 2), name='r_output')
+    fc = mx.sym.FullyConnected(data=rnn_output, num_hidden=len(entity_to_index), name='fc_layer')
+
+    # reshape back to same shape as loss will be
+    reshaped_fc = mx.sym.transpose(mx.sym.reshape(fc, shape=(-1, seq_len, len(entity_to_index))), axes=(0, 2, 1))
+    sm = mx.sym.SoftmaxOutput(data=reshaped_fc, label=Y, ignore_label=-1, use_ignore=True, multi_output=True, name='softmax')
+    return sm, [v.name for v in train_iter.provide_data], [v.name for v in train_iter.provide_label]
+
+def train(train_iter, val_iter):
+    import metrics
+    devs = mx.cpu() if args.gpus is None or args.gpus == '' else [mx.gpu(int(i)) for i in args.gpus.split(',')]
+    module = mx.mod.BucketingModule(sym_gen, train_iter.default_bucket_key, context=devs)
+    module.fit(train_data=train_iter,
+               eval_data=val_iter,
+               eval_metric=metrics.composite_classifier_metrics(),
+               optimizer=args.optimizer,
+               optimizer_params={'learning_rate': args.lr },
+               initializer=mx.initializer.Uniform(0.1),
+               num_epoch=args.num_epochs,
+               epoch_end_callback=save_model())
+
+if __name__ == '__main__':
+    # parse args
+    args = parser.parse_args()
+    args.buckets = list(map(int, args.buckets.split(','))) if len(args.buckets) > 0 else None
+    args.char_filter_list = list(map(int, args.char_filter_list.split(',')))
+
+    # Build data iterators
+    train_iter, val_iter, word_to_index, char_to_index, entity_to_index = build_iters(args.data_dir, args.max_records,
+                                                                     args.train_fraction, args.batch_size, args.buckets)
+
+    # Define the recurrent layer
+    bi_cell = mx.rnn.SequentialRNNCell()
+    for layer_num in range(args.lstm_layers):
+        bi_cell.add(mx.rnn.BidirectionalCell(
+            mx.rnn.LSTMCell(num_hidden=args.lstm_state_size, prefix="forward_layer_" + str(layer_num)),
+            mx.rnn.LSTMCell(num_hidden=args.lstm_state_size, prefix="backward_layer_" + str(layer_num))))
+        bi_cell.add(mx.rnn.DropoutCell(args.dropout))
+
+    train(train_iter, val_iter)
\ No newline at end of file
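
A hedged sketch (not part of the PR) of loading a saved checkpoint for
inspection after training; it assumes ner.py ran with the defaults above
(--output-dir ../results, --save-period 20) for at least 20 epochs, so the
files checkpoint-symbol.json and checkpoint-0020.params exist:

    import mxnet as mx

    # load the serialized symbol and parameters written by the do_checkpoint callback
    sym, arg_params, aux_params = mx.model.load_checkpoint("../results/checkpoint", 20)
    print(sym.list_outputs())              # should end with 'softmax_output'
    print(sorted(arg_params.keys())[:10])  # embedding / conv / lstm weight names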
diff --git a/example/named_entity_recognition/src/preprocess.py b/example/named_entity_recognition/src/preprocess.py
new file mode 100644
index 00000000000..6ae348ad8ba
--- /dev/null
+++ b/example/named_entity_recognition/src/preprocess.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# -*- coding: utf-8 -*-
+
+import pandas as pd
+import numpy as np
+
+#read in csv of NER training data
+df = pd.read_csv("../data/ner_dataset.csv", encoding="ISO-8859-1")
+
+#rename columns
+df = df.rename(columns = {"Sentence #" : "utterance_id",
+                            "Word" : "token", 
+                            "POS" : "POS_tag", 
+                            "Tag" : "BILOU_tag"})
+
+#clean utterance_id column
+df.loc[:, "utterance_id"] = df["utterance_id"].str.replace('Sentence: ', '')
+
+#fill np.nan utterance ID's with the last valid entry
+df = df.fillna(method='ffill')
+df.loc[:, "utterance_id"] = df["utterance_id"].apply(int)
+
+#melt BILOU tags and tokens into an array per utterance
+df1 = df.groupby("utterance_id")["BILOU_tag"].apply(lambda x: np.array(x)).to_frame().reset_index()
+df2 = df.groupby("utterance_id")["token"].apply(lambda x: np.array(x)).to_frame().reset_index()
+df3 = df.groupby("utterance_id")["POS_tag"].apply(lambda x: np.array(x)).to_frame().reset_index()
+
+#join the results on utterance id
+df = df1.merge(df2.merge(df3, how = "left", on = "utterance_id"), how = "left", on = "utterance_id")
+
+#save the preprocessed dataframe as a pickle file where ner.py expects to find it
+df.to_pickle("../preprocessed_data/ner_data.pkl")
\ No newline at end of file
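
A quick way (not part of the PR) to inspect the output of preprocess.py and
confirm one utterance's tokens line up with its tags; it assumes the script was
run as described in the README, so the pickle sits in ./preprocessed_data
relative to the example root:

    import pandas as pd

    df = pd.read_pickle("./preprocessed_data/ner_data.pkl")
    print(df.columns.tolist())         # expect utterance_id, BILOU_tag, token, POS_tag
    print(df.loc[0, "token"][:5])      # first few tokens of the first utterance
    print(df.loc[0, "BILOU_tag"][:5])  # and their BILOU tags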


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services