Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/01/16 18:57:16 UTC

[GitHub] piiswrong closed pull request #9446: text: use _constants, import modules, and keep code style consistent

piiswrong closed pull request #9446: text: use _constants, import modules, and keep code style consistent
URL: https://github.com/apache/incubator-mxnet/pull/9446
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

diff --git a/python/mxnet/contrib/text/constants.py b/python/mxnet/contrib/text/_constants.py
similarity index 100%
rename from python/mxnet/contrib/text/constants.py
rename to python/mxnet/contrib/text/_constants.py
diff --git a/python/mxnet/contrib/text/embedding.py b/python/mxnet/contrib/text/embedding.py
index 2996f1ea9f..adba867223 100644
--- a/python/mxnet/contrib/text/embedding.py
+++ b/python/mxnet/contrib/text/embedding.py
@@ -29,37 +29,34 @@
 import warnings
 import zipfile
 
-from . import constants as C
-from .indexer import TokenIndexer
+from . import _constants as C
+from . import indexer
 from ... import ndarray as nd
 from ... import registry
 
 
-class TokenEmbedding(TokenIndexer):
+class TokenEmbedding(indexer.TokenIndexer):
     """Token embedding base class.
 
 
-    To load token embeddings from an externally hosted pre-trained
-    token embedding file, such as those of GloVe and FastText, use
-    `TokenEmbedding.create(embedding_name, pretrained_file_name)`. To get all
-    the available `embedding_name` and `pretrained_file_name`, use
+    To load token embeddings from an externally hosted pre-trained token embedding file, such as
+    those of GloVe and FastText, use `TokenEmbedding.create(embedding_name, pretrained_file_name)`.
+    To get all the available `embedding_name` and `pretrained_file_name`, use
     `TokenEmbedding.get_embedding_and_pretrained_file_names()`.
 
-    Alternatively, to load embedding vectors from a custom pre-trained token
-    embedding file, use :class:`~mxnet.text.embedding.CustomEmbedding`.
+    Alternatively, to load embedding vectors from a custom pre-trained token embedding file, use
+    :class:`~mxnet.text.embedding.CustomEmbedding`.
 
-    For every unknown token, if its representation `self.unknown_token` is
-    encountered in the pre-trained token embedding file, index 0 of
-    `self.idx_to_vec` maps to the pre-trained token embedding vector loaded from
-    the file; otherwise, index 0 of `self.idx_to_vec` maps to the token
-    embedding vector initialized by `init_unknown_vec`.
+    For every unknown token, if its representation `self.unknown_token` is encountered in the
+    pre-trained token embedding file, index 0 of `self.idx_to_vec` maps to the pre-trained token
+    embedding vector loaded from the file; otherwise, index 0 of `self.idx_to_vec` maps to the
+    token embedding vector initialized by `init_unknown_vec`.
 
-    If a token is encountered multiple times in the pre-trained token embedding
-    file, only the first-encountered token embedding vector will be loaded and
-    the rest will be skipped.
+    If a token is encountered multiple times in the pre-trained token embedding file, only the
+    first-encountered token embedding vector will be loaded and the rest will be skipped.
 
-    For the same token, its index and embedding vector may vary across different
-    instances of :class:`~mxnet.text.embedding.TokenEmbedding`.
+    For the same token, its index and embedding vector may vary across different instances of
+    :class:`~mxnet.text.embedding.TokenEmbedding`.
 
 
     Properties
@@ -67,20 +64,18 @@ class TokenEmbedding(TokenIndexer):
     token_to_idx : dict mapping str to int
         A dict mapping each token to its index integer.
     idx_to_token : list of strs
-        A list of indexed tokens where the list indices and the token indices
-        are aligned.
+        A list of indexed tokens where the list indices and the token indices are aligned.
     unknown_token : hashable object
-        The representation for any unknown token. In other words, any
-        unknown token will be indexed as the same representation.
+        The representation for any unknown token. In other words, any unknown token will be indexed
+        as the same representation.
     reserved_tokens : list of strs or None
         A list of reserved tokens that will always be indexed.
     vec_len : int
         The length of the embedding vector for each token.
     idx_to_vec : mxnet.ndarray.NDArray
-        For all the indexed tokens in this embedding, this NDArray maps each
-        token's index to an embedding vector. The largest valid index maps
-        to the initialized embedding vector for every reserved token, such as an
-        unknown_token token and a padding token.
+        For all the indexed tokens in this embedding, this NDArray maps each token's index to an
+        embedding vector. The largest valid index maps to the initialized embedding vector for every
+        reserved token, such as an unknown_token token and a padding token.
     """
 
     def __init__(self, **kwargs):
@@ -96,8 +91,7 @@ def _get_pretrained_file_url(cls, pretrained_file_name):
         embedding_cls = cls.__name__.lower()
 
         url_format = '{repo_url}gluon/embeddings/{cls}/{file_name}'
-        return url_format.format(repo_url=repo_url,
-                                 cls=embedding_cls,
+        return url_format.format(repo_url=repo_url, cls=embedding_cls,
                                  file_name=cls._get_download_file_name(pretrained_file_name))
 
     @classmethod
@@ -133,20 +127,17 @@ def _get_pretrained_file(cls, embedding_root, pretrained_file_name):
                     tar.extractall(path=embedding_dir)
         return pretrained_file_path
 
-    def _load_embedding(self, pretrained_file_path, elem_delim,
-                        init_unknown_vec, encoding='utf8'):
+    def _load_embedding(self, pretrained_file_path, elem_delim, init_unknown_vec, encoding='utf8'):
         """Load embedding vectors from the pre-trained token embedding file.
 
 
-        For every unknown token, if its representation `self.unknown_token` is
-        encountered in the pre-trained token embedding file, index 0 of
-        `self.idx_to_vec` maps to the pre-trained token embedding vector loaded
-        from the file; otherwise, index 0 of `self.idx_to_vec` maps to the text
-        embedding vector initialized by `init_unknown_vec`.
+        For every unknown token, if its representation `self.unknown_token` is encountered in the
+        pre-trained token embedding file, index 0 of `self.idx_to_vec` maps to the pre-trained token
+        embedding vector loaded from the file; otherwise, index 0 of `self.idx_to_vec` maps to the
+        text embedding vector initialized by `init_unknown_vec`.
 
-        If a token is encountered multiple times in the pre-trained text
-        embedding file, only the first-encountered token embedding vector will
-        be loaded and the rest will be skipped.
+        If a token is encountered multiple times in the pre-trained text embedding file, only the
+        first-encountered token embedding vector will be loaded and the rest will be skipped.
         """
 
         pretrained_file_path = os.path.expanduser(pretrained_file_path)
@@ -155,8 +146,7 @@ def _load_embedding(self, pretrained_file_path, elem_delim,
             raise ValueError('`pretrained_file_path` must be a valid path to '
                              'the pre-trained token embedding file.')
 
-        logging.info('Loading pre-trained token embedding vectors from %s',
-                     pretrained_file_path)
+        logging.info('Loading pre-trained token embedding vectors from %s', pretrained_file_path)
         vec_len = None
         all_elems = []
         tokens = set()
@@ -167,11 +157,9 @@ def _load_embedding(self, pretrained_file_path, elem_delim,
                 line_num += 1
                 elems = line.rstrip().split(elem_delim)
 
-                assert len(elems) > 1, 'At line %d of the pre-trained text ' \
-                                       'embedding file: the data format of the ' \
-                                       'pre-trained token embedding file %s is ' \
-                                       'unexpected.' \
-                                       % (line_num, pretrained_file_path)
+                assert len(elems) > 1, 'At line %d of the pre-trained text embedding file: the ' \
+                                       'data format of the pre-trained token embedding file %s ' \
+                                       'is unexpected.' % (line_num, pretrained_file_path)
 
                 token, elems = elems[0], [float(i) for i in elems[1:]]
 
@@ -179,15 +167,13 @@ def _load_embedding(self, pretrained_file_path, elem_delim,
                     loaded_unknown_vec = elems
                     tokens.add(self.unknown_token)
                 elif token in tokens:
-                    warnings.warn('At line %d of the pre-trained token embedding '
-                                  'file: the embedding vector for token %s has '
-                                  'been loaded and a duplicate embedding for the '
-                                  'same token is seen and skipped.'
-                                  % (line_num, token))
+                    warnings.warn('At line %d of the pre-trained token embedding file: the '
+                                  'embedding vector for token %s has been loaded and a duplicate '
+                                  'embedding for the  same token is seen and skipped.' %
+                                  (line_num, token))
                 elif len(elems) == 1:
-                    warnings.warn('At line %d of the pre-trained text '
-                                  'embedding file: token %s with 1-dimensional '
-                                  'vector %s is likely a header and is '
+                    warnings.warn('At line %d of the pre-trained text embedding file: token %s '
+                                  'with 1-dimensional vector %s is likely a header and is '
                                   'skipped.' % (line_num, token, elems))
                 else:
                     if vec_len is None:
@@ -197,10 +183,9 @@ def _load_embedding(self, pretrained_file_path, elem_delim,
                         all_elems.extend([0] * vec_len)
                     else:
                         assert len(elems) == vec_len, \
-                            'At line %d of the pre-trained token embedding ' \
-                            'file: the dimension of token %s is %d but the ' \
-                            'dimension of previous tokens is %d. Dimensions ' \
-                            'of all the tokens must be the same.' \
+                            'At line %d of the pre-trained token embedding file: the dimension ' \
+                            'of token %s is %d but the dimension of previous tokens is %d. ' \
+                            'Dimensions of all the tokens must be the same.' \
                             % (line_num, token, len(elems), vec_len)
                     all_elems.extend(elems)
                     self._idx_to_token.append(token)
@@ -211,8 +196,7 @@ def _load_embedding(self, pretrained_file_path, elem_delim,
         self._idx_to_vec = nd.array(all_elems).reshape((-1, self.vec_len))
 
         if loaded_unknown_vec is None:
-            self._idx_to_vec[C.UNKNOWN_IDX] = init_unknown_vec(
-                shape=self.vec_len)
+            self._idx_to_vec[C.UNKNOWN_IDX] = init_unknown_vec(shape=self.vec_len)
         else:
             self._idx_to_vec[C.UNKNOWN_IDX] = nd.array(loaded_unknown_vec)
 
@@ -233,19 +217,17 @@ def get_vecs_by_tokens(self, tokens, lower_case_backup=False):
         tokens : str or list of strs
             A token or a list of tokens.
         lower_case_backup : bool, default False
-            If False, each token in the original case will be looked up; if
-            True, each token in the original case will be looked up first, if
-            not found in the keys of the property `token_to_idx`, the token
-            in the lower case will be looked up.
+            If False, each token in the original case will be looked up; if True, each token in the
+            original case will be looked up first, if not found in the keys of the property
+            `token_to_idx`, the token in the lower case will be looked up.
 
 
         Returns
         -------
         mxnet.ndarray.NDArray:
-            The embedding vector(s) of the token(s). According to numpy
-            conventions, if `tokens` is a string, returns a 1-D NDArray of shape
-            `self.vec_len`; if `tokens` is a list of strings, returns a 2-D
-            NDArray of shape=(len(tokens), self.vec_len).
+            The embedding vector(s) of the token(s). According to numpy conventions, if `tokens` is
+            a string, returns a 1-D NDArray of shape `self.vec_len`; if `tokens` is a list of
+            strings, returns a 2-D NDArray of shape=(len(tokens), self.vec_len).
         """
 
         to_reduce = False
@@ -254,15 +236,14 @@ def get_vecs_by_tokens(self, tokens, lower_case_backup=False):
             to_reduce = True
 
         if not lower_case_backup:
-            indices = [self.token_to_idx.get(token, C.UNKNOWN_IDX)
-                       for token in tokens]
+            indices = [self.token_to_idx.get(token, C.UNKNOWN_IDX) for token in tokens]
         else:
             indices = [self.token_to_idx[token] if token in self.token_to_idx
                        else self.token_to_idx.get(token.lower(), C.UNKNOWN_IDX)
                        for token in tokens]
 
-        vecs = nd.Embedding(nd.array(indices), self.idx_to_vec,
-                            self.idx_to_vec.shape[0], self.idx_to_vec.shape[1])
+        vecs = nd.Embedding(nd.array(indices), self.idx_to_vec, self.idx_to_vec.shape[0],
+                            self.idx_to_vec.shape[1])
 
         return vecs[0] if to_reduce else vecs
 
@@ -273,34 +254,27 @@ def update_token_vectors(self, tokens, new_vectors):
         Parameters
         ----------
         tokens : str or a list of strs
-            A token or a list of tokens whose embedding vector are to be
-            updated.
+            A token or a list of tokens whose embedding vector are to be updated.
         new_vectors : mxnet.ndarray.NDArray
-            An NDArray to be assigned to the embedding vectors of `tokens`.
-            Its length must be equal to the number of `tokens` and its width
-            must be equal to the dimension of embeddings of the glossary. If
-            `tokens` is a singleton, it must be 1-D or 2-D. If `tokens` is a
-            list of multiple strings, it must be 2-D.
+            An NDArray to be assigned to the embedding vectors of `tokens`. Its length must be equal
+            to the number of `tokens` and its width must be equal to the dimension of embeddings of
+            the glossary. If `tokens` is a singleton, it must be 1-D or 2-D. If `tokens` is a list
+            of multiple strings, it must be 2-D.
         """
 
-        assert self.idx_to_vec is not None, \
-            'The property `idx_to_vec` has not been properly set.'
+        assert self.idx_to_vec is not None, 'The property `idx_to_vec` has not been properly set.'
 
         if not isinstance(tokens, list) or len(tokens) == 1:
-            assert isinstance(new_vectors, nd.NDArray) and \
-                len(new_vectors.shape) in [1, 2], \
-                '`new_vectors` must be a 1-D or 2-D NDArray if `tokens` is a ' \
-                'singleton.'
+            assert isinstance(new_vectors, nd.NDArray) and len(new_vectors.shape) in [1, 2], \
+                '`new_vectors` must be a 1-D or 2-D NDArray if `tokens` is a singleton.'
             if not isinstance(tokens, list):
                 tokens = [tokens]
             if len(new_vectors.shape) == 1:
                 new_vectors = new_vectors.expand_dims(0)
 
         else:
-            assert isinstance(new_vectors, nd.NDArray) and \
-                len(new_vectors.shape) == 2, \
-                '`new_vectors` must be a 2-D NDArray if `tokens` is a list ' \
-                'of multiple strings.'
+            assert isinstance(new_vectors, nd.NDArray) and len(new_vectors.shape) == 2, \
+                '`new_vectors` must be a 2-D NDArray if `tokens` is a list of multiple strings.'
         assert new_vectors.shape == (len(tokens), self.vec_len), \
             'The length of new_vectors must be equal to the number of tokens ' \
             'and the width of new_vectors must be equal to the dimension of ' \
@@ -311,12 +285,10 @@ def update_token_vectors(self, tokens, new_vectors):
             if token in self.token_to_idx:
                 indices.append(self.token_to_idx[token])
             else:
-                raise ValueError('Token %s is unknown. To update the embedding '
-                                 'vector for an unknown token, please specify '
-                                 'it explicitly as the `unknown_token` %s in '
-                                 '`tokens`. This is to avoid unintended '
-                                 'updates.' %
-                                 (token, self.idx_to_token[C.UNKNOWN_IDX]))
+                raise ValueError('Token %s is unknown. To update the embedding vector for an '
+                                 'unknown token, please specify it explicitly as the '
+                                 '`unknown_token` %s in `tokens`. This is to avoid unintended '
+                                 'updates.' % (token, self.idx_to_token[C.UNKNOWN_IDX]))
 
         self._idx_to_vec[nd.array(indices)] = new_vectors
 
@@ -325,8 +297,8 @@ def register(embedding_cls):
         """Registers a new token embedding.
 
 
-        Once an embedding is registered, we can create an instance of this
-        embedding with :func:`~mxnet.text.embedding.TokenEmbedding.create`.
+        Once an embedding is registered, we can create an instance of this embedding with
+        :func:`~mxnet.text.embedding.TokenEmbedding.create`.
 
 
         Examples
@@ -340,8 +312,7 @@ def register(embedding_cls):
         <class '__main__.MyTokenEmbed'>
         """
 
-        register_text_embedding = registry.get_register_func(
-            TokenEmbedding, 'token embedding')
+        register_text_embedding = registry.get_register_func(TokenEmbedding, 'token embedding')
         return register_text_embedding(embedding_cls)
 
     @staticmethod
@@ -349,11 +320,10 @@ def create(embedding_name, **kwargs):
         """Creates an instance of :class:`~mxnet.text.embedding.TokenEmbedding`.
 
 
-        Creates a token embedding instance by loading embedding vectors from an
-        externally hosted pre-trained token embedding file, such as those
-        of GloVe and FastText. To get all the valid `embedding_name` and
-        `pretrained_file_name`, use `mxnet.text.embedding.TokenEmbedding.
-        get_embedding_and_pretrained_file_names()`.
+        Creates a token embedding instance by loading embedding vectors from an externally hosted
+        pre-trained token embedding file, such as those of GloVe and FastText. To get all the valid
+        `embedding_name` and `pretrained_file_name`, use
+        `mxnet.text.embedding.TokenEmbedding.get_embedding_and_pretrained_file_names()`.
 
 
         Parameters
@@ -365,12 +335,11 @@ def create(embedding_name, **kwargs):
         Returns
         -------
         :class:`~mxnet.text.glossary.TokenEmbedding`:
-            A token embedding instance that loads embedding vectors from an
-            externally hosted pre-trained token embedding file.
+            A token embedding instance that loads embedding vectors from an externally hosted
+            pre-trained token embedding file.
         """
 
-        create_text_embedding = registry.get_create_func(
-            TokenEmbedding, 'token embedding')
+        create_text_embedding = registry.get_create_func(TokenEmbedding, 'token embedding')
         return create_text_embedding(embedding_name, **kwargs)
 
     @classmethod
@@ -386,10 +355,9 @@ def _check_pretrained_file_names(cls, pretrained_file_name):
 
         embedding_name = cls.__name__.lower()
         if pretrained_file_name not in cls.pretrained_file_name_sha1:
-            raise KeyError('Cannot find pretrained file %s for token embedding '
-                           '%s. Valid pretrained files for embedding %s: %s' %
-                           (pretrained_file_name, embedding_name,
-                            embedding_name,
+            raise KeyError('Cannot find pretrained file %s for token embedding %s. Valid '
+                           'pretrained files for embedding %s: %s' %
+                           (pretrained_file_name, embedding_name, embedding_name,
                             ', '.join(cls.pretrained_file_name_sha1.keys())))
 
     @staticmethod
@@ -397,13 +365,12 @@ def get_embedding_and_pretrained_file_names(embedding_name=None):
         """Get valid token embedding names and their pre-trained file names.
 
 
-        To load token embedding vectors from an externally hosted pre-trained
-        token embedding file, such as those of GloVe and FastText, one should
-        use `mxnet.text.embedding.TokenEmbedding.create(embedding_name,
-        pretrained_file_name)`. This method returns all the valid names of
-        `pretrained_file_name` for the specified `embedding_name`. If
-        `embedding_name` is set to None, this method returns all the valid names
-        of `embedding_name` with associated `pretrained_file_name`.
+        To load token embedding vectors from an externally hosted pre-trained token embedding file,
+        such as those of GloVe and FastText, one should use
+        `mxnet.text.embedding.TokenEmbedding.create(embedding_name, pretrained_file_name)`. This
+        method returns all the valid names of `pretrained_file_name` for the specified
+        `embedding_name`. If `embedding_name` is set to None, this method returns all the valid
+        names of `embedding_name` with associated `pretrained_file_name`.
 
 
         Parameters
@@ -415,13 +382,11 @@ def get_embedding_and_pretrained_file_names(embedding_name=None):
         Returns
         -------
         dict or list:
-            A list of all the valid pre-trained token embedding file names
-            (`pretrained_file_name`) for the specified token embedding name
-            (`embedding_name`). If the text embeding name is set to None,
-            returns a dict mapping each valid token embedding name to a list
-            of valid pre-trained files (`pretrained_file_name`). They can be
-            plugged into `mxnet.text.embedding.TokenEmbedding.create(
-            embedding_name, pretrained_file_name)`.
+            A list of all the valid pre-trained token embedding file names (`pretrained_file_name`)
+            for the specified token embedding name (`embedding_name`). If the text embeding name is
+            set to None, returns a dict mapping each valid token embedding name to a list of valid
+            pre-trained files (`pretrained_file_name`). They can be plugged into
+            `mxnet.text.embedding.TokenEmbedding.create(embedding_name, pretrained_file_name)`.
         """
 
         text_embedding_reg = registry.get_registry(TokenEmbedding)
@@ -430,8 +395,8 @@ def get_embedding_and_pretrained_file_names(embedding_name=None):
             if embedding_name not in text_embedding_reg:
                 raise KeyError('Cannot find `embedding_name` %s. Use '
                                '`get_embedding_and_pretrained_file_names('
-                               'embedding_name=None).keys()` to get all the '
-                               'valid embedding names.' % embedding_name)
+                               'embedding_name=None).keys()` to get all the valid embedding '
+                               'names.' % embedding_name)
             return list(text_embedding_reg[
                 embedding_name].pretrained_file_name_sha1.keys())
         else:
@@ -446,10 +411,9 @@ class GloVe(TokenEmbedding):
     """The GloVe word embedding.
 
 
-    GloVe is an unsupervised learning algorithm for obtaining vector
-    representations for words. Training is performed on aggregated global
-    word-word co-occurrence statistics from a corpus, and the resulting
-    representations showcase interesting linear substructures of the word vector
+    GloVe is an unsupervised learning algorithm for obtaining vector representations for words.
+    Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and
+    the resulting representations showcase interesting linear substructures of the word vector
     space. (Source from https://nlp.stanford.edu/projects/glove/)
 
     Reference:
@@ -477,8 +441,7 @@ class GloVe(TokenEmbedding):
     embed_root : str, default os.path.join('~', '.mxnet', 'embeddings')
         The root directory for storing embedding-related files.
     unknown_vec : callback
-        The callback used to initialize the embedding vector for the unknown
-        token.
+        The callback used to initialize the embedding vector for the unknown token.
 
 
     Properties
@@ -486,20 +449,18 @@ class GloVe(TokenEmbedding):
     token_to_idx : dict mapping str to int
         A dict mapping each token to its index integer.
     idx_to_token : list of strs
-        A list of indexed tokens where the list indices and the token indices
-        are aligned.
+        A list of indexed tokens where the list indices and the token indices are aligned.
     unknown_token : hashable object
-        The representation for any unknown token. In other words, any
-        unknown token will be indexed as the same representation.
+        The representation for any unknown token. In other words, any unknown token will be indexed
+        as the same representation.
     reserved_tokens : list of strs or None
         A list of reserved tokens that will always be indexed.
     vec_len : int
         The length of the embedding vector for each token.
     idx_to_vec : mxnet.ndarray.NDArray
-        For all the indexed tokens in this embedding, this NDArray maps each
-        token's index to an embedding vector. The largest valid index maps
-        to the initialized embedding vector for every reserved token, such as an
-        unknown_token token and a padding token.
+        For all the indexed tokens in this embedding, this NDArray maps each token's index to an
+        embedding vector. The largest valid index maps to the initialized embedding vector for every
+        reserved token, such as an unknown_token token and a padding token.
     """
 
     # Map a pre-trained token embedding archive file and its SHA-1 hash.
@@ -522,8 +483,7 @@ def __init__(self, pretrained_file_name='glove.840B.300d.txt',
         GloVe._check_pretrained_file_names(pretrained_file_name)
 
         super(GloVe, self).__init__(**kwargs)
-        pretrained_file_path = GloVe._get_pretrained_file(embedding_root,
-                                                          pretrained_file_name)
+        pretrained_file_path = GloVe._get_pretrained_file(embedding_root, pretrained_file_name)
 
         self._load_embedding(pretrained_file_path, ' ', init_unknown_vec)
 
@@ -533,10 +493,9 @@ class FastText(TokenEmbedding):
     """The fastText word embedding.
 
 
-    FastText is an open-source, free, lightweight library that allows users to
-    learn text representations and text classifiers. It works on standard,
-    generic hardware. Models can later be reduced in size to even fit on mobile
-    devices. (Source from https://fasttext.cc/)
+    FastText is an open-source, free, lightweight library that allows users to learn text
+    representations and text classifiers. It works on standard, generic hardware. Models can later
+    be reduced in size to even fit on mobile devices. (Source from https://fasttext.cc/)
 
     References:
 
@@ -557,8 +516,7 @@ class FastText(TokenEmbedding):
 
     https://fasttext.cc/
 
-    To get the updated URLs to the externally hosted pre-trained token embedding
-    files, visit
+    To get the updated URLs to the externally hosted pre-trained token embedding files, visit
     https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
 
     License for pre-trained embeddings:
@@ -573,8 +531,7 @@ class FastText(TokenEmbedding):
     embed_root : str, default os.path.join('~', '.mxnet', 'embeddings')
         The root directory for storing embedding-related files.
     unknown_vec : callback
-        The callback used to initialize the embedding vector for the unknown
-        token.
+        The callback used to initialize the embedding vector for the unknown token.
 
 
     Properties
@@ -582,20 +539,18 @@ class FastText(TokenEmbedding):
     token_to_idx : dict mapping str to int
         A dict mapping each token to its index integer.
     idx_to_token : list of strs
-        A list of indexed tokens where the list indices and the token indices
-        are aligned.
+        A list of indexed tokens where the list indices and the token indices are aligned.
     unknown_token : hashable object
-        The representation for any unknown token. In other words, any
-        unknown token will be indexed as the same representation.
+        The representation for any unknown token. In other words, any unknown token will be indexed
+        as the same representation.
     reserved_tokens : list of strs or None
         A list of reserved tokens that will always be indexed.
     vec_len : int
         The length of the embedding vector for each token.
     idx_to_vec : mxnet.ndarray.NDArray
-        For all the indexed tokens in this embedding, this NDArray maps each
-        token's index to an embedding vector. The largest valid index maps
-        to the initialized embedding vector for every reserved token, such as an
-        unknown_token token and a padding token.
+        For all the indexed tokens in this embedding, this NDArray maps each token's index to an
+        embedding vector. The largest valid index maps to the initialized embedding vector for every
+        reserved token, such as an unknown_token token and a padding token.
     """
 
     # Map a pre-trained token embedding file and its SHA-1 hash.
@@ -607,8 +562,7 @@ def __init__(self, pretrained_file_name='wiki.simple.vec',
         FastText._check_pretrained_file_names(pretrained_file_name)
 
         super(FastText, self).__init__(**kwargs)
-        pretrained_file_path = FastText._get_pretrained_file(embedding_root,
-                                                             pretrained_file_name)
+        pretrained_file_path = FastText._get_pretrained_file(embedding_root, pretrained_file_name)
 
         self._load_embedding(pretrained_file_path, ' ', init_unknown_vec)
 
@@ -616,15 +570,14 @@ def __init__(self, pretrained_file_name='wiki.simple.vec',
 class CustomEmbedding(TokenEmbedding):
     """User-defined token embedding.
 
-    This is to load embedding vectors from a user-defined pre-trained text
-    embedding file.
+    This is to load embedding vectors from a user-defined pre-trained text embedding file.
 
-    Denote by '<ed>' the argument `elem_delim`. Denote by <v_ij> the j-th
-    element of the token embedding vector for <token_i>, the expected format of
-    a custom pre-trained token embedding file is:
+    Denote by '<ed>' the argument `elem_delim`. Denote by <v_ij> the j-th element of the token
+    embedding vector for <token_i>, the expected format of a custom pre-trained token embedding file
+    is:
 
-    '<token_1><ed><v_11><ed><v_12><ed>...<ed><v_1k>\\\\n<token_2><ed><v_21><ed>
-    <v_22><ed>...<ed><v_2k>\\\\n...'
+    '<token_1><ed><v_11><ed><v_12><ed>...<ed><v_1k>\\\\n<token_2><ed><v_21><ed><v_22><ed>...<ed>
+    <v_2k>\\\\n...'
 
     where k is the length of the embedding vector `vec_len`.
 
@@ -634,11 +587,10 @@ class CustomEmbedding(TokenEmbedding):
     pretrain_file_path : str
         The path to the custom pre-trained token embedding file.
     elem_delim : str, default ' '
-        The delimiter for splitting a token and every embedding vector element
-        value on the same line of the custom pre-trained token embedding file.
+        The delimiter for splitting a token and every embedding vector element value on the same
+        line of the custom pre-trained token embedding file.
     unknown_vec : callback
-        The callback used to initialize the embedding vector for the unknown
-        token.
+        The callback used to initialize the embedding vector for the unknown token.
 
 
     Properties
@@ -646,24 +598,21 @@ class CustomEmbedding(TokenEmbedding):
     token_to_idx : dict mapping str to int
         A dict mapping each token to its index integer.
     idx_to_token : list of strs
-        A list of indexed tokens where the list indices and the token indices
-        are aligned.
+        A list of indexed tokens where the list indices and the token indices are aligned.
     unknown_token : hashable object
-        The representation for any unknown token. In other words, any
-        unknown token will be indexed as the same representation.
+        The representation for any unknown token. In other words, any unknown token will be indexed
+        as the same representation.
     reserved_tokens : list of strs or None
         A list of reserved tokens that will always be indexed.
     vec_len : int
         The length of the embedding vector for each token.
     idx_to_vec : mxnet.ndarray.NDArray
-        For all the indexed tokens in this embedding, this NDArray maps each
-        token's index to an embedding vector. The largest valid index maps
-        to the initialized embedding vector for every reserved token, such as an
-        unknown_token token and a padding token.
+        For all the indexed tokens in this embedding, this NDArray maps each token's index to an
+        embedding vector. The largest valid index maps to the initialized embedding vector for every
+        reserved token, such as an unknown_token token and a padding token.
     """
 
     def __init__(self, pretrained_file_path, elem_delim=' ', encoding='utf8',
                  init_unknown_vec=nd.zeros, **kwargs):
         super(CustomEmbedding, self).__init__(**kwargs)
-        self._load_embedding(pretrained_file_path, elem_delim, init_unknown_vec,
-                             encoding)
+        self._load_embedding(pretrained_file_path, elem_delim, init_unknown_vec, encoding)
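
The embedding.py docstrings above describe the intended workflow: list the hosted pre-trained files, create a TokenEmbedding from one of them, and look up vectors by token. A minimal usage sketch of that workflow (assuming an MXNet build that ships mxnet.contrib.text and network access to fetch the pre-trained file):

    from mxnet.contrib import text

    # Valid pre-trained file names for the 'glove' embedding.
    names = text.embedding.TokenEmbedding.get_embedding_and_pretrained_file_names(
        embedding_name='glove')

    # Create a GloVe embedding from the documented default file.
    glove = text.embedding.TokenEmbedding.create(
        'glove', pretrained_file_name='glove.840B.300d.txt')

    # A list of tokens yields a 2-D NDArray of shape (len(tokens), vec_len).
    vecs = glove.get_vecs_by_tokens(['hello', 'world'])
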
diff --git a/python/mxnet/contrib/text/glossary.py b/python/mxnet/contrib/text/glossary.py
index 4de082b5f8..2fd46a3924 100644
--- a/python/mxnet/contrib/text/glossary.py
+++ b/python/mxnet/contrib/text/glossary.py
@@ -21,53 +21,45 @@
 from __future__ import absolute_import
 from __future__ import print_function
 
+from . import embedding
 from ... import ndarray as nd
-from .embedding import TokenEmbedding
 
 
-class Glossary(TokenEmbedding):
+class Glossary(embedding.TokenEmbedding):
     """Indexing and embedding for text tokens in a glossary.
 
 
-    For each indexed token in a glossary, an embedding vector will be associated
-    with it. Such embedding vectors can be loaded from externally hosted or
-    custom pre-trained token embedding files, such as via instances of
-    :class:`~mxnet.text.embedding.TokenEmbedding`.
+    For each indexed token in a glossary, an embedding vector will be associated with it. Such
+    embedding vectors can be loaded from externally hosted or custom pre-trained token embedding
+    files, such as via instances of :class:`~mxnet.text.embedding.TokenEmbedding`.
 
 
     Parameters
     ----------
     counter : collections.Counter or None, default None
-        Counts text token frequencies in the text data. Its keys will be indexed
-        according to frequency thresholds such as `most_freq_count` and
-        `min_freq`. Keys of `counter`, `unknown_token`, and values of
-        `reserved_tokens` must be of the same hashable type. Examples: str, int,
-        and tuple.
+        Counts text token frequencies in the text data. Its keys will be indexed according to
+        frequency thresholds such as `most_freq_count` and `min_freq`. Keys of `counter`,
+        `unknown_token`, and values of `reserved_tokens` must be of the same hashable type.
+        Examples: str, int, and tuple.
     token_embeddings : instance or list of :class:`~TokenEmbedding`
-        One or multiple pre-trained token embeddings to load. If it is a list of
-        multiple embeddings, these embedding vectors will be concatenated for
-        each token.
+        One or multiple pre-trained token embeddings to load. If it is a list of multiple
+        embeddings, these embedding vectors will be concatenated for each token.
     most_freq_count : None or int, default None
-        The maximum possible number of the most frequent tokens in the keys of
-        `counter` that can be indexed. Note that this argument does not count
-        any token from `reserved_tokens`. If this argument is None or larger
-        than its largest possible value restricted by `counter` and
+        The maximum possible number of the most frequent tokens in the keys of `counter` that can be
+        indexed. Note that this argument does not count any token from `reserved_tokens`. If this
+        argument is None or larger than its largest possible value restricted by `counter` and
         `reserved_tokens`, this argument becomes positive infinity.
     min_freq : int, default 1
-        The minimum frequency required for a token in the keys of `counter` to
-        be indexed.
+        The minimum frequency required for a token in the keys of `counter` to be indexed.
     unknown_token : hashable object, default '<unk>'
-        The representation for any unknown token. In other words, any unknown
-        token will be indexed as the same representation. Keys of `counter`,
-        `unknown_token`, and values of `reserved_tokens` must be of the same
-        hashable type. Examples: str, int, and tuple.
+        The representation for any unknown token. In other words, any unknown token will be indexed
+        as the same representation. Keys of `counter`, `unknown_token`, and values of
+        `reserved_tokens` must be of the same hashable type. Examples: str, int, and tuple.
     reserved_tokens : list of hashable objects or None, default None
-        A list of reserved tokens that will always be indexed, such as special
-        symbols representing padding, beginning of sentence, and end of
-        sentence. It cannot contain `unknown_token`, or duplicate reserved
-        tokens. Keys of `counter`, `unknown_token`, and values of
-        `reserved_tokens` must be of the same hashable type. Examples: str, int,
-        and tuple.
+        A list of reserved tokens that will always be indexed, such as special symbols representing
+        padding, beginning of sentence, and end of sentence. It cannot contain `unknown_token`, or
+        duplicate reserved tokens. Keys of `counter`, `unknown_token`, and values of
+        `reserved_tokens` must be of the same hashable type. Examples: str, int, and tuple.
 
 
     Properties
@@ -75,40 +67,35 @@ class Glossary(TokenEmbedding):
     token_to_idx : dict mapping str to int
         A dict mapping each token to its index integer.
     idx_to_token : list of strs
-        A list of indexed tokens where the list indices and the token indices
-        are aligned.
+        A list of indexed tokens where the list indices and the token indices are aligned.
     unknown_token : hashable object
-        The representation for any unknown token. In other words, any
-        unknown token will be indexed as the same representation.
+        The representation for any unknown token. In other words, any unknown token will be indexed
+        as the same representation.
     reserved_tokens : list of strs or None
         A list of reserved tokens that will always be indexed.
     vec_len : int
         The length of the embedding vector for each token.
     idx_to_vec : mxnet.ndarray.NDArray
-        For all the indexed tokens in this embedding, this NDArray maps each
-        token's index to an embedding vector. The largest valid index maps
-        to the initialized embedding vector for every reserved token, such as an
-        unknown_token token and a padding token.
+        For all the indexed tokens in this embedding, this NDArray maps each token's index to an
+        embedding vector. The largest valid index maps to the initialized embedding vector for every
+        reserved token, such as an unknown_token token and a padding token.
     """
-    def __init__(self, counter, token_embeddings, most_freq_count=None,
-                 min_freq=1, unknown_token='<unk>', reserved_tokens=None):
+    def __init__(self, counter, token_embeddings, most_freq_count=None, min_freq=1,
+                 unknown_token='<unk>', reserved_tokens=None):
 
         if not isinstance(token_embeddings, list):
             token_embeddings = [token_embeddings]
 
         # Sanity checks.
         for embed in token_embeddings:
-            assert isinstance(embed, TokenEmbedding), \
-                'The parameter `token_embeddings` must be an instance or a ' \
-                'list of instances of `mxnet.text.embedding.TextEmbed` ' \
-                'whose embedding vectors will be loaded or ' \
+            assert isinstance(embed, embedding.TokenEmbedding), \
+                'The parameter `token_embeddings` must be an instance or a list of instances ' \
+                'of `mxnet.text.embedding.TextEmbed` whose embedding vectors will be loaded or ' \
                 'concatenated-then-loaded to map to the indexed tokens.'
 
         # Index tokens from keys of `counter` and reserved tokens.
-        super(Glossary, self).__init__(counter=counter,
-                                       most_freq_count=most_freq_count,
-                                       min_freq=min_freq,
-                                       unknown_token=unknown_token,
+        super(Glossary, self).__init__(counter=counter, most_freq_count=most_freq_count,
+                                       min_freq=min_freq, unknown_token=unknown_token,
                                        reserved_tokens=reserved_tokens)
 
         # Set _idx_to_vec so that indices of tokens from keys of `counter` are
@@ -123,9 +110,8 @@ def _set_idx_to_vec_by_embeds(self, token_embeddings):
         ----------
         token_embeddings : an instance or a list of instances of
             :class:`~mxnet.text.embedding.TokenEmbedding`
-            One or multiple pre-trained token embeddings to load. If it is a
-            list of multiple embeddings, these embedding vectors will be
-            concatenated for each token.
+            One or multiple pre-trained token embeddings to load. If it is a list of multiple
+            embeddings, these embedding vectors will be concatenated for each token.
         """
 
         self._vec_len = sum(embed.vec_len for embed in token_embeddings)
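
The Glossary documented above ties a token indexer to one or more token embeddings. A minimal sketch based on its constructor signature (assuming mxnet.contrib.text is available and the fastText 'wiki.simple.vec' file can be downloaded):

    from collections import Counter
    from mxnet.contrib import text

    # Index the counted tokens and attach a pre-trained embedding to them.
    counter = Counter(['life', 'is', 'good', 'life'])
    fasttext = text.embedding.FastText(pretrained_file_name='wiki.simple.vec')
    glossary = text.glossary.Glossary(counter, fasttext, most_freq_count=None, min_freq=1,
                                      unknown_token='<unk>', reserved_tokens=None)

    # Glossary inherits get_vecs_by_tokens from TokenEmbedding.
    vecs = glossary.get_vecs_by_tokens(['life', 'is'])
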
diff --git a/python/mxnet/contrib/text/indexer.py b/python/mxnet/contrib/text/indexer.py
index bed2794b29..409dfb0bb2 100644
--- a/python/mxnet/contrib/text/indexer.py
+++ b/python/mxnet/contrib/text/indexer.py
@@ -22,54 +22,46 @@
 from __future__ import absolute_import
 from __future__ import print_function
 
-from collections import Counter
+import collections
 
-from . import constants as C
+from . import _constants as C
 
 
 class TokenIndexer(object):
     """Indexing for text tokens.
 
 
-    Build indices for the unknown token, reserved tokens, and input counter
-    keys. Indexed tokens can be used by instances of
-    :class:`~mxnet.text.embedding.TokenEmbedding`, such as instances of
+    Build indices for the unknown token, reserved tokens, and input counter keys. Indexed tokens can
+    be used by instances of :class:`~mxnet.text.embedding.TokenEmbedding`, such as instances of
     :class:`~mxnet.text.glossary.Glossary`.
 
 
     Parameters
     ----------
     counter : collections.Counter or None, default None
-        Counts text token frequencies in the text data. Its keys will be indexed
-        according to frequency thresholds such as `most_freq_count` and
-        `min_freq`. Keys of `counter`, `unknown_token`, and values of
-        `reserved_tokens` must be of the same hashable type. Examples: str, int,
-        and tuple.
+        Counts text token frequencies in the text data. Its keys will be indexed according to
+        frequency thresholds such as `most_freq_count` and `min_freq`. Keys of `counter`,
+        `unknown_token`, and values of `reserved_tokens` must be of the same hashable type.
+        Examples: str, int, and tuple.
     most_freq_count : None or int, default None
-        The maximum possible number of the most frequent tokens in the keys of
-        `counter` that can be indexed. Note that this argument does not count
-        any token from `reserved_tokens`. Suppose that there are different
-        keys of `counter` whose frequency are the same, if indexing all of them
-        will exceed this argument value, such keys will be indexed one by one
-        according to their __cmp__() order until the frequency threshold is
-        met. If this argument is None or larger than its largest possible value
-        restricted by `counter` and `reserved_tokens`, this argument has no
-        effect.
+        The maximum possible number of the most frequent tokens in the keys of `counter` that can be
+        indexed. Note that this argument does not count any token from `reserved_tokens`. Suppose
+        that there are different keys of `counter` whose frequency are the same, if indexing all of
+        them will exceed this argument value, such keys will be indexed one by one according to
+        their __cmp__() order until the frequency threshold is met. If this argument is None or
+        larger than its largest possible value restricted by `counter` and `reserved_tokens`, this
+        argument has no effect.
     min_freq : int, default 1
-        The minimum frequency required for a token in the keys of `counter` to
-        be indexed.
+        The minimum frequency required for a token in the keys of `counter` to be indexed.
     unknown_token : hashable object, default '<unk>'
-        The representation for any unknown token. In other words, any unknown
-        token will be indexed as the same representation. Keys of `counter`,
-        `unknown_token`, and values of `reserved_tokens` must be of the same
-        hashable type. Examples: str, int, and tuple.
+        The representation for any unknown token. In other words, any unknown token will be indexed
+        as the same representation. Keys of `counter`, `unknown_token`, and values of
+        `reserved_tokens` must be of the same hashable type. Examples: str, int, and tuple.
     reserved_tokens : list of hashable objects or None, default None
-        A list of reserved tokens that will always be indexed, such as special
-        symbols representing padding, beginning of sentence, and end of
-        sentence. It cannot contain `unknown_token`, or duplicate reserved
-        tokens. Keys of `counter`, `unknown_token`, and values of
-        `reserved_tokens` must be of the same hashable type. Examples: str, int,
-        and tuple.
+        A list of reserved tokens that will always be indexed, such as special symbols representing
+        padding, beginning of sentence, and end of sentence. It cannot contain `unknown_token`, or
+        duplicate reserved tokens. Keys of `counter`, `unknown_token`, and values of
+        `reserved_tokens` must be of the same hashable type. Examples: str, int, and tuple.
 
 
     Properties
@@ -80,14 +72,14 @@ class TokenIndexer(object):
         A list of indexed tokens where the list indices and the token indices
         are aligned.
     unknown_token : hashable object
-        The representation for any unknown token. In other words, any
-        unknown token will be indexed as the same representation.
+        The representation for any unknown token. In other words, any unknown token will be indexed
+        as the same representation.
     reserved_tokens : list of strs or None
         A list of reserved tokens that will always be indexed.
     """
 
-    def __init__(self, counter=None, most_freq_count=None, min_freq=1,
-                 unknown_token='<unk>', reserved_tokens=None):
+    def __init__(self, counter=None, most_freq_count=None, min_freq=1, unknown_token='<unk>',
+                 reserved_tokens=None):
 
         # Sanity checks.
         assert min_freq > 0, '`min_freq` must be set to a positive value.'
@@ -102,11 +94,10 @@ def __init__(self, counter=None, most_freq_count=None, min_freq=1,
         self._index_unknown_and_reserved_tokens(unknown_token, reserved_tokens)
 
         if counter is not None:
-            self._index_counter_keys(counter, unknown_token, reserved_tokens,
-                                     most_freq_count, min_freq)
+            self._index_counter_keys(counter, unknown_token, reserved_tokens, most_freq_count,
+                                     min_freq)
 
-    def _index_unknown_and_reserved_tokens(self, unknown_token,
-                                           reserved_tokens):
+    def _index_unknown_and_reserved_tokens(self, unknown_token, reserved_tokens):
         """Indexes unknown and reserved tokens."""
 
         self._unknown_token = unknown_token
@@ -119,23 +110,21 @@ def _index_unknown_and_reserved_tokens(self, unknown_token,
             self._reserved_tokens = reserved_tokens[:]
             self._idx_to_token.extend(reserved_tokens)
 
-        self._token_to_idx = {token: idx for idx, token in
-                              enumerate(self._idx_to_token)}
+        self._token_to_idx = {token: idx for idx, token in enumerate(self._idx_to_token)}
 
-    def _index_counter_keys(self, counter, unknown_token, reserved_tokens,
-                            most_freq_count, min_freq):
+    def _index_counter_keys(self, counter, unknown_token, reserved_tokens, most_freq_count,
+                            min_freq):
         """Indexes keys of `counter`.
 
 
-        Indexes keys of `counter` according to frequency thresholds such as
-        `most_freq_count` and `min_freq`.
+        Indexes keys of `counter` according to frequency thresholds such as `most_freq_count` and
+        `min_freq`.
         """
 
-        assert isinstance(counter, Counter), \
+        assert isinstance(counter, collections.Counter), \
             '`counter` must be an instance of collections.Counter.'
 
-        unknown_and_reserved_tokens = set(reserved_tokens) \
-            if reserved_tokens is not None else set()
+        unknown_and_reserved_tokens = set(reserved_tokens) if reserved_tokens is not None else set()
         unknown_and_reserved_tokens.add(unknown_token)
 
         token_freqs = sorted(counter.items(), key=lambda x: x[0])
@@ -183,8 +172,7 @@ def to_indices(self, tokens):
         Returns
         -------
         int or list of ints
-            A token index or a list of token indices according to the text
-            indexer.
+            A token index or a list of token indices according to the text indexer.
         """
 
         to_reduce = False
@@ -223,8 +211,7 @@ def to_tokens(self, indices):
         tokens = []
         for idx in indices:
             if not isinstance(idx, int) or idx > max_idx:
-                raise ValueError('Token index %d in the provided `indices` is '
-                                 'invalid.' % idx)
+                raise ValueError('Token index %d in the provided `indices` is invalid.' % idx)
             else:
                 tokens.append(self.idx_to_token[idx])
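
The TokenIndexer above keeps the to_indices/to_tokens round trip that the updated tests exercise. A minimal sketch mirroring those tests (assuming mxnet.contrib.text is available):

    from collections import Counter
    from mxnet.contrib import text

    counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])
    indexer = text.indexer.TokenIndexer(counter, most_freq_count=None, min_freq=1,
                                        unknown_token='<unk>', reserved_tokens=None)

    indexer.to_indices('c')         # 1 (index 0 is reserved for the unknown token)
    indexer.to_indices(['c', 'b'])  # [1, 2]
    indexer.to_tokens(1)            # 'c'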
 
diff --git a/python/mxnet/contrib/text/utils.py b/python/mxnet/contrib/text/utils.py
index 91e1b623ed..cd8ce5b0a2 100644
--- a/python/mxnet/contrib/text/utils.py
+++ b/python/mxnet/contrib/text/utils.py
@@ -21,7 +21,7 @@
 from __future__ import absolute_import
 from __future__ import print_function
 
-from collections import Counter
+import collections
 import re
 
 
@@ -29,8 +29,8 @@ def count_tokens_from_str(source_str, token_delim=' ', seq_delim='\n',
                           to_lower=False, counter_to_update=None):
     """Counts tokens in the specified string.
 
-    For token_delim='<td>' and seq_delim='<sd>', a specified string of two
-    sequences of tokens may look like::
+    For token_delim='<td>' and seq_delim='<sd>', a specified string of two sequences of tokens may
+    look like::
 
     <td>token1<td>token2<td>token3<td><sd><td>token4<td>token5<td><sd>
 
@@ -46,18 +46,16 @@ def count_tokens_from_str(source_str, token_delim=' ', seq_delim='\n',
     to_lower : bool, default False
         Whether to convert the source source_str to the lower case.
     counter_to_update : collections.Counter or None, default None
-        The collections.Counter instance to be updated with the token counts
-        of `source_str`. If None, return a new collections.Counter instance
-        counting tokens from `source_str`.
+        The collections.Counter instance to be updated with the token counts of `source_str`. If
+        None, return a new collections.Counter instance counting tokens from `source_str`.
 
 
     Returns
     -------
     collections.Counter
-        The `counter_to_update` collections.Counter instance after being updated
-        with the token counts of `source_str`. If `counter_to_update` is None,
-        return a new collections.Counter instance counting tokens from
-        `source_str`.
+        The `counter_to_update` collections.Counter instance after being updated with the token
+        counts of `source_str`. If `counter_to_update` is None, return a new collections.Counter
+        instance counting tokens from `source_str`.
 
 
     Examples
@@ -73,7 +71,7 @@ def count_tokens_from_str(source_str, token_delim=' ', seq_delim='\n',
         source_str = [t.lower() for t in source_str]
 
     if counter_to_update is None:
-        return Counter(source_str)
+        return collections.Counter(source_str)
     else:
         counter_to_update.update(source_str)
         return counter_to_update
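
The utils.py hunk above only swaps the direct Counter import for the module-level collections import; count_tokens_from_str keeps the behaviour its docstring describes. A minimal usage sketch (assuming mxnet.contrib.text is available; the counts shown are what the default space and newline delimiters produce):

    from mxnet.contrib import text

    source_str = ' Life is great ! \n life is good . \n'
    counter = text.utils.count_tokens_from_str(source_str, token_delim=' ', seq_delim='\n',
                                               to_lower=True)
    # counter == Counter({'life': 2, 'is': 2, 'great': 1, '!': 1, 'good': 1, '.': 1})
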
diff --git a/tests/python/unittest/test_contrib_text.py b/tests/python/unittest/test_contrib_text.py
index f666888150..99423aa7d5 100644
--- a/tests/python/unittest/test_contrib_text.py
+++ b/tests/python/unittest/test_contrib_text.py
@@ -21,24 +21,17 @@
 from __future__ import print_function
 
 from collections import Counter
-import unittest
 
 from common import assertRaises
 from mxnet import ndarray as nd
 from mxnet.test_utils import *
-from mxnet.contrib.text import utils
-from mxnet.contrib.text.glossary import Glossary
-from mxnet.contrib.text.indexer import TokenIndexer
-from mxnet.contrib.text.embedding import TokenEmbedding, CustomEmbedding
+from mxnet.contrib import text
 
 
 def _get_test_str_of_tokens(token_delim, seq_delim):
-    seq1 = token_delim + token_delim.join(['Life', 'is', 'great', '!']) \
-           + token_delim + seq_delim
-    seq2 = token_delim + token_delim.join(['life', 'is', 'good', '.']) \
-           + token_delim + seq_delim
-    seq3 = token_delim + token_delim.join(['life', "isn't", 'bad', '.']) \
-           + token_delim + seq_delim
+    seq1 = token_delim + token_delim.join(['Life', 'is', 'great', '!']) + token_delim + seq_delim
+    seq2 = token_delim + token_delim.join(['life', 'is', 'good', '.']) + token_delim + seq_delim
+    seq3 = token_delim + token_delim.join(['life', "isn't", 'bad', '.']) + token_delim + seq_delim
     seqs = seq1 + seq2 + seq3
     return seqs
 
@@ -46,33 +39,31 @@ def _get_test_str_of_tokens(token_delim, seq_delim):
 def _test_count_tokens_from_str_with_delims(token_delim, seq_delim):
     source_str = _get_test_str_of_tokens(token_delim, seq_delim)
 
-    cnt1 = utils.count_tokens_from_str(source_str, token_delim, seq_delim,
-                                       to_lower=False)
+    cnt1 = text.utils.count_tokens_from_str(
+        source_str, token_delim, seq_delim, to_lower=False)
     assert cnt1 == Counter(
-        {'is': 2, 'life': 2, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1,
-         "isn't": 1, 'bad': 1})
+        {'is': 2, 'life': 2, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1, "isn't": 1,
+         'bad': 1})
 
-    cnt2 = utils.count_tokens_from_str(source_str, token_delim, seq_delim,
-                                       to_lower=True)
+    cnt2 = text.utils.count_tokens_from_str(
+        source_str, token_delim, seq_delim, to_lower=True)
     assert cnt2 == Counter(
-        {'life': 3, 'is': 2, '.': 2, 'great': 1, '!': 1, 'good': 1,
-         "isn't": 1, 'bad': 1})
+        {'life': 3, 'is': 2, '.': 2, 'great': 1, '!': 1, 'good': 1, "isn't": 1, 'bad': 1})
 
     counter_to_update = Counter({'life': 2})
 
-    cnt3 = utils.count_tokens_from_str(
+    cnt3 = text.utils.count_tokens_from_str(
         source_str, token_delim, seq_delim, to_lower=False,
         counter_to_update=counter_to_update.copy())
     assert cnt3 == Counter(
-        {'is': 2, 'life': 4, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1,
-         "isn't": 1, 'bad': 1})
+        {'is': 2, 'life': 4, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1, "isn't": 1,
+         'bad': 1})
 
-    cnt4 = utils.count_tokens_from_str(
+    cnt4 = text.utils.count_tokens_from_str(
         source_str, token_delim, seq_delim, to_lower=True,
         counter_to_update=counter_to_update.copy())
     assert cnt4 == Counter(
-        {'life': 5, 'is': 2, '.': 2, 'great': 1, '!': 1, 'good': 1,
-         "isn't": 1, 'bad': 1})
+        {'life': 5, 'is': 2, '.': 2, 'great': 1, '!': 1, 'good': 1, "isn't": 1, 'bad': 1})
 
 
 def test_count_tokens_from_str():
@@ -83,8 +74,8 @@ def test_count_tokens_from_str():
 def test_tokens_to_indices():
     counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])
 
-    indexer = TokenIndexer(counter, most_freq_count=None, min_freq=1,
-                           unknown_token='<unk>', reserved_tokens=None)
+    indexer = text.indexer.TokenIndexer(counter, most_freq_count=None, min_freq=1,
+                                        unknown_token='<unk>', reserved_tokens=None)
 
     i1 = indexer.to_indices('c')
     assert i1 == 1
@@ -102,9 +93,8 @@ def test_tokens_to_indices():
 def test_indices_to_tokens():
     counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])
 
-    indexer = TokenIndexer(counter, most_freq_count=None, min_freq=1,
-                           unknown_token='<unknown>', reserved_tokens=None)
-
+    indexer = text.indexer.TokenIndexer(counter, most_freq_count=None, min_freq=1,
+                                        unknown_token='<unknown>', reserved_tokens=None)
     i1 = indexer.to_tokens(1)
     assert i1 == 'c'
 
@@ -119,11 +109,13 @@ def test_indices_to_tokens():
 
     assertRaises(ValueError, indexer.to_tokens, 100)
 
+
 def test_download_embed():
-    @TokenEmbedding.register
-    class Test(TokenEmbedding):
+    @text.embedding.TokenEmbedding.register
+    class Test(text.embedding.TokenEmbedding):
+        # 33 bytes
         pretrained_file_name_sha1 = \
-            {'embedding_test.vec': '29b9a6511cf4b5aae293c44a9ec1365b74f2a2f8'} # 33 bytes
+            {'embedding_test.vec': '29b9a6511cf4b5aae293c44a9ec1365b74f2a2f8'}
         namespace = 'test'
 
         def __init__(self, embedding_root='embeddings',
@@ -133,18 +125,20 @@ def __init__(self, embedding_root='embeddings',
 
             super(Test, self).__init__(**kwargs)
 
-            pretrained_file_path = Test._get_pretrained_file(embedding_root,
-                                                             pretrained_file_name)
+            pretrained_file_path = Test._get_pretrained_file(
+                embedding_root, pretrained_file_name)
 
             self._load_embedding(pretrained_file_path, ' ', init_unknown_vec)
 
-    test_embed = TokenEmbedding.create('test')
+    test_embed = text.embedding.TokenEmbedding.create('test')
     assert test_embed.token_to_idx['hello'] == 1
     assert test_embed.token_to_idx['world'] == 2
-    assert_almost_equal(test_embed.idx_to_vec[1].asnumpy(), (nd.arange(5) + 1).asnumpy())
-    assert_almost_equal(test_embed.idx_to_vec[2].asnumpy(), (nd.arange(5) + 6).asnumpy())
-    assert_almost_equal(test_embed.idx_to_vec[0].asnumpy(), nd.zeros((5,)).asnumpy())
-
+    assert_almost_equal(
+        test_embed.idx_to_vec[1].asnumpy(), (nd.arange(5) + 1).asnumpy())
+    assert_almost_equal(
+        test_embed.idx_to_vec[2].asnumpy(), (nd.arange(5) + 6).asnumpy())
+    assert_almost_equal(
+        test_embed.idx_to_vec[0].asnumpy(), nd.zeros((5,)).asnumpy())
 
 
 def _mk_my_pretrain_file(path, token_delim, pretrain_file):
@@ -162,8 +156,7 @@ def _mk_my_pretrain_file2(path, token_delim, pretrain_file):
     path = os.path.expanduser(path)
     if not os.path.exists(path):
         os.makedirs(path)
-    seq1 = token_delim.join(['a', '0.01', '0.02', '0.03', '0.04',
-                             '0.05']) + '\n'
+    seq1 = token_delim.join(['a', '0.01', '0.02', '0.03', '0.04', '0.05']) + '\n'
     seq2 = token_delim.join(['c', '0.06', '0.07', '0.08', '0.09', '0.1']) + '\n'
     seqs = seq1 + seq2
     with open(os.path.join(path, pretrain_file), 'w') as fout:
@@ -187,12 +180,9 @@ def _mk_my_pretrain_file4(path, token_delim, pretrain_file):
     path = os.path.expanduser(path)
     if not os.path.exists(path):
         os.makedirs(path)
-    seq1 = token_delim.join(['a', '0.01', '0.02', '0.03', '0.04',
-                             '0.05']) + '\n'
-    seq2 = token_delim.join(['c', '0.06', '0.07', '0.08', '0.09',
-                             '0.1']) + '\n'
-    seq3 = token_delim.join(['<unk2>', '0.11', '0.12', '0.13', '0.14',
-                             '0.15']) + '\n'
+    seq1 = token_delim.join(['a', '0.01', '0.02', '0.03', '0.04', '0.05']) + '\n'
+    seq2 = token_delim.join(['c', '0.06', '0.07', '0.08', '0.09', '0.1']) + '\n'
+    seq3 = token_delim.join(['<unk2>', '0.11', '0.12', '0.13', '0.14', '0.15']) + '\n'
     seqs = seq1 + seq2 + seq3
     with open(os.path.join(path, pretrain_file), 'w') as fout:
         fout.write(seqs)
@@ -228,12 +218,11 @@ def test_custom_embed():
     elem_delim = '\t'
     pretrain_file = 'my_pretrain_file.txt'
 
-    _mk_my_pretrain_file(os.path.join(embed_root, embed_name), elem_delim,
-                         pretrain_file)
+    _mk_my_pretrain_file(os.path.join(embed_root, embed_name), elem_delim, pretrain_file)
 
     pretrain_file_path = os.path.join(embed_root, embed_name, pretrain_file)
 
-    my_embed = CustomEmbedding(pretrain_file_path, elem_delim)
+    my_embed = text.embedding.CustomEmbedding(pretrain_file_path, elem_delim)
 
     assert len(my_embed) == 3
     assert my_embed.vec_len == 5
@@ -250,26 +239,24 @@ def test_custom_embed():
     assert_almost_equal(a_vec.asnumpy(), np.array([0.1, 0.2, 0.3, 0.4, 0.5]))
 
     unk_vecs = my_embed.get_vecs_by_tokens(['<un...@unk>', '<un...@unk>'])
-    assert_almost_equal(unk_vecs.asnumpy(),
-                        np.array([[0, 0, 0, 0, 0],
-                                  [0, 0, 0, 0, 0]]))
+    assert_almost_equal(unk_vecs.asnumpy(), np.array([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]))
 
     # Test loaded unknown vectors.
     pretrain_file2 = 'my_pretrain_file2.txt'
     _mk_my_pretrain_file3(os.path.join(embed_root, embed_name), elem_delim,
                           pretrain_file2)
     pretrain_file_path = os.path.join(embed_root, embed_name, pretrain_file2)
-    my_embed2 = CustomEmbedding(pretrain_file_path, elem_delim,
-                                init_unknown_vec=nd.ones,
-                                unknown_token='<unk>')
+    my_embed2 = text.embedding.CustomEmbedding(
+        pretrain_file_path, elem_delim, init_unknown_vec=nd.ones,
+        unknown_token='<unk>')
     unk_vec2 = my_embed2.get_vecs_by_tokens('<unk>')
     assert_almost_equal(unk_vec2.asnumpy(), np.array([1, 1, 1, 1, 1]))
     unk_vec2 = my_embed2.get_vecs_by_tokens('<un...@unk>')
     assert_almost_equal(unk_vec2.asnumpy(), np.array([1, 1, 1, 1, 1]))
 
-    my_embed3 = CustomEmbedding(pretrain_file_path, elem_delim,
-                                init_unknown_vec=nd.ones,
-                                unknown_token='<unk1>')
+    my_embed3 = text.embedding.CustomEmbedding(
+        pretrain_file_path, elem_delim, init_unknown_vec=nd.ones,
+        unknown_token='<unk1>')
     unk_vec3 = my_embed3.get_vecs_by_tokens('<unk1>')
     assert_almost_equal(unk_vec3.asnumpy(), np.array([1.1, 1.2, 1.3, 1.4, 1.5]))
     unk_vec3 = my_embed3.get_vecs_by_tokens('<un...@unk>')
@@ -277,157 +264,147 @@ def test_custom_embed():
 
     # Test error handling.
     invalid_pretrain_file = 'invalid_pretrain_file.txt'
-    _mk_my_invalid_pretrain_file(os.path.join(embed_root, embed_name),
-                                 elem_delim, invalid_pretrain_file)
-    pretrain_file_path = os.path.join(embed_root, embed_name,
-                                      invalid_pretrain_file)
-    assertRaises(AssertionError, CustomEmbedding, pretrain_file_path,
-                 elem_delim)
+    _mk_my_invalid_pretrain_file(os.path.join(embed_root, embed_name), elem_delim,
+                                 invalid_pretrain_file)
+    pretrain_file_path = os.path.join(embed_root, embed_name, invalid_pretrain_file)
+    assertRaises(AssertionError, text.embedding.CustomEmbedding, pretrain_file_path, elem_delim)
 
     invalid_pretrain_file2 = 'invalid_pretrain_file2.txt'
     _mk_my_invalid_pretrain_file2(os.path.join(embed_root, embed_name),
                                   elem_delim, invalid_pretrain_file2)
     pretrain_file_path = os.path.join(embed_root, embed_name,
                                       invalid_pretrain_file2)
-    assertRaises(AssertionError, CustomEmbedding, pretrain_file_path,
-                 elem_delim)
+    assertRaises(AssertionError, text.embedding.CustomEmbedding, pretrain_file_path, elem_delim)
 
 
 def test_token_indexer():
     counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])
 
-    g1 = TokenIndexer(counter, most_freq_count=None, min_freq=1,
-                      unknown_token='<unk>', reserved_tokens=None)
-    assert len(g1) == 5
-    assert g1.token_to_idx == {'<unk>': 0, 'c': 1, 'b': 2, 'a': 3,
-                               'some_word$': 4}
-    assert g1.idx_to_token[1] == 'c'
-    assert g1.unknown_token == '<unk>'
-    assert g1.reserved_tokens is None
-
-    g2 = TokenIndexer(counter, most_freq_count=None, min_freq=2,
-                      unknown_token='<unk>', reserved_tokens=None)
-    assert len(g2) == 3
-    assert g2.token_to_idx == {'<unk>': 0, 'c': 1, 'b': 2}
-    assert g2.idx_to_token[1] == 'c'
-    assert g2.unknown_token == '<unk>'
-    assert g2.reserved_tokens is None
-
-    g3 = TokenIndexer(counter, most_freq_count=None, min_freq=100,
-                      unknown_token='<unk>', reserved_tokens=None)
-    assert len(g3) == 1
-    assert g3.token_to_idx == {'<unk>': 0}
-    assert g3.idx_to_token[0] == '<unk>'
-    assert g3.unknown_token == '<unk>'
-    assert g3.reserved_tokens is None
-
-    g4 = TokenIndexer(counter, most_freq_count=2, min_freq=1,
-                      unknown_token='<unk>', reserved_tokens=None)
-    assert len(g4) == 3
-    assert g4.token_to_idx == {'<unk>': 0, 'c': 1, 'b': 2}
-    assert g4.idx_to_token[1] == 'c'
-    assert g4.unknown_token == '<unk>'
-    assert g4.reserved_tokens is None
-
-    g5 = TokenIndexer(counter, most_freq_count=3, min_freq=1,
-                      unknown_token='<unk>', reserved_tokens=None)
-    assert len(g5) == 4
-    assert g5.token_to_idx == {'<unk>': 0, 'c': 1, 'b': 2, 'a': 3}
-    assert g5.idx_to_token[1] == 'c'
-    assert g5.unknown_token == '<unk>'
-    assert g5.reserved_tokens is None
-
-    g6 = TokenIndexer(counter, most_freq_count=100, min_freq=1,
-                      unknown_token='<unk>', reserved_tokens=None)
-    assert len(g6) == 5
-    assert g6.token_to_idx == {'<unk>': 0, 'c': 1, 'b': 2, 'a': 3,
+    i1 = text.indexer.TokenIndexer(counter, most_freq_count=None, min_freq=1, unknown_token='<unk>',
+                                   reserved_tokens=None)
+    assert len(i1) == 5
+    assert i1.token_to_idx == {'<unk>': 0, 'c': 1, 'b': 2, 'a': 3, 'some_word$': 4}
+    assert i1.idx_to_token[1] == 'c'
+    assert i1.unknown_token == '<unk>'
+    assert i1.reserved_tokens is None
+
+    i2 = text.indexer.TokenIndexer(counter, most_freq_count=None, min_freq=2, unknown_token='<unk>',
+                                   reserved_tokens=None)
+    assert len(i2) == 3
+    assert i2.token_to_idx == {'<unk>': 0, 'c': 1, 'b': 2}
+    assert i2.idx_to_token[1] == 'c'
+    assert i2.unknown_token == '<unk>'
+    assert i2.reserved_tokens is None
+
+    i3 = text.indexer.TokenIndexer(counter, most_freq_count=None, min_freq=100,
+                                   unknown_token='<unk>', reserved_tokens=None)
+    assert len(i3) == 1
+    assert i3.token_to_idx == {'<unk>': 0}
+    assert i3.idx_to_token[0] == '<unk>'
+    assert i3.unknown_token == '<unk>'
+    assert i3.reserved_tokens is None
+
+    i4 = text.indexer.TokenIndexer(counter, most_freq_count=2, min_freq=1, unknown_token='<unk>',
+                                   reserved_tokens=None)
+    assert len(i4) == 3
+    assert i4.token_to_idx == {'<unk>': 0, 'c': 1, 'b': 2}
+    assert i4.idx_to_token[1] == 'c'
+    assert i4.unknown_token == '<unk>'
+    assert i4.reserved_tokens is None
+
+    i5 = text.indexer.TokenIndexer(counter, most_freq_count=3, min_freq=1, unknown_token='<unk>',
+                                   reserved_tokens=None)
+    assert len(i5) == 4
+    assert i5.token_to_idx == {'<unk>': 0, 'c': 1, 'b': 2, 'a': 3}
+    assert i5.idx_to_token[1] == 'c'
+    assert i5.unknown_token == '<unk>'
+    assert i5.reserved_tokens is None
+
+    i6 = text.indexer.TokenIndexer(counter, most_freq_count=100, min_freq=1, unknown_token='<unk>',
+                                   reserved_tokens=None)
+    assert len(i6) == 5
+    assert i6.token_to_idx == {'<unk>': 0, 'c': 1, 'b': 2, 'a': 3,
                                'some_word$': 4}
-    assert g6.idx_to_token[1] == 'c'
-    assert g6.unknown_token == '<unk>'
-    assert g6.reserved_tokens is None
-
-    g7 = TokenIndexer(counter, most_freq_count=1, min_freq=2,
-                      unknown_token='<unk>', reserved_tokens=None)
-    assert len(g7) == 2
-    assert g7.token_to_idx == {'<unk>': 0, 'c': 1}
-    assert g7.idx_to_token[1] == 'c'
-    assert g7.unknown_token == '<unk>'
-    assert g7.reserved_tokens is None
-
-    assertRaises(AssertionError, TokenIndexer, counter, most_freq_count=None,
-                 min_freq=0, unknown_token='<unknown>',
-                 reserved_tokens=['b'])
-
-    assertRaises(AssertionError, TokenIndexer, counter, most_freq_count=None,
-                 min_freq=1, unknown_token='<unknown>',
-                 reserved_tokens=['b', 'b'])
-
-    assertRaises(AssertionError, TokenIndexer, counter, most_freq_count=None,
-                 min_freq=1, unknown_token='<unknown>',
-                 reserved_tokens=['b', '<unknown>'])
-
-    g8 = TokenIndexer(counter, most_freq_count=None, min_freq=1,
-                      unknown_token='<unknown>', reserved_tokens=['b'])
-    assert len(g8) == 5
-    assert g8.token_to_idx == {'<unknown>': 0, 'b': 1, 'c': 2, 'a': 3,
-                               'some_word$': 4}
-    assert g8.idx_to_token[1] == 'b'
-    assert g8.unknown_token == '<unknown>'
-    assert g8.reserved_tokens == ['b']
-
-    g9 = TokenIndexer(counter, most_freq_count=None, min_freq=2,
-                      unknown_token='<unk>', reserved_tokens=['b', 'a'])
-    assert len(g9) == 4
-    assert g9.token_to_idx == {'<unk>': 0, 'b': 1, 'a': 2, 'c': 3}
-    assert g9.idx_to_token[1] == 'b'
-    assert g9.unknown_token == '<unk>'
-    assert g9.reserved_tokens == ['b', 'a']
-
-    g10 = TokenIndexer(counter, most_freq_count=None, min_freq=100,
-                       unknown_token='<unk>', reserved_tokens=['b', 'c'])
-    assert len(g10) == 3
-    assert g10.token_to_idx == {'<unk>': 0, 'b': 1, 'c': 2}
-    assert g10.idx_to_token[1] == 'b'
-    assert g10.unknown_token == '<unk>'
-    assert g10.reserved_tokens == ['b', 'c']
-
-    g11 = TokenIndexer(counter, most_freq_count=1, min_freq=2,
-                       unknown_token='<unk>', reserved_tokens=['<pad>', 'b'])
-    assert len(g11) == 4
-    assert g11.token_to_idx == {'<unk>': 0, '<pad>': 1, 'b': 2, 'c': 3}
-    assert g11.idx_to_token[1] == '<pad>'
-    assert g11.unknown_token == '<unk>'
-    assert g11.reserved_tokens == ['<pad>', 'b']
-
-    g12 = TokenIndexer(counter, most_freq_count=None, min_freq=2,
-                       unknown_token='b', reserved_tokens=['<pad>'])
-    assert len(g12) == 3
-    assert g12.token_to_idx == {'b': 0, '<pad>': 1, 'c': 2}
-    assert g12.idx_to_token[1] == '<pad>'
-    assert g12.unknown_token == 'b'
-    assert g12.reserved_tokens == ['<pad>']
-
-    g13 = TokenIndexer(counter, most_freq_count=None, min_freq=2,
-                       unknown_token='a', reserved_tokens=['<pad>'])
-    assert len(g13) == 4
-    assert g13.token_to_idx == {'a': 0, '<pad>': 1, 'c': 2, 'b': 3}
-    assert g13.idx_to_token[1] == '<pad>'
-    assert g13.unknown_token == 'a'
-    assert g13.reserved_tokens == ['<pad>']
-
-    counter_tuple = Counter([('a', 'a'), ('b', 'b'), ('b', 'b'),
-                             ('c', 'c'), ('c', 'c'), ('c', 'c'),
+    assert i6.idx_to_token[1] == 'c'
+    assert i6.unknown_token == '<unk>'
+    assert i6.reserved_tokens is None
+
+    i7 = text.indexer.TokenIndexer(counter, most_freq_count=1, min_freq=2, unknown_token='<unk>',
+                                   reserved_tokens=None)
+    assert len(i7) == 2
+    assert i7.token_to_idx == {'<unk>': 0, 'c': 1}
+    assert i7.idx_to_token[1] == 'c'
+    assert i7.unknown_token == '<unk>'
+    assert i7.reserved_tokens is None
+
+    assertRaises(AssertionError, text.indexer.TokenIndexer, counter, most_freq_count=None,
+                 min_freq=0, unknown_token='<unknown>', reserved_tokens=['b'])
+
+    assertRaises(AssertionError, text.indexer.TokenIndexer, counter, most_freq_count=None,
+                 min_freq=1, unknown_token='<unknown>', reserved_tokens=['b', 'b'])
+
+    assertRaises(AssertionError, text.indexer.TokenIndexer, counter, most_freq_count=None,
+                 min_freq=1, unknown_token='<unknown>', reserved_tokens=['b', '<unknown>'])
+
+    i8 = text.indexer.TokenIndexer(counter, most_freq_count=None, min_freq=1,
+                                   unknown_token='<unknown>', reserved_tokens=['b'])
+    assert len(i8) == 5
+    assert i8.token_to_idx == {'<unknown>': 0, 'b': 1, 'c': 2, 'a': 3, 'some_word$': 4}
+    assert i8.idx_to_token[1] == 'b'
+    assert i8.unknown_token == '<unknown>'
+    assert i8.reserved_tokens == ['b']
+
+    i9 = text.indexer.TokenIndexer(counter, most_freq_count=None, min_freq=2, unknown_token='<unk>',
+                                   reserved_tokens=['b', 'a'])
+    assert len(i9) == 4
+    assert i9.token_to_idx == {'<unk>': 0, 'b': 1, 'a': 2, 'c': 3}
+    assert i9.idx_to_token[1] == 'b'
+    assert i9.unknown_token == '<unk>'
+    assert i9.reserved_tokens == ['b', 'a']
+
+    i10 = text.indexer.TokenIndexer(counter, most_freq_count=None, min_freq=100,
+                                    unknown_token='<unk>', reserved_tokens=['b', 'c'])
+    assert len(i10) == 3
+    assert i10.token_to_idx == {'<unk>': 0, 'b': 1, 'c': 2}
+    assert i10.idx_to_token[1] == 'b'
+    assert i10.unknown_token == '<unk>'
+    assert i10.reserved_tokens == ['b', 'c']
+
+    i11 = text.indexer.TokenIndexer(counter, most_freq_count=1, min_freq=2, unknown_token='<unk>',
+                                    reserved_tokens=['<pad>', 'b'])
+    assert len(i11) == 4
+    assert i11.token_to_idx == {'<unk>': 0, '<pad>': 1, 'b': 2, 'c': 3}
+    assert i11.idx_to_token[1] == '<pad>'
+    assert i11.unknown_token == '<unk>'
+    assert i11.reserved_tokens == ['<pad>', 'b']
+
+    i12 = text.indexer.TokenIndexer(counter, most_freq_count=None, min_freq=2, unknown_token='b',
+                                    reserved_tokens=['<pad>'])
+    assert len(i12) == 3
+    assert i12.token_to_idx == {'b': 0, '<pad>': 1, 'c': 2}
+    assert i12.idx_to_token[1] == '<pad>'
+    assert i12.unknown_token == 'b'
+    assert i12.reserved_tokens == ['<pad>']
+
+    i13 = text.indexer.TokenIndexer(counter, most_freq_count=None, min_freq=2, unknown_token='a',
+                                    reserved_tokens=['<pad>'])
+    assert len(i13) == 4
+    assert i13.token_to_idx == {'a': 0, '<pad>': 1, 'c': 2, 'b': 3}
+    assert i13.idx_to_token[1] == '<pad>'
+    assert i13.unknown_token == 'a'
+    assert i13.reserved_tokens == ['<pad>']
+
+    counter_tuple = Counter([('a', 'a'), ('b', 'b'), ('b', 'b'), ('c', 'c'), ('c', 'c'), ('c', 'c'),
                              ('some_word$', 'some_word$')])
 
-    g14 = TokenIndexer(counter_tuple, most_freq_count=None, min_freq=1,
-                       unknown_token=('<unk>', '<unk>'), reserved_tokens=None)
-    assert len(g14) == 5
-    assert g14.token_to_idx == {('<unk>', '<unk>'): 0, ('c', 'c'): 1,
-                                ('b', 'b'): 2, ('a', 'a'): 3,
+    i14 = text.indexer.TokenIndexer(counter_tuple, most_freq_count=None, min_freq=1,
+                                    unknown_token=('<unk>', '<unk>'), reserved_tokens=None)
+    assert len(i14) == 5
+    assert i14.token_to_idx == {('<unk>', '<unk>'): 0, ('c', 'c'): 1, ('b', 'b'): 2, ('a', 'a'): 3,
                                 ('some_word$', 'some_word$'): 4}
-    assert g14.idx_to_token[1] == ('c', 'c')
-    assert g14.unknown_token == ('<unk>', '<unk>')
-    assert g14.reserved_tokens is None
+    assert i14.idx_to_token[1] == ('c', 'c')
+    assert i14.unknown_token == ('<unk>', '<unk>')
+    assert i14.reserved_tokens is None
 
 
 def test_glossary_with_one_embed():
@@ -436,21 +413,19 @@ def test_glossary_with_one_embed():
     elem_delim = '\t'
     pretrain_file = 'my_pretrain_file1.txt'
 
-    _mk_my_pretrain_file(os.path.join(embed_root, embed_name), elem_delim,
-                         pretrain_file)
+    _mk_my_pretrain_file(os.path.join(embed_root, embed_name), elem_delim, pretrain_file)
 
     pretrain_file_path = os.path.join(embed_root, embed_name, pretrain_file)
 
-    my_embed = CustomEmbedding(pretrain_file_path, elem_delim,
-                               init_unknown_vec=nd.ones)
+    my_embed = text.embedding.CustomEmbedding(pretrain_file_path, elem_delim,
+                                              init_unknown_vec=nd.ones)
 
     counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])
 
-    g1 = Glossary(counter, my_embed, most_freq_count=None, min_freq=1,
-                  unknown_token='<unk>', reserved_tokens=['<pad>'])
+    g1 = text.glossary.Glossary(counter, my_embed, most_freq_count=None, min_freq=1,
+                                unknown_token='<unk>', reserved_tokens=['<pad>'])
 
-    assert g1.token_to_idx == {'<unk>': 0, '<pad>': 1, 'c': 2, 'b': 3, 'a': 4,
-                               'some_word$': 5}
+    assert g1.token_to_idx == {'<unk>': 0, '<pad>': 1, 'c': 2, 'b': 3, 'a': 4, 'some_word$': 5}
     assert g1.idx_to_token == ['<unk>', '<pad>', 'c', 'b', 'a', 'some_word$']
 
     assert_almost_equal(g1.idx_to_vec.asnumpy(),
@@ -488,8 +463,7 @@ def test_glossary_with_one_embed():
                                   [0.6, 0.7, 0.8, 0.9, 1]])
                         )
 
-    assert_almost_equal(g1.get_vecs_by_tokens(['A', 'b'],
-                                              lower_case_backup=True).asnumpy(),
+    assert_almost_equal(g1.get_vecs_by_tokens(['A', 'b'], lower_case_backup=True).asnumpy(),
                         np.array([[0.1, 0.2, 0.3, 0.4, 0.5],
                                   [0.6, 0.7, 0.8, 0.9, 1]])
                         )
@@ -508,18 +482,14 @@ def test_glossary_with_one_embed():
                                   [1, 1, 1, 1, 1]])
                         )
 
-    assertRaises(ValueError, g1.update_token_vectors, 'unknown$$$',
-                 nd.array([0, 0, 0, 0, 0]))
+    assertRaises(ValueError, g1.update_token_vectors, 'unknown$$$', nd.array([0, 0, 0, 0, 0]))
 
     assertRaises(AssertionError, g1.update_token_vectors, '<unk>',
                  nd.array([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]))
 
-    assertRaises(AssertionError, g1.update_token_vectors, '<unk>',
-                 nd.array([0]))
+    assertRaises(AssertionError, g1.update_token_vectors, '<unk>', nd.array([0]))
 
-    g1.update_token_vectors(['<unk>'],
-                            nd.array([0, 0, 0, 0, 0])
-                            )
+    g1.update_token_vectors(['<unk>'], nd.array([0, 0, 0, 0, 0]))
     assert_almost_equal(g1.idx_to_vec.asnumpy(),
                         np.array([[0, 0, 0, 0, 0],
                                   [1, 1, 1, 1, 1],
@@ -528,9 +498,7 @@ def test_glossary_with_one_embed():
                                   [2, 2, 2, 2, 2],
                                   [1, 1, 1, 1, 1]])
                         )
-    g1.update_token_vectors(['<unk>'],
-                            nd.array([[10, 10, 10, 10, 10]])
-                            )
+    g1.update_token_vectors(['<unk>'], nd.array([[10, 10, 10, 10, 10]]))
     assert_almost_equal(g1.idx_to_vec.asnumpy(),
                         np.array([[10, 10, 10, 10, 10],
                                   [1, 1, 1, 1, 1],
@@ -539,9 +507,7 @@ def test_glossary_with_one_embed():
                                   [2, 2, 2, 2, 2],
                                   [1, 1, 1, 1, 1]])
                         )
-    g1.update_token_vectors('<unk>',
-                            nd.array([0, 0, 0, 0, 0])
-                            )
+    g1.update_token_vectors('<unk>', nd.array([0, 0, 0, 0, 0]))
     assert_almost_equal(g1.idx_to_vec.asnumpy(),
                         np.array([[0, 0, 0, 0, 0],
                                   [1, 1, 1, 1, 1],
@@ -550,9 +516,7 @@ def test_glossary_with_one_embed():
                                   [2, 2, 2, 2, 2],
                                   [1, 1, 1, 1, 1]])
                         )
-    g1.update_token_vectors('<unk>',
-                            nd.array([[10, 10, 10, 10, 10]])
-                            )
+    g1.update_token_vectors('<unk>', nd.array([[10, 10, 10, 10, 10]]))
     assert_almost_equal(g1.idx_to_vec.asnumpy(),
                         np.array([[10, 10, 10, 10, 10],
                                   [1, 1, 1, 1, 1],
@@ -570,25 +534,22 @@ def test_glossary_with_two_embeds():
     pretrain_file1 = 'my_pretrain_file1.txt'
     pretrain_file2 = 'my_pretrain_file2.txt'
 
-    _mk_my_pretrain_file(os.path.join(embed_root, embed_name), elem_delim,
-                         pretrain_file1)
-    _mk_my_pretrain_file2(os.path.join(embed_root, embed_name), elem_delim,
-                          pretrain_file2)
+    _mk_my_pretrain_file(os.path.join(embed_root, embed_name), elem_delim, pretrain_file1)
+    _mk_my_pretrain_file2(os.path.join(embed_root, embed_name), elem_delim, pretrain_file2)
 
     pretrain_file_path1 = os.path.join(embed_root, embed_name, pretrain_file1)
     pretrain_file_path2 = os.path.join(embed_root, embed_name, pretrain_file2)
 
-    my_embed1 = CustomEmbedding(pretrain_file_path1, elem_delim,
-                                init_unknown_vec=nd.ones)
-    my_embed2 = CustomEmbedding(pretrain_file_path2, elem_delim)
+    my_embed1 = text.embedding.CustomEmbedding(pretrain_file_path1, elem_delim,
+                                               init_unknown_vec=nd.ones)
+    my_embed2 = text.embedding.CustomEmbedding(pretrain_file_path2, elem_delim)
 
     counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])
 
-    g1 = Glossary(counter, [my_embed1, my_embed2], most_freq_count=None,
-                  min_freq=1, unknown_token='<unk>', reserved_tokens=None)
+    g1 = text.glossary.Glossary(counter, [my_embed1, my_embed2], most_freq_count=None, min_freq=1,
+                                unknown_token='<unk>', reserved_tokens=None)
 
-    assert g1.token_to_idx == {'<unk>': 0, 'c': 1, 'b': 2, 'a': 3,
-                               'some_word$': 4}
+    assert g1.token_to_idx == {'<unk>': 0, 'c': 1, 'b': 2, 'a': 3, 'some_word$': 4}
     assert g1.idx_to_token == ['<unk>', 'c', 'b', 'a', 'some_word$']
 
     assert_almost_equal(g1.idx_to_vec.asnumpy(),
@@ -627,22 +588,19 @@ def test_glossary_with_two_embeds():
     pretrain_file3 = 'my_pretrain_file3.txt'
     pretrain_file4 = 'my_pretrain_file4.txt'
 
-    _mk_my_pretrain_file3(os.path.join(embed_root, embed_name), elem_delim,
-                          pretrain_file3)
-    _mk_my_pretrain_file4(os.path.join(embed_root, embed_name), elem_delim,
-                          pretrain_file4)
+    _mk_my_pretrain_file3(os.path.join(embed_root, embed_name), elem_delim, pretrain_file3)
+    _mk_my_pretrain_file4(os.path.join(embed_root, embed_name), elem_delim, pretrain_file4)
 
     pretrain_file_path3 = os.path.join(embed_root, embed_name, pretrain_file3)
     pretrain_file_path4 = os.path.join(embed_root, embed_name, pretrain_file4)
 
-    my_embed3 = CustomEmbedding(pretrain_file_path3, elem_delim,
-                                init_unknown_vec=nd.ones,
-                                unknown_token='<unk1>')
-    my_embed4 = CustomEmbedding(pretrain_file_path4, elem_delim,
-                                unknown_token='<unk2>')
+    my_embed3 = text.embedding.CustomEmbedding(pretrain_file_path3, elem_delim,
+                                               init_unknown_vec=nd.ones, unknown_token='<unk1>')
+    my_embed4 = text.embedding.CustomEmbedding(pretrain_file_path4, elem_delim,
+                                               unknown_token='<unk2>')
 
-    g2 = Glossary(counter, [my_embed3, my_embed4], most_freq_count=None,
-                  min_freq=1, unknown_token='<unk>', reserved_tokens=None)
+    g2 = text.glossary.Glossary(counter, [my_embed3, my_embed4], most_freq_count=None, min_freq=1,
+                                unknown_token='<unk>', reserved_tokens=None)
     assert_almost_equal(g2.idx_to_vec.asnumpy(),
                         np.array([[1.1, 1.2, 1.3, 1.4, 1.5,
                                    0.11, 0.12, 0.13, 0.14, 0.15],
@@ -656,8 +614,8 @@ def test_glossary_with_two_embeds():
                                    0.11, 0.12, 0.13, 0.14, 0.15]])
                         )
 
-    g3 = Glossary(counter, [my_embed3, my_embed4], most_freq_count=None,
-                  min_freq=1, unknown_token='<unk1>', reserved_tokens=None)
+    g3 = text.glossary.Glossary(counter, [my_embed3, my_embed4], most_freq_count=None, min_freq=1,
+                                unknown_token='<unk1>', reserved_tokens=None)
     assert_almost_equal(g3.idx_to_vec.asnumpy(),
                         np.array([[1.1, 1.2, 1.3, 1.4, 1.5,
                                    0.11, 0.12, 0.13, 0.14, 0.15],
@@ -671,8 +629,8 @@ def test_glossary_with_two_embeds():
                                    0.11, 0.12, 0.13, 0.14, 0.15]])
                         )
 
-    g4 = Glossary(counter, [my_embed3, my_embed4], most_freq_count=None,
-                  min_freq=1, unknown_token='<unk2>', reserved_tokens=None)
+    g4 = text.glossary.Glossary(counter, [my_embed3, my_embed4], most_freq_count=None, min_freq=1,
+                                unknown_token='<unk2>', reserved_tokens=None)
     assert_almost_equal(g4.idx_to_vec.asnumpy(),
                         np.array([[1.1, 1.2, 1.3, 1.4, 1.5,
                                    0.11, 0.12, 0.13, 0.14, 0.15],
@@ -688,8 +646,8 @@ def test_glossary_with_two_embeds():
 
     counter2 = Counter(['b', 'b', 'c', 'c', 'c', 'some_word$'])
 
-    g5 = Glossary(counter2, [my_embed3, my_embed4], most_freq_count=None,
-                  min_freq=1, unknown_token='a', reserved_tokens=None)
+    g5 = text.glossary.Glossary(counter2, [my_embed3, my_embed4], most_freq_count=None, min_freq=1,
+                                unknown_token='a', reserved_tokens=None)
     assert g5.token_to_idx == {'a': 0, 'c': 1, 'b': 2, 'some_word$': 3}
     assert g5.idx_to_token == ['a', 'c', 'b', 'some_word$']
     assert_almost_equal(g5.idx_to_vec.asnumpy(),
@@ -705,20 +663,19 @@ def test_glossary_with_two_embeds():
 
 
 def test_get_embedding_names_and_pretrain_files():
-    assert len(TokenEmbedding.get_embedding_and_pretrained_file_names(
+    assert len(text.embedding.TokenEmbedding.get_embedding_and_pretrained_file_names(
         embedding_name='fasttext')) == 294
 
-    assert len(TokenEmbedding.get_embedding_and_pretrained_file_names(
+    assert len(text.embedding.TokenEmbedding.get_embedding_and_pretrained_file_names(
         embedding_name='glove')) == 10
 
-    reg = TokenEmbedding.get_embedding_and_pretrained_file_names(
+    reg = text.embedding.TokenEmbedding.get_embedding_and_pretrained_file_names(
         embedding_name=None)
 
     assert len(reg['glove']) == 10
     assert len(reg['fasttext']) == 294
 
-    assertRaises(KeyError,
-                 TokenEmbedding.get_embedding_and_pretrained_file_names,
+    assertRaises(KeyError, text.embedding.TokenEmbedding.get_embedding_and_pretrained_file_names,
                  'unknown$$')
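
As a summary of the import convention the test changes above settle on, the sketch below shows the
equivalent user-facing calls through the mxnet.contrib.text package. The pre-trained file path and
the token counts are placeholders rather than values from the patch, and the embedding file is
assumed to exist locally in the custom format (one token followed by tab-separated vector elements
per line)::

    from collections import Counter

    from mxnet import ndarray as nd
    from mxnet.contrib import text

    # Build an indexer from token counts; index 0 is reserved for the unknown token.
    counter = Counter(['a', 'b', 'b', 'c', 'c', 'c'])
    indexer = text.indexer.TokenIndexer(counter, most_freq_count=None, min_freq=1,
                                        unknown_token='<unk>', reserved_tokens=None)

    # Load a custom pre-trained embedding file (placeholder path).
    my_embed = text.embedding.CustomEmbedding('my_pretrain_file.txt', '\t',
                                              init_unknown_vec=nd.zeros)

    # Attach the embedding vectors to the indexed vocabulary.
    glossary = text.glossary.Glossary(counter, my_embed, most_freq_count=None, min_freq=1,
                                      unknown_token='<unk>', reserved_tokens=None)
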
 
 


 
