Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/03/04 01:08:59 UTC

[GitHub] eric-haibin-lin closed pull request #9983: [WIP] language model with google's billion words dataset

URL: https://github.com/apache/incubator-mxnet/pull/9983
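
The diff below adds a training log from the sparse NCE language model example
(example/sparse/nce_language_model/) run on the Google Billion Words data. Each
"Batch" line in the log reports the running average cross-entropy loss together
with its perplexity; the ppl column is simply exp(loss). A minimal sketch of that
relationship (illustration only, not code from this PR):

    import math

    # First logged values in the file below: "loss 7.5901234, ppl 1978.5577269"
    loss = 7.5901234
    ppl = math.exp(loss)   # ~1978.56, matching the logged perplexity
    print(ppl)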

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
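
One note on the log before the diff itself: it opens with a UserWarning that the
optimizer was created manually outside the Module with rescale_grad=0.25 instead
of 1.0/batch_size/num_workers = 0.001953125. A minimal sketch of how that value
would normally be chosen when creating the optimizer by hand; plain SGD is assumed
purely for illustration, and this is not code from the PR (train.py may configure
a different optimizer):

    import mxnet as mx

    batch_size = 128     # from the Namespace line in the log
    num_workers = 4      # with batch_size=128 this reproduces the 0.001953125 the warning expects
    optimizer = mx.optimizer.create(
        'sgd',
        learning_rate=0.05,                            # lr from the Namespace line
        rescale_grad=1.0 / batch_size / num_workers,   # matches the value the warning asks for
    )
    # then, as in train.py:105:
    # module.init_optimizer(optimizer=optimizer, kvstore=kvstore)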

diff --git a/example/sparse/nce_language_model/7d1b5d.log b/example/sparse/nce_language_model/7d1b5d.log
new file mode 100644
index 00000000000..011483e618f
--- /dev/null
+++ b/example/sparse/nce_language_model/7d1b5d.log
@@ -0,0 +1,2494 @@
+2018-03-03 14:52:11,361 Namespace(batch_size=128, bptt=20, checkpoint_dir='./checkpoint/', checkpoint_interval=1, clip=1.0, data='/home/ubuntu/gbw/training-monolingual.tokenized.shuffled/*', dense=False, dropout=0.01, emsize=512, epochs=10, eps=0.0001, gpus='0,1,2,3', k=8192, kvstore='device', load_epoch=-1, log_interval=1000, lr=0.05, nhid=2048, nlayers=1, num_proj=512, per_ctx_clip=True, profile=False, rescale_embed=True, seed=1, vocab='./data/1b_word_vocab.txt', wd=0.0)
+train.py:105: UserWarning: Optimizer created manually outside Module but rescale_grad is not normalized to 1.0/batch_size/num_workers (0.25 vs. 0.001953125). Is this intended?
+  module.init_optimizer(optimizer=optimizer, kvstore=kvstore)
+2018-03-03 14:52:22,739 Training started ... 
+/home/ubuntu/tf/python/mxnet/ndarray/ndarray.py:1905: RuntimeWarning: You are attempting to copy an array to itself
+  warnings.warn('You are attempting to copy an array to itself', RuntimeWarning)
+2018-03-03 14:53:30,871 Iter[0] Batch [1000]	Speed: 163682.91 samples/sec
+2018-03-03 14:53:30,871 Iter[0] Batch [1000] 	loss 7.5901234, ppl 1978.5577269
+2018-03-03 14:54:33,532 Iter[0] Batch [2000]	Speed: 163418.56 samples/sec
+2018-03-03 14:54:33,532 Iter[0] Batch [2000] 	loss 6.3533074, ppl 574.3893186
+2018-03-03 14:55:36,787 Iter[0] Batch [3000]	Speed: 161884.90 samples/sec
+2018-03-03 14:55:36,787 Iter[0] Batch [3000] 	loss 6.0736285, ppl 434.2535220
+2018-03-03 14:56:40,624 Iter[0] Batch [4000]	Speed: 160407.99 samples/sec
+2018-03-03 14:56:40,625 Iter[0] Batch [4000] 	loss 5.9148180, ppl 370.4868564
+2018-03-03 14:57:42,336 Iter[0] Batch [5000]	Speed: 165932.27 samples/sec
+2018-03-03 14:57:42,337 Iter[0] Batch [5000] 	loss 5.8100621, ppl 333.6398472
+2018-03-03 14:58:44,958 Iter[0] Batch [6000]	Speed: 163522.48 samples/sec
+2018-03-03 14:58:44,958 Iter[0] Batch [6000] 	loss 5.7320094, ppl 308.5887163
+2018-03-03 14:59:47,093 Iter[0] Batch [7000]	Speed: 164803.01 samples/sec
+2018-03-03 14:59:47,093 Iter[0] Batch [7000] 	loss 5.6664082, ppl 288.9946579
+2018-03-03 15:00:50,267 Iter[0] Batch [8000]	Speed: 162093.03 samples/sec
+2018-03-03 15:00:50,267 Iter[0] Batch [8000] 	loss 5.6088469, ppl 272.8294501
+2018-03-03 15:01:52,244 Iter[0] Batch [9000]	Speed: 165223.43 samples/sec
+2018-03-03 15:01:52,244 Iter[0] Batch [9000] 	loss 5.5672785, ppl 261.7208599
+2018-03-03 15:02:53,990 Iter[0] Batch [10000]	Speed: 165839.84 samples/sec
+2018-03-03 15:02:53,990 Iter[0] Batch [10000] 	loss 5.5300129, ppl 252.1471613
+2018-03-03 15:03:55,990 Iter[0] Batch [11000]	Speed: 165163.09 samples/sec
+2018-03-03 15:03:55,990 Iter[0] Batch [11000] 	loss 5.4951867, ppl 243.5169911
+2018-03-03 15:04:58,855 Iter[0] Batch [12000]	Speed: 162888.87 samples/sec
+2018-03-03 15:04:58,855 Iter[0] Batch [12000] 	loss 5.4657656, ppl 236.4568231
+2018-03-03 15:06:00,991 Iter[0] Batch [13000]	Speed: 164800.24 samples/sec
+2018-03-03 15:06:00,991 Iter[0] Batch [13000] 	loss 5.4375758, ppl 229.8842178
+2018-03-03 15:07:02,667 Iter[0] Batch [14000]	Speed: 166029.50 samples/sec
+2018-03-03 15:07:02,667 Iter[0] Batch [14000] 	loss 5.4132848, ppl 224.3673728
+2018-03-03 15:08:06,029 Iter[0] Batch [15000]	Speed: 161610.85 samples/sec
+2018-03-03 15:08:06,029 Iter[0] Batch [15000] 	loss 5.3914773, ppl 219.5274636
+2018-03-03 15:09:07,671 Iter[0] Batch [16000]	Speed: 166120.38 samples/sec
+2018-03-03 15:09:07,671 Iter[0] Batch [16000] 	loss 5.3701520, ppl 214.8955193
+2018-03-03 15:10:09,787 Iter[0] Batch [17000]	Speed: 164853.78 samples/sec
+2018-03-03 15:10:09,787 Iter[0] Batch [17000] 	loss 5.3529672, ppl 211.2341402
+2018-03-03 15:11:11,665 Iter[0] Batch [18000]	Speed: 165487.70 samples/sec
+2018-03-03 15:11:11,665 Iter[0] Batch [18000] 	loss 5.3348043, ppl 207.4321497
+2018-03-03 15:12:15,344 Iter[0] Batch [19000]	Speed: 160806.95 samples/sec
+2018-03-03 15:12:15,344 Iter[0] Batch [19000] 	loss 5.3186437, ppl 204.1068742
+2018-03-03 15:13:17,950 Iter[0] Batch [20000]	Speed: 163562.35 samples/sec
+2018-03-03 15:13:17,950 Iter[0] Batch [20000] 	loss 5.3021621, ppl 200.7704287
+2018-03-03 15:14:19,559 Iter[0] Batch [21000]	Speed: 166209.87 samples/sec
+2018-03-03 15:14:19,559 Iter[0] Batch [21000] 	loss 5.2894547, ppl 198.2352957
+2018-03-03 15:15:21,607 Iter[0] Batch [22000]	Speed: 165033.97 samples/sec
+2018-03-03 15:15:21,607 Iter[0] Batch [22000] 	loss 5.2748727, ppl 195.3655963
+2018-03-03 15:16:24,900 Iter[0] Batch [23000]	Speed: 161786.94 samples/sec
+2018-03-03 15:16:24,901 Iter[0] Batch [23000] 	loss 5.2656535, ppl 193.5727703
+2018-03-03 15:17:27,271 Iter[0] Batch [24000]	Speed: 164180.36 samples/sec
+2018-03-03 15:17:27,271 Iter[0] Batch [24000] 	loss 5.2518320, ppl 190.9157118
+['train.py', '--data=/home/ubuntu/gbw/training-monolingual.tokenized.shuffled/*', '--gpus=0,1,2,3', '--per-ctx-clip', '--clip=1', '--lr=0.05', '--dropout=0.01', '--rescale-embed', '--eps=0.0001', '--epoch=10', '--checkpoint-interval=1', '--log-interval=1000']
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00079-of-00100
+Finished processing!
+['lstmp0_i2h_weight', 'lstmp0_i2h_bias', 'lstmp0_h2h_weight', 'lstmp0_h2h_bias', 'lstmp0_proj_weight']
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00070-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00040-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00064-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00017-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00030-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00038-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00034-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00078-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00031-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00018-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00014-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00011-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00007-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00082-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00059-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00039-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00051-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00072-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00028-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00080-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00046-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00019-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00053-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00029-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00035-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00022-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00042-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00060-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00097-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00076-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/n2018-03-03 15:18:28,878 Iter[0] Batch [25000]	Speed: 166216.05 samples/sec
+2018-03-03 15:18:28,878 Iter[0] Batch [25000] 	loss 5.2403078, ppl 188.7281864
+2018-03-03 15:19:30,824 Iter[0] Batch [26000]	Speed: 165306.48 samples/sec
+2018-03-03 15:19:30,824 Iter[0] Batch [26000] 	loss 5.2279660, ppl 186.4132560
+2018-03-03 15:20:34,613 Iter[0] Batch [27000]	Speed: 160527.95 samples/sec
+2018-03-03 15:20:34,613 Iter[0] Batch [27000] 	loss 5.2164617, ppl 184.2809912
+2018-03-03 15:21:37,127 Iter[0] Batch [28000]	Speed: 163804.88 samples/sec
+2018-03-03 15:21:37,127 Iter[0] Batch [28000] 	loss 5.2100805, ppl 183.1087921
+2018-03-03 15:22:38,039 Iter[0] Batch [29000]	Speed: 168111.81 samples/sec
+2018-03-03 15:22:38,039 Iter[0] Batch [29000] 	loss 5.2017957, ppl 181.5980454
+2018-03-03 15:23:40,762 Iter[0] Batch [30000]	Speed: 163257.67 samples/sec
+2018-03-03 15:23:40,762 Iter[0] Batch [30000] 	loss 5.1899043, ppl 179.4513781
+2018-03-03 15:24:41,858 Iter[0] Batch [31000]	Speed: 167604.07 samples/sec
+2018-03-03 15:24:41,859 Iter[0] Batch [31000] 	loss 5.1830141, ppl 178.2191660
+2018-03-03 15:25:43,005 Iter[0] Batch [32000]	Speed: 167468.03 samples/sec
+2018-03-03 15:25:43,005 Iter[0] Batch [32000] 	loss 5.1752891, ppl 176.8477269
+2018-03-03 15:26:45,458 Iter[0] Batch [33000]	Speed: 163961.75 samples/sec
+2018-03-03 15:26:45,459 Iter[0] Batch [33000] 	loss 5.1653699, ppl 175.1022207
+2018-03-03 15:27:48,366 Iter[0] Batch [34000]	Speed: 162777.80 samples/sec
+2018-03-03 15:27:48,367 Iter[0] Batch [34000] 	loss 5.1593457, ppl 174.0505376
+2018-03-03 15:28:49,940 Iter[0] Batch [35000]	Speed: 166304.44 samples/sec
+2018-03-03 15:28:49,941 Iter[0] Batch [35000] 	loss 5.1492113, ppl 172.2955521
+2018-03-03 15:29:51,514 Iter[0] Batch [36000]	Speed: 166304.68 samples/sec
+2018-03-03 15:29:51,515 Iter[0] Batch [36000] 	loss 5.1445809, ppl 171.4995872
+2018-03-03 15:30:52,724 Iter[0] Batch [37000]	Speed: 167294.26 samples/sec
+2018-03-03 15:30:52,724 Iter[0] Batch [37000] 	loss 5.1363930, ppl 170.1011004
+2018-03-03 15:31:55,960 Iter[0] Batch [38000]	Speed: 161932.74 samples/sec
+2018-03-03 15:31:55,961 Iter[0] Batch [38000] 	loss 5.1316207, ppl 169.2912667
+2018-03-03 15:32:56,810 Iter[0] Batch [39000]	Speed: 168284.64 samples/sec
+2018-03-03 15:32:56,810 Iter[0] Batch [39000] 	loss 5.1257738, ppl 168.3043299
+2018-03-03 15:33:58,778 Iter[0] Batch [40000]	Speed: 165246.33 samples/sec
+2018-03-03 15:33:58,778 Iter[0] Batch [40000] 	loss 5.1186707, ppl 167.1130790
+2018-03-03 15:35:01,188 Iter[0] Batch [41000]	Speed: 164077.64 samples/sec
+2018-03-03 15:35:01,188 Iter[0] Batch [41000] 	loss 5.1143371, ppl 166.3904458
+2018-03-03 15:36:02,424 Iter[0] Batch [42000]	Speed: 167220.46 samples/sec
+2018-03-03 15:36:02,425 Iter[0] Batch [42000] 	loss 5.1080926, ppl 165.3546528
+2018-03-03 15:37:04,244 Iter[0] Batch [43000]	Speed: 165643.78 samples/sec
+2018-03-03 15:37:04,244 Iter[0] Batch [43000] 	loss 5.1026187, ppl 164.4520026
+2018-03-03 15:38:05,978 Iter[0] Batch [44000]	Speed: 165872.41 samples/sec
+2018-03-03 15:38:05,979 Iter[0] Batch [44000] 	loss 5.0950367, ppl 163.2098374
+2018-03-03 15:39:09,721 Iter[0] Batch [45000]	Speed: 160647.72 samples/sec
+2018-03-03 15:39:09,721 Iter[0] Batch [45000] 	loss 5.0925094, ppl 162.7978708
+2018-03-03 15:40:11,526 Iter[0] Batch [46000]	Speed: 165681.28 samples/sec
+2018-03-03 15:40:11,526 Iter[0] Batch [46000] 	loss 5.0845535, ppl 161.5078124
+2018-03-03 15:41:13,015 Iter[0] Batch [47000]	Speed: 166533.57 samples/sec
+2018-03-03 15:41:13,016 Iter[0] Batch [47000] 	loss 5.0800680, ppl 160.7849839
+2018-03-03 15:42:14,503 Iter[0] Batch [48000]	Speed: 166538.17 samples/sec
+2018-03-03 15:42:14,503 Iter[0] Batch [48000] 	loss 5.0751375, ppl 159.9941897
+2018-03-03 15:43:17,542 Iter[0] Batch [49000]	Speed: 162438.87 samples/sec
+2018-03-03 15:43:17,542 Iter[0] Batch [49000] 	loss 5.0710129, ppl 159.3356352
+2018-03-03 15:44:18,464 Iter[0] Batch [50000]	Speed: 168085.51 samples/sec
+2018-03-03 15:44:18,464 Iter[0] Batch [50000] 	loss 5.0678664, ppl 158.8350760
+2018-03-03 15:45:20,018 Iter[0] Batch [51000]	Speed: 166358.16 samples/sec
+2018-03-03 15:45:20,018 Iter[0] Batch [51000] 	loss 5.0652848, ppl 158.4255498
+2018-03-03 15:46:21,086 Iter[0] Batch [52000]	Speed: 167682.03 samples/sec
+2018-03-03 15:46:21,086 Iter[0] Batch [52000] 	loss 5.0617914, ppl 157.8730780
+ews.en-00050-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00073-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00088-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00015-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00008-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00004-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00020-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00041-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00025-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00023-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00081-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00055-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00021-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00010-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00045-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00071-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00049-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00086-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00066-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00002-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00094-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00047-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00009-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00087-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00012-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00075-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00074-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00067-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00026-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00084-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00003-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00085-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00062-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00061-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00013-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/trainin2018-03-03 15:47:24,814 Iter[0] Batch [53000]	Speed: 160682.90 samples/sec
+2018-03-03 15:47:24,814 Iter[0] Batch [53000] 	loss 5.0548746, ppl 156.7848698
+2018-03-03 15:48:26,349 Iter[0] Batch [54000]	Speed: 166410.14 samples/sec
+2018-03-03 15:48:26,349 Iter[0] Batch [54000] 	loss 5.0491824, ppl 155.8949561
+2018-03-03 15:49:28,236 Iter[0] Batch [55000]	Speed: 165464.69 samples/sec
+2018-03-03 15:49:28,236 Iter[0] Batch [55000] 	loss 5.0486516, ppl 155.8122197
+2018-03-03 15:50:31,174 Iter[0] Batch [56000]	Speed: 162700.14 samples/sec
+2018-03-03 15:50:31,174 Iter[0] Batch [56000] 	loss 5.0454430, ppl 155.3130828
+2018-03-03 15:51:31,923 Iter[0] Batch [57000]	Speed: 168560.96 samples/sec
+2018-03-03 15:51:31,924 Iter[0] Batch [57000] 	loss 5.0391125, ppl 154.3329837
+2018-03-03 15:52:33,669 Iter[0] Batch [58000]	Speed: 165841.63 samples/sec
+2018-03-03 15:52:33,669 Iter[0] Batch [58000] 	loss 5.0367840, ppl 153.9740350
+2018-03-03 15:53:35,065 Iter[0] Batch [59000]	Speed: 166786.64 samples/sec
+2018-03-03 15:53:35,065 Iter[0] Batch [59000] 	loss 5.0325109, ppl 153.3175004
+2018-03-03 15:54:38,416 Iter[0] Batch [60000]	Speed: 161639.59 samples/sec
+2018-03-03 15:54:38,416 Iter[0] Batch [60000] 	loss 5.0291582, ppl 152.8043283
+2018-03-03 15:55:40,063 Iter[0] Batch [61000]	Speed: 166107.46 samples/sec
+2018-03-03 15:55:40,063 Iter[0] Batch [61000] 	loss 5.0210918, ppl 151.5767045
+2018-03-03 15:56:41,385 Iter[0] Batch [62000]	Speed: 166986.97 samples/sec
+2018-03-03 15:56:41,386 Iter[0] Batch [62000] 	loss 5.0218312, ppl 151.6888298
+2018-03-03 15:57:42,662 Iter[0] Batch [63000]	Speed: 167110.39 samples/sec
+2018-03-03 15:57:42,663 Iter[0] Batch [63000] 	loss 5.0168801, ppl 150.9396485
+2018-03-03 15:58:45,674 Iter[0] Batch [64000]	Speed: 162509.79 samples/sec
+2018-03-03 15:58:45,674 Iter[0] Batch [64000] 	loss 5.0179297, ppl 151.0981593
+2018-03-03 15:59:47,973 Iter[0] Batch [65000]	Speed: 164370.86 samples/sec
+2018-03-03 15:59:47,973 Iter[0] Batch [65000] 	loss 5.0162820, ppl 150.8494065
+2018-03-03 16:00:49,047 Iter[0] Batch [66000]	Speed: 167665.88 samples/sec
+2018-03-03 16:00:49,047 Iter[0] Batch [66000] 	loss 5.0114043, ppl 150.1153948
+2018-03-03 16:01:53,464 Iter[0] Batch [67000]	Speed: 158963.14 samples/sec
+2018-03-03 16:01:53,464 Iter[0] Batch [67000] 	loss 5.0071582, ppl 149.4793421
+2018-03-03 16:02:56,067 Iter[0] Batch [68000]	Speed: 163570.18 samples/sec
+2018-03-03 16:02:56,068 Iter[0] Batch [68000] 	loss 5.0059676, ppl 149.3014741
+2018-03-03 16:03:58,208 Iter[0] Batch [69000]	Speed: 164789.01 samples/sec
+2018-03-03 16:03:58,208 Iter[0] Batch [69000] 	loss 4.9993875, ppl 148.3222839
+2018-03-03 16:04:59,927 Iter[0] Batch [70000]	Speed: 165912.25 samples/sec
+2018-03-03 16:04:59,927 Iter[0] Batch [70000] 	loss 4.9979016, ppl 148.1020499
+2018-03-03 16:06:04,134 Iter[0] Batch [71000]	Speed: 159484.70 samples/sec
+2018-03-03 16:06:04,134 Iter[0] Batch [71000] 	loss 4.9942664, ppl 147.5646532
+2018-03-03 16:07:05,795 Iter[0] Batch [72000]	Speed: 166069.01 samples/sec
+2018-03-03 16:07:05,796 Iter[0] Batch [72000] 	loss 4.9937613, ppl 147.4901403
+2018-03-03 16:08:08,562 Iter[0] Batch [73000]	Speed: 163145.05 samples/sec
+2018-03-03 16:08:08,562 Iter[0] Batch [73000] 	loss 4.9881813, ppl 146.6694258
+2018-03-03 16:09:10,601 Iter[0] Batch [74000]	Speed: 165057.80 samples/sec
+2018-03-03 16:09:10,601 Iter[0] Batch [74000] 	loss 4.9868941, ppl 146.4807676
+2018-03-03 16:10:13,703 Iter[0] Batch [75000]	Speed: 162277.81 samples/sec
+2018-03-03 16:10:13,703 Iter[0] Batch [75000] 	loss 4.9837836, ppl 146.0258402
+2018-03-03 16:11:14,705 Iter[0] Batch [76000]	Speed: 167864.22 samples/sec
+2018-03-03 16:11:14,705 Iter[0] Batch [76000] 	loss 4.9829594, ppl 145.9055326
+2018-03-03 16:12:16,836 Iter[0] Batch [77000]	Speed: 164811.17 samples/sec
+2018-03-03 16:12:16,837 Iter[0] Batch [77000] 	loss 4.9807777, ppl 145.5875661
+2018-03-03 16:13:19,092 Iter[0] Batch [78000]	Speed: 164484.14 samples/sec
+2018-03-03 16:13:19,092 Iter[0] Batch [78000] 	loss 4.9772844, ppl 145.0798637
+2018-03-03 16:14:16,997 Saved checkpoint to "./checkpoint/-0000.params"
+2018-03-03 16:16:55,519 eval batch 19 : 4.1856079
+2018-03-03 16:19:41,212 eval batch 39 : 4.1723332
+2018-03-03 16:22:25,886 eval batch 59 : 4.1866217
+2018-03-03 16:23:21,414 Iter[0]		 CE loss 3.9500276, ppl 51.9368017. Time cost = 543.11 seconds
+g-monolingual.tokenized.shuffled/news.en-00057-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00090-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00098-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00077-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00024-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00027-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00016-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00063-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00044-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00099-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00005-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00058-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00033-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00095-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00032-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00092-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00006-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00069-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00068-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00048-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00093-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00036-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00089-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00065-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00037-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00096-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00043-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00091-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00054-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00001-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00056-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00083-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00052-of-00100
+Finished processing!
+[]
+Processing file: /home/ubuntu/gbw-validation/heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050
+Finished processing!
+reset
+Processing file: /home/ubuntu/gbw-validation/heldout-monolingual.tokenized.shuffled/news.en.heldou2018-03-03 16:24:25,679 Iter[1] Batch [1000]	Speed: 164925.10 samples/sec
+2018-03-03 16:24:25,680 Iter[1] Batch [1000] 	loss 4.9390906, ppl 139.6432038
+2018-03-03 16:25:27,586 Iter[1] Batch [2000]	Speed: 165410.47 samples/sec
+2018-03-03 16:25:27,586 Iter[1] Batch [2000] 	loss 4.9200309, ppl 137.0068411
+2018-03-03 16:26:30,207 Iter[1] Batch [3000]	Speed: 163524.70 samples/sec
+2018-03-03 16:26:30,207 Iter[1] Batch [3000] 	loss 4.9162293, ppl 136.4869897
+2018-03-03 16:27:34,050 Iter[1] Batch [4000]	Speed: 160393.78 samples/sec
+2018-03-03 16:27:34,050 Iter[1] Batch [4000] 	loss 4.9111660, ppl 135.7976643
+2018-03-03 16:28:36,214 Iter[1] Batch [5000]	Speed: 164725.30 samples/sec
+2018-03-03 16:28:36,214 Iter[1] Batch [5000] 	loss 4.8997117, ppl 134.2510720
+2018-03-03 16:29:38,102 Iter[1] Batch [6000]	Speed: 165461.72 samples/sec
+2018-03-03 16:29:38,102 Iter[1] Batch [6000] 	loss 4.8914723, ppl 133.1494612
+2018-03-03 16:30:40,148 Iter[1] Batch [7000]	Speed: 165039.52 samples/sec
+2018-03-03 16:30:40,148 Iter[1] Batch [7000] 	loss 4.8894883, ppl 132.8855566
+2018-03-03 16:31:44,245 Iter[1] Batch [8000]	Speed: 159758.64 samples/sec
+2018-03-03 16:31:44,245 Iter[1] Batch [8000] 	loss 4.9333434, ppl 138.8429390
+2018-03-03 16:32:45,600 Iter[1] Batch [9000]	Speed: 166897.74 samples/sec
+2018-03-03 16:32:45,600 Iter[1] Batch [9000] 	loss 4.9427477, ppl 140.1548182
+2018-03-03 16:33:46,799 Iter[1] Batch [10000]	Speed: 167321.40 samples/sec
+2018-03-03 16:33:46,800 Iter[1] Batch [10000] 	loss 4.9241152, ppl 137.5675727
+2018-03-03 16:34:49,086 Iter[1] Batch [11000]	Speed: 164402.35 samples/sec
+2018-03-03 16:34:49,086 Iter[1] Batch [11000] 	loss 4.8977336, ppl 133.9857691
+2018-03-03 16:35:51,390 Iter[1] Batch [12000]	Speed: 164354.23 samples/sec
+2018-03-03 16:35:51,391 Iter[1] Batch [12000] 	loss 4.9092898, ppl 135.5431234
+2018-03-03 16:36:52,413 Iter[1] Batch [13000]	Speed: 167808.02 samples/sec
+2018-03-03 16:36:52,413 Iter[1] Batch [13000] 	loss 4.8824406, ppl 131.9523173
+2018-03-03 16:37:54,133 Iter[1] Batch [14000]	Speed: 165908.99 samples/sec
+2018-03-03 16:37:54,134 Iter[1] Batch [14000] 	loss 4.9028551, ppl 134.6737353
+2018-03-03 16:38:57,681 Iter[1] Batch [15000]	Speed: 161139.18 samples/sec
+2018-03-03 16:38:57,681 Iter[1] Batch [15000] 	loss 4.8992105, ppl 134.1838060
+2018-03-03 16:39:59,812 Iter[1] Batch [16000]	Speed: 164812.76 samples/sec
+2018-03-03 16:39:59,813 Iter[1] Batch [16000] 	loss 4.8777809, ppl 131.3388808
+2018-03-03 16:41:00,573 Iter[1] Batch [17000]	Speed: 168531.85 samples/sec
+2018-03-03 16:41:00,573 Iter[1] Batch [17000] 	loss 4.8855484, ppl 132.3630383
+2018-03-03 16:42:00,949 Iter[1] Batch [18000]	Speed: 169602.24 samples/sec
+2018-03-03 16:42:00,950 Iter[1] Batch [18000] 	loss 4.9183914, ppl 136.7824088
+2018-03-03 16:43:03,878 Iter[1] Batch [19000]	Speed: 162723.25 samples/sec
+2018-03-03 16:43:03,879 Iter[1] Batch [19000] 	loss 4.9198074, ppl 136.9762320
+2018-03-03 16:44:04,783 Iter[1] Batch [20000]	Speed: 168132.41 samples/sec
+2018-03-03 16:44:04,783 Iter[1] Batch [20000] 	loss 4.9080895, ppl 135.3805163
+2018-03-03 16:45:05,930 Iter[1] Batch [21000]	Speed: 167466.98 samples/sec
+2018-03-03 16:45:05,930 Iter[1] Batch [21000] 	loss 4.8982281, ppl 134.0520457
+2018-03-03 16:46:06,048 Iter[1] Batch [22000]	Speed: 170330.15 samples/sec
+2018-03-03 16:46:06,049 Iter[1] Batch [22000] 	loss 4.8907039, ppl 133.0471938
+2018-03-03 16:47:08,674 Iter[1] Batch [23000]	Speed: 163512.75 samples/sec
+2018-03-03 16:47:08,674 Iter[1] Batch [23000] 	loss 4.8967297, ppl 133.8513275
+2018-03-03 16:48:09,503 Iter[1] Batch [24000]	Speed: 168339.90 samples/sec
+2018-03-03 16:48:09,503 Iter[1] Batch [24000] 	loss 4.8891504, ppl 132.8406634
+2018-03-03 16:49:11,108 Iter[1] Batch [25000]	Speed: 166219.91 samples/sec
+2018-03-03 16:49:11,109 Iter[1] Batch [25000] 	loss 4.8956113, ppl 133.7017173
+2018-03-03 16:50:12,381 Iter[1] Batch [26000]	Speed: 167123.38 samples/sec
+2018-03-03 16:50:12,381 Iter[1] Batch [26000] 	loss 4.8809359, ppl 131.7539196
+t-00000-of-00050reset
+
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00060-of-00100
+Finished processing!
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00050-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00067-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00086-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00087-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00024-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00069-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00005-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00089-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00078-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00031-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00079-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00032-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00084-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00029-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00001-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00095-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00076-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00074-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00083-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00052-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00021-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00038-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00018-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00059-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00077-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00097-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00027-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00023-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00085-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00015-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00008-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00033-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00064-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/traini2018-03-03 16:51:15,692 Iter[1] Batch [27000]	Speed: 161739.77 samples/sec
+2018-03-03 16:51:15,693 Iter[1] Batch [27000] 	loss 4.9102363, ppl 135.6714736
+2018-03-03 16:52:17,036 Iter[1] Batch [28000]	Speed: 166927.99 samples/sec
+2018-03-03 16:52:17,037 Iter[1] Batch [28000] 	loss 4.8756668, ppl 131.0615155
+2018-03-03 16:53:18,708 Iter[1] Batch [29000]	Speed: 166039.70 samples/sec
+2018-03-03 16:53:18,709 Iter[1] Batch [29000] 	loss 4.8792348, ppl 131.5299741
+2018-03-03 16:54:21,410 Iter[1] Batch [30000]	Speed: 163312.70 samples/sec
+2018-03-03 16:54:21,411 Iter[1] Batch [30000] 	loss 4.8824988, ppl 131.9599976
+2018-03-03 16:55:23,050 Iter[1] Batch [31000]	Speed: 166128.23 samples/sec
+2018-03-03 16:55:23,050 Iter[1] Batch [31000] 	loss 4.8684957, ppl 130.1250229
+2018-03-03 16:56:24,450 Iter[1] Batch [32000]	Speed: 166773.72 samples/sec
+2018-03-03 16:56:24,451 Iter[1] Batch [32000] 	loss 4.8767625, ppl 131.2051987
+2018-03-03 16:57:25,599 Iter[1] Batch [33000]	Speed: 167462.13 samples/sec
+2018-03-03 16:57:25,599 Iter[1] Batch [33000] 	loss 4.8798141, ppl 131.6061911
+2018-03-03 16:58:29,219 Iter[1] Batch [34000]	Speed: 160956.79 samples/sec
+2018-03-03 16:58:29,219 Iter[1] Batch [34000] 	loss 4.8881164, ppl 132.7033792
+2018-03-03 16:59:30,711 Iter[1] Batch [35000]	Speed: 166525.82 samples/sec
+2018-03-03 16:59:30,711 Iter[1] Batch [35000] 	loss 4.8780195, ppl 131.3702314
+2018-03-03 17:00:30,998 Iter[1] Batch [36000]	Speed: 169853.27 samples/sec
+2018-03-03 17:00:30,998 Iter[1] Batch [36000] 	loss 4.8954352, ppl 133.6781648
+2018-03-03 17:01:32,689 Iter[1] Batch [37000]	Speed: 165990.01 samples/sec
+2018-03-03 17:01:32,689 Iter[1] Batch [37000] 	loss 4.8794676, ppl 131.5605995
+2018-03-03 17:02:35,630 Iter[1] Batch [38000]	Speed: 162691.87 samples/sec
+2018-03-03 17:02:35,630 Iter[1] Batch [38000] 	loss 4.8910629, ppl 133.0949643
+2018-03-03 17:03:36,209 Iter[1] Batch [39000]	Speed: 169036.23 samples/sec
+2018-03-03 17:03:36,209 Iter[1] Batch [39000] 	loss 4.8787941, ppl 131.4720315
+2018-03-03 17:04:37,607 Iter[1] Batch [40000]	Speed: 166781.28 samples/sec
+2018-03-03 17:04:37,607 Iter[1] Batch [40000] 	loss 4.8680129, ppl 130.0622121
+2018-03-03 17:05:40,689 Iter[1] Batch [41000]	Speed: 162329.08 samples/sec
+2018-03-03 17:05:40,689 Iter[1] Batch [41000] 	loss 4.8684781, ppl 130.1227356
+2018-03-03 17:06:41,763 Iter[1] Batch [42000]	Speed: 167666.44 samples/sec
+2018-03-03 17:06:41,763 Iter[1] Batch [42000] 	loss 4.8630313, ppl 129.4159001
+2018-03-03 17:07:41,962 Iter[1] Batch [43000]	Speed: 170102.78 samples/sec
+2018-03-03 17:07:41,962 Iter[1] Batch [43000] 	loss 4.8687215, ppl 130.1544060
+2018-03-03 17:08:42,845 Iter[1] Batch [44000]	Speed: 168192.20 samples/sec
+2018-03-03 17:08:42,845 Iter[1] Batch [44000] 	loss 4.8880449, ppl 132.6938934
+2018-03-03 17:09:45,825 Iter[1] Batch [45000]	Speed: 162590.06 samples/sec
+2018-03-03 17:09:45,825 Iter[1] Batch [45000] 	loss 4.8684984, ppl 130.1253787
+2018-03-03 17:10:47,461 Iter[1] Batch [46000]	Speed: 166138.64 samples/sec
+2018-03-03 17:10:47,461 Iter[1] Batch [46000] 	loss 4.8756043, ppl 131.0533244
+2018-03-03 17:11:48,188 Iter[1] Batch [47000]	Speed: 168621.90 samples/sec
+2018-03-03 17:11:48,189 Iter[1] Batch [47000] 	loss 4.8724305, ppl 130.6380430
+2018-03-03 17:12:49,519 Iter[1] Batch [48000]	Speed: 166964.87 samples/sec
+2018-03-03 17:12:49,519 Iter[1] Batch [48000] 	loss 4.8757063, ppl 131.0666864
+2018-03-03 17:13:52,727 Iter[1] Batch [49000]	Speed: 162003.56 samples/sec
+2018-03-03 17:13:52,728 Iter[1] Batch [49000] 	loss 4.8733168, ppl 130.7538825
+2018-03-03 17:14:53,890 Iter[1] Batch [50000]	Speed: 167423.08 samples/sec
+2018-03-03 17:14:53,890 Iter[1] Batch [50000] 	loss 4.8732645, ppl 130.7470386
+2018-03-03 17:15:54,645 Iter[1] Batch [51000]	Speed: 168545.11 samples/sec
+2018-03-03 17:15:54,646 Iter[1] Batch [51000] 	loss 4.8573309, ppl 128.6802776
+2018-03-03 17:16:55,087 Iter[1] Batch [52000]	Speed: 169420.70 samples/sec
+2018-03-03 17:16:55,087 Iter[1] Batch [52000] 	loss 4.8651437, ppl 129.6895801
+2018-03-03 17:17:58,863 Iter[1] Batch [53000]	Speed: 160561.85 samples/sec
+2018-03-03 17:17:58,863 Iter[1] Batch [53000] 	loss 4.8541965, ppl 128.2775768
+2018-03-03 17:19:00,089 Iter[1] Batch [54000]	Speed: 167249.28 samples/sec
+2018-03-03 17:19:00,089 Iter[1] Batch [54000] 	loss 4.8688742, ppl 130.1742866
+ng-monolingual.tokenized.shuffled/news.en-00048-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00054-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00002-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00062-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00096-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00063-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00099-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00071-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00051-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00093-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00030-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00042-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00010-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00039-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00080-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00098-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00006-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00003-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00043-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00066-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00012-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00040-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00016-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00041-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00019-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00061-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00082-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00055-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00017-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00090-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00068-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00004-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00091-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00035-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00053-of-00100
+Finished processing!
+Proces2018-03-03 17:20:01,547 Iter[1] Batch [55000]	Speed: 166619.13 samples/sec
+2018-03-03 17:20:01,547 Iter[1] Batch [55000] 	loss 4.8611895, ppl 129.1777617
+2018-03-03 17:21:04,413 Iter[1] Batch [56000]	Speed: 162885.35 samples/sec
+2018-03-03 17:21:04,414 Iter[1] Batch [56000] 	loss 4.8578672, ppl 128.7493109
+2018-03-03 17:22:05,837 Iter[1] Batch [57000]	Speed: 166712.79 samples/sec
+2018-03-03 17:22:05,837 Iter[1] Batch [57000] 	loss 4.8503008, ppl 127.7788175
+2018-03-03 17:23:07,146 Iter[1] Batch [58000]	Speed: 167021.79 samples/sec
+2018-03-03 17:23:07,146 Iter[1] Batch [58000] 	loss 4.8533273, ppl 128.1661339
+2018-03-03 17:24:08,599 Iter[1] Batch [59000]	Speed: 166633.25 samples/sec
+2018-03-03 17:24:08,599 Iter[1] Batch [59000] 	loss 4.8573926, ppl 128.6882198
+2018-03-03 17:25:11,477 Iter[1] Batch [60000]	Speed: 162853.90 samples/sec
+2018-03-03 17:25:11,478 Iter[1] Batch [60000] 	loss 4.8713312, ppl 130.4945221
+2018-03-03 17:26:12,558 Iter[1] Batch [61000]	Speed: 167647.78 samples/sec
+2018-03-03 17:26:12,558 Iter[1] Batch [61000] 	loss 4.8531969, ppl 128.1494134
+2018-03-03 17:27:14,123 Iter[1] Batch [62000]	Speed: 166327.70 samples/sec
+2018-03-03 17:27:14,124 Iter[1] Batch [62000] 	loss 4.8550742, ppl 128.3902198
+2018-03-03 17:28:15,061 Iter[1] Batch [63000]	Speed: 168042.29 samples/sec
+2018-03-03 17:28:15,061 Iter[1] Batch [63000] 	loss 4.8479262, ppl 127.4757527
+2018-03-03 17:29:18,178 Iter[1] Batch [64000]	Speed: 162238.25 samples/sec
+2018-03-03 17:29:18,178 Iter[1] Batch [64000] 	loss 4.8470379, ppl 127.3625687
+2018-03-03 17:30:18,856 Iter[1] Batch [65000]	Speed: 168760.35 samples/sec
+2018-03-03 17:30:18,856 Iter[1] Batch [65000] 	loss 4.8516234, ppl 127.9479368
+2018-03-03 17:31:20,905 Iter[1] Batch [66000]	Speed: 165030.83 samples/sec
+2018-03-03 17:31:20,905 Iter[1] Batch [66000] 	loss 4.8534996, ppl 128.1882145
+2018-03-03 17:32:23,581 Iter[1] Batch [67000]	Speed: 163380.33 samples/sec
+2018-03-03 17:32:23,581 Iter[1] Batch [67000] 	loss 4.8458426, ppl 127.2104216
+2018-03-03 17:33:23,930 Iter[1] Batch [68000]	Speed: 169680.58 samples/sec
+2018-03-03 17:33:23,930 Iter[1] Batch [68000] 	loss 4.8525285, ppl 128.0637921
+2018-03-03 17:34:24,656 Iter[1] Batch [69000]	Speed: 168626.01 samples/sec
+2018-03-03 17:34:24,656 Iter[1] Batch [69000] 	loss 4.8435539, ppl 126.9196116
+2018-03-03 17:35:25,832 Iter[1] Batch [70000]	Speed: 167385.49 samples/sec
+2018-03-03 17:35:25,833 Iter[1] Batch [70000] 	loss 4.8405191, ppl 126.5350242
+2018-03-03 17:36:28,496 Iter[1] Batch [71000]	Speed: 163411.91 samples/sec
+2018-03-03 17:36:28,496 Iter[1] Batch [71000] 	loss 4.8548895, ppl 128.3664999
+2018-03-03 17:37:29,665 Iter[1] Batch [72000]	Speed: 167405.62 samples/sec
+2018-03-03 17:37:29,665 Iter[1] Batch [72000] 	loss 4.8547133, ppl 128.3438873
+2018-03-03 17:38:30,781 Iter[1] Batch [73000]	Speed: 167551.40 samples/sec
+2018-03-03 17:38:30,781 Iter[1] Batch [73000] 	loss 4.8480617, ppl 127.4930328
+2018-03-03 17:39:31,823 Iter[1] Batch [74000]	Speed: 167754.92 samples/sec
+2018-03-03 17:39:31,823 Iter[1] Batch [74000] 	loss 4.8545594, ppl 128.3241359
+2018-03-03 17:40:34,595 Iter[1] Batch [75000]	Speed: 163129.62 samples/sec
+2018-03-03 17:40:34,595 Iter[1] Batch [75000] 	loss 4.8611594, ppl 129.1738763
+2018-03-03 17:41:35,303 Iter[1] Batch [76000]	Speed: 168676.06 samples/sec
+2018-03-03 17:41:35,303 Iter[1] Batch [76000] 	loss 4.8435859, ppl 126.9236770
+2018-03-03 17:42:36,730 Iter[1] Batch [77000]	Speed: 166702.40 samples/sec
+2018-03-03 17:42:36,730 Iter[1] Batch [77000] 	loss 4.8405465, ppl 126.5384841
+2018-03-03 17:43:37,757 Iter[1] Batch [78000]	Speed: 167795.99 samples/sec
+2018-03-03 17:43:37,757 Iter[1] Batch [78000] 	loss 4.8496887, ppl 127.7006269
+2018-03-03 17:44:31,271 Saved checkpoint to "./checkpoint/-0000.params"
+2018-03-03 17:47:08,466 eval batch 19 : 4.0814539
+2018-03-03 17:49:52,715 eval batch 39 : 4.0768787
+2018-03-03 17:52:37,436 eval batch 59 : 4.0924117
+2018-03-03 17:53:32,377 Iter[1]		 CE loss 3.8619234, ppl 47.5567324. Time cost = 539.75 seconds
+2018-03-03 17:54:35,426 Iter[2] Batch [1000]	Speed: 168170.00 samples/sec
+2018-03-03 17:54:35,426 Iter[2] Batch [1000] 	loss 4.8103312, ppl 122.7722791
+sing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00009-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00013-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00057-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00065-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00081-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00025-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00034-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00026-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00046-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00045-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00044-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00037-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00075-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00073-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00022-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00058-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00094-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00088-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00036-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00056-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00007-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00028-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00072-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00092-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00011-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00070-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00047-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00020-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00049-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00014-of-00100
+Finished processing!
+[]
+Processing file: /home/ubuntu/gbw-validation/heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050
+Finished processing!
+reset
+reset
+Processing file: /home/ubuntu/gbw-validation/heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00018-of-00100
+Finished processing!
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00087-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/train2018-03-03 17:55:36,654 Iter[2] Batch [2000]	Speed: 167244.00 samples/sec
+2018-03-03 17:55:36,654 Iter[2] Batch [2000] 	loss 4.7914680, ppl 120.4780970
+2018-03-03 17:56:37,562 Iter[2] Batch [3000]	Speed: 168123.92 samples/sec
+2018-03-03 17:56:37,562 Iter[2] Batch [3000] 	loss 4.7764973, ppl 118.6878889
+2018-03-03 17:57:40,665 Iter[2] Batch [4000]	Speed: 162273.40 samples/sec
+2018-03-03 17:57:40,665 Iter[2] Batch [4000] 	loss 4.7918480, ppl 120.5238968
+2018-03-03 17:58:42,523 Iter[2] Batch [5000]	Speed: 165541.81 samples/sec
+2018-03-03 17:58:42,523 Iter[2] Batch [5000] 	loss 4.7809613, ppl 119.2189035
+2018-03-03 17:59:43,184 Iter[2] Batch [6000]	Speed: 168806.58 samples/sec
+2018-03-03 17:59:43,184 Iter[2] Batch [6000] 	loss 4.7947031, ppl 120.8684936
+2018-03-03 18:00:44,726 Iter[2] Batch [7000]	Speed: 166389.89 samples/sec
+2018-03-03 18:00:44,727 Iter[2] Batch [7000] 	loss 4.7752500, ppl 118.5399459
+2018-03-03 18:01:48,331 Iter[2] Batch [8000]	Speed: 160996.08 samples/sec
+2018-03-03 18:01:48,331 Iter[2] Batch [8000] 	loss 4.7860457, ppl 119.8266006
+2018-03-03 18:02:50,009 Iter[2] Batch [9000]	Speed: 166022.47 samples/sec
+2018-03-03 18:02:50,009 Iter[2] Batch [9000] 	loss 4.7863484, ppl 119.8628818
+2018-03-03 18:03:51,371 Iter[2] Batch [10000]	Speed: 166880.13 samples/sec
+2018-03-03 18:03:51,371 Iter[2] Batch [10000] 	loss 4.7749316, ppl 118.5022136
+2018-03-03 18:04:52,506 Iter[2] Batch [11000]	Speed: 167497.28 samples/sec
+2018-03-03 18:04:52,507 Iter[2] Batch [11000] 	loss 4.7751266, ppl 118.5253145
+2018-03-03 18:05:54,413 Iter[2] Batch [12000]	Speed: 165410.31 samples/sec
+2018-03-03 18:05:54,413 Iter[2] Batch [12000] 	loss 4.7764320, ppl 118.6801467
+2018-03-03 18:06:56,364 Iter[2] Batch [13000]	Speed: 165293.46 samples/sec
+2018-03-03 18:06:56,364 Iter[2] Batch [13000] 	loss 4.7905723, ppl 120.3702327
+2018-03-03 18:07:57,966 Iter[2] Batch [14000]	Speed: 166226.89 samples/sec
+2018-03-03 18:07:57,967 Iter[2] Batch [14000] 	loss 4.7898160, ppl 120.2792371
+2018-03-03 18:09:01,749 Iter[2] Batch [15000]	Speed: 160545.30 samples/sec
+2018-03-03 18:09:01,749 Iter[2] Batch [15000] 	loss 4.7917484, ppl 120.5118921
+2018-03-03 18:10:02,322 Iter[2] Batch [16000]	Speed: 169054.11 samples/sec
+2018-03-03 18:10:02,322 Iter[2] Batch [16000] 	loss 4.7814828, ppl 119.2810905
+2018-03-03 18:11:03,706 Iter[2] Batch [17000]	Speed: 166817.86 samples/sec
+2018-03-03 18:11:03,706 Iter[2] Batch [17000] 	loss 4.7798367, ppl 119.0849041
+2018-03-03 18:12:05,052 Iter[2] Batch [18000]	Speed: 166922.25 samples/sec
+2018-03-03 18:12:05,052 Iter[2] Batch [18000] 	loss 4.7851676, ppl 119.7214241
+2018-03-03 18:13:07,880 Iter[2] Batch [19000]	Speed: 162985.74 samples/sec
+2018-03-03 18:13:07,880 Iter[2] Batch [19000] 	loss 4.7973652, ppl 121.1906874
+2018-03-03 18:14:09,467 Iter[2] Batch [20000]	Speed: 166270.54 samples/sec
+2018-03-03 18:14:09,467 Iter[2] Batch [20000] 	loss 4.7945398, ppl 120.8487597
+2018-03-03 18:15:10,740 Iter[2] Batch [21000]	Speed: 167120.23 samples/sec
+2018-03-03 18:15:10,740 Iter[2] Batch [21000] 	loss 4.8025227, ppl 121.8173335
+2018-03-03 18:16:11,873 Iter[2] Batch [22000]	Speed: 167503.02 samples/sec
+2018-03-03 18:16:11,874 Iter[2] Batch [22000] 	loss 4.7968652, ppl 121.1301072
+2018-03-03 18:17:14,432 Iter[2] Batch [23000]	Speed: 163686.89 samples/sec
+2018-03-03 18:17:14,432 Iter[2] Batch [23000] 	loss 4.7946254, ppl 120.8590984
+2018-03-03 18:18:14,962 Iter[2] Batch [24000]	Speed: 169172.43 samples/sec
+2018-03-03 18:18:14,962 Iter[2] Batch [24000] 	loss 4.7939445, ppl 120.7768383
+2018-03-03 18:19:16,641 Iter[2] Batch [25000]	Speed: 166020.60 samples/sec
+2018-03-03 18:19:16,642 Iter[2] Batch [25000] 	loss 4.7932512, ppl 120.6931256
+2018-03-03 18:20:18,976 Iter[2] Batch [26000]	Speed: 164275.54 samples/sec
+2018-03-03 18:20:18,976 Iter[2] Batch [26000] 	loss 4.7833879, ppl 119.5085469
+2018-03-03 18:21:20,605 Iter[2] Batch [27000]	Speed: 166155.57 samples/sec
+2018-03-03 18:21:20,605 Iter[2] Batch [27000] 	loss 4.7781789, ppl 118.8876472
+2018-03-03 18:22:21,990 Iter[2] Batch [28000]	Speed: 166815.91 samples/sec
+2018-03-03 18:22:21,990 Iter[2] Batch [28000] 	loss 4.7803012, ppl 119.1402263
+2018-03-03 18:23:23,919 Iter[2] Batch [29000]	Speed: 165352.35 samples/sec
+2018-03-03 18:23:23,919 Iter[2] Batch [29000] 	loss 4.7801879, ppl 119.1267307
+ing-monolingual.tokenized.shuffled/news.en-00048-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00007-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00023-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00096-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00003-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00040-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00092-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00085-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00069-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00054-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00014-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00088-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00044-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00032-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00021-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00002-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00019-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00099-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00013-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00072-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00083-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00059-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00076-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00060-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00038-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00082-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00039-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00067-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00031-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00035-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00006-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00052-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00043-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00010-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00046-of-00100
+Finished processing!
+2018-03-03 18:24:27,970 Iter[2] Batch [30000]	Speed: 159871.76 samples/sec
+2018-03-03 18:24:27,970 Iter[2] Batch [30000] 	loss 4.7802812, ppl 119.1378529
+2018-03-03 18:25:29,316 Iter[2] Batch [31000]	Speed: 166922.51 samples/sec
+2018-03-03 18:25:29,316 Iter[2] Batch [31000] 	loss 4.7842293, ppl 119.6091444
+2018-03-03 18:26:29,628 Iter[2] Batch [32000]	Speed: 169783.64 samples/sec
+2018-03-03 18:26:29,629 Iter[2] Batch [32000] 	loss 4.7805309, ppl 119.1675945
+2018-03-03 18:27:30,650 Iter[2] Batch [33000]	Speed: 167810.43 samples/sec
+2018-03-03 18:27:30,650 Iter[2] Batch [33000] 	loss 4.7812262, ppl 119.2504820
+2018-03-03 18:28:33,272 Iter[2] Batch [34000]	Speed: 163520.25 samples/sec
+2018-03-03 18:28:33,273 Iter[2] Batch [34000] 	loss 4.7808750, ppl 119.2086120
+2018-03-03 18:29:34,347 Iter[2] Batch [35000]	Speed: 167663.20 samples/sec
+2018-03-03 18:29:34,348 Iter[2] Batch [35000] 	loss 4.7796590, ppl 119.0637405
+2018-03-03 18:30:35,416 Iter[2] Batch [36000]	Speed: 167680.84 samples/sec
+2018-03-03 18:30:35,416 Iter[2] Batch [36000] 	loss 4.7769668, ppl 118.7436297
+2018-03-03 18:31:35,689 Iter[2] Batch [37000]	Speed: 169892.61 samples/sec
+2018-03-03 18:31:35,690 Iter[2] Batch [37000] 	loss 4.7872020, ppl 119.9652303
+2018-03-03 18:32:38,677 Iter[2] Batch [38000]	Speed: 162572.22 samples/sec
+2018-03-03 18:32:38,677 Iter[2] Batch [38000] 	loss 4.7904344, ppl 120.3536359
+2018-03-03 18:33:39,543 Iter[2] Batch [39000]	Speed: 168238.26 samples/sec
+2018-03-03 18:33:39,543 Iter[2] Batch [39000] 	loss 4.7802051, ppl 119.1287782
+2018-03-03 18:34:40,834 Iter[2] Batch [40000]	Speed: 167073.22 samples/sec
+2018-03-03 18:34:40,834 Iter[2] Batch [40000] 	loss 4.7817820, ppl 119.3167869
+2018-03-03 18:35:42,941 Iter[2] Batch [41000]	Speed: 164875.58 samples/sec
+2018-03-03 18:35:42,942 Iter[2] Batch [41000] 	loss 4.7708977, ppl 118.0251404
+2018-03-03 18:36:43,320 Iter[2] Batch [42000]	Speed: 169598.32 samples/sec
+2018-03-03 18:36:43,320 Iter[2] Batch [42000] 	loss 4.7739895, ppl 118.3906149
+2018-03-03 18:37:44,038 Iter[2] Batch [43000]	Speed: 168648.18 samples/sec
+2018-03-03 18:37:44,038 Iter[2] Batch [43000] 	loss 4.7733820, ppl 118.3187237
+2018-03-03 18:38:44,748 Iter[2] Batch [44000]	Speed: 168671.16 samples/sec
+2018-03-03 18:38:44,748 Iter[2] Batch [44000] 	loss 4.7766043, ppl 118.7005929
+2018-03-03 18:39:46,778 Iter[2] Batch [45000]	Speed: 165080.74 samples/sec
+2018-03-03 18:39:46,779 Iter[2] Batch [45000] 	loss 4.7919605, ppl 120.5374565
+2018-03-03 18:40:47,694 Iter[2] Batch [46000]	Speed: 168101.88 samples/sec
+2018-03-03 18:40:47,694 Iter[2] Batch [46000] 	loss 4.7731184, ppl 118.2875305
+2018-03-03 18:41:48,867 Iter[2] Batch [47000]	Speed: 167394.88 samples/sec
+2018-03-03 18:41:48,867 Iter[2] Batch [47000] 	loss 4.7759102, ppl 118.6182266
+2018-03-03 18:42:50,310 Iter[2] Batch [48000]	Speed: 166657.30 samples/sec
+2018-03-03 18:42:50,311 Iter[2] Batch [48000] 	loss 4.7763641, ppl 118.6720804
+2018-03-03 18:43:53,350 Iter[2] Batch [49000]	Speed: 162437.91 samples/sec
+2018-03-03 18:43:53,350 Iter[2] Batch [49000] 	loss 4.7772734, ppl 118.7800469
+2018-03-03 18:44:54,986 Iter[2] Batch [50000]	Speed: 166137.44 samples/sec
+2018-03-03 18:44:54,986 Iter[2] Batch [50000] 	loss 4.7765937, ppl 118.6993410
+2018-03-03 18:45:55,347 Iter[2] Batch [51000]	Speed: 169645.37 samples/sec
+2018-03-03 18:45:55,348 Iter[2] Batch [51000] 	loss 4.7725066, ppl 118.2151939
+2018-03-03 18:46:56,188 Iter[2] Batch [52000]	Speed: 168308.62 samples/sec
+2018-03-03 18:46:56,188 Iter[2] Batch [52000] 	loss 4.7784762, ppl 118.9229937
+2018-03-03 18:48:00,031 Iter[2] Batch [53000]	Speed: 160394.03 samples/sec
+2018-03-03 18:48:00,031 Iter[2] Batch [53000] 	loss 4.7875273, ppl 120.0042722
+2018-03-03 18:49:03,172 Iter[2] Batch [54000]	Speed: 162177.55 samples/sec
+2018-03-03 18:49:03,172 Iter[2] Batch [54000] 	loss 4.7784039, ppl 118.9144000
+2018-03-03 18:50:04,027 Iter[2] Batch [55000]	Speed: 168268.16 samples/sec
+2018-03-03 18:50:04,028 Iter[2] Batch [55000] 	loss 4.7790777, ppl 118.9945548
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00066-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00008-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00027-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00022-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00071-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00053-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00055-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00005-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00095-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00097-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00034-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00012-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00050-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00062-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00065-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00033-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00090-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00045-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00015-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00078-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00094-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00036-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00017-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00068-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00004-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00084-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00049-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00073-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00042-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00030-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00093-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00064-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00075-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00063-of-00100
+Finished processing!
+2018-03-03 18:51:07,026 Iter[2] Batch [56000]	Speed: 162542.96 samples/sec
+2018-03-03 18:51:07,026 Iter[2] Batch [56000] 	loss 4.7713543, ppl 118.0790478
+2018-03-03 18:52:07,745 Iter[2] Batch [57000]	Speed: 168647.41 samples/sec
+2018-03-03 18:52:07,745 Iter[2] Batch [57000] 	loss 4.7696707, ppl 117.8804179
+2018-03-03 18:53:09,718 Iter[2] Batch [58000]	Speed: 165234.20 samples/sec
+2018-03-03 18:53:09,718 Iter[2] Batch [58000] 	loss 4.7755492, ppl 118.5754206
+2018-03-03 18:54:10,844 Iter[2] Batch [59000]	Speed: 167521.30 samples/sec
+2018-03-03 18:54:10,845 Iter[2] Batch [59000] 	loss 4.7699723, ppl 117.9159716
+2018-03-03 18:55:13,138 Iter[2] Batch [60000]	Speed: 164382.91 samples/sec
+2018-03-03 18:55:13,138 Iter[2] Batch [60000] 	loss 4.7771191, ppl 118.7617209
+2018-03-03 18:56:15,120 Iter[2] Batch [61000]	Speed: 165210.46 samples/sec
+2018-03-03 18:56:15,120 Iter[2] Batch [61000] 	loss 4.7744531, ppl 118.4455220
+2018-03-03 18:57:15,816 Iter[2] Batch [62000]	Speed: 168708.81 samples/sec
+2018-03-03 18:57:15,817 Iter[2] Batch [62000] 	loss 4.7708531, ppl 118.0198847
+2018-03-03 18:58:17,120 Iter[2] Batch [63000]	Speed: 167037.48 samples/sec
+2018-03-03 18:58:17,120 Iter[2] Batch [63000] 	loss 4.7656074, ppl 117.4024084
+2018-03-03 18:59:21,078 Iter[2] Batch [64000]	Speed: 160106.90 samples/sec
+2018-03-03 18:59:21,078 Iter[2] Batch [64000] 	loss 4.7762754, ppl 118.6615580
+2018-03-03 19:00:22,033 Iter[2] Batch [65000]	Speed: 167990.68 samples/sec
+2018-03-03 19:00:22,034 Iter[2] Batch [65000] 	loss 4.7787699, ppl 118.9579325
+2018-03-03 19:01:22,774 Iter[2] Batch [66000]	Speed: 168587.24 samples/sec
+2018-03-03 19:01:22,774 Iter[2] Batch [66000] 	loss 4.7625191, ppl 117.0403961
+2018-03-03 19:02:24,630 Iter[2] Batch [67000]	Speed: 165545.52 samples/sec
+2018-03-03 19:02:24,630 Iter[2] Batch [67000] 	loss 4.7685652, ppl 117.7501768
+2018-03-03 19:03:25,363 Iter[2] Batch [68000]	Speed: 168607.53 samples/sec
+2018-03-03 19:03:25,363 Iter[2] Batch [68000] 	loss 4.7681160, ppl 117.6972931
+2018-03-03 19:04:26,070 Iter[2] Batch [69000]	Speed: 168678.02 samples/sec
+2018-03-03 19:04:26,071 Iter[2] Batch [69000] 	loss 4.7689594, ppl 117.7965961
+2018-03-03 19:05:27,327 Iter[2] Batch [70000]	Speed: 167164.97 samples/sec
+2018-03-03 19:05:27,328 Iter[2] Batch [70000] 	loss 4.7653687, ppl 117.3743911
+2018-03-03 19:06:30,718 Iter[2] Batch [71000]	Speed: 161538.08 samples/sec
+2018-03-03 19:06:30,718 Iter[2] Batch [71000] 	loss 4.7656914, ppl 117.4122688
+2018-03-03 19:07:31,409 Iter[2] Batch [72000]	Speed: 168724.45 samples/sec
+2018-03-03 19:07:31,409 Iter[2] Batch [72000] 	loss 4.7713480, ppl 118.0783098
+2018-03-03 19:08:32,880 Iter[2] Batch [73000]	Speed: 166584.28 samples/sec
+2018-03-03 19:08:32,880 Iter[2] Batch [73000] 	loss 4.7653324, ppl 117.3701272
+2018-03-03 19:09:33,901 Iter[2] Batch [74000]	Speed: 167809.32 samples/sec
+2018-03-03 19:09:33,902 Iter[2] Batch [74000] 	loss 4.7875586, ppl 120.0080224
+2018-03-03 19:10:36,553 Iter[2] Batch [75000]	Speed: 163444.16 samples/sec
+2018-03-03 19:10:36,553 Iter[2] Batch [75000] 	loss 4.7734734, ppl 118.3295392
+2018-03-03 19:11:37,436 Iter[2] Batch [76000]	Speed: 168191.11 samples/sec
+2018-03-03 19:11:37,437 Iter[2] Batch [76000] 	loss 4.7731500, ppl 118.2912732
+2018-03-03 19:12:38,578 Iter[2] Batch [77000]	Speed: 167479.60 samples/sec
+2018-03-03 19:12:38,578 Iter[2] Batch [77000] 	loss 4.7698437, ppl 117.9008185
+2018-03-03 19:13:39,883 Iter[2] Batch [78000]	Speed: 167034.39 samples/sec
+2018-03-03 19:13:39,883 Iter[2] Batch [78000] 	loss 4.7656820, ppl 117.4111681
+2018-03-03 19:14:22,681 Saved checkpoint to "./checkpoint/-0000.params"
+2018-03-03 19:17:01,937 eval batch 19 : 4.0011024
+2018-03-03 19:19:47,294 eval batch 39 : 3.9968174
+2018-03-03 19:22:32,617 eval batch 59 : 4.0129429
+2018-03-03 19:23:28,170 Iter[2]		 CE loss 3.7865513, ppl 44.1040368. Time cost = 544.16 seconds
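(Note on reading the summary line above: the reported perplexity is consistent with exp() of the reported cross-entropy loss, so either number can be recovered from the other. A minimal check in plain Python, using only values copied from this log and nothing from the training code:

    import math

    # CE loss and ppl copied from the Iter[2] summary line above
    ce_loss = 3.7865513
    print(math.exp(ce_loss))  # ~44.104, matching the logged ppl 44.1040368

The same relation holds for the per-batch training lines, e.g. exp(4.7749316) is roughly 118.50.)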
+2018-03-03 19:24:31,174 Iter[3] Batch [1000]	Speed: 168240.02 samples/sec
+2018-03-03 19:24:31,174 Iter[3] Batch [1000] 	loss 4.7279176, ppl 113.0598787
+2018-03-03 19:25:32,696 Iter[3] Batch [2000]	Speed: 166445.21 samples/sec
+2018-03-03 19:25:32,696 Iter[3] Batch [2000] 	loss 4.7260813, ppl 112.8524541
+2018-03-03 19:26:33,962 Iter[3] Batch [3000]	Speed: 167140.86 samples/sec
+2018-03-03 19:26:33,962 Iter[3] Batch [3000] 	loss 4.7128227, ppl 111.3660648
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00037-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00077-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00089-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00098-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00029-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00074-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00080-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00091-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00026-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00051-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00070-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00057-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00056-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00001-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00025-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00061-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00047-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00020-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00016-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00011-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00058-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00009-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00079-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00041-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00024-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00086-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00028-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00081-of-00100
+Finished processing!
+[]
+Processing file: /home/ubuntu/gbw-validation/heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050
+Finished processing!
+reset
+reset
+Processing file: /home/ubuntu/gbw-validation/heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00013-of-00100
+Finished processing!
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00050-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00002-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00074-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00049-of-00100
+Finished processing!
+2018-03-03 19:27:37,394 Iter[3] Batch [4000]	Speed: 161433.94 samples/sec
+2018-03-03 19:27:37,394 Iter[3] Batch [4000] 	loss 4.7084793, ppl 110.8834109
+2018-03-03 19:28:38,155 Iter[3] Batch [5000]	Speed: 168529.62 samples/sec
+2018-03-03 19:28:38,155 Iter[3] Batch [5000] 	loss 4.7090238, ppl 110.9438068
+2018-03-03 19:29:39,703 Iter[3] Batch [6000]	Speed: 166373.91 samples/sec
+2018-03-03 19:29:39,703 Iter[3] Batch [6000] 	loss 4.7344289, ppl 113.7984506
+2018-03-03 19:30:39,929 Iter[3] Batch [7000]	Speed: 170026.78 samples/sec
+2018-03-03 19:30:39,929 Iter[3] Batch [7000] 	loss 4.7279629, ppl 113.0650018
+2018-03-03 19:31:43,872 Iter[3] Batch [8000]	Speed: 160143.12 samples/sec
+2018-03-03 19:31:43,872 Iter[3] Batch [8000] 	loss 4.7034707, ppl 110.3294294
+2018-03-03 19:32:44,760 Iter[3] Batch [9000]	Speed: 168177.69 samples/sec
+2018-03-03 19:32:44,760 Iter[3] Batch [9000] 	loss 4.7296973, ppl 113.2612691
+2018-03-03 19:33:45,477 Iter[3] Batch [10000]	Speed: 168651.28 samples/sec
+2018-03-03 19:33:45,477 Iter[3] Batch [10000] 	loss 4.7138438, ppl 111.4798381
+2018-03-03 19:34:46,075 Iter[3] Batch [11000]	Speed: 168984.02 samples/sec
+2018-03-03 19:34:46,075 Iter[3] Batch [11000] 	loss 4.7114805, ppl 111.2166909
+2018-03-03 19:35:48,366 Iter[3] Batch [12000]	Speed: 164388.65 samples/sec
+2018-03-03 19:35:48,367 Iter[3] Batch [12000] 	loss 4.7248113, ppl 112.7092313
+2018-03-03 19:36:50,484 Iter[3] Batch [13000]	Speed: 164848.45 samples/sec
+2018-03-03 19:36:50,484 Iter[3] Batch [13000] 	loss 4.7092559, ppl 110.9695522
+2018-03-03 19:37:51,138 Iter[3] Batch [14000]	Speed: 168828.11 samples/sec
+2018-03-03 19:37:51,138 Iter[3] Batch [14000] 	loss 4.7189695, ppl 112.0527263
+2018-03-03 19:38:55,003 Iter[3] Batch [15000]	Speed: 160337.79 samples/sec
+2018-03-03 19:38:55,003 Iter[3] Batch [15000] 	loss 4.7231359, ppl 112.5205574
+2018-03-03 19:39:56,144 Iter[3] Batch [16000]	Speed: 167483.40 samples/sec
+2018-03-03 19:39:56,144 Iter[3] Batch [16000] 	loss 4.7255566, ppl 112.7932662
+2018-03-03 19:40:57,480 Iter[3] Batch [17000]	Speed: 166947.80 samples/sec
+2018-03-03 19:40:57,481 Iter[3] Batch [17000] 	loss 4.7186621, ppl 112.0182842
+2018-03-03 19:41:58,329 Iter[3] Batch [18000]	Speed: 168286.98 samples/sec
+2018-03-03 19:41:58,329 Iter[3] Batch [18000] 	loss 4.7220215, ppl 112.3952284
+2018-03-03 19:43:01,676 Iter[3] Batch [19000]	Speed: 161650.22 samples/sec
+2018-03-03 19:43:01,676 Iter[3] Batch [19000] 	loss 4.7222984, ppl 112.4263609
+2018-03-03 19:44:03,891 Iter[3] Batch [20000]	Speed: 164591.84 samples/sec
+2018-03-03 19:44:03,891 Iter[3] Batch [20000] 	loss 4.7271016, ppl 112.9676577
+2018-03-03 19:45:05,940 Iter[3] Batch [21000]	Speed: 165030.00 samples/sec
+2018-03-03 19:45:05,940 Iter[3] Batch [21000] 	loss 4.7137883, ppl 111.4736546
+2018-03-03 19:46:06,650 Iter[3] Batch [22000]	Speed: 168672.14 samples/sec
+2018-03-03 19:46:06,650 Iter[3] Batch [22000] 	loss 4.7197758, ppl 112.1431053
+2018-03-03 19:47:09,127 Iter[3] Batch [23000]	Speed: 163900.87 samples/sec
+2018-03-03 19:47:09,127 Iter[3] Batch [23000] 	loss 4.7141898, ppl 111.5184272
+2018-03-03 19:48:10,727 Iter[3] Batch [24000]	Speed: 166232.69 samples/sec
+2018-03-03 19:48:10,727 Iter[3] Batch [24000] 	loss 4.7175047, ppl 111.8887067
+2018-03-03 19:49:11,476 Iter[3] Batch [25000]	Speed: 168562.39 samples/sec
+2018-03-03 19:49:11,477 Iter[3] Batch [25000] 	loss 4.7161285, ppl 111.7348346
+2018-03-03 19:50:11,939 Iter[3] Batch [26000]	Speed: 169361.73 samples/sec
+2018-03-03 19:50:11,939 Iter[3] Batch [26000] 	loss 4.7177453, ppl 111.9156332
+2018-03-03 19:51:15,673 Iter[3] Batch [27000]	Speed: 160667.94 samples/sec
+2018-03-03 19:51:15,673 Iter[3] Batch [27000] 	loss 4.7194840, ppl 112.1103870
+2018-03-03 19:52:16,737 Iter[3] Batch [28000]	Speed: 167694.46 samples/sec
+2018-03-03 19:52:16,737 Iter[3] Batch [28000] 	loss 4.7212895, ppl 112.3129817
+2018-03-03 19:53:18,572 Iter[3] Batch [29000]	Speed: 165600.07 samples/sec
+2018-03-03 19:53:18,573 Iter[3] Batch [29000] 	loss 4.7335465, ppl 113.6980766
+2018-03-03 19:54:21,929 Iter[3] Batch [30000]	Speed: 161626.14 samples/sec
+2018-03-03 19:54:21,929 Iter[3] Batch [30000] 	loss 4.7134488, ppl 111.4358209
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00001-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00082-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00018-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00046-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00024-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00034-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00094-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00075-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00063-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00072-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00077-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00058-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00017-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00015-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00023-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00009-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00059-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00011-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00022-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00048-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00089-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00098-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00010-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00086-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00045-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00073-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00026-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00080-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00084-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00093-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00087-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00064-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00091-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00056-of-00100
+Finished processing!
+2018-03-03 19:55:22,475 Iter[3] Batch [31000]	Speed: 169128.89 samples/sec
+2018-03-03 19:55:22,475 Iter[3] Batch [31000] 	loss 4.7096945, ppl 111.0182421
+2018-03-03 19:56:23,712 Iter[3] Batch [32000]	Speed: 167217.77 samples/sec
+2018-03-03 19:56:23,712 Iter[3] Batch [32000] 	loss 4.7275785, ppl 113.0215508
+2018-03-03 19:57:24,583 Iter[3] Batch [33000]	Speed: 168225.55 samples/sec
+2018-03-03 19:57:24,583 Iter[3] Batch [33000] 	loss 4.7258250, ppl 112.8235394
+2018-03-03 19:58:28,105 Iter[3] Batch [34000]	Speed: 161203.68 samples/sec
+2018-03-03 19:58:28,106 Iter[3] Batch [34000] 	loss 4.7237512, ppl 112.5898052
+2018-03-03 19:59:29,171 Iter[3] Batch [35000]	Speed: 167689.19 samples/sec
+2018-03-03 19:59:29,171 Iter[3] Batch [35000] 	loss 4.7231887, ppl 112.5264913
+2018-03-03 20:00:29,498 Iter[3] Batch [36000]	Speed: 169741.31 samples/sec
+2018-03-03 20:00:29,498 Iter[3] Batch [36000] 	loss 4.7219062, ppl 112.3822773
+2018-03-03 20:01:30,667 Iter[3] Batch [37000]	Speed: 167405.81 samples/sec
+2018-03-03 20:01:30,667 Iter[3] Batch [37000] 	loss 4.7183469, ppl 111.9829777
+2018-03-03 20:02:34,114 Iter[3] Batch [38000]	Speed: 161393.74 samples/sec
+2018-03-03 20:02:34,115 Iter[3] Batch [38000] 	loss 4.7142285, ppl 111.5227399
+2018-03-03 20:03:35,569 Iter[3] Batch [39000]	Speed: 166628.60 samples/sec
+2018-03-03 20:03:35,569 Iter[3] Batch [39000] 	loss 4.7194898, ppl 112.1110439
+2018-03-03 20:04:36,625 Iter[3] Batch [40000]	Speed: 167714.31 samples/sec
+2018-03-03 20:04:36,625 Iter[3] Batch [40000] 	loss 4.7230742, ppl 112.5136130
+2018-03-03 20:05:40,398 Iter[3] Batch [41000]	Speed: 160569.44 samples/sec
+2018-03-03 20:05:40,399 Iter[3] Batch [41000] 	loss 4.7139078, ppl 111.4869800
+2018-03-03 20:06:41,394 Iter[3] Batch [42000]	Speed: 167881.72 samples/sec
+2018-03-03 20:06:41,394 Iter[3] Batch [42000] 	loss 4.7384203, ppl 114.2535741
+2018-03-03 20:07:43,352 Iter[3] Batch [43000]	Speed: 165272.96 samples/sec
+2018-03-03 20:07:43,352 Iter[3] Batch [43000] 	loss 4.7246977, ppl 112.6964202
+2018-03-03 20:08:45,001 Iter[3] Batch [44000]	Speed: 166102.52 samples/sec
+2018-03-03 20:08:45,001 Iter[3] Batch [44000] 	loss 4.7228605, ppl 112.4895746
+2018-03-03 20:09:47,733 Iter[3] Batch [45000]	Speed: 163233.41 samples/sec
+2018-03-03 20:09:47,734 Iter[3] Batch [45000] 	loss 4.7156781, ppl 111.6845216
+2018-03-03 20:10:49,177 Iter[3] Batch [46000]	Speed: 166658.04 samples/sec
+2018-03-03 20:10:49,177 Iter[3] Batch [46000] 	loss 4.7250000, ppl 112.7304984
+2018-03-03 20:11:49,982 Iter[3] Batch [47000]	Speed: 168406.89 samples/sec
+2018-03-03 20:11:49,982 Iter[3] Batch [47000] 	loss 4.7175859, ppl 111.8977981
+2018-03-03 20:12:51,240 Iter[3] Batch [48000]	Speed: 167162.50 samples/sec
+2018-03-03 20:12:51,240 Iter[3] Batch [48000] 	loss 4.7198020, ppl 112.1460403
+2018-03-03 20:13:54,503 Iter[3] Batch [49000]	Speed: 161864.74 samples/sec
+2018-03-03 20:13:54,503 Iter[3] Batch [49000] 	loss 4.7313398, ppl 113.4474624
+2018-03-03 20:14:55,497 Iter[3] Batch [50000]	Speed: 167884.66 samples/sec
+2018-03-03 20:14:55,497 Iter[3] Batch [50000] 	loss 4.7257383, ppl 112.8137559
+2018-03-03 20:15:56,248 Iter[3] Batch [51000]	Speed: 168558.77 samples/sec
+2018-03-03 20:15:56,248 Iter[3] Batch [51000] 	loss 4.7163148, ppl 111.7556558
+2018-03-03 20:16:57,449 Iter[3] Batch [52000]	Speed: 167317.34 samples/sec
+2018-03-03 20:16:57,449 Iter[3] Batch [52000] 	loss 4.7201355, ppl 112.1834578
+2018-03-03 20:18:00,262 Iter[3] Batch [53000]	Speed: 163023.02 samples/sec
+2018-03-03 20:18:00,263 Iter[3] Batch [53000] 	loss 4.7174039, ppl 111.8774310
+2018-03-03 20:19:00,896 Iter[3] Batch [54000]	Speed: 168883.81 samples/sec
+2018-03-03 20:19:00,896 Iter[3] Batch [54000] 	loss 4.7200719, ppl 112.1763151
+2018-03-03 20:20:02,353 Iter[3] Batch [55000]	Speed: 166621.76 samples/sec
+2018-03-03 20:20:02,353 Iter[3] Batch [55000] 	loss 4.7214602, ppl 112.3321555
+2018-03-03 20:21:04,969 Iter[3] Batch [56000]	Speed: 163535.99 samples/sec
+2018-03-03 20:21:04,969 Iter[3] Batch [56000] 	loss 4.7273848, ppl 112.9996550
+2018-03-03 20:22:06,606 Iter[3] Batch [57000]	Speed: 166135.38 samples/sec
+2018-03-03 20:22:06,606 Iter[3] Batch [57000] 	loss 4.7287137, ppl 113.1499208
+2018-03-03 20:23:07,358 Iter[3] Batch [58000]	Speed: 168553.45 samples/sec
+2018-03-03 20:23:07,358 Iter[3] Batch [58000] 	loss 4.7222730, ppl 112.4235064
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00061-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00060-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00029-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00008-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00044-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00066-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00004-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00005-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00025-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00057-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00076-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00052-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00081-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00040-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00083-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00092-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00051-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00047-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00097-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00037-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00027-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00068-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00007-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00078-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00020-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00012-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00042-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00095-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00036-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00032-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00062-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00035-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00031-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00054-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00003-of-00100
+Finished processing!
+2018-03-03 20:24:07,572 Iter[3] Batch [59000]	Speed: 170060.55 samples/sec
+2018-03-03 20:24:07,572 Iter[3] Batch [59000] 	loss 4.7266918, ppl 112.9213769
+2018-03-03 20:25:10,933 Iter[3] Batch [60000]	Speed: 161613.48 samples/sec
+2018-03-03 20:25:10,934 Iter[3] Batch [60000] 	loss 4.7295047, ppl 113.2394595
+2018-03-03 20:26:11,557 Iter[3] Batch [61000]	Speed: 168911.31 samples/sec
+2018-03-03 20:26:11,557 Iter[3] Batch [61000] 	loss 4.7181480, ppl 111.9607146
+2018-03-03 20:27:12,238 Iter[3] Batch [62000]	Speed: 168751.16 samples/sec
+2018-03-03 20:27:12,238 Iter[3] Batch [62000] 	loss 4.7144312, ppl 111.5453517
+2018-03-03 20:28:13,363 Iter[3] Batch [63000]	Speed: 167527.28 samples/sec
+2018-03-03 20:28:13,363 Iter[3] Batch [63000] 	loss 4.7130520, ppl 111.3916036
+2018-03-03 20:29:16,628 Iter[3] Batch [64000]	Speed: 161858.11 samples/sec
+2018-03-03 20:29:16,628 Iter[3] Batch [64000] 	loss 4.7234320, ppl 112.5538790
+2018-03-03 20:30:18,411 Iter[3] Batch [65000]	Speed: 165742.62 samples/sec
+2018-03-03 20:30:18,411 Iter[3] Batch [65000] 	loss 4.7249098, ppl 112.7203267
+2018-03-03 20:31:18,789 Iter[3] Batch [66000]	Speed: 169599.11 samples/sec
+2018-03-03 20:31:18,789 Iter[3] Batch [66000] 	loss 4.7293855, ppl 113.2259689
+2018-03-03 20:32:22,570 Iter[3] Batch [67000]	Speed: 160550.27 samples/sec
+2018-03-03 20:32:22,570 Iter[3] Batch [67000] 	loss 4.7242328, ppl 112.6440461
+2018-03-03 20:33:24,582 Iter[3] Batch [68000]	Speed: 165127.96 samples/sec
+2018-03-03 20:33:24,582 Iter[3] Batch [68000] 	loss 4.7246758, ppl 112.6939550
+2018-03-03 20:34:25,611 Iter[3] Batch [69000]	Speed: 167790.81 samples/sec
+2018-03-03 20:34:25,611 Iter[3] Batch [69000] 	loss 4.7248352, ppl 112.7119170
+2018-03-03 20:35:25,902 Iter[3] Batch [70000]	Speed: 169842.14 samples/sec
+2018-03-03 20:35:25,902 Iter[3] Batch [70000] 	loss 4.7215641, ppl 112.3438281
+2018-03-03 20:36:28,977 Iter[3] Batch [71000]	Speed: 162347.11 samples/sec
+2018-03-03 20:36:28,977 Iter[3] Batch [71000] 	loss 4.7258125, ppl 112.8221291
+2018-03-03 20:37:30,700 Iter[3] Batch [72000]	Speed: 165903.98 samples/sec
+2018-03-03 20:37:30,700 Iter[3] Batch [72000] 	loss 4.7194414, ppl 112.1056137
+2018-03-03 20:38:31,765 Iter[3] Batch [73000]	Speed: 167690.59 samples/sec
+2018-03-03 20:38:31,765 Iter[3] Batch [73000] 	loss 4.7210375, ppl 112.2846876
+2018-03-03 20:39:33,349 Iter[3] Batch [74000]	Speed: 166278.01 samples/sec
+2018-03-03 20:39:33,349 Iter[3] Batch [74000] 	loss 4.7322613, ppl 113.5520507
+2018-03-03 20:40:36,157 Iter[3] Batch [75000]	Speed: 163035.26 samples/sec
+2018-03-03 20:40:36,157 Iter[3] Batch [75000] 	loss 4.7210605, ppl 112.2872755
+2018-03-03 20:41:36,750 Iter[3] Batch [76000]	Speed: 168997.13 samples/sec
+2018-03-03 20:41:36,750 Iter[3] Batch [76000] 	loss 4.7188699, ppl 112.0415654
+2018-03-03 20:42:38,733 Iter[3] Batch [77000]	Speed: 165207.56 samples/sec
+2018-03-03 20:42:38,733 Iter[3] Batch [77000] 	loss 4.7161164, ppl 111.7334815
+2018-03-03 20:43:40,074 Iter[3] Batch [78000]	Speed: 166935.52 samples/sec
+2018-03-03 20:43:40,074 Iter[3] Batch [78000] 	loss 4.7150875, ppl 111.6185774
+2018-03-03 20:44:17,189 Saved checkpoint to "./checkpoint/-0000.params"
+2018-03-03 20:46:54,742 eval batch 19 : 3.9828273
+2018-03-03 20:49:37,898 eval batch 39 : 3.9785287
+2018-03-03 20:52:21,136 eval batch 59 : 3.9940803
+2018-03-03 20:53:15,988 Iter[3]		 CE loss 3.7686789, ppl 43.3227931. Time cost = 537.46 seconds
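(Note on the Speed column: it appears to report samples per second averaged over each 1000-batch logging interval, and pairs of consecutive Speed lines imply roughly 10,240 samples per batch; presumably this is sequences per batch times unrolled time steps times number of devices, but that breakdown is an assumption and is not stated in the log itself. A quick consistency check on logged timestamps, using only standard-library Python and values copied from the Iter[3] Batch [1000] and [2000] lines above:

    from datetime import datetime

    fmt = "%Y-%m-%d %H:%M:%S,%f"
    # timestamps of two consecutive Speed lines from Iter[3] above
    t0 = datetime.strptime("2018-03-03 19:24:31,174", fmt)
    t1 = datetime.strptime("2018-03-03 19:25:32,696", fmt)
    elapsed = (t1 - t0).total_seconds()        # ~61.5 s for 1000 batches
    samples_per_batch = 166445.21 * elapsed / 1000.0
    print(samples_per_batch)                   # ~10240
)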
+2018-03-03 20:54:18,958 Iter[4] Batch [1000]	Speed: 168356.69 samples/sec
+2018-03-03 20:54:18,958 Iter[4] Batch [1000] 	loss 4.6713051, ppl 106.8370822
+2018-03-03 20:55:20,675 Iter[4] Batch [2000]	Speed: 165920.19 samples/sec
+2018-03-03 20:55:20,675 Iter[4] Batch [2000] 	loss 4.6659090, ppl 106.2621320
+2018-03-03 20:56:22,134 Iter[4] Batch [3000]	Speed: 166615.04 samples/sec
+2018-03-03 20:56:22,134 Iter[4] Batch [3000] 	loss 4.6615000, ppl 105.7946552
+2018-03-03 20:57:25,694 Iter[4] Batch [4000]	Speed: 161106.67 samples/sec
+2018-03-03 20:57:25,695 Iter[4] Batch [4000] 	loss 4.6615590, ppl 105.8008956
+2018-03-03 20:58:27,231 Iter[4] Batch [5000]	Speed: 166405.34 samples/sec
+2018-03-03 20:58:27,231 Iter[4] Batch [5000] 	loss 4.6748719, ppl 107.2188288
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00070-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00039-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00041-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00065-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00090-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00016-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00006-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00019-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00021-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00038-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00053-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00067-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00030-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00028-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00085-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00014-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00099-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00088-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00071-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00079-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00055-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00069-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00043-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00033-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00096-of-00100
+Finished processing!
+[]
+Processing file: /home/ubuntu/gbw-validation/heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050
+Finished processing!
+reset
+reset
+Processing file: /home/ubuntu/gbw-validation/heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00052-of-00100
+Finished processing!
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00036-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00049-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00065-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00062-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00092-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00001-of-00100
+Finished processing!
+2018-03-03 20:59:28,082 Iter[4] Batch [6000]	Speed: 168279.43 samples/sec
+2018-03-03 20:59:28,083 Iter[4] Batch [6000] 	loss 4.6702957, ppl 106.7292980
+2018-03-03 21:00:29,418 Iter[4] Batch [7000]	Speed: 166950.91 samples/sec
+2018-03-03 21:00:29,418 Iter[4] Batch [7000] 	loss 4.6712289, ppl 106.8289446
+2018-03-03 21:01:33,622 Iter[4] Batch [8000]	Speed: 159491.34 samples/sec
+2018-03-03 21:01:33,623 Iter[4] Batch [8000] 	loss 4.6607652, ppl 105.7169495
+2018-03-03 21:02:34,805 Iter[4] Batch [9000]	Speed: 167369.21 samples/sec
+2018-03-03 21:02:34,805 Iter[4] Batch [9000] 	loss 4.6752902, ppl 107.2636942
+2018-03-03 21:03:35,786 Iter[4] Batch [10000]	Speed: 167920.81 samples/sec
+2018-03-03 21:03:35,786 Iter[4] Batch [10000] 	loss 4.6758957, ppl 107.3286587
+2018-03-03 21:04:36,684 Iter[4] Batch [11000]	Speed: 168150.52 samples/sec
+2018-03-03 21:04:36,684 Iter[4] Batch [11000] 	loss 4.6673668, ppl 106.4171552
+2018-03-03 21:05:40,167 Iter[4] Batch [12000]	Speed: 161304.02 samples/sec
+2018-03-03 21:05:40,167 Iter[4] Batch [12000] 	loss 4.6841004, ppl 108.2128792
+2018-03-03 21:06:41,380 Iter[4] Batch [13000]	Speed: 167284.78 samples/sec
+2018-03-03 21:06:41,380 Iter[4] Batch [13000] 	loss 4.6728297, ppl 107.0000913
+2018-03-03 21:07:42,706 Iter[4] Batch [14000]	Speed: 166977.39 samples/sec
+2018-03-03 21:07:42,706 Iter[4] Batch [14000] 	loss 4.6733598, ppl 107.0568247
+2018-03-03 21:08:45,051 Iter[4] Batch [15000]	Speed: 164245.93 samples/sec
+2018-03-03 21:08:45,052 Iter[4] Batch [15000] 	loss 4.6761488, ppl 107.3558297
+2018-03-03 21:09:45,303 Iter[4] Batch [16000]	Speed: 169954.58 samples/sec
+2018-03-03 21:09:45,303 Iter[4] Batch [16000] 	loss 4.6823297, ppl 108.0214358
+2018-03-03 21:10:45,865 Iter[4] Batch [17000]	Speed: 169081.80 samples/sec
+2018-03-03 21:10:45,866 Iter[4] Batch [17000] 	loss 4.6778836, ppl 107.5422285
+2018-03-03 21:11:46,533 Iter[4] Batch [18000]	Speed: 168789.00 samples/sec
+2018-03-03 21:11:46,533 Iter[4] Batch [18000] 	loss 4.6794219, ppl 107.7077860
+2018-03-03 21:12:49,471 Iter[4] Batch [19000]	Speed: 162700.59 samples/sec
+2018-03-03 21:12:49,471 Iter[4] Batch [19000] 	loss 4.6764648, ppl 107.3897612
+2018-03-03 21:13:51,063 Iter[4] Batch [20000]	Speed: 166255.18 samples/sec
+2018-03-03 21:13:51,063 Iter[4] Batch [20000] 	loss 4.6822977, ppl 108.0179758
+2018-03-03 21:14:52,810 Iter[4] Batch [21000]	Speed: 165838.00 samples/sec
+2018-03-03 21:14:52,811 Iter[4] Batch [21000] 	loss 4.6856641, ppl 108.3822210
+2018-03-03 21:15:54,415 Iter[4] Batch [22000]	Speed: 166220.81 samples/sec
+2018-03-03 21:15:54,416 Iter[4] Batch [22000] 	loss 4.6729809, ppl 107.0162679
+2018-03-03 21:16:58,445 Iter[4] Batch [23000]	Speed: 159926.36 samples/sec
+2018-03-03 21:16:58,445 Iter[4] Batch [23000] 	loss 4.6808180, ppl 107.8582612
+2018-03-03 21:18:00,115 Iter[4] Batch [24000]	Speed: 166046.57 samples/sec
+2018-03-03 21:18:00,115 Iter[4] Batch [24000] 	loss 4.6821898, ppl 108.0063308
+2018-03-03 21:19:01,309 Iter[4] Batch [25000]	Speed: 167337.30 samples/sec
+2018-03-03 21:19:01,309 Iter[4] Batch [25000] 	loss 4.6773578, ppl 107.4856997
+2018-03-03 21:20:04,209 Iter[4] Batch [26000]	Speed: 162796.89 samples/sec
+2018-03-03 21:20:04,209 Iter[4] Batch [26000] 	loss 4.6751676, ppl 107.2505384
+2018-03-03 21:21:05,113 Iter[4] Batch [27000]	Speed: 168133.38 samples/sec
+2018-03-03 21:21:05,114 Iter[4] Batch [27000] 	loss 4.6750262, ppl 107.2353736
+2018-03-03 21:22:06,122 Iter[4] Batch [28000]	Speed: 167844.33 samples/sec
+2018-03-03 21:22:06,123 Iter[4] Batch [28000] 	loss 4.6800562, ppl 107.7761348
+2018-03-03 21:23:07,523 Iter[4] Batch [29000]	Speed: 166775.10 samples/sec
+2018-03-03 21:23:07,523 Iter[4] Batch [29000] 	loss 4.6846770, ppl 108.2752887
+2018-03-03 21:24:10,647 Iter[4] Batch [30000]	Speed: 162220.52 samples/sec
+2018-03-03 21:24:10,647 Iter[4] Batch [30000] 	loss 4.6852688, ppl 108.3393846
+2018-03-03 21:25:11,753 Iter[4] Batch [31000]	Speed: 167577.15 samples/sec
+2018-03-03 21:25:11,753 Iter[4] Batch [31000] 	loss 4.6711660, ppl 106.8222262
+2018-03-03 21:26:12,786 Iter[4] Batch [32000]	Speed: 167778.35 samples/sec
+2018-03-03 21:26:12,787 Iter[4] Batch [32000] 	loss 4.6766195, ppl 107.4063743
+2018-03-03 21:27:13,787 Iter[4] Batch [33000]	Speed: 167867.08 samples/sec
+2018-03-03 21:27:13,787 Iter[4] Batch [33000] 	loss 4.6689219, ppl 106.5827709
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00095-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00031-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00028-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00070-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00027-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00015-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00021-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00011-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00097-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00012-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00042-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00089-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00080-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00022-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00025-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00048-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00045-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00046-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00010-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00050-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00055-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00074-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00013-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00081-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00039-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00003-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00026-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00071-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00018-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00044-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00072-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00016-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00066-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00093-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00033-of-00100
+Finished processing!
+2018-03-03 21:28:17,717 Iter[4] Batch [34000]	Speed: 160174.79 samples/sec
+2018-03-03 21:28:17,718 Iter[4] Batch [34000] 	loss 4.6882105, ppl 108.6585663
+2018-03-03 21:29:18,628 Iter[4] Batch [35000]	Speed: 168116.86 samples/sec
+2018-03-03 21:29:18,628 Iter[4] Batch [35000] 	loss 4.6772289, ppl 107.4718450
+2018-03-03 21:30:19,050 Iter[4] Batch [36000]	Speed: 169474.70 samples/sec
+2018-03-03 21:30:19,050 Iter[4] Batch [36000] 	loss 4.6867871, ppl 108.5040077
+2018-03-03 21:31:19,414 Iter[4] Batch [37000]	Speed: 169638.38 samples/sec
+2018-03-03 21:31:19,414 Iter[4] Batch [37000] 	loss 4.6836020, ppl 108.1589553
+2018-03-03 21:32:23,715 Iter[4] Batch [38000]	Speed: 159250.05 samples/sec
+2018-03-03 21:32:23,715 Iter[4] Batch [38000] 	loss 4.6810250, ppl 107.8805935
+2018-03-03 21:33:25,155 Iter[4] Batch [39000]	Speed: 166668.47 samples/sec
+2018-03-03 21:33:25,155 Iter[4] Batch [39000] 	loss 4.6811418, ppl 107.8931944
+2018-03-03 21:34:26,405 Iter[4] Batch [40000]	Speed: 167184.59 samples/sec
+2018-03-03 21:34:26,405 Iter[4] Batch [40000] 	loss 4.6754656, ppl 107.2825089
+2018-03-03 21:35:29,326 Iter[4] Batch [41000]	Speed: 162742.81 samples/sec
+2018-03-03 21:35:29,326 Iter[4] Batch [41000] 	loss 4.6750211, ppl 107.2348291
+2018-03-03 21:36:28,947 Iter[4] Batch [42000]	Speed: 171752.97 samples/sec
+2018-03-03 21:36:28,947 Iter[4] Batch [42000] 	loss 4.6878016, ppl 108.6141358
+2018-03-03 21:37:29,718 Iter[4] Batch [43000]	Speed: 168500.35 samples/sec
+2018-03-03 21:37:29,719 Iter[4] Batch [43000] 	loss 4.6837215, ppl 108.1718844
+2018-03-03 21:38:30,896 Iter[4] Batch [44000]	Speed: 167382.22 samples/sec
+2018-03-03 21:38:30,896 Iter[4] Batch [44000] 	loss 4.6773859, ppl 107.4887228
+2018-03-03 21:39:34,801 Iter[4] Batch [45000]	Speed: 160238.14 samples/sec
+2018-03-03 21:39:34,801 Iter[4] Batch [45000] 	loss 4.6817031, ppl 107.9537749
+2018-03-03 21:40:35,431 Iter[4] Batch [46000]	Speed: 168893.76 samples/sec
+2018-03-03 21:40:35,431 Iter[4] Batch [46000] 	loss 4.6871313, ppl 108.5413547
+2018-03-03 21:41:37,353 Iter[4] Batch [47000]	Speed: 165370.15 samples/sec
+2018-03-03 21:41:37,353 Iter[4] Batch [47000] 	loss 4.6881148, ppl 108.6481679
+2018-03-03 21:42:38,096 Iter[4] Batch [48000]	Speed: 168579.64 samples/sec
+2018-03-03 21:42:38,096 Iter[4] Batch [48000] 	loss 4.6801965, ppl 107.7912498
+2018-03-03 21:43:41,221 Iter[4] Batch [49000]	Speed: 162218.24 samples/sec
+2018-03-03 21:43:41,221 Iter[4] Batch [49000] 	loss 4.6754977, ppl 107.2859453
+2018-03-03 21:44:41,669 Iter[4] Batch [50000]	Speed: 169401.68 samples/sec
+2018-03-03 21:44:41,669 Iter[4] Batch [50000] 	loss 4.6764512, ppl 107.3882929
+2018-03-03 21:45:42,800 Iter[4] Batch [51000]	Speed: 167509.67 samples/sec
+2018-03-03 21:45:42,800 Iter[4] Batch [51000] 	loss 4.6814844, ppl 107.9301626
+2018-03-03 21:46:44,275 Iter[4] Batch [52000]	Speed: 166572.07 samples/sec
+2018-03-03 21:46:44,275 Iter[4] Batch [52000] 	loss 4.6819246, ppl 107.9776876
+2018-03-03 21:47:46,399 Iter[4] Batch [53000]	Speed: 164830.71 samples/sec
+2018-03-03 21:47:46,400 Iter[4] Batch [53000] 	loss 4.6777020, ppl 107.5226962
+2018-03-03 21:48:47,003 Iter[4] Batch [54000]	Speed: 168968.69 samples/sec
+2018-03-03 21:48:47,003 Iter[4] Batch [54000] 	loss 4.6802113, ppl 107.7928498
+2018-03-03 21:49:48,531 Iter[4] Batch [55000]	Speed: 166427.16 samples/sec
+2018-03-03 21:49:48,531 Iter[4] Batch [55000] 	loss 4.6871184, ppl 108.5399556
+2018-03-03 21:50:51,650 Iter[4] Batch [56000]	Speed: 162233.15 samples/sec
+2018-03-03 21:50:51,651 Iter[4] Batch [56000] 	loss 4.6891434, ppl 108.7599717
+2018-03-03 21:51:53,132 Iter[4] Batch [57000]	Speed: 166554.55 samples/sec
+2018-03-03 21:51:53,132 Iter[4] Batch [57000] 	loss 4.6902535, ppl 108.8807793
+2018-03-03 21:52:54,681 Iter[4] Batch [58000]	Speed: 166371.18 samples/sec
+2018-03-03 21:52:54,681 Iter[4] Batch [58000] 	loss 4.6775125, ppl 107.5023277
+2018-03-03 21:53:55,271 Iter[4] Batch [59000]	Speed: 169005.79 samples/sec
+2018-03-03 21:53:55,271 Iter[4] Batch [59000] 	loss 4.6836844, ppl 108.1678703
+2018-03-03 21:54:57,621 Iter[4] Batch [60000]	Speed: 164233.54 samples/sec
+2018-03-03 21:54:57,622 Iter[4] Batch [60000] 	loss 4.6822191, ppl 108.0094951
+l.tokenized.shuffled/news.en-00082-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00029-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00056-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00040-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00004-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00098-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00038-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00006-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00099-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00091-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00035-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00034-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00090-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00009-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00083-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00030-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00078-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00002-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00037-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00068-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00069-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00067-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00076-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00032-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00047-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00051-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00086-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00007-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00017-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00064-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00058-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00057-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00077-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00008-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00073-of-00100
+Finished processing!
+Processing file: /h2018-03-03 21:55:58,013 Iter[4] Batch [61000]	Speed: 169559.58 samples/sec
+2018-03-03 21:55:58,014 Iter[4] Batch [61000] 	loss 4.6820883, ppl 107.9953619
+2018-03-03 21:56:58,668 Iter[4] Batch [62000]	Speed: 168824.87 samples/sec
+2018-03-03 21:56:58,668 Iter[4] Batch [62000] 	loss 4.6781777, ppl 107.5738657
+2018-03-03 21:57:59,776 Iter[4] Batch [63000]	Speed: 167571.93 samples/sec
+2018-03-03 21:57:59,777 Iter[4] Batch [63000] 	loss 4.6847164, ppl 108.2795605
+2018-03-03 21:59:03,315 Iter[4] Batch [64000]	Speed: 161163.19 samples/sec
+2018-03-03 21:59:03,315 Iter[4] Batch [64000] 	loss 4.6907465, ppl 108.9344673
+2018-03-03 22:00:04,290 Iter[4] Batch [65000]	Speed: 167937.53 samples/sec
+2018-03-03 22:00:04,290 Iter[4] Batch [65000] 	loss 4.6854340, ppl 108.3572875
+2018-03-03 22:01:05,474 Iter[4] Batch [66000]	Speed: 167365.31 samples/sec
+2018-03-03 22:01:05,474 Iter[4] Batch [66000] 	loss 4.6859516, ppl 108.4133854
+2018-03-03 22:02:08,590 Iter[4] Batch [67000]	Speed: 162239.55 samples/sec
+2018-03-03 22:02:08,591 Iter[4] Batch [67000] 	loss 4.6805539, ppl 107.8297836
+2018-03-03 22:03:09,279 Iter[4] Batch [68000]	Speed: 168729.87 samples/sec
+2018-03-03 22:03:09,279 Iter[4] Batch [68000] 	loss 4.6766230, ppl 107.4067519
+2018-03-03 22:04:10,239 Iter[4] Batch [69000]	Speed: 167979.92 samples/sec
+2018-03-03 22:04:10,239 Iter[4] Batch [69000] 	loss 4.6789004, ppl 107.6516327
+2018-03-03 22:05:10,710 Iter[4] Batch [70000]	Speed: 169337.77 samples/sec
+2018-03-03 22:05:10,710 Iter[4] Batch [70000] 	loss 4.6820840, ppl 107.9948979
+2018-03-03 22:06:13,413 Iter[4] Batch [71000]	Speed: 163309.59 samples/sec
+2018-03-03 22:06:13,413 Iter[4] Batch [71000] 	loss 4.6818992, ppl 107.9749460
+2018-03-03 22:07:15,337 Iter[4] Batch [72000]	Speed: 165365.03 samples/sec
+2018-03-03 22:07:15,337 Iter[4] Batch [72000] 	loss 4.6760105, ppl 107.3409854
+2018-03-03 22:08:16,058 Iter[4] Batch [73000]	Speed: 168640.08 samples/sec
+2018-03-03 22:08:16,058 Iter[4] Batch [73000] 	loss 4.6804441, ppl 107.8179483
+2018-03-03 22:09:18,435 Iter[4] Batch [74000]	Speed: 164164.93 samples/sec
+2018-03-03 22:09:18,435 Iter[4] Batch [74000] 	loss 4.6881242, ppl 108.6491864
+2018-03-03 22:10:21,857 Iter[4] Batch [75000]	Speed: 161459.01 samples/sec
+2018-03-03 22:10:21,857 Iter[4] Batch [75000] 	loss 4.6846973, ppl 108.2774880
+2018-03-03 22:11:23,414 Iter[4] Batch [76000]	Speed: 166348.50 samples/sec
+2018-03-03 22:11:23,414 Iter[4] Batch [76000] 	loss 4.6812754, ppl 107.9076092
+2018-03-03 22:12:24,264 Iter[4] Batch [77000]	Speed: 168283.41 samples/sec
+2018-03-03 22:12:24,264 Iter[4] Batch [77000] 	loss 4.6758273, ppl 107.3213220
+2018-03-03 22:13:25,173 Iter[4] Batch [78000]	Speed: 168120.58 samples/sec
+2018-03-03 22:13:25,173 Iter[4] Batch [78000] 	loss 4.6906129, ppl 108.9199153
+2018-03-03 22:13:55,033 Saved checkpoint to "./checkpoint/-0000.params"
+2018-03-03 22:16:34,217 eval batch 19 : 3.9664347
+2018-03-03 22:19:18,274 eval batch 39 : 3.9636856
+2018-03-03 22:22:02,792 eval batch 59 : 3.9800350
+2018-03-03 22:22:57,749 Iter[4]		 CE loss 3.7552670, ppl 42.7456334. Time cost = 541.38 seconds
+2018-03-03 22:24:00,795 Iter[5] Batch [1000]	Speed: 168079.73 samples/sec
+2018-03-03 22:24:00,795 Iter[5] Batch [1000] 	loss 4.6442629, ppl 103.9866880
+2018-03-03 22:25:01,735 Iter[5] Batch [2000]	Speed: 168034.88 samples/sec
+2018-03-03 22:25:01,735 Iter[5] Batch [2000] 	loss 4.6387754, ppl 103.4176238
+2018-03-03 22:26:03,264 Iter[5] Batch [3000]	Speed: 166425.72 samples/sec
+2018-03-03 22:26:03,264 Iter[5] Batch [3000] 	loss 4.6261934, ppl 102.1245717
+2018-03-03 22:27:06,725 Iter[5] Batch [4000]	Speed: 161359.74 samples/sec
+2018-03-03 22:27:06,725 Iter[5] Batch [4000] 	loss 4.6400707, ppl 103.5516688
+2018-03-03 22:28:07,725 Iter[5] Batch [5000]	Speed: 167868.96 samples/sec
+2018-03-03 22:28:07,725 Iter[5] Batch [5000] 	loss 4.6464637, ppl 104.2157919
+2018-03-03 22:29:08,087 Iter[5] Batch [6000]	Speed: 169642.12 samples/sec
+2018-03-03 22:29:08,088 Iter[5] Batch [6000] 	loss 4.6404652, ppl 103.5925312
+2018-03-03 22:30:08,763 Iter[5] Batch [7000]	Speed: 168766.15 samples/sec
+2018-03-03 22:30:08,763 Iter[5] Batch [7000] 	loss 4.6361500, ppl 103.1464683
+ome/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00041-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00085-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00094-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00060-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00087-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00023-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00084-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00063-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00053-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00020-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00054-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00014-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00075-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00043-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00005-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00024-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00059-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00019-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00088-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00061-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00096-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00079-of-00100
+Finished processing!
+[]
+Processing file: /home/ubuntu/gbw-validation/heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050
+Finished processing!
+reset
+reset
+Processing file: /home/ubuntu/gbw-validation/heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00095-of-00100
+Finished processing!
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00055-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00078-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00014-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00072-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00097-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00064-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00004-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00009-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00010-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingu2018-03-03 22:31:12,048 Iter[5] Batch [8000]	Speed: 161808.88 samples/sec
+2018-03-03 22:31:12,048 Iter[5] Batch [8000] 	loss 4.6444668, ppl 104.0078937
+2018-03-03 22:32:12,646 Iter[5] Batch [9000]	Speed: 168982.44 samples/sec
+2018-03-03 22:32:12,646 Iter[5] Batch [9000] 	loss 4.6412844, ppl 103.6774228
+2018-03-03 22:33:15,102 Iter[5] Batch [10000]	Speed: 163954.98 samples/sec
+2018-03-03 22:33:15,103 Iter[5] Batch [10000] 	loss 4.6300957, ppl 102.5238755
+2018-03-03 22:34:16,187 Iter[5] Batch [11000]	Speed: 167638.00 samples/sec
+2018-03-03 22:34:16,187 Iter[5] Batch [11000] 	loss 4.6393637, ppl 103.4784804
+2018-03-03 22:35:19,732 Iter[5] Batch [12000]	Speed: 161144.97 samples/sec
+2018-03-03 22:35:19,732 Iter[5] Batch [12000] 	loss 4.6416641, ppl 103.7167953
+2018-03-03 22:36:20,937 Iter[5] Batch [13000]	Speed: 167306.89 samples/sec
+2018-03-03 22:36:20,937 Iter[5] Batch [13000] 	loss 4.6391492, ppl 103.4562915
+2018-03-03 22:37:22,530 Iter[5] Batch [14000]	Speed: 166254.15 samples/sec
+2018-03-03 22:37:22,530 Iter[5] Batch [14000] 	loss 4.6427469, ppl 103.8291620
+2018-03-03 22:38:25,411 Iter[5] Batch [15000]	Speed: 162845.77 samples/sec
+2018-03-03 22:38:25,412 Iter[5] Batch [15000] 	loss 4.6423703, ppl 103.7900712
+2018-03-03 22:39:26,388 Iter[5] Batch [16000]	Speed: 167933.85 samples/sec
+2018-03-03 22:39:26,388 Iter[5] Batch [16000] 	loss 4.6348168, ppl 103.0090447
+2018-03-03 22:40:28,009 Iter[5] Batch [17000]	Speed: 166178.97 samples/sec
+2018-03-03 22:40:28,009 Iter[5] Batch [17000] 	loss 4.6413613, ppl 103.6854014
+2018-03-03 22:41:30,278 Iter[5] Batch [18000]	Speed: 164445.91 samples/sec
+2018-03-03 22:41:30,279 Iter[5] Batch [18000] 	loss 4.6507820, ppl 104.6668063
+2018-03-03 22:42:34,104 Iter[5] Batch [19000]	Speed: 160436.56 samples/sec
+2018-03-03 22:42:34,105 Iter[5] Batch [19000] 	loss 4.6366105, ppl 103.1939830
+2018-03-03 22:43:34,947 Iter[5] Batch [20000]	Speed: 168303.45 samples/sec
+2018-03-03 22:43:34,947 Iter[5] Batch [20000] 	loss 4.6319699, ppl 102.7162078
+2018-03-03 22:44:35,959 Iter[5] Batch [21000]	Speed: 167837.45 samples/sec
+2018-03-03 22:44:35,959 Iter[5] Batch [21000] 	loss 4.6401152, ppl 103.5562801
+2018-03-03 22:45:36,611 Iter[5] Batch [22000]	Speed: 168831.69 samples/sec
+2018-03-03 22:45:36,611 Iter[5] Batch [22000] 	loss 4.6375406, ppl 103.2900061
+2018-03-03 22:46:39,663 Iter[5] Batch [23000]	Speed: 162405.00 samples/sec
+2018-03-03 22:46:39,664 Iter[5] Batch [23000] 	loss 4.6466086, ppl 104.2308962
+2018-03-03 22:47:40,350 Iter[5] Batch [24000]	Speed: 168736.83 samples/sec
+2018-03-03 22:47:40,350 Iter[5] Batch [24000] 	loss 4.6403422, ppl 103.5797852
+2018-03-03 22:48:41,526 Iter[5] Batch [25000]	Speed: 167386.29 samples/sec
+2018-03-03 22:48:41,526 Iter[5] Batch [25000] 	loss 4.6374285, ppl 103.2784270
+2018-03-03 22:49:43,130 Iter[5] Batch [26000]	Speed: 166222.57 samples/sec
+2018-03-03 22:49:43,130 Iter[5] Batch [26000] 	loss 4.6492441, ppl 104.5059639
+2018-03-03 22:50:45,324 Iter[5] Batch [27000]	Speed: 164646.85 samples/sec
+2018-03-03 22:50:45,324 Iter[5] Batch [27000] 	loss 4.6427930, ppl 103.8339479
+2018-03-03 22:51:46,332 Iter[5] Batch [28000]	Speed: 167846.51 samples/sec
+2018-03-03 22:51:46,332 Iter[5] Batch [28000] 	loss 4.6436820, ppl 103.9263039
+2018-03-03 22:52:47,476 Iter[5] Batch [29000]	Speed: 167475.67 samples/sec
+2018-03-03 22:52:47,476 Iter[5] Batch [29000] 	loss 4.6451305, ppl 104.0769437
+2018-03-03 22:53:50,718 Iter[5] Batch [30000]	Speed: 161916.13 samples/sec
+2018-03-03 22:53:50,719 Iter[5] Batch [30000] 	loss 4.6388785, ppl 103.4282893
+2018-03-03 22:54:51,837 Iter[5] Batch [31000]	Speed: 167543.04 samples/sec
+2018-03-03 22:54:51,837 Iter[5] Batch [31000] 	loss 4.6411578, ppl 103.6643020
+2018-03-03 22:55:52,790 Iter[5] Batch [32000]	Speed: 167998.00 samples/sec
+2018-03-03 22:55:52,791 Iter[5] Batch [32000] 	loss 4.6429129, ppl 103.8464007
+2018-03-03 22:56:53,422 Iter[5] Batch [33000]	Speed: 168888.08 samples/sec
+2018-03-03 22:56:53,423 Iter[5] Batch [33000] 	loss 4.6489668, ppl 104.4769838
+2018-03-03 22:57:56,199 Iter[5] Batch [34000]	Speed: 163118.28 samples/sec
+2018-03-03 22:57:56,199 Iter[5] Batch [34000] 	loss 4.6510918, ppl 104.6992335
+2018-03-03 22:58:56,246 Iter[5] Batch [35000]	Speed: 170535.26 samples/sec
+2018-03-03 22:58:56,246 Iter[5] Batch [35000] 	loss 4.6524066, ppl 104.8369872
+al.tokenized.shuffled/news.en-00039-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00079-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00051-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00083-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00003-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00006-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00098-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00025-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00065-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00086-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00054-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00040-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00074-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00091-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00005-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00099-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00056-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00084-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00028-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00059-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00033-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00075-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00013-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00041-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00032-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00045-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00044-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00088-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00047-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00037-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00024-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00092-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00060-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00048-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00021-of-00100
+Finished processing!
+Processing file: /2018-03-03 22:59:58,338 Iter[5] Batch [36000]	Speed: 164916.03 samples/sec
+2018-03-03 22:59:58,338 Iter[5] Batch [36000] 	loss 4.6478309, ppl 104.3583719
+2018-03-03 23:01:00,186 Iter[5] Batch [37000]	Speed: 165568.60 samples/sec
+2018-03-03 23:01:00,186 Iter[5] Batch [37000] 	loss 4.6558211, ppl 105.1955600
+2018-03-03 23:02:02,621 Iter[5] Batch [38000]	Speed: 164009.45 samples/sec
+2018-03-03 23:02:02,621 Iter[5] Batch [38000] 	loss 4.6507973, ppl 104.6684008
+2018-03-03 23:03:03,222 Iter[5] Batch [39000]	Speed: 168974.69 samples/sec
+2018-03-03 23:03:03,222 Iter[5] Batch [39000] 	loss 4.6455488, ppl 104.1204944
+2018-03-03 23:04:04,577 Iter[5] Batch [40000]	Speed: 166899.72 samples/sec
+2018-03-03 23:04:04,577 Iter[5] Batch [40000] 	loss 4.6538289, ppl 104.9861993
+2018-03-03 23:05:07,221 Iter[5] Batch [41000]	Speed: 163463.51 samples/sec
+2018-03-03 23:05:07,221 Iter[5] Batch [41000] 	loss 4.6582336, ppl 105.4496506
+2018-03-03 23:06:07,900 Iter[5] Batch [42000]	Speed: 168757.01 samples/sec
+2018-03-03 23:06:07,900 Iter[5] Batch [42000] 	loss 4.6441734, ppl 103.9773864
+2018-03-03 23:07:08,768 Iter[5] Batch [43000]	Speed: 168232.63 samples/sec
+2018-03-03 23:07:08,768 Iter[5] Batch [43000] 	loss 4.6513883, ppl 104.7302798
+2018-03-03 23:08:09,713 Iter[5] Batch [44000]	Speed: 168022.21 samples/sec
+2018-03-03 23:08:09,713 Iter[5] Batch [44000] 	loss 4.6419422, ppl 103.7456455
+2018-03-03 23:09:13,680 Iter[5] Batch [45000]	Speed: 160080.74 samples/sec
+2018-03-03 23:09:13,681 Iter[5] Batch [45000] 	loss 4.6475699, ppl 104.3311444
+2018-03-03 23:10:14,935 Iter[5] Batch [46000]	Speed: 167171.78 samples/sec
+2018-03-03 23:10:14,935 Iter[5] Batch [46000] 	loss 4.6508914, ppl 104.6782549
+2018-03-03 23:11:16,250 Iter[5] Batch [47000]	Speed: 167006.73 samples/sec
+2018-03-03 23:11:16,250 Iter[5] Batch [47000] 	loss 4.6529027, ppl 104.8890090
+2018-03-03 23:12:18,324 Iter[5] Batch [48000]	Speed: 164965.91 samples/sec
+2018-03-03 23:12:18,324 Iter[5] Batch [48000] 	loss 4.6535426, ppl 104.9561431
+2018-03-03 23:13:21,607 Iter[5] Batch [49000]	Speed: 161812.58 samples/sec
+2018-03-03 23:13:21,607 Iter[5] Batch [49000] 	loss 4.6493863, ppl 104.5208244
+2018-03-03 23:14:22,196 Iter[5] Batch [50000]	Speed: 169008.05 samples/sec
+2018-03-03 23:14:22,196 Iter[5] Batch [50000] 	loss 4.6463805, ppl 104.2071212
+2018-03-03 23:15:24,504 Iter[5] Batch [51000]	Speed: 164345.30 samples/sec
+2018-03-03 23:15:24,504 Iter[5] Batch [51000] 	loss 4.6463676, ppl 104.2057779
+2018-03-03 23:16:25,900 Iter[5] Batch [52000]	Speed: 166786.32 samples/sec
+2018-03-03 23:16:25,900 Iter[5] Batch [52000] 	loss 4.6491625, ppl 104.4974323
+2018-03-03 23:17:28,462 Iter[5] Batch [53000]	Speed: 163678.35 samples/sec
+2018-03-03 23:17:28,462 Iter[5] Batch [53000] 	loss 4.6490488, ppl 104.4855546
+2018-03-03 23:18:29,390 Iter[5] Batch [54000]	Speed: 168067.84 samples/sec
+2018-03-03 23:18:29,390 Iter[5] Batch [54000] 	loss 4.6531852, ppl 104.9186362
+2018-03-03 23:19:30,198 Iter[5] Batch [55000]	Speed: 168399.73 samples/sec
+2018-03-03 23:19:30,198 Iter[5] Batch [55000] 	loss 4.6489012, ppl 104.4701278
+2018-03-03 23:20:34,244 Iter[5] Batch [56000]	Speed: 159884.47 samples/sec
+2018-03-03 23:20:34,244 Iter[5] Batch [56000] 	loss 4.6465703, ppl 104.2269062
+2018-03-03 23:21:36,213 Iter[5] Batch [57000]	Speed: 165244.94 samples/sec
+2018-03-03 23:21:36,213 Iter[5] Batch [57000] 	loss 4.6492160, ppl 104.5030247
+2018-03-03 23:22:37,061 Iter[5] Batch [58000]	Speed: 168287.74 samples/sec
+2018-03-03 23:22:37,062 Iter[5] Batch [58000] 	loss 4.6424285, ppl 103.7961122
+2018-03-03 23:23:38,620 Iter[5] Batch [59000]	Speed: 166345.84 samples/sec
+2018-03-03 23:23:38,620 Iter[5] Batch [59000] 	loss 4.6521941, ppl 104.8147117
+2018-03-03 23:24:41,917 Iter[5] Batch [60000]	Speed: 161777.98 samples/sec
+2018-03-03 23:24:41,917 Iter[5] Batch [60000] 	loss 4.6517289, ppl 104.7659596
+2018-03-03 23:25:43,298 Iter[5] Batch [61000]	Speed: 166827.55 samples/sec
+2018-03-03 23:25:43,298 Iter[5] Batch [61000] 	loss 4.6526359, ppl 104.8610287
+2018-03-03 23:26:44,263 Iter[5] Batch [62000]	Speed: 167963.86 samples/sec
+2018-03-03 23:26:44,264 Iter[5] Batch [62000] 	loss 4.6490145, ppl 104.4819629
+home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00042-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00050-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00049-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00063-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00015-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00027-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00018-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00068-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00076-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00001-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00043-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00094-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00019-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00070-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00026-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00031-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00029-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00085-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00012-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00016-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00036-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00081-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00038-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00058-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00046-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00090-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00087-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00069-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00061-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00035-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00071-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00052-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00077-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00066-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00008-of-00100
+Finis2018-03-03 23:27:46,041 Iter[5] Batch [63000]	Speed: 165754.82 samples/sec
+2018-03-03 23:27:46,042 Iter[5] Batch [63000] 	loss 4.6492504, ppl 104.5066171
+2018-03-03 23:28:49,206 Iter[5] Batch [64000]	Speed: 162117.43 samples/sec
+2018-03-03 23:28:49,206 Iter[5] Batch [64000] 	loss 4.6557367, ppl 105.1866845
+2018-03-03 23:29:49,877 Iter[5] Batch [65000]	Speed: 168779.33 samples/sec
+2018-03-03 23:29:49,877 Iter[5] Batch [65000] 	loss 4.6501355, ppl 104.5991627
+2018-03-03 23:30:50,082 Iter[5] Batch [66000]	Speed: 170085.59 samples/sec
+2018-03-03 23:30:50,082 Iter[5] Batch [66000] 	loss 4.6464371, ppl 104.2130237
+2018-03-03 23:31:51,997 Iter[5] Batch [67000]	Speed: 165389.31 samples/sec
+2018-03-03 23:31:51,997 Iter[5] Batch [67000] 	loss 4.6439121, ppl 103.9502178
+2018-03-03 23:32:53,461 Iter[5] Batch [68000]	Speed: 166602.23 samples/sec
+2018-03-03 23:32:53,461 Iter[5] Batch [68000] 	loss 4.6549035, ppl 105.0990791
+2018-03-03 23:33:54,006 Iter[5] Batch [69000]	Speed: 169129.02 samples/sec
+2018-03-03 23:33:54,007 Iter[5] Batch [69000] 	loss 4.6484305, ppl 104.4209649
+2018-03-03 23:34:54,781 Iter[5] Batch [70000]	Speed: 168490.79 samples/sec
+2018-03-03 23:34:54,782 Iter[5] Batch [70000] 	loss 4.6558445, ppl 105.1980255
+2018-03-03 23:35:58,056 Iter[5] Batch [71000]	Speed: 161834.27 samples/sec
+2018-03-03 23:35:58,056 Iter[5] Batch [71000] 	loss 4.6580852, ppl 105.4339991
+2018-03-03 23:36:58,202 Iter[5] Batch [72000]	Speed: 170253.29 samples/sec
+2018-03-03 23:36:58,202 Iter[5] Batch [72000] 	loss 4.6510426, ppl 104.6940805
+2018-03-03 23:37:58,336 Iter[5] Batch [73000]	Speed: 170286.10 samples/sec
+2018-03-03 23:37:58,336 Iter[5] Batch [73000] 	loss 4.6577730, ppl 105.4010973
+2018-03-03 23:39:00,101 Iter[5] Batch [74000]	Speed: 165790.43 samples/sec
+2018-03-03 23:39:00,101 Iter[5] Batch [74000] 	loss 4.6538891, ppl 104.9925151
+2018-03-03 23:40:02,535 Iter[5] Batch [75000]	Speed: 164014.89 samples/sec
+2018-03-03 23:40:02,535 Iter[5] Batch [75000] 	loss 4.6564352, ppl 105.2601765
+2018-03-03 23:41:03,219 Iter[5] Batch [76000]	Speed: 168742.25 samples/sec
+2018-03-03 23:41:03,219 Iter[5] Batch [76000] 	loss 4.6563074, ppl 105.2467320
+2018-03-03 23:42:03,616 Iter[5] Batch [77000]	Speed: 169546.91 samples/sec
+2018-03-03 23:42:03,616 Iter[5] Batch [77000] 	loss 4.6502793, ppl 104.6141999
+2018-03-03 23:43:04,444 Iter[5] Batch [78000]	Speed: 168343.24 samples/sec
+2018-03-03 23:43:04,444 Iter[5] Batch [78000] 	loss 4.6541383, ppl 105.0186844
+2018-03-03 23:43:26,793 Saved checkpoint to "./checkpoint/-0000.params"
+2018-03-03 23:46:07,093 eval batch 19 : 3.9632552
+2018-03-03 23:48:51,986 eval batch 39 : 3.9600956
+2018-03-03 23:51:36,947 eval batch 59 : 3.9772385
+2018-03-03 23:52:32,377 Iter[5]		 CE loss 3.7523685, ppl 42.6219115. Time cost = 544.25 seconds
+2018-03-03 23:53:35,377 Iter[6] Batch [1000]	Speed: 168199.67 samples/sec
+2018-03-03 23:53:35,377 Iter[6] Batch [1000] 	loss 4.6104641, ppl 100.5307914
+2018-03-03 23:54:36,267 Iter[6] Batch [2000]	Speed: 168172.28 samples/sec
+2018-03-03 23:54:36,267 Iter[6] Batch [2000] 	loss 4.5907848, ppl 98.5717555
+2018-03-03 23:55:37,431 Iter[6] Batch [3000]	Speed: 167419.39 samples/sec
+2018-03-03 23:55:37,431 Iter[6] Batch [3000] 	loss 4.5965676, ppl 99.1434289
+2018-03-03 23:56:41,308 Iter[6] Batch [4000]	Speed: 160307.34 samples/sec
+2018-03-03 23:56:41,309 Iter[6] Batch [4000] 	loss 4.5984609, ppl 99.3313208
+2018-03-03 23:57:42,734 Iter[6] Batch [5000]	Speed: 166705.60 samples/sec
+2018-03-03 23:57:42,734 Iter[6] Batch [5000] 	loss 4.6077391, ppl 100.2572179
+2018-03-03 23:58:44,382 Iter[6] Batch [6000]	Speed: 166104.54 samples/sec
+2018-03-03 23:58:44,383 Iter[6] Batch [6000] 	loss 4.6172977, ppl 101.2201306
+2018-03-03 23:59:44,867 Iter[6] Batch [7000]	Speed: 169299.35 samples/sec
+2018-03-03 23:59:44,867 Iter[6] Batch [7000] 	loss 4.6007848, ppl 99.5624182
+2018-03-04 00:00:47,570 Iter[6] Batch [8000]	Speed: 163310.37 samples/sec
+2018-03-04 00:00:47,570 Iter[6] Batch [8000] 	loss 4.6148285, ppl 100.9705122
+2018-03-04 00:01:48,550 Iter[6] Batch [9000]	Speed: 167923.39 samples/sec
+2018-03-04 00:01:48,551 Iter[6] Batch [9000] 	loss 4.6115035, ppl 100.6353428
+2018-03-04 00:02:49,085 Iter[6] Batch [10000]	Speed: 169158.27 samples/sec
+2018-03-04 00:02:49,086 Iter[6] Batch [10000] 	loss 4.6053836, ppl 100.0213431
+hed processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00080-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00062-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00057-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00053-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00020-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00007-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00022-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00096-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00034-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00030-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00073-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00089-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00011-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00067-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00017-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00082-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00023-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00093-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00002-of-00100
+Finished processing!
+[]
+Processing file: /home/ubuntu/gbw-validation/heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050
+Finished processing!
+reset
+reset
+Processing file: /home/ubuntu/gbw-validation/heldout-monolingual.tokenized.shuffled/news.en.heldout-00000-of-00050
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00056-of-00100
+Finished processing!
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00057-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00071-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00016-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00007-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00081-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00072-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00001-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00008-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00048-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00003-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00069-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00068-of-00100
+Finished processing!
+Processing file: 2018-03-04 00:03:48,960 Iter[6] Batch [11000]	Speed: 171025.64 samples/sec
+2018-03-04 00:03:48,960 Iter[6] Batch [11000] 	loss 4.6131062, ppl 100.7967638
+2018-03-04 00:04:51,984 Iter[6] Batch [12000]	Speed: 162477.24 samples/sec
+2018-03-04 00:04:51,984 Iter[6] Batch [12000] 	loss 4.6037902, ppl 99.8621000
+2018-03-04 00:05:54,239 Iter[6] Batch [13000]	Speed: 164484.42 samples/sec
+2018-03-04 00:05:54,240 Iter[6] Batch [13000] 	loss 4.6094184, ppl 100.4257210
+2018-03-04 00:06:54,943 Iter[6] Batch [14000]	Speed: 168689.91 samples/sec
+2018-03-04 00:06:54,943 Iter[6] Batch [14000] 	loss 4.6235395, ppl 101.8539020
+2018-03-04 00:07:57,094 Iter[6] Batch [15000]	Speed: 164758.85 samples/sec
+2018-03-04 00:07:57,095 Iter[6] Batch [15000] 	loss 4.6204820, ppl 101.5429672
+2018-03-04 00:08:59,317 Iter[6] Batch [16000]	Speed: 164571.55 samples/sec
+2018-03-04 00:08:59,317 Iter[6] Batch [16000] 	loss 4.6080207, ppl 100.2854584
+2018-03-04 00:10:00,278 Iter[6] Batch [17000]	Speed: 167974.59 samples/sec
+2018-03-04 00:10:00,279 Iter[6] Batch [17000] 	loss 4.6229777, ppl 101.7967048
+2018-03-04 00:11:01,014 Iter[6] Batch [18000]	Speed: 168600.91 samples/sec
+2018-03-04 00:11:01,014 Iter[6] Batch [18000] 	loss 4.6174848, ppl 101.2390716
+2018-03-04 00:12:03,322 Iter[6] Batch [19000]	Speed: 164345.49 samples/sec
+2018-03-04 00:12:03,322 Iter[6] Batch [19000] 	loss 4.6241227, ppl 101.9133208
+2018-03-04 00:13:04,161 Iter[6] Batch [20000]	Speed: 168312.61 samples/sec
+2018-03-04 00:13:04,161 Iter[6] Batch [20000] 	loss 4.6128973, ppl 100.7757011
+2018-03-04 00:14:05,475 Iter[6] Batch [21000]	Speed: 167009.93 samples/sec
+2018-03-04 00:14:05,475 Iter[6] Batch [21000] 	loss 4.6154629, ppl 101.0345857
+2018-03-04 00:15:06,134 Iter[6] Batch [22000]	Speed: 168813.19 samples/sec
+2018-03-04 00:15:06,134 Iter[6] Batch [22000] 	loss 4.6180406, ppl 101.2953620
+2018-03-04 00:16:09,649 Iter[6] Batch [23000]	Speed: 161222.77 samples/sec
+2018-03-04 00:16:09,649 Iter[6] Batch [23000] 	loss 4.6164480, ppl 101.1341696
+2018-03-04 00:17:10,643 Iter[6] Batch [24000]	Speed: 167885.43 samples/sec
+2018-03-04 00:17:10,643 Iter[6] Batch [24000] 	loss 4.6199922, ppl 101.4932392
+2018-03-04 00:18:11,907 Iter[6] Batch [25000]	Speed: 167145.52 samples/sec
+2018-03-04 00:18:11,907 Iter[6] Batch [25000] 	loss 4.6217156, ppl 101.6683073
+2018-03-04 00:19:13,293 Iter[6] Batch [26000]	Speed: 166813.21 samples/sec
+2018-03-04 00:19:13,293 Iter[6] Batch [26000] 	loss 4.6133062, ppl 100.8169252
+2018-03-04 00:20:16,204 Iter[6] Batch [27000]	Speed: 162769.50 samples/sec
+2018-03-04 00:20:16,205 Iter[6] Batch [27000] 	loss 4.6241016, ppl 101.9111711
+2018-03-04 00:21:17,118 Iter[6] Batch [28000]	Speed: 168106.75 samples/sec
+2018-03-04 00:21:17,118 Iter[6] Batch [28000] 	loss 4.6228645, ppl 101.7851738
+2018-03-04 00:22:17,832 Iter[6] Batch [29000]	Speed: 168661.73 samples/sec
+2018-03-04 00:22:17,832 Iter[6] Batch [29000] 	loss 4.6172465, ppl 101.2149511
+2018-03-04 00:23:21,614 Iter[6] Batch [30000]	Speed: 160547.31 samples/sec
+2018-03-04 00:23:21,614 Iter[6] Batch [30000] 	loss 4.6194973, ppl 101.4430204
+2018-03-04 00:24:23,642 Iter[6] Batch [31000]	Speed: 165084.91 samples/sec
+2018-03-04 00:24:23,643 Iter[6] Batch [31000] 	loss 4.6162914, ppl 101.1183291
+2018-03-04 00:25:24,621 Iter[6] Batch [32000]	Speed: 167928.78 samples/sec
+2018-03-04 00:25:24,621 Iter[6] Batch [32000] 	loss 4.6162504, ppl 101.1141817
+2018-03-04 00:26:25,518 Iter[6] Batch [33000]	Speed: 168154.06 samples/sec
+2018-03-04 00:26:25,518 Iter[6] Batch [33000] 	loss 4.6201730, ppl 101.5115969
+2018-03-04 00:27:28,370 Iter[6] Batch [34000]	Speed: 162922.51 samples/sec
+2018-03-04 00:27:28,370 Iter[6] Batch [34000] 	loss 4.6176832, ppl 101.2591632
+2018-03-04 00:28:29,601 Iter[6] Batch [35000]	Speed: 167234.33 samples/sec
+2018-03-04 00:28:29,601 Iter[6] Batch [35000] 	loss 4.6204934, ppl 101.5441175
+2018-03-04 00:29:30,838 Iter[6] Batch [36000]	Speed: 167220.18 samples/sec
+2018-03-04 00:29:30,838 Iter[6] Batch [36000] 	loss 4.6213152, ppl 101.6276084
+2018-03-04 00:30:32,001 Iter[6] Batch [37000]	Speed: 167420.88 samples/sec
+2018-03-04 00:30:32,002 Iter[6] Batch [37000] 	loss 4.6304430, ppl 102.5594847
+/home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00054-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00058-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00017-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00013-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00055-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00004-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00089-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00006-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00092-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00042-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00018-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00088-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00030-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00098-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00045-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00084-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00075-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00060-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00027-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00062-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00059-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00010-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00065-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00044-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00019-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00080-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00077-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00066-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00083-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00036-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00025-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00047-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00009-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00064-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00015-of-00100
+Fini2018-03-04 00:31:34,957 Iter[6] Batch [38000]	Speed: 162655.62 samples/sec
+2018-03-04 00:31:34,957 Iter[6] Batch [38000] 	loss 4.6218008, ppl 101.6769653
+2018-03-04 00:32:36,782 Iter[6] Batch [39000]	Speed: 165629.30 samples/sec
+2018-03-04 00:32:36,782 Iter[6] Batch [39000] 	loss 4.6173082, ppl 101.2211982
+2018-03-04 00:33:37,243 Iter[6] Batch [40000]	Speed: 169364.83 samples/sec
+2018-03-04 00:33:37,243 Iter[6] Batch [40000] 	loss 4.6296887, ppl 102.4821536
+2018-03-04 00:34:39,517 Iter[6] Batch [41000]	Speed: 164435.30 samples/sec
+2018-03-04 00:34:39,517 Iter[6] Batch [41000] 	loss 4.6287004, ppl 102.3809224
+2018-03-04 00:35:41,423 Iter[6] Batch [42000]	Speed: 165413.68 samples/sec
+2018-03-04 00:35:41,423 Iter[6] Batch [42000] 	loss 4.6260457, ppl 102.1094935
+2018-03-04 00:36:42,863 Iter[6] Batch [43000]	Speed: 166665.92 samples/sec
+2018-03-04 00:36:42,863 Iter[6] Batch [43000] 	loss 4.6179070, ppl 101.2818304
+2018-03-04 00:37:44,597 Iter[6] Batch [44000]	Speed: 165872.45 samples/sec
+2018-03-04 00:37:44,598 Iter[6] Batch [44000] 	loss 4.6238766, ppl 101.8882437
+2018-03-04 00:38:46,807 Iter[6] Batch [45000]	Speed: 164605.42 samples/sec
+2018-03-04 00:38:46,807 Iter[6] Batch [45000] 	loss 4.6214637, ppl 101.6426949
+2018-03-04 00:39:48,099 Iter[6] Batch [46000]	Speed: 167070.36 samples/sec
+2018-03-04 00:39:48,099 Iter[6] Batch [46000] 	loss 4.6308258, ppl 102.5987533
+2018-03-04 00:40:48,922 Iter[6] Batch [47000]	Speed: 168357.76 samples/sec
+2018-03-04 00:40:48,922 Iter[6] Batch [47000] 	loss 4.6226230, ppl 101.7606052
+2018-03-04 00:41:49,527 Iter[6] Batch [48000]	Speed: 168962.29 samples/sec
+2018-03-04 00:41:49,527 Iter[6] Batch [48000] 	loss 4.6182293, ppl 101.3144753
+2018-03-04 00:42:52,500 Iter[6] Batch [49000]	Speed: 162610.67 samples/sec
+2018-03-04 00:42:52,500 Iter[6] Batch [49000] 	loss 4.6192609, ppl 101.4190494
+2018-03-04 00:43:53,990 Iter[6] Batch [50000]	Speed: 166530.32 samples/sec
+2018-03-04 00:43:53,990 Iter[6] Batch [50000] 	loss 4.6262789, ppl 102.1333085
+2018-03-04 00:44:55,041 Iter[6] Batch [51000]	Speed: 167729.46 samples/sec
+2018-03-04 00:44:55,041 Iter[6] Batch [51000] 	loss 4.6218105, ppl 101.6779583
+2018-03-04 00:45:55,339 Iter[6] Batch [52000]	Speed: 169822.59 samples/sec
+2018-03-04 00:45:55,340 Iter[6] Batch [52000] 	loss 4.6190098, ppl 101.3935790
+2018-03-04 00:46:59,079 Iter[6] Batch [53000]	Speed: 160653.56 samples/sec
+2018-03-04 00:46:59,079 Iter[6] Batch [53000] 	loss 4.6170242, ppl 101.1924570
+2018-03-04 00:48:00,371 Iter[6] Batch [54000]	Speed: 167070.52 samples/sec
+2018-03-04 00:48:00,371 Iter[6] Batch [54000] 	loss 4.6269730, ppl 102.2042280
+2018-03-04 00:49:02,036 Iter[6] Batch [55000]	Speed: 166059.99 samples/sec
+2018-03-04 00:49:02,036 Iter[6] Batch [55000] 	loss 4.6276789, ppl 102.2763953
+2018-03-04 00:50:05,679 Iter[6] Batch [56000]	Speed: 160896.01 samples/sec
+2018-03-04 00:50:05,680 Iter[6] Batch [56000] 	loss 4.6248195, ppl 101.9843664
+2018-03-04 00:51:06,360 Iter[6] Batch [57000]	Speed: 168753.87 samples/sec
+2018-03-04 00:51:06,360 Iter[6] Batch [57000] 	loss 4.6248285, ppl 101.9852827
+2018-03-04 00:52:07,116 Iter[6] Batch [58000]	Speed: 168541.50 samples/sec
+2018-03-04 00:52:07,116 Iter[6] Batch [58000] 	loss 4.6165457, ppl 101.1440464
+2018-03-04 00:53:08,186 Iter[6] Batch [59000]	Speed: 167678.75 samples/sec
+2018-03-04 00:53:08,186 Iter[6] Batch [59000] 	loss 4.6233309, ppl 101.8326581
+2018-03-04 00:54:10,392 Iter[6] Batch [60000]	Speed: 164614.90 samples/sec
+2018-03-04 00:54:10,392 Iter[6] Batch [60000] 	loss 4.6271492, ppl 102.2222351
+2018-03-04 00:55:11,485 Iter[6] Batch [61000]	Speed: 167613.63 samples/sec
+2018-03-04 00:55:11,485 Iter[6] Batch [61000] 	loss 4.6189902, ppl 101.3915987
+shed processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00087-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00033-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00079-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00034-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00070-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00085-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00099-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00094-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00046-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00078-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00049-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00082-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00005-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00073-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00028-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00076-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00023-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00061-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00093-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00038-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00091-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00039-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00022-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00024-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00067-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00096-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00095-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00035-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00020-of-00100
+Finished processing!
+Processing file: /home/ubuntu/gbw/training-monolingual.tokenized.shuffled/news.en-00074-of-00100
+Finished processing!
+Traceback (most recent call last):
+  File "train.py", line 178, in <module>
+    norm = module.clip_by_global_norm_per_ctx(max_norm=args.clip, param_names=lstm_args)
+  File "/home/ubuntu/tf/python/mxnet/module/module.py", line 891, in clip_by_global_norm_per_ctx
+    norm_val = self.global_grad_norm(grad_array_per_ctx[i])
+  File "/home/ubuntu/tf/python/mxnet/module/module.py", line 918, in global_grad_norm
+    norm_val += nd_global_norm(arr).asscalar()
+  File "/home/ubuntu/tf/python/mxnet/ndarray/ndarray.py", line 1842, in asscalar
+    return self.asnumpy()[0]
+  File "/home/ubuntu/tf/python/mxnet/ndarray/ndarray.py", line 1824, in asnumpy
+    ctypes.c_size_t(data.size)))
+KeyboardInterrupt
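
[Editor's note] The traceback above shows the run being interrupted inside clip_by_global_norm_per_ctx, which accumulates per-array gradient norms before clipping. As a rough, hedged illustration only (not the module's actual implementation), a global norm over a list of gradient NDArrays could be computed along these lines; grad_arrays is a hypothetical list of mx.nd.NDArray gradients:

    import mxnet as mx

    def global_grad_norm(grad_arrays):
        # Accumulate the squared L2 norm of each gradient array,
        # then take the square root of the running total.
        total = 0.0
        for arr in grad_arrays:
            n = mx.nd.norm(arr).asscalar()  # L2 norm of the flattened array
            total += n * n
        return total ** 0.5

The per-context variant referenced in the traceback presumably repeats this per device before clipping, but that detail is an assumption based on the method names shown, not on code visible here.
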
diff --git a/example/sparse/nce_language_model/data.py b/example/sparse/nce_language_model/data.py
new file mode 100644
index 00000000000..1dd4ae5b1af
--- /dev/null
+++ b/example/sparse/nce_language_model/data.py
@@ -0,0 +1,215 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os, gzip, sys
+import mxnet as mx
+import numpy as np
+import data_utils
+
+class DummyIter(mx.io.DataIter):
+    "A dummy iterator that always returns the same batch, used for speed testing"
+    def __init__(self, real_iter):
+        super(DummyIter, self).__init__()
+        self.real_iter = real_iter
+        self.provide_data = real_iter.provide_data
+        self.provide_label = real_iter.provide_label
+        self.batch_size = real_iter.batch_size
+
+        for batch in real_iter:
+            self.the_batch = batch
+            break
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        return self.the_batch
+
+class Dictionary(object):
+    def __init__(self):
+        self.word2idx = {}
+        self.idx2word = []
+        self.word_count = []
+
+    def add_word(self, word):
+        if word not in self.word2idx:
+            self.idx2word.append(word)
+            self.word2idx[word] = len(self.idx2word) - 1
+            self.word_count.append(0)
+        index = self.word2idx[word]
+        self.word_count[index] += 1
+        return index
+
+    def __len__(self):
+        return len(self.idx2word)
+
+    def unigram(self):
+        prob = mx.nd.array(self.word_count)
+        total_count = mx.nd.sum(prob)
+        return prob / total_count
+
+    def save(self, path, replace_unk=True):
+        f = open(path, "w")
+        for idx, word in enumerate(self.idx2word):
+            count = self.word_count[idx]
+            if replace_unk and word == '<unk>':
+                word = '<UNK>'
+            f.write("%s %d\n" %(word, count))
+        f.close()
+
+class Corpus(object):
+    def __init__(self, path, prepend=False):
+        self.dictionary = Dictionary()
+        self.train = self.tokenize(path + 'train.txt', prepend)
+        self.valid = self.tokenize(path + 'valid.txt', prepend)
+        self.test = self.tokenize(path + 'test.txt', prepend)
+
+    def tokenize(self, path, prepend):
+        """Tokenizes a text file."""
+        assert os.path.exists(path)
+        # Add words to the dictionary
+        with open(path, 'r') as f:
+            tokens = 0
+            for line in f:
+                words = ['<S>'] + line.split() + ['<S>'] if prepend else line.split() + ['<S>']
+                tokens += len(words)
+                for word in words:
+                    self.dictionary.add_word(word)
+
+        # Tokenize file content
+        with open(path, 'r') as f:
+            ids = np.zeros((tokens,), dtype='int32')
+            token = 0
+            for line in f:
+                words = ['<S>'] + line.split() + ['<S>'] if prepend else line.split() + ['<S>']
+                for word in words:
+                    ids[token] = self.dictionary.word2idx[word]
+                    token += 1
+
+        return mx.nd.array(ids, dtype='int32')
+
+def batchify(data, batch_size):
+    """Reshape data into (num_example, batch_size)"""
+    nbatch = data.shape[0] // batch_size
+    data = data[:nbatch * batch_size]
+    data = data.reshape((batch_size, nbatch)).T
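+    # illustrative example: a stream of 10 token ids with batch_size=2 becomes a
+    # (5, 2) array where column 0 holds tokens 0-4 and column 1 holds tokens 5-9;
+    # each column is then consumed as an independent sequence by the BPTT iterator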
+    return data
+
+class CorpusIter(mx.io.DataIter):
+    "An iterator that returns the a batch of sequence each time"
+    def __init__(self, source, batch_size, bptt):
+        super(CorpusIter, self).__init__()
+        self.batch_size = batch_size
+        self.provide_data = [('data', (bptt, batch_size), np.int32)]
+        self.provide_label = [('label', (bptt, batch_size))]
+        self._index = 0
+        self._bptt = bptt
+        self._source = batchify(source, batch_size)
+
+    def iter_next(self):
+        i = self._index
+        if i+self._bptt > self._source.shape[0] - 1:
+            return False
+        self._next_data = self._source[i:i+self._bptt]
+        self._next_label = self._source[i+1:i+1+self._bptt].astype(np.float32)
+        self._index += self._bptt
+        return True
+
+    def next(self):
+        if self.iter_next():
+            return mx.io.DataBatch(data=self.getdata(), label=self.getlabel())
+        else:
+            raise StopIteration
+
+    def reset(self):
+        self._index = 0
+        self._next_data = None
+        self._next_label = None
+
+    def getdata(self):
+        return [self._next_data]
+
+    def getlabel(self):
+        return [self._next_label]
+
+class MultiSentenceIter(mx.io.DataIter):
+    "An iterator that returns the a batch of sequence each time"
+    def __init__(self, data_file, vocab, batch_size, bptt):
+        super(MultiSentenceIter, self).__init__()
+        self.batch_size = batch_size
+        self.bptt = bptt
+        self.provide_data = [('data', (batch_size, bptt), np.int32), ('mask', (batch_size, bptt))]
+        self.provide_label = [('label', (batch_size, bptt))]
+        self.vocab = vocab
+        self.data_file = data_file
+        self._dataset = data_utils.Dataset(self.vocab, data_file, deterministic=True)
+        self._iter = self._dataset.iterate_once(batch_size, bptt)
+
+    def iter_next(self):
+        data = self._iter.next()
+        if data is None:
+            return False
+        self._next_data = mx.nd.array(data[0], dtype=np.int32)
+        self._next_label = mx.nd.array(data[1])
+        self._next_mask = mx.nd.array(data[2])
+        self._next_mask[:] = 1
+        return True
+
+    def next(self):
+        if self.iter_next():
+            return mx.io.DataBatch(data=self.getdata(), label=self.getlabel())
+        else:
+            raise StopIteration
+
+    def reset(self):
+        print('reset')
+        self._dataset = data_utils.Dataset(self.vocab, self.data_file, deterministic=False)
+        self._iter = self._dataset.iterate_once(self.batch_size, self.bptt)
+        self._next_data = None
+        self._next_label = None
+        self._next_mask = None
+
+    def getdata(self):
+        return [self._next_data, self._next_mask]
+
+    def getlabel(self):
+        return [self._next_label]
+
+class SampleIter(mx.io.DataIter):
+    "An iterator that returns the a batch of unique samples each time"
+    def __init__(self, batch_size, num_samples, sampler):
+        super(SampleIter, self).__init__()
+        print("using sample unique data iterator")
+        self.batch_size = batch_size
+        self.num_samples = num_samples
+        self.provide_data = [('sample%d'%i, (num_samples,), np.int32) for i in range(batch_size)]
+        self.provide_label = [('sample%d'%i, (num_samples,), np.int32) for i in range(batch_size)]
+        self.sampler = sampler
+
+    def iter_next(self):
+        self._next_data = [self.sampler.sample_unique_set(self.num_samples) for i in range(self.batch_size)]
+        #self._next_data = [mx.nd.array(self.sampler.sample_unique2(self.num_samples)) for i in range(self.batch_size)]
+        return True
+
+    def next(self):
+        if self.iter_next():
+            return mx.io.DataBatch(data=self._next_data, label=[])
+        else:
+            raise StopIteration
+
+    def reset(self):
+        pass
diff --git a/example/sparse/nce_language_model/data_utils.py b/example/sparse/nce_language_model/data_utils.py
new file mode 100644
index 00000000000..792ae517fe1
--- /dev/null
+++ b/example/sparse/nce_language_model/data_utils.py
@@ -0,0 +1,163 @@
+import codecs
+import glob
+import json
+import random
+
+import numpy as np
+import mxnet as mx
+
+class Vocabulary(object):
+
+    def __init__(self):
+        self._token_to_id = {}
+        self._token_to_count = {}
+        self._id_to_token = []
+        self._num_tokens = 0
+        self._total_count = 0
+        self._s_id = None
+        self._unk_id = None
+
+    @property
+    def num_tokens(self):
+        return self._num_tokens
+
+    @property
+    def unk(self):
+        return "<UNK>"
+
+    @property
+    def unk_id(self):
+        return self._unk_id
+
+    @property
+    def s(self):
+        return "<S>"
+
+    @property
+    def s_id(self):
+        return self._s_id
+
+    def add(self, token, count):
+        self._token_to_id[token] = self._num_tokens
+        self._token_to_count[token] = count
+        self._id_to_token.append(token)
+        self._num_tokens += 1
+        self._total_count += count
+
+    def finalize(self):
+        self._s_id = self.get_id(self.s)
+        self._unk_id = self.get_id(self.unk)
+
+    def get_id(self, token):
+        return self._token_to_id.get(token, self.unk_id)
+
+    def get_token(self, id_):
+        return self._id_to_token[id_]
+
+    @staticmethod
+    def from_file(filename):
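+        # assumed file format: one "<token> <count>" pair per line, e.g. a
+        # (hypothetical) line "hello 42"; the counts feed the unigram() noise distribution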
+        vocab = Vocabulary()
+        with codecs.open(filename, "r", "utf-8") as f:
+            for line in f:
+                word, count = line.strip().split()
+                vocab.add(word, int(count))
+        vocab.finalize()
+        return vocab
+
+    def unigram(self):
+        counts = [None] * self._num_tokens
+        for i in range(self._num_tokens):
+            tk = self._id_to_token[i]
+            count = self._token_to_count[tk]
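+            # <S> is presumably halved because _parse_sentence emits it as both
+            # the start and end marker of every sentence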
+            if tk == self.s:
+                count /= 2
+            counts[i] = count
+        return mx.nd.array(counts) / self._total_count
+
+class Dataset(object):
+
+    def __init__(self, vocab, file_pattern, deterministic=False):
+        self._vocab = vocab
+        self._file_pattern = file_pattern
+        self._deterministic = deterministic
+
+    def _parse_sentence(self, line):
+        s_id = self._vocab.s_id
+        return [s_id] + [self._vocab.get_id(word) for word in line.strip().split()] + [s_id]
+
+    def _parse_file(self, file_name):
+        print("Processing file: %s" % file_name)
+        with codecs.open(file_name, "r", "utf-8") as f:
+            lines = [line.strip() for line in f]
+            if not self._deterministic:
+                random.shuffle(lines)
+            print("Finished processing!")
+            for line in lines:
+                yield self._parse_sentence(line)
+
+    def _sentence_stream(self, file_stream):
+        for file_name in file_stream:
+            for sentence in self._parse_file(file_name):
+                yield sentence
+
+    def _iterate(self, sentences, batch_size, num_steps):
+        streams = [None] * batch_size
+        x = np.zeros([batch_size, num_steps], np.int32)
+        y = np.zeros([batch_size, num_steps], np.int32)
+        w = np.zeros([batch_size, num_steps], np.uint8)
+        while True:
+            x[:] = 0
+            y[:] = 0
+            w[:] = 0
+            for i in range(batch_size):
+                tokens_filled = 0
+                try:
+                    while tokens_filled < num_steps:
+                        if streams[i] is None or len(streams[i]) <= 1:
+                            streams[i] = next(sentences)
+                        num_tokens = min(len(streams[i]) - 1, num_steps - tokens_filled)
+                        x[i, tokens_filled:tokens_filled+num_tokens] = streams[i][:num_tokens]
+                        y[i, tokens_filled:tokens_filled + num_tokens] = streams[i][1:num_tokens+1]
+                        w[i, tokens_filled:tokens_filled + num_tokens] = 1
+                        streams[i] = streams[i][num_tokens:]
+                        tokens_filled += num_tokens
+                except StopIteration:
+                    pass
+            if not np.any(w):
+                return
+
+            yield x, y, w
+
+    def iterate_once(self, batch_size, num_steps):
+        def file_stream():
+            file_patterns = glob.glob(self._file_pattern)
+            if not self._deterministic:
+                random.shuffle(file_patterns)
+            for file_name in file_patterns:
+                yield file_name
+        for value in self._iterate(self._sentence_stream(file_stream()), batch_size, num_steps):
+            yield value
+
+    def iterate_forever(self, batch_size, num_steps):
+        def file_stream():
+            while True:
+                file_patterns = glob.glob(self._file_pattern)
+                if not self._deterministic:
+                    random.shuffle(file_patterns)
+                for file_name in file_patterns:
+                    yield file_name
+        for value in self._iterate(self._sentence_stream(file_stream()), batch_size, num_steps):
+            yield value
+
+'''
+data_dir = '/home/ubuntu/gbw/statmt/1-billion-word-language-modeling-benchmark-r13output'
+vocab = Vocabulary.from_file("data/1b_word_vocab.txt")
+dataset = Dataset(vocab, data_dir + "/training-monolingual.tokenized.shuffled/*", deterministic=True)
+data_iterator = dataset.iterate_forever(2 * 1, 4)
+nb = 0
+while True:
+    n = data_iterator.next()
+    nb += 1
+    if nb % 10000 == 0:
+        print(nb)
+'''
diff --git a/example/sparse/nce_language_model/evaluate.py b/example/sparse/nce_language_model/evaluate.py
new file mode 100644
index 00000000000..015ef13bbce
--- /dev/null
+++ b/example/sparse/nce_language_model/evaluate.py
@@ -0,0 +1,98 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import numpy as np
+import mxnet as mx
+import argparse
+import run_utils
+from data import Corpus, CorpusIter, DummyIter, MultiSentenceIter
+from model import *
+from sampler import *
+from sparse_module import SparseModule
+import os, math, logging, time, pickle
+import data_utils
+
+
+def evaluate(mod, data_iter, epoch, log_interval, early_stop=None):
+    import time
+    start = time.time()
+    total_L = 0.0
+    nbatch = 0
+    mod.set_states(value=0)
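+    # states are zeroed once here, then carried across batches: every module
+    # output except the last one (the loss) is an RNN state fed back via set_states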
+    for batch in data_iter:
+        mod.forward(batch, is_train=False)
+        outputs = mod.get_outputs(merge_multi_context=False)
+        states = outputs[:-1]
+        total_L += outputs[-1][0].asscalar()
+        mod.set_states(states=states)
+        nbatch += 1
+        if (nbatch + 1) % log_interval == 0:
+            logging.info("eval batch %d : %.7f" % (nbatch, total_L / nbatch))
+        if (nbatch + 1) == early_stop:
+            break
+    data_iter.reset()
+    loss = total_L / nbatch
+    try:
+        ppl = math.exp(loss) if loss < 100 else -1
+    except Exception:
+        ppl = 1e37
+    end = time.time()
+    logging.info('Iter[%d]\t\t CE loss %.7f, ppl %.7f. Time cost = %.2f seconds'%(epoch, loss, ppl, end - start))
+    return loss
+
+if __name__ == '__main__':
+    parser = run_utils.get_parser(is_train=False)
+    args = parser.parse_args()
+    mx.random.seed(args.seed)
+    np.random.seed(args.seed)
+    head = '%(asctime)-15s %(message)s'
+    logging.basicConfig(level=logging.DEBUG, format=head)
+    logging.info(args)
+
+    # data
+    vocab = data_utils.Vocabulary.from_file(args.vocab)
+    unigram = vocab.unigram()
+    ntokens = unigram.size
+
+    train_data = mx.io.PrefetchingIter(MultiSentenceIter(args.data, vocab,
+                                      args.batch_size, args.bptt))
+    eval_data = mx.io.PrefetchingIter(MultiSentenceIter(args.data, vocab,
+                                      args.eval_size, args.bptt))
+
+    rnn_module = RNNModel(args.bptt, ntokens, args.emsize, args.nhid, args.nlayers,
+                          args.dropout, args.num_proj)
+
+    extra_states = ['sample', 'p_noise_sample', 'p_noise_target']
+    state_names = rnn_module.state_names
+    sparse_params=['encoder_weight', 'decoder_weight', 'decoder_bias']
+    data_names = ['data', 'mask']
+    label_names = ['label']
+    epoch = 0
+    while True:
+        nce_mod = SparseModule.load(args.checkpoint_dir, 0, context=mx.cpu(), state_names=(state_names + extra_states),
+                                    data_names=data_names, label_names=label_names, sparse_params=sparse_params)
+        nce_mod.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label)
+
+        ############### eval model ####################
+        eval_rnn_out, eval_last_states = rnn_module.forward(args.eval_size)
+        eval_model = ce_loss(eval_rnn_out, ntokens, args.dense)
+        eval_last_states.append(eval_model)
+        ############### eval module ####################
+        eval_module = SparseModule(symbol=mx.sym.Group(eval_last_states), context=mx.cpu(), data_names=data_names,
+                                   label_names=label_names, state_names=state_names, sparse_params=sparse_params)
+        eval_module.bind(data_shapes=eval_data.provide_data, label_shapes=eval_data.provide_label, shared_module=nce_mod, for_training=False)
+        val_L = evaluate(eval_module, eval_data, epoch, args.log_interval)
diff --git a/example/sparse/nce_language_model/get_ptb_data.sh b/example/sparse/nce_language_model/get_ptb_data.sh
new file mode 100755
index 00000000000..1e895131006
--- /dev/null
+++ b/example/sparse/nce_language_model/get_ptb_data.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+echo ""
+echo "NOTE: Please review the licensing of the datasets in this script before proceeding"
+echo "See https://catalog.ldc.upenn.edu/ldc99t42 for the licensing"
+echo "Once that is done, please uncomment the wget commands in this script"
+echo ""
+
+RNN_DIR=$(cd `dirname $0`; pwd)
+DATA_DIR="${RNN_DIR}/data/"
+
+if [[ ! -d "${DATA_DIR}" ]]; then
+  echo "${DATA_DIR} doesn't exist, will create one";
+  mkdir -p ${DATA_DIR}
+fi
+
+wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.train.txt;
+wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.valid.txt;
+wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/ptb/ptb.test.txt;
+wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt;
diff --git a/example/sparse/nce_language_model/model.py b/example/sparse/nce_language_model/model.py
new file mode 100644
index 00000000000..dc48414eff9
--- /dev/null
+++ b/example/sparse/nce_language_model/model.py
@@ -0,0 +1,148 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+
+def ce_loss(pred, vocab_size, dense):
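+    # full-softmax cross-entropy used by the evaluation modules; positions whose
+    # mask entry is 0 do not contribute to the loss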
+    stype = 'row_sparse' if not dense else 'default'
+    decoder_b = mx.sym.var("decoder_bias", shape=(vocab_size, 1))
+    decoder_w = mx.sym.var('decoder_weight', stype=stype)
+    # decoder_bias is declared as (vocab_size, 1) so the sampled module can look it
+    # up with Embedding; flatten it here to match the FullyConnected bias shape
+    pred = mx.sym.FullyConnected(data=pred, weight=decoder_w, num_hidden=vocab_size, name='pred',
+                                 bias=mx.sym.reshape(decoder_b, shape=(-1,)))
+    label = mx.sym.Variable('label')
+    pred = mx.sym.reshape(pred, shape=(-1, vocab_size))
+    label = mx.sym.reshape(label, shape=(-1,))
+    pred = mx.sym.log_softmax(pred, axis=-1)
+    loss = -mx.sym.pick(pred, label, axis=-1, keepdims=True)
+    mask = mx.sym.var("mask")
+    mask = mx.sym.reshape(mask, shape=(-1, 1))
+    loss = loss * mask
+    loss = mx.sym.make_loss(mx.sym.mean(loss), name="nll")
+    return loss
+
+class RNNModel():
+
+    def __init__(self, bptt, vocab_size, num_embed, nhid, num_layers,
+                 dropout, num_proj):
+        self.bptt = bptt
+        self.vocab_size = vocab_size
+        self.num_embed = num_embed
+        self.nhid = nhid
+        self.num_layers = num_layers
+        self.dropout = dropout
+        self.num_proj = num_proj
+        self.state_names = []
+        self.embed = mx.sym.contrib.SparseEmbedding
+        self.dim = self.num_proj if self.num_proj > 0 else self.nhid
+
+    def forward(self, batch_size):
+        F = mx.symbol
+        data = F.var('data')
+        weight = F.var("encoder_weight", stype='row_sparse')
+        embed = self.embed(data=data, weight=weight, input_dim=self.vocab_size,
+                           output_dim=self.num_embed, name='embed')
+        states = []
+        outputs = F.Dropout(embed, p=self.dropout)
+        for i in range(self.num_layers):
+            prefix = 'lstmp%d_' % i
+            init_h = F.var(prefix + 'init_h', shape=(batch_size, self.dim), init=mx.init.Zero())
+            init_c = F.var(prefix + 'init_c', shape=(batch_size, self.nhid), init=mx.init.Zero())
+            self.state_names += [prefix + 'init_h', prefix + 'init_c']
+            lstmp = mx.gluon.contrib.rnn.LSTMPCell(self.nhid, self.num_proj)
+            ## TODO(haibin) better layout?
+            outputs, next_states = lstmp.unroll(self.bptt, outputs, begin_state=[init_h, init_c], layout='NTC', merge_outputs=True)
+            outputs = F.Dropout(outputs, p=self.dropout)
+            states += [F.stop_gradient(s) for s in next_states]
+        outputs = F.reshape(outputs, shape=(-1, self.dim))
+        return outputs, states
+
+class SampledModule():
+
+    def __init__(self, vocab_size, nhid, num_samples, bptt, num_proj, remove_hits=True):
+        self.vocab_size = vocab_size
+        self.nhid = nhid
+        self.num_samples = num_samples
+        self.bptt = bptt
+        self.num_proj = num_proj
+        self.dim = num_proj if num_proj > 0 else nhid
+        self.embed = mx.sym.contrib.SparseEmbedding
+        self.remove_hits = remove_hits
+
+    def forward(self, inputs, batch_size):
+        # inputs = (n, nhid)
+        n = batch_size * self.bptt
+        F = mx.symbol
+        # (num_samples, )
+        sample = F.var('sample', shape=(self.num_samples,), dtype='float32')
+        # (n, )
+        label = F.var('label')
+        label = F.reshape(label, shape=(-1,), name="label_reshape")
+        # (num_samples+n, )
+        sample_label = F.concat(sample, label, dim=0)
+        # weight and bias
+        decoder_w = F.var("decoder_weight", stype='row_sparse')
+        decoder_b = F.var("decoder_bias", shape=(self.vocab_size, 1))
+        # lookup weights and biases
+        # (num_samples+n, nhid)
+        sample_target_w = self.embed(data=sample_label, weight=decoder_w,
+                                     input_dim=self.vocab_size, output_dim=self.dim)
+        # (num_samples+n, 1)
+        sample_target_b = F.Embedding(data=sample_label, weight=decoder_b,
+                                      input_dim=self.vocab_size, output_dim=1)
+        # (num_samples, nhid)
+        sample_w = F.slice(sample_target_w, begin=(0, 0), end=(self.num_samples, self.dim))
+        target_w = F.slice(sample_target_w, begin=(self.num_samples, 0), end=(self.num_samples+n, self.dim))
+        sample_b = F.slice(sample_target_b, begin=(0, 0), end=(self.num_samples, 1))
+        target_b = F.slice(sample_target_b, begin=(self.num_samples, 0), end=(self.num_samples+n, 1))
+    
+        # target
+        # (n, 1)
+        true_pred = F.sum(target_w * inputs, axis=1, keepdims=True) + target_b
+        # samples
+        # (n, num_samples)
+        sample_b = F.reshape(sample_b, (-1,))
+        sample_pred = F.FullyConnected(inputs, weight=sample_w, bias=sample_b, num_hidden=self.num_samples)
+
+        # remove accidental hits
+        if self.remove_hits:
+            label_v = F.reshape(label, (-1, 1))
+            sample_v = F.reshape(sample, (1, -1))
+            neg = F.broadcast_equal(label_v, sample_v) * -1e37
+            sample_pred = sample_pred + neg
+
+        p_noise_sample = F.var("p_noise_sample", shape=(self.num_samples, ))
+        p_noise_sample = F.reshape(p_noise_sample, shape=(1, self.num_samples))
+        p_noise_target = F.var("p_noise_target", shape=(n, 1))
+        p_target = true_pred - F.log(p_noise_target)
+        p_sample = F.broadcast_sub(sample_pred, F.log(p_noise_sample))
+
+        # return logits and new_labels
+        # (n, 1+num_samples)
+        logits = F.concat(p_target, p_sample, dim=1)
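+        # the true class always sits in column 0 of the concatenated logits,
+        # so the labels for the sampled softmax are all zeros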
+        new_targets = F.zeros(shape=(n))
+        return logits, new_targets
+
+class CrossEntropyLoss():
+
+    def __init__(self):
+        self.criterion = mx.gluon.loss.SoftmaxCrossEntropyLoss()
+
+    def forward(self, inputs, labels, scale):
+        loss = self.criterion.hybrid_forward(mx.symbol, inputs, labels)
+        F = mx.symbol
+        mask = F.var('mask')
+        loss = loss * F.reshape(mask, shape=(-1,))
+        return F.make_loss(loss.mean() * scale)
diff --git a/example/sparse/nce_language_model/readme.mk b/example/sparse/nce_language_model/readme.mk
new file mode 100644
index 00000000000..73a60a02fea
--- /dev/null
+++ b/example/sparse/nce_language_model/readme.mk
@@ -0,0 +1,5 @@
+```
+
+PYTHONPATH=~/nce/python python train.py --nhid 5 --emsize 5 --batch_size=128 --k=6 --dropout=0.5 --mom=0.95  --lr-decay=0.5 --optimizer=sgd --gpus=0,1,2,3,4,5,6,7 --checkpoint-dir=./checkpoint2/ --train-data=/home/ubuntu/small-gbw/training-monolingual.tokenized.shuffled/* --eval-data=/home/ubuntu/small-gbw/heldout-monolingual.tokenized.shuffled/* --vocab=./data/1b_word_vocab.txt
+
+```
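+The command above is an example invocation of `train.py` with NCE sampling on a
+Google Billion Words subset across 8 GPUs; adjust the data, vocabulary and GPU
+arguments to match your local setup.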
diff --git a/example/sparse/nce_language_model/run_utils.py b/example/sparse/nce_language_model/run_utils.py
new file mode 100644
index 00000000000..c69500e43b4
--- /dev/null
+++ b/example/sparse/nce_language_model/run_utils.py
@@ -0,0 +1,86 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import argparse
+
+def _add_train_args(parser):
+    parser.add_argument('--lr', type=float, default=0.1,
+                        help='initial learning rate')
+    parser.add_argument('--wd', type=float, default=0.0,
+                        help='weight decay')
+    parser.add_argument('--clip', type=float, default=0.2,
+                        help='gradient clipping by global norm')
+    parser.add_argument('--per-ctx-clip', action='store_true',
+                        help='clip per ctx')
+    parser.add_argument('--checkpoint-interval', type=int, default=1,
+                        help='checkpoint every x epochs')
+    # TODO change default value
+    parser.add_argument('--load-epoch', type=int, default=-1,
+                        help='load epoch')
+    parser.add_argument('--rescale-embed', action='store_true',
+                        help='rescale the embedding gradient')
+    return parser
+
+def _add_eval_args(parser):
+    parser.add_argument('--eval-every', type=int, default=1,
+                        help='evaluation every x epochs')
+    parser.add_argument('--eval_size', type=int, default=32,
+                        help='batch size')
+    return parser
+
+def get_parser(is_train=True):
+    parser = argparse.ArgumentParser(description='Language Model on GBW')
+    parser.add_argument('--data', type=str, default='./data/ptb.train.txt',
+                        help='location of the data corpus')
+    parser.add_argument('--vocab', type=str, default='./data/1b_word_vocab.txt',
+                        help='location of the corpus vocab')
+    parser.add_argument('--emsize', type=int, default=512,
+                        help='size of word embeddings')
+    parser.add_argument('--nhid', type=int, default=2048,
+                        help='number of hidden units per layer')
+    parser.add_argument('--num_proj', type=int, default=512,
+                        help='number of projection units per layer')
+    parser.add_argument('--nlayers', type=int, default=1,
+                        help='number of layers')
+    parser.add_argument('--epochs', type=int, default=5,
+                        help='upper epoch limit')
+    parser.add_argument('--batch_size', type=int, default=128,
+                        help='batch size per gpu')
+    parser.add_argument('--dropout', type=float, default=0.1,
+                        help='dropout applied to layers (0 = no dropout)')
+    parser.add_argument('--eps', type=float, default=1,
+                        help='eps for adagrad')
+    parser.add_argument('--bptt', type=int, default=20,
+                        help='sequence length')
+    parser.add_argument('--k', type=int, default=8192,
+                        help='number of noise samples to estimate')
+    parser.add_argument('--gpus', type=str,
+                        help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu.')
+    parser.add_argument('--dense', action='store_true',
+                        help='use dense embedding instead of sparse embedding')
+    parser.add_argument('--log-interval', type=int, default=200,
+                        help='report interval')
+    parser.add_argument('--seed', type=int, default=1,
+                        help='random seed')
+    parser.add_argument('--profile', action='store_true',
+                        help='whether to use profiler')
+    parser.add_argument('--kvstore', type=str, default='device',
+                        help='type of kv-store to use')
+    parser.add_argument('--checkpoint-dir', type=str, default='./checkpoint/',
+                        help='dir for checkpoint')
+    parser = _add_train_args(parser) if is_train else _add_eval_args(parser)
+    return parser
diff --git a/example/sparse/nce_language_model/sparse_module.py b/example/sparse/nce_language_model/sparse_module.py
new file mode 100644
index 00000000000..55fce2ef726
--- /dev/null
+++ b/example/sparse/nce_language_model/sparse_module.py
@@ -0,0 +1,238 @@
+import logging
+import warnings
+
+import mxnet as mx
+from mxnet.module import Module
+from mxnet.model import _create_kvstore, _initialize_kvstore, _update_params, _update_params_on_kvstore
+from mxnet.model import load_checkpoint
+
+class SparseModule(Module):
+
+    def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',),
+                 logger=logging, context=mx.cpu(), work_load_list=None,
+                 fixed_param_names=None, state_names=None, group2ctxs=None,
+                 compression_params=None, sparse_params=None):
+
+        super(SparseModule, self).__init__(symbol, data_names=data_names, label_names=label_names,
+                                           logger=logger, context=context, work_load_list=work_load_list,
+                                           fixed_param_names=fixed_param_names, state_names=state_names,
+                                           group2ctxs=group2ctxs, compression_params=compression_params)
+        self._sparse_params = sparse_params
+
+    def init_optimizer(self, kvstore='local', optimizer='sgd',
+                       optimizer_params=(('learning_rate', 0.01),), force_init=False):
+        """Installs and initializes optimizers.
+
+        Parameters
+        ----------
+        kvstore : str or KVStore
+            Default `'local'`.
+        optimizer : str or Optimizer
+            Default `'sgd'`
+        optimizer_params : dict
+            Default `(('learning_rate', 0.01),)`. The default value is not a dictionary,
+            just to avoid pylint warning of dangerous default values.
+        force_init : bool
+            Default ``False``, indicating whether we should force re-initializing the
+            optimizer in the case an optimizer is already installed.
+        """
+        assert self.binded and self.params_initialized
+        import mxnet.optimizer as opt
+
+        if self.optimizer_initialized and not force_init:
+            self.logger.warning('optimizer already initialized, ignoring...')
+            return
+
+        if self._params_dirty:
+            self._sync_params_from_devices()
+
+        (kvstore, update_on_kvstore) = \
+                _create_kvstore(kvstore, len(self._context), self._arg_params)
+        #update_on_kvstore = False
+
+        batch_size = self._exec_group.batch_size
+        if kvstore and 'dist' in kvstore.type and '_sync' in kvstore.type:
+            batch_size *= kvstore.num_workers
+        rescale_grad = 1.0/batch_size
+
+        if isinstance(optimizer, str):
+            idx2name = {}
+            if update_on_kvstore:
+                idx2name.update(enumerate(self._exec_group.param_names))
+            else:
+                for k in range(len(self._context)):
+                    idx2name.update({i*len(self._context)+k: n
+                                     for i, n in enumerate(self._exec_group.param_names)})
+            optimizer_params = dict(optimizer_params)
+            if 'rescale_grad' not in optimizer_params:
+                optimizer_params['rescale_grad'] = rescale_grad
+            optimizer = opt.create(optimizer,
+                                   sym=self.symbol, param_idx2name=idx2name,
+                                   **optimizer_params)
+        else:
+            assert isinstance(optimizer, mx.optimizer.Optimizer)
+            if optimizer.rescale_grad != rescale_grad:
+                #pylint: disable=no-member
+                warnings.warn(
+                    "Optimizer created manually outside Module but rescale_grad " +
+                    "is not normalized to 1.0/batch_size/num_workers (%s vs. %s). "%(
+                        optimizer.rescale_grad, rescale_grad) +
+                    "Is this intended?", stacklevel=2)
+
+        self._optimizer = optimizer
+        self._kvstore = kvstore
+        self._update_on_kvstore = update_on_kvstore
+        self._updater = None
+
+        if kvstore:
+            if self._compression_params:
+                kvstore.set_gradient_compression(self._compression_params)
+            # copy initialized local parameters to kvstore
+            _initialize_kvstore(kvstore=kvstore,
+                                param_arrays=self._exec_group.param_arrays,
+                                arg_params=self._arg_params,
+                                param_names=self._param_names,
+                                update_on_kvstore=update_on_kvstore,
+                                skip_pull=self._sparse_params)
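+            # row_sparse parameters listed in self._sparse_params skip the full
+            # pull here; their rows are pulled on demand via sync_sparse_params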
+        if update_on_kvstore:
+            kvstore.set_optimizer(self._optimizer)
+        else:
+            self._updater = opt.get_updater(optimizer)
+
+        self.optimizer_initialized = True
+
+        if self._preload_opt_states is not None:
+            self.load_optimizer_states(self._preload_opt_states)
+            self._preload_opt_states = None
+        # TODO(haibin) refactor init kvstore
+
+    def sync_sparse_params(self, param_rowids):
+        '''Pulls the requested rows of each row-sparse parameter in `param_rowids`
+        from the kvstore into this module's executors.
+
+        Parameters
+        ----------
+        param_rowids : dict of str to NDArray
+            Maps a parameter name to the row ids to pull for that parameter.
+        '''
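+        # hypothetical usage (mirrors train.py):
+        #   module.sync_sparse_params({'encoder_weight': input_token_ids,
+        #                              'decoder_weight': label_and_sample_ids})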
+        if not self._kvstore:
+            return
+        assert(isinstance(param_rowids, dict))
+        for param_name, rowid in param_rowids.items():
+            param_idx = self._exec_group.param_names.index(param_name)
+            param_val = self._exec_group.param_arrays[param_idx]
+            self._kvstore.row_sparse_pull(param_name, param_val, row_ids=rowid,
+                                          priority=-param_idx)
+
+    def update(self):
+        """Updates parameters according to the installed optimizer and the gradients computed
+        in the previous forward-backward batch.
+        See Also
+        ----------
+        :meth:`BaseModule.update`.
+        """
+        assert self.binded and self.params_initialized and self.optimizer_initialized
+
+        self._params_dirty = True
+        if self._update_on_kvstore:
+            _update_params_on_kvstore(self._exec_group.param_arrays,
+                                      self._exec_group.grad_arrays,
+                                      self._kvstore, self._exec_group.param_names,
+                                      skip_pull=self._sparse_params)
+        else:
+            _update_params(self._exec_group.param_arrays,
+                           self._exec_group.grad_arrays,
+                           updater=self._updater,
+                           num_device=len(self._context),
+                           kvstore=self._kvstore,
+                           param_names=self._exec_group.param_names)
+    @staticmethod
+    def load(prefix, epoch, load_optimizer_states=False, **kwargs):
+        """Creates a model from previously saved checkpoint.
+
+        Parameters
+        ----------
+        prefix : str
+            path prefix of saved model files. You should have
+            "prefix-symbol.json", "prefix-xxxx.params", and
+            optionally "prefix-xxxx.states", where xxxx is the
+            epoch number.
+        epoch : int
+            epoch to load.
+        load_optimizer_states : bool
+            whether to load optimizer states. Checkpoint needs
+            to have been made with save_optimizer_states=True.
+        data_names : list of str
+            Default is `('data')` for a typical model used in image classification.
+        label_names : list of str
+            Default is `('softmax_label')` for a typical model used in image
+            classification.
+        logger : Logger
+            Default is `logging`.
+        context : Context or list of Context
+            Default is ``cpu()``.
+        work_load_list : list of number
+            Default ``None``, indicating uniform workload.
+        fixed_param_names: list of str
+            Default ``None``, indicating no network parameters are fixed.
+        """
+        sym, args, auxs = load_checkpoint(prefix, epoch)
+        mod = SparseModule(symbol=sym, **kwargs)
+        mod._arg_params = args
+        mod._aux_params = auxs
+        mod.params_initialized = True
+        if load_optimizer_states:
+            mod._preload_opt_states = '%s-%04d.states'%(prefix, epoch)
+        return mod
+
+    def save_params(self, fname):
+        """Saves model parameters to file.
+        Parameters
+        ----------
+        fname : str
+            Path to output param file.
+        Examples
+        --------
+        >>> # An example of saving module parameters.
+        >>> mod.save_params('myfile')
+        """
+        arg_params, aux_params = self.get_params_from_kv(self._arg_params, self._aux_params)
+        save_dict = {('arg:%s' % k) : v.as_in_context(mx.cpu()) for k, v in arg_params.items()}
+        save_dict.update({('aux:%s' % k) : v.as_in_context(mx.cpu()) for k, v in aux_params.items()})
+        mx.nd.save(fname, save_dict)
+
+    def get_params_from_kv(self, arg_params, aux_params):
+        """ Copy data from each executor to `arg_params` and `aux_params`.
+        Parameters
+        ----------
+        arg_params : list of NDArray
+            Target parameter arrays.
+        aux_params : list of NDArray
+            Target aux arrays.
+        Notes
+        -----
+        - This function will inplace update the NDArrays in arg_params and aux_params.
+        """
+        assert(self._kvstore is not None)
+        for name, block in zip(self._exec_group.param_names, self._exec_group.param_arrays):
+            assert(isinstance(block, list))
+            #if block[0].stype == 'row_sparse':
+            #    print(name)
+            #    row_ids = mx.nd.arange(start=0, stop=block[0].shape[0], dtype='int64')
+            #    import numpy as np
+            #    print(np.unique(row_ids.asnumpy()).shape)
+            #    self._kvstore.row_sparse_pull(name, arg_params[name], row_ids=row_ids)
+            #    print(arg_params[name].indices.shape, block[0].shape, arg_params[name].stype)
+            #elif block[0].stype == 'default':
+            self._kvstore.pull(name, out=arg_params[name], ignore_sparse=False)
+            #else:
+            #    raise NotImplementedError()
+        # TODO handle aux names
+        print(self._exec_group.aux_names)
+        #assert(self._exec_group.aux_names is None or self._exec_group.aux_arrays is None)
+        #for name, block in zip(self._exec_group.aux_names, self._exec_group.aux_arrays):
+        #    if block[0].stype == 'row_sparse':
+        #        row_ids = mx.nd.arange(start=0, stop=block[0].shape[0])
+        #        self._kvstore.row_sparse_pull(name, aux_params[name], row_ids=row_ids)
+        #    elif block[0].stype == 'default':
+        #        self._kvstore.pull(name, out=aux_params[name])
+        #    else:
+        #        raise NotImplementedError()
+        return arg_params, aux_params
diff --git a/example/sparse/nce_language_model/train.py b/example/sparse/nce_language_model/train.py
new file mode 100644
index 00000000000..c0afed5caf2
--- /dev/null
+++ b/example/sparse/nce_language_model/train.py
@@ -0,0 +1,226 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import numpy as np
+import mxnet as mx
+import run_utils
+import evaluate
+from data import Corpus, CorpusIter, DummyIter, MultiSentenceIter, SampleIter
+from model import *
+from sampler import *
+from sparse_module import SparseModule
+import os, math, logging, time, pickle
+import data_utils
+
+DEBUG_FLG = False
+
+def DEBUG(s):
+    if DEBUG_FLG:
+        print(s)
+
+if __name__ == '__main__':
+    import sys
+    print(sys.argv)
+    parser = run_utils.get_parser(is_train=True)
+    args = parser.parse_args()
+    mx.random.seed(args.seed)
+    np.random.seed(args.seed)
+    head = '%(asctime)-15s %(message)s'
+    logging.basicConfig(level=logging.DEBUG, format=head)
+    logging.info(args)
+    ctx = [mx.gpu(int(i)) for i in args.gpus.split(',')] if args.gpus else [mx.cpu()]
+    ngpus = len(ctx)
+
+    # data
+    vocab = data_utils.Vocabulary.from_file(args.vocab)
+    unigram = vocab.unigram()
+    ntokens = unigram.size
+    os.environ["MXNET_MAGIC_DIM"] = str(ntokens * 513) if not args.dense else "-2"
+
+    train_data = mx.io.PrefetchingIter(MultiSentenceIter(args.data, vocab,
+                                       args.batch_size * ngpus, args.bptt))
+    # model
+    rnn_module = RNNModel(args.bptt, ntokens, args.emsize, args.nhid, args.nlayers,
+                          args.dropout, args.num_proj)
+    nce_module = SampledModule(ntokens, args.nhid, args.k, args.bptt, args.num_proj)
+
+    rnn_out, last_states = rnn_module.forward(args.batch_size)
+    logits, new_targets = nce_module.forward(rnn_out, args.batch_size)
+    loss_scale = args.bptt
+    model = CrossEntropyLoss().forward(logits, new_targets, loss_scale)
+    
+    state_names = rnn_module.state_names
+
+    sparse_params=['encoder_weight', 'decoder_weight']
+    data_names = ['data', 'mask']
+    label_names = ['label']
+
+    # module
+    extra_states = ['sample', 'p_noise_sample', 'p_noise_target']
+
+    import numpy as np
+    # TODO load optimizer state
+    if args.load_epoch < 0:
+        module = SparseModule(symbol=mx.sym.Group(last_states + [model]), context=ctx,
+                              state_names=(state_names + extra_states),
+                              data_names=data_names, label_names=label_names, sparse_params=sparse_params)
+        module.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label)
+        # currently params are initialized explicitly, choice of init has no impact
+        arg_params = {}
+        module.init_params(initializer=mx.init.Xavier(factor_type='out'))
+
+    else:
+        module = SparseModule.load(args.checkpoint_dir, 0, context=ctx, state_names=(state_names + extra_states),
+                                   data_names=data_names, label_names=label_names, sparse_params=sparse_params)
+        module.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label)
+
+    # parameters
+    all_args = model.list_arguments()
+    trainable_args = set(all_args) - set(state_names) - set(extra_states) - set(data_names) - set(label_names)
+    lstm_args = []
+    for arg in trainable_args:
+        if 'lstm' in arg:
+            lstm_args.append(arg)
+    print(lstm_args)
+
+    kvstore = None if args.kvstore is None else mx.kv.create(args.kvstore)
+    require_rsp_pull = kvstore and not args.dense
+    # TODO support custom eps
+    optimizer = mx.optimizer.create('adagrad', learning_rate=args.lr, rescale_grad=1.0/ngpus, eps=args.eps, wd=args.wd)
+
+    module.init_optimizer(optimizer=optimizer, kvstore=kvstore)
+    speedometer = mx.callback.Speedometer(args.batch_size * ngpus * args.bptt, args.log_interval)
+    ############### eval module ####################
+
+    if args.profile:
+        config = ['nhid', args.nhid, 'k', args.k, 'nlayers', args.nlayers,
+                  'dense', args.dense, 'ngpus', ngpus]
+        config_str = map(lambda x: str(x), config)
+        filename = '-'.join(config_str) + '.json'
+        mx.profiler.profiler_set_config(mode='all', filename=filename)
+        mx.profiler.profiler_set_state('run')
+
+    # train
+    def listify(x):
+        return x if isinstance(x, list) else [x]
+
+    def prep_samples(label):
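+        # for each device slice of the labels, draw args.k noise classes from an
+        # approximately log-uniform (Zipfian) distribution over the vocabulary and
+        # keep the expected counts used to correct the sampled logits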
+        label_list = listify(label.split(ngpus, axis=0))
+        p_noise_sample_list = []
+        p_noise_target_list = []
+        sample_list = []
+        for label in label_list:
+            sampled_classes, expected_count_true, expected_count_sampled = mx.nd.contrib.rand_zipfian(label.reshape((-1,1)), args.k, ntokens)
+            sample_list.append(sampled_classes.astype(np.float32))
+            p_noise_target_list.append(expected_count_true.astype(np.float32))
+            p_noise_sample_list.append(expected_count_sampled.astype(np.float32))
+        sample = mx.nd.concat(*sample_list, dim=0)
+        return (sample_list, p_noise_sample_list, p_noise_target_list), sample
+
+    logging.info("Training started ... ")
+    for epoch in range(args.epochs):
+        total_L = mx.nd.array([0.0])
+        nbatch = 0
+        module.set_states(value=0)
+        state_cache = module.get_states(merge_multi_context=False)[:-len(extra_states)]
+        next_batch = train_data.next()
+        next_lists, next_sample = prep_samples(next_batch.label[0])
+        stop_iter = False
+        while not stop_iter:
+            batch = next_batch
+            label = batch.label[0]
+            lists, sample = next_lists, next_sample
+            state_cache += lists
+            module.set_states(states=state_cache)
+            if require_rsp_pull:
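+                # pull only the embedding/softmax rows touched by this batch:
+                # input token ids for encoder_weight, target ids plus noise
+                # sample ids for decoder_weight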
+                data_1d = batch.data[0].reshape((-1,)).astype(np.float32)
+                label_1d = label.reshape((-1,))
+                sample_1d = sample.reshape((-1,)).astype(np.float32)
+                row_ids = mx.nd.concat(label_1d, sample_1d, dim=0)
+                param_rowids = {'encoder_weight': data_1d.astype(np.int64), 'decoder_weight': row_ids.astype(np.int64)}
+                # sync_sparse_params should be part of forward API
+                module.sync_sparse_params(param_rowids)
+
+            module.forward(batch)
+            try:
+                next_batch = train_data.next()
+                next_lists, next_sample = prep_samples(next_batch.label[0])
+            except StopIteration:
+                stop_iter = True
+            outputs = module.get_outputs(merge_multi_context=False)
+            state_cache = outputs[:-1]
+            module.backward()
+            # TODO haibin add_n
+            for g in range(ngpus):
+                total_L += outputs[-1][g].copyto(mx.cpu()) / ngpus
+
+            # update all parameters (including the weight parameter)
+            if args.rescale_embed:
+                param_idx = module._exec_group.param_names.index('encoder_weight')
+                grad_val = module._exec_group.grad_arrays[param_idx]
+                for g in grad_val:
+                    g[:] *= 128
+            if args.per_ctx_clip:
+                norm = module.clip_by_global_norm_per_ctx(max_norm=args.clip, param_names=lstm_args)
+            else:
+                norm = module.clip_by_global_norm(max_norm=args.clip, param_names=lstm_args)
+            #if nbatch % (args.log_interval / 10) == 0:
+                #print(norm)
+            module.update()
+            speedometer_param = mx.model.BatchEndParam(epoch=epoch, nbatch=nbatch,
+                                                       eval_metric=None, locals=locals())
+
+            speedometer(speedometer_param)
+            # update training metric
+            # TODO (revert >=)
+            x = -1 if DEBUG_FLG else 0
+            if nbatch % args.log_interval == 0 and nbatch > x:
+                cur_L = total_L.asscalar() / args.log_interval / loss_scale
+                try:
+                    ppl = math.exp(cur_L) if cur_L < 100 else -1.0
+                except OverflowError:
+                    ppl = -1.0
+                logging.info('Iter[%d] Batch [%d] \tloss %.7f, ppl %.7f'%(
+                    epoch, nbatch, cur_L, ppl))
+                #print('Batch [%d] \tloss %.7f, ppl %.7f \n'%(nbatch, cur_L, ppl))
+                total_L[:] = 0.0
+            nbatch += 1
+            if nbatch == args.checkpoint_interval:
+                #exit()
+                pass
+        if (epoch + 1) % args.checkpoint_interval == 0:
+            module.save_checkpoint(args.checkpoint_dir, epoch % 1, save_optimizer_states=False)
+            nce_mod = SparseModule.load(args.checkpoint_dir, 0, context=mx.cpu(), state_names=(state_names + extra_states),
+                                        data_names=data_names, label_names=label_names, sparse_params=sparse_params)
+            checkpoint_iter = MultiSentenceIter(args.data, vocab,
+                                                args.batch_size, args.bptt)
+            nce_mod.bind(data_shapes=checkpoint_iter.provide_data, label_shapes=checkpoint_iter.provide_label)
+
+            ############### eval model ####################
+            eval_model = ce_loss(rnn_out, ntokens, args.dense)
+            ############### eval module ####################
+            eval_module = SparseModule(symbol=mx.sym.Group(last_states + [eval_model]), context=mx.cpu(), data_names=data_names,
+                                       label_names=label_names, state_names=state_names, sparse_params=sparse_params)
+            test_data_path = "/home/ubuntu/gbw-validation/heldout-monolingual.tokenized.shuffled/*"
+            eval_data = mx.io.PrefetchingIter(MultiSentenceIter(test_data_path, vocab,
+                                              args.batch_size, args.bptt))
+            eval_module.bind(data_shapes=eval_data.provide_data, label_shapes=eval_data.provide_label, shared_module=nce_mod, for_training=False)
+            val_L = evaluate.evaluate(eval_module, eval_data, epoch, 20, early_stop=None)
+        train_data.reset()
+    logging.info("Training completed. ")
+    if args.profile:
+        mx.profiler.profiler_set_state('stop')
diff --git a/example/sparse/nce_language_model/tune.py b/example/sparse/nce_language_model/tune.py
new file mode 100644
index 00000000000..df1365f4084
--- /dev/null
+++ b/example/sparse/nce_language_model/tune.py
@@ -0,0 +1,55 @@
+import subprocess, os
+import argparse
+import logging
+
+parser = argparse.ArgumentParser(description="Hyperparameter search for the NCE " \
+                                             "language model example",
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--gpu', type=int, default=0,
+                    help='which gpu')
+
+
+num_gpus = 1
+
+args = parser.parse_args()
+gpu = args.gpu
+print("total num gpus = %d, current gpu = %d" % (num_gpus, gpu))
+
+LR = [0.05, 0.1, 0.2]
+BPTT = [20, 35]
+K = [8192, 10240]
+CLIP = [1, 5, 10, 15]
+DROPOUT = [0.01, 0.05, 0.1, 0.2]
+EPS = [1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001]
+PER_CTX = [True, False]
+RESCALE = [True, False]
+
+#LR = [0.2]
+#BPTT = [20]
+#K = [8192]
+#CLIP = [10]
+#DROPOUT = [0.1]
+total_iter = len(LR) * len(BPTT) * len(K) * len(CLIP) * len(DROPOUT) * len(EPS) * len(PER_CTX) * len(RESCALE)
+
+for lr in LR:
+    for bptt in BPTT:
+        for k in K:
+            for clip in CLIP:
+                for dropout in DROPOUT:
+                    for eps in EPS:
+                        for ctx in PER_CTX:
+                            for resc in RESCALE:
+                                my_env = os.environ.copy()
+                                my_env["PYTHONPATH"] = "/home/ubuntu/tf/python:" + my_env["PYTHONPATH"]
+
+                                config = ["--bptt", str(bptt), "--k", str(k), '--dropout', str(dropout), \
+                                          '--clip', str(clip), '--lr', str(lr), '--eps', str(eps)]
+                                if ctx:
+                                    config += ["--per-ctx-clip"]
+                                if resc:
+                                    config += ["--rescale-embed"]
+                                cmd = ["python", "train.py", "--gpus=0,1,2,3", "--data=/home/ubuntu/gbw-5/training-monolingual.tokenized.shuffled/*",
+                                       "--epoch=1", "--checkpoint-interval=100"] + config
+                                filename = 'logs-5/tune' + '-'.join(config) + ".tunelog"
+                                with open(filename, "w") as outfile:
+                                    subprocess.check_call(cmd, stderr=outfile, env=my_env)
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index e85afe522f0..f93a81c892b 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -1797,7 +1797,8 @@ MXNET_DLL int MXKVStorePull(KVStoreHandle handle,
                             mx_uint num,
                             const int* keys,
                             NDArrayHandle* vals,
-                            int priority);
+                            int priority,
+                            const bool ignore_sparse DEFAULT(true));
 /*!
  * \brief pull a list of (key, value) pairs from the kvstore, where each key is a string
  * \param handle handle to the kvstore
@@ -1811,7 +1812,8 @@ MXNET_DLL int MXKVStorePullEx(KVStoreHandle handle,
                               mx_uint num,
                               const char** keys,
                               NDArrayHandle* vals,
-                              int priority);
+                              int priority,
+                              const bool ignore_sparse DEFAULT(true));
 
 /*!
  * \brief pull a list of (key, value) pairs from the kvstore, where each key is an integer.
diff --git a/include/mxnet/kvstore.h b/include/mxnet/kvstore.h
index 4e99a9c861f..efcf630b781 100644
--- a/include/mxnet/kvstore.h
+++ b/include/mxnet/kvstore.h
@@ -170,19 +170,21 @@ class KVStore {
    * \param keys the list of keys
    * \param values the list of buffers for the pulled data, they should be preallocated
    * \param priority Priority of the action.
+   * \param ignore_sparse Whether to ignore sparse ndarrays in the list.
    */
   virtual void Pull(const std::vector<int>& keys,
                     const std::vector<NDArray*>& values,
-                    int priority = 0) = 0;
+                    int priority = 0, bool ignore_sparse = true) = 0;
   /*!
    * \brief pull a list of key-value pairs from the store
    * \param keys the list of keys in string format
    * \param values the list of buffers for the pulled data, they should be preallocated
    * \param priority Priority of the action.
+   * \param ignore_sparse Whether to ignore sparse ndarrays in the list.
    */
   virtual void Pull(const std::vector<std::string>& str_keys,
                     const std::vector<NDArray*>& values,
-                    int priority = 0) = 0;
+                    int priority = 0, bool ignore_sparse = true) = 0;
 
   /*!
    * \brief pull a list of key-value pairs from the store.
diff --git a/python/mxnet/gluon/contrib/rnn/rnn_cell.py b/python/mxnet/gluon/contrib/rnn/rnn_cell.py
index d74c107df56..bb11cbb783b 100644
--- a/python/mxnet/gluon/contrib/rnn/rnn_cell.py
+++ b/python/mxnet/gluon/contrib/rnn/rnn_cell.py
@@ -17,9 +17,9 @@
 
 # coding: utf-8
 """Definition of various recurrent neural network cells."""
-__all__ = ['VariationalDropoutCell']
+__all__ = ['VariationalDropoutCell', 'LSTMPCell']
 
-from ...rnn import BidirectionalCell, SequentialRNNCell, ModifierCell
+from ...rnn import BidirectionalCell, SequentialRNNCell, ModifierCell, HybridRecurrentCell
 from ...rnn.rnn_cell import _format_sequence, _get_begin_state
 
 
@@ -181,3 +181,123 @@ def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=N
         outputs, _, _, _ = _format_sequence(length, outputs, layout, merge_outputs)
 
         return outputs, states
+
+
+class LSTMPCell(HybridRecurrentCell):
+    r"""Long-Short Term Memory Projected (LSTMP) network cell.
+    (https://arxiv.org/abs/1402.1128)
+    Each call computes the following function:
+    .. math::
+        \begin{array}{ll}
+        i_t = sigmoid(W_{ii} x_t + b_{ii} + W_{ri} r_{(t-1)} + b_{ri}) \\
+        f_t = sigmoid(W_{if} x_t + b_{if} + W_{rf} r_{(t-1)} + b_{rf}) \\
+        g_t = \tanh(W_{ig} x_t + b_{ig} + W_{rg} r_{(t-1)} + b_{rg}) \\
+        o_t = sigmoid(W_{io} x_t + b_{io} + W_{ro} r_{(t-1)} + b_{ro}) \\
+        c_t = f_t * c_{(t-1)} + i_t * g_t \\
+        h_t = o_t * \tanh(c_t) \\
+        r_t = W_{hr} h_t
+        \end{array}
+    where :math:`r_t` is the projected recurrent activation at time `t`,
+    :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the
+    cell state at time `t`, :math:`x_t` is the input at time `t`, and :math:`i_t`,
+    :math:`f_t`, :math:`g_t`, :math:`o_t` are the input, forget, cell, and
+    out gates, respectively.
+    Parameters
+    ----------
+    hidden_size : int
+        Number of units in cell state symbol.
+    projection_size : int
+        Number of units in output symbol.
+    i2h_weight_initializer : str or Initializer
+        Initializer for the input weights matrix, used for the linear
+        transformation of the inputs.
+    h2h_weight_initializer : str or Initializer
+        Initializer for the recurrent weights matrix, used for the linear
+        transformation of the hidden state.
+    proj_weight_initializer : str or Initializer
+        Initializer for the projection weights matrix, used for the linear
+        transformation of the recurrent state.
+    i2h_bias_initializer : str or Initializer, default 'zeros'
+        Initializer for the bias vector.
+    h2h_bias_initializer : str or Initializer
+        Initializer for the bias vector.
+    prefix : str, default 'lstmp_'
+        Prefix for name of `Block`s
+        (and name of weight if params is `None`).
+    params : Parameter or None
+        Container for weight sharing between cells.
+        Created if `None`.
+    Inputs:
+        - **data**: input tensor with shape `(batch_size, input_size)`.
+        - **states**: a list of two initial recurrent state tensors, with shape
+          `(batch_size, projection_size)` and `(batch_size, hidden_size)` respectively.
+    Outputs:
+        - **out**: output tensor with shape `(batch_size, projection_size)`.
+        - **next_states**: a list of two output recurrent state tensors. Each has
+          the same shape as `states`.
+    """
+    def __init__(self, hidden_size, projection_size,
+                 i2h_weight_initializer=None, h2h_weight_initializer=None,
+                 proj_weight_initializer=None,
+                 i2h_bias_initializer='zeros', h2h_bias_initializer='zeros',
+                 input_size=0, prefix=None, params=None):
+        super(LSTMPCell, self).__init__(prefix=prefix, params=params)
+
+        self._hidden_size = hidden_size
+        self._input_size = input_size
+        self._projection_size = projection_size
+        self.i2h_weight = self.params.get('i2h_weight', shape=(4*hidden_size, input_size),
+                                          init=i2h_weight_initializer,
+                                          allow_deferred_init=True)
+        self.h2h_weight = self.params.get('h2h_weight', shape=(4*hidden_size, projection_size),
+                                          init=h2h_weight_initializer,
+                                          allow_deferred_init=True)
+        self.proj_weight = self.params.get('proj_weight', shape=(projection_size, hidden_size),
+                                           init=proj_weight_initializer,
+                                           allow_deferred_init=True)
+        self.i2h_bias = self.params.get('i2h_bias', shape=(4*hidden_size,),
+                                        init=i2h_bias_initializer,
+                                        allow_deferred_init=True)
+        self.h2h_bias = self.params.get('h2h_bias', shape=(4*hidden_size,),
+                                        init=h2h_bias_initializer,
+                                        allow_deferred_init=True)
+
+    def state_info(self, batch_size=0):
+        return [{'shape': (batch_size, self._projection_size), '__layout__': 'NC'},
+                {'shape': (batch_size, self._hidden_size), '__layout__': 'NC'}]
+
+    def _alias(self):
+        return 'lstmp'
+
+    def __repr__(self):
+        s = '{name}({mapping})'
+        shape = self.i2h_weight.shape
+        proj_shape = self.proj_weight.shape
+        mapping = '{0} -> {1} -> {2}'.format(shape[1] if shape[1] else None, shape[0], proj_shape[0])
+        return s.format(name=self.__class__.__name__,
+                        mapping=mapping,
+                        **self.__dict__)
+
+    def hybrid_forward(self, F, inputs, states, i2h_weight,
+                       h2h_weight, proj_weight, i2h_bias, h2h_bias):
+        prefix = 't%d_'%self._counter
+        i2h = F.FullyConnected(data=inputs, weight=i2h_weight, bias=i2h_bias,
+                               num_hidden=self._hidden_size*4, name=prefix+'i2h')
+        h2h = F.FullyConnected(data=states[0], weight=h2h_weight, bias=h2h_bias,
+                               num_hidden=self._hidden_size*4, name=prefix+'h2h')
+        gates = i2h + h2h
+        slice_gates = F.SliceChannel(gates, num_outputs=4, name=prefix+'slice')
+        in_gate = F.Activation(slice_gates[0], act_type="sigmoid", name=prefix+'i')
+        forget_gate = F.Activation(slice_gates[1], act_type="sigmoid", name=prefix+'f')
+        in_transform = F.Activation(slice_gates[2], act_type="tanh", name=prefix+'c')
+        out_gate = F.Activation(slice_gates[3], act_type="sigmoid", name=prefix+'o')
+        next_c = F._internal._plus(forget_gate * states[1], in_gate * in_transform,
+                                   name=prefix+'state')
+        hidden = F._internal._mul(out_gate, F.Activation(next_c, act_type="tanh"),
+                                  name=prefix+'hidden')
+        next_r = F.FullyConnected(data=hidden, num_hidden=self._projection_size,
+                                       weight=proj_weight, no_bias=True, name=prefix+'out')
+
+        return next_r, [next_r, next_c]
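
For reference, a minimal usage sketch of the LSTMPCell added above (it assumes this
branch is built so that the cell is importable from mxnet.gluon.contrib.rnn; the
sizes are illustrative only):

    import mxnet as mx
    from mxnet.gluon.contrib.rnn import LSTMPCell

    # 2048-unit cell state, recurrent/output state projected down to 512 units
    cell = LSTMPCell(hidden_size=2048, projection_size=512)
    cell.initialize()
    x = mx.nd.random.uniform(shape=(32, 20, 256))   # (batch, seq_len, input_size)
    outputs, states = cell.unroll(20, x, layout='NTC', merge_outputs=True)
    print(outputs.shape)   # (32, 20, 512): projection_size, not hidden_size
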
diff --git a/python/mxnet/kvstore.py b/python/mxnet/kvstore.py
index 890c9024d87..9614313058d 100644
--- a/python/mxnet/kvstore.py
+++ b/python/mxnet/kvstore.py
@@ -141,6 +141,7 @@ def init(self, key, value):
         >>> print b
         <RowSparseNDArray 2x3 @cpu(0)>
         """
+        #print('init sum(%s) = %.7f' % (key, value.sum().asnumpy()[0]))
         ckeys, cvals, use_str_keys = _ctype_key_value(key, value)
         if use_str_keys:
             check_call(_LIB.MXKVStoreInitEx(self.handle, mx_uint(len(ckeys)), ckeys, cvals))
@@ -227,7 +228,7 @@ def push(self, key, value, priority=0):
                 self.handle, mx_uint(len(ckeys)), ckeys, cvals, ctypes.c_int(priority)))
 
 
-    def pull(self, key, out=None, priority=0):
+    def pull(self, key, out=None, priority=0, ignore_sparse=True):
         """ Pulls a single value or a sequence of values from the store.
 
         This function returns immediately after adding an operator to the engine.
@@ -255,6 +256,9 @@ def pull(self, key, out=None, priority=0):
             Higher priority pull operations are likely to be executed before
             other pull actions.
 
+        ignore_sparse : bool, optional
+            Whether to ignore sparse ndarrays during the operation. Defaults to True.
+
         Examples
         --------
         >>> # pull a single key-value pair
@@ -290,11 +294,11 @@ def pull(self, key, out=None, priority=0):
         assert(out is not None)
         ckeys, cvals, use_str_keys = _ctype_key_value(key, out)
         if use_str_keys:
-            check_call(_LIB.MXKVStorePullEx(
-                self.handle, mx_uint(len(ckeys)), ckeys, cvals, ctypes.c_int(priority)))
+            check_call(_LIB.MXKVStorePullEx(self.handle, mx_uint(len(ckeys)), ckeys, cvals,
+                                            ctypes.c_int(priority), ctypes.c_bool(ignore_sparse)))
         else:
-            check_call(_LIB.MXKVStorePull(
-                self.handle, mx_uint(len(ckeys)), ckeys, cvals, ctypes.c_int(priority)))
+            check_call(_LIB.MXKVStorePull(self.handle, mx_uint(len(ckeys)), ckeys, cvals,
+                                          ctypes.c_int(priority), ctypes.c_bool(ignore_sparse)))
 
     def row_sparse_pull(self, key, out=None, priority=0, row_ids=None):
         """ Pulls a single RowSparseNDArray value or a sequence of RowSparseNDArray values \
@@ -321,7 +325,7 @@ def row_sparse_pull(self, key, out=None, priority=0, row_ids=None):
             other pull actions.
 
         row_ids : NDArray or list of NDArray
-            The row_ids for which to pull for each value. Each row_id is an 1D NDArray \
+            The row_ids for which to pull for each value. Each row_id is a 1-D NDArray \
             whose values don't have to be unique nor sorted.
 
         Examples
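
As a rough illustration of the new ignore_sparse flag on pull (a sketch only,
assuming a local kvstore built from this branch):

    import mxnet as mx

    kv = mx.kv.create('local')
    kv.init('w', mx.nd.ones((4, 2)).tostype('row_sparse'))
    out = mx.nd.zeros((4, 2)).tostype('row_sparse')

    # default: row_sparse outputs are skipped during pull (a warning is logged once)
    kv.pull('w', out=out)
    # ignore_sparse=False includes them in the pull instead of skipping them
    kv.pull('w', out=out, ignore_sparse=False)
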
diff --git a/python/mxnet/model.py b/python/mxnet/model.py
index 33dae173259..ff656aed890 100644
--- a/python/mxnet/model.py
+++ b/python/mxnet/model.py
@@ -94,13 +94,15 @@ def _create_kvstore(kvstore, num_device, arg_params):
 
     return (kv, update_on_kvstore)
 
-def _initialize_kvstore(kvstore, param_arrays, arg_params, param_names, update_on_kvstore):
+def _initialize_kvstore(kvstore, param_arrays, arg_params, param_names, update_on_kvstore, skip_pull=None):
     """Initialize kvstore"""
     for idx, param_on_devs in enumerate(param_arrays):
         name = param_names[idx]
         kvstore.init(name, arg_params[name])
 
         if update_on_kvstore:
+            if skip_pull and name in skip_pull:
+                continue
             kvstore.pull(name, param_on_devs, priority=-idx)
 
 def _update_params_on_kvstore_nccl(param_arrays, grad_arrays, kvstore, param_names):
@@ -123,7 +125,8 @@ def _update_params_on_kvstore_nccl(param_arrays, grad_arrays, kvstore, param_nam
         kvstore.pull(valid_param_names[start:end], valid_param_arrays[start:end], priority=-start)
         start = end
 
-def _update_params_on_kvstore(param_arrays, grad_arrays, kvstore, param_names):
+
+def _update_params_on_kvstore(param_arrays, grad_arrays, kvstore, param_names, skip_pull=None):
     """Perform update of param_arrays from grad_arrays on kvstore."""
     for index, pair in enumerate(zip(param_arrays, grad_arrays)):
         arg_list, grad_list = pair
@@ -132,6 +135,8 @@ def _update_params_on_kvstore(param_arrays, grad_arrays, kvstore, param_names):
         name = param_names[index]
         # push gradient, priority is negative index
         kvstore.push(name, grad_list, priority=-index)
+        if skip_pull and name in skip_pull:
+            continue
         # pull back the weights
         kvstore.pull(name, arg_list, priority=-index)
 
diff --git a/python/mxnet/module/executor_group.py b/python/mxnet/module/executor_group.py
index 32400c11dbc..50d2e1a4e82 100755
--- a/python/mxnet/module/executor_group.py
+++ b/python/mxnet/module/executor_group.py
@@ -413,7 +413,9 @@ def get_params(self, arg_params, aux_params):
         - This function will inplace update the NDArrays in arg_params and aux_params.
         """
         for name, block in zip(self.param_names, self.param_arrays):
-            weight = sum(w.copyto(ctx.cpu()) for w in block) / len(block)
+            # XXX temp hack to avoid fallback
+            weight_cpu = [w.copyto(ctx.cpu()) for w in block]
+            weight = nd.add_n(*weight_cpu) / len(block)
             weight.astype(arg_params[name].dtype).copyto(arg_params[name])
         for name, block in zip(self.aux_names, self.aux_arrays):
             weight = sum(w.copyto(ctx.cpu()) for w in block) / len(block)
diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py
index 1cf70402ce3..c8a56b70766 100644
--- a/python/mxnet/module/module.py
+++ b/python/mxnet/module/module.py
@@ -26,6 +26,7 @@
 
 from .. import context as ctx
 from .. import optimizer as opt
+from .. import gluon
 
 from .executor_group import DataParallelExecutorGroup
 from ..model import _create_kvstore, _initialize_kvstore, _update_params, _update_params_on_kvstore
@@ -36,6 +37,48 @@
 
 from .base_module import BaseModule, _check_input_names, _parse_data_desc
 
+import mxnet.ndarray as nd
+
+def nd_global_norm(t_list):
+    """Computes the global norm of multiple tensors.
+    Given a tuple or list of tensors t_list, this operation returns the global norm of the elements
+     in all tensors in t_list. The global norm is computed as:
+    ``global_norm = sqrt(sum([l2norm(t)**2 for t in t_list]))``
+    Any entries in t_list that are of type None are ignored.
+    Parameters
+    ----------
+    t_list: list or tuple
+        The NDArray list
+    Returns
+    -------
+    ret: NDArray
+        The global norm. The shape of the NDArray will be (1,)
+    Examples
+    --------
+    >>> x = mx.nd.ones((2, 3))
+    >>> y = mx.nd.ones((5, 6))
+    >>> z = mx.nd.ones((4, 2, 3))
+    >>> print(nd_global_norm([x, y, z]).asscalar())
+    7.74597
+    >>> xnone = None
+    >>> ret = nd_global_norm([x, y, z, xnone])
+    >>> print(ret.asscalar())
+    7.74597
+    """
+    ret = None
+    for arr in t_list:
+        if arr is not None:
+            if arr.stype == 'row_sparse':
+                norm = nd._internal._square_sum(arr, axis=0).sum().as_in_context(ctx.cpu())
+            else:
+                norm = nd.square(nd.norm(arr)).copyto(ctx.cpu())
+            if ret is None:
+                ret = norm
+            else:
+                ret += norm
+    ret = nd.sqrt(ret)
+    return ret
+
 class Module(BaseModule):
     """Module is a basic module that wrap a `Symbol`. It is functionally the same
     as the `FeedForward` model, except under the module API.
@@ -791,3 +834,86 @@ def install_monitor(self, mon):
         """Installs monitor on all executors. """
         assert self.binded
         self._exec_group.install_monitor(mon)
+
+    def clip_by_global_norm(self, max_norm=1.0, param_names=None):
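+        """Clips gradient norm computed over all gradients together, as if they
+         were concatenated into a single vector. Gradients are modified in-place.
+         Returns the computed global norm before clipping.
+        """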
+        assert self.binded and self.params_initialized and self.optimizer_initialized
+        grad_array = []
+        if param_names is None:
+            for grads in self._exec_group.grad_arrays:
+                grad_array += grads
+        else:
+            for param_name in param_names:
+                param_idx = self._exec_group.param_names.index(param_name)
+                grad_val = self._exec_group.grad_arrays[param_idx]
+                grad_array += grad_val
+
+        norm_val = self.global_grad_norm(grad_array)
+        if norm_val > max_norm:
+            ratio = max_norm / float(norm_val)
+            for grad in grad_array:
+                grad *= ratio
+        return norm_val
+
+    def clip_by_global_norm_per_ctx(self, max_norm=1.0, param_names=None):
+        """Clips gradient norm.
+        The norm is computed over all gradients together, as if they were
+         concatenated into a single vector. Gradients are modified in-place.
+        The method is first used in
+         `[ICML2013] On the difficulty of training recurrent neural networks`
+        Parameters
+        ----------
+        max_norm : float or int
+            The maximum clipping threshold of the gradient norm.
+        Returns
+        -------
+        norm_vals : list of float
+            The computed norm of the gradients on each context.
+        Examples
+        --------
+        An example of using clip_by_global_norm_per_ctx to clip the gradients before updating the parameters::
+            >>> #Get the gradient via back-propagation
+            >>> net.forward_backward(data_batch=data_batch)
+            >>> norm_vals = net.clip_by_global_norm_per_ctx(max_norm=1.0, param_names=param_names)
+            >>> net.update()
+        """
+        assert self.binded and self.params_initialized and self.optimizer_initialized
+        num_ctx = len(self._exec_group.grad_arrays[0])
+        grad_array_per_ctx = [[] for i in range(num_ctx)]
+        assert(param_names is not None)
+        for param_name in param_names:
+            param_idx = self._exec_group.param_names.index(param_name)
+            grad_val = self._exec_group.grad_arrays[param_idx]
+            assert(len(grad_val) == num_ctx)
+            for i in range(num_ctx):
+                grad_array_per_ctx[i].append(grad_val[i])
+        norm_vals = []
+        for i in range(num_ctx):
+            norm_val = self.global_grad_norm(grad_array_per_ctx[i])
+            norm_vals.append(norm_val)
+            if norm_val > max_norm:
+                ratio = max_norm / float(norm_val)
+                for grad in grad_array_per_ctx[i]:
+                    grad *= ratio
+        return norm_vals
+
+    def global_grad_norm(self, arr):
+        """Calculate global gradient norm.
+        The L2 norm is computed over all gradients together, as if they were
+         concatenated into a single vector.
+        Could be used to debug the optimization process.
+         See http://videolectures.net/deeplearning2015_goodfellow_network_optimization/
+        Returns
+        -------
+        norm_val : float
+            The computed norm of the gradients.
+        Examples
+        --------
+        An example of using global_grad_norm to calculate the gradient norm after back-propagation::
+            >>> #Get the gradient via back-propagation
+            >>> net.forward_backward(data_batch=data_batch)
+            >>> norm_val = net.global_grad_norm()
+            >>> print(norm_val)
+        """
+        norm_val = 0.0
+        norm_val += nd_global_norm(arr).asscalar()
+        return norm_val
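
A self-contained sketch of exercising the new Module helpers on a toy network
(the layer sizes and parameter names below are illustrative, not part of this PR):

    import mxnet as mx

    data = mx.sym.Variable('data')
    net = mx.sym.FullyConnected(data, num_hidden=4, name='fc')
    net = mx.sym.SoftmaxOutput(net, name='softmax')

    mod = mx.mod.Module(net, context=[mx.cpu()])
    mod.bind(data_shapes=[('data', (8, 16))], label_shapes=[('softmax_label', (8,))])
    mod.init_params()
    mod.init_optimizer(optimizer='sgd')

    batch = mx.io.DataBatch(data=[mx.nd.ones((8, 16))], label=[mx.nd.zeros((8,))])
    mod.forward_backward(batch)
    # clip the selected gradients on each context separately, then update
    norms = mod.clip_by_global_norm_per_ctx(max_norm=1.0,
                                            param_names=['fc_weight', 'fc_bias'])
    mod.update()
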
diff --git a/python/mxnet/ndarray/sparse.py b/python/mxnet/ndarray/sparse.py
index c65d7ce408e..7799afa4fa3 100644
--- a/python/mxnet/ndarray/sparse.py
+++ b/python/mxnet/ndarray/sparse.py
@@ -397,7 +397,10 @@ def __setitem__(self, key, value):
                 if value.handle is not self.handle:
                     value.copyto(self)
             elif isinstance(value, numeric_types):
-                raise ValueError("Assigning numeric types to CSRNDArray is " \
+                if value == 0:
+                    zeros('csr', shape=self.shape, ctx=self.context, dtype=self.dtype).copyto(self)
+                else:
+                    raise ValueError("Assigning numeric types to CSRNDArray is " \
                                  "not implemented yet.")
             elif isinstance(value, (np.ndarray, np.generic)):
                 # TODO(haibin/anisub) check scipy.sparse and use _sync_copy_from to
diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index 065c08cee4e..57a2816d8d5 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -23,10 +23,10 @@
 import warnings
 import numpy
 from .base import py_str
-from .ndarray import (NDArray, zeros, clip, sqrt, cast, maximum, abs as NDabs)
+from .ndarray import (NDArray, zeros, ones, clip, sqrt, cast, maximum, abs as NDabs)
 from .ndarray import (sgd_update, sgd_mom_update, adam_update, rmsprop_update, rmspropalex_update,
                       mp_sgd_update, mp_sgd_mom_update, square, ftrl_update, ftml_update,
-                      signsgd_update, signum_update)
+                      signsgd_update, signum_update, adagrad_update)
 from .ndarray import _internal
 from .ndarray import op
 from .ndarray import sparse
@@ -1084,7 +1084,8 @@ def __init__(self, eps=1e-7, **kwargs):
         self.float_stable_eps = eps
 
     def create_state(self, index, weight):
-        return zeros(weight.shape, weight.context, stype=weight.stype)  # history
+        s = zeros(weight.shape, weight.context, stype=weight.stype)
+        return s
 
     def update(self, index, weight, grad, state):
         assert(isinstance(weight, NDArray))
@@ -1093,17 +1094,18 @@ def update(self, index, weight, grad, state):
         lr = self._get_lr(index)
         wd = self._get_wd(index)
 
-        is_sparse = True if weight.stype == 'row_sparse' and grad.stype == 'row_sparse' else False
-
-        if is_sparse is True:
-            grad_indices_count = len(grad.indices)
+        is_sparse = weight.stype == 'row_sparse' and grad.stype == 'row_sparse'
+        if is_sparse:
+            # TODO check wd
+            kwargs = {'eps': self.float_stable_eps, 'rescale_grad': self.rescale_grad}
+            if self.clip_gradient:
+                kwargs['clip_gradient'] = self.clip_gradient
+            adagrad_update(weight, grad, state, out=weight, lr=lr, wd=wd, **kwargs)
+            return
 
         grad = grad * self.rescale_grad
-
-        if is_sparse is True:
-            grad_indices = grad.indices
-            # Make sure that the scalar multiply still has a sparse result
-            assert grad_indices_count == len(grad_indices)
+        if wd > 0:
+            grad += weight * wd
 
         if self.clip_gradient is not None:
             grad = clip(grad, -self.clip_gradient, self.clip_gradient)
@@ -1111,24 +1113,11 @@ def update(self, index, weight, grad, state):
         save_history_stype = history.stype
 
         if is_sparse:
-            history[:] = sparse.elemwise_add(sparse.square(grad),
-                                             sparse.retain(history, grad_indices))
-            history_indices = history.indices
-            assert len(history_indices) == grad_indices_count
-            adjusted_add = _internal._scatter_plus_scalar(history, self.float_stable_eps)
-            srt = op.sqrt(adjusted_add)
-            div = _internal._scatter_elemwise_div(grad, srt)
-            retained_weight = sparse.retain(weight, grad.indices)
-            to_add = sparse.elemwise_add(div, _internal._mul_scalar(retained_weight, float(wd)))
-            assert len(to_add.indices) == grad_indices_count
-            weight[:] = sparse.elemwise_add(weight, _internal._mul_scalar(to_add, float(-lr)))
-            state[:] = history
-            assert state.stype == save_history_stype
-            assert len(history_indices) == grad_indices_count
+            raise NotImplementedError()
         else:
             history[:] += square(grad)
-            div = grad / sqrt(history + self.float_stable_eps)
-            weight[:] += (div + weight * wd) * -lr
+            div = grad / (sqrt(history + self.float_stable_eps))
+            weight[:] += div * -lr
 
 @register
 class RMSProp(Optimizer):
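
With the change above, AdaGrad's sparse path is delegated to a fused adagrad_update
operator; a rough sketch of exercising it with row_sparse weight and gradient
(values are arbitrary, and this assumes a build of this branch, which provides the
fused operator):

    import mxnet as mx

    opt = mx.optimizer.AdaGrad(learning_rate=0.05, eps=1e-4)
    weight = mx.nd.ones((4, 2)).tostype('row_sparse')
    grad = (mx.nd.ones((4, 2)) * 0.1).tostype('row_sparse')
    state = opt.create_state(0, weight)   # row_sparse history, same shape as weight
    opt.update(0, weight, grad, state)    # dispatches to adagrad_update for sparse inputs
    print(weight.asnumpy())
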
diff --git a/python/mxnet/rnn/rnn_cell.py b/python/mxnet/rnn/rnn_cell.py
index 3301102ba90..8d0e127029e 100644
--- a/python/mxnet/rnn/rnn_cell.py
+++ b/python/mxnet/rnn/rnn_cell.py
@@ -419,19 +419,24 @@ class LSTMCell(BaseRNNCell):
     forget_bias : bias added to forget gate, default 1.0.
         Jozefowicz et al. 2015 recommends setting this to 1.0
     """
-    def __init__(self, num_hidden, prefix='lstm_', params=None, forget_bias=1.0):
+    def __init__(self, num_hidden, prefix='lstm_', params=None, forget_bias=1.0, num_proj=0):
         super(LSTMCell, self).__init__(prefix=prefix, params=params)
 
         self._num_hidden = num_hidden
-        self._iW = self.params.get('i2h_weight')
-        self._hW = self.params.get('h2h_weight')
+        self._num_proj = num_proj
+        import math
+        scale = 1.0 / math.sqrt(num_hidden)
+        self._iW = self.params.get('i2h_weight')#, init=init.Uniform(scale))
+        self._hW = self.params.get('h2h_weight')#, init=init.Uniform(scale))
         # we add the forget_bias to i2h_bias, this adds the bias to the forget gate activation
-        self._iB = self.params.get('i2h_bias', init=init.LSTMBias(forget_bias=forget_bias))
-        self._hB = self.params.get('h2h_bias')
+        self._iB = self.params.get('i2h_bias')#, init=init.Uniform(scale))
+        self._hB = self.params.get('h2h_bias')#, init=init.Uniform(scale))
+        if self._num_proj > 0:
+            self._pW = self.params.get('pj_weight')
 
     @property
     def state_info(self):
-        return [{'shape': (0, self._num_hidden), '__layout__': 'NC'},
+        return [{'shape': (0, self._num_hidden if self._num_proj == 0 else self._num_proj), '__layout__': 'NC'},
                 {'shape': (0, self._num_hidden), '__layout__': 'NC'}]
 
     @property
@@ -462,6 +467,8 @@ def __call__(self, inputs, states):
                                         name='%sstate'%name)
         next_h = symbol._internal._mul(out_gate, symbol.Activation(next_c, act_type="tanh"),
                                        name='%sout'%name)
+        if self._num_proj:
+            next_h = symbol.FullyConnected(next_h, num_hidden=self._num_proj, weight=self._pW, no_bias=True)
 
         return next_h, [next_h, next_c]
 
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index b41a142ab64..7d4db437fe7 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -832,7 +832,8 @@ int MXKVStorePull(KVStoreHandle handle,
                   mx_uint num,
                   const int* keys,
                   NDArrayHandle* vals,
-                  int priority) {
+                  int priority,
+                  const bool ignore_sparse) {
   API_BEGIN();
   std::vector<int> v_keys(num);
   std::vector<NDArray*> v_vals(num);
@@ -840,7 +841,7 @@ int MXKVStorePull(KVStoreHandle handle,
     v_keys[i] = keys[i];
     v_vals[i] = static_cast<NDArray*>(vals[i]);
   }
-  static_cast<KVStore*>(handle)->Pull(v_keys, v_vals, priority);
+  static_cast<KVStore*>(handle)->Pull(v_keys, v_vals, priority, ignore_sparse);
   API_END();
 }
 
@@ -848,7 +849,8 @@ int MXKVStorePullEx(KVStoreHandle handle,
                   mx_uint num,
                   const char** keys,
                   NDArrayHandle* vals,
-                  int priority) {
+                  int priority,
+                  const bool ignore_sparse) {
   API_BEGIN();
   std::vector<std::string> v_keys(num);
   std::vector<NDArray*> v_vals(num);
@@ -856,7 +858,7 @@ int MXKVStorePullEx(KVStoreHandle handle,
     v_keys[i] = keys[i];
     v_vals[i] = static_cast<NDArray*>(vals[i]);
   }
-  static_cast<KVStore*>(handle)->Pull(v_keys, v_vals, priority);
+  static_cast<KVStore*>(handle)->Pull(v_keys, v_vals, priority, ignore_sparse);
   API_END();
 }
 
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc
index 7d31a31b839..ee97649768b 100644
--- a/src/executor/graph_executor.cc
+++ b/src/executor/graph_executor.cc
@@ -802,17 +802,17 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx,
           CHECK_EQ(inferred_stype, arg_nd_stype)
             << "Inferred stype does not match shared_exec.arg_array's stype"
                " Therefore, the allocated memory for shared_exec.arg_array cannot"
-               " be resued for creating NDArray of the argument"
+               " be resued for creating NDArray of the argument "
             << arg_name << " for the current executor";
           CHECK_EQ(inferred_shape, in_arg_nd.shape())
             << "Inferred shape does not match shared_exec.arg_array's shape"
                " Therefore, the allocated memory for shared_exec.arg_array cannot"
-               " be resued for creating NDArray of the argument"
+               " be resued for creating NDArray of the argument "
             << arg_name << " for the current executor";
           CHECK_EQ(inferred_dtype, in_arg_nd.dtype())
             << "Inferred dtype does not match shared_exec.arg_array's dtype"
                " Therefore, the allocated memory for shared_exec.arg_array cannot"
-               " be resued for creating NDArray of the argument"
+               " be resued for creating NDArray of the argument "
             << arg_name << " for the current executor";
           in_arg_vec->emplace_back(in_arg_nd);
         } else {
diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h
index da2d03d519f..3085966e99b 100644
--- a/src/kvstore/comm.h
+++ b/src/kvstore/comm.h
@@ -65,14 +65,14 @@ class Comm {
 
   /**
    * \brief broadcast src to dst[i] with target row_ids for every i
+   * \param key the identifier key for the stored ndarray
+   * \param src the source row_sparse ndarray to broadcast
    * \param dst a list of destination row_sparse NDArray and its target row_ids to broadcast,
-            where the row_ids are expected to be unique and sorted
-   * \param use_copy if set to true, directly copy src to dst[i] without looking up the
-            provided row_ids
+            where the row_ids in row_id.data() are expected to be unique and sorted
+   * \param priority the priority of the operation
    */
   virtual void BroadcastRowSparse(int key, const NDArray& src,
                                   const std::vector<std::pair<NDArray*, NDArray>>& dst,
-                                  const bool use_copy,
                                   const int priority) = 0;
 
   /**
@@ -209,7 +209,6 @@ class CommCPU : public Comm {
 
   void BroadcastRowSparse(int key, const NDArray& src,
                           const std::vector<std::pair<NDArray*, NDArray>>& dst,
-                          const bool use_copy,
                           const int priority) override {
     using namespace mshadow;
     CHECK_EQ(src.storage_type(), kRowSparseStorage)
@@ -219,107 +218,30 @@ class CommCPU : public Comm {
     for (size_t i = 0; i < dst.size(); ++i) {
       NDArray* out = dst[i].first;
       NDArray row_id = dst[i].second;
-      if (use_copy) {
-        CopyFromTo(src, out, priority);
-      } else {
-        CHECK_EQ(out->storage_type(), kRowSparseStorage)
-                 << "BroadcastRowSparse expects row_sparse dst NDArray";
-        CHECK_EQ(row_id.ctx().dev_mask(), Context::kCPU)
-                 << "BroadcastRowSparse with row_indices on gpu context not supported";
-        // retain according to unique indices
-        const bool use_sparse_retain = (src.shape()[0] != src.storage_shape()[0])
-          || (row_id.dtype() != out->aux_type(rowsparse::kIdx))
-          || (out->ctx().dev_mask() != Context::kGPU);
-        if (use_sparse_retain) {  // use sparse_retain op
-          const bool is_to_gpu = out->ctx().dev_mask() == Context::kGPU;
-          NDArray out_cpu = is_to_gpu? NDArray(kRowSparseStorage, src.shape(),
-              src.ctx(), true, src.dtype(), src.aux_types()) : *out;
-          Engine::Get()->PushAsync(
-            [=](RunContext rctx, Engine::CallbackOnComplete on_complete) {
-              const TBlob& indices = row_id.data();
-              NDArray temp = out_cpu;  // get rid of const qualifier
-              op::SparseRetainOpForwardRspImpl<cpu>(rctx.get_stream<cpu>(),
-                                                    src, indices, kWriteTo,
-                                                    &temp);
-              on_complete();
-            }, Context::CPU(), {src.var(), row_id.var()}, {out_cpu.var()},
-            FnProperty::kNormal, priority, PROFILER_MESSAGE("KVStoreSparseRetain"));
-          if (is_to_gpu) {
-            CopyFromTo(out_cpu, out, priority);
-          }
-        } else {  // direct copy rows
-          Engine::Get()->PushAsync(
-            [=](RunContext rctx, Engine::CallbackOnComplete on_complete) {
-              CopyRetainedRowsToGPU(rctx.get_stream<cpu>(), rctx.get_stream<gpu>(),
-                                    src, row_id, out);
-              // wait for GPU operations to complete
-              rctx.get_stream<gpu>()->Wait();
-              on_complete();
-            }, out->ctx(), {src.var(), row_id.var()}, {out->var()},
-            FnProperty::kCopyToGPU, priority, PROFILER_MESSAGE("KVStoreCopyRetainedRowsToGPU"));
-        }
-      }
+      CHECK_EQ(out->storage_type(), kRowSparseStorage)
+               << "BroadcastRowSparse expects row_sparse dst NDArray";
+      CHECK_EQ(row_id.ctx().dev_mask(), Context::kCPU)
+               << "BroadcastRowSparse with row_indices on gpu context not supported";
+      // retain according to unique indices
+      const bool is_to_gpu = out->ctx().dev_mask() == Context::kGPU;
+      NDArray retained_cpu = is_to_gpu ? NDArray(kRowSparseStorage, src.shape(),
+          src.ctx(), true, src.dtype(), src.aux_types()) : *out;
+      Engine::Get()->PushAsync(
+        [=](RunContext rctx, Engine::CallbackOnComplete on_complete) {
+          const TBlob& indices = row_id.data();
+          NDArray temp = retained_cpu;  // get rid of the const qualifier
+          op::SparseRetainOpForwardRspImpl<cpu>(rctx.get_stream<cpu>(),
+                                                src, indices, kWriteTo,
+                                                &temp);
+          on_complete();
+        }, Context::CPU(), {src.var(), row_id.var()}, {retained_cpu.var()},
+        FnProperty::kNormal, priority, PROFILER_MESSAGE("KVStoreSparseRetain"));
+      // if retained_cpu == out, CopyFromTo will ignore the copy operation
+      CopyFromTo(retained_cpu, out, priority);
     }
   }
 
  private:
-  /*!
-   * \brief When src is a rsp with full rows,
-   * simply copy retained rows directly from cpu to gpu
-   * without invoking sparse_retain op.
-   */
-  void CopyRetainedRowsToGPU(mshadow::Stream<cpu>* cpu_stream,
-                             mshadow::Stream<gpu>* gpu_stream,
-                             const NDArray& src,
-                             const NDArray& indices,
-                             NDArray* dst) {
-#if MXNET_USE_CUDA == 1
-    CHECK_EQ(src.storage_type(), kRowSparseStorage)
-      << "CopyRetainedRowsToGPU expects row-sparse src NDArray";
-    CHECK_EQ(src.ctx().dev_mask(), Context::kCPU)
-      << "CopyRetainedRowsToGPU with src on gpu context not supported";
-    CHECK_EQ(src.storage_shape()[0], src.shape()[0])
-      << "CopyRetainedRowsToGPU only supports src rsp with full rows";
-    CHECK_EQ(indices.storage_type(), kDefaultStorage);
-    CHECK_EQ(indices.ctx().dev_mask(), Context::kCPU);
-    CHECK_EQ(dst->storage_type(), kRowSparseStorage);
-    CHECK_EQ(dst->ctx().dev_mask(), Context::kGPU);
-    CHECK_EQ(indices.dtype(), dst->aux_type(rowsparse::kIdx))
-      << "CopyRetainedRowsToGPU only supports same data type for idx array and dst aux_data(0)";
-    if (!src.storage_initialized() || indices.data().Size() == 0U) {
-      op::FillZerosRspImpl(gpu_stream, *dst);
-      return;
-    }
-    using namespace mshadow;
-
-    const TBlob& src_data = src.data();
-    const TBlob& idx_data = indices.data();
-    const size_t row_length = src.shape().ProdShape(1, src.shape().ndim());
-    const size_t num_rows_retained = idx_data.Size();
-    dst->CheckAndAlloc({Shape1(num_rows_retained)});
-    TBlob dst_data = dst->data();
-    TBlob dst_idx_data = dst->aux_data(rowsparse::kIdx);
-    MSHADOW_TYPE_SWITCH(src.dtype(), DType, {
-      MSHADOW_IDX_TYPE_SWITCH(indices.dtype(), IType, {
-        // copy idx array
-        Tensor<gpu, 1, IType> dst_idx_tensor = dst_idx_data.FlatTo1D<gpu, IType>(gpu_stream);
-        const Tensor<cpu, 1, IType> idx_tensor = idx_data.FlatTo1D<cpu, IType>(cpu_stream);
-        Copy(dst_idx_tensor, idx_tensor, gpu_stream);
-        // copy src data
-        const Tensor<cpu, 2, DType> src_data_tensor = src_data.get_with_shape<cpu, 2, DType>(
-            Shape2(src_data.shape_[0], row_length), cpu_stream);
-        Tensor<gpu, 2, DType> dst_data_tensor = dst_data.get_with_shape<gpu, 2, DType>(
-            Shape2(dst_data.shape_[0], row_length), gpu_stream);
-        for (size_t i = 0; i < num_rows_retained; ++i) {
-          Copy(dst_data_tensor[i], src_data_tensor[idx_tensor[i]], gpu_stream);
-        }
-      })
-    })
-#else
-    LOG(FATAL) << "GPU not enabled";
-#endif
-  }
-
   // reduce sum into val[0]
   inline void ReduceSumCPU(const std::vector<NDArray> &in_data) {
     MSHADOW_TYPE_SWITCH(in_data[0].dtype(), DType, {
@@ -632,7 +554,6 @@ class CommDevice : public Comm {
 
   void BroadcastRowSparse(int key, const NDArray& src,
                           const std::vector<std::pair<NDArray*, NDArray>>& dst,
-                          const bool use_copy,
                           const int priority) override {
     CHECK_EQ(src.storage_type(), kRowSparseStorage)
       << "BroadcastRowSparse expects row-sparse src NDArray";
@@ -640,46 +561,39 @@ class CommDevice : public Comm {
     for (size_t i = 0; i < dst.size(); ++i) {
       NDArray* out = dst[i].first;
       NDArray row_id = dst[i].second;
-      if (use_copy) {
-        CopyFromTo(src, out, priority);
-      } else {
-        CHECK_EQ(out->storage_type(), kRowSparseStorage)
-                 << "BroadcastRowSparse expects row_sparse dst NDArray";
-
-        const bool is_diff_ctx = out->ctx() != src.ctx();
-        NDArray out_gpu = is_diff_ctx? NDArray(kRowSparseStorage, out->shape(),
-            src.ctx(), true, out->dtype(), out->aux_types()) : *out;
-
-        CHECK_EQ(row_id.ctx(), src.ctx())
-                << "row_id and src are expected to be on the same context";
-
-        Engine::Get()->PushAsync([=](RunContext rctx, Engine::CallbackOnComplete on_complete) {
-            NDArray temp = out_gpu;
-            const TBlob& indices = row_id.data();
-            switch (temp.ctx().dev_mask()) {
-              case cpu::kDevMask: {
-                mxnet::common::SparseRetainOpForwardRspWrapper<cpu>(rctx.get_stream<cpu>(),
-                    src, indices, kWriteTo, &temp);
-                break;
-              }
+      CHECK_EQ(out->storage_type(), kRowSparseStorage)
+               << "BroadcastRowSparse expects row_sparse dst NDArray";
+      CHECK_EQ(row_id.ctx(), src.ctx())
+              << "row_id and src are expected to be on the same context";
+      // retain according to indices
+      const bool is_diff_ctx = out->ctx() != src.ctx();
+      NDArray out_gpu = is_diff_ctx? NDArray(kRowSparseStorage, out->shape(),
+          src.ctx(), true, out->dtype(), out->aux_types()) : *out;
+      Engine::Get()->PushAsync([=](RunContext rctx, Engine::CallbackOnComplete on_complete) {
+          const TBlob& indices = row_id.data();
+          using namespace mxnet::common;
+          NDArray temp = out_gpu;
+          switch (temp.ctx().dev_mask()) {
+            case cpu::kDevMask: {
+              SparseRetainOpForwardRspWrapper<cpu>(rctx.get_stream<cpu>(),
+                  src, indices, kWriteTo, &temp);
+              break;
+            }
 #if MXNET_USE_CUDA
-              case gpu::kDevMask: {
-                mxnet::common::SparseRetainOpForwardRspWrapper<gpu>(rctx.get_stream<gpu>(),
-                    src, indices, kWriteTo, &temp);
-                // wait for GPU operations to complete
-                rctx.get_stream<gpu>()->Wait();
-                break;
-              }
-#endif
-              default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+            case gpu::kDevMask: {
+              SparseRetainOpForwardRspWrapper<gpu>(rctx.get_stream<gpu>(),
+                  src, indices, kWriteTo, &temp);
+              // wait for GPU operations to complete
+              rctx.get_stream<gpu>()->Wait();
+              break;
             }
-            on_complete();
-          }, out_gpu.ctx(), {src.var(), row_id.var()}, {out_gpu.var()},
-        FnProperty::kNormal, priority, PROFILER_MESSAGE("KVStoreSparseRetain"));
-        if (is_diff_ctx) {
-          CopyFromTo(out_gpu, out, priority);
-        }
-      }
+#endif
+            default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+          }
+          on_complete();
+        }, out_gpu.ctx(), {src.var(), row_id.var()}, {out_gpu.var()},
+      FnProperty::kNormal, priority, PROFILER_MESSAGE("KVStoreSparseRetain"));
+      CopyFromTo(out_gpu, out, priority);
     }
   }
 
diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h
index e01cc4206b3..7562d29740a 100644
--- a/src/kvstore/kvstore_dist.h
+++ b/src/kvstore/kvstore_dist.h
@@ -206,10 +206,11 @@ class KVStoreDist : public KVStoreLocal {
 
   void PullImpl(const std::vector<int>& keys,
                 const std::vector<NDArray*>& values,
-                int priority) override {
+                int priority, bool ignore_sparse) override {
+    CHECK_EQ(ignore_sparse, true) << "Distributed KVStore doesn't support Pull(RowSparseNDArray)";
     std::vector<int> uniq_keys;
     std::vector<std::vector<NDArray*> > grouped_vals;
-    GroupKVPairsPull(keys, values, &uniq_keys, &grouped_vals);
+    GroupKVPairsPull(keys, values, &uniq_keys, &grouped_vals, ignore_sparse);
 
     for (size_t i = 0; i < uniq_keys.size(); ++i) {
       int key = uniq_keys[i];
@@ -279,24 +280,20 @@ class KVStoreDist : public KVStoreLocal {
       }
       auto &target_val_rowids = grouped_val_rowids[i];
       const size_t num_vals = target_val_rowids.size();
-      size_t num_rows = 0;
-      // TODO(haibin) refactor this for loop
       for (size_t i = 0; i < num_vals; i++) {
         auto &row_id = target_val_rowids[i].second;
-        NDArray indices(row_id.shape(), pinned_ctx_, false, mshadow::kInt64);
-        CopyFromTo(row_id, &indices, 0);
-        Unique(&indices, priority);
-        target_val_rowids[i].second = indices;
-        num_rows += indices.shape().Size();
-      }
-      if (num_vals > 1) {
-        // TODO(haibin) aggregate over all unique indices
-        LOG(FATAL) << "RowSparsePull with multiple values is not implemented yet";
-      } else {
-        auto& indices = target_val_rowids[0].second;
-        PullRowSparse_(key, recv_buf, indices, priority);
-        comm_->BroadcastRowSparse(key, recv_buf, grouped_val_rowid, num_vals == 1, priority);
+        target_val_rowids[i].second = Unique(row_id, pinned_ctx_, 0);
       }
+      CHECK_EQ(num_vals, 1) << "RowSparsePull with multiple values is not supported yet";
+      NDArray& indices = target_val_rowids[0].second;
+      PullRowSparse_(key, recv_buf, indices, priority);
+      // The recv_buf contains values pulled from remote server with unique indices.
+      // Directly broadcast w/o rowids if num_vals == 1
+      auto get_val = [](const std::pair<NDArray*, NDArray>& p) { return p.first; };
+      std::vector<NDArray*> grouped_val(grouped_val_rowid.size());
+      std::transform(grouped_val_rowid.begin(), grouped_val_rowid.end(),
+                     grouped_val.begin(), get_val);
+      comm_->Broadcast(key, recv_buf, grouped_val, priority);
     }
   }
 
@@ -457,15 +454,17 @@ class KVStoreDist : public KVStoreLocal {
 
   // pull row sparse weight into `recv_buf` based on indices given by `indices`
   void PullRowSparse_(const int key, const NDArray& recv_buf,
-                      const NDArray& indices, int priority) {
+                      const NDArray& sized_indices, int priority) {
     using namespace rowsparse;
-    auto pull_from_servers = [this, key, recv_buf, indices]
+    auto pull_from_servers = [this, key, recv_buf, sized_indices]
       (RunContext rctx, Engine::CallbackOnComplete cb) {
       // allocate memory for the buffer
-      size_t num_rows = indices.shape().Size();
+      CHECK_EQ(sized_indices.dtype(), mshadow::kInt64);
+      const TBlob idx_data = sized_indices.data();
+      size_t num_rows = idx_data.shape_.Size();
       recv_buf.CheckAndAlloc({mshadow::Shape1(num_rows)});
       real_t* data = recv_buf.data().dptr<real_t>();
-      const auto offsets = indices.data().dptr<int64_t>();
+      const auto offsets = idx_data.dptr<int64_t>();
       const auto unit_len = recv_buf.shape().ProdShape(1, recv_buf.shape().ndim());
       const int64_t size = num_rows * unit_len;
       // convert to ps keys in row sparse format
@@ -480,7 +479,7 @@ class KVStoreDist : public KVStoreLocal {
       // because after pull is done, the callback function returns and locks are released.
       // at this point, later functions may access the indices variable while copy happens
       mshadow::Copy(recv_buf.aux_data(kIdx).FlatTo1D<cpu, int64_t>(),
-                    indices.data().FlatTo1D<cpu, int64_t>());
+                    idx_data.FlatTo1D<cpu, int64_t>());
       CHECK_NOTNULL(ps_worker_)->ZPull(pskv.keys, vals, &pskv.lens,
                                        static_cast<int>(DataHandleType::kRowSparsePushPull),
                                        [vals, cb]() { delete vals; cb(); });
@@ -488,7 +487,7 @@ class KVStoreDist : public KVStoreLocal {
     CHECK_NOTNULL(Engine::Get())->PushAsync(
       pull_from_servers,
       pinned_ctx_,
-      {indices.var()},
+      {sized_indices.var()},
       {recv_buf.var()},
       FnProperty::kNormal,
       priority,
diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h
index 7b3d6fa4cd7..1ade92d7ffa 100644
--- a/src/kvstore/kvstore_local.h
+++ b/src/kvstore/kvstore_local.h
@@ -35,6 +35,7 @@
 #include <algorithm>
 #include "./comm.h"
 #include "./kvstore_utils.h"
+#include "../ndarray/ndarray_function.h"
 
 namespace mxnet {
 namespace kvstore {
@@ -59,6 +60,7 @@ class KVStoreLocal : public KVStore {
     } else {
       comm_ = new CommCPU();
     }
+    comm_cpu_ = new CommCPU();
     pinned_ctx_ = comm_->pinned_ctx();
     gradient_compression_ = std::make_shared<GradientCompression>();
   }
@@ -100,9 +102,9 @@ class KVStoreLocal : public KVStore {
 
   void Pull(const std::vector<int>& keys,
             const std::vector<NDArray*>& values,
-            int priority) override {
+            int priority, bool ignore_sparse) override {
     SetKeyType(kIntKey);
-    PullImpl(keys, values, priority);
+    PullImpl(keys, values, priority, ignore_sparse);
   }
 
   void PullRowSparse(const std::vector<int>& keys,
@@ -123,11 +125,11 @@ class KVStoreLocal : public KVStore {
 
   void Pull(const std::vector<std::string>& str_keys,
             const std::vector<NDArray*>& values,
-            int priority) override {
+            int priority, bool ignore_sparse) override {
     SetKeyType(kStringKey);
     std::vector<int> keys(str_keys.size());
     LookupKeys(str_keys, &keys);
-    PullImpl(keys, values, priority);
+    PullImpl(keys, values, priority, ignore_sparse);
   }
 
   void PullRowSparse(const std::vector<std::string>& str_keys,
@@ -164,15 +166,25 @@ class KVStoreLocal : public KVStore {
     GroupKVPairsPush(keys, values, &uniq_keys, &grouped_vals);
     for (size_t i = 0; i < uniq_keys.size(); ++i) {
       int key = uniq_keys[i];
-      const NDArray& merged = comm_->Reduce(key, grouped_vals[i], priority);
+      NDArray merged = comm_->Reduce(key, grouped_vals[i], priority);
       NDArray& local = local_[key];
+      int64_t magic_dim = dmlc::GetEnv("MXNET_MAGIC_DIM", -1);
+      bool magic_weight = local.shape().Size() > magic_dim;
       if (updater_ != nullptr) {
         CHECK(!local.is_none()) << "key " << key << " has not been inited";
         // if merged is on gpu, we may need copy weight from cpu to gpu
-        if (merged.ctx().dev_mask() != cpu::kDevMask &&
-            local.ctx().dev_mask() == cpu::kDevMask) {
-          local = local.Copy(merged.ctx());
+        if (!magic_weight) {
+          if (merged.ctx().dev_mask() != cpu::kDevMask &&
+              local.ctx().dev_mask() == cpu::kDevMask) {
+            local = local.Copy(merged.ctx());
+          }
+        } else {
+          if (merged.ctx().dev_mask() != cpu::kDevMask &&
+              local.ctx().dev_mask() == cpu::kDevMask) {
+            merged = merged.Copy(local.ctx());
+          }
         }
+
         // call the updater with string keys
         // if string keys are used and str_updater_ is available
         // otherwise fallback to updater_ which uses int key interface
@@ -189,7 +201,11 @@ class KVStoreLocal : public KVStore {
         if (merged.storage_type() != local.storage_type()) {
           local = merged.Copy(local.ctx());
         } else {
-          local = merged;
+          if (!magic_weight) {
+            local = merged;
+          } else {
+            local = merged.Copy(local.ctx());
+          }
         }
       }
     }
@@ -197,10 +213,10 @@ class KVStoreLocal : public KVStore {
 
   virtual void PullImpl(const std::vector<int>& keys,
                         const std::vector<NDArray*>& values,
-                        int priority) {
+                        int priority, bool ignore_sparse) {
     std::vector<int> uniq_keys;
     std::vector<std::vector<NDArray*> > grouped_vals;
-    GroupKVPairsPull(keys, values, &uniq_keys, &grouped_vals);
+    GroupKVPairsPull(keys, values, &uniq_keys, &grouped_vals, ignore_sparse);
 
     for (size_t i = 0; i < uniq_keys.size(); ++i) {
       int key = uniq_keys[i];
@@ -226,12 +242,9 @@ class KVStoreLocal : public KVStore {
       const size_t num_vals = target_val_rowids.size();
       for (size_t j = 0; j < num_vals; j++) {
         auto &row_id = target_val_rowids[j].second;
-        NDArray indices(row_id.shape(), local.ctx(), false, mshadow::kInt64);
-        CopyFromTo(row_id, &indices, 0);
-        Unique(&indices, priority);
-        target_val_rowids[j].second = indices;
+        target_val_rowids[j].second = Unique(row_id, local.ctx(), 0);
       }
-      comm_->BroadcastRowSparse(key, local, grouped_val_rowids[i], false, priority);
+      comm_->BroadcastRowSparse(key, local, grouped_val_rowids[i], priority);
     }
   }
 
@@ -270,11 +283,11 @@ class KVStoreLocal : public KVStore {
   virtual void GroupKVPairsPull(const std::vector<int>& keys,
                                 const std::vector<NDArray*>& values,
                                 std::vector<int> *uniq_keys,
-                                std::vector<std::vector<NDArray*>> *grouped_vals) {
+                                std::vector<std::vector<NDArray*>> *grouped_vals,
+                                bool ignore_sparse) {
     // check if the storage type of a value is valid
-    auto validator = [this](const int key, const NDArray* nd) -> bool {
-      // valid
-      if (nd->storage_type() == kDefaultStorage) return true;
+    auto validator = [this, ignore_sparse](const int key, const NDArray* nd) -> bool {
+      if (nd->storage_type() == kDefaultStorage || !ignore_sparse) return true;
       // invalid, print warning messages once
       if (this->warnings_printed_.find(key) == this->warnings_printed_.end()) {
         LOG(INFO) << "Warning: non-default weights detected during kvstore pull. "
@@ -354,44 +367,65 @@ class KVStoreLocal : public KVStore {
     }
   }
 
-  /**
-   * \brief sort and get unique values.
+  /*
+   * \brief Compute the unique values in data and store them in ascending order
+   * in an int64_t row_sparse ndarray on ctx. The operation is async. The result
+   * row_sparse ndarray stores the unique values in out.data(). The aux_data()
+   * contains values that are not necessarily meaningful and should be ignored.
+   * \param data the input data
+   * \param ctx the target context
+   * \param priority the priority of the operation
    */
-  void Unique(NDArray *out, int priority) {
-    Resource rsc = ResourceManager::Get()->Request(out->ctx(),
+  NDArray Unique(const NDArray &data, Context ctx, int priority) {
+    // create kRowSparseStorage output ndarray
+    const size_t num_elements = data.shape().Size();
+    NDArray out(kRowSparseStorage, mshadow::Shape2(num_elements, 1),
+                ctx, true, mshadow::kInt64);
+    bool diff_ctx = data.ctx() != ctx;
+    NDArray data_in_ctx = diff_ctx ? NDArray(data.shape(), ctx, true, data.dtype()) : data;
+    // if data == data_in_ctx, CopyFromTo is smart enough to skip the copy
+    CopyFromTo(data, &data_in_ctx, priority);
+    Resource rsc = ResourceManager::Get()->Request(out.ctx(),
       ResourceRequest(ResourceRequest::kTempSpace));
+    // GPU requires temp resources
+    std::vector<Engine::VarHandle> mutate_vars{out.var()};
+    if (out.ctx().dev_mask() == gpu::kDevMask) mutate_vars.emplace_back(rsc.var);
     Engine::Get()->PushAsync(
-      [rsc, out](RunContext rctx, Engine::CallbackOnComplete on_complete) {
-        NDArray *output = out;
-        CHECK_EQ(out->shape().ndim(), 1) << "Unique expects 1D inputs";
-        nnvm::dim_t size = out->shape()[0];
-        switch (out->ctx().dev_mask()) {
+      [=](RunContext rctx, Engine::CallbackOnComplete on_complete) {
+        // copy data.data() to out.data()
+        out.CheckAndAlloc({mshadow::Shape1(num_elements)});
+        TBlob out_data = out.data();
+        switch (out.ctx().dev_mask()) {
           case cpu::kDevMask: {
             mshadow::Stream<cpu> *s = rctx.get_stream<cpu>();
-            UniqueImpl(rsc, s, output, size);
+            ndarray::Copy<cpu, cpu>(data_in_ctx.data(), &out_data,
+                                    ctx, ctx, rctx);
+            UniqueImpl(rsc, s, out);
             break;
           }
   #if MXNET_USE_CUDA
           case gpu::kDevMask: {
             mshadow::Stream<gpu> *s = rctx.get_stream<gpu>();
-            UniqueImpl(rsc, s, output, size);
+            ndarray::Copy<gpu, gpu>(data_in_ctx.data(), &out_data,
+                                    ctx, ctx, rctx);
+            UniqueImpl(rsc, s, out);
             // wait for GPU operations to complete
             s->Wait();
             break;
           }
   #endif
           default:
-            LOG(FATAL) << "GPU not enabled.";
+            LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
         }
         on_complete();
-      }, out->ctx(), {}, {out->var(), rsc.var},
+      }, out.ctx(), {data_in_ctx.var()}, mutate_vars,
       FnProperty::kNormal, priority, PROFILER_MESSAGE("KVStoreUnique"));
-    out->WaitToRead();
+    return out;
   }
 
-
   /// reducer and broadcaster
   Comm* comm_;
+  Comm* comm_cpu_;
   /// pinned context
   Context pinned_ctx_;
   /// \brief buffer for storing local values
diff --git a/src/kvstore/kvstore_utils.cc b/src/kvstore/kvstore_utils.cc
index 9e14d8ba75f..e187b0ce489 100644
--- a/src/kvstore/kvstore_utils.cc
+++ b/src/kvstore/kvstore_utils.cc
@@ -28,15 +28,18 @@
 namespace mxnet {
 namespace kvstore {
 
-
 template<>
 void UniqueImpl<cpu>(const Resource& rsc, mshadow::Stream<cpu> *s,
-                     NDArray *out, nnvm::dim_t size) {
-  MSHADOW_IDX_TYPE_SWITCH(out->data().type_flag_, IType, {
-    IType *dptr = out->data().dptr<IType>();
-    common::ParallelSort(dptr, dptr + size, omp_get_max_threads());
-    size_t num_unique_idx = std::unique(dptr, dptr + size) - dptr;
-    *out = out->Reshape(mshadow::Shape1(num_unique_idx));
+                      const NDArray& out) {
+  const size_t num_elements = out.shape().Size();
+  CHECK_EQ(out.storage_type(), kRowSparseStorage) << "row_sparse NDArray is expected";
+  MSHADOW_IDX_TYPE_SWITCH(out.dtype(), IType, {
+    IType *dptr = out.data().dptr<IType>();
+    common::ParallelSort(dptr, dptr + num_elements,
+                         engine::OpenMP::Get()->GetRecommendedOMPThreadCount());
+    const size_t num_selected_out = std::unique(dptr, dptr + num_elements) - dptr;
+    // set the shape of data/aux_data according to the number of unique values
+    out.set_aux_shape(rowsparse::kIdx, mshadow::Shape1(num_selected_out));
   });
 }
 
diff --git a/src/kvstore/kvstore_utils.cu b/src/kvstore/kvstore_utils.cu
index 00f316fe6d1..79f2ae6ad7f 100644
--- a/src/kvstore/kvstore_utils.cu
+++ b/src/kvstore/kvstore_utils.cu
@@ -40,10 +40,9 @@
 namespace mxnet {
 namespace kvstore {
 
-
 template<typename IType>
 size_t UniqueImplGPU(const Resource& rsc, mshadow::Stream<gpu> *s,
-                     IType *dptr, nnvm::dim_t size) {
+                     IType *dptr, const size_t size) {
 #ifndef SORT_WITH_THRUST
   size_t sort_temp_bytes = 0;
   cub::DeviceRadixSort::SortKeys(NULL, sort_temp_bytes,
@@ -58,45 +57,41 @@ size_t UniqueImplGPU(const Resource& rsc, mshadow::Stream<gpu> *s,
   thrust::sort(thrust::cuda::par.on(mshadow::Stream<gpu>::GetStream(s)),
     dptr, dptr + size, thrust::greater<IType>());
 #endif
+  // estimate unique temp space. The first sizeof(size_t) bytes of the workspace
+  // are reserved to store the number of unique values selected
   size_t unique_temp_bytes = 0;
-  mshadow::Tensor<gpu, 1, char> dummy_space = rsc
-    .get_space_typed<gpu, 1, char>(
-      mshadow::Shape1(sizeof(size_t)), s);
-  size_t *dummy_ptr = reinterpret_cast<size_t*>(dummy_space.dptr_);
+  size_t num_selected_bytes = sizeof(size_t);
+  size_t *null_ptr = nullptr;
   cub::DeviceSelect::Unique(NULL, unique_temp_bytes, dptr, dptr,
-    dummy_ptr, size, mshadow::Stream<gpu>::GetStream(s));
-
-  mshadow::Tensor<gpu, 1, char> unique_space = rsc
-    .get_space_typed<gpu, 1, char>(
-      mshadow::Shape1((unique_temp_bytes + sizeof(size_t) + 7) / 8 * 8), s);
-
-  void *unique_temp_storage = static_cast<void*>(
-    unique_space.dptr_);
-  size_t *d_num_selected_out = reinterpret_cast<size_t*>(
-    unique_space.dptr_ + (unique_temp_bytes + 7) / 8 * 8);
-
+    null_ptr, size, mshadow::Stream<gpu>::GetStream(s));
+  size_t total_temp_bytes = unique_temp_bytes + num_selected_bytes;
+  // request temp storage
+  mshadow::Tensor<gpu, 1, char> workspace = rsc
+    .get_space_typed<gpu, 1, char>(mshadow::Shape1(total_temp_bytes), s);
+  void *unique_temp_storage = static_cast<void*>(workspace.dptr_ + num_selected_bytes);
+  size_t* num_selected_ptr = reinterpret_cast<size_t*>(workspace.dptr_);
+  // execute unique kernel
   cub::DeviceSelect::Unique(unique_temp_storage, unique_temp_bytes, dptr, dptr,
-    d_num_selected_out, size, mshadow::Stream<gpu>::GetStream(s));
-
+    num_selected_ptr, size, mshadow::Stream<gpu>::GetStream(s));
+  // retrieve num selected unique values
   size_t num_selected_out = 0;
-  CUDA_CALL(cudaMemcpy(&num_selected_out, d_num_selected_out, sizeof(size_t),
+  CUDA_CALL(cudaMemcpy(&num_selected_out, num_selected_ptr, num_selected_bytes,
      cudaMemcpyDeviceToHost));
   return num_selected_out;
 }
 
-/*!
- * \brief sort and get unique values.
- */
 template<>
 void UniqueImpl<gpu>(const Resource& rsc, mshadow::Stream<gpu> *s,
-                     NDArray *out, nnvm::dim_t size) {
-  MSHADOW_IDX_TYPE_SWITCH(out->data().type_flag_, IType, {
-    IType *dptr = out->data().dptr<IType>();
-    size_t num_selected_out = UniqueImplGPU(rsc, s, dptr, size);
-    *out = out->Reshape(mshadow::Shape1(num_selected_out));
+                     const NDArray &out) {
+  const size_t num_elements = out.shape().Size();
+  CHECK_EQ(out.storage_type(), kRowSparseStorage) << "row_sparse NDArray is expected";
+  MSHADOW_IDX_TYPE_SWITCH(out.dtype(), IType, {
+    IType *dptr = out.data().dptr<IType>();
+    size_t num_selected_out = UniqueImplGPU(rsc, s, dptr, num_elements);
+    // set the shape of data/aux_data according to the number of unique values
+    out.set_aux_shape(rowsparse::kIdx, mshadow::Shape1(num_selected_out));
   });
 }
 
-
 }  // namespace kvstore
 }  // namespace mxnet
diff --git a/src/kvstore/kvstore_utils.h b/src/kvstore/kvstore_utils.h
index 8255619cdb2..ee173b4559f 100644
--- a/src/kvstore/kvstore_utils.h
+++ b/src/kvstore/kvstore_utils.h
@@ -35,12 +35,15 @@ namespace kvstore {
 
 
 /*!
- * \brief sort and get unique values.
+ * \brief compute unique and sorted values in a row_sparse ndarray.
+ * \param rsc Temp resource for computation
+ * \param s   Stream
+ * \param out Input and output ndarray. The ndarray stores the
+ *            unique elements in out.data().
  */
 template<typename xpu>
 void UniqueImpl(const Resource& rsc, mshadow::Stream<xpu> *s,
-                NDArray *out, nnvm::dim_t size);
-
+                 const NDArray& out);
 }  // namespace kvstore
 }  // namespace mxnet
 
diff --git a/src/operator/nn/fully_connected-inl.h b/src/operator/nn/fully_connected-inl.h
index e8e95643e64..e988c803c18 100644
--- a/src/operator/nn/fully_connected-inl.h
+++ b/src/operator/nn/fully_connected-inl.h
@@ -99,7 +99,8 @@ void FCForward(const OpContext &ctx, const FullyConnectedParam &param,
   //   out = dot(data, wmat.T());
   linalg_gemm(data, wmat, out, false, true, s);
   if (!param.no_bias) {
-    Tensor<xpu, 1, DType> bias = in_data[fullc::kBias].get<xpu, 1, DType>(s);
+    Tensor<xpu, 1, DType> bias = in_data[fullc::kBias].reshape(
+        Shape1(in_data[fullc::kBias].shape_[0])).get<xpu, 1, DType>(s);
+    //Tensor<xpu, 1, DType> bias = in_data[fullc::kBias].get<xpu, 1, DType>(s);
     out += repmat(bias, data.size(0));
   }
 }
@@ -147,7 +148,8 @@ void FCBackward(const OpContext &ctx, const FullyConnectedParam &param,
   linalg_gemm(grad, data, gwmat, true, false, s, req[fullc::kWeight]);
   // gradient of bias
   if (!param.no_bias) {
-    Tensor<xpu, 1, DType> gbias = in_grad[fullc::kBias].get<xpu, 1, DType>(s);
+    Tensor<xpu, 1, DType> gbias = in_grad[fullc::kBias].reshape(
+        Shape1(in_grad[fullc::kBias].shape_[0])).get<xpu, 1, DType>(s);
+    //Tensor<xpu, 1, DType> gbias = in_grad[fullc::kBias].get<xpu, 1, DType>(s);
     Assign(gbias, req[fullc::kBias], sum_rows(grad));
   }
   // gradient of data
diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc
index 4362408a23a..01cf5a74fe7 100644
--- a/src/operator/nn/fully_connected.cc
+++ b/src/operator/nn/fully_connected.cc
@@ -56,7 +56,11 @@ static bool FullyConnectedShape(const nnvm::NodeAttrs& attrs,
   }
   SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param.num_hidden, num_input));
   if (!param.no_bias) {
-    SHAPE_ASSIGN_CHECK(*in_shape, fullc::kBias, Shape1(param.num_hidden));
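+    // accept a bias of shape (num_hidden,) or (num_hidden, 1)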
+    if (!shape_assign(&(*in_shape)[fullc::kBias], TShape(Shape1(param.num_hidden))) &&
+        !shape_assign(&(*in_shape)[fullc::kBias], TShape(Shape2(param.num_hidden, 1)))) {
+      LOG(FATAL) << "SHAPE_ASSIGN_CHECK failed";
+    }
+    //SHAPE_ASSIGN_CHECK(*in_shape, fullc::kBias, Shape1(param.num_hidden));
   }
 
   if (!param.flatten) {
@@ -73,12 +77,12 @@ static bool FullyConnectedShape(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
-#if MXNET_USE_MKLDNN == 1
 void FullyConnectedComputeExCPU(const nnvm::NodeAttrs& attrs,
                                 const OpContext &ctx,
                                 const std::vector<NDArray> &inputs,
                                 const std::vector<OpReqType> &req,
                                 const std::vector<NDArray> &outputs) {
+#if MXNET_USE_MKLDNN == 1
   if (SupportMKLDNN(inputs[0])) {
     MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
     MKLDNNFCForward(attrs, ctx, inputs, req, outputs);
@@ -86,9 +90,18 @@ void FullyConnectedComputeExCPU(const nnvm::NodeAttrs& attrs,
                        outputs);
     return;
   }
-  FallBackCompute(FullyConnectedCompute<cpu>, attrs, ctx, inputs, req, outputs);
+#endif
+  //FallBackCompute(FullyConnectedCompute<cpu>, attrs, ctx, inputs, req, outputs);
+  std::vector<TBlob> in_blobs(inputs.size());
+  for (size_t i = 0; i < in_blobs.size(); i++)
+    in_blobs[i] = inputs[i].data();
+  std::vector<TBlob> out_blobs(outputs.size());
+  for (size_t i = 0; i < out_blobs.size(); i++)
+    out_blobs[i] = outputs[i].data();
+  FullyConnectedCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
 }
 
+#if MXNET_USE_MKLDNN == 1
 void FullyConnectedGradComputeExCPU(const nnvm::NodeAttrs& attrs,
                                     const OpContext &ctx,
                                     const std::vector<NDArray> &inputs,
@@ -139,7 +152,7 @@ inline static bool FCStorageType(const nnvm::NodeAttrs& attrs,
     wanted_mode = DispatchMode::kFComputeEx;
   else
 #endif
-    wanted_mode = DispatchMode::kFCompute;
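+    // WIP: force the FComputeEx path for FullyConnected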
+    wanted_mode = DispatchMode::kFComputeEx;
   return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
                              dispatch_mode, wanted_mode);
 }
@@ -214,9 +227,7 @@ If ``no_bias`` is set to be true, then the ``bias`` term is ignored.
 .set_attr<nnvm::FInferShape>("FInferShape", FullyConnectedShape)
 .set_attr<nnvm::FInferType>("FInferType", FullyConnectedType)
 .set_attr<FCompute>("FCompute<cpu>", FullyConnectedCompute<cpu>)
-#if MXNET_USE_MKLDNN == 1
 .set_attr<FComputeEx>("FComputeEx<cpu>", FullyConnectedComputeExCPU)
-#endif
 .set_attr<nnvm::FGradient>("FGradient", FullyConnectedGrad{"_backward_FullyConnected"})
 .add_argument("data", "NDArray-or-Symbol", "Input data.")
 .add_argument("weight", "NDArray-or-Symbol", "Weight matrix.")
diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h
index 89d27e17ec6..225b22d43d8 100644
--- a/src/operator/optimizer_op-inl.h
+++ b/src/operator/optimizer_op-inl.h
@@ -1345,9 +1345,7 @@ inline void FtrlUpdateEx(const nnvm::NodeAttrs& attrs,
   }
 }
 
-
 // Implementation for signSGD and Signum
-
 struct SignSGDParam : public dmlc::Parameter<SignSGDParam> {
   float lr;
   float wd;
@@ -1372,7 +1370,6 @@ struct SignSGDParam : public dmlc::Parameter<SignSGDParam> {
   }
 };
 
-
 struct SignSGDKernel {
   template<typename DType>
   MSHADOW_XINLINE static void Map(int i, DType* out_data, const DType* weight_data,
@@ -1483,7 +1480,162 @@ inline void SignumUpdate(const nnvm::NodeAttrs& attrs,
     });
 }
 
+struct AdagradParam : public dmlc::Parameter<AdagradParam> {
+  float lr;
+  float eps;
+  float rescale_grad;
+  float clip_gradient;
+  float wd;
+  DMLC_DECLARE_PARAMETER(AdagradParam) {
+    DMLC_DECLARE_FIELD(lr)
+    .describe("Learning rate");
+    DMLC_DECLARE_FIELD(eps)
+    .set_default(1.0e-7)
+    .describe("eps");
+    DMLC_DECLARE_FIELD(wd)
+    .set_default(0.0f)
+    .describe("wd");
+    DMLC_DECLARE_FIELD(rescale_grad)
+    .set_default(1.0f)
+    .describe("Rescale gradient to grad = rescale_grad*grad.");
+    DMLC_DECLARE_FIELD(clip_gradient)
+    .set_default(-1.0f)
+    .describe("Clip gradient to the range of [-clip_gradient, clip_gradient] "
+              "If clip_gradient <= 0, gradient clipping is turned off. "
+              "grad = max(min(grad, clip_gradient), -clip_gradient).");
+  }
+};
+
+inline bool AdagradStorageType(const nnvm::NodeAttrs& attrs,
+                               const int dev_mask,
+                               DispatchMode* dispatch_mode,
+                               std::vector<int>* in_attrs,
+                               std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 3U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  const AdagradParam& param = nnvm::get<AdagradParam>(attrs.parsed);
+  bool dispatched = false;
+  if (!dispatched && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) {
+    // dns, dns, dns -> dns
+    // dispatched = storage_type_assign(out_attrs, kDefaultStorage,
+    //                                 dispatch_mode, DispatchMode::kFCompute);
+    LOG(FATAL) << "NOT IMPLEMENTED YET";
+  }
+  if (!dispatched && common::ContainsOnlyStorage(*in_attrs, kRowSparseStorage) &&
+      param.wd == 0.0f) {
+    // rsp, rsp, rsp -> rsp with wd = 0.0
+    dispatched = storage_type_assign(out_attrs, kRowSparseStorage,
+                                     dispatch_mode, DispatchMode::kFComputeEx);
+  }
+  // TODO: the storage fallback here does not need to be logged
+  if (!dispatched) {
+    dispatched = dispatch_fallback(out_attrs, dispatch_mode);
+  }
+  return dispatched;
+}
+
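+/*!
+ * \brief Kernel for the AdaGrad update with dense weight/state and row_sparse grad.
+ * For each row j present in grad, per the loop below:
+ *   rescaled = clip(grad[j] * rescale_grad, clip_gradient)
+ *   state[j] += rescaled * rescaled
+ *   out[j] = weight[j] - lr * rescaled / sqrt(state[j] + eps)
+ */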
+struct AdagradDnsRspDnsKernel {
+  template<typename DType, typename IType>
+  MSHADOW_XINLINE static void Map(int i, index_t row_length, DType* out_data,
+    DType* state_data, const DType* weight_data, const IType* grad_idx,
+    const DType* grad_data, const DType clip_gradient, const DType epsilon,
+    const DType lr, const DType rescale_grad) {
+    using nnvm::dim_t;
+    dim_t data_i = grad_idx[i] * row_length;
+    dim_t grad_i = i * row_length;
+    for (dim_t j = 0; j < row_length; j++) {
+      const dim_t data_j = data_i + j;
+      const dim_t grad_j = grad_i + j;
+      DType grad_rescaled = grad_data[grad_j] * rescale_grad;
+      if (clip_gradient >= 0.0f) {
+        grad_rescaled = mshadow_op::clip::Map(grad_rescaled, clip_gradient);
+      }
+      const DType grad_squared = grad_rescaled * grad_rescaled;
+      state_data[data_j] += grad_squared;
+      // TODO: replace math::sqrt?
+      const DType div = grad_rescaled / math::sqrt(state_data[data_j] + epsilon);
+      // No need to use KERNEL_ASSIGN, as we already checked req is kWriteInplace
+      out_data[data_j] = weight_data[data_j] + div * -lr;
+    }
+  }
+};
+
+template<typename xpu>
+void AdagradUpdateDnsRspDnsImpl(const AdagradParam& param,
+                                const OpContext& ctx,
+                                const TBlob& weight,
+                                const NDArray& grad,
+                                const TBlob& state,
+                                const OpReqType& req,
+                                TBlob *out) {
+  using namespace mxnet_op;
+  using namespace rowsparse;
+  using namespace mshadow;
+  Stream<xpu>* s = ctx.get_stream<xpu>();
+  CHECK_EQ(param.wd, 0.0f)
+    << "sparse adagrad_update does not support wd.";
+  if (req == kNullOp || !grad.storage_initialized()) return;
+  CHECK_EQ(req, kWriteInplace) << "kWriteInplace is expected for sparse adagrad_update";
+  CHECK_GT(weight.shape_.Size(), 0);
+  CHECK_GT(state.shape_.Size(), 0);
+  MSHADOW_REAL_TYPE_SWITCH(weight.type_flag_, DType, {
+    MSHADOW_IDX_TYPE_SWITCH(grad.aux_type(kIdx), IType, {
+      const DType* weight_data = weight.dptr<DType>();
+      const IType* grad_idx = grad.aux_data(kIdx).dptr<IType>();
+      const DType* grad_val = grad.data().dptr<DType>();
+      DType* state_data = state.dptr<DType>();
+      DType* out_data = out->dptr<DType>();
+      const nnvm::dim_t nnr = grad.storage_shape()[0];
+      const auto row_length = weight.shape_.ProdShape(1, weight.ndim());
+      Kernel<AdagradDnsRspDnsKernel, xpu>::Launch(s, nnr, row_length,
+        out_data, state_data, weight_data, grad_idx, grad_val,
+        static_cast<DType>(param.clip_gradient), static_cast<DType>(param.eps),
+        static_cast<DType>(param.lr), static_cast<DType>(param.rescale_grad));
+    });
+  });
+}
+
+template<typename xpu>
+inline void AdagradUpdateRspRspRspImpl(const AdagradParam& param,
+                                       const OpContext& ctx,
+                                       const NDArray& weight,
+                                       const NDArray& grad,
+                                       const NDArray& hst,
+                                       const OpReqType& req,
+                                       NDArray *out) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  using namespace rowsparse;
+  CHECK_RSP_ALL_ROWS_NON_ZERO(weight, "AdagradUpdate", "weights");
+  Stream<xpu>* s = ctx.get_stream<xpu>();
+  // fill history with zero values
+  if (!hst.storage_initialized()) {
+    NDArray hst_zeros = hst;
+    FillDnsZerosRspImpl(s, &hst_zeros);
+  }
+  TBlob out_blob = out->data();
+  // reuse dns rsp implementation when storage_shape == shape
+  AdagradUpdateDnsRspDnsImpl<xpu>(param, ctx, weight.data(), grad,
+                                 hst.data(), req, &out_blob);
+}
 
+template<typename xpu>
+inline void AdagradUpdateEx(const nnvm::NodeAttrs& attrs,
+                            const OpContext &ctx,
+                            const std::vector<NDArray> &inputs,
+                            const std::vector<OpReqType> &req,
+                            const std::vector<NDArray> &outputs) {
+  using namespace mxnet_op;
+  const AdagradParam& param = nnvm::get<AdagradParam>(attrs.parsed);
+  if (common::ContainsOnlyStorage(inputs, kRowSparseStorage) &&
+      common::ContainsOnlyStorage(outputs, kRowSparseStorage)) {
+    NDArray out = outputs[0];
+    AdagradUpdateRspRspRspImpl<xpu>(param, ctx, inputs[0], inputs[1], inputs[2], req[0], &out);
+  } else {
+    LogUnimplementedOp(attrs, ctx, inputs, req, outputs);
+  }
+}
 
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc
index 136769a1bf0..e98c6cd3a74 100644
--- a/src/operator/optimizer_op.cc
+++ b/src/operator/optimizer_op.cc
@@ -38,6 +38,7 @@ DMLC_REGISTER_PARAMETER(RMSPropAlexParam);
 DMLC_REGISTER_PARAMETER(FtrlParam);
 DMLC_REGISTER_PARAMETER(SignSGDParam);
 DMLC_REGISTER_PARAMETER(SignumParam);
+DMLC_REGISTER_PARAMETER(AdagradParam);
 
 NNVM_REGISTER_OP(signsgd_update)
 .describe(R"code(Update function for SignSGD optimizer.
@@ -536,5 +537,30 @@ only the row slices whose indices appear in grad.indices are updated (for w, z a
 .add_argument("n", "NDArray-or-Symbol", "Square of grad")
 .add_arguments(FtrlParam::__FIELDS__());
 
+NNVM_REGISTER_OP(adagrad_update)
+MXNET_ADD_SPARSE_OP_ALIAS(adagrad_update)
+.describe(R"code(Adagrad update function.
+)code" ADD_FILELINE)
+.set_num_inputs(3)
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<AdagradParam>)
+.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<3, 1>)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<3, 1>)
+.set_attr<FInferStorageType>("FInferStorageType", AdagradStorageType)
+.set_attr<nnvm::FMutateInputs>("FMutateInputs",
+  [](const nnvm::NodeAttrs& attrs) {
+    return std::vector<uint32_t>{2};
+  })
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.set_attr<FComputeEx>("FComputeEx<cpu>", AdagradUpdateEx<cpu>)
+.add_argument("weight", "NDArray-or-Symbol", "Weight")
+.add_argument("grad", "NDArray-or-Symbol", "Gradient")
+.add_argument("history", "NDArray-or-Symbol", "History")
+.add_arguments(AdagradParam::__FIELDS__());
+
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu
index 1bd6117432b..20fc4239519 100644
--- a/src/operator/optimizer_op.cu
+++ b/src/operator/optimizer_op.cu
@@ -173,6 +173,9 @@ NNVM_REGISTER_OP(sgd_update)
 .set_attr<FCompute>("FCompute<gpu>", SGDUpdate<gpu>)
 .set_attr<FComputeEx>("FComputeEx<gpu>", SGDUpdateEx<gpu>);
 
+NNVM_REGISTER_OP(adagrad_update)
+.set_attr<FComputeEx>("FComputeEx<gpu>", AdagradUpdateEx<gpu>);
+
 NNVM_REGISTER_OP(sgd_mom_update)
 .set_attr<FCompute>("FCompute<gpu>", SGDMomUpdate<gpu>)
 .set_attr<FComputeEx>("FComputeEx<gpu>", SGDMomUpdateEx<gpu>);
diff --git a/src/operator/tensor/control_flow_op.h b/src/operator/tensor/control_flow_op.h
index 503bc7c4abb..92227f801f3 100644
--- a/src/operator/tensor/control_flow_op.h
+++ b/src/operator/tensor/control_flow_op.h
@@ -80,7 +80,6 @@ struct where_csr {
   }
 };
 
-
 /*! \brief Choose elements from x or y depending on condition
  * The condition is a vector whose size is the same as the
  * x's first dim size.
@@ -269,7 +268,6 @@ inline bool WhereOpBackwardStorageType(const nnvm::NodeAttrs& attrs,
 }
 
 
-
 template<typename xpu>
 void WhereOpForward(const nnvm::NodeAttrs& attrs,
                     const OpContext& ctx,
diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu
index 762d8fd64c2..5c27bceb692 100644
--- a/src/operator/tensor/indexing_op.cu
+++ b/src/operator/tensor/indexing_op.cu
@@ -43,20 +43,34 @@ struct is_valid_check {
 
 
 struct AddTakeGradRspGPUKernel {
-  template<typename DType, typename IType>
-  __device__ __forceinline__ static void Map(int tid,
+  template<typename DType>
+  __device__ __forceinline__ static void Map(int thread_id,
                                              DType* out,
                                              const nnvm::dim_t* prefix_sum,
-                                             const IType* data,
+                                             const nnvm::dim_t* sorted_data,
+                                             const nnvm::dim_t data_size,
+                                             const nnvm::dim_t* original_idx,
                                              const DType* ograd,
-                                             const nnvm::dim_t row_length) {
+                                             const nnvm::dim_t row_length,
+                                             const nnvm::dim_t num_threads_per_row) {
     using nnvm::dim_t;
-    const dim_t data_i = tid / row_length;
-    const dim_t grad_i = tid % row_length;
-    const dim_t irow = static_cast<dim_t>(data[data_i]);
-    const dim_t rsp_row = prefix_sum[irow] - 1;
-    const DType val = ograd[data_i * row_length + grad_i];
-    atomicAdd(static_cast<DType *>(&(out[rsp_row*row_length+grad_i])), val);
+    auto tid = thread_id / num_threads_per_row;
+    auto feature_start = thread_id % num_threads_per_row * 4;
+    auto feature_end = feature_start + 4;
+    if (feature_end > row_length) feature_end = row_length;
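+    // sorted_data groups identical row ids together; only the threads mapped to the
+    // first occurrence of a row id accumulate the whole run of duplicates, so no
+    // atomicAdd is needed. Each thread covers a 4-element slice of the output row.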
+    if (tid == 0 || sorted_data[tid - 1] != sorted_data[tid]) {
+      do {
+        dim_t data = sorted_data[tid];
+        dim_t idx = original_idx[tid];
+        dim_t row_id = prefix_sum[data] - 1;
+        dim_t ograd_offset = idx * row_length;
+        dim_t out_offset = row_id * row_length;
+        for (int i = feature_start; i < feature_end; i++) {
+          out[out_offset + i] += ograd[ograd_offset + i];
+        }
+        tid++;
+      } while (tid < data_size && sorted_data[tid - 1] == sorted_data[tid]);
+    }
   }
 };
 
@@ -125,55 +139,89 @@ inline void SparseEmbeddingOpBackwardRspImpl<gpu>(const OpContext& ctx,
   dim_t row_length = output.shape()[1];
   dim_t data_size = static_cast<dim_t>(data.shape_.Size());
   dim_t num_threads;
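+  // an empty gradient produces an all-zero output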
+  if (data_size == 0) {
+    FillZerosRspImpl(s, output);
+    return;
+  }
 
   MSHADOW_TYPE_SWITCH(data.type_flag_, IType, {
-    MSHADOW_SGL_DBL_TYPE_SWITCH(ograd.type_flag_, DType, {
+    MSHADOW_TYPE_SWITCH(ograd.type_flag_, DType, {
       MSHADOW_IDX_TYPE_SWITCH(output.aux_type(kIdx), RType, {
         dim_t* prefix_sum = NULL;
-        void* d_temp_storage = NULL;
-        size_t temp_storage_bytes = 0;
-        cub::DeviceScan::InclusiveSum(d_temp_storage,
-                                      temp_storage_bytes,
+        void* temp_storage = NULL;
+        dim_t* sorted_data = NULL;
+        dim_t* original_idx = NULL;
+        // calculate resource bytes
+        size_t row_flg_storage_bytes = num_rows * sizeof(dim_t);
+        size_t sorted_data_storage_bytes = data_size * sizeof(dim_t);
+        size_t original_idx_storage_bytes = data_size * sizeof(dim_t);
+        size_t sum_workspace_bytes = 0;
+        size_t sort_workspace_size = SortByKeyWorkspaceSize<dim_t, dim_t, gpu>(data_size);
+        cub::DeviceScan::InclusiveSum(temp_storage,
+                                      sum_workspace_bytes,
                                       prefix_sum,
                                       prefix_sum,
                                       num_rows,
                                       Stream<gpu>::GetStream(s));
+        // temp_workspace is shared by inclusive sum and sort
+        size_t temp_workspace_bytes = std::max(sum_workspace_bytes, sort_workspace_size);
+        size_t total_storage_bytes = row_flg_storage_bytes + sorted_data_storage_bytes +
+                                     original_idx_storage_bytes + temp_workspace_bytes;
+
+        // request resource and split it. layout =
+        // row_flg/prefixsum, sorted_data, original_idx, temp_storage
         Tensor<gpu, 1, char> workspace = ctx.requested[0]
-            .get_space_typed<gpu, 1, char>(Shape1(num_rows * sizeof(dim_t) +
-                                           temp_storage_bytes), s);
+            .get_space_typed<gpu, 1, char>(Shape1(total_storage_bytes), s);
         prefix_sum = reinterpret_cast<dim_t*>(workspace.dptr_);
-        d_temp_storage = workspace.dptr_ + num_rows*sizeof(dim_t);
+        sorted_data = reinterpret_cast<dim_t*>(workspace.dptr_ + row_flg_storage_bytes);
+        original_idx = reinterpret_cast<dim_t*>(workspace.dptr_ + row_flg_storage_bytes +
+                                                sorted_data_storage_bytes);
+        temp_storage = workspace.dptr_ + total_storage_bytes - temp_workspace_bytes;
+        // compute row flags and prefix sum
         num_threads = num_rows;
         Fill<false>(s, TBlob(prefix_sum, Shape1(num_threads), gpu::kDevMask), kWriteTo, 0);
         Kernel<MarkRowFlgKernel, gpu>::Launch(s, data_size, prefix_sum, data.dptr<IType>());
-
-        cub::DeviceScan::InclusiveSum(d_temp_storage,
-                                      temp_storage_bytes,
+        cub::DeviceScan::InclusiveSum(temp_storage,
+                                      temp_workspace_bytes,
                                       prefix_sum,
                                       prefix_sum,
                                       num_rows,
                                       mshadow::Stream<gpu>::GetStream(s));
+        // retrieve nnr and allocate output
         dim_t nnr = 0;
         CUDA_CALL(cudaMemcpy(&nnr, &prefix_sum[num_rows-1], sizeof(dim_t),
             cudaMemcpyDeviceToHost));
-
-        if (nnr == 0) {
-          FillZerosRspImpl(s, output);
-          return;
-        }
         output.CheckAndAlloc({Shape1(nnr)});
-        RType* grad_row_idx = output.aux_data(kIdx).dptr<RType>();
         // fill row_idx array of output matrix, using the row_flg values
+        RType* grad_row_idx = output.aux_data(kIdx).dptr<RType>();
         Kernel<FillRspRowIdxKernel, gpu>::Launch(s, num_rows,
             grad_row_idx, prefix_sum, num_rows);
-        // prefill with zeros
+
+        // make a copy of the data, to be sorted
+        TBlob sorted_data_blob(sorted_data, Shape1(data_size), gpu::kDevMask);
+        auto sorted_data_tensor = sorted_data_blob.FlatTo1D<gpu, dim_t>(s);
+        mxnet_op::copy(s, sorted_data_blob, data);
+
+        // generate original idx
+        Tensor<gpu, 1, dim_t> original_idx_tensor(original_idx, Shape1(data_size), s);
+        Kernel<range_fwd, gpu>::Launch(s, data_size, 1, static_cast<dim_t>(0), static_cast<dim_t>(1),
+                                       kWriteTo, original_idx);
+        // sort data with its original idx
+        int num_bits = ilog2(num_rows - 1);
+        char* temp_storage_ptr = reinterpret_cast<char*>(temp_storage);
+        Tensor<gpu, 1, char> temp_storage_tensor(temp_storage_ptr,
+                                                 Shape1(sort_workspace_size), s);
+        SortByKey(sorted_data_tensor, original_idx_tensor, true,
+                  &temp_storage_tensor, 0, num_bits);
+
+        // accumulate gradients
         DType* grad_data = output.data().dptr<DType>();
         Fill<false>(s, TBlob(grad_data, Shape1(nnr * row_length), gpu::kDevMask),
             kWriteTo, 0);
-        // add the final gradients
-        num_threads = row_length * data_size;
-        Kernel<AddTakeGradRspGPUKernel, gpu>::Launch(s, num_threads, grad_data, prefix_sum,
-            data.dptr<IType>(), ograd.dptr<DType>(), row_length);
+        //num_threads = data_size;
+        const nnvm::dim_t num_threads_per_row = (row_length + 3) / 4;
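+        // each thread handles up to 4 consecutive elements of a row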
+        Kernel<AddTakeGradRspGPUKernel, gpu>::Launch(s, data_size * num_threads_per_row,
+               grad_data, prefix_sum, sorted_data, data_size, original_idx,
+               ograd.dptr<DType>(), row_length, num_threads_per_row);
       });
     });
   });
diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py
index df85fe58605..3a3c916d782 100644
--- a/tests/nightly/dist_sync_kvstore.py
+++ b/tests/nightly/dist_sync_kvstore.py
@@ -99,7 +99,7 @@ def check_row_sparse_keys(kv, my_rank, nworker):
             # select a random subset of rows this worker is interested in
             num_rows = shape[0]
             row_ids_np = np.random.randint(num_rows, size=num_rows)
-            row_ids = mx.nd.array(row_ids_np, dtype='int64')
+            row_ids = mx.nd.array(row_ids_np).reshape((num_rows//2, 2))
             # perform pull
             val = mx.nd.zeros(shape, stype='row_sparse')
             kv.row_sparse_pull('9', out=val, row_ids=row_ids)
@@ -170,7 +170,7 @@ def check_big_row_sparse_keys(kv, my_rank, nworker):
             rnd.seed(my_rank)
             num_rows = big_shape[0]
             row_ids_np = np.random.randint(num_rows, size=num_rows)
-            row_ids = mx.nd.array(row_ids_np)
+            row_ids = mx.nd.array(row_ids_np).reshape((num_rows//2, 2))
             # perform pull
             val = mx.nd.zeros(big_shape, stype='row_sparse')
             kv.row_sparse_pull('100', out=val, row_ids=row_ids)
diff --git a/tests/python/gpu/test_kvstore_gpu.py b/tests/python/gpu/test_kvstore_gpu.py
index 5fd3097f29e..48105a7d283 100644
--- a/tests/python/gpu/test_kvstore_gpu.py
+++ b/tests/python/gpu/test_kvstore_gpu.py
@@ -57,14 +57,14 @@ def check_rsp_pull(kv, count, ctxs, is_same_rowid=False, use_slice=False):
             vals = [mx.nd.sparse.zeros(shape=shape, ctx=ctxs[i], stype='row_sparse') for i in range(count)]
             if is_same_rowid:
                 row_id = np.random.randint(num_rows, size=num_rows)
-                row_ids = [mx.nd.array(row_id, dtype='int64')] * count
+                row_ids = [mx.nd.array(row_id)] * count
             elif use_slice:
-                total_row_ids = mx.nd.array(np.random.randint(num_rows, size=count*num_rows), dtype='int64')
+                total_row_ids = mx.nd.array(np.random.randint(num_rows, size=count*num_rows))
                 row_ids = [total_row_ids[i*num_rows : (i+1)*num_rows] for i in range(count)]
             else:
                 for i in range(count):
                     row_id = np.random.randint(num_rows, size=num_rows)
-                    row_ids.append(mx.nd.array(row_id, dtype='int64'))
+                    row_ids.append(mx.nd.array(row_id))
             row_ids_to_pull = row_ids[0] if (len(row_ids) == 1 or is_same_rowid) else row_ids
             vals_to_pull = vals[0] if len(vals) == 1 else vals
 
diff --git a/tests/python/unittest/test_gluon_contrib.py b/tests/python/unittest/test_gluon_contrib.py
index 03e4261ad16..5c8e3b5b183 100644
--- a/tests/python/unittest/test_gluon_contrib.py
+++ b/tests/python/unittest/test_gluon_contrib.py
@@ -108,6 +108,22 @@ def test_conv_fill_shape():
     check_rnn_forward(cell, mx.nd.ones((8, 3, 5, 7)))
     assert cell.i2h_weight.shape[1] == 5, cell.i2h_weight.shape[1]
 
+@with_seed()
+def test_lstmp():
+    nhid = 100
+    nproj = 64
+    cell = contrib.rnn.LSTMPCell(nhid, nproj, prefix='rnn_')
+    inputs = [mx.sym.Variable('rnn_t%d_data'%i) for i in range(3)]
+    outputs, _ = cell.unroll(3, inputs)
+    outputs = mx.sym.Group(outputs)
+    expected_params = ['rnn_h2h_bias', 'rnn_h2h_weight', 'rnn_i2h_bias', 'rnn_i2h_weight', 'rnn_proj_weight']
+    expected_outputs = ['rnn_t0_out_output', 'rnn_t1_out_output', 'rnn_t2_out_output']
+    assert sorted(cell.collect_params().keys()) == expected_params
+    assert outputs.list_outputs() == expected_outputs, outputs.list_outputs()
+
+    args, outs, auxs = outputs.infer_shape(rnn_t0_data=(10,50), rnn_t1_data=(10,50), rnn_t2_data=(10,50))
+    assert outs == [(10, nproj), (10, nproj), (10, nproj)]
+
 
 @with_seed()
 def test_vardrop():
diff --git a/tests/python/unittest/test_kvstore.py b/tests/python/unittest/test_kvstore.py
index 6bab06c5b70..c6ebd9df389 100644
--- a/tests/python/unittest/test_kvstore.py
+++ b/tests/python/unittest/test_kvstore.py
@@ -57,11 +57,13 @@ def test_single_kv_pair():
     def check_single_kv_pair(kv, key):
         kv.push(key, mx.nd.ones(shape))
         val = mx.nd.empty(shape)
-        kv.pull(key, out=val)
+        kv.pull(key, out=val, ignore_sparse=False)
         check_diff_to_scalar(val, 1)
 
     check_single_kv_pair(init_kv(), 3)
     check_single_kv_pair(init_kv_with_str(), 'a')
+    check_single_kv_pair(init_kv('row_sparse'), 3)
+    check_single_kv_pair(init_kv_with_str('row_sparse'), 'a')
 
 @with_seed()
 def test_row_sparse_pull():
@@ -76,7 +78,7 @@ def check_row_sparse_pull(kv, count):
         for i in range(count):
             vals.append(mx.nd.zeros(shape).tostype('row_sparse'))
             row_id = np.random.randint(num_rows, size=num_rows)
-            row_ids.append(mx.nd.array(row_id))
+            row_ids.append(mx.nd.array(row_id).reshape((2,2)))
         row_ids_to_pull = row_ids[0] if len(row_ids) == 1 else row_ids
         vals_to_pull = vals[0] if len(vals) == 1 else vals
 
@@ -110,12 +112,14 @@ def test_list_kv_pair():
     def check_list_kv_pair(kv, key):
         kv.push(key, [mx.nd.ones(shape)*4] * len(key))
         val = [mx.nd.empty(shape)] * len(key)
-        kv.pull(key, out=val)
+        kv.pull(key, out=val, ignore_sparse=False)
         for v in val:
             check_diff_to_scalar(v, 4)
 
     check_list_kv_pair(init_kv(), keys)
     check_list_kv_pair(init_kv_with_str(), str_keys)
+    check_list_kv_pair(init_kv('row_sparse'), keys)
+    check_list_kv_pair(init_kv_with_str('row_sparse'), str_keys)
 
 
 @with_seed()
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index 159c9bac89d..367c4b46a2d 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -963,6 +963,27 @@ def get_net(num_hidden, flatten=True):
             optimizer='nadam')
     assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.1
 
+@with_seed(0)
+def test_adagrad():
+    opt1 = mx.optimizer.AdaGrad
+    opt2 = mx.optimizer.AdaGrad
+    shape = (3, 4, 5)
+    eps_options = [{}, {'eps': 1e-7}]
+    cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
+    rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
+    wd_options = [{}, {'wd': 0.0}]
+    for dtype in [np.float32]:
+        for eps_option in eps_options:
+            for cg_option in cg_options:
+                for rg_option in rg_options:
+                    for wd_option in wd_options:
+                        kwarg = {}
+                        kwarg.update(eps_option)
+                        kwarg.update(cg_option)
+                        kwarg.update(rg_option)
+                        kwarg.update(wd_option)
+                        compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype,
+                                          w_stype='row_sparse', g_stype='row_sparse')
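+
+# A minimal usage sketch for the sparse AdaGrad optimizer (not executed by the test;
+# grad_rsp and weight_rsp are placeholder row_sparse NDArrays):
+#   optimizer = mx.optimizer.AdaGrad(learning_rate=0.05, eps=1e-4)
+#   updater = mx.optimizer.get_updater(optimizer)
+#   updater(0, grad_rsp, weight_rsp)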
 
 if __name__ == '__main__':
     import nose


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services