Posted to commits@ctakes.apache.org by dl...@apache.org on 2016/09/24 00:58:14 UTC

svn commit: r1762100 - in /ctakes/trunk/ctakes-temporal/scripts/nn: classify.sh dataset_hybrid.py dataset_hybrid.pyc hybrid_classify.py hybrid_train.py train.sh

Author: dligach
Date: Sat Sep 24 00:58:14 2016
New Revision: 1762100

URL: http://svn.apache.org/viewvc?rev=1762100&view=rev
Log:
code to get pos/token hybrid model working
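
Both dataset_hybrid.py and hybrid_train.py below read pipe-delimited
lines of the form label|tokens|pos tags (hybrid_classify.py reads the
same format without the leading label). A hypothetical sample line,
values illustrative only:

    CONTAINS|patient denies fever|NN VBZ NN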

Added:
    ctakes/trunk/ctakes-temporal/scripts/nn/dataset_hybrid.py   (with props)
    ctakes/trunk/ctakes-temporal/scripts/nn/dataset_hybrid.pyc   (with props)
    ctakes/trunk/ctakes-temporal/scripts/nn/hybrid_classify.py
    ctakes/trunk/ctakes-temporal/scripts/nn/hybrid_train.py
Modified:
    ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh
    ctakes/trunk/ctakes-temporal/scripts/nn/train.sh

Modified: ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh?rev=1762100&r1=1762099&r2=1762100&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh (original)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh Sat Sep 24 00:58:14 2016
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 source $(dirname $0)/env/bin/activate
-python $(dirname $0)/cnn_classify.py $*
+python $(dirname $0)/hybrid_classify.py $*
 ret=$?
 deactivate
 exit $ret

Added: ctakes/trunk/ctakes-temporal/scripts/nn/dataset_hybrid.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/dataset_hybrid.py?rev=1762100&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/dataset_hybrid.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/dataset_hybrid.py Sat Sep 24 00:58:14 2016
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+
+import numpy as np
+import sys, ConfigParser, collections
+sys.dont_write_bytecode = True
+
+class DatasetProvider:
+  """THYME relation data"""
+  
+  def __init__(self, path):
+    """Index words by frequency in a file"""
+
+    self.word2int = {}  # words indexed by frequency
+    self.tag2int = {}   # pos tags indexed by frequency
+    self.label2int = {} # class to int mapping
+    
+    unigrams = [] # corpus as list
+    tags = []     # pos tags as list
+    labels = []   # classes as list
+    for line in open(path):
+      label, text, pos = line.strip().split('|')
+      unigrams.extend(text.split())
+      tags.extend(pos.split())
+      labels.append(label)
+        
+    index = 1 # zero used to encode unknown words
+    self.word2int['oov_word'] = 0
+    unigram_counts = collections.Counter(unigrams)
+    for unigram, count in unigram_counts.most_common():
+      self.word2int[unigram] = index
+      index = index + 1
+
+    index = 1 # zero used to encode unknown pos tags
+    self.tag2int['oov_tag'] = 0
+    tag_counts = collections.Counter(tags)
+    for tag, count in tag_counts.most_common():
+      self.tag2int[tag] = index
+      index = index + 1
+
+    index = 0 # index classes
+    for label in set(labels):
+      self.label2int[label] = index
+      index = index + 1
+
+  def load(self, path, maxlen=float('inf')):
+    """Convert sentences (examples) into lists of indices"""
+
+    examples = [] # sequences of words as ints
+    tagseqs = []  # sequences of pos tags as ints
+    labels = []   # labels
+
+    for line in open(path):
+      label, text, pos = line.strip().split('|')
+
+      example = []
+      for unigram in text.split():
+        if unigram in self.word2int:
+          example.append(self.word2int[unigram])
+        else:
+          example.append(self.word2int['oov_word'])
+
+      tagseq = []
+      for tag in pos.split():
+        if tag in self.tag2int:
+          tagseq.append(self.tag2int[tag])
+        else:
+          tagseq.append(self.tag2int['oov_tag'])
+
+      # truncate example if it's too long
+      if len(example) > maxlen:
+        example = example[0:maxlen]
+      if len(tagseq) > maxlen:
+        tagseq = tagseq[0:maxlen]
+
+      examples.append(example)
+      tagseqs.append(tagseq)
+      labels.append(self.label2int[label])
+
+    return examples, tagseqs, labels
+
+if __name__ == "__main__":
+
+  cfg = ConfigParser.ConfigParser()
+  cfg.read(sys.argv[1])
+
+  dataset = DatasetProvider(cfg.get('data', 'train'))
+  print 'tag alphabet size:', len(dataset.tag2int)
+  x1, x2, y = dataset.load(cfg.get('data', 'train'))
+  print 'train max seq len:', max([len(s) for s in x1])
+  
+  x1, x2, y = dataset.load(cfg.get('data', 'test'), maxlen=10)
+  print 'test max seq len:', max([len(s) for s in x2])
+  print 'labels:', dataset.label2int
+  print 'label counts:', collections.Counter(y)
+  print 'first 10 examples:', x2[:10]
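
The __main__ block above reads an INI-style configuration through
ConfigParser, expecting train and test entries under a [data] section.
A minimal sketch, with hypothetical paths:

    [data]
    train = /path/to/training-data.liblinear
    test = /path/to/test-data.liblinear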

Propchange: ctakes/trunk/ctakes-temporal/scripts/nn/dataset_hybrid.py
------------------------------------------------------------------------------
    svn:executable = *

Added: ctakes/trunk/ctakes-temporal/scripts/nn/dataset_hybrid.pyc
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/dataset_hybrid.pyc?rev=1762100&view=auto
==============================================================================
Binary file - no diff available.

Propchange: ctakes/trunk/ctakes-temporal/scripts/nn/dataset_hybrid.pyc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: ctakes/trunk/ctakes-temporal/scripts/nn/hybrid_classify.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/hybrid_classify.py?rev=1762100&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/hybrid_classify.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/hybrid_classify.py Sat Sep 24 00:58:14 2016
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+
+from keras.models import Sequential, model_from_json
+import numpy as np
+import et_cleartk_io as ctk_io
+import sys
+import os.path
+import pickle
+from keras.preprocessing.sequence import pad_sequences
+
+def main(args):
+    if len(args) < 1:
+        sys.stderr.write("Error - one required argument: <model directory>\n")
+        sys.exit(-1)
+    working_dir = args[0]
+
+    target_dir = 'ctakes-temporal/target/eval/thyme/train_and_test/event-time/'
+    model_dir = os.path.join(os.environ['CTAKES_ROOT'], target_dir)
+    maxlen   = pickle.load(open(os.path.join(model_dir, "maxlen.p"), "rb"))
+    word2int = pickle.load(open(os.path.join(model_dir, "word2int.p"), "rb"))
+    tag2int = pickle.load(open(os.path.join(model_dir, "tag2int.p"), "rb"))
+    label2int = pickle.load(open(os.path.join(model_dir, "label2int.p"), "rb"))
+    model = model_from_json(open(os.path.join(model_dir, "model_0.json")).read())
+    model.load_weights(os.path.join(model_dir, "model_0.h5"))
+
+    int2label = {}
+    for label, integer in label2int.items():
+        int2label[integer] = label
+
+    while True:
+        try:
+            line = sys.stdin.readline().rstrip()
+            if not line:
+                break
+
+            text, pos = line.strip().split('|')
+
+            tokens = []
+            for token in text.rstrip().split():
+                if token in word2int:
+                    tokens.append(word2int[token])
+                else:
+                    tokens.append(word2int['oov_word'])
+
+            tags = []
+            for tag in pos.rstrip().split():
+                if tag in tag2int:
+                    tags.append(tag2int[tag])
+                else:
+                    tags.append(tag2int['oov_tag'])
+                    
+            if len(tokens) > maxlen:
+                tokens = tokens[0:maxlen]
+            if len(tags) > maxlen:
+                tags = tags[0:maxlen]
+                
+            test_x1 = pad_sequences([tokens], maxlen=maxlen)
+            test_x2 = pad_sequences([tags], maxlen=maxlen)
+
+            # four token inputs then four tag inputs, matching the
+            # order of the merged branches in hybrid_train.py
+            test_xs = [test_x1] * 4 + [test_x2] * 4
+
+            out = model.predict(test_xs, batch_size=50)[0]
+
+        except KeyboardInterrupt:
+            sys.stderr.write("Caught keyboard interrupt\n")
+            break
+
+        if line == '':
+            sys.stderr.write("Encountered empty string so exiting\n")
+            break
+
+        out_str = int2label[out.argmax()]
+        print out_str
+        sys.stdout.flush()
+
+    sys.exit(0)
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
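
hybrid_classify.py reads text|pos lines on stdin and writes one
predicted label per line to stdout. Note that it takes a working
directory argument but loads its model from a fixed path under
$CTAKES_ROOT, so that variable must be set. A hypothetical invocation
through the updated classify.sh, with illustrative paths:

    export CTAKES_ROOT=/path/to/ctakes
    echo 'patient denies fever|NN VBZ NN' | ./classify.sh /path/to/work/dir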

Added: ctakes/trunk/ctakes-temporal/scripts/nn/hybrid_train.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/hybrid_train.py?rev=1762100&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/hybrid_train.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/hybrid_train.py Sat Sep 24 00:58:14 2016
@@ -0,0 +1,117 @@
+#!/usr/bin/env python
+
+import sklearn as sk
+import numpy as np
+np.random.seed(1337)
+import et_cleartk_io as ctk_io
+import nn_models
+import sys
+import os.path
+import dataset_hybrid
+import keras as k
+from keras.utils.np_utils import to_categorical
+from keras.optimizers import RMSprop
+from keras.preprocessing.sequence import pad_sequences
+from keras.models import Sequential
+from keras.layers import Merge
+from keras.layers.core import Dense, Dropout, Activation, Flatten
+from keras.layers.convolutional import Convolution1D, MaxPooling1D
+from keras.layers.embeddings import Embedding
+import pickle
+
+def main(args):
+    if len(args) < 1:
+        sys.stderr.write("Error - one required argument: <data directory>\n")
+        sys.exit(-1)
+    working_dir = args[0]
+    data_file = os.path.join(working_dir, 'training-data.liblinear')
+
+    # learn alphabet from training data
+    provider = dataset_hybrid.DatasetProvider(data_file)
+    # now load training examples and labels
+    train_x1, train_x2, train_y = provider.load(data_file)
+    # turn x and y into numpy array among other things
+    maxlen = max([len(seq) for seq in train_x1])
+    classes = len(set(train_y))
+
+    train_x1 = pad_sequences(train_x1, maxlen=maxlen)
+    train_x2 = pad_sequences(train_x2, maxlen=maxlen)
+    train_y = to_categorical(np.array(train_y), classes)
+
+    pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'),"wb"))
+    pickle.dump(provider.word2int, open(os.path.join(working_dir, 'word2int.p'),"wb"))
+    pickle.dump(provider.tag2int, open(os.path.join(working_dir, 'tag2int.p'),"wb"))
+    pickle.dump(provider.label2int, open(os.path.join(working_dir, 'label2int.p'),"wb"))
+
+    print 'train_x1 shape:', train_x1.shape
+    print 'train_x2 shape:', train_x2.shape
+    print 'train_y shape:', train_y.shape
+
+    branches = [] # models to be merged
+    train_xs = [] # train x for each branch
+
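+    # convolutional branches over the token sequences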
+    for filter_len in '2,3,4,5'.split(','):
+        branch = Sequential()
+        branch.add(Embedding(len(provider.word2int),
+                             300,
+                             input_length=maxlen,
+                             weights=None))
+        branch.add(Convolution1D(nb_filter=200,
+                                 filter_length=int(filter_len),
+                                 border_mode='valid',
+                                 activation='relu',
+                                 subsample_length=1))
+        branch.add(MaxPooling1D(pool_length=2))
+        branch.add(Flatten())
+
+        branches.append(branch)
+        train_xs.append(train_x1)
+
+    # a second set of convolutional branches over the pos tag sequences
+    for filter_len in '2,3,4,5'.split(','):
+        branch = Sequential()
+        branch.add(Embedding(len(provider.tag2int),
+                             300,
+                             input_length=maxlen,
+                             weights=None))
+        branch.add(Convolution1D(nb_filter=200,
+                                 filter_length=int(filter_len),
+                                 border_mode='valid',
+                                 activation='relu',
+                                 subsample_length=1))
+        branch.add(MaxPooling1D(pool_length=2))
+        branch.add(Flatten())
+
+        branches.append(branch)
+        train_xs.append(train_x2)
+
+    model = Sequential()
+    model.add(Merge(branches, mode='concat'))
+
+    model.add(Dense(300))
+    model.add(Dropout(0.25))
+    model.add(Activation('relu'))
+
+    model.add(Dropout(0.25))
+    model.add(Dense(classes))
+    model.add(Activation('softmax'))
+
+    optimizer = RMSprop(lr=0.0001, rho=0.9, epsilon=1e-08)
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=optimizer,
+                  metrics=['accuracy'])
+    model.fit(train_xs,
+              train_y,
+              nb_epoch=3,
+              batch_size=50,
+              verbose=1,
+              validation_split=0.1)
+
+    json_string = model.to_json()
+    open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
+    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)
+    sys.exit(0)
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
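
hybrid_train.py builds eight convolutional branches (four filter
widths over the token sequences, then the same four over the pos tag
sequences) and merges them by concatenation, which is why prediction
feeds the token matrix four times followed by the tag matrix four
times, in the same order. A hypothetical training run, assuming the
working directory already holds training-data.liblinear:

    ./train.sh /path/to/working/dir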

Modified: ctakes/trunk/ctakes-temporal/scripts/nn/train.sh
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/train.sh?rev=1762100&r1=1762099&r2=1762100&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/train.sh (original)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/train.sh Sat Sep 24 00:58:14 2016
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 source $(dirname $0)/env/bin/activate
-python $(dirname $0)/cnn_train.py $*
+python $(dirname $0)/hybrid_train.py $*
 ret=$?
 deactivate
 exit $ret