You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by cl...@apache.org on 2016/07/25 21:10:59 UTC

svn commit: r1754049 - in /ctakes/trunk/ctakes-temporal/scripts/keras: classify.sh et_RCNN_train-and-package.py rCNN-predict.py train.sh

Author: clin
Date: Mon Jul 25 21:10:58 2016
New Revision: 1754049

URL: http://svn.apache.org/viewvc?rev=1754049&view=rev
Log:
Check in the Region-based CNN implementation (python code) for event-time.

Added:
    ctakes/trunk/ctakes-temporal/scripts/keras/et_RCNN_train-and-package.py
    ctakes/trunk/ctakes-temporal/scripts/keras/rCNN-predict.py
Modified:
    ctakes/trunk/ctakes-temporal/scripts/keras/classify.sh
    ctakes/trunk/ctakes-temporal/scripts/keras/train.sh

Modified: ctakes/trunk/ctakes-temporal/scripts/keras/classify.sh
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/keras/classify.sh?rev=1754049&r1=1754048&r2=1754049&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/keras/classify.sh (original)
+++ ctakes/trunk/ctakes-temporal/scripts/keras/classify.sh Mon Jul 25 21:10:58 2016
@@ -2,7 +2,7 @@
 
 source $(dirname $0)/env/bin/activate
 
-python $(dirname $0)/dima-predict.py $*
+python $(dirname $0)/rCNN-predict.py $*
 
 ret=$?
 

Added: ctakes/trunk/ctakes-temporal/scripts/keras/et_RCNN_train-and-package.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/keras/et_RCNN_train-and-package.py?rev=1754049&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/keras/et_RCNN_train-and-package.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/keras/et_RCNN_train-and-package.py Mon Jul 25 21:10:58 2016
@@ -0,0 +1,152 @@
+#!/usr/bin/env python
+
+import sklearn as sk
+
+import numpy as np
+np.random.seed(1337)
+
+import et_cleartk_io as ctk_io
+import nn_models
+
+import sys
+import os.path
+
+import dataset
+
+import keras as k
+from keras.utils.np_utils import to_categorical
+from keras.optimizers import RMSprop
+from keras.preprocessing.sequence import pad_sequences
+from keras.models import Sequential
+from keras.layers import Merge
+from keras.layers.core import Dense, Dropout, Activation, Flatten
+from keras.layers.convolutional import Convolution1D, MaxPooling1D
+from keras.layers.embeddings import Embedding
+
+import pickle
+
+def main(args):
+    if len(args) < 1:
+        sys.stderr.write("Error - one required argument: <data directory>\n")
+        sys.exit(-1)
+
+    working_dir = args[0]
+
+    #read in data file
+#    print("Reading data...")
+    #Y, X = ctk_io.read_liblinear(working_dir) # ('data_testing/multitask_assertion/train_and_test')
+    data_file = os.path.join(working_dir, 'training-data.liblinear')
+
+    # learn alphabet from training and test data
+    dataset1 = dataset.DatasetProvider([data_file])
+    # now load training examples and labels
+    train_x, train_y = dataset1.load(data_file)
+    pres, arg1s, conts, arg2s, posts, train_y = dataset1.load_by_region(data_file)
+
+    init_vectors = None #used for pre-trained embeddings
+    
+    # turn x and y into numpy array among other things
+    maxlen = max([len(seq) for seq in train_x])
+    outcomes = set(train_y)
+    classes = len(outcomes)
+
+    train_x = pad_sequences(train_x, maxlen = maxlen, truncating='pre')
+    pres_x = pad_sequences(pres, maxlen=5, truncating='pre')
+    arg1s_x = pad_sequences(arg1s, maxlen = 5, truncating='pre')
+    conts_x  = pad_sequences(conts, maxlen = 120, truncating='pre')
+    arg2s_x = pad_sequences(arg2s, maxlen = 5, truncating='pre')
+    posts_x = pad_sequences(posts, maxlen=5, truncating='post')
+    train_y = to_categorical(np.array(train_y), classes)
+
+    pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'),"wb"))
+    pickle.dump(dataset1.alphabet, open(os.path.join(working_dir, 'alphabet.p'),"wb"))
+    #test_x = pad_sequences(test_x, maxlen=maxlen)
+    #test_y = to_categorical(np.array(test_y), classes)
+
+    print 'pres_x shape:', pres_x.shape
+    print 'arg1s_x shape:', arg1s_x.shape
+    print 'conts_x shape:', conts_x.shape
+    print 'arg2s_x shape:', arg2s_x.shape
+    print 'posts_x shape:', posts_x.shape
+    print 'train_y shape:', train_y.shape
+
+    branches = [] # models to be merged
+    train_xs = [] # train x for each branch
+    length =[]
+    #test_xs = []  # test x for each branch
+    train_xs.append(train_x)
+    length.append(train_x.shape[1])
+    train_xs.append(train_x)
+    length.append(train_x.shape[1])
+    train_xs.append(train_x)
+    length.append(train_x.shape[1])
+    train_xs.append(pres_x) #for filter 2
+    length.append(pres_x.shape[1])
+    train_xs.append(pres_x) #for filter 3
+    length.append(pres_x.shape[1])
+    train_xs.append(arg1s_x) # filer 2
+    length.append(arg1s_x.shape[1])
+    train_xs.append(conts_x) # filter 3
+    length.append(conts_x.shape[1])
+    train_xs.append(conts_x) # filter 4
+    length.append(conts_x.shape[1])
+    train_xs.append(conts_x) # filter 5
+    length.append(conts_x.shape[1])
+    train_xs.append(arg2s_x) # filer 2
+    length.append(arg2s_x.shape[1])
+    train_xs.append(posts_x) #for filter 2
+    length.append(posts_x.shape[1])
+    train_xs.append(posts_x) #for filter 3
+    length.append(posts_x.shape[1])
+
+    filtlens = "3,4,5,2,3,2,30,40,50,2,2,3"
+    filters = filtlens.split(',')
+    for i in range(len(filters)):
+        branch = Sequential()
+        branch.add(Embedding(len(dataset1.alphabet),
+                         200,
+                         input_length=length[i],
+                         weights=init_vectors))
+        branch.add(Convolution1D(nb_filter=200,
+                             filter_length=int(filters[i]),
+                             border_mode='valid',
+                             activation='relu',
+                             subsample_length=1))
+        branch.add(MaxPooling1D(pool_length=2))
+        branch.add(Flatten())
+        branches.append(branch)
+
+        #test_xs.append(test_x)
+    model = Sequential()
+    model.add(Merge(branches, mode='concat'))
+
+    model.add(Dense(350))#cfg.getint('cnn', 'hidden')))
+    model.add(Dropout(0.25))#cfg.getfloat('cnn', 'dropout')))
+    model.add(Activation('relu'))
+
+    model.add(Dropout(0.25))#cfg.getfloat('cnn', 'dropout')))
+    model.add(Dense(classes))
+    model.add(Activation('softmax'))
+
+    optimizer = RMSprop(lr=0.0001,#cfg.getfloat('cnn', 'learnrt'),
+                      rho=0.9, epsilon=1e-08)
+    model.compile(loss='categorical_crossentropy',
+                optimizer=optimizer,
+                metrics=['accuracy'])
+    model.fit(train_xs,
+            train_y,
+            nb_epoch=3,#cfg.getint('cnn', 'epochs'),
+            batch_size=50,#cfg.getint('cnn', 'batches'),
+            verbose=1,
+            validation_split=0.1,
+            class_weight=None)
+
+    model.summary()
+
+    json_string = model.to_json()
+    open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
+    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)
+    sys.exit(0)
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
\ No newline at end of file

Added: ctakes/trunk/ctakes-temporal/scripts/keras/rCNN-predict.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/keras/rCNN-predict.py?rev=1754049&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/keras/rCNN-predict.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/keras/rCNN-predict.py Mon Jul 25 21:10:58 2016
@@ -0,0 +1,112 @@
+#!python
+
+from keras.models import Sequential, model_from_json
+import numpy as np
+import et_cleartk_io as ctk_io
+import sys
+import os.path
+import pickle
+from keras.preprocessing.sequence import pad_sequences
+from fnmatch import fnmatch
+
+def main(args):
+    if len(args) < 1:
+        sys.stderr.write("Error - one required argument: <model directory>\n")
+        sys.exit(-1)
+
+    working_dir = args[0]
+
+    int2label = {
+        0:'none',
+        1:'CONTAINS',
+        2:'CONTAINS-1'
+    }
+
+    ## Load models and weights:
+    #outcomes = ctk_io.get_outcome_array(working_dir)
+    model_dir = "/Users/chenlin/Programming/ctakesWorkspace/ctakes/ctakes-temporal/target/eval/thyme/train_and_test/event-time"
+    maxlen   = pickle.load(open(os.path.join(model_dir, "maxlen.p"), "rb"))
+    alphabet = pickle.load(open(os.path.join(model_dir, "alphabet.p"), "rb"))
+    #print("Outcomes array is %s" % (outcomes) )
+    model = model_from_json(open(os.path.join(model_dir, "model_0.json")).read())
+    model.load_weights(os.path.join(model_dir, "model_0.h5"))
+
+    while True:
+        try:
+            line = sys.stdin.readline().rstrip()
+            if not line:
+                break
+
+            ## Convert the line of Strings to lists of indices
+            pre=[]
+            arg1=[]
+            cont=[]
+            arg2=[]
+            post=[]
+            train_x = []
+            tag = 0
+            for unigram in line.rstrip().split():
+                if(alphabet.has_key(unigram)):
+                    idx = alphabet[unigram]
+                else:
+                    idx = alphabet["none"]
+
+                train_x.append(idx)
+                if( fnmatch(unigram, '<*>')):
+                    tag = tag + 1
+                    continue
+                if(tag ==0 ):
+                    pre.append(idx)
+                elif(tag == 1):
+                    arg1.append(idx)
+                elif(tag == 2):
+                    cont.append(idx)
+                elif(tag == 3):
+                    arg2.append(idx)
+                elif(tag == 4):
+                    post.append(idx)
+
+            train_x = pad_sequences([train_x], maxlen=maxlen, truncating='pre')
+            pres_x = pad_sequences([pre], maxlen=5, truncating='pre')
+            arg1s_x = pad_sequences([arg1], maxlen = 5, truncating='pre')
+            conts_x  = pad_sequences([cont], maxlen = 120, truncating='pre')
+            arg2s_x = pad_sequences([arg2], maxlen = 5, truncating='pre')
+            posts_x = pad_sequences([post], maxlen=5, truncating='post')
+            #test_x = pad_sequences([feats], maxlen=maxlen)
+            #feats = np.reshape(feats, (1, 6, input_dims / 6))
+            #feats = np.reshape(feats, (1, input_dims))
+
+            X_dup = []
+            X_dup.append(train_x)
+            X_dup.append(train_x)
+            X_dup.append(train_x)
+            X_dup.append(pres_x)
+            X_dup.append(pres_x)
+            X_dup.append(arg1s_x)
+            X_dup.append(conts_x)
+            X_dup.append(conts_x)
+            X_dup.append(conts_x)
+            X_dup.append(arg2s_x)
+            X_dup.append(posts_x)
+            X_dup.append(posts_x)
+
+            out = model.predict(X_dup)[0]
+            # print("Out is %s and decision is %d" % (out, out.argmax()))
+        except KeyboardInterrupt:
+            sys.stderr.write("Caught keyboard interrupt\n")
+            break
+
+        if line == '':
+            sys.stderr.write("Encountered empty string so exiting\n")
+            break
+
+        out_str = int2label[out.argmax()]
+
+        print(out_str)
+        sys.stdout.flush()
+
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])

Modified: ctakes/trunk/ctakes-temporal/scripts/keras/train.sh
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/keras/train.sh?rev=1754049&r1=1754048&r2=1754049&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/keras/train.sh (original)
+++ ctakes/trunk/ctakes-temporal/scripts/keras/train.sh Mon Jul 25 21:10:58 2016
@@ -2,7 +2,7 @@
 
 source $(dirname $0)/env/bin/activate
 
-python $(dirname $0)/et_dimaCNN_train-and-package.py $*
+python $(dirname $0)/et_RCNN_train-and-package.py $*
 
 ret=$?