You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by cl...@apache.org on 2016/07/25 21:10:59 UTC
svn commit: r1754049 - in /ctakes/trunk/ctakes-temporal/scripts/keras:
classify.sh et_RCNN_train-and-package.py rCNN-predict.py train.sh
Author: clin
Date: Mon Jul 25 21:10:58 2016
New Revision: 1754049
URL: http://svn.apache.org/viewvc?rev=1754049&view=rev
Log:
Check in the Region-based CNN implementation (python code) for event-time.
Added:
ctakes/trunk/ctakes-temporal/scripts/keras/et_RCNN_train-and-package.py
ctakes/trunk/ctakes-temporal/scripts/keras/rCNN-predict.py
Modified:
ctakes/trunk/ctakes-temporal/scripts/keras/classify.sh
ctakes/trunk/ctakes-temporal/scripts/keras/train.sh
Modified: ctakes/trunk/ctakes-temporal/scripts/keras/classify.sh
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/keras/classify.sh?rev=1754049&r1=1754048&r2=1754049&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/keras/classify.sh (original)
+++ ctakes/trunk/ctakes-temporal/scripts/keras/classify.sh Mon Jul 25 21:10:58 2016
@@ -2,7 +2,7 @@
source $(dirname $0)/env/bin/activate
-python $(dirname $0)/dima-predict.py $*
+python $(dirname $0)/rCNN-predict.py $*
ret=$?
Added: ctakes/trunk/ctakes-temporal/scripts/keras/et_RCNN_train-and-package.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/keras/et_RCNN_train-and-package.py?rev=1754049&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/keras/et_RCNN_train-and-package.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/keras/et_RCNN_train-and-package.py Mon Jul 25 21:10:58 2016
@@ -0,0 +1,152 @@
+#!/usr/bin/env python
+
+import sklearn as sk
+
+import numpy as np
+np.random.seed(1337)
+
+import et_cleartk_io as ctk_io
+import nn_models
+
+import sys
+import os.path
+
+import dataset
+
+import keras as k
+from keras.utils.np_utils import to_categorical
+from keras.optimizers import RMSprop
+from keras.preprocessing.sequence import pad_sequences
+from keras.models import Sequential
+from keras.layers import Merge
+from keras.layers.core import Dense, Dropout, Activation, Flatten
+from keras.layers.convolutional import Convolution1D, MaxPooling1D
+from keras.layers.embeddings import Embedding
+
+import pickle
+
+def main(args):
+ if len(args) < 1:
+ sys.stderr.write("Error - one required argument: <data directory>\n")
+ sys.exit(-1)
+
+ working_dir = args[0]
+
+ #read in data file
+# print("Reading data...")
+ #Y, X = ctk_io.read_liblinear(working_dir) # ('data_testing/multitask_assertion/train_and_test')
+ data_file = os.path.join(working_dir, 'training-data.liblinear')
+
+ # learn alphabet from training and test data
+ dataset1 = dataset.DatasetProvider([data_file])
+ # now load training examples and labels
+ train_x, train_y = dataset1.load(data_file)
+ pres, arg1s, conts, arg2s, posts, train_y = dataset1.load_by_region(data_file)
+
+ init_vectors = None #used for pre-trained embeddings
+
+ # turn x and y into numpy array among other things
+ maxlen = max([len(seq) for seq in train_x])
+ outcomes = set(train_y)
+ classes = len(outcomes)
+
+ train_x = pad_sequences(train_x, maxlen = maxlen, truncating='pre')
+ pres_x = pad_sequences(pres, maxlen=5, truncating='pre')
+ arg1s_x = pad_sequences(arg1s, maxlen = 5, truncating='pre')
+ conts_x = pad_sequences(conts, maxlen = 120, truncating='pre')
+ arg2s_x = pad_sequences(arg2s, maxlen = 5, truncating='pre')
+ posts_x = pad_sequences(posts, maxlen=5, truncating='post')
+ train_y = to_categorical(np.array(train_y), classes)
+
+ pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'),"wb"))
+ pickle.dump(dataset1.alphabet, open(os.path.join(working_dir, 'alphabet.p'),"wb"))
+ #test_x = pad_sequences(test_x, maxlen=maxlen)
+ #test_y = to_categorical(np.array(test_y), classes)
+
+ print 'pres_x shape:', pres_x.shape
+ print 'arg1s_x shape:', arg1s_x.shape
+ print 'conts_x shape:', conts_x.shape
+ print 'arg2s_x shape:', arg2s_x.shape
+ print 'posts_x shape:', posts_x.shape
+ print 'train_y shape:', train_y.shape
+
+ branches = [] # models to be merged
+ train_xs = [] # train x for each branch
+ length =[]
+ #test_xs = [] # test x for each branch
+ train_xs.append(train_x)
+ length.append(train_x.shape[1])
+ train_xs.append(train_x)
+ length.append(train_x.shape[1])
+ train_xs.append(train_x)
+ length.append(train_x.shape[1])
+ train_xs.append(pres_x) #for filter 2
+ length.append(pres_x.shape[1])
+ train_xs.append(pres_x) #for filter 3
+ length.append(pres_x.shape[1])
+ train_xs.append(arg1s_x) # filer 2
+ length.append(arg1s_x.shape[1])
+ train_xs.append(conts_x) # filter 3
+ length.append(conts_x.shape[1])
+ train_xs.append(conts_x) # filter 4
+ length.append(conts_x.shape[1])
+ train_xs.append(conts_x) # filter 5
+ length.append(conts_x.shape[1])
+ train_xs.append(arg2s_x) # filer 2
+ length.append(arg2s_x.shape[1])
+ train_xs.append(posts_x) #for filter 2
+ length.append(posts_x.shape[1])
+ train_xs.append(posts_x) #for filter 3
+ length.append(posts_x.shape[1])
+
+ filtlens = "3,4,5,2,3,2,30,40,50,2,2,3"
+ filters = filtlens.split(',')
+ for i in range(len(filters)):
+ branch = Sequential()
+ branch.add(Embedding(len(dataset1.alphabet),
+ 200,
+ input_length=length[i],
+ weights=init_vectors))
+ branch.add(Convolution1D(nb_filter=200,
+ filter_length=int(filters[i]),
+ border_mode='valid',
+ activation='relu',
+ subsample_length=1))
+ branch.add(MaxPooling1D(pool_length=2))
+ branch.add(Flatten())
+ branches.append(branch)
+
+ #test_xs.append(test_x)
+ model = Sequential()
+ model.add(Merge(branches, mode='concat'))
+
+ model.add(Dense(350))#cfg.getint('cnn', 'hidden')))
+ model.add(Dropout(0.25))#cfg.getfloat('cnn', 'dropout')))
+ model.add(Activation('relu'))
+
+ model.add(Dropout(0.25))#cfg.getfloat('cnn', 'dropout')))
+ model.add(Dense(classes))
+ model.add(Activation('softmax'))
+
+ optimizer = RMSprop(lr=0.0001,#cfg.getfloat('cnn', 'learnrt'),
+ rho=0.9, epsilon=1e-08)
+ model.compile(loss='categorical_crossentropy',
+ optimizer=optimizer,
+ metrics=['accuracy'])
+ model.fit(train_xs,
+ train_y,
+ nb_epoch=3,#cfg.getint('cnn', 'epochs'),
+ batch_size=50,#cfg.getint('cnn', 'batches'),
+ verbose=1,
+ validation_split=0.1,
+ class_weight=None)
+
+ model.summary()
+
+ json_string = model.to_json()
+ open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
+ model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)
+ sys.exit(0)
+
+if __name__ == "__main__":
+ main(sys.argv[1:])
\ No newline at end of file
Added: ctakes/trunk/ctakes-temporal/scripts/keras/rCNN-predict.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/keras/rCNN-predict.py?rev=1754049&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/keras/rCNN-predict.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/keras/rCNN-predict.py Mon Jul 25 21:10:58 2016
@@ -0,0 +1,112 @@
+#!python
+
+from keras.models import Sequential, model_from_json
+import numpy as np
+import et_cleartk_io as ctk_io
+import sys
+import os.path
+import pickle
+from keras.preprocessing.sequence import pad_sequences
+from fnmatch import fnmatch
+
+def main(args):
+ if len(args) < 1:
+ sys.stderr.write("Error - one required argument: <model directory>\n")
+ sys.exit(-1)
+
+ working_dir = args[0]
+
+ int2label = {
+ 0:'none',
+ 1:'CONTAINS',
+ 2:'CONTAINS-1'
+ }
+
+ ## Load models and weights:
+ #outcomes = ctk_io.get_outcome_array(working_dir)
+ model_dir = "/Users/chenlin/Programming/ctakesWorkspace/ctakes/ctakes-temporal/target/eval/thyme/train_and_test/event-time"
+ maxlen = pickle.load(open(os.path.join(model_dir, "maxlen.p"), "rb"))
+ alphabet = pickle.load(open(os.path.join(model_dir, "alphabet.p"), "rb"))
+ #print("Outcomes array is %s" % (outcomes) )
+ model = model_from_json(open(os.path.join(model_dir, "model_0.json")).read())
+ model.load_weights(os.path.join(model_dir, "model_0.h5"))
+
+ while True:
+ try:
+ line = sys.stdin.readline().rstrip()
+ if not line:
+ break
+
+ ## Convert the line of Strings to lists of indices
+ pre=[]
+ arg1=[]
+ cont=[]
+ arg2=[]
+ post=[]
+ train_x = []
+ tag = 0
+ for unigram in line.rstrip().split():
+ if(alphabet.has_key(unigram)):
+ idx = alphabet[unigram]
+ else:
+ idx = alphabet["none"]
+
+ train_x.append(idx)
+ if( fnmatch(unigram, '<*>')):
+ tag = tag + 1
+ continue
+ if(tag ==0 ):
+ pre.append(idx)
+ elif(tag == 1):
+ arg1.append(idx)
+ elif(tag == 2):
+ cont.append(idx)
+ elif(tag == 3):
+ arg2.append(idx)
+ elif(tag == 4):
+ post.append(idx)
+
+ train_x = pad_sequences([train_x], maxlen=maxlen, truncating='pre')
+ pres_x = pad_sequences([pre], maxlen=5, truncating='pre')
+ arg1s_x = pad_sequences([arg1], maxlen = 5, truncating='pre')
+ conts_x = pad_sequences([cont], maxlen = 120, truncating='pre')
+ arg2s_x = pad_sequences([arg2], maxlen = 5, truncating='pre')
+ posts_x = pad_sequences([post], maxlen=5, truncating='post')
+ #test_x = pad_sequences([feats], maxlen=maxlen)
+ #feats = np.reshape(feats, (1, 6, input_dims / 6))
+ #feats = np.reshape(feats, (1, input_dims))
+
+ X_dup = []
+ X_dup.append(train_x)
+ X_dup.append(train_x)
+ X_dup.append(train_x)
+ X_dup.append(pres_x)
+ X_dup.append(pres_x)
+ X_dup.append(arg1s_x)
+ X_dup.append(conts_x)
+ X_dup.append(conts_x)
+ X_dup.append(conts_x)
+ X_dup.append(arg2s_x)
+ X_dup.append(posts_x)
+ X_dup.append(posts_x)
+
+ out = model.predict(X_dup)[0]
+ # print("Out is %s and decision is %d" % (out, out.argmax()))
+ except KeyboardInterrupt:
+ sys.stderr.write("Caught keyboard interrupt\n")
+ break
+
+ if line == '':
+ sys.stderr.write("Encountered empty string so exiting\n")
+ break
+
+ out_str = int2label[out.argmax()]
+
+ print(out_str)
+ sys.stdout.flush()
+
+ sys.exit(0)
+
+
+if __name__ == "__main__":
+ main(sys.argv[1:])
Modified: ctakes/trunk/ctakes-temporal/scripts/keras/train.sh
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/keras/train.sh?rev=1754049&r1=1754048&r2=1754049&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/keras/train.sh (original)
+++ ctakes/trunk/ctakes-temporal/scripts/keras/train.sh Mon Jul 25 21:10:58 2016
@@ -2,7 +2,7 @@
source $(dirname $0)/env/bin/activate
-python $(dirname $0)/et_dimaCNN_train-and-package.py $*
+python $(dirname $0)/et_RCNN_train-and-package.py $*
ret=$?