You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by cl...@apache.org on 2016/07/19 17:19:56 UTC
svn commit: r1753408 - in /ctakes/trunk/ctakes-temporal/scripts: ./ keras/
Author: clin
Date: Tue Jul 19 17:19:55 2016
New Revision: 1753408
URL: http://svn.apache.org/viewvc?rev=1753408&view=rev
Log:
python scripts that are needed for running keras-based CNN
Added:
ctakes/trunk/ctakes-temporal/scripts/
ctakes/trunk/ctakes-temporal/scripts/keras/
ctakes/trunk/ctakes-temporal/scripts/keras/classify.sh (with props)
ctakes/trunk/ctakes-temporal/scripts/keras/cleartk_io.py
ctakes/trunk/ctakes-temporal/scripts/keras/cleartk_io.pyc (with props)
ctakes/trunk/ctakes-temporal/scripts/keras/dima-predict.py
ctakes/trunk/ctakes-temporal/scripts/keras/et_cleartk_io.py
ctakes/trunk/ctakes-temporal/scripts/keras/et_cleartk_io.pyc (with props)
ctakes/trunk/ctakes-temporal/scripts/keras/et_dimaCNN_train-and-package.py
ctakes/trunk/ctakes-temporal/scripts/keras/et_neural_testCNN.py
ctakes/trunk/ctakes-temporal/scripts/keras/et_neural_train-and-package.py
ctakes/trunk/ctakes-temporal/scripts/keras/keras-predict.py
ctakes/trunk/ctakes-temporal/scripts/keras/nn_models.py
ctakes/trunk/ctakes-temporal/scripts/keras/nn_models.pyc (with props)
ctakes/trunk/ctakes-temporal/scripts/keras/train.sh (with props)
Added: ctakes/trunk/ctakes-temporal/scripts/keras/classify.sh
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/keras/classify.sh?rev=1753408&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/keras/classify.sh (added)
+++ ctakes/trunk/ctakes-temporal/scripts/keras/classify.sh Tue Jul 19 17:19:55 2016
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# Run the Keras CNN classifier (dima-predict.py) inside the virtualenv that
+# lives next to this script, forwarding every command-line argument.
+
+script_dir="$(dirname "$0")"
+
+source "$script_dir/env/bin/activate"
+
+# "$@" (rather than $*) keeps arguments that contain spaces intact.
+python "$script_dir/dima-predict.py" "$@"
+
+# Save the classifier's exit status before `deactivate` overwrites $?.
+ret=$?
+
+deactivate
+
+exit $ret
Propchange: ctakes/trunk/ctakes-temporal/scripts/keras/classify.sh
------------------------------------------------------------------------------
svn:executable = *
Added: ctakes/trunk/ctakes-temporal/scripts/keras/cleartk_io.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/keras/cleartk_io.py?rev=1753408&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/keras/cleartk_io.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/keras/cleartk_io.py Tue Jul 19 17:19:55 2016
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+
+import numpy as np
+import os, os.path
+import subprocess
+
+
+def string_label_to_label_vector(label_string, outcome_maps):
+    """Translate a multi-task label string into a list of class indices.
+
+    label_string holds '#'-separated "task=value" pairs. Each value is looked
+    up in outcome_maps[task] and the resulting indices are returned in the
+    order in which the pairs appear.
+    """
+    pairs = (pair.split('=') for pair in label_string.split('#'))
+    return [outcome_maps[task][value] for (task, value) in pairs]
+
+def get_data_dimensions(data_file):
+    """Return (number of lines, largest feature index) of a liblinear file.
+
+    Every line is "<label> <index>:<value> ...". Lines are counted while the
+    file is read, so no external `wc` process is needed and a final line
+    without a trailing newline is still counted. Only the last token of a
+    line is inspected for the feature index, because the liblinear format
+    lists indices in ascending order.
+    """
+    file_len = 0
+    num_feats = 0
+
+    with open(data_file) as data:
+        for line in data:
+            file_len += 1
+            fields = line.rstrip().split(' ')
+            ## A line holding only a label has no features to measure.
+            if len(fields) > 1:
+                max_dim = int( fields[-1].split(':')[0] )
+                if max_dim > num_feats:
+                    num_feats = max_dim
+
+    return (file_len, num_feats)
+
+def flatten_outputs(Y):
+    """Expand a matrix of per-task class indices into network output columns.
+
+    A column of Y whose maximum is 1 (binary task) keeps a single output
+    column and its values are copied. A column whose maximum is m > 1 is
+    expanded into m + 1 columns with a 1 at the position given by the value.
+    Returns (Y_adj, indices), where indices[k] is the first output column of
+    task k and indices[-1] is the total width. Raises Exception when a column
+    has no value above zero.
+    """
+    maxes = Y.max(0)
+
+    ## Running column offsets: one slot per binary task, max+1 per n-ary task.
+    indices = [0]
+    for col_max in maxes:
+        if col_max == 1:
+            width = 1
+        elif col_max > 1:
+            width = int(col_max) + 1
+        else:
+            raise Exception("There is a column with all zeros!")
+        indices.append(indices[-1] + width)
+
+    Y_adj = np.zeros( (Y.shape[0], indices[-1]) )
+    for col_ind, col_max in enumerate(maxes):
+        offset = int(indices[col_ind])
+        if col_max == 1:
+            ## Binary task: copy the 0/1 value straight across.
+            Y_adj[:, offset] = Y[:, col_ind]
+        else:
+            ## n-ary task: the value selects which of the task's columns is set.
+            for row_ind in range(0, Y.shape[0]):
+                Y_adj[row_ind][ offset + int(Y[row_ind][col_ind]) ] = 1
+
+    return Y_adj, indices
+
+def read_outcome_maps(dirname):
+    """Read outcome-lookup.txt from dirname and derive per-task label maps.
+
+    Each line of the file is "<index> <label>", where label is a
+    '#'-separated list of "task=value" pairs. Returns a tuple of:
+      raw_outcomes -- list of label strings, with None at position 0 so that
+                      the first label of the file sits at index 1
+      derived_maps -- {task: {value: class index}}, indices assigned in order
+                      of first appearance
+      lookup_map   -- {task: [value, ...]}, the inverse of derived_maps
+    """
+    raw_outcomes = [None]
+    derived_maps = {}
+    lookup_map = {}
+
+    with open(os.path.join(dirname, 'outcome-lookup.txt')) as outcome_file:
+        for line in outcome_file:
+            (_, label) = line.rstrip().split(' ')
+            raw_outcomes.append(label)
+
+            for task_label in label.split('#'):
+                (task, val) = task_label.rstrip().split("=")
+                cur_map = derived_maps.setdefault(task, {})
+                lookup = lookup_map.setdefault(task, [])
+                if val not in cur_map:
+                    cur_map[val] = len(cur_map)
+                    lookup.append(val)
+
+    return raw_outcomes, derived_maps, lookup_map
+
+def outcome_list(raw_outcomes):
+    """Return the task names found in the first real outcome string.
+
+    raw_outcomes[1] is split on '#' and the part before '=' of every piece is
+    collected, in order.
+    """
+    return [pair.split("=")[0] for pair in raw_outcomes[1].split("#")]
+
+def read_multitask_liblinear(dirname):
+    """Load the multi-task training data stored in dirname.
+
+    Reads outcome-lookup.txt (through read_outcome_maps) and
+    training-data.liblinear. Returns (label_matrix, feat_matrix):
+    label_matrix has one row per instance and one column per task, holding
+    the class index of that task; feat_matrix has one row per instance and
+    one column per feature index.
+    """
+    raw_outcomes, derived_maps, _ = read_outcome_maps(dirname)
+
+    data_file = os.path.join(dirname, 'training-data.liblinear')
+
+    (data_points, feat_dims) = get_data_dimensions(data_file)
+
+    label_dims = len(derived_maps)
+
+    label_matrix = np.zeros( (data_points, label_dims) )
+    feat_matrix = np.zeros( (data_points, feat_dims) )
+
+    with open( data_file ) as data:
+        for line_ind, line in enumerate(data):
+            label_and_feats = line.rstrip().split(' ')
+
+            ## The leading number indexes raw_outcomes; unpack it per task.
+            string_label = raw_outcomes[int(label_and_feats[0])]
+            label_vec = string_label_to_label_vector(string_label, derived_maps)
+            for ind, val in enumerate(label_vec):
+                label_matrix[line_ind, ind] = val
+
+            ## Everything after the label is an index:value feature; no
+            ## feature is dropped here.
+            feat_matrix[line_ind, :] = feature_array_to_list( label_and_feats[1:], feat_dims )
+
+    return label_matrix, feat_matrix
+
+def convert_multi_output_to_string(outcomes, outcome_list, lookup_map, raw_outcomes):
+    """Return the "task=value#task=value" string for a set of outputs.
+
+    outcomes[i] is the class index predicted for task outcome_list[i];
+    lookup_map[task] translates that index back to its string value.
+    raw_outcomes is accepted for interface compatibility and not used.
+    An empty outcome_list yields an empty string.
+    """
+    return "#".join(
+        "%s=%s" % (label, lookup_map[label][outcomes[ind]])
+        for ind, label in enumerate(outcome_list))
+
+def feature_string_to_list( feat_string, length=-1 ):
+ """Split feat_string on single spaces and pass the resulting
+ "index:value" tokens, together with length, to feature_array_to_list."""
+ return feature_array_to_list( feat_string.split(' '), length )
+
+def feature_array_to_list( feats, length=-1 ):
+    """Turn sparse "index:value" tokens into a dense list of floats.
+
+    feats  -- iterable of "index:value" strings with 1-based indices; empty
+              tokens (e.g. from doubled spaces) are ignored
+    length -- size of the returned list; -1 means "just large enough for the
+              highest index present"
+
+    Values are converted to float so the result can be handed directly to
+    numpy. Raises Exception for an index below 1 or beyond length.
+    """
+    parsed = []
+    for feat in feats:
+        if not feat:
+            continue
+        (ind, val) = feat.split(':')
+        ## Feature indices in the file start at 1, list positions at 0.
+        parsed.append( (int(ind) - 1, float(val)) )
+
+    if length == -1:
+        length = max([ind for (ind, _) in parsed] or [-1]) + 1
+
+    f = [0.0] * length
+
+    for (ind, val) in parsed:
+        if ind < 0:
+            ## Without this check index 0 would silently overwrite f[-1].
+            raise Exception("Feature index %d is invalid -- feature indices start at 1." % (ind + 1))
+        if ind >= length:
+            raise Exception("Feature index %d is larger than feature vector length %d -- you may need to specify the expected length of the vector." % (ind, length) )
+        f[ind] = val
+
+    return f
+
+if __name__ == "__main__":
+ ## Manual smoke test: load the hard-coded sample directory and print one
+ ## feature value of the first instance.
+ (labels, feats) = read_multitask_liblinear('data_testing/multitask_assertion/train_and_test/')
+ print("train[0][100] = %f" % feats[0][100])
Added: ctakes/trunk/ctakes-temporal/scripts/keras/cleartk_io.pyc
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/keras/cleartk_io.pyc?rev=1753408&view=auto
==============================================================================
Binary file - no diff available.
Propchange: ctakes/trunk/ctakes-temporal/scripts/keras/cleartk_io.pyc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: ctakes/trunk/ctakes-temporal/scripts/keras/dima-predict.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/keras/dima-predict.py?rev=1753408&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/keras/dima-predict.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/keras/dima-predict.py Tue Jul 19 17:19:55 2016
@@ -0,0 +1,76 @@
+#!python
+
+from keras.models import Sequential, model_from_json
+import numpy as np
+import et_cleartk_io as ctk_io
+import sys
+import os.path
+import pickle
+from keras.preprocessing.sequence import pad_sequences
+
+def main(args):
+    """Classify token sequences read from stdin with the trained CNN.
+
+    args[0] is the model directory. It must contain maxlen.p, alphabet.p,
+    model_0.json and model_0.h5; they are now read from this directory
+    instead of a developer-specific absolute path.
+
+    Every non-empty stdin line is a whitespace-separated token sequence.
+    Tokens are mapped to indices through the pickled alphabet (tokens missing
+    from it use the "none" entry), cut to the first maxlen tokens and padded.
+    The predicted label is printed on its own line and stdout is flushed
+    after each prediction. The loop stops at an empty line, at end of input
+    or on Ctrl-C, and the process exits with status 0.
+    """
+    if len(args) < 1:
+        sys.stderr.write("Error - one required argument: <model directory>\n")
+        sys.exit(-1)
+
+    working_dir = args[0]
+
+    int2label = {
+        0:'-NONE-',
+        1:'CONTAINS',
+        2:'CONTAINS-1'
+    }
+
+    ## Load models and weights.
+    ## NOTE(review): pickle.load executes arbitrary code from the file, so
+    ## the model directory must come from a trusted source.
+    with open(os.path.join(working_dir, "maxlen.p"), "rb") as maxlen_file:
+        maxlen = pickle.load(maxlen_file)
+    with open(os.path.join(working_dir, "alphabet.p"), "rb") as alphabet_file:
+        alphabet = pickle.load(alphabet_file)
+    with open(os.path.join(working_dir, "model_0.json")) as model_file:
+        model = model_from_json(model_file.read())
+    model.load_weights(os.path.join(working_dir, "model_0.h5"))
+
+    while True:
+        try:
+            line = sys.stdin.readline().rstrip()
+            if not line:
+                break
+
+            ## Convert the line of Strings to lists of indices
+            ## (`in` replaces dict.has_key, which exists on Python 2 only).
+            feats = [alphabet[unigram] if unigram in alphabet else alphabet["none"]
+                     for unigram in line.split()]
+            ## Truncate explicitly so that the first maxlen tokens are kept.
+            feats = feats[0:maxlen]
+            test_x = pad_sequences([feats], maxlen=maxlen)
+
+            ## The same input is supplied three times, once per model input.
+            X_dup = [test_x, test_x, test_x]
+
+            out = model.predict(X_dup, batch_size=50)[0]
+        except KeyboardInterrupt:
+            sys.stderr.write("Caught keyboard interrupt\n")
+            break
+
+        print(int2label[out.argmax()])
+        sys.stdout.flush()
+
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
Added: ctakes/trunk/ctakes-temporal/scripts/keras/et_cleartk_io.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/keras/et_cleartk_io.py?rev=1753408&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/keras/et_cleartk_io.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/keras/et_cleartk_io.py Tue Jul 19 17:19:55 2016
@@ -0,0 +1,207 @@
+#!/usr/bin/env python
+
+import numpy as np
+import os, os.path
+import subprocess
+
+
+def string_label_to_label_vector(label_string, outcome_maps):
+    """Map "task=value#task=value" to the list of class indices it names.
+
+    For every '#'-separated pair the value is looked up in
+    outcome_maps[task]; the indices are returned in pair order.
+    """
+    label_vec = []
+    for pair in label_string.split('#'):
+        (task, value) = pair.split('=')
+        label_vec.append(outcome_maps[task][value])
+    return label_vec
+
+def get_data_dimensions(data_file):
+    """Return (number of lines, largest feature index) of a liblinear file.
+
+    Every line is "<label> <index>:<value> ...". Lines are counted while the
+    file is read, so no external `wc` process is needed and a final line
+    without a trailing newline is still counted. Only the last token of a
+    line is inspected for the feature index, because the liblinear format
+    lists indices in ascending order.
+    """
+    file_len = 0
+    num_feats = 0
+
+    with open(data_file) as data:
+        for line in data:
+            file_len += 1
+            fields = line.rstrip().split(' ')
+            ## A line holding only a label has no features to measure.
+            if len(fields) > 1:
+                max_dim = int( fields[-1].split(':')[0] )
+                if max_dim > num_feats:
+                    num_feats = max_dim
+
+    return (file_len, num_feats)
+
+def flatten_outputs(Y):
+    """Expand a matrix of per-task class indices into network output columns.
+
+    A column of Y whose maximum is 1 (binary task) keeps a single output
+    column and its values are copied. A column whose maximum is m > 1 is
+    expanded into m + 1 columns with a 1 at the position given by the value.
+    Returns (Y_adj, indices), where indices[k] is the first output column of
+    task k and indices[-1] is the total width. Raises Exception when a column
+    has no value above zero.
+    """
+    maxes = Y.max(0)
+
+    ## Running column offsets: one slot per binary task, max+1 per n-ary task.
+    indices = [0]
+    for col_max in maxes:
+        if col_max == 1:
+            width = 1
+        elif col_max > 1:
+            width = int(col_max) + 1
+        else:
+            raise Exception("There is a column with all zeros!")
+        indices.append(indices[-1] + width)
+
+    Y_adj = np.zeros( (Y.shape[0], indices[-1]) )
+    for col_ind, col_max in enumerate(maxes):
+        offset = int(indices[col_ind])
+        if col_max == 1:
+            ## Binary task: copy the 0/1 value straight across.
+            Y_adj[:, offset] = Y[:, col_ind]
+        else:
+            ## n-ary task: the value selects which of the task's columns is set.
+            for row_ind in range(0, Y.shape[0]):
+                Y_adj[row_ind][ offset + int(Y[row_ind][col_ind]) ] = 1
+
+    return Y_adj, indices
+
+def read_liblinear(dirname):
+    """Load training-data.liblinear from dirname as one-hot labels and features.
+
+    Returns (label_matrix, feat_matrix). The class numbers in the file are
+    1-based; they are shifted to 0-based and one-hot encoded, so label_matrix
+    has (highest label) columns and exactly one 1 per row. feat_matrix has
+    one row per instance and one column per feature index.
+    """
+    data_file = os.path.join(dirname, 'training-data.liblinear')
+
+    (data_points, feat_dims) = get_data_dimensions(data_file)
+
+    ## np.int was only an alias of the builtin int and no longer exists in
+    ## recent NumPy releases.
+    label_array = np.zeros( (data_points, 1), dtype=int )
+    feat_matrix = np.zeros( (data_points, feat_dims) )
+
+    with open( data_file ) as data:
+        for line_ind, line in enumerate(data):
+            label_and_feats = line.rstrip().split(' ')
+
+            label_array[line_ind] = float(label_and_feats[0]) - 1
+
+            ## Everything after the label is an index:value feature.
+            feat_matrix[line_ind, :] = feature_array_to_list( label_and_feats[1:], feat_dims )
+
+    label_matrix = np.zeros( (data_points, label_array.max()+1) )
+
+    ## Index with (row, class) pairs. The previous label_matrix[ind, val],
+    ## where ind was the (row, 0) tuple from ndenumerate, was read by numpy
+    ## as "rows `row` and 0" and so also set row 0 for every class seen.
+    label_matrix[np.arange(data_points), label_array[:, 0]] = 1
+
+    return label_matrix, feat_matrix
+
+def read_outcome_maps(dirname):
+    """Return the outcome labels listed in dirname/outcome-lookup.txt.
+
+    Each line of the file is "<index> <label>". The result is a list of the
+    label strings in file order with None at position 0, so that the first
+    label of the file sits at index 1. Unlike the cleartk_io.py function of
+    the same name, only this list is returned.
+    """
+    raw_outcomes = [None]
+
+    with open(os.path.join(dirname, 'outcome-lookup.txt')) as outcome_file:
+        for line in outcome_file:
+            (_, label) = line.rstrip().split(' ')
+            raw_outcomes.append(label)
+
+    return raw_outcomes
+
+def outcome_list(raw_outcomes):
+    """Return the task names found in the first real outcome string.
+
+    raw_outcomes[1] is split on '#' and the part before '=' of every piece is
+    collected, in order.
+    """
+    return [pair.split("=")[0] for pair in raw_outcomes[1].split("#")]
+
+def read_multitask_liblinear(dirname):
+    """Load training-data.liblinear from dirname with raw numeric labels.
+
+    Single-task variant: the outcome lookup file is not consulted. Returns
+    (label_matrix, feat_matrix), where label_matrix has one column holding
+    the class number exactly as written in the file and feat_matrix has one
+    row per instance and one column per feature index.
+    """
+    data_file = os.path.join(dirname, 'training-data.liblinear')
+
+    (data_points, feat_dims) = get_data_dimensions(data_file)
+
+    label_dims = 1
+
+    label_matrix = np.zeros( (data_points, label_dims) )
+    feat_matrix = np.zeros( (data_points, feat_dims) )
+
+    with open( data_file ) as data:
+        for line_ind, line in enumerate(data):
+            label_and_feats = line.rstrip().split(' ')
+
+            ## Convert explicitly instead of relying on numpy to parse the string.
+            label_matrix[line_ind, 0] = float(label_and_feats[0])
+
+            ## Everything after the label is an index:value feature; no
+            ## feature is dropped here.
+            feat_matrix[line_ind, :] = feature_array_to_list( label_and_feats[1:], feat_dims )
+
+    return label_matrix, feat_matrix
+
+def convert_multi_output_to_string(outcomes, outcome_list, lookup_map, raw_outcomes):
+    """Return the "task=value#task=value" string for a set of outputs.
+
+    outcomes[i] is the class index predicted for task outcome_list[i];
+    lookup_map[task] translates that index back to its string value.
+    raw_outcomes is accepted for interface compatibility and not used.
+    An empty outcome_list yields an empty string.
+    """
+    return "#".join(
+        "%s=%s" % (label, lookup_map[label][outcomes[ind]])
+        for ind, label in enumerate(outcome_list))
+
+def get_outcome_array(working_dir):
+    """Return the labels of working_dir/outcome-lookup.txt in file order.
+
+    Each line must be "<index> <label>"; only the label is kept, so the
+    label written on the first line ends up at position 0.
+    """
+    labels = []
+
+    with open(os.path.join(working_dir, "outcome-lookup.txt")) as outcome_file:
+        for line in outcome_file:
+            (_, val) = line.rstrip().split(" ")
+            labels.append(val)
+
+    return labels
+
+def feature_string_to_list( feat_string, length=-1 ):
+ """Split feat_string on single spaces and pass the resulting
+ "index:value" tokens, together with length, to feature_array_to_list."""
+ return feature_array_to_list( feat_string.split(' '), length )
+
+def feature_array_to_list( feats, length=-1 ):
+    """Turn sparse "index:value" tokens into a dense list of floats.
+
+    feats  -- iterable of "index:value" strings with 1-based indices; empty
+              tokens (e.g. from doubled spaces) are ignored
+    length -- size of the returned list; -1 means "just large enough for the
+              highest index present"
+
+    Values are converted to float so the result can be handed directly to
+    numpy. Raises Exception for an index below 1 or beyond length.
+    """
+    parsed = []
+    for feat in feats:
+        if not feat:
+            continue
+        (ind, val) = feat.split(':')
+        ## Feature indices in the file start at 1, list positions at 0.
+        parsed.append( (int(ind) - 1, float(val)) )
+
+    if length == -1:
+        length = max([ind for (ind, _) in parsed] or [-1]) + 1
+
+    f = [0.0] * length
+
+    for (ind, val) in parsed:
+        if ind < 0:
+            ## Without this check index 0 would silently overwrite f[-1].
+            raise Exception("Feature index %d is invalid -- feature indices start at 1." % (ind + 1))
+        if ind >= length:
+            raise Exception("Feature index %d is larger than feature vector length %d -- you may need to specify the expected length of the vector." % (ind, length) )
+        f[ind] = val
+
+    return f
+
+if __name__ == "__main__":
+ ## Manual smoke test: load the hard-coded event-time directory and print
+ ## one feature value of the first instance.
+ (labels, feats) = read_multitask_liblinear('target/eval/thyme/train_and_test/event-time/')
+ print("train[0][100] = %f" % feats[0][100])
Added: ctakes/trunk/ctakes-temporal/scripts/keras/et_cleartk_io.pyc
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/keras/et_cleartk_io.pyc?rev=1753408&view=auto
==============================================================================
Binary file - no diff available.
Propchange: ctakes/trunk/ctakes-temporal/scripts/keras/et_cleartk_io.pyc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: ctakes/trunk/ctakes-temporal/scripts/keras/et_dimaCNN_train-and-package.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/keras/et_dimaCNN_train-and-package.py?rev=1753408&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/keras/et_dimaCNN_train-and-package.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/keras/et_dimaCNN_train-and-package.py Tue Jul 19 17:19:55 2016
@@ -0,0 +1,117 @@
+#!/usr/bin/env python
+
+import sklearn as sk
+
+import numpy as np
+np.random.seed(1337)
+
+import et_cleartk_io as ctk_io
+import nn_models
+
+import sys
+import os.path
+
+import dataset
+
+import keras as k
+from keras.utils.np_utils import to_categorical
+from keras.optimizers import RMSprop
+from keras.preprocessing.sequence import pad_sequences
+from keras.models import Sequential
+from keras.layers import Merge
+from keras.layers.core import Dense, Dropout, Activation, Flatten
+from keras.layers.convolutional import Convolution1D, MaxPooling1D
+from keras.layers.embeddings import Embedding
+
+import pickle
+
+def main(args):
+    """Train the three-branch CNN on <data directory> and store it there.
+
+    args[0] is the data directory; it must contain training-data.liblinear.
+    The token alphabet is learned from that file, the sequences are padded to
+    the longest training sequence, and a model made of three convolutional
+    branches (filter lengths 3, 4 and 5) is trained for 3 epochs.
+
+    Files written to the data directory: maxlen.p and alphabet.p (pickles),
+    model_0.json (architecture) and model_0.h5 (weights). The process exits
+    with status 0 when training is finished.
+    """
+    if len(args) < 1:
+        sys.stderr.write("Error - one required argument: <data directory>\n")
+        sys.exit(-1)
+
+    working_dir = args[0]
+
+    data_file = os.path.join(working_dir, 'training-data.liblinear')
+
+    # learn alphabet from the training data
+    dataset1 = dataset.DatasetProvider([data_file])
+    # now load training examples and labels
+    train_x, train_y = dataset1.load(data_file)
+
+    init_vectors = None #used for pre-trained embeddings
+
+    # turn x and y into numpy array among other things
+    maxlen = max([len(seq) for seq in train_x])
+    outcomes = set(train_y)
+    # NOTE(review): to_categorical below presumably needs the labels to be
+    # 0 .. classes-1 -- confirm against dataset.DatasetProvider.
+    classes = len(outcomes)
+
+    train_x = pad_sequences(train_x, maxlen=maxlen)
+    train_y = to_categorical(np.array(train_y), classes)
+
+    with open(os.path.join(working_dir, 'maxlen.p'), "wb") as maxlen_file:
+        pickle.dump(maxlen, maxlen_file)
+    with open(os.path.join(working_dir, 'alphabet.p'), "wb") as alphabet_file:
+        pickle.dump(dataset1.alphabet, alphabet_file)
+
+    # %-formatting prints the same text on Python 2 and Python 3.
+    print("train_x shape: %s" % (train_x.shape,))
+    print("train_y shape: %s" % (train_y.shape,))
+
+    branches = [] # models to be merged
+    train_xs = [] # train x for each branch
+
+    for filter_len in (3, 4, 5):
+        branch = Sequential()
+        branch.add(Embedding(len(dataset1.alphabet),
+                             300,
+                             input_length=maxlen,
+                             weights=init_vectors))
+        branch.add(Convolution1D(nb_filter=200,
+                                 filter_length=filter_len,
+                                 border_mode='valid',
+                                 activation='relu',
+                                 subsample_length=1))
+        branch.add(MaxPooling1D(pool_length=2))
+        branch.add(Flatten())
+
+        branches.append(branch)
+        # every branch is fed the same padded input
+        train_xs.append(train_x)
+
+    model = Sequential()
+    model.add(Merge(branches, mode='concat'))
+
+    model.add(Dense(250))
+    model.add(Dropout(0.25))
+    model.add(Activation('relu'))
+
+    model.add(Dropout(0.25))
+    model.add(Dense(classes))
+    model.add(Activation('softmax'))
+
+    optimizer = RMSprop(lr=0.0001,
+                        rho=0.9, epsilon=1e-08)
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=optimizer,
+                  metrics=['accuracy'])
+    model.fit(train_xs,
+              train_y,
+              nb_epoch=3,
+              batch_size=50,
+              verbose=1,
+              validation_split=0.1,
+              class_weight=None)
+
+    model.summary()
+
+    with open(os.path.join(working_dir, 'model_0.json'), 'w') as model_file:
+        model_file.write(model.to_json())
+    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)
+    sys.exit(0)
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
\ No newline at end of file
Added: ctakes/trunk/ctakes-temporal/scripts/keras/et_neural_testCNN.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/keras/et_neural_testCNN.py?rev=1753408&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/keras/et_neural_testCNN.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/keras/et_neural_testCNN.py Tue Jul 19 17:19:55 2016
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Activation
+from keras.optimizers import SGD
+from keras.utils import np_utils
+#from sklearn.datasets import load_svmlight_file
+import sklearn as sk
+import sklearn.cross_validation
+import numpy as np
+import et_cleartk_io as ctk_io
+import nn_models
+import sys
+import os.path
+
+## Training settings passed to model.fit() in main().
+batch_size = 64
+nb_epoch = 10
+## NOTE(review): `layers` is not referenced anywhere else in this script.
+layers = (256, 256, 256)
+
+def main(args):
+    """Train the three-branch CNN on liblinear features and store it.
+
+    args[0] is the data directory read by ctk_io.read_liblinear. Each
+    feature vector is reshaped into 6 rows of dimension // 6 columns, the
+    model from nn_models.get_dima_cnn_model is fitted for nb_epoch epochs,
+    and model_0.json / model_0.h5 are written to the data directory. The
+    process exits with status 0 afterwards.
+    """
+    if len(args) < 1:
+        sys.stderr.write("Error - one required argument: <data directory>\n")
+        sys.exit(-1)
+
+    working_dir = args[0]
+
+    Y, X = ctk_io.read_liblinear(working_dir)
+
+    num_outputs = Y.shape[-1]
+    num_examples, dimension = X.shape
+    assert num_examples == Y.shape[0]
+
+    ## Floor division: a float from `/` is not a valid shape on Python 3.
+    X = np.reshape(X, (num_examples, 6, dimension // 6))
+
+    ## The same input is supplied three times, once per branch of the model.
+    X_dup = [X, X, X]
+
+    ## get_dima_cnn_model returns (model, branches); only the model is
+    ## needed here.
+    model, _ = nn_models.get_dima_cnn_model(dimension, num_outputs)
+    model.fit(X_dup, Y,
+              nb_epoch=nb_epoch,
+              batch_size=batch_size,
+              verbose=1)
+
+    model.summary()
+
+    with open(os.path.join(working_dir, 'model_0.json'), 'w') as model_file:
+        model_file.write(model.to_json())
+    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)
+    sys.exit(0)
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
\ No newline at end of file
Added: ctakes/trunk/ctakes-temporal/scripts/keras/et_neural_train-and-package.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/keras/et_neural_train-and-package.py?rev=1753408&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/keras/et_neural_train-and-package.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/keras/et_neural_train-and-package.py Tue Jul 19 17:19:55 2016
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Activation
+from keras.optimizers import SGD
+from keras.utils import np_utils
+#from sklearn.datasets import load_svmlight_file
+import sklearn as sk
+import sklearn.cross_validation
+import numpy as np
+import et_cleartk_io as ctk_io
+import nn_models
+import sys
+import os.path
+
+## Training settings passed to both model.fit() calls in main().
+batch_size = 64
+nb_epoch = 5
+## NOTE(review): `layers` is not referenced anywhere else in this script.
+layers = (256, 256, 256)
+
+def main(args):
+    """Train the three-branch CNN in two passes and store it.
+
+    args[0] is the data directory read by ctk_io.read_liblinear. Each
+    feature vector is reshaped into 6 rows of dimension // 6 columns. The
+    model from nn_models.get_dima_cnn_model is fitted for nb_epoch epochs,
+    the branches are then marked non-trainable and the model is fitted for
+    another nb_epoch epochs. model_0.json / model_0.h5 are written to the
+    data directory and the process exits with status 0.
+    """
+    if len(args) < 1:
+        sys.stderr.write("Error - one required argument: <data directory>\n")
+        sys.exit(-1)
+
+    working_dir = args[0]
+
+    Y, X = ctk_io.read_liblinear(working_dir)
+
+    num_outputs = Y.shape[-1]
+    num_examples, dimension = X.shape
+    assert num_examples == Y.shape[0]
+
+    ## Floor division: a float from `/` is not a valid shape on Python 3.
+    X = np.reshape(X, (num_examples, 6, dimension // 6))
+
+    ## The same input is supplied three times, once per branch of the model.
+    X_dup = [X, X, X]
+
+    model, branches = nn_models.get_dima_cnn_model(dimension, num_outputs)
+    model.fit(X_dup, Y,
+              nb_epoch=nb_epoch,
+              batch_size=batch_size,
+              verbose=1)
+
+    ## NOTE(review): Keras documents that a change of `trainable` only takes
+    ## effect after compile() is called again; the model is not recompiled
+    ## here, so the second pass may still update the branches -- confirm.
+    for b in branches:
+        b.trainable = False
+
+    model.fit(X_dup, Y,
+              nb_epoch=nb_epoch,
+              batch_size=batch_size,
+              verbose=1)
+
+    model.summary()
+
+    with open(os.path.join(working_dir, 'model_0.json'), 'w') as model_file:
+        model_file.write(model.to_json())
+    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)
+    sys.exit(0)
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
\ No newline at end of file
Added: ctakes/trunk/ctakes-temporal/scripts/keras/keras-predict.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/keras/keras-predict.py?rev=1753408&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/keras/keras-predict.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/keras/keras-predict.py Tue Jul 19 17:19:55 2016
@@ -0,0 +1,64 @@
+#!python
+
+from keras.models import Sequential, model_from_json
+import numpy as np
+import et_cleartk_io as ctk_io
+import sys
+import os.path
+
+
+def main(args):
+    """Classify liblinear-style feature lines read from stdin.
+
+    args[0] is the model directory; it must contain outcome-lookup.txt,
+    model_0.json and model_0.h5. Every non-empty stdin line is a
+    space-separated list of "index:value" features. It is expanded to a
+    dense vector of input_dims values, reshaped to (1, 6, input_dims // 6)
+    and passed to the model. The predicted outcome label is printed and
+    stdout is flushed after each prediction. The loop stops at an empty
+    line, at end of input or on Ctrl-C, and the process exits with status 0.
+    """
+    if len(args) < 1:
+        sys.stderr.write("Error - one required argument: <model directory>\n")
+        sys.exit(-1)
+
+    working_dir = args[0]
+
+    ## Load models and weights:
+    outcomes = ctk_io.get_outcome_array(working_dir)
+    with open(os.path.join(working_dir, "model_0.json")) as model_file:
+        model = model_from_json(model_file.read())
+    model.load_weights(os.path.join(working_dir, "model_0.h5"))
+
+    ## NOTE(review): hard-coded feature count; presumably it has to match the
+    ## data the model was trained on -- confirm.
+    input_dims = 1200
+
+    while True:
+        try:
+            line = sys.stdin.readline().rstrip()
+            if not line:
+                break
+
+            ## Convert the line into a feature vector and pass to model.
+            feat_list = ctk_io.feature_string_to_list(line, input_dims)
+            ## dtype=float: the list may mix numbers with numeric strings,
+            ## which would otherwise produce an array of strings.
+            feats = np.array(feat_list, dtype=float)
+            ## Floor division: a float from `/` is not a valid shape on Python 3.
+            feats = np.reshape(feats, (1, 6, input_dims // 6))
+
+            ## The same input is supplied three times, once per model input.
+            X_dup = [feats, feats, feats]
+
+            out = model.predict(X_dup, batch_size=1, verbose=0)[0]
+        except KeyboardInterrupt:
+            sys.stderr.write("Caught keyboard interrupt\n")
+            break
+
+        out_str = outcomes[out.argmax()]
+
+        ## NOTE(review): the explicit "\n" makes every label be followed by
+        ## an empty line; kept as is in case the reader expects it -- confirm.
+        print(out_str + "\n")
+        sys.stdout.flush()
+
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
Added: ctakes/trunk/ctakes-temporal/scripts/keras/nn_models.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/keras/nn_models.py?rev=1753408&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/keras/nn_models.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/keras/nn_models.py Tue Jul 19 17:19:55 2016
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Activation, Convolution1D, MaxPooling1D, Lambda, Flatten, Merge
+from keras.optimizers import SGD
+from keras import backend as K
+from keras.optimizers import RMSprop
+
+def get_mlp_model(dimension, num_outputs, layers=(64, 256, 64) ):
+    """Build and compile a multi-layer perceptron with two hidden layers.
+
+    dimension   -- number of input features
+    num_outputs -- 1 selects a single sigmoid unit trained with binary
+                   cross-entropy; any other value selects a softmax layer of
+                   that size trained with categorical cross-entropy
+    layers      -- hidden layer widths; only layers[0] and layers[1] are used
+
+    Each hidden layer is followed by ReLU and 0.5 dropout. The optimizer
+    comes from get_mlp_optimizer().
+    """
+    model = Sequential()
+    sgd = get_mlp_optimizer()
+
+    drop = 0.5
+
+    ## First hidden layer; it also declares the input size.
+    model.add(Dense(layers[0], input_dim=dimension, init='uniform'))
+    model.add(Activation('relu'))
+    model.add(Dropout(drop))
+
+    ## Second hidden layer.
+    model.add(Dense(layers[1], init='uniform'))
+    model.add(Activation('relu'))
+    model.add(Dropout(drop))
+
+    ## Output layer and loss depend on whether the task is binary.
+    binary = (num_outputs == 1)
+    model.add(Dense(1 if binary else num_outputs, init='uniform'))
+    model.add(Activation('sigmoid' if binary else 'softmax'))
+    model.compile(loss='binary_crossentropy' if binary else 'categorical_crossentropy',
+                  optimizer=sgd,
+                  metrics=['accuracy'])
+
+    return model
+
+def get_mlp_optimizer():
+    """Return the SGD optimizer (Nesterov momentum 0.9, lr 0.1, decay 1e-6)
+    used by get_mlp_model and get_cnn_model."""
+    optimizer = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
+    return optimizer
+
+def get_cnn_model(dimension, num_outputs, nb_filter = 200, layers=(64, 64, 256) ):
+    """Build and compile a one-layer CNN with max-over-time pooling.
+
+    The network is Convolution1D(nb_filter, 3) over inputs of shape (6, 200),
+    a maximum over axis 1, one hidden layer of layers[1] units with ReLU and
+    0.5 dropout, and an output layer.
+
+    num_outputs -- 1 selects a single sigmoid unit trained with binary
+                   cross-entropy; any other value selects a softmax layer of
+                   that size trained with categorical cross-entropy
+
+    NOTE(review): `dimension`, layers[0] and layers[2] are not used; the
+    input shape is fixed at (6, 200).
+    """
+    def max_1d(X):
+        return K.max(X, axis=1)
+
+    model = Sequential()
+    sgd = get_mlp_optimizer()
+
+    ## Convolutional layers:
+    model.add(Convolution1D(nb_filter, 3, input_shape=(6,200)))
+    model.add(Lambda(max_1d, output_shape=(nb_filter,)))
+
+    ## Hidden layer.
+    model.add(Dense(layers[1], init='uniform'))
+    model.add(Activation('relu'))
+    model.add(Dropout(0.5))
+
+    ## Output layer and loss depend on whether the task is binary.
+    binary = (num_outputs == 1)
+    model.add(Dense(1 if binary else num_outputs, init='uniform'))
+    model.add(Activation('sigmoid' if binary else 'softmax'))
+    model.compile(loss='binary_crossentropy' if binary else 'categorical_crossentropy',
+                  optimizer=sgd,
+                  metrics=['accuracy'])
+
+    return model
+
+def get_dima_cnn_model(dimension, num_outputs):
+    """Build and compile the three-branch CNN.
+
+    dimension   -- length of the flat feature vector; the callers in this
+                   commit reshape it into 6 rows, so every branch takes
+                   inputs of shape (6, dimension // 6). For dimension 1200
+                   this is the (6, 200) shape that used to be hard-coded.
+    num_outputs -- size of the softmax output layer
+
+    Each branch is Convolution1D (200 filters, filter length 3, 4 or 5,
+    ReLU) -> MaxPooling1D(2) -> Flatten. The branches are concatenated and
+    followed by Dense(250), dropout, ReLU, dropout and the softmax layer.
+    The model is compiled with RMSprop and categorical cross-entropy.
+
+    Returns (model, branches) so that callers can reach the branches.
+    """
+    time_steps = 6
+    input_shape = (time_steps, dimension // time_steps)
+
+    branches = [] # models to be merged
+    for filter_len in (3, 4, 5):
+        branch = Sequential()
+        branch.add(Convolution1D(nb_filter=200,
+                                 filter_length=filter_len,
+                                 border_mode='valid',
+                                 activation='relu',
+                                 subsample_length=1,
+                                 input_shape=input_shape))
+        branch.add(MaxPooling1D(pool_length=2))
+        branch.add(Flatten())
+
+        branches.append(branch)
+
+    model = Sequential()
+    model.add(Merge(branches, mode='concat'))
+
+    dropout = 0.25
+    model.add(Dense(250))
+    model.add(Dropout(dropout))
+    model.add(Activation('relu'))
+
+    model.add(Dropout(dropout))
+    model.add(Dense(num_outputs))
+    model.add(Activation('softmax'))
+
+    optimizer = RMSprop(lr=0.001,
+                        rho=0.9, epsilon=1e-08)
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=optimizer,
+                  metrics=['accuracy'])
+
+    return model, branches
\ No newline at end of file
Added: ctakes/trunk/ctakes-temporal/scripts/keras/nn_models.pyc
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/keras/nn_models.pyc?rev=1753408&view=auto
==============================================================================
Binary file - no diff available.
Propchange: ctakes/trunk/ctakes-temporal/scripts/keras/nn_models.pyc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: ctakes/trunk/ctakes-temporal/scripts/keras/train.sh
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/keras/train.sh?rev=1753408&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/keras/train.sh (added)
+++ ctakes/trunk/ctakes-temporal/scripts/keras/train.sh Tue Jul 19 17:19:55 2016
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# Train the Keras CNN (et_dimaCNN_train-and-package.py) inside the virtualenv
+# that lives next to this script, forwarding every command-line argument.
+
+script_dir="$(dirname "$0")"
+
+source "$script_dir/env/bin/activate"
+
+# "$@" (rather than $*) keeps arguments that contain spaces intact.
+python "$script_dir/et_dimaCNN_train-and-package.py" "$@"
+
+# Save the trainer's exit status before `deactivate` overwrites $?.
+ret=$?
+
+deactivate
+
+exit $ret
Propchange: ctakes/trunk/ctakes-temporal/scripts/keras/train.sh
------------------------------------------------------------------------------
svn:executable = *