Posted to commits@ctakes.apache.org by dl...@apache.org on 2016/09/02 15:21:50 UTC
svn commit: r1758973 - in /ctakes/trunk/ctakes-temporal/scripts/nn: ./
classify.sh cleartk_io.py dataset.py dataset.pyc et_cleartk_io.py
et_cleartk_io.pyc nn_models.py nn_models.pyc predict.py reqs.txt train.sh
train_and_package.py
Author: dligach
Date: Fri Sep 2 15:21:49 2016
New Revision: 1758973
URL: http://svn.apache.org/viewvc?rev=1758973&view=rev
Log:
Scripts to train Dima's Keras models; initial check-in
Added:
ctakes/trunk/ctakes-temporal/scripts/nn/
ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh (with props)
ctakes/trunk/ctakes-temporal/scripts/nn/cleartk_io.py
ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py (with props)
ctakes/trunk/ctakes-temporal/scripts/nn/dataset.pyc (with props)
ctakes/trunk/ctakes-temporal/scripts/nn/et_cleartk_io.py
ctakes/trunk/ctakes-temporal/scripts/nn/et_cleartk_io.pyc (with props)
ctakes/trunk/ctakes-temporal/scripts/nn/nn_models.py
ctakes/trunk/ctakes-temporal/scripts/nn/nn_models.pyc (with props)
ctakes/trunk/ctakes-temporal/scripts/nn/predict.py
ctakes/trunk/ctakes-temporal/scripts/nn/reqs.txt
ctakes/trunk/ctakes-temporal/scripts/nn/train.sh (with props)
ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py
Added: ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh?rev=1758973&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh Fri Sep 2 15:21:49 2016
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+source "$(dirname "$0")/env/bin/activate"
+python "$(dirname "$0")/predict.py" "$@"
+ret=$?
+deactivate
+exit $ret
Propchange: ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh
------------------------------------------------------------------------------
svn:executable = *
Added: ctakes/trunk/ctakes-temporal/scripts/nn/cleartk_io.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/cleartk_io.py?rev=1758973&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/cleartk_io.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/cleartk_io.py Fri Sep 2 15:21:49 2016
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+
+import numpy as np
+import os, os.path
+import subprocess
+
+
+def string_label_to_label_vector(label_string, outcome_maps):
+ label_vec = []
+
+ for label_val in label_string.split('#'):
+ (label, val) = label_val.split('=')
+ cur_map = outcome_maps[label]
+ label_ind = cur_map[val]
+ label_vec.append(label_ind)
+
+ return label_vec
+
+def get_data_dimensions(data_file):
+ wc_out = subprocess.check_output(['wc', data_file])
+ wc_fields = wc_out.decode().strip().split(' ')
+ file_len = int(wc_fields[0])
+
+ num_feats = 0
+ for line in open(data_file):
+ max_dim = int( line.rstrip().split(' ')[-1].split(':')[0] )
+ if max_dim > num_feats:
+ num_feats = max_dim
+
+ return (file_len, num_feats)
+
+def flatten_outputs(Y):
+ maxes = Y.max(0)
+ #print("Maxes = %s" % (maxes) )
+ reqd_dims = 0
+ indices = [0]
+
+ ## Create an indices array that maps from "true" label indices to neural network
+ ## output layer indices -- binary labels map to single output nodes (2->1) while n-ary
+ ## labels map to n nodes.
+ for val in maxes:
+ if val == 1:
+ reqd_dims += 1
+ elif val > 1:
+ reqd_dims += (int(val) + 1)
+ else:
+ raise Exception("There is a column with all zeros!")
+
+ indices.append(reqd_dims)
+
+ Y_adj = np.zeros( (Y.shape[0], reqd_dims) )
+ for row_ind in range(0, Y.shape[0]):
+ for col_ind in range(0, Y.shape[1]):
+ if maxes[col_ind] == 1:
+ ## For binary variables just need the offset and copy the value
+ Y_adj[row_ind][ int(indices[col_ind]) ] = Y[row_ind][col_ind]
+ else:
+ ## for n-ary variables we use the value to find the offset that will
+ ## be set to 1.
+ Y_adj[row_ind][ int(indices[col_ind]) + int(Y[row_ind][col_ind]) ] = 1
+
+ return Y_adj, indices
+
+def read_outcome_maps(dirname):
+ raw_outcomes = []
+ raw_outcomes.append(None)
+
+ derived_maps = {}
+ lookup_map = {}
+ ## First read outcome file
+ for line in open(os.path.join(dirname, 'outcome-lookup.txt') ):
+ (index, label) = line.rstrip().split(' ')
+ raw_outcomes.append(label)
+
+ for task_label in label.split('#'):
+ #print(task_label)
+ (task, val) = task_label.rstrip().split("=")
+ if not task in derived_maps:
+ derived_maps[task] = {}
+ lookup_map[task] = []
+
+ cur_map = derived_maps[task]
+ lookup = lookup_map[task]
+ if not val in cur_map:
+ cur_map[val] = len(cur_map)
+ lookup.append(val)
+
+ return raw_outcomes, derived_maps, lookup_map
+
+def outcome_list(raw_outcomes):
+ outcomes = []
+ for outcome_val in raw_outcomes[1].split("#"):
+ outcomes.append(outcome_val.split("=")[0])
+
+ return outcomes
+
+def read_multitask_liblinear(dirname):
+
+ raw_outcomes, derived_maps, outcome_lookups = read_outcome_maps(dirname)
+
+ data_file = os.path.join(dirname, 'training-data.liblinear')
+
+ (data_points, feat_dims) = get_data_dimensions(data_file)
+
+ ## Remove bias feature -- will be part of any neural network
+ label_dims = len(derived_maps)
+
+ label_matrix = np.zeros( (data_points, label_dims) )
+ feat_matrix = np.zeros( (data_points, feat_dims) )
+
+ line_ind = 0
+ for line in open( data_file ):
+ label_and_feats = line.rstrip().split(' ')
+ label = label_and_feats[0]
+ string_label = raw_outcomes[int(label)]
+ label_vec = string_label_to_label_vector(string_label, derived_maps)
+
+ for ind, val in enumerate(label_vec):
+ label_matrix[line_ind, ind] = val
+
+ ## Go from 2 on -- skip both the label and the first feature since it will be
+ ## the bias term from the liblinear data writer.
+# feat_list = feature_array_to_list( label_and_feats[1:], feat_dims )
+# feat_matrix[line_ind,:] = feat_list[1:]
+ feat_matrix[line_ind, :] = feature_array_to_list( label_and_feats[1:], feat_dims )
+# for feat in label_and_feats[1:]:
+# (ind, val) = feat.split(':')
+# feat_ind = int(ind) - 1 ## since feats are indexed at 1
+# feat_matrix[line_ind, feat_ind] = float(val)
+
+
+ line_ind += 1
+
+ return label_matrix, feat_matrix
+
+def convert_multi_output_to_string(outcomes, outcome_list, lookup_map, raw_outcomes):
+ """Return the int value corresponding to the class implied by the
+ set of outputs in the outcomes array."""
+ str = ''
+ for ind, label in enumerate(outcome_list):
+ str += label
+ str += "="
+ str += lookup_map[label][outcomes[ind]]
+ str += "#"
+
+ str = str[:-1]
+ return str
+
+def feature_string_to_list( feat_string, length=-1 ):
+ return feature_array_to_list( feat_string.split(' '), length )
+
+def feature_array_to_list( feats, length=-1 ):
+ if length == -1:
+ length = len(feats)
+
+ #f = np.zeros(length)
+ f = [0] * length
+
+ for feat in feats:
+ (ind, val) = feat.split(':')
+ ind = int(ind) - 1
+ if int(ind) >= len(f):
+ raise Exception("Feature index %d is larger than feature vector length %d -- you may need to specify the expected length of the vector." % (int(ind), len(f) ) )
+ f[int(ind)] = val
+
+ return f
+
+if __name__ == "__main__":
+ (labels, feats) = read_multitask_liblinear('data_testing/multitask_assertion/train_and_test/')
+ print("train[0][100] = %f" % feats[0][100])
Added: ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py?rev=1758973&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py Fri Sep 2 15:21:49 2016
@@ -0,0 +1,154 @@
+#!/usr/bin/env python
+
+import numpy as np
+
+import sys
+sys.dont_write_bytecode = True
+
+import ConfigParser
+
+import glob, string, collections, operator
+
+from fnmatch import fnmatch
+
+label2int = {
+ 'none':0,
+ 'contains':1,
+ 'contains-1':2
+ }
+
+# will have to do this eventually
+# label2int = {
+# 'none': 0,
+# 'contains': 1,
+# 'contains-1': 2,
+# 'before': 3,
+# 'before-1': 4,
+# 'begins-on': 5,
+# 'begins-on-1': 6,
+# 'ends-on': 7,
+# 'ends-on-1': 8,
+# 'overlap': 9,
+# 'overlap-1': 10,
+# }
+
+class DatasetProvider:
+ """THYME relation data"""
+
+ def __init__(self, file_names):
+ """Index words by frequency in a list of files"""
+
+ self.alphabet = {} # words indexed by frequency
+
+ unigrams = [] # read entire corpus into a list
+ for file_name in file_names:
+ for line in open(file_name):
+ label, text = line.strip().split('|')
+ unigrams.extend(text.split())
+
+ index = 1 # zero used to encode unknown words
+ unigram_counts = collections.Counter(unigrams)
+ self.alphabet['oov_word'] = 0
+ for unigram, count in unigram_counts.most_common():
+ self.alphabet[unigram] = index
+ index = index + 1
+
+ def load(self, path):
+ """Convert sentences (examples) into lists of indices"""
+
+ examples = []
+ labels = []
+ for line in open(path):
+ label, text = line.strip().split('|')
+ example = []
+ for unigram in text.split():
+ example.append(self.alphabet[unigram])
+ examples.append(example)
+ labels.append(label2int[label])
+
+ return examples, labels
+
+ def load_if_oov(self, path):
+
+ examples = []
+ labels = []
+ for line in open(path):
+ label,text = line.strip().split('|')
+ example = []
+ for unigram in text.split():
+                if unigram in self.alphabet:
+                    example.append(self.alphabet[unigram])
+                else:
+                    ## unseen words map to the reserved 'oov_word' index (0)
+                    example.append(self.alphabet['oov_word'])
+ examples.append(example)
+ labels.append(label2int[label])
+
+ return examples, labels
+
+ def load_by_region(self, path):
+ pres = []
+ arg1s = []
+ conts = []
+ arg2s = []
+ posts = []
+ labels = []
+ for line in open(path):
+ label,text = line.strip().split('|')
+ pre,arg1,cont,arg2,post = self.processText(text)
+ pres.append(pre)
+ arg1s.append(arg1)
+ conts.append(cont)
+ arg2s.append(arg2)
+ posts.append(post)
+ labels.append(label2int[label])
+
+ return pres, arg1s, conts, arg2s, posts, labels
+
+ def processText(self, text):
+        pre = []
+        arg1 = []
+        cont = []
+        arg2 = []
+        post = []
+
+        tag = 0
+        for unigram in text.split():
+            idx = self.alphabet[unigram]
+            if fnmatch(unigram, '<*>'):
+                tag = tag + 1
+                continue
+            if tag == 0:
+                pre.append(idx)
+            elif tag == 1:
+                arg1.append(idx)
+            elif tag == 2:
+                cont.append(idx)
+            elif tag == 3:
+                arg2.append(idx)
+            elif tag == 4:
+                post.append(idx)
+
+ return pre, arg1, cont, arg2, post
+
+
+
+if __name__ == "__main__":
+
+ cfg = ConfigParser.ConfigParser()
+ cfg.read('settings.ini')
+
+ dataset = DatasetProvider([cfg.get('data', 'train'),
+ cfg.get('data', 'test')])
+ print 'alphabet size:', len(dataset.alphabet)
+
+ x,y = dataset.load(cfg.get('data', 'test'))
+
+ print 'max seq len:', max([len(s) for s in x])
+ print 'number of examples:', len(x)
+ print 'number of labels:', len(set(y))
+ print 'label counts:', collections.Counter(y)
+ print 'first 10 examples:', x[:10]
+ print 'class proportions:'
+ counter = collections.Counter(y)
+ for label in counter:
+ print label, counter[label] / float(len(y)), float(len(y)) / counter[label]
Propchange: ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py
------------------------------------------------------------------------------
svn:executable = *
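DatasetProvider expects one '<label>|<space-separated tokens>' line per example, with the label drawn from label2int. A minimal sketch (the file name, tokens, and tag spellings are made up):

    from dataset import DatasetProvider

    # toy.txt:
    #   contains|the <e> biopsy </e> was performed during the <t> visit </t>
    #   none|patient denies <e> pain </e> since <t> last week </t>
    provider = DatasetProvider(['toy.txt'])
    x, y = provider.load('toy.txt')
    # x: one list of word indices per line (indices assigned by corpus frequency)
    # y == [1, 0] via label2int; load_by_region() further splits each example
    # at the <...> tags into pre/arg1/middle/arg2/post index lists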
Added: ctakes/trunk/ctakes-temporal/scripts/nn/dataset.pyc
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/dataset.pyc?rev=1758973&view=auto
==============================================================================
Binary file - no diff available.
Propchange: ctakes/trunk/ctakes-temporal/scripts/nn/dataset.pyc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: ctakes/trunk/ctakes-temporal/scripts/nn/et_cleartk_io.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/et_cleartk_io.py?rev=1758973&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/et_cleartk_io.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/et_cleartk_io.py Fri Sep 2 15:21:49 2016
@@ -0,0 +1,207 @@
+#!/usr/bin/env python
+
+import numpy as np
+import os, os.path
+import subprocess
+
+
+def string_label_to_label_vector(label_string, outcome_maps):
+ label_vec = []
+
+ for label_val in label_string.split('#'):
+ (label, val) = label_val.split('=')
+ cur_map = outcome_maps[label]
+ label_ind = cur_map[val]
+ label_vec.append(label_ind)
+
+ return label_vec
+
+def get_data_dimensions(data_file):
+ wc_out = subprocess.check_output(['wc', data_file])
+ wc_fields = wc_out.decode().strip().split(' ')
+ file_len = int(wc_fields[0])
+
+ num_feats = 0
+ for line in open(data_file):
+ max_dim = int( line.rstrip().split(' ')[-1].split(':')[0] )
+ if max_dim > num_feats:
+ num_feats = max_dim
+
+ return (file_len, num_feats)
+
+def flatten_outputs(Y):
+ maxes = Y.max(0)
+ #print("Maxes = %s" % (maxes) )
+ reqd_dims = 0
+ indices = [0]
+
+ ## Create an indices array that maps from "true" label indices to neural network
+ ## output layer indices -- binary labels map to single output nodes (2->1) while n-ary
+ ## labels map to n nodes.
+ for val in maxes:
+ if val == 1:
+ reqd_dims += 1
+ elif val > 1:
+ reqd_dims += (int(val) + 1)
+ else:
+ raise Exception("There is a column with all zeros!")
+
+ indices.append(reqd_dims)
+
+ Y_adj = np.zeros( (Y.shape[0], reqd_dims) )
+ for row_ind in range(0, Y.shape[0]):
+ for col_ind in range(0, Y.shape[1]):
+ if maxes[col_ind] == 1:
+ ## For binary variables just need the offset and copy the value
+ Y_adj[row_ind][ int(indices[col_ind]) ] = Y[row_ind][col_ind]
+ else:
+ ## for n-ary variables we use the value to find the offset that will
+ ## be set to 1.
+ Y_adj[row_ind][ int(indices[col_ind]) + int(Y[row_ind][col_ind]) ] = 1
+
+ return Y_adj, indices
+
+def read_liblinear(dirname):
+ data_file = os.path.join(dirname, 'training-data.liblinear')
+
+ (data_points, feat_dims) = get_data_dimensions(data_file)
+
+ label_array = np.zeros( (data_points, 1), dtype=np.int )
+ feat_matrix = np.zeros( (data_points, feat_dims) )
+
+ line_ind = 0
+ for line in open( data_file ):
+ label_and_feats = line.rstrip().split(' ')
+ label = label_and_feats[0]
+
+ label_array[line_ind] = float(label) - 1
+
+        ## Go from 1 on -- skip the label (the first feature, the liblinear
+        ## data writer's bias term, is kept here as feature 0)
+ feat_matrix[line_ind, :] = feature_array_to_list( label_and_feats[1:], feat_dims )
+
+ line_ind += 1
+
+ label_matrix = np.zeros( (data_points, label_array.max()+1) )
+
+ for ind,val in np.ndenumerate(label_array):
+ label_matrix[ind,val] = 1
+
+ return label_matrix, feat_matrix
+
+def read_outcome_maps(dirname):
+ raw_outcomes = []
+ raw_outcomes.append(None)
+
+ derived_maps = {}
+ lookup_map = {}
+ ## First read outcome file
+ for line in open(os.path.join(dirname, 'outcome-lookup.txt') ):
+ (index, label) = line.rstrip().split(' ')
+ raw_outcomes.append(label)
+
+ #for task_label in label.split('#'):
+ #print(task_label)
+ # (task, val) = task_label.rstrip().split("=")
+ # if not task in derived_maps:
+ # derived_maps[task] = {}
+ # lookup_map[task] = []
+
+ # cur_map = derived_maps[task]
+ # lookup = lookup_map[task]
+ # if not val in cur_map:
+ # cur_map[val] = len(cur_map)
+ # lookup.append(val)
+
+ return raw_outcomes#, derived_maps, lookup_map
+
+def outcome_list(raw_outcomes):
+ outcomes = []
+ for outcome_val in raw_outcomes[1].split("#"):
+ outcomes.append(outcome_val.split("=")[0])
+
+ return outcomes
+
+def read_multitask_liblinear(dirname):
+
+ #raw_outcomes, derived_maps, outcome_lookups = read_outcome_maps(dirname)
+
+ data_file = os.path.join(dirname, 'training-data.liblinear')
+
+ (data_points, feat_dims) = get_data_dimensions(data_file)
+
+ ## Remove bias feature -- will be part of any neural network
+ label_dims = 1 #len(derived_maps)
+
+ label_matrix = np.zeros( (data_points, label_dims) )
+ feat_matrix = np.zeros( (data_points, feat_dims) )
+
+ line_ind = 0
+ for line in open( data_file ):
+ label_and_feats = line.rstrip().split(' ')
+ label = label_and_feats[0]
+ #string_label = raw_outcomes[int(label)]
+ #label_vec = string_label_to_label_vector(string_label, derived_maps)
+
+ #for ind, val in enumerate(label_vec):
+ label_matrix[line_ind, 0] = label
+
+ ## Go from 2 on -- skip both the label and the first feature since it will be
+ ## the bias term from the liblinear data writer.
+# feat_list = feature_array_to_list( label_and_feats[1:], feat_dims )
+# feat_matrix[line_ind,:] = feat_list[1:]
+ feat_matrix[line_ind, :] = feature_array_to_list( label_and_feats[1:], feat_dims )
+# for feat in label_and_feats[1:]:
+# (ind, val) = feat.split(':')
+# feat_ind = int(ind) - 1 ## since feats are indexed at 1
+# feat_matrix[line_ind, feat_ind] = float(val)
+
+
+ line_ind += 1
+
+ return label_matrix, feat_matrix
+
+def convert_multi_output_to_string(outcomes, outcome_list, lookup_map, raw_outcomes):
+ """Return the int value corresponding to the class implied by the
+ set of outputs in the outcomes array."""
+ str = ''
+ for ind, label in enumerate(outcome_list):
+ str += label
+ str += "="
+ str += lookup_map[label][outcomes[ind]]
+ str += "#"
+
+ str = str[:-1]
+ return str
+
+def get_outcome_array(working_dir):
+ labels = []
+
+ for line in open(os.path.join(working_dir, "outcome-lookup.txt")):
+ (ind, val) = line.rstrip().split(" ")
+ labels.append(val)
+
+ return labels
+
+def feature_string_to_list( feat_string, length=-1 ):
+ return feature_array_to_list( feat_string.split(' '), length )
+
+def feature_array_to_list( feats, length=-1 ):
+ if length == -1:
+ length = len(feats)
+
+ #f = np.zeros(length)
+ f = [0] * length
+
+ for feat in feats:
+ (ind, val) = feat.split(':')
+ ind = int(ind) - 1
+ if int(ind) >= len(f):
+ raise Exception("Feature index %d is larger than feature vector length %d -- you may need to specify the expected length of the vector." % (int(ind), len(f) ) )
+ f[int(ind)] = val
+
+ return f
+
+if __name__ == "__main__":
+ (labels, feats) = read_multitask_liblinear('target/eval/thyme/train_and_test/event-time/')
+ print("train[0][100] = %f" % feats[0][100])
Added: ctakes/trunk/ctakes-temporal/scripts/nn/et_cleartk_io.pyc
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/et_cleartk_io.pyc?rev=1758973&view=auto
==============================================================================
Binary file - no diff available.
Propchange: ctakes/trunk/ctakes-temporal/scripts/nn/et_cleartk_io.pyc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: ctakes/trunk/ctakes-temporal/scripts/nn/nn_models.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/nn_models.py?rev=1758973&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/nn_models.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/nn_models.py Fri Sep 2 15:21:49 2016
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Activation, Convolution1D, MaxPooling1D, Lambda, Flatten, Merge
+from keras.optimizers import SGD
+from keras import backend as K
+from keras.optimizers import RMSprop
+
+def get_mlp_model(dimension, num_outputs, layers=(64, 256, 64) ):
+ model = Sequential()
+ sgd = get_mlp_optimizer()
+
+ drop = 0.5
+
+    # Dense(layers[0]) is a fully-connected layer with layers[0] hidden units.
+    # In the first layer you must specify the expected input data shape:
+    # here, `dimension`-dimensional vectors.
+ model.add(Dense(layers[0], input_dim=dimension, init='uniform'))
+ model.add(Activation('relu'))
+ model.add(Dropout(drop))
+ model.add(Dense(layers[1], init='uniform'))
+ model.add(Activation('relu'))
+ model.add(Dropout(drop))
+ #model.add(Dense(layers[2], init='uniform'))
+ #model.add(Activation('relu'))
+ #model.add(Dropout(drop))
+
+# model.add(Dense(layers[2], init='uniform'))
+# model.add(Activation('relu'))
+# model.add(Dropout(0.5))
+
+ if num_outputs == 1:
+ model.add(Dense(1, init='uniform'))
+ model.add(Activation('sigmoid'))
+ model.compile(loss='binary_crossentropy',
+ optimizer=sgd,
+ metrics=['accuracy'])
+ else:
+ model.add(Dense(num_outputs, init='uniform'))
+ model.add(Activation('softmax'))
+ model.compile(loss='categorical_crossentropy',
+ optimizer=sgd,
+ metrics=['accuracy'])
+
+ return model
+
+def get_mlp_optimizer():
+ return SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
+
+def get_cnn_model(dimension, num_outputs, nb_filter = 200, layers=(64, 64, 256) ):
+ model = Sequential()
+ sgd = get_mlp_optimizer()
+
+ ## Convolutional layers:
+ model.add(Convolution1D(nb_filter, 3, input_shape=(6,200)))
+ def max_1d(X):
+ return K.max(X, axis=1)
+
+ model.add(Lambda(max_1d, output_shape=(nb_filter,)))
+
+
+ #model.add(MaxPooling1D())
+
+ model.add(Dense(layers[1], init='uniform'))
+ model.add(Activation('relu'))
+ model.add(Dropout(0.5))
+
+# model.add(Dense(layers[2], init='uniform'))
+# model.add(Activation('relu'))
+# model.add(Dropout(0.5))
+
+ if num_outputs == 1:
+ model.add(Dense(1, init='uniform'))
+ model.add(Activation('sigmoid'))
+ model.compile(loss='binary_crossentropy',
+ optimizer=sgd,
+ metrics=['accuracy'])
+ else:
+ model.add(Dense(num_outputs, init='uniform'))
+ model.add(Activation('softmax'))
+ model.compile(loss='categorical_crossentropy',
+ optimizer=sgd,
+ metrics=['accuracy'])
+
+ return model
+
+def get_dima_cnn_model(dimension, num_outputs):
+ filtlens = "3,4,5"
+ branches = [] # models to be merged
+ train_xs = []
+ for filterLen in filtlens.split(','):
+ branch = Sequential()
+ branch.add(Convolution1D(nb_filter=200,
+ filter_length=int(filterLen),
+ border_mode='valid',
+ activation='relu',
+ subsample_length=1,
+ input_shape=(6,200)))
+ branch.add(MaxPooling1D(pool_length=2))
+ branch.add(Flatten())
+
+ branches.append(branch)
+ model = Sequential()
+ model.add(Merge(branches, mode='concat'))
+
+ dropout = 0.25
+ model.add(Dense(250))
+ model.add(Dropout(dropout))
+ model.add(Activation('relu'))
+
+ model.add(Dropout(dropout))
+ model.add(Dense(num_outputs))
+ model.add(Activation('softmax'))
+
+ optimizer = RMSprop(lr=0.001,
+ rho=0.9, epsilon=1e-08)
+ model.compile(loss='categorical_crossentropy',
+ optimizer=optimizer,
+ metrics=['accuracy'])
+
+ return model, branches
\ No newline at end of file
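A quick smoke test of get_mlp_model() under the pinned Keras 1.0.4 API (nb_epoch rather than the later epochs keyword); the random data is purely illustrative. Note that both CNN builders hard-code input_shape=(6, 200), i.e. sequences of six 200-dimensional vectors:

    import numpy as np
    from keras.utils.np_utils import to_categorical
    from nn_models import get_mlp_model

    X = np.random.rand(100, 20)                     # 100 instances, 20 features
    Y = to_categorical(np.random.randint(0, 3, 100), 3)

    model = get_mlp_model(dimension=20, num_outputs=3)
    model.fit(X, Y, nb_epoch=2, batch_size=10, verbose=1)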
Added: ctakes/trunk/ctakes-temporal/scripts/nn/nn_models.pyc
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/nn_models.pyc?rev=1758973&view=auto
==============================================================================
Binary file - no diff available.
Propchange: ctakes/trunk/ctakes-temporal/scripts/nn/nn_models.pyc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: ctakes/trunk/ctakes-temporal/scripts/nn/predict.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/predict.py?rev=1758973&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/predict.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/predict.py Fri Sep 2 15:21:49 2016
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+
+from keras.models import Sequential, model_from_json
+import numpy as np
+import et_cleartk_io as ctk_io
+import sys
+import os.path
+import pickle
+from keras.preprocessing.sequence import pad_sequences
+
+def main(args):
+ if len(args) < 1:
+ sys.stderr.write("Error - one required argument: <model directory>\n")
+ sys.exit(-1)
+
+ working_dir = args[0]
+
+ int2label = {
+ 0:'none',
+ 1:'CONTAINS',
+ 2:'CONTAINS-1'
+ }
+
+ ## Load models and weights:
+ #outcomes = ctk_io.get_outcome_array(working_dir)
+    model_dir = working_dir
+ maxlen = pickle.load(open(os.path.join(model_dir, "maxlen.p"), "rb"))
+ alphabet = pickle.load(open(os.path.join(model_dir, "alphabet.p"), "rb"))
+ #print("Outcomes array is %s" % (outcomes) )
+ model = model_from_json(open(os.path.join(model_dir, "model_0.json")).read())
+ model.load_weights(os.path.join(model_dir, "model_0.h5"))
+
+ while True:
+ try:
+ line = sys.stdin.readline().rstrip()
+ if not line:
+ break
+
+ ## Convert the line of Strings to lists of indices
+ feats=[]
+ for unigram in line.rstrip().split():
+                if unigram in alphabet:
+                    feats.append(alphabet[unigram])
+                else:
+                    ## unseen words map to the reserved 'oov_word' index (0)
+                    feats.append(alphabet['oov_word'])
+            if len(feats) > maxlen:
+                feats = feats[0:maxlen]
+ test_x = pad_sequences([feats], maxlen=maxlen)
+ #feats = np.reshape(feats, (1, 6, input_dims / 6))
+ #feats = np.reshape(feats, (1, input_dims))
+
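+            ## the trained network (see train_and_package.py) concatenates
+            ## three convolutional branches, so the same padded sequence is
+            ## fed once per branch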
+ X_dup = []
+ X_dup.append(test_x)
+ X_dup.append(test_x)
+ X_dup.append(test_x)
+
+ out = model.predict(X_dup, batch_size=50)[0]
+ # print("Out is %s and decision is %d" % (out, out.argmax()))
+ except KeyboardInterrupt:
+ sys.stderr.write("Caught keyboard interrupt\n")
+ break
+
+ if line == '':
+ sys.stderr.write("Encountered empty string so exiting\n")
+ break
+
+ out_str = int2label[out.argmax()]
+
+ print(out_str)
+ sys.stdout.flush()
+
+ sys.exit(0)
+
+
+if __name__ == "__main__":
+ main(sys.argv[1:])
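classify.sh wraps this script in the project virtualenv, so a caller can keep the model in memory and stream instances over a pipe: one line of space-separated tokens in, one label out. A minimal sketch, assuming the env virtualenv and a trained model directory exist (the path and token line are invented):

    import subprocess

    proc = subprocess.Popen(['./classify.sh', 'model_dir'],
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                            universal_newlines=True)
    proc.stdin.write('the <e> biopsy </e> was performed during the <t> visit </t>\n')
    proc.stdin.flush()
    print(proc.stdout.readline().rstrip())   # e.g. CONTAINS
    proc.stdin.close()
    proc.wait()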
Added: ctakes/trunk/ctakes-temporal/scripts/nn/reqs.txt
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/reqs.txt?rev=1758973&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/reqs.txt (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/reqs.txt Fri Sep 2 15:21:49 2016
@@ -0,0 +1,10 @@
+h5py==2.6.0
+Keras==1.0.4
+numpy==1.11.0
+PyYAML==3.11
+scipy==0.17.1
+scikit-learn==0.17.1
+six==1.10.0
+sklearn==0.0
+Theano==0.8.2
+wheel==0.29.0
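Both wrapper scripts source env/bin/activate next to themselves, so these pinned requirements are expected to live in a virtualenv named env inside scripts/nn -- created, for example, with 'virtualenv env' followed by 'env/bin/pip install -r reqs.txt' (the setup commands are an assumption; they are not part of this commit).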
Added: ctakes/trunk/ctakes-temporal/scripts/nn/train.sh
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/train.sh?rev=1758973&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/train.sh (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/train.sh Fri Sep 2 15:21:49 2016
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+source "$(dirname "$0")/env/bin/activate"
+python "$(dirname "$0")/train_and_package.py" "$@"
+ret=$?
+deactivate
+exit $ret
Propchange: ctakes/trunk/ctakes-temporal/scripts/nn/train.sh
------------------------------------------------------------------------------
svn:executable = *
Added: ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py?rev=1758973&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py Fri Sep 2 15:21:49 2016
@@ -0,0 +1,117 @@
+#!/usr/bin/env python
+
+import sklearn as sk
+
+import numpy as np
+np.random.seed(1337)
+
+import et_cleartk_io as ctk_io
+import nn_models
+
+import sys
+import os.path
+
+import dataset
+
+import keras as k
+from keras.utils.np_utils import to_categorical
+from keras.optimizers import RMSprop
+from keras.preprocessing.sequence import pad_sequences
+from keras.models import Sequential
+from keras.layers import Merge
+from keras.layers.core import Dense, Dropout, Activation, Flatten
+from keras.layers.convolutional import Convolution1D, MaxPooling1D
+from keras.layers.embeddings import Embedding
+
+import pickle
+
+def main(args):
+ if len(args) < 1:
+ sys.stderr.write("Error - one required argument: <data directory>\n")
+ sys.exit(-1)
+
+ working_dir = args[0]
+
+ #read in data file
+# print("Reading data...")
+ #Y, X = ctk_io.read_liblinear(working_dir) # ('data_testing/multitask_assertion/train_and_test')
+ data_file = os.path.join(working_dir, 'training-data.liblinear')
+
+ # learn alphabet from training and test data
+ dataset1 = dataset.DatasetProvider([data_file])
+ # now load training examples and labels
+ train_x, train_y = dataset1.load(data_file)
+
+ init_vectors = None #used for pre-trained embeddings
+
+ # turn x and y into numpy array among other things
+ maxlen = max([len(seq) for seq in train_x])
+ outcomes = set(train_y)
+ classes = len(outcomes)
+
+ train_x = pad_sequences(train_x, maxlen=maxlen)
+ train_y = to_categorical(np.array(train_y), classes)
+
+ pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'),"wb"))
+ pickle.dump(dataset1.alphabet, open(os.path.join(working_dir, 'alphabet.p'),"wb"))
+ #test_x = pad_sequences(test_x, maxlen=maxlen)
+ #test_y = to_categorical(np.array(test_y), classes)
+
+ print 'train_x shape:', train_x.shape
+ print 'train_y shape:', train_y.shape
+
+ branches = [] # models to be merged
+ train_xs = [] # train x for each branch
+ #test_xs = [] # test x for each branch
+
+ filtlens = "3,4,5"
+ for filter_len in filtlens.split(','):
+ branch = Sequential()
+ branch.add(Embedding(len(dataset1.alphabet),
+ 300,
+ input_length=maxlen,
+ weights=init_vectors))
+ branch.add(Convolution1D(nb_filter=200,
+ filter_length=int(filter_len),
+ border_mode='valid',
+ activation='relu',
+ subsample_length=1))
+ branch.add(MaxPooling1D(pool_length=2))
+ branch.add(Flatten())
+
+ branches.append(branch)
+ train_xs.append(train_x)
+ #test_xs.append(test_x)
+ model = Sequential()
+ model.add(Merge(branches, mode='concat'))
+
+ model.add(Dense(250))#cfg.getint('cnn', 'hidden')))
+ model.add(Dropout(0.25))#cfg.getfloat('cnn', 'dropout')))
+ model.add(Activation('relu'))
+
+ model.add(Dropout(0.25))#cfg.getfloat('cnn', 'dropout')))
+ model.add(Dense(classes))
+ model.add(Activation('softmax'))
+
+ optimizer = RMSprop(lr=0.0001,#cfg.getfloat('cnn', 'learnrt'),
+ rho=0.9, epsilon=1e-08)
+ model.compile(loss='categorical_crossentropy',
+ optimizer=optimizer,
+ metrics=['accuracy'])
+ model.fit(train_xs,
+ train_y,
+ nb_epoch=3,#cfg.getint('cnn', 'epochs'),
+ batch_size=50,#cfg.getint('cnn', 'batches'),
+ verbose=1,
+ validation_split=0.1,
+ class_weight=None)
+
+ model.summary()
+
+ json_string = model.to_json()
+ open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
+ model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)
+ sys.exit(0)
+
+if __name__ == "__main__":
+ main(sys.argv[1:])
\ No newline at end of file
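Despite its liblinear-style name, training-data.liblinear is read here through dataset.DatasetProvider, so it must contain '<label>|<tokens>' lines rather than sparse index:value pairs. The artifacts written at the end (model_0.json, model_0.h5, maxlen.p, alphabet.p) are exactly what predict.py loads back; a reload sketch (the directory name is illustrative):

    import os.path
    import pickle
    from keras.models import model_from_json

    working_dir = 'model_dir'  # wherever train_and_package.py wrote its output
    maxlen = pickle.load(open(os.path.join(working_dir, 'maxlen.p'), 'rb'))
    alphabet = pickle.load(open(os.path.join(working_dir, 'alphabet.p'), 'rb'))
    model = model_from_json(open(os.path.join(working_dir, 'model_0.json')).read())
    model.load_weights(os.path.join(working_dir, 'model_0.h5'))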