Posted to commits@ctakes.apache.org by dl...@apache.org on 2016/09/24 00:58:14 UTC
svn commit: r1762100 - in /ctakes/trunk/ctakes-temporal/scripts/nn:
classify.sh dataset_hybrid.py dataset_hybrid.pyc hybrid_classify.py
hybrid_train.py train.sh
Author: dligach
Date: Sat Sep 24 00:58:14 2016
New Revision: 1762100
URL: http://svn.apache.org/viewvc?rev=1762100&view=rev
Log:
code to get pos/token hybrid model working
Added:
ctakes/trunk/ctakes-temporal/scripts/nn/dataset_hybrid.py (with props)
ctakes/trunk/ctakes-temporal/scripts/nn/dataset_hybrid.pyc (with props)
ctakes/trunk/ctakes-temporal/scripts/nn/hybrid_classify.py
ctakes/trunk/ctakes-temporal/scripts/nn/hybrid_train.py
Modified:
ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh
ctakes/trunk/ctakes-temporal/scripts/nn/train.sh
Modified: ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh?rev=1762100&r1=1762099&r2=1762100&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh (original)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/classify.sh Sat Sep 24 00:58:14 2016
@@ -1,7 +1,7 @@
#!/bin/bash
source $(dirname $0)/env/bin/activate
-python $(dirname $0)/cnn_classify.py $*
+python $(dirname $0)/hybrid_classify.py $*
ret=$?
deactivate
exit $ret
Added: ctakes/trunk/ctakes-temporal/scripts/nn/dataset_hybrid.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/dataset_hybrid.py?rev=1762100&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/dataset_hybrid.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/dataset_hybrid.py Sat Sep 24 00:58:14 2016
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+
+import numpy as np
+import sys, ConfigParser, collections
+sys.dont_write_bytecode = True
+
+class DatasetProvider:
+  """THYME relation data"""
+
+  def __init__(self, path):
+    """Index words by frequency in a file"""
+
+    self.word2int = {}  # words indexed by frequency
+    self.tag2int = {}   # pos tags indexed by frequency
+    self.label2int = {} # class to int mapping
+
+    unigrams = [] # corpus as list
+    tags = []     # pos tags as list
+    labels = []   # classes as list
+    for line in open(path):
+      label, text, pos = line.strip().split('|')
+      unigrams.extend(text.split())
+      tags.extend(pos.split())
+      labels.append(label)
+
+    index = 1 # zero used to encode unknown words
+    self.word2int['oov_word'] = 0
+    unigram_counts = collections.Counter(unigrams)
+    for unigram, count in unigram_counts.most_common():
+      self.word2int[unigram] = index
+      index = index + 1
+
+    index = 1 # zero used to encode unknown pos tags
+    self.tag2int['oov_tag'] = 0
+    tag_counts = collections.Counter(tags)
+    for tag, count in tag_counts.most_common():
+      self.tag2int[tag] = index
+      index = index + 1
+
+    index = 0 # index classes
+    for label in set(labels):
+      self.label2int[label] = index
+      index = index + 1
+
+  def load(self, path, maxlen=float('inf')):
+    """Convert sentences (examples) into lists of indices"""
+
+    examples = [] # sequences of words as ints
+    tagseqs = []  # sequences of pos tags as ints
+    labels = []   # labels
+
+    for line in open(path):
+      label, text, pos = line.strip().split('|')
+
+      example = []
+      for unigram in text.split():
+        if unigram in self.word2int:
+          example.append(self.word2int[unigram])
+        else:
+          example.append(self.word2int['oov_word'])
+
+      tagseq = []
+      for tag in pos.split():
+        if tag in self.tag2int:
+          tagseq.append(self.tag2int[tag])
+        else:
+          tagseq.append(self.tag2int['oov_tag'])
+
+      # truncate example if it's too long
+      if len(example) > maxlen:
+        example = example[0:maxlen]
+      if len(tagseq) > maxlen:
+        tagseq = tagseq[0:maxlen]
+
+      examples.append(example)
+      tagseqs.append(tagseq)
+      labels.append(self.label2int[label])
+
+    return examples, tagseqs, labels
+
+if __name__ == "__main__":
+
+  cfg = ConfigParser.ConfigParser()
+  cfg.read(sys.argv[1])
+
+  dataset = DatasetProvider(cfg.get('data', 'train'))
+  print 'tag alphabet size:', len(dataset.tag2int)
+  x1, x2, y = dataset.load(cfg.get('data', 'train'))
+  print 'train max seq len:', max([len(s) for s in x1])
+
+  x1, x2, y = dataset.load(cfg.get('data', 'test'), maxlen=10)
+  print 'test max seq len:', max([len(s) for s in x2])
+  print 'labels:', dataset.label2int
+  print 'label counts:', collections.Counter(y)
+  print 'first 10 examples:', x2[:10]
Propchange: ctakes/trunk/ctakes-temporal/scripts/nn/dataset_hybrid.py
------------------------------------------------------------------------------
svn:executable = *
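
For context, DatasetProvider expects one example per line, with the class
label, the token sequence, and the POS tag sequence separated by '|'. A
minimal sketch of the format and of driving the class directly (the label
and sentence below are invented for illustration):

    # one '|'-delimited example per line, e.g. in training-data.liblinear:
    # CONTAINS|patient was admitted for pain|NN VBD VBN IN NN
    import dataset_hybrid
    provider = dataset_hybrid.DatasetProvider('training-data.liblinear')
    x1, x2, y = provider.load('training-data.liblinear', maxlen=100)
    # x1: token index lists, x2: pos tag index lists, y: int class labels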
Added: ctakes/trunk/ctakes-temporal/scripts/nn/dataset_hybrid.pyc
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/dataset_hybrid.pyc?rev=1762100&view=auto
==============================================================================
Binary file - no diff available.
Propchange: ctakes/trunk/ctakes-temporal/scripts/nn/dataset_hybrid.pyc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: ctakes/trunk/ctakes-temporal/scripts/nn/hybrid_classify.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/hybrid_classify.py?rev=1762100&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/hybrid_classify.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/hybrid_classify.py Sat Sep 24 00:58:14 2016
@@ -0,0 +1,87 @@
+#!python
+
+from keras.models import Sequential, model_from_json
+import numpy as np
+import et_cleartk_io as ctk_io
+import sys
+import os.path
+import pickle
+from keras.preprocessing.sequence import pad_sequences
+
+def main(args):
+  if len(args) < 1:
+    sys.stderr.write("Error - one required argument: <model directory>\n")
+    sys.exit(-1)
+  working_dir = args[0]
+
+  target_dir = 'ctakes-temporal/target/eval/thyme/train_and_test/event-time/'
+  model_dir = os.path.join(os.environ['CTAKES_ROOT'], target_dir)
+  maxlen = pickle.load(open(os.path.join(model_dir, "maxlen.p"), "rb"))
+  word2int = pickle.load(open(os.path.join(model_dir, "word2int.p"), "rb"))
+  tag2int = pickle.load(open(os.path.join(model_dir, "tag2int.p"), "rb"))
+  label2int = pickle.load(open(os.path.join(model_dir, "label2int.p"), "rb"))
+  model = model_from_json(open(os.path.join(model_dir, "model_0.json")).read())
+  model.load_weights(os.path.join(model_dir, "model_0.h5"))
+
+  int2label = {}
+  for label, integer in label2int.items():
+    int2label[integer] = label
+
+  while True:
+    try:
+      line = sys.stdin.readline().rstrip()
+      if not line:
+        break
+
+      text, pos = line.strip().split('|')
+
+      tokens = []
+      for token in text.rstrip().split():
+        if token in word2int:
+          tokens.append(word2int[token])
+        else:
+          tokens.append(word2int['oov_word'])
+
+      tags = []
+      for tag in pos.rstrip().split():
+        if tag in tag2int:
+          tags.append(tag2int[tag])
+        else:
+          tags.append(tag2int['oov_tag'])
+
+      if len(tokens) > maxlen:
+        tokens = tokens[0:maxlen]
+      if len(tags) > maxlen:
+        tags = tags[0:maxlen]
+
+      test_x1 = pad_sequences([tokens], maxlen=maxlen)
+      test_x2 = pad_sequences([tags], maxlen=maxlen)
+
+      test_xs = []
+      test_xs.append(test_x1)
+      test_xs.append(test_x1)
+      test_xs.append(test_x1)
+      test_xs.append(test_x1)
+      test_xs.append(test_x2)
+      test_xs.append(test_x2)
+      test_xs.append(test_x2)
+      test_xs.append(test_x2)
+
+      out = model.predict(test_xs, batch_size=50)[0]
+
+    except KeyboardInterrupt:
+      sys.stderr.write("Caught keyboard interrupt\n")
+      break
+
+    if line == '':
+      sys.stderr.write("Encountered empty string so exiting\n")
+      break
+
+    out_str = int2label[out.argmax()]
+    print out_str
+    sys.stdout.flush()
+
+  sys.exit(0)
+
+if __name__ == "__main__":
+  main(sys.argv[1:])
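
For context, the eight entries in test_xs mirror the eight CNN branches
assembled in hybrid_train.py below: the first four (filter widths 2 through
5) consume the padded token sequence, the last four the padded POS tag
sequence. A shorter equivalent of the block above, shown only as a sketch:

    # four word branches followed by four pos tag branches
    test_xs = [test_x1] * 4 + [test_x2] * 4
    out = model.predict(test_xs, batch_size=50)[0] # softmax over classes
    print int2label[out.argmax()]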
Added: ctakes/trunk/ctakes-temporal/scripts/nn/hybrid_train.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/hybrid_train.py?rev=1762100&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/hybrid_train.py (added)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/hybrid_train.py Sat Sep 24 00:58:14 2016
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+
+import sklearn as sk
+import numpy as np
+np.random.seed(1337)
+import et_cleartk_io as ctk_io
+import nn_models
+import sys
+import os.path
+import dataset_hybrid
+import keras as k
+from keras.utils.np_utils import to_categorical
+from keras.optimizers import RMSprop
+from keras.preprocessing.sequence import pad_sequences
+from keras.models import Sequential
+from keras.layers import Merge
+from keras.layers.core import Dense, Dropout, Activation, Flatten
+from keras.layers.convolutional import Convolution1D, MaxPooling1D
+from keras.layers.embeddings import Embedding
+import pickle
+
+def main(args):
+  if len(args) < 1:
+    sys.stderr.write("Error - one required argument: <data directory>\n")
+    sys.exit(-1)
+  working_dir = args[0]
+  data_file = os.path.join(working_dir, 'training-data.liblinear')
+
+  # learn alphabet from training data
+  provider = dataset_hybrid.DatasetProvider(data_file)
+  # now load training examples and labels
+  train_x1, train_x2, train_y = provider.load(data_file)
+  # turn x and y into numpy array among other things
+  maxlen = max([len(seq) for seq in train_x1])
+  classes = len(set(train_y))
+
+  train_x1 = pad_sequences(train_x1, maxlen=maxlen)
+  train_x2 = pad_sequences(train_x2, maxlen=maxlen)
+  train_y = to_categorical(np.array(train_y), classes)
+
+  pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'), "wb"))
+  pickle.dump(provider.word2int, open(os.path.join(working_dir, 'word2int.p'), "wb"))
+  pickle.dump(provider.tag2int, open(os.path.join(working_dir, 'tag2int.p'), "wb"))
+  pickle.dump(provider.label2int, open(os.path.join(working_dir, 'label2int.p'), "wb"))
+
+  print 'train_x1 shape:', train_x1.shape
+  print 'train_x2 shape:', train_x2.shape
+  print 'train_y shape:', train_y.shape
+
+  branches = [] # models to be merged
+  train_xs = [] # train x for each branch
+
+  for filter_len in '2,3,4,5'.split(','):
+    branch = Sequential()
+    branch.add(Embedding(len(provider.word2int),
+                         300,
+                         input_length=maxlen,
+                         weights=None))
+    branch.add(Convolution1D(nb_filter=200,
+                             filter_length=int(filter_len),
+                             border_mode='valid',
+                             activation='relu',
+                             subsample_length=1))
+    branch.add(MaxPooling1D(pool_length=2))
+    branch.add(Flatten())
+
+    branches.append(branch)
+    train_xs.append(train_x1)
+
+  for filter_len in '2,3,4,5'.split(','):
+    branch = Sequential()
+    branch.add(Embedding(len(provider.tag2int),
+                         300,
+                         input_length=maxlen,
+                         weights=None))
+    branch.add(Convolution1D(nb_filter=200,
+                             filter_length=int(filter_len),
+                             border_mode='valid',
+                             activation='relu',
+                             subsample_length=1))
+    branch.add(MaxPooling1D(pool_length=2))
+    branch.add(Flatten())
+
+    branches.append(branch)
+    train_xs.append(train_x2)
+
+  model = Sequential()
+  model.add(Merge(branches, mode='concat'))
+
+  model.add(Dense(300))
+  model.add(Dropout(0.25))
+  model.add(Activation('relu'))
+
+  model.add(Dropout(0.25))
+  model.add(Dense(classes))
+  model.add(Activation('softmax'))
+
+  optimizer = RMSprop(lr=0.0001, rho=0.9, epsilon=1e-08)
+  model.compile(loss='categorical_crossentropy',
+                optimizer=optimizer,
+                metrics=['accuracy'])
+  model.fit(train_xs,
+            train_y,
+            nb_epoch=3,
+            batch_size=50,
+            verbose=1,
+            validation_split=0.1)
+
+  json_string = model.to_json()
+  open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
+  model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)
+  sys.exit(0)
+
+if __name__ == "__main__":
+  main(sys.argv[1:])
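
For context, each branch maps a length-maxlen index sequence to a fixed-size
feature vector: a border_mode='valid' Convolution1D with filter length f
leaves maxlen - f + 1 timesteps, MaxPooling1D with pool_length=2 roughly
halves that, and Flatten yields nb_filter times the pooled length. A sketch
of the arithmetic behind the width of the Merge(mode='concat') input to
Dense(300), assuming Keras 1 layer semantics (the maxlen value is invented):

    def branch_features(maxlen, filter_len, nb_filter=200, pool_len=2):
      conv_steps = maxlen - filter_len + 1             # border_mode='valid'
      pooled = (conv_steps - pool_len) // pool_len + 1 # MaxPooling1D
      return nb_filter * pooled                        # after Flatten()

    # e.g. for maxlen=30: four word branches plus four tag branches
    merged_width = 2 * sum(branch_features(30, f) for f in (2, 3, 4, 5))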
Modified: ctakes/trunk/ctakes-temporal/scripts/nn/train.sh
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/train.sh?rev=1762100&r1=1762099&r2=1762100&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/train.sh (original)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/train.sh Sat Sep 24 00:58:14 2016
@@ -1,7 +1,7 @@
#!/bin/bash
source $(dirname $0)/env/bin/activate
-python $(dirname $0)/cnn_train.py $*
+python $(dirname $0)/hybrid_train.py $*
ret=$?
deactivate
exit $ret