You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by dl...@apache.org on 2016/09/21 17:41:44 UTC
svn commit: r1761788 - in /ctakes/trunk/ctakes-temporal/scripts/nn:
dataset.py dataset.pyc predict.py train_and_package.py
Author: dligach
Date: Wed Sep 21 17:41:44 2016
New Revision: 1761788
URL: http://svn.apache.org/viewvc?rev=1761788&view=rev
Log:
updated to work with my version of dataset.py; also using an env variable to locate the target dir
Modified:
ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py
ctakes/trunk/ctakes-temporal/scripts/nn/dataset.pyc
ctakes/trunk/ctakes-temporal/scripts/nn/predict.py
ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py
Modified: ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py?rev=1761788&r1=1761787&r2=1761788&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py (original)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py Wed Sep 21 17:41:44 2016
@@ -1,154 +1,77 @@
#!/usr/bin/env python
import numpy as np
-
import sys
sys.dont_write_bytecode = True
-
import ConfigParser
-
import glob, string, collections, operator
-from fnmatch import fnmatch
-
-label2int = {
- 'none':0,
- 'contains':1,
- 'contains-1':2
- }
-
-# will have to do this eventually
-# label2int = {
-# 'none': 0,
-# 'contains': 1,
-# 'contains-1': 2,
-# 'before': 3,
-# 'before-1': 4,
-# 'begins-on': 5,
-# 'begins-on-1': 6,
-# 'ends-on': 7,
-# 'ends-on-1': 8,
-# 'overlap': 9,
-# 'overlap-1': 10,
-# }
-
class DatasetProvider:
"""THYME relation data"""
- def __init__(self, file_names):
- """Index words by frequency in a list of files"""
-
- self.alphabet = {} # words indexed by frequency
-
- unigrams = [] # read entire corpus into a list
- for file_name in file_names:
- for line in open(file_name):
- label, text = line.strip().split('|')
- unigrams.extend(text.split())
+ def __init__(self, path):
+ """Index words by frequency in a file"""
+ self.word2int = {} # words indexed by frequency
+ self.label2int = {} # class to int mapping
+
+ unigrams = [] # corpus as list
+ labels = [] # classes as list
+ for line in open(path):
+ label, text = line.strip().split('|')
+ unigrams.extend(text.split())
+ labels.append(label)
+
index = 1 # zero used to encode unknown words
+ self.word2int['oov_word'] = 0
unigram_counts = collections.Counter(unigrams)
- self.alphabet['oov_word'] = 0
for unigram, count in unigram_counts.most_common():
- self.alphabet[unigram] = index
+ self.word2int[unigram] = index
+ index = index + 1
+
+ index = 0 # index classes
+ for label in set(labels):
+ self.label2int[label] = index
index = index + 1
- def load(self, path):
+ def load(self, path, maxlen=float('inf')):
"""Convert sentences (examples) into lists of indices"""
examples = []
labels = []
+
for line in open(path):
label, text = line.strip().split('|')
example = []
for unigram in text.split():
- example.append(self.alphabet[unigram])
- examples.append(example)
- labels.append(label2int[label])
-
- return examples, labels
+ if unigram in self.word2int:
+ example.append(self.word2int[unigram])
+ else:
+ example.append(self.word2int['oov_word'])
- def load_if_oov(self, path):
+ # truncate example if it's too long
+ if len(example) > maxlen:
+ example = example[0:maxlen]
- examples = []
- labels = []
- for line in open(path):
- label,text = line.strip().split('|')
- example = []
- for unigram in text.split():
- if(self.alphabet.has_key(unigram)):
- example.append(self.alphabet[unigram])
- else:
- example.append(self.alphabet["none"])
examples.append(example)
- labels.append(label2int[label])
+ labels.append(self.label2int[label])
return examples, labels
- def load_by_region(self, path):
- pres = []
- arg1s = []
- conts = []
- arg2s = []
- posts = []
- labels = []
- for line in open(path):
- label,text = line.strip().split('|')
- pre,arg1,cont,arg2,post = self.processText(text)
- pres.append(pre)
- arg1s.append(arg1)
- conts.append(cont)
- arg2s.append(arg2)
- posts.append(post)
- labels.append(label2int[label])
-
- return pres, arg1s, conts, arg2s, posts, labels
-
- def processText(self, text):
- pre= []
- arg1= []
- cont= []
- arg2= []
- post= []
-
- tag = 0
- for unigram in text.split():
- idx = self.alphabet[unigram]
- if( fnmatch(unigram, '<*>')):
- tag = tag + 1
- continue
- if(tag ==0 ):
- pre.append(idx)
- elif(tag == 1):
- arg1.append(idx)
- elif(tag == 2):
- cont.append(idx)
- elif(tag == 3):
- arg2.append(idx)
- elif(tag == 4):
- post.append(idx)
-
- return pre, arg1, cont, arg2, post
-
-
-
if __name__ == "__main__":
cfg = ConfigParser.ConfigParser()
- cfg.read('settings.ini')
-
- dataset = DatasetProvider([cfg.get('data', 'train'),
- cfg.get('data', 'test')])
- print 'alphabet size:', len(dataset.alphabet)
+ cfg.read(sys.argv[1])
- x,y = dataset.load(cfg.get('data', 'test'))
+ dataset = DatasetProvider(cfg.get('data', 'train'))
+ print 'alphabet size:', len(dataset.word2int)
- print 'max seq len:', max([len(s) for s in x])
+ x,y = dataset.load(cfg.get('data', 'train'))
+ print 'train max seq len:', max([len(s) for s in x])
+
+ x,y = dataset.load(cfg.get('data', 'test'), maxlen=10)
print 'number of examples:', len(x)
- print 'number of labels:', len(set(y))
+ print 'test max seq len:', max([len(s) for s in x])
+ print 'labels:', dataset.label2int
print 'label counts:', collections.Counter(y)
print 'first 10 examples:', x[:10]
- print 'class proportions:'
- counter = collections.Counter(y)
- for label in counter:
- print label, counter[label] / float(len(y)), float(len(y)) / counter[label]
Modified: ctakes/trunk/ctakes-temporal/scripts/nn/dataset.pyc
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/dataset.pyc?rev=1761788&r1=1761787&r2=1761788&view=diff
==============================================================================
Binary files - no diff available.
Modified: ctakes/trunk/ctakes-temporal/scripts/nn/predict.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/predict.py?rev=1761788&r1=1761787&r2=1761788&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/predict.py (original)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/predict.py Wed Sep 21 17:41:44 2016
@@ -12,23 +12,20 @@ def main(args):
if len(args) < 1:
sys.stderr.write("Error - one required argument: <model directory>\n")
sys.exit(-1)
-
working_dir = args[0]
- int2label = {
- 0:'none',
- 1:'CONTAINS',
- 2:'CONTAINS-1'
- }
-
- ctakes_root = '/Users/Dima/Loyola/Workspaces/cTakes/ctakes/'
target_dir = 'ctakes-temporal/target/eval/thyme/train_and_test/event-time/'
- model_dir = ctakes_root + target_dir
+ model_dir = os.path.join(os.environ['CTAKES_ROOT'], target_dir)
maxlen = pickle.load(open(os.path.join(model_dir, "maxlen.p"), "rb"))
- alphabet = pickle.load(open(os.path.join(model_dir, "alphabet.p"), "rb"))
+ word2int = pickle.load(open(os.path.join(model_dir, "word2int.p"), "rb"))
+ label2int = pickle.load(open(os.path.join(model_dir, "label2int.p"), "rb"))
model = model_from_json(open(os.path.join(model_dir, "model_0.json")).read())
model.load_weights(os.path.join(model_dir, "model_0.h5"))
+ int2label = {}
+ for label, integer in label2int.items():
+ int2label[integer] = label
+
while True:
try:
line = sys.stdin.readline().rstrip()
@@ -37,10 +34,10 @@ def main(args):
feats=[]
for unigram in line.rstrip().split():
- if(alphabet.has_key(unigram)):
- feats.append(alphabet[unigram])
+ if(word2int.has_key(unigram)):
+ feats.append(word2int[unigram])
else:
- feats.append(alphabet["none"])
+ feats.append(word2int["none"])
if(len(feats) > maxlen):
feats=feats[0:maxlen]
Modified: ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py?rev=1761788&r1=1761787&r2=1761788&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py (original)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py Wed Sep 21 17:41:44 2016
@@ -23,14 +23,13 @@ def main(args):
if len(args) < 1:
sys.stderr.write("Error - one required argument: <data directory>\n")
sys.exit(-1)
-
working_dir = args[0]
data_file = os.path.join(working_dir, 'training-data.liblinear')
# learn alphabet from training data
- data_set = dataset.DatasetProvider([data_file])
+ provider = dataset.DatasetProvider(data_file)
# now load training examples and labels
- train_x, train_y = data_set.load(data_file)
+ train_x, train_y = provider.load(data_file)
# turn x and y into numpy array among other things
maxlen = max([len(seq) for seq in train_x])
outcomes = set(train_y)
@@ -40,7 +39,8 @@ def main(args):
train_y = to_categorical(np.array(train_y), classes)
pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'),"wb"))
- pickle.dump(data_set.alphabet, open(os.path.join(working_dir, 'alphabet.p'),"wb"))
+ pickle.dump(provider.word2int, open(os.path.join(working_dir, 'word2int.p'),"wb"))
+ pickle.dump(provider.label2int, open(os.path.join(working_dir, 'label2int.p'),"wb"))
print 'train_x shape:', train_x.shape
print 'train_y shape:', train_y.shape
@@ -51,7 +51,7 @@ def main(args):
for filter_len in '2,3,4,5'.split(','):
branch = Sequential()
- branch.add(Embedding(len(data_set.alphabet),
+ branch.add(Embedding(len(provider.word2int),
300,
input_length=maxlen,
weights=None))