Posted to commits@ctakes.apache.org by dl...@apache.org on 2016/09/21 17:41:44 UTC

svn commit: r1761788 - in /ctakes/trunk/ctakes-temporal/scripts/nn: dataset.py dataset.pyc predict.py train_and_package.py

Author: dligach
Date: Wed Sep 21 17:41:44 2016
New Revision: 1761788

URL: http://svn.apache.org/viewvc?rev=1761788&view=rev
Log:
updated to work with my version of dataset.py; also using an environment variable to locate the target dir

Modified:
    ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py
    ctakes/trunk/ctakes-temporal/scripts/nn/dataset.pyc
    ctakes/trunk/ctakes-temporal/scripts/nn/predict.py
    ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py
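
The "env variable" in the log is CTAKES_ROOT, which predict.py below now reads in place of a hard-coded path. A minimal sketch of that lookup; the guard is added here only for illustration, the committed script assumes the variable is already set:

    import os, sys

    if 'CTAKES_ROOT' not in os.environ:
        sys.stderr.write('please export CTAKES_ROOT (path to the cTAKES checkout)\n')
        sys.exit(-1)

    target_dir = 'ctakes-temporal/target/eval/thyme/train_and_test/event-time/'
    model_dir = os.path.join(os.environ['CTAKES_ROOT'], target_dir)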

Modified: ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py?rev=1761788&r1=1761787&r2=1761788&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py (original)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py Wed Sep 21 17:41:44 2016
@@ -1,154 +1,77 @@
 #!/usr/bin/env python
 
 import numpy as np
-
 import sys
 sys.dont_write_bytecode = True
-
 import ConfigParser
-
 import glob, string, collections, operator
 
-from fnmatch import fnmatch
-
-label2int = {
-  'none':0,
-  'contains':1,
-  'contains-1':2
-  }
-
-# will have to do this eventually
-# label2int = {
-#   'none': 0,
-#   'contains': 1,
-#   'contains-1': 2,
-#   'before': 3,
-#   'before-1': 4,
-#   'begins-on': 5,
-#   'begins-on-1': 6,
-#   'ends-on': 7,
-#   'ends-on-1': 8,
-#   'overlap': 9,
-#   'overlap-1': 10,
-# }
-
 class DatasetProvider:
   """THYME relation data"""
   
-  def __init__(self, file_names):
-    """Index words by frequency in a list of files"""
-
-    self.alphabet = {} # words indexed by frequency
-
-    unigrams = [] # read entire corpus into a list
-    for file_name in file_names:
-      for line in open(file_name):
-        label, text = line.strip().split('|')
-        unigrams.extend(text.split())
+  def __init__(self, path):
+    """Index words by frequency in a file"""
 
+    self.word2int = {} # words indexed by frequency
+    self.label2int = {}   # class to int mapping
+    
+    unigrams = [] # corpus as list
+    labels = []   # classes as list
+    for line in open(path):
+      label, text = line.strip().split('|')
+      unigrams.extend(text.split())
+      labels.append(label)
+        
     index = 1 # zero used to encode unknown words
+    self.word2int['oov_word'] = 0
     unigram_counts = collections.Counter(unigrams)
-    self.alphabet['oov_word'] = 0
     for unigram, count in unigram_counts.most_common():
-      self.alphabet[unigram] = index
+      self.word2int[unigram] = index
+      index = index + 1
+
+    index = 0 # index classes
+    for label in set(labels):
+      self.label2int[label] = index
       index = index + 1
 
-  def load(self, path):
+  def load(self, path, maxlen=float('inf')):
     """Convert sentences (examples) into lists of indices"""
 
     examples = []
     labels = []
+
     for line in open(path):
       label, text = line.strip().split('|')
       example = []
       for unigram in text.split():
-        example.append(self.alphabet[unigram])
-      examples.append(example)
-      labels.append(label2int[label])
-
-    return examples, labels
+        if unigram in self.word2int:
+          example.append(self.word2int[unigram])
+        else:
+          example.append(self.word2int['oov_word'])
 
-  def load_if_oov(self, path):
+      # truncate example if it's too long
+      if len(example) > maxlen:
+        example = example[0:maxlen]
 
-    examples = []
-    labels = []
-    for line in open(path):
-      label,text = line.strip().split('|')
-      example = []
-      for unigram in text.split():
-        if(self.alphabet.has_key(unigram)):
-            example.append(self.alphabet[unigram])
-        else:
-            example.append(self.alphabet["none"])
       examples.append(example)
-      labels.append(label2int[label])
+      labels.append(self.label2int[label])
 
     return examples, labels
 
-  def load_by_region(self, path):
-    pres = []
-    arg1s = []
-    conts = []
-    arg2s = []
-    posts = []
-    labels = []
-    for line in open(path):
-      label,text = line.strip().split('|')
-      pre,arg1,cont,arg2,post = self.processText(text)
-      pres.append(pre)
-      arg1s.append(arg1)
-      conts.append(cont)
-      arg2s.append(arg2)
-      posts.append(post)
-      labels.append(label2int[label])
-
-    return pres, arg1s, conts, arg2s, posts, labels
-
-  def processText(self, text):
-    pre= []
-    arg1= []
-    cont= []
-    arg2= []
-    post= []
-
-    tag = 0
-    for unigram in text.split():
-      idx = self.alphabet[unigram]
-      if( fnmatch(unigram, '<*>')):
-        tag = tag + 1
-        continue
-      if(tag ==0 ):
-        pre.append(idx)
-      elif(tag == 1):
-        arg1.append(idx)
-      elif(tag == 2):
-        cont.append(idx)
-      elif(tag == 3):
-        arg2.append(idx)
-      elif(tag == 4):
-        post.append(idx)
-
-    return pre, arg1, cont, arg2, post
-
-
-
 if __name__ == "__main__":
 
   cfg = ConfigParser.ConfigParser()
-  cfg.read('settings.ini')
-
-  dataset = DatasetProvider([cfg.get('data', 'train'),
-                             cfg.get('data', 'test')])
-  print 'alphabet size:', len(dataset.alphabet)
+  cfg.read(sys.argv[1])
 
-  x,y = dataset.load(cfg.get('data', 'test'))
+  dataset = DatasetProvider(cfg.get('data', 'train'))
+  print 'alphabet size:', len(dataset.word2int)
 
-  print 'max seq len:', max([len(s) for s in x])
+  x,y = dataset.load(cfg.get('data', 'train'))
+  print 'train max seq len:', max([len(s) for s in x])
+  
+  x,y = dataset.load(cfg.get('data', 'test'), maxlen=10)
   print 'number of examples:', len(x)
-  print 'number of labels:', len(set(y))
+  print 'test max seq len:', max([len(s) for s in x])
+  print 'labels:', dataset.label2int
   print 'label counts:', collections.Counter(y)
   print 'first 10 examples:', x[:10]
-  print 'class proportions:'
-  counter = collections.Counter(y)
-  for label in counter:
-    print label, counter[label] / float(len(y)), float(len(y)) / counter[label]
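
For readers following the rewritten API: DatasetProvider now takes a single file, builds both word2int and label2int from it, and load() accepts an optional maxlen for truncation. A minimal usage sketch in the same Python 2 style as the scripts; the file paths are illustrative placeholders, not paths from this commit:

    import collections
    import dataset

    # index words and labels from the training split only
    provider = dataset.DatasetProvider('/path/to/training-data.liblinear')

    # training examples are loaded at full length
    train_x, train_y = provider.load('/path/to/training-data.liblinear')
    maxlen = max([len(seq) for seq in train_x])

    # test examples fall back to index 0 ('oov_word') for unseen words
    # and are truncated to the training maximum
    test_x, test_y = provider.load('/path/to/test-data.liblinear', maxlen=maxlen)

    print 'vocabulary size:', len(provider.word2int)
    print 'label mapping:', provider.label2int
    print 'test label counts:', collections.Counter(test_y)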

Modified: ctakes/trunk/ctakes-temporal/scripts/nn/dataset.pyc
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/dataset.pyc?rev=1761788&r1=1761787&r2=1761788&view=diff
==============================================================================
Binary files - no diff available.

Modified: ctakes/trunk/ctakes-temporal/scripts/nn/predict.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/predict.py?rev=1761788&r1=1761787&r2=1761788&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/predict.py (original)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/predict.py Wed Sep 21 17:41:44 2016
@@ -12,23 +12,20 @@ def main(args):
     if len(args) < 1:
         sys.stderr.write("Error - one required argument: <model directory>\n")
         sys.exit(-1)
-
     working_dir = args[0]
 
-    int2label = {
-        0:'none',
-        1:'CONTAINS',
-        2:'CONTAINS-1'
-    }
-
-    ctakes_root = '/Users/Dima/Loyola/Workspaces/cTakes/ctakes/'
     target_dir = 'ctakes-temporal/target/eval/thyme/train_and_test/event-time/'
-    model_dir = ctakes_root + target_dir
+    model_dir = os.path.join(os.environ['CTAKES_ROOT'], target_dir)
     maxlen   = pickle.load(open(os.path.join(model_dir, "maxlen.p"), "rb"))
-    alphabet = pickle.load(open(os.path.join(model_dir, "alphabet.p"), "rb"))
+    word2int = pickle.load(open(os.path.join(model_dir, "word2int.p"), "rb"))
+    label2int = pickle.load(open(os.path.join(model_dir, "label2int.p"), "rb"))
     model = model_from_json(open(os.path.join(model_dir, "model_0.json")).read())
     model.load_weights(os.path.join(model_dir, "model_0.h5"))
 
+    int2label = {}
+    for label, integer in label2int.items():
+      int2label[integer] = label
+
     while True:
         try:
             line = sys.stdin.readline().rstrip()
@@ -37,10 +34,10 @@ def main(args):
 
             feats=[]
             for unigram in line.rstrip().split():
-                if(alphabet.has_key(unigram)):
-                    feats.append(alphabet[unigram])
+                if(word2int.has_key(unigram)):
+                    feats.append(word2int[unigram])
                 else:
-                    feats.append(alphabet["none"])
+                    feats.append(word2int["none"])
                     
             if(len(feats) > maxlen):
                 feats=feats[0:maxlen]
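
One detail worth keeping in mind when pairing this script with the new dataset.py: the provider reserves index 0 under the key 'oov_word'. Below is a minimal sketch of a lookup written against that mapping, using dict.get as an alternative to has_key; the input line and word entries are made-up stand-ins, and this is a sketch of the intended pairing rather than the code committed above:

    # hypothetical stand-ins for what predict.py reads from stdin and the pickles
    line = 'patient underwent tumor resection yesterday'
    word2int = {'oov_word': 0, 'tumor': 1, 'resection': 2}
    maxlen = 4

    # map each token to its index, falling back to the reserved OOV index (0)
    feats = [word2int.get(unigram, word2int['oov_word']) for unigram in line.split()]

    # clip to the sequence length seen at training time
    if len(feats) > maxlen:
        feats = feats[0:maxlen]

    print feats   # [0, 0, 1, 2]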

Modified: ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py?rev=1761788&r1=1761787&r2=1761788&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py (original)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py Wed Sep 21 17:41:44 2016
@@ -23,14 +23,13 @@ def main(args):
     if len(args) < 1:
         sys.stderr.write("Error - one required argument: <data directory>\n")
         sys.exit(-1)
-        
     working_dir = args[0]
     data_file = os.path.join(working_dir, 'training-data.liblinear')
 
     # learn alphabet from training data
-    data_set = dataset.DatasetProvider([data_file])
+    provider = dataset.DatasetProvider(data_file)
     # now load training examples and labels
-    train_x, train_y = data_set.load(data_file)
+    train_x, train_y = provider.load(data_file)
     # turn x and y into numpy array among other things
     maxlen = max([len(seq) for seq in train_x])
     outcomes = set(train_y)
@@ -40,7 +39,8 @@ def main(args):
     train_y = to_categorical(np.array(train_y), classes)
 
     pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'),"wb"))
-    pickle.dump(data_set.alphabet, open(os.path.join(working_dir, 'alphabet.p'),"wb"))
+    pickle.dump(provider.word2int, open(os.path.join(working_dir, 'word2int.p'),"wb"))
+    pickle.dump(provider.label2int, open(os.path.join(working_dir, 'label2int.p'),"wb"))
 
     print 'train_x shape:', train_x.shape
     print 'train_y shape:', train_y.shape
@@ -51,7 +51,7 @@ def main(args):
     for filter_len in '2,3,4,5'.split(','):
       
         branch = Sequential()
-        branch.add(Embedding(len(data_set.alphabet),
+        branch.add(Embedding(len(provider.word2int),
                              300,
                              input_length=maxlen,
                              weights=None))
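
Taken together, train_and_package.py now persists both mappings and predict.py restores them and inverts the label side. A compact sketch of that round trip with toy dictionaries; the word entries are made up, while the label set matches the one the old hard-coded table in dataset.py used:

    import pickle

    # toy mappings standing in for what DatasetProvider builds at training time
    word2int = {'oov_word': 0, 'tumor': 1, 'resection': 2}
    label2int = {'none': 0, 'contains': 1, 'contains-1': 2}

    # write side (train_and_package.py): persist both dictionaries with the model
    pickle.dump(word2int, open('word2int.p', 'wb'))
    pickle.dump(label2int, open('label2int.p', 'wb'))

    # read side (predict.py): restore them and invert the label mapping
    restored = pickle.load(open('label2int.p', 'rb'))
    int2label = dict((i, label) for label, i in restored.items())
    print int2label   # e.g. {0: 'none', 1: 'contains', 2: 'contains-1'}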