You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by dl...@apache.org on 2016/05/03 17:54:18 UTC

svn commit: r1742133 - in /ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers: DataForWord2Vec.java EventPrinter.java GoldEventPrinter.java SentencePrinter.java TokenPreprocForWord2Vec.java

Author: dligach
Date: Tue May  3 15:54:18 2016
New Revision: 1742133

URL: http://svn.apache.org/viewvc?rev=1742133&view=rev
Log:
Made sure the BaseToken->word mapping is consistent with the word2vec preprocessing (both now go through TokenPreprocForWord2Vec.tokenToString).

Added:
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/GoldEventPrinter.java
      - copied, changed from r1742132, ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java
      - copied, changed from r1742132, ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/EventPrinter.java
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/TokenPreprocForWord2Vec.java
Removed:
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/EventPrinter.java
Modified:
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/DataForWord2Vec.java

Modified: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/DataForWord2Vec.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/DataForWord2Vec.java?rev=1742133&r1=1742132&r2=1742133&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/DataForWord2Vec.java (original)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/DataForWord2Vec.java Tue May  3 15:54:18 2016
@@ -45,76 +45,41 @@ import com.lexicalscope.jewel.cli.Option
  */
 public class DataForWord2Vec {
 
-	static interface Options {
+  static interface Options {
 
-		@Option(
-				longName = "xmi-dir",
-				description = "path to xmi files")
-		public File getInputDirectory();
-	}
-
-	public static void main(String[] args) throws Exception {
-
-		Options options = CliFactory.parseArguments(Options.class, args);
-		CollectionReader collectionReader = Utils.getCollectionReader(options.getInputDirectory());
-		AnalysisEngine annotationConsumer = AnalysisEngineFactory.createEngine(AnnotationPrinter.class);
-		SimplePipeline.runPipeline(collectionReader, annotationConsumer);
-	}
-
-	public static class AnnotationPrinter extends JCasAnnotator_ImplBase {
-
-		@Override
-		public void process(JCas jCas) throws AnalysisEngineProcessException {
-
-			JCas systemView;
-			try {
-				systemView = jCas.getView("_InitialView");
-			} catch (CASException e) {
-				throw new AnalysisEngineProcessException(e);
-			}
-
-		  for(BaseToken token : JCasUtil.select(systemView, BaseToken.class)) { 
-		    String stringValue = tokenToString(token);
-		    if(stringValue != null) {
-		      System.out.print(stringValue + " ");
-		    }
-		  }
-		}
-		
-		/**
-		 * Determine what to print based on the token's type.
-		 */
-		private String tokenToString(BaseToken token) {
-		  
-		  String stringValue;
-		  String tokenType = token.getClass().getSimpleName();
-		  String tokenText = token.getCoveredText().toLowerCase();
-
-		  switch(tokenType) {
-		    case "ContractionToken":
-		      stringValue = tokenText;
-		      break;
-		    case "NewlineToken":
-		      stringValue = null;
-		      break;
-		    case "NumToken":
-		      stringValue = "number_token";
-		      break;
-		    case "PunctuationToken":
-		      stringValue = tokenText;
-		      break;
-		    case "SymbolToken":
-		      stringValue = tokenText;
-		      break;
-		    case "WordToken":
-		      stringValue = tokenText;
-		      break;
-		    default:
-		      throw new IllegalArgumentException("Invalid token type: " + tokenType);
-		  }
-		  
-		  return stringValue;
-		}
-	}
+    @Option(
+        longName = "xmi-dir",
+        description = "path to xmi files")
+    public File getInputDirectory();
+  }
+
+  public static void main(String[] args) throws Exception {
+
+    Options options = CliFactory.parseArguments(Options.class, args);
+    CollectionReader collectionReader = Utils.getCollectionReader(options.getInputDirectory());
+    AnalysisEngine annotationConsumer = AnalysisEngineFactory.createEngine(AnnotationPrinter.class);
+    SimplePipeline.runPipeline(collectionReader, annotationConsumer);
+  }
+
+  public static class AnnotationPrinter extends JCasAnnotator_ImplBase {
+
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+
+      JCas systemView;
+      try {
+        systemView = jCas.getView("_InitialView");
+      } catch (CASException e) {
+        throw new AnalysisEngineProcessException(e);
+      }
+
+      for(BaseToken token : JCasUtil.select(systemView, BaseToken.class)) { 
+        String stringValue = TokenPreprocForWord2Vec.tokenToString(token);
+        if(stringValue != null) {
+          System.out.print(stringValue + " ");
+        }
+      }
+    }
+  }
 }
 

Copied: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/GoldEventPrinter.java (from r1742132, ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java)
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/GoldEventPrinter.java?p2=ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/GoldEventPrinter.java&p1=ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java&r1=1742132&r2=1742133&rev=1742133&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java (original)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/GoldEventPrinter.java Tue May  3 15:54:18 2016
@@ -43,7 +43,7 @@ import com.lexicalscope.jewel.cli.Option
  *  
  * @author dmitriy dligach
  */
-public class SentencePrinter {
+public class GoldEventPrinter {
 
   static interface Options {
 
@@ -90,10 +90,12 @@ public class SentencePrinter {
       for(Sentence sentence : JCasUtil.select(systemView, Sentence.class)) {
         for(BaseToken baseToken : JCasUtil.selectCovered(systemView, BaseToken.class, sentence)) {
           List<EventMention> events = JCasUtil.selectCovering(goldView, EventMention.class, baseToken.getBegin(), baseToken.getEnd());
+          String tokenText = TokenPreprocForWord2Vec.tokenToString(baseToken);
           if(events.size() > 0) {
-            System.out.println("[" + baseToken.getCoveredText() + "] ");
+            System.out.print("[" + tokenText + "] ");
+          } else {
+            System.out.print(tokenText + " ");
           }
-          System.out.print(baseToken.getCoveredText() + " ");
         }
         System.out.println();
       }

Copied: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java (from r1742132, ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/EventPrinter.java)
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java?p2=ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java&p1=ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/EventPrinter.java&r1=1742132&r2=1742133&rev=1742133&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/EventPrinter.java (original)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java Tue May  3 15:54:18 2016
@@ -39,7 +39,7 @@ import com.lexicalscope.jewel.cli.Option
  *  
  * @author dmitriy dligach
  */
-public class EventPrinter {
+public class SentencePrinter {
   
   static interface Options {
 

Added: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/TokenPreprocForWord2Vec.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/TokenPreprocForWord2Vec.java?rev=1742133&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/TokenPreprocForWord2Vec.java (added)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/TokenPreprocForWord2Vec.java Tue May  3 15:54:18 2016
@@ -0,0 +1,41 @@
+package org.apache.ctakes.consumers;
+
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+
+public class TokenPreprocForWord2Vec {
+
+  /**
+   * Determine what to print based on the token's type.
+   */
+  public static String tokenToString(BaseToken token) {
+
+    String stringValue;
+    String tokenType = token.getClass().getSimpleName();
+    String tokenText = token.getCoveredText().toLowerCase();
+
+    switch(tokenType) {
+    case "ContractionToken":
+      stringValue = tokenText;
+      break;
+    case "NewlineToken":
+      stringValue = null;
+      break;
+    case "NumToken":
+      stringValue = "number_token";
+      break;
+    case "PunctuationToken":
+      stringValue = tokenText;
+      break;
+    case "SymbolToken":
+      stringValue = tokenText;
+      break;
+    case "WordToken":
+      stringValue = tokenText;
+      break;
+    default:
+      throw new IllegalArgumentException("Invalid token type: " + tokenType);
+    }
+
+    return stringValue;
+  }
+}