You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by dl...@apache.org on 2016/05/03 17:54:18 UTC
svn commit: r1742133 - in
/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers:
DataForWord2Vec.java EventPrinter.java GoldEventPrinter.java
SentencePrinter.java TokenPreprocForWord2Vec.java
Author: dligach
Date: Tue May 3 15:54:18 2016
New Revision: 1742133
URL: http://svn.apache.org/viewvc?rev=1742133&view=rev
Log:
Made sure the BaseToken->word mapping is consistent with the word2vec preprocessing.
Added:
ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/GoldEventPrinter.java
- copied, changed from r1742132, ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java
ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java
- copied, changed from r1742132, ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/EventPrinter.java
ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/TokenPreprocForWord2Vec.java
Removed:
ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/EventPrinter.java
Modified:
ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/DataForWord2Vec.java
Modified: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/DataForWord2Vec.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/DataForWord2Vec.java?rev=1742133&r1=1742132&r2=1742133&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/DataForWord2Vec.java (original)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/DataForWord2Vec.java Tue May 3 15:54:18 2016
@@ -45,76 +45,41 @@ import com.lexicalscope.jewel.cli.Option
*/
public class DataForWord2Vec {
- static interface Options {
+ static interface Options {
- @Option(
- longName = "xmi-dir",
- description = "path to xmi files")
- public File getInputDirectory();
- }
-
- public static void main(String[] args) throws Exception {
-
- Options options = CliFactory.parseArguments(Options.class, args);
- CollectionReader collectionReader = Utils.getCollectionReader(options.getInputDirectory());
- AnalysisEngine annotationConsumer = AnalysisEngineFactory.createEngine(AnnotationPrinter.class);
- SimplePipeline.runPipeline(collectionReader, annotationConsumer);
- }
-
- public static class AnnotationPrinter extends JCasAnnotator_ImplBase {
-
- @Override
- public void process(JCas jCas) throws AnalysisEngineProcessException {
-
- JCas systemView;
- try {
- systemView = jCas.getView("_InitialView");
- } catch (CASException e) {
- throw new AnalysisEngineProcessException(e);
- }
-
- for(BaseToken token : JCasUtil.select(systemView, BaseToken.class)) {
- String stringValue = tokenToString(token);
- if(stringValue != null) {
- System.out.print(stringValue + " ");
- }
- }
- }
-
- /**
- * Determine what to print based on the token's type.
- */
- private String tokenToString(BaseToken token) {
-
- String stringValue;
- String tokenType = token.getClass().getSimpleName();
- String tokenText = token.getCoveredText().toLowerCase();
-
- switch(tokenType) {
- case "ContractionToken":
- stringValue = tokenText;
- break;
- case "NewlineToken":
- stringValue = null;
- break;
- case "NumToken":
- stringValue = "number_token";
- break;
- case "PunctuationToken":
- stringValue = tokenText;
- break;
- case "SymbolToken":
- stringValue = tokenText;
- break;
- case "WordToken":
- stringValue = tokenText;
- break;
- default:
- throw new IllegalArgumentException("Invalid token type: " + tokenType);
- }
-
- return stringValue;
- }
- }
+ @Option(
+ longName = "xmi-dir",
+ description = "path to xmi files")
+ public File getInputDirectory();
+ }
+
+ public static void main(String[] args) throws Exception {
+
+ Options options = CliFactory.parseArguments(Options.class, args);
+ CollectionReader collectionReader = Utils.getCollectionReader(options.getInputDirectory());
+ AnalysisEngine annotationConsumer = AnalysisEngineFactory.createEngine(AnnotationPrinter.class);
+ SimplePipeline.runPipeline(collectionReader, annotationConsumer);
+ }
+
+ public static class AnnotationPrinter extends JCasAnnotator_ImplBase {
+
+ @Override
+ public void process(JCas jCas) throws AnalysisEngineProcessException {
+
+ JCas systemView;
+ try {
+ systemView = jCas.getView("_InitialView");
+ } catch (CASException e) {
+ throw new AnalysisEngineProcessException(e);
+ }
+
+ for(BaseToken token : JCasUtil.select(systemView, BaseToken.class)) {
+ String stringValue = TokenPreprocForWord2Vec.tokenToString(token);
+ if(stringValue != null) {
+ System.out.print(stringValue + " ");
+ }
+ }
+ }
+ }
}
Copied: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/GoldEventPrinter.java (from r1742132, ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java)
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/GoldEventPrinter.java?p2=ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/GoldEventPrinter.java&p1=ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java&r1=1742132&r2=1742133&rev=1742133&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java (original)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/GoldEventPrinter.java Tue May 3 15:54:18 2016
@@ -43,7 +43,7 @@ import com.lexicalscope.jewel.cli.Option
*
* @author dmitriy dligach
*/
-public class SentencePrinter {
+public class GoldEventPrinter {
static interface Options {
@@ -90,10 +90,12 @@ public class SentencePrinter {
for(Sentence sentence : JCasUtil.select(systemView, Sentence.class)) {
for(BaseToken baseToken : JCasUtil.selectCovered(systemView, BaseToken.class, sentence)) {
List<EventMention> events = JCasUtil.selectCovering(goldView, EventMention.class, baseToken.getBegin(), baseToken.getEnd());
+ String tokenText = TokenPreprocForWord2Vec.tokenToString(baseToken);
if(events.size() > 0) {
- System.out.println("[" + baseToken.getCoveredText() + "] ");
+ System.out.print("[" + tokenText + "] ");
+ } else {
+ System.out.print(tokenText + " ");
}
- System.out.print(baseToken.getCoveredText() + " ");
}
System.out.println();
}
Copied: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java (from r1742132, ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/EventPrinter.java)
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java?p2=ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java&p1=ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/EventPrinter.java&r1=1742132&r2=1742133&rev=1742133&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/EventPrinter.java (original)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java Tue May 3 15:54:18 2016
@@ -39,7 +39,7 @@ import com.lexicalscope.jewel.cli.Option
*
* @author dmitriy dligach
*/
-public class EventPrinter {
+public class SentencePrinter {
static interface Options {
Added: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/TokenPreprocForWord2Vec.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/TokenPreprocForWord2Vec.java?rev=1742133&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/TokenPreprocForWord2Vec.java (added)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/TokenPreprocForWord2Vec.java Tue May 3 15:54:18 2016
@@ -0,0 +1,41 @@
+package org.apache.ctakes.consumers;
+
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+
+public class TokenPreprocForWord2Vec {
+
+ /**
+ * Determine what to print based on the token's type.
+ */
+ public static String tokenToString(BaseToken token) {
+
+ String stringValue;
+ String tokenType = token.getClass().getSimpleName();
+ String tokenText = token.getCoveredText().toLowerCase();
+
+ switch(tokenType) {
+ case "ContractionToken":
+ stringValue = tokenText;
+ break;
+ case "NewlineToken":
+ stringValue = null;
+ break;
+ case "NumToken":
+ stringValue = "number_token";
+ break;
+ case "PunctuationToken":
+ stringValue = tokenText;
+ break;
+ case "SymbolToken":
+ stringValue = tokenText;
+ break;
+ case "WordToken":
+ stringValue = tokenText;
+ break;
+ default:
+ throw new IllegalArgumentException("Invalid token type: " + tokenType);
+ }
+
+ return stringValue;
+ }
+}