You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/01/19 16:27:57 UTC

svn commit: r1060835 - in /incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat: AbstractDocumentCategorizer.java DocumentCategorizer.java LanguageDetector.java

Author: joern
Date: Wed Jan 19 15:27:57 2011
New Revision: 1060835

URL: http://svn.apache.org/viewvc?rev=1060835&view=rev
Log:
OpenNLP-51 Extended the integration with an AE which can set the doccat category label as language.

Added:
    incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java   (with props)
    incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/LanguageDetector.java   (with props)
Modified:
    incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/DocumentCategorizer.java

Added: incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java?rev=1060835&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java (added)
+++ incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java Wed Jan 19 15:27:57 2011
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.uima.doccat;
+
+import opennlp.tools.doccat.DoccatModel;
+import opennlp.tools.doccat.DocumentCategorizerME;
+import opennlp.uima.util.AnnotatorUtil;
+import opennlp.uima.util.UimaUtil;
+
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.resource.ResourceAccessException;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.Level;
+import org.apache.uima.util.Logger;
+
+/**
+ * Base annotator: initialize() builds a DocumentCategorizerME from the model resource and
+ * process() categorizes the CAS document text; subclasses store the label via setBestCategory.
+ */
+abstract class AbstractDocumentCategorizer extends CasAnnotator_ImplBase {
+  // UIMA context saved by initialize(); typeSystemInit() reads its parameters from it.
+  private UimaContext context;
+
+  private Logger mLogger;
+  // Fully qualified because this package declares its own DocumentCategorizer class.
+  private opennlp.tools.doccat.DocumentCategorizer mCategorizer;
+  // NOTE(review): named "token" type but resolved from SENTENCE_TYPE_PARAMETER - confirm intent.
+  private Type mTokenType;
+  /** Saves the context and logger, then loads the doccat model resource into a categorizer. */
+  public void initialize(UimaContext context)
+      throws ResourceInitializationException {
+
+    super.initialize(context);
+
+    this.context = context;
+
+    mLogger = context.getLogger();
+
+    if (mLogger.isLoggable(Level.INFO)) {
+      mLogger.log(Level.INFO, "Initializing the OpenNLP Categorizer.");
+    }
+
+    DoccatModel model;
+    // A ResourceAccessException from the lookup is rethrown as ResourceInitializationException.
+    try {
+      DoccatModelResource modelResource = (DoccatModelResource) context
+          .getResourceObject(UimaUtil.MODEL_PARAMETER);
+      // NOTE(review): dereferenced without a null check - confirm the lookup cannot return null.
+      model = modelResource.getModel();
+    } catch (ResourceAccessException e) {
+      throw new ResourceInitializationException(e);
+    }
+
+    mCategorizer = new DocumentCategorizerME(model);
+  }
+  /** Looks up the type configured under UimaUtil.SENTENCE_TYPE_PARAMETER in the type system. */
+  public void typeSystemInit(TypeSystem typeSystem) 
+      throws AnalysisEngineProcessException {
+    mTokenType = AnnotatorUtil.getRequiredTypeParameter(context, typeSystem,
+        UimaUtil.SENTENCE_TYPE_PARAMETER);
+  }
+  /** Hook for subclasses: store the chosen category label in the given CAS. */
+  protected abstract void setBestCategory(CAS cas, String bestCategory);
+  /** Categorizes the document text of the CAS and passes the best category to setBestCategory. */
+  public void process(CAS cas) {
+    
+    double result[];
+    
+    if (mTokenType != null) {
+      // TODO: categorize on tokens instead of the raw document text:
+      // count tokens, create token array,
+      // pass array to doccat,
+      // create result annotation.
+      // Until that is implemented this branch is identical to the else branch below.
+      result = mCategorizer.categorize(cas.getDocumentText());
+    }
+    else {
+      result = mCategorizer.categorize(cas.getDocumentText());
+    }
+    
+    String bestCategory = mCategorizer.getBestCategory(result);
+    
+    setBestCategory(cas, bestCategory);
+  }
+}

Propchange: incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/DocumentCategorizer.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/DocumentCategorizer.java?rev=1060835&r1=1060834&r2=1060835&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/DocumentCategorizer.java (original)
+++ incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/DocumentCategorizer.java Wed Jan 19 15:27:57 2011
@@ -17,13 +17,8 @@
 
 package opennlp.uima.doccat;
 
-import opennlp.tools.doccat.DoccatModel;
-import opennlp.tools.doccat.DocumentCategorizerME;
 import opennlp.uima.util.AnnotatorUtil;
-import opennlp.uima.util.UimaUtil;
 
-import org.apache.uima.UimaContext;
-import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.FSIndex;
@@ -31,94 +26,34 @@ import org.apache.uima.cas.Feature;
 import org.apache.uima.cas.Type;
 import org.apache.uima.cas.TypeSystem;
 import org.apache.uima.cas.text.AnnotationFS;
-import org.apache.uima.resource.ResourceAccessException;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.util.Level;
-import org.apache.uima.util.Logger;
 
 /**
- * OpenNLP NameFinder trainer.
+ * OpenNLP Document Categorizer.
  * 
  * Mandatory parameters:
  */
-public class DocumentCategorizer extends CasAnnotator_ImplBase {
+public class DocumentCategorizer extends AbstractDocumentCategorizer {
   
-  private UimaContext context;
-	
-  private Logger mLogger;
-  
-  private opennlp.tools.doccat.DocumentCategorizer mCategorizer;
-
-  private Type mTokenType;
-
   private Type mCategoryType;
 
   private Feature mCategoryFeature;
   
-  public void initialize(UimaContext context) 
-      throws ResourceInitializationException {
-    
-    super.initialize(context);
-    
-    this.context = context;
-	  
-	mLogger = context.getLogger();
-	  
-    if (mLogger.isLoggable(Level.INFO)) {
-      mLogger.log(Level.INFO, "Initializing the OpenNLP Categorizer.");
-    }  
-    
-    DoccatModel model;
-    
-    try {
-      DoccatModelResource modelResource = 
-            (DoccatModelResource) context.getResourceObject(UimaUtil.MODEL_PARAMETER);
-        
-        model = modelResource.getModel();
-    }
-    catch (ResourceAccessException e) {
-        throw new ResourceInitializationException(e);
-    }
-    
-    mCategorizer = new DocumentCategorizerME(model);
-  }
+
   
   public void typeSystemInit(TypeSystem typeSystem) 
       throws AnalysisEngineProcessException {
     
-    // yes it must, the user later would use a very simple tokenizer and pass it to the
-    // doccat for language detection
-	  mTokenType = AnnotatorUtil.getRequiredTypeParameter(context, typeSystem,
-        UimaUtil.SENTENCE_TYPE_PARAMETER);
-    
     // get category type and feature (it a document propery, one object with a feature)
-    mCategoryType = AnnotatorUtil.getRequiredTypeParameter(context, typeSystem,
+    mCategoryType = AnnotatorUtil.getRequiredTypeParameter(getContext(), typeSystem,
         "opennlp.uima.doccat.CategoryType");
     
     // get feature name
-    mCategoryFeature = AnnotatorUtil.getRequiredFeatureParameter(context, mCategoryType, 
+    mCategoryFeature = AnnotatorUtil.getRequiredFeatureParameter(getContext(), mCategoryType, 
     		"opennlp.uima.doccat.CategoryFeature", CAS.TYPE_NAME_STRING);
   }
   
-  public void process(CAS tcas) {
-    
-    double result[];
-    
-    if (mTokenType != null) {
-      // TODO:
-      // count tokens
-      // create token array
-      // pass array to doccat
-      // create result annotation
-      result = mCategorizer.categorize(tcas.getDocumentText());
-    }
-    else {
-      result = mCategorizer.categorize(tcas.getDocumentText());
-    }
-    
-    String bestCategroy = mCategorizer.getBestCategory(result);
-    
-    // get cat fs 
+  @Override
+  protected void setBestCategory(CAS tcas, String bestCategory) {
     FSIndex<AnnotationFS> categoryIndex = tcas.getAnnotationIndex(mCategoryType);
     
     AnnotationFS categoryAnnotation = (AnnotationFS) (categoryIndex.size() > 0 ? 
@@ -134,6 +69,6 @@ public class DocumentCategorizer extends
       tcas.getIndexRepository().addFS(categoryAnnotation);
     }    
     
-    categoryAnnotation.setStringValue(mCategoryFeature, bestCategroy);
+    categoryAnnotation.setStringValue(mCategoryFeature, bestCategory);
   }
 }
\ No newline at end of file

Added: incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/LanguageDetector.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/LanguageDetector.java?rev=1060835&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/LanguageDetector.java (added)
+++ incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/LanguageDetector.java Wed Jan 19 15:27:57 2011
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.uima.doccat;
+
+import org.apache.uima.cas.CAS;
+
+/**
+ * Analysis Engine which can detect the language of a text. The AE uses the OpenNLP document
+ * categorizer and a special language detection model. The outcome of the document categorizer
+ * model is written into the language field of the CAS view.
+ */
+public class LanguageDetector extends AbstractDocumentCategorizer {
+  /** Stores the best category as the document language of the given CAS. */
+  @Override
+  protected void setBestCategory(CAS cas, String bestCategory) {
+    cas.setDocumentLanguage(bestCategory);
+  }
+}

Propchange: incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/LanguageDetector.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain