You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/01/19 16:27:57 UTC
svn commit: r1060835 - in
/incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat:
AbstractDocumentCategorizer.java DocumentCategorizer.java
LanguageDetector.java
Author: joern
Date: Wed Jan 19 15:27:57 2011
New Revision: 1060835
URL: http://svn.apache.org/viewvc?rev=1060835&view=rev
Log:
OpenNLP-51 Extended the integration with an AE which can set the doccat category label as language.
Added:
incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java (with props)
incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/LanguageDetector.java (with props)
Modified:
incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/DocumentCategorizer.java
Added: incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java?rev=1060835&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java (added)
+++ incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java Wed Jan 19 15:27:57 2011
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.uima.doccat;
+
+import opennlp.tools.doccat.DoccatModel;
+import opennlp.tools.doccat.DocumentCategorizerME;
+import opennlp.uima.util.AnnotatorUtil;
+import opennlp.uima.util.UimaUtil;
+
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.resource.ResourceAccessException;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.Level;
+import org.apache.uima.util.Logger;
+
+/**
+ * Abstract document categorizer which can be implemented to define how the
+ * output of the categorizer should be written into the CAS.
+ * <p>
+ * Lifecycle, as implemented below:
+ * <ul>
+ * <li>{@link #initialize(UimaContext)} keeps the context, logs an INFO message
+ * when that level is enabled, fetches the {@code DoccatModelResource} registered
+ * under {@code UimaUtil.MODEL_PARAMETER} and builds a
+ * {@code DocumentCategorizerME} from its model. A
+ * {@code ResourceAccessException} raised while fetching the resource is
+ * rethrown wrapped in a {@code ResourceInitializationException}.</li>
+ * <li>{@link #typeSystemInit(TypeSystem)} resolves a required type parameter
+ * through {@code AnnotatorUtil.getRequiredTypeParameter} and stores it in
+ * {@code mTokenType}.</li>
+ * <li>{@link #process(CAS)} categorizes the document text of the CAS, asks the
+ * categorizer for the best category of the returned scores and hands that
+ * category to {@link #setBestCategory(CAS, String)}.</li>
+ * </ul>
+ * Subclasses implement {@link #setBestCategory(CAS, String)} to decide where
+ * the winning category label is stored.
+ */
+abstract class AbstractDocumentCategorizer extends CasAnnotator_ImplBase {
+
+ private UimaContext context; // kept from initialize(); read again in typeSystemInit()
+
+ private Logger mLogger; // obtained from the context in initialize()
+
+ private opennlp.tools.doccat.DocumentCategorizer mCategorizer; // created in initialize() from the loaded model
+
+ private Type mTokenType; // assigned in typeSystemInit(); only null-checked in process()
+
+ public void initialize(UimaContext context)
+ throws ResourceInitializationException {
+
+ super.initialize(context);
+
+ this.context = context;
+
+ mLogger = context.getLogger();
+
+ if (mLogger.isLoggable(Level.INFO)) {
+ mLogger.log(Level.INFO, "Initializing the OpenNLP Categorizer.");
+ }
+
+ DoccatModel model;
+
+ try {
+ DoccatModelResource modelResource = (DoccatModelResource) context
+ .getResourceObject(UimaUtil.MODEL_PARAMETER); // model is supplied as an external resource
+
+ model = modelResource.getModel();
+ } catch (ResourceAccessException e) {
+ throw new ResourceInitializationException(e); // keep the original exception as the cause
+ }
+
+ mCategorizer = new DocumentCategorizerME(model);
+ }
+
+ public void typeSystemInit(TypeSystem typeSystem)
+ throws AnalysisEngineProcessException {
+ mTokenType = AnnotatorUtil.getRequiredTypeParameter(context, typeSystem,
+ UimaUtil.SENTENCE_TYPE_PARAMETER); // NOTE(review): field is named "token" type but the sentence type parameter is read - confirm intent
+ }
+
+ protected abstract void setBestCategory(CAS cas, String bestCategory); // hook: subclasses store the winning label in the CAS
+
+ public void process(CAS cas) {
+
+ double result[]; // scores returned by the categorizer
+
+ if (mTokenType != null) { // NOTE(review): both branches are currently identical; the token based path is still a TODO
+ // TODO:
+ // count tokens
+ // create token array
+ // pass array to doccat
+ // create result annotation
+ result = mCategorizer.categorize(cas.getDocumentText());
+ }
+ else {
+ result = mCategorizer.categorize(cas.getDocumentText());
+ }
+
+ String bestCategory = mCategorizer.getBestCategory(result);
+
+ setBestCategory(cas, bestCategory); // delegate storage of the label to the subclass
+ }
+}
Propchange: incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified: incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/DocumentCategorizer.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/DocumentCategorizer.java?rev=1060835&r1=1060834&r2=1060835&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/DocumentCategorizer.java (original)
+++ incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/DocumentCategorizer.java Wed Jan 19 15:27:57 2011
@@ -17,13 +17,8 @@
package opennlp.uima.doccat;
-import opennlp.tools.doccat.DoccatModel;
-import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.uima.util.AnnotatorUtil;
-import opennlp.uima.util.UimaUtil;
-import org.apache.uima.UimaContext;
-import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIndex;
@@ -31,94 +26,34 @@ import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
-import org.apache.uima.resource.ResourceAccessException;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.util.Level;
-import org.apache.uima.util.Logger;
/**
- * OpenNLP NameFinder trainer.
+ * OpenNLP Document Categorizer.
*
* Mandatory parameters:
*/
-public class DocumentCategorizer extends CasAnnotator_ImplBase {
+public class DocumentCategorizer extends AbstractDocumentCategorizer {
- private UimaContext context;
-
- private Logger mLogger;
-
- private opennlp.tools.doccat.DocumentCategorizer mCategorizer;
-
- private Type mTokenType;
-
private Type mCategoryType;
private Feature mCategoryFeature;
- public void initialize(UimaContext context)
- throws ResourceInitializationException {
-
- super.initialize(context);
-
- this.context = context;
-
- mLogger = context.getLogger();
-
- if (mLogger.isLoggable(Level.INFO)) {
- mLogger.log(Level.INFO, "Initializing the OpenNLP Categorizer.");
- }
-
- DoccatModel model;
-
- try {
- DoccatModelResource modelResource =
- (DoccatModelResource) context.getResourceObject(UimaUtil.MODEL_PARAMETER);
-
- model = modelResource.getModel();
- }
- catch (ResourceAccessException e) {
- throw new ResourceInitializationException(e);
- }
-
- mCategorizer = new DocumentCategorizerME(model);
- }
+
public void typeSystemInit(TypeSystem typeSystem)
throws AnalysisEngineProcessException {
- // yes it must, the user later would use a very simple tokenizer and pass it to the
- // doccat for language detection
- mTokenType = AnnotatorUtil.getRequiredTypeParameter(context, typeSystem,
- UimaUtil.SENTENCE_TYPE_PARAMETER);
-
// get category type and feature (it is a document property, one object with a feature)
- mCategoryType = AnnotatorUtil.getRequiredTypeParameter(context, typeSystem,
+ mCategoryType = AnnotatorUtil.getRequiredTypeParameter(getContext(), typeSystem,
"opennlp.uima.doccat.CategoryType");
// get feature name
- mCategoryFeature = AnnotatorUtil.getRequiredFeatureParameter(context, mCategoryType,
+ mCategoryFeature = AnnotatorUtil.getRequiredFeatureParameter(getContext(), mCategoryType,
"opennlp.uima.doccat.CategoryFeature", CAS.TYPE_NAME_STRING);
}
- public void process(CAS tcas) {
-
- double result[];
-
- if (mTokenType != null) {
- // TODO:
- // count tokens
- // create token array
- // pass array to doccat
- // create result annotation
- result = mCategorizer.categorize(tcas.getDocumentText());
- }
- else {
- result = mCategorizer.categorize(tcas.getDocumentText());
- }
-
- String bestCategroy = mCategorizer.getBestCategory(result);
-
- // get cat fs
+ @Override
+ protected void setBestCategory(CAS tcas, String bestCategory) {
FSIndex<AnnotationFS> categoryIndex = tcas.getAnnotationIndex(mCategoryType);
AnnotationFS categoryAnnotation = (AnnotationFS) (categoryIndex.size() > 0 ?
@@ -134,6 +69,6 @@ public class DocumentCategorizer extends
tcas.getIndexRepository().addFS(categoryAnnotation);
}
- categoryAnnotation.setStringValue(mCategoryFeature, bestCategroy);
+ categoryAnnotation.setStringValue(mCategoryFeature, bestCategory);
}
}
\ No newline at end of file
Added: incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/LanguageDetector.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/LanguageDetector.java?rev=1060835&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/LanguageDetector.java (added)
+++ incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/LanguageDetector.java Wed Jan 19 15:27:57 2011
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.uima.doccat;
+
+import org.apache.uima.cas.CAS;
+
+/**
+ * Analysis Engine which can detect the language of a text. The AE uses the OpenNLP document
+ * categorizer and a special language detection model. The outcome of the document categorizer
+ * model is written into the language field of the CAS view.
+ * <p>
+ * The best category is passed unchanged to {@code CAS.setDocumentLanguage(String)};
+ * presumably the category labels of the model are language codes - this class does not
+ * validate them.
+ */
+public class LanguageDetector extends AbstractDocumentCategorizer {
+
+ @Override
+ protected void setBestCategory(CAS cas, String bestCategory) {
+ cas.setDocumentLanguage(bestCategory); // the category label is used directly as the document language
+ }
+}
Propchange: incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/doccat/LanguageDetector.java
------------------------------------------------------------------------------
svn:mime-type = text/plain