You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by sc...@apache.org on 2010/09/14 17:34:14 UTC
svn commit: r996953 - in /uima/sandbox/trunk/Tagger: desc/HmmModelTrainer.xml src/main/java/org/apache/uima/examples/tagger/HMMModelTrainer.java src/main/java/org/apache/uima/examples/tagger/trainAndTest/ModelGeneration.java

Author: schor
Date: Tue Sep 14 15:34:14 2010
New Revision: 996953

URL: http://svn.apache.org/viewvc?rev=996953&view=rev
Log:
[UIMA-1833] committed the update, which adds an xml descriptor and an annotator that extracts tokens from a CAS to use as input to the existing HMM training code.

Added:
    uima/sandbox/trunk/Tagger/desc/HmmModelTrainer.xml
    uima/sandbox/trunk/Tagger/src/main/java/org/apache/uima/examples/tagger/HMMModelTrainer.java
Modified:
    uima/sandbox/trunk/Tagger/src/main/java/org/apache/uima/examples/tagger/trainAndTest/ModelGeneration.java

Added: uima/sandbox/trunk/Tagger/desc/HmmModelTrainer.xml
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/Tagger/desc/HmmModelTrainer.xml?rev=996953&view=auto
==============================================================================
--- uima/sandbox/trunk/Tagger/desc/HmmModelTrainer.xml (added)
+++ uima/sandbox/trunk/Tagger/desc/HmmModelTrainer.xml Tue Sep 14 15:34:14 2010
@@ -0,0 +1,82 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
+  <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
+  <primitive>true</primitive>
+  <annotatorImplementationName>org.apache.uima.examples.tagger.HMMModelTrainer</annotatorImplementationName>
+  <analysisEngineMetaData>
+    <name>HMMModelTrainer</name>
+    <description>This analysis engine trains an N-gram model for the HMM tagger. It uses a training corpus as reference. This corpus must contain annotations on words with an attribute corresponding to the POS value to be learned.
+
+The configuration of this analysis engine is done through several parameters:
+&lt;ul&gt;
+&lt;li&gt;View: - the view from which the tokens will be extracted&lt;/li&gt;
+&lt;li&gt;ModelExportFile: - the path where the model will be written&lt;/li&gt;
+&lt;li&gt;FeaturePathPOS: - feature path to the value of the POS to be learned. The annotation should exactly cover a "word".&lt;/li&gt;
+&lt;/ul&gt;
+
+&lt;b&gt;BEWARE: this analysis engine does not allow multiple deployment !&lt;/b&gt;
+
+&lt;i&gt;NB. At the moment: both bi and trigram statistics are saved in one model file.&lt;/i&gt;</description>
+    <version>1.0</version>
+    <vendor/>
+    <configurationParameters>
+      <configurationParameter>
+        <name>View</name>
+        <description>The view from which the tokens will be extracted.</description>
+        <type>String</type>
+        <multiValued>false</multiValued>
+        <mandatory>true</mandatory>
+      </configurationParameter>
+      <configurationParameter>
+        <name>ModelExportFile</name>
+        <description>The path where the model will be written.</description>
+        <type>String</type>
+        <multiValued>false</multiValued>
+        <mandatory>true</mandatory>
+      </configurationParameter>
+      <configurationParameter>
+        <name>FeaturePathPOS</name>
+        <description>Feature path to the value of the POS to be learnt. The annotation should exactly cover a "word".</description>
+        <type>String</type>
+        <multiValued>false</multiValued>
+        <mandatory>true</mandatory>
+      </configurationParameter>
+    </configurationParameters>
+    <configurationParameterSettings>
+      <nameValuePair>
+        <name>View</name>
+        <value>
+          <string>_InitialView</string>
+        </value>
+      </nameValuePair>
+      <nameValuePair>
+        <name>ModelExportFile</name>
+        <value>
+          <string>hmmtagger_model.dat</string>
+        </value>
+      </nameValuePair>
+      <nameValuePair>
+        <name>FeaturePathPOS</name>
+        <value>
+          <string>org.apache.uima.TokenAnnotation:posTag</string>
+        </value>
+      </nameValuePair>
+    </configurationParameterSettings>
+    <typeSystemDescription/>
+    <typePriorities/>
+    <fsIndexCollection/>
+    <capabilities>
+      <capability>
+        <inputs/>
+        <outputs/>
+        <languagesSupported/>
+      </capability>
+    </capabilities>
+    <operationalProperties>
+      <modifiesCas>false</modifiesCas>
+      <multipleDeploymentAllowed>false</multipleDeploymentAllowed>
+      <outputsNewCASes>false</outputsNewCASes>
+    </operationalProperties>
+  </analysisEngineMetaData>
+  <resourceManagerConfiguration/>
+</analysisEngineDescription>

Added: uima/sandbox/trunk/Tagger/src/main/java/org/apache/uima/examples/tagger/HMMModelTrainer.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/Tagger/src/main/java/org/apache/uima/examples/tagger/HMMModelTrainer.java?rev=996953&view=auto
==============================================================================
--- uima/sandbox/trunk/Tagger/src/main/java/org/apache/uima/examples/tagger/HMMModelTrainer.java (added)
+++ uima/sandbox/trunk/Tagger/src/main/java/org/apache/uima/examples/tagger/HMMModelTrainer.java Tue Sep 14 15:34:14 2010
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.uima.examples.tagger;
+
+
+// Java dependencies
+import java.util.ArrayList;
+// UIMA dependencies
+import org.apache.uima.UIMAFramework;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.cas.Type;
+import org.apache.uima.examples.tagger.trainAndTest.ModelGeneration;
+import org.apache.uima.examples.tagger.trainAndTest.Token;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.Level;
+
+/**
+ * This analysis engine trains an N-gram model for the HMM tagger. It uses
+ * a training corpus as reference. This corpus must contain annotations on 
+ * words with an attribute corresponding to the POS value to be learned.
+ * 
+ * The configuration of this analysis engine is done through several parameters:
+ * <ul>
+ * <li>View: - the view from which the tokens will be extracted</li>
+ * <li>ModelExportFile: - the path where the model will be written</li>
+ * <li>FeaturePathPOS: - feature path to the value of the POS to be learned.
+ * The annotation should exactly cover a "word".</li>
+ * </ul>
+ * 
+ * <b>BEWARE: this analysis engine does not allow multiple deployment !</b>
+ * 
+ * NB. At the moment: both bi and trigram statistics are saved in one model file. 
+ */
+public class HMMModelTrainer extends JCasAnnotator_ImplBase {
+	
+	/** Name of the parameter for the view */
+	public static String PARAM_VIEW  = "View";
+	/** Name of the parameter for the model export path */
+	public static String PARAM_FILE  = "ModelExportFile";
+	/** Name of the parameter for the feature path to the POS */
+	public static String PARAM_POSFP = "FeaturePathPOS";
+	
+	/** The view from which the tokens will be extracted */
+	private String theView;
+	/** The path to the file where the model will be written */
+	private String fileOutput;
+	/** The type from which we will extract the tags to learn */
+	private String theTokenTypeName;
+	/** The name of the attribute where the POS to learn is stored */
+	private String thePOSAttribute;
+	
+	/** The list of collected tokens */
+	private ArrayList<Token> theLearnedTokens;
+
+	/**
+	 * Initialization of the component
+	 */
+	public void initialize(UimaContext aContext)
+			throws ResourceInitializationException {
+		super.initialize(aContext);
+		// Configure the component
+		theView = 
+			(String) aContext.getConfigParameterValue(PARAM_VIEW);
+		fileOutput = 
+				(String) aContext.getConfigParameterValue(PARAM_FILE);
+		// Compute the type and the attribute name
+		String fpPOS = 
+			(String) aContext.getConfigParameterValue(PARAM_POSFP);
+		Integer idx = fpPOS.lastIndexOf(":");
+		if (idx >= 0) {
+			theTokenTypeName = fpPOS.substring(0, idx);
+			thePOSAttribute  = fpPOS.substring(idx+1);
+		} else {
+			throw new ResourceInitializationException("The feature path passed " +
+					"in parameter ('"+fpPOS+"') is not valid. " +
+					"It should be like : 'type.name:attribute'", null);
+		}	
+		// Prepare the list of tokens
+		theLearnedTokens = new ArrayList<Token>();
+	}
+	
+	/**
+	 * Processing.
+	 * Browse the annotations of the type theTokenTypeName that must inherit 
+	 * from the type tcas.Annotation and build the list of tokens that will be 
+	 * learned by the HMMTagger. 
+	 */
+	@Override
+	public void process(JCas cas) throws AnalysisEngineProcessException {
+		try {
+			// Select the view we will work on
+			JCas workingView = cas.getView(theView);
+			// Iterate over the type we will learn from
+			Type tokenType = 
+				workingView.getTypeSystem().getType(theTokenTypeName);
+			if (tokenType != null) {
+				// Compute the feature for the POS value
+				Feature featPOS = 
+					tokenType.getFeatureByBaseName(thePOSAttribute);
+				// Browse these annotation and create the tokens to be learned
+				FSIterator<Annotation> itPOS = 
+					workingView.getAnnotationIndex(tokenType).iterator();
+				Integer c = 0;
+				Integer i = 0;
+				while ( itPOS.hasNext() ) {
+					Annotation token = itPOS.next();
+					// Create a new token to be learned and add it to the list,
+					// if the POS value is relevant (not null)
+					Token tokenTmp = new Token();
+					tokenTmp.word  = token.getCoveredText();
+					tokenTmp.pos   = token.getStringValue(featPOS);
+					if (tokenTmp.pos != null) {
+						c++;
+						theLearnedTokens.add(tokenTmp);
+					} else {
+						UIMAFramework.getLogger().log(Level.WARNING,
+								"Ignoring token "+tokenTmp.word+" because its " +
+								"POS value is null");
+						i++;
+					}
+				}
+				// Log the number of tokens collected
+				UIMAFramework.getLogger().log(Level.INFO,
+						c + " tokens to be learned added, " + i +
+						" tokens ignored.");
+			} else {
+				throw new AnalysisEngineProcessException(
+					"The type '"+theTokenTypeName+"', passed as token type " +
+					"for the training is not in the type system.", null);
+			}
+		} catch (CASException e) {
+			throw new AnalysisEngineProcessException(e);
+		}
+	}
+
+	/**
+	 * Called at the end of the processing.
+	 * When the whole collection has been processed, we create the model
+	 * from the elements we collected.
+	 */
+	@Override
+	public void collectionProcessComplete() throws AnalysisEngineProcessException {
+		try {
+			UIMAFramework.getLogger().log(Level.INFO, 
+					"Generation of model '"+fileOutput+"' with " + 
+					theLearnedTokens.size()+" tokens to be learned.");
+			ModelGeneration md = 
+				new ModelGeneration(theLearnedTokens,fileOutput);
+			md.init();
+			UIMAFramework.getLogger().log(Level.INFO, "Model generated: " +
+			md.suffix_tree.size() + " leaves suffix tree, " +
+			md.transition_probs.size() + " transitions probabilities, " +
+			md.word_probs.size() + " tag probabilities.");
+		} catch (Exception e) {
+			// Because UIMA filter all exceptions... we try to catch those
+			UIMAFramework.getLogger().log(Level.SEVERE,
+					"Something happened : " + e.getMessage());
+			e.printStackTrace();
+			throw new AnalysisEngineProcessException(e);
+		}
+	}
+}
+

Modified: uima/sandbox/trunk/Tagger/src/main/java/org/apache/uima/examples/tagger/trainAndTest/ModelGeneration.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/Tagger/src/main/java/org/apache/uima/examples/tagger/trainAndTest/ModelGeneration.java?rev=996953&r1=996952&r2=996953&view=diff
==============================================================================
--- uima/sandbox/trunk/Tagger/src/main/java/org/apache/uima/examples/tagger/trainAndTest/ModelGeneration.java (original)
+++ uima/sandbox/trunk/Tagger/src/main/java/org/apache/uima/examples/tagger/trainAndTest/ModelGeneration.java Tue Sep 14 15:34:14 2010
@@ -90,7 +90,7 @@ public class ModelGeneration implements 
   }
   
   
-  private void init(){
+  public void init(){
     
       List<Map<String, Map<String,Double>>> l = get_word_probs(get_lexicon(corpus));