You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/02/15 21:29:20 UTC

svn commit: r1244688 - in /lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima: ./ ae/

Author: rmuir
Date: Wed Feb 15 20:29:20 2012
New Revision: 1244688

URL: http://svn.apache.org/viewvc?rev=1244688&view=rev
Log:
LUCENE-3731: performance improvements and thread safety fixes to UIMA tokenizers

Modified:
    lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java
    lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java
    lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java
    lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/BasicAEProvider.java
    lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProvider.java

Modified: lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java?rev=1244688&r1=1244687&r2=1244688&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java Wed Feb 15 20:29:20 2012
@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.uima;
  */
 
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.uima.ae.AEProviderFactory;
 import org.apache.uima.analysis_engine.AnalysisEngine;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.CAS;
@@ -35,28 +36,31 @@ import java.io.Reader;
 public abstract class BaseUIMATokenizer extends Tokenizer {
 
   protected FSIterator<AnnotationFS> iterator;
+  protected final AnalysisEngine ae;
+  protected final CAS cas;
 
-  protected BaseUIMATokenizer(Reader reader) {
+  protected BaseUIMATokenizer(Reader reader, String descriptorPath) {
     super(reader);
+    try {
+      ae = AEProviderFactory.getInstance().getAEProvider("", descriptorPath).getAE();
+      cas = ae.newCAS();
+    } catch (ResourceInitializationException e) {
+      throw new RuntimeException(e);
+    }
   }
 
   /**
    * analyzes the tokenizer input using the given analysis engine
-   *
-   * @param analysisEngine the AE to use for analyzing the tokenizer input
-   * @return CAS with extracted metadata (UIMA annotations, feature structures)
-   * @throws ResourceInitializationException
+   * 
+   * {@link #cas} will be filled with extracted metadata (UIMA annotations, feature structures)
    *
    * @throws AnalysisEngineProcessException
    * @throws IOException
    */
-  protected CAS analyzeInput(AnalysisEngine analysisEngine) throws ResourceInitializationException,
-      AnalysisEngineProcessException, IOException {
-    CAS cas = analysisEngine.newCAS();
+  protected void analyzeInput() throws AnalysisEngineProcessException,IOException {
+    cas.reset();
     cas.setDocumentText(toString(input));
-    analysisEngine.process(cas);
-    analysisEngine.destroy();
-    return cas;
+    ae.process(cas);
   }
 
   private String toString(Reader reader) throws IOException {
@@ -78,4 +82,6 @@ public abstract class BaseUIMATokenizer 
   public void end() throws IOException {
     iterator = null;
   }
+  
+  
 }

Modified: lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java?rev=1244688&r1=1244687&r2=1244688&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java Wed Feb 15 20:29:20 2012
@@ -20,14 +20,9 @@ package org.apache.lucene.analysis.uima;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.uima.ae.AEProviderFactory;
-import org.apache.uima.analysis_engine.AnalysisEngine;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
-import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.Type;
 import org.apache.uima.cas.text.AnnotationFS;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.util.InvalidXMLException;
 
 import java.io.IOException;
 import java.io.Reader;
@@ -42,23 +37,18 @@ public final class UIMAAnnotationsTokeni
   private final OffsetAttribute offsetAttr;
 
   private final String tokenTypeString;
-
-  private final String descriptorPath;
-
+  
   private int finalOffset = 0;
 
   public UIMAAnnotationsTokenizer(String descriptorPath, String tokenType, Reader input) {
-    super(input);
+    super(input, descriptorPath);
     this.tokenTypeString = tokenType;
     this.termAttr = addAttribute(CharTermAttribute.class);
     this.offsetAttr = addAttribute(OffsetAttribute.class);
-    this.descriptorPath = descriptorPath;
   }
 
-  private void analyzeText(String descriptorPath) throws IOException, ResourceInitializationException,
-      AnalysisEngineProcessException {
-    AnalysisEngine ae = AEProviderFactory.getInstance().getAEProvider("", descriptorPath).getAE();
-    CAS cas = analyzeInput(ae);
+  private void analyzeText() throws IOException, AnalysisEngineProcessException {
+    analyzeInput();
     finalOffset = correctOffset(cas.getDocumentText().length());
     Type tokenType = cas.getTypeSystem().getType(tokenTypeString);
     iterator = cas.getAnnotationIndex(tokenType).iterator();
@@ -68,7 +58,7 @@ public final class UIMAAnnotationsTokeni
   public boolean incrementToken() throws IOException {
     if (iterator == null) {
       try {
-        analyzeText(descriptorPath);
+        analyzeText();
       } catch (Exception e) {
         throw new IOException(e);
       }

Modified: lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java?rev=1244688&r1=1244687&r2=1244688&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java Wed Feb 15 20:29:20 2012
@@ -21,16 +21,11 @@ import org.apache.lucene.analysis.Tokeni
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.analysis.uima.ae.AEProviderFactory;
-import org.apache.uima.analysis_engine.AnalysisEngine;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
-import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.CASException;
 import org.apache.uima.cas.FeaturePath;
 import org.apache.uima.cas.Type;
 import org.apache.uima.cas.text.AnnotationFS;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.util.InvalidXMLException;
 
 import java.io.IOException;
 import java.io.Reader;
@@ -49,28 +44,23 @@ public final class UIMATypeAwareAnnotati
 
   private final String tokenTypeString;
 
-  private final String descriptorPath;
-
   private final String typeAttributeFeaturePath;
 
   private FeaturePath featurePath;
-
+  
   private int finalOffset = 0;
 
   public UIMATypeAwareAnnotationsTokenizer(String descriptorPath, String tokenType, String typeAttributeFeaturePath, Reader input) {
-    super(input);
+    super(input, descriptorPath);
     this.tokenTypeString = tokenType;
     this.termAttr = addAttribute(CharTermAttribute.class);
     this.typeAttr = addAttribute(TypeAttribute.class);
     this.offsetAttr = addAttribute(OffsetAttribute.class);
     this.typeAttributeFeaturePath = typeAttributeFeaturePath;
-    this.descriptorPath = descriptorPath;
   }
 
-  private void analyzeText() throws IOException, ResourceInitializationException, AnalysisEngineProcessException,
-      CASException {
-    AnalysisEngine ae = AEProviderFactory.getInstance().getAEProvider("", descriptorPath).getAE();
-    CAS cas = analyzeInput(ae);
+  private void analyzeText() throws IOException, AnalysisEngineProcessException, CASException {
+    analyzeInput();
     finalOffset = correctOffset(cas.getDocumentText().length());
     Type tokenType = cas.getTypeSystem().getType(tokenTypeString);
     iterator = cas.getAnnotationIndex(tokenType).iterator();

Modified: lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/BasicAEProvider.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/BasicAEProvider.java?rev=1244688&r1=1244687&r2=1244688&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/BasicAEProvider.java (original)
+++ lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/BasicAEProvider.java Wed Feb 15 20:29:20 2012
@@ -17,6 +17,9 @@ package org.apache.lucene.analysis.uima.
  * limitations under the License.
  */
 
+import java.io.IOException;
+
+import org.apache.lucene.util.IOUtils;
 import org.apache.uima.UIMAFramework;
 import org.apache.uima.analysis_engine.AnalysisEngine;
 import org.apache.uima.analysis_engine.AnalysisEngineDescription;
@@ -30,38 +33,55 @@ import org.apache.uima.util.XMLInputSour
 public class BasicAEProvider implements AEProvider {
 
   private final String aePath;
-  private AnalysisEngine cachedAE;
+  private AnalysisEngineDescription cachedDescription;
 
   public BasicAEProvider(String aePath) {
     this.aePath = aePath;
   }
 
   @Override
-  public synchronized AnalysisEngine getAE() throws ResourceInitializationException {
-    try {
-      if (cachedAE == null) {
-        // get Resource Specifier from XML file
-
-        XMLInputSource in;
+  public AnalysisEngine getAE() throws ResourceInitializationException {
+    synchronized(this) {
+      if (cachedDescription == null) {
+        XMLInputSource in = null;
+        boolean success = false;
         try {
-          in = new XMLInputSource(aePath);
+          // get Resource Specifier from XML file
+          in = getInputSource();
+
+          // get AE description
+          cachedDescription = UIMAFramework.getXMLParser()
+              .parseAnalysisEngineDescription(in);
+          configureDescription(cachedDescription);
+          success = true;
         } catch (Exception e) {
-          in = new XMLInputSource(getClass().getResource(aePath));
+            throw new ResourceInitializationException(e);
+        } finally {
+          if (success) {
+            try {
+              IOUtils.close(in.getInputStream());
+            } catch (IOException e) {
+              throw new ResourceInitializationException(e);
+            }
+          } else if (in != null) {
+            IOUtils.closeWhileHandlingException(in.getInputStream());
+          }
         }
+      } 
+    }
 
-        // get AE description
-        AnalysisEngineDescription desc = UIMAFramework.getXMLParser()
-            .parseAnalysisEngineDescription(in);
-
-        // create AE here
-        cachedAE = UIMAFramework.produceAnalysisEngine(desc);
-      } else {
-        cachedAE.reconfigure();
-      }
-    } catch (Exception e) {
-      cachedAE = null;
-      throw new ResourceInitializationException(e);
+    return UIMAFramework.produceAnalysisEngine(cachedDescription);
+  }
+  
+  protected void configureDescription(AnalysisEngineDescription description) {
+    // no configuration
+  }
+  
+  private XMLInputSource getInputSource() throws IOException {
+    try {
+      return new XMLInputSource(aePath);
+    } catch (IOException e) {
+      return new XMLInputSource(getClass().getResource(aePath));
     }
-    return cachedAE;
   }
 }

Modified: lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProvider.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProvider.java?rev=1244688&r1=1244687&r2=1244688&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProvider.java (original)
+++ lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProvider.java Wed Feb 15 20:29:20 2012
@@ -17,11 +17,7 @@ package org.apache.lucene.analysis.uima.
  * limitations under the License.
  */
 
-import org.apache.uima.UIMAFramework;
-import org.apache.uima.analysis_engine.AnalysisEngine;
 import org.apache.uima.analysis_engine.AnalysisEngineDescription;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.util.XMLInputSource;
 
 import java.util.Map;
 
@@ -30,51 +26,22 @@ import java.util.Map;
  * injecting runtime parameters defined in the solrconfig.xml Solr configuration file and assigning
  * them as overriding parameters in the aggregate AE
  */
-public class OverridingParamsAEProvider implements AEProvider {
-
-  private final String aePath;
-
-  private AnalysisEngine cachedAE;
+public class OverridingParamsAEProvider extends BasicAEProvider {
 
   private final Map<String, Object> runtimeParameters;
 
   public OverridingParamsAEProvider(String aePath, Map<String, Object> runtimeParameters) {
-    this.aePath = aePath;
+    super(aePath);
     this.runtimeParameters = runtimeParameters;
   }
-
+  
   @Override
-  public synchronized AnalysisEngine getAE() throws ResourceInitializationException {
-    try {
-      if (cachedAE == null) {
-        // get Resource Specifier from XML file
-        XMLInputSource in;
-        try {
-          in = new XMLInputSource(aePath);
-        } catch (Exception e) {
-          in = new XMLInputSource(getClass().getResource(aePath));
-        }
-
-        // get AE description
-        AnalysisEngineDescription desc = UIMAFramework.getXMLParser()
-            .parseAnalysisEngineDescription(in);
-
-        /* iterate over each AE (to set runtime parameters) */
-        for (String attributeName : runtimeParameters.keySet()) {
-          Object val = getRuntimeValue(desc, attributeName);
-          desc.getAnalysisEngineMetaData().getConfigurationParameterSettings().setParameterValue(
-              attributeName, val);
-        }
-        // create AE here
-        cachedAE = UIMAFramework.produceAnalysisEngine(desc);
-      } else {
-        cachedAE.reconfigure();
-      }
-    } catch (Exception e) {
-      cachedAE = null;
-      throw new ResourceInitializationException(e);
+  protected void configureDescription(AnalysisEngineDescription description) {
+    for (String attributeName : runtimeParameters.keySet()) {
+      Object val = getRuntimeValue(description, attributeName);
+      description.getAnalysisEngineMetaData().getConfigurationParameterSettings().setParameterValue(
+          attributeName, val);
     }
-    return cachedAE;
   }
 
   /* create the value to inject in the runtime parameter depending on its declared type */