You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/02/15 21:29:20 UTC
svn commit: r1244688 - in /lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima: ./ ae/
Author: rmuir
Date: Wed Feb 15 20:29:20 2012
New Revision: 1244688
URL: http://svn.apache.org/viewvc?rev=1244688&view=rev
Log:
LUCENE-3731: performance improvements and thread safety fixes to UIMA tokenizers
Modified:
lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java
lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java
lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java
lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/BasicAEProvider.java
lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProvider.java
Modified: lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java?rev=1244688&r1=1244687&r2=1244688&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java Wed Feb 15 20:29:20 2012
@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.uima;
*/
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.uima.ae.AEProviderFactory;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
@@ -35,28 +36,31 @@ import java.io.Reader;
public abstract class BaseUIMATokenizer extends Tokenizer {
protected FSIterator<AnnotationFS> iterator;
+ protected final AnalysisEngine ae;
+ protected final CAS cas;
- protected BaseUIMATokenizer(Reader reader) {
+ protected BaseUIMATokenizer(Reader reader, String descriptorPath) {
super(reader);
+ try {
+ ae = AEProviderFactory.getInstance().getAEProvider("", descriptorPath).getAE();
+ cas = ae.newCAS();
+ } catch (ResourceInitializationException e) {
+ throw new RuntimeException(e);
+ }
}
/**
* analyzes the tokenizer input using the given analysis engine
- *
- * @param analysisEngine the AE to use for analyzing the tokenizer input
- * @return CAS with extracted metadata (UIMA annotations, feature structures)
- * @throws ResourceInitializationException
+ *
+ * {@link #cas} will be filled with extracted metadata (UIMA annotations, feature structures)
*
* @throws AnalysisEngineProcessException
* @throws IOException
*/
- protected CAS analyzeInput(AnalysisEngine analysisEngine) throws ResourceInitializationException,
- AnalysisEngineProcessException, IOException {
- CAS cas = analysisEngine.newCAS();
+ protected void analyzeInput() throws AnalysisEngineProcessException,IOException {
+ cas.reset();
cas.setDocumentText(toString(input));
- analysisEngine.process(cas);
- analysisEngine.destroy();
- return cas;
+ ae.process(cas);
}
private String toString(Reader reader) throws IOException {
@@ -78,4 +82,6 @@ public abstract class BaseUIMATokenizer
public void end() throws IOException {
iterator = null;
}
+
+
}
Modified: lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java?rev=1244688&r1=1244687&r2=1244688&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java Wed Feb 15 20:29:20 2012
@@ -20,14 +20,9 @@ package org.apache.lucene.analysis.uima;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.uima.ae.AEProviderFactory;
-import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
-import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.util.InvalidXMLException;
import java.io.IOException;
import java.io.Reader;
@@ -42,23 +37,18 @@ public final class UIMAAnnotationsTokeni
private final OffsetAttribute offsetAttr;
private final String tokenTypeString;
-
- private final String descriptorPath;
-
+
private int finalOffset = 0;
public UIMAAnnotationsTokenizer(String descriptorPath, String tokenType, Reader input) {
- super(input);
+ super(input, descriptorPath);
this.tokenTypeString = tokenType;
this.termAttr = addAttribute(CharTermAttribute.class);
this.offsetAttr = addAttribute(OffsetAttribute.class);
- this.descriptorPath = descriptorPath;
}
- private void analyzeText(String descriptorPath) throws IOException, ResourceInitializationException,
- AnalysisEngineProcessException {
- AnalysisEngine ae = AEProviderFactory.getInstance().getAEProvider("", descriptorPath).getAE();
- CAS cas = analyzeInput(ae);
+ private void analyzeText() throws IOException, AnalysisEngineProcessException {
+ analyzeInput();
finalOffset = correctOffset(cas.getDocumentText().length());
Type tokenType = cas.getTypeSystem().getType(tokenTypeString);
iterator = cas.getAnnotationIndex(tokenType).iterator();
@@ -68,7 +58,7 @@ public final class UIMAAnnotationsTokeni
public boolean incrementToken() throws IOException {
if (iterator == null) {
try {
- analyzeText(descriptorPath);
+ analyzeText();
} catch (Exception e) {
throw new IOException(e);
}
Modified: lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java?rev=1244688&r1=1244687&r2=1244688&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java Wed Feb 15 20:29:20 2012
@@ -21,16 +21,11 @@ import org.apache.lucene.analysis.Tokeni
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.analysis.uima.ae.AEProviderFactory;
-import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
-import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FeaturePath;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.util.InvalidXMLException;
import java.io.IOException;
import java.io.Reader;
@@ -49,28 +44,23 @@ public final class UIMATypeAwareAnnotati
private final String tokenTypeString;
- private final String descriptorPath;
-
private final String typeAttributeFeaturePath;
private FeaturePath featurePath;
-
+
private int finalOffset = 0;
public UIMATypeAwareAnnotationsTokenizer(String descriptorPath, String tokenType, String typeAttributeFeaturePath, Reader input) {
- super(input);
+ super(input, descriptorPath);
this.tokenTypeString = tokenType;
this.termAttr = addAttribute(CharTermAttribute.class);
this.typeAttr = addAttribute(TypeAttribute.class);
this.offsetAttr = addAttribute(OffsetAttribute.class);
this.typeAttributeFeaturePath = typeAttributeFeaturePath;
- this.descriptorPath = descriptorPath;
}
- private void analyzeText() throws IOException, ResourceInitializationException, AnalysisEngineProcessException,
- CASException {
- AnalysisEngine ae = AEProviderFactory.getInstance().getAEProvider("", descriptorPath).getAE();
- CAS cas = analyzeInput(ae);
+ private void analyzeText() throws IOException, AnalysisEngineProcessException, CASException {
+ analyzeInput();
finalOffset = correctOffset(cas.getDocumentText().length());
Type tokenType = cas.getTypeSystem().getType(tokenTypeString);
iterator = cas.getAnnotationIndex(tokenType).iterator();
Modified: lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/BasicAEProvider.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/BasicAEProvider.java?rev=1244688&r1=1244687&r2=1244688&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/BasicAEProvider.java (original)
+++ lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/BasicAEProvider.java Wed Feb 15 20:29:20 2012
@@ -17,6 +17,9 @@ package org.apache.lucene.analysis.uima.
* limitations under the License.
*/
+import java.io.IOException;
+
+import org.apache.lucene.util.IOUtils;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
@@ -30,38 +33,55 @@ import org.apache.uima.util.XMLInputSour
public class BasicAEProvider implements AEProvider {
private final String aePath;
- private AnalysisEngine cachedAE;
+ private AnalysisEngineDescription cachedDescription;
public BasicAEProvider(String aePath) {
this.aePath = aePath;
}
@Override
- public synchronized AnalysisEngine getAE() throws ResourceInitializationException {
- try {
- if (cachedAE == null) {
- // get Resource Specifier from XML file
-
- XMLInputSource in;
+ public AnalysisEngine getAE() throws ResourceInitializationException {
+ synchronized(this) {
+ if (cachedDescription == null) {
+ XMLInputSource in = null;
+ boolean success = false;
try {
- in = new XMLInputSource(aePath);
+ // get Resource Specifier from XML file
+ in = getInputSource();
+
+ // get AE description
+ cachedDescription = UIMAFramework.getXMLParser()
+ .parseAnalysisEngineDescription(in);
+ configureDescription(cachedDescription);
+ success = true;
} catch (Exception e) {
- in = new XMLInputSource(getClass().getResource(aePath));
+ throw new ResourceInitializationException(e);
+ } finally {
+ if (success) {
+ try {
+ IOUtils.close(in.getInputStream());
+ } catch (IOException e) {
+ throw new ResourceInitializationException(e);
+ }
+ } else if (in != null) {
+ IOUtils.closeWhileHandlingException(in.getInputStream());
+ }
}
+ }
+ }
- // get AE description
- AnalysisEngineDescription desc = UIMAFramework.getXMLParser()
- .parseAnalysisEngineDescription(in);
-
- // create AE here
- cachedAE = UIMAFramework.produceAnalysisEngine(desc);
- } else {
- cachedAE.reconfigure();
- }
- } catch (Exception e) {
- cachedAE = null;
- throw new ResourceInitializationException(e);
+ return UIMAFramework.produceAnalysisEngine(cachedDescription);
+ }
+
+ protected void configureDescription(AnalysisEngineDescription description) {
+ // no configuration
+ }
+
+ private XMLInputSource getInputSource() throws IOException {
+ try {
+ return new XMLInputSource(aePath);
+ } catch (IOException e) {
+ return new XMLInputSource(getClass().getResource(aePath));
}
- return cachedAE;
}
}
Modified: lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProvider.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProvider.java?rev=1244688&r1=1244687&r2=1244688&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProvider.java (original)
+++ lucene/dev/trunk/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProvider.java Wed Feb 15 20:29:20 2012
@@ -17,11 +17,7 @@ package org.apache.lucene.analysis.uima.
* limitations under the License.
*/
-import org.apache.uima.UIMAFramework;
-import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.util.XMLInputSource;
import java.util.Map;
@@ -30,51 +26,22 @@ import java.util.Map;
* injecting runtime parameters defined in the solrconfig.xml Solr configuration file and assigning
* them as overriding parameters in the aggregate AE
*/
-public class OverridingParamsAEProvider implements AEProvider {
-
- private final String aePath;
-
- private AnalysisEngine cachedAE;
+public class OverridingParamsAEProvider extends BasicAEProvider {
private final Map<String, Object> runtimeParameters;
public OverridingParamsAEProvider(String aePath, Map<String, Object> runtimeParameters) {
- this.aePath = aePath;
+ super(aePath);
this.runtimeParameters = runtimeParameters;
}
-
+
@Override
- public synchronized AnalysisEngine getAE() throws ResourceInitializationException {
- try {
- if (cachedAE == null) {
- // get Resource Specifier from XML file
- XMLInputSource in;
- try {
- in = new XMLInputSource(aePath);
- } catch (Exception e) {
- in = new XMLInputSource(getClass().getResource(aePath));
- }
-
- // get AE description
- AnalysisEngineDescription desc = UIMAFramework.getXMLParser()
- .parseAnalysisEngineDescription(in);
-
- /* iterate over each AE (to set runtime parameters) */
- for (String attributeName : runtimeParameters.keySet()) {
- Object val = getRuntimeValue(desc, attributeName);
- desc.getAnalysisEngineMetaData().getConfigurationParameterSettings().setParameterValue(
- attributeName, val);
- }
- // create AE here
- cachedAE = UIMAFramework.produceAnalysisEngine(desc);
- } else {
- cachedAE.reconfigure();
- }
- } catch (Exception e) {
- cachedAE = null;
- throw new ResourceInitializationException(e);
+ protected void configureDescription(AnalysisEngineDescription description) {
+ for (String attributeName : runtimeParameters.keySet()) {
+ Object val = getRuntimeValue(description, attributeName);
+ description.getAnalysisEngineMetaData().getConfigurationParameterSettings().setParameterValue(
+ attributeName, val);
}
- return cachedAE;
}
/* create the value to inject in the runtime parameter depending on its declared type */