You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2021/05/06 17:54:42 UTC
svn commit: r1889571 - in /ctakes/trunk:
ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/
ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/ae/
ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/ae/
Author: tmill
Date: Thu May 6 17:54:42 2021
New Revision: 1889571
URL: http://svn.apache.org/viewvc?rev=1889571&view=rev
Log:
Added max tokens parameter to both parsers for faster processing of long (usually not meaningful) sentences.
Modified:
ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/MaxentParserWrapper.java
ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/ae/ConstituencyParser.java
ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/ae/ClearNLPDependencyParserAE.java
Modified: ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/MaxentParserWrapper.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/MaxentParserWrapper.java?rev=1889571&r1=1889570&r2=1889571&view=diff
==============================================================================
--- ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/MaxentParserWrapper.java (original)
+++ ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/MaxentParserWrapper.java Thu May 6 17:54:42 2021
@@ -45,9 +45,13 @@ public class MaxentParserWrapper impleme
Parser parser = null;
private String parseStr = "";
Logger logger = Logger.getLogger(this.getClass().getName());
+ private int maxTokens;
+ public MaxentParserWrapper(InputStream in){
+ this(in, -1);
+ }
- public MaxentParserWrapper(InputStream is){
+ public MaxentParserWrapper(InputStream is, int maxTokens){
try {
if (is!=null) {
ParserModel model = new ParserModel(is);
@@ -56,6 +60,7 @@ public class MaxentParserWrapper impleme
} catch (IOException e) {
e.printStackTrace();
}
+ this.maxTokens = maxTokens;
}
@Override
@@ -87,7 +92,7 @@ public class MaxentParserWrapper impleme
}
// final FSArray terminalArray = TreeUtils.getTerminals( jcas, sentence );
final FSArray terminalArray = TreeUtils.getTerminals( jcas, new ArrayList<>( sentenceTokens.getValue() ) );
- if(terminalArray.size() > 40) continue;
+ if(this.maxTokens > 0 && terminalArray.size() > this.maxTokens) continue;
final String tokenString = TreeUtils.getSplitSentence( terminalArray );
if ( tokenString.isEmpty() ) {
parse = null;
Modified: ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/ae/ConstituencyParser.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/ae/ConstituencyParser.java?rev=1889571&r1=1889570&r2=1889571&view=diff
==============================================================================
--- ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/ae/ConstituencyParser.java (original)
+++ ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/ae/ConstituencyParser.java Thu May 6 17:54:42 2021
@@ -52,6 +52,12 @@ public class ConstituencyParser extends
defaultValue = "org/apache/ctakes/constituency/parser/models/sharpacq-3.1.bin"
)
private String modelFilename;
+
+ public static final String PARAM_MAX_TOKENS = "MaxTokens";
+ @ConfigurationParameter(name = PARAM_MAX_TOKENS,
+ description = "The token limit for sentences we actually parse. Longer sentences will be ignored.",
+ mandatory = false)
+ private int maxTokens = -1;
private ParserWrapper parser = null;
@@ -62,7 +68,7 @@ public class ConstituencyParser extends
super.initialize( aContext );
logger.info( "Initializing ..." );
try ( DotLogger dotter = new DotLogger() ) {
- parser = new MaxentParserWrapper( FileLocator.getAsStream( modelFilename ) );
+ parser = new MaxentParserWrapper( FileLocator.getAsStream( modelFilename ), this.maxTokens );
} catch ( IOException ioE ) {
logger.error( "Error reading parser model file/directory: " + ioE.getMessage() );
throw new ResourceInitializationException( ioE );
@@ -83,6 +89,14 @@ public class ConstituencyParser extends
ConstituencyParser.PARAM_MODEL_FILENAME,
modelPath);
}
+
+ public static AnalysisEngineDescription createAnnotatorDescription(int maxTokens) throws ResourceInitializationException {
+ return AnalysisEngineFactory.createEngineDescription(
+ ConstituencyParser.class,
+ ConstituencyParser.PARAM_MAX_TOKENS,
+ maxTokens);
+ }
+
public static AnalysisEngineDescription createAnnotatorDescription()
throws ResourceInitializationException {
return AnalysisEngineFactory.createEngineDescription(
Modified: ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/ae/ClearNLPDependencyParserAE.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/ae/ClearNLPDependencyParserAE.java?rev=1889571&r1=1889570&r2=1889571&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/ae/ClearNLPDependencyParserAE.java (original)
+++ ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/ae/ClearNLPDependencyParserAE.java Thu May 6 17:54:42 2021
@@ -127,6 +127,12 @@ public class ClearNLPDependencyParserAE
description = "If true, use the default ClearNLP lemmatizer, otherwise use lemmas from the BaseToken normalizedToken field" )
private boolean useLemmatizer;
+ public static final String PARAM_MAX_TOKENS = "MaxTokens";
+ @ConfigurationParameter(name = PARAM_MAX_TOKENS,
+ mandatory=false,
+ description="The maximum length sentence to parse. Longer sentences will have a basic dependency structure created where every node's head is the sentence node.")
+ private int maxTokens=-1;
+
public static final String DEP_MODEL_KEY = "DepModel";
@ExternalResource( key = DEP_MODEL_KEY, mandatory = false )
private DependencySharedModel parserModel = null;
@@ -189,12 +195,13 @@ public class ClearNLPDependencyParserAE
BaseToken token = printableTokens.get( i );
String lemma = useLemmatizer ? lemmatizer.getLemma( token.getCoveredText(), token.getPartOfSpeech() ) : token.getNormalizedForm();
DEPNode node = new DEPNode( i + 1, token.getCoveredText(), lemma, token.getPartOfSpeech(), new DEPFeat() );
+ node.setHead(tree.get(0)); // in case we don't end up actually processing, point everyone at the root.
tree.add( node );
}
// Run parser and convert output back to CAS friendly data types
synchronized(LOCK){
- parser.process( tree );
+ if(this.maxTokens <=0 || printableTokens.size() <= this.maxTokens) parser.process( tree );
ArrayList<ConllDependencyNode> nodes = ClearDependencyUtility.convert( jCas, tree, sentence, printableTokens );
DependencyUtility.addToIndexes( jCas, nodes );
}
@@ -212,6 +219,17 @@ public class ClearNLPDependencyParserAE
return createAnnotatorDescription( defaultParserResource, defaultLemmatizerResource );
}
+ public static synchronized AnalysisEngineDescription createAnnotatorDescription(int maxTokens) throws ResourceInitializationException {
+ return AnalysisEngineFactory.createEngineDescription(
+ ClearNLPDependencyParserAE.class,
+ DEP_MODEL_KEY,
+ defaultParserResource,
+ LEM_MODEL_KEY,
+ defaultLemmatizerResource,
+ PARAM_MAX_TOKENS,
+ maxTokens);
+ }
+
public static AnalysisEngineDescription createAnnotatorDescription( ExternalResourceDescription parserDesc, ExternalResourceDescription lemmaDesc ) throws ResourceInitializationException {
return AnalysisEngineFactory.createEngineDescription(
ClearNLPDependencyParserAE.class,