You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2016/12/19 17:32:03 UTC
svn commit: r1775141 -
/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java
Author: seanfinan
Date: Mon Dec 19 17:32:03 2016
New Revision: 1775141
URL: http://svn.apache.org/viewvc?rev=1775141&view=rev
Log:
Move divider line detection to ctakes
Modified:
ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java
Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java?rev=1775141&r1=1775140&r2=1775141&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java Mon Dec 19 17:32:03 2016
@@ -2,6 +2,7 @@ package org.apache.ctakes.core.ae;
import org.apache.ctakes.core.util.Pair;
+import org.apache.ctakes.core.util.regex.RegexSpanFinder;
import org.apache.ctakes.core.util.regex.TimeoutMatcher;
import org.apache.ctakes.typesystem.type.textspan.Segment;
import org.apache.log4j.Logger;
@@ -30,6 +31,8 @@ abstract public class RegexSectionizer e
*/
static private final String DEFAULT_SEGMENT_ID = "SIMPLE_SEGMENT";
static private final String SECTION_NAME_EX = "SECTION_NAME";
+ static public final String DIVIDER_LINE_NAME = "DIVIDER_LINE";
+ // MULTILINE is required: without it '^' only anchors at the very start of the document text,
+ // so a divider line anywhere but the first line would never be matched.
+ static private final Pattern DIVIDER_LINE_PATTERN = Pattern.compile( "^[_\\-=]{4,}\\r?\\n", Pattern.MULTILINE );
private enum TagType {
HEADER, FOOTER
@@ -110,6 +113,7 @@ abstract public class RegexSectionizer e
@Override
public void process( final JCas jcas ) throws AnalysisEngineProcessException {
LOGGER.info( "Starting processing" );
+ createDividerLines( jcas );
if ( _sectionTypes.isEmpty() ) {
LOGGER.info( "Finished processing, no section types defined" );
return;
@@ -311,5 +315,24 @@ abstract public class RegexSectionizer e
return text2.equalsIgnoreCase( "true" ) || text2.equalsIgnoreCase( "false" );
}
+ /**
+ * Finds divider lines in the document text and adds a {@link Segment} to the cas indexes for each one.
+ * A divider line is any span of the document text matched by DIVIDER_LINE_PATTERN.
+ * Each created segment has both its id and its preferred text set to {@link #DIVIDER_LINE_NAME}.
+ * Nothing is created when the cas has no document text.
+ *
+ * @param jcas cas whose document text is searched and to which the divider segments are added
+ */
+ static private void createDividerLines( final JCas jcas ) {
+ final String docText = jcas.getDocumentText();
+ if ( docText == null || docText.isEmpty() ) {
+ // No text is available for this cas, so there is nothing to search.
+ return;
+ }
+ final List<Pair<Integer>> spans = new ArrayList<>();
+ // Collect the spans first so that the finder is closed before any segment is created.
+ try ( RegexSpanFinder finder = new RegexSpanFinder( DIVIDER_LINE_PATTERN ) ) {
+ spans.addAll( finder.findSpans( docText ) );
+ }
+ for ( Pair<Integer> span : spans ) {
+ // value1 and value2 of the span are used as the begin and end offsets of the segment.
+ final Segment lineSegment = new Segment( jcas, span.getValue1(), span.getValue2() );
+ lineSegment.setId( DIVIDER_LINE_NAME );
+ lineSegment.setPreferredText( DIVIDER_LINE_NAME );
+ lineSegment.addToIndexes();
+ }
+ }
+
}