You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2016/12/19 17:32:03 UTC

svn commit: r1775141 - /ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java

Author: seanfinan
Date: Mon Dec 19 17:32:03 2016
New Revision: 1775141

URL: http://svn.apache.org/viewvc?rev=1775141&view=rev
Log:
Move divider line detection to ctakes

Modified:
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java

Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java?rev=1775141&r1=1775140&r2=1775141&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java Mon Dec 19 17:32:03 2016
@@ -2,6 +2,7 @@ package org.apache.ctakes.core.ae;
 
 
 import org.apache.ctakes.core.util.Pair;
+import org.apache.ctakes.core.util.regex.RegexSpanFinder;
 import org.apache.ctakes.core.util.regex.TimeoutMatcher;
 import org.apache.ctakes.typesystem.type.textspan.Segment;
 import org.apache.log4j.Logger;
@@ -30,6 +31,8 @@ abstract public class RegexSectionizer e
     */
    static private final String DEFAULT_SEGMENT_ID = "SIMPLE_SEGMENT";
    static private final String SECTION_NAME_EX = "SECTION_NAME";
+   static public final String DIVIDER_LINE_NAME = "DIVIDER_LINE";   // id and preferred text set on divider line Segments
+   static private final Pattern DIVIDER_LINE_PATTERN = Pattern.compile( "^[_\\-=]{4,}\\r?\\n", Pattern.MULTILINE );   // MULTILINE: ^ anchors at every line start, not only at offset 0
 
    private enum TagType {
       HEADER, FOOTER
@@ -110,6 +113,7 @@ abstract public class RegexSectionizer e
    @Override
    public void process( final JCas jcas ) throws AnalysisEngineProcessException {
       LOGGER.info( "Starting processing" );
+      createDividerLines( jcas );
       if ( _sectionTypes.isEmpty() ) {
          LOGGER.info( "Finished processing, no section types defined" );
          return;
@@ -311,5 +315,24 @@ abstract public class RegexSectionizer e
       return text2.equalsIgnoreCase( "true" ) || text2.equalsIgnoreCase( "false" );
    }
 
+   /**
+    * Adds a DIVIDER_LINE_NAME Segment to the cas indexes for each DIVIDER_LINE_PATTERN match in the text.
+    * @param jcas cas supplying the document text to search and receiving the new Segments
+    */
+   static private void createDividerLines( final JCas jcas ) {
+      final String docText = jcas.getDocumentText();
+      if ( docText == null || docText.isEmpty() ) {
+         return;   // the cas may have no document text set; there is nothing to search
+      }
+      try ( RegexSpanFinder finder = new RegexSpanFinder( DIVIDER_LINE_PATTERN ) ) {
+         for ( Pair<Integer> span : finder.findSpans( docText ) ) {
+            final Segment lineSegment = new Segment( jcas, span.getValue1(), span.getValue2() );
+            lineSegment.setId( DIVIDER_LINE_NAME );
+            lineSegment.setPreferredText( DIVIDER_LINE_NAME );
+            lineSegment.addToIndexes();
+         }
+      }
+   }
+
 
 }