You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by re...@apache.org on 2020/01/06 22:19:15 UTC

[uima-uimaj] 01/01: [UIMA-6152] "trim" method for AnnotationFS

This is an automated email from the ASF dual-hosted git repository.

rec pushed a commit to branch feature/UIMA-6152-trim-method-for-AnnotationFS
in repository https://gitbox.apache.org/repos/asf/uima-uimaj.git

commit 827f4f695e64b04ffb9621e2047190977f869739
Author: Richard Eckart de Castilho <re...@apache.org>
AuthorDate: Mon Jan 6 23:19:05 2020 +0100

    [UIMA-6152] "trim" method for AnnotationFS
    
    - Introduce a Unicode-aware trim() method on AnnotationFS
    - Added unit tests
    - Added CasCreationUtils.createCas() no-args convenience method
---
 uimaj-core/pom.xml                                 |   9 +-
 .../org/apache/uima/cas/text/AnnotationFS.java     |  22 ++-
 .../java/org/apache/uima/jcas/tcas/Annotation.java |  36 ++++
 .../org/apache/uima/util/CasCreationUtils.java     |  12 ++
 .../org/apache/uima/jcas/tcas/AnnotationTest.java  | 205 +++++++++++++++++++++
 uimaj-parent/pom.xml                               |   6 +
 6 files changed, 285 insertions(+), 5 deletions(-)

diff --git a/uimaj-core/pom.xml b/uimaj-core/pom.xml
index 958c911..eb12807 100644
--- a/uimaj-core/pom.xml
+++ b/uimaj-core/pom.xml
@@ -204,8 +204,13 @@
       <artifactId>asm-tree</artifactId>
       <version>5.0.4</version>
     </dependency>
- -->		
-
+ -->
+ 
+    <dependency>
+      <groupId>org.assertj</groupId>
+      <artifactId>assertj-core</artifactId>
+      <scope>test</scope>
+    </dependency>
 	</dependencies>
 	
 	<build>
diff --git a/uimaj-core/src/main/java/org/apache/uima/cas/text/AnnotationFS.java b/uimaj-core/src/main/java/org/apache/uima/cas/text/AnnotationFS.java
index d281c34..2f48c01 100644
--- a/uimaj-core/src/main/java/org/apache/uima/cas/text/AnnotationFS.java
+++ b/uimaj-core/src/main/java/org/apache/uima/cas/text/AnnotationFS.java
@@ -19,9 +19,9 @@
 
 package org.apache.uima.cas.text;
 
+import java.util.function.IntPredicate;
+
 import org.apache.uima.cas.AnnotationBaseFS;
-import org.apache.uima.cas.CASRuntimeException;
-import org.apache.uima.cas.impl.FeatureStructureImplC;
 
 /**
  * Interface for Annotation Feature Structures.
@@ -74,4 +74,20 @@ public interface AnnotationFS extends AnnotationBaseFS {
    */
   String getCoveredText();
 
- }
+  /**
+   * Strips leading and trailing whitespace, as defined by {@link Character#isWhitespace(int)}, by
+   * increasing/decreasing the begin/end offsets. This method works on Unicode codepoints and
+   * expects that the begin/end offsets point to valid codepoints.
+   */
+  default void trim() {
+      trim(Character::isWhitespace);
+  }
+  
+  /**
+   * Strips leading and trailing codepoints matching the predicate by moving begin/end inwards.
+   *
+   * @param aPredicate receives a codepoint and returns {@code true} if it is to be stripped
+   * @see #trim()
+   */
+  void trim(IntPredicate aPredicate);
+}
diff --git a/uimaj-core/src/main/java/org/apache/uima/jcas/tcas/Annotation.java b/uimaj-core/src/main/java/org/apache/uima/jcas/tcas/Annotation.java
index d9cf124..b424321 100644
--- a/uimaj-core/src/main/java/org/apache/uima/jcas/tcas/Annotation.java
+++ b/uimaj-core/src/main/java/org/apache/uima/jcas/tcas/Annotation.java
@@ -21,6 +21,7 @@ package org.apache.uima.jcas.tcas;
 
 import java.lang.invoke.CallSite;
 import java.lang.invoke.MethodHandle;
+import java.util.function.IntPredicate;
 
 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.admin.LinearTypeOrder;
@@ -244,4 +245,39 @@ public class Annotation extends AnnotationBase implements AnnotationImpl {
     return Integer.compare(_id,  other._id);
   }
 
+  /**
+   * Trims this annotation in place. Code points accepted by {@code aIsTrimChar} are stripped from
+   * the end first and then from the start, so a span made up only of such code points collapses
+   * into an empty span at the original begin offset. An empty span, or a view without document
+   * text, is left untouched.
+   *
+   * @param aIsTrimChar tested with a Unicode code point; {@code true} marks it for trimming
+   */
+  @Override
+  public void trim(IntPredicate aIsTrimChar) {
+    int begin = getBegin();
+    int end = getEnd();
+    String text = _casView.getDocumentText();
+
+    // Nothing to trim if the span is empty or the view has no document text to inspect
+    if (begin == end || text == null) {
+      return;
+    }
+
+    // Trim at the end first; "end > begin" alone keeps the look-behind inside the span
+    int codepoint;
+    while (end > begin && aIsTrimChar.test(codepoint = text.codePointBefore(end))) {
+      end -= Character.charCount(codepoint);
+    }
+
+    // Then trim at the start; "begin < end" alone bounds the look-up, so no separate check
+    // against the text length is needed and the last character is treated like any other
+    while (begin < end && aIsTrimChar.test(codepoint = text.codePointAt(begin))) {
+      begin += Character.charCount(codepoint);
+    }
+
+    // Write both offsets back, even if trimming left one of them unchanged
+    setBegin(begin);
+    setEnd(end);
+  }
 }
diff --git a/uimaj-core/src/main/java/org/apache/uima/util/CasCreationUtils.java b/uimaj-core/src/main/java/org/apache/uima/util/CasCreationUtils.java
index 30dda98..e4246a7 100644
--- a/uimaj-core/src/main/java/org/apache/uima/util/CasCreationUtils.java
+++ b/uimaj-core/src/main/java/org/apache/uima/util/CasCreationUtils.java
@@ -92,6 +92,18 @@ public class CasCreationUtils {
   private final static FeatureDescription[] EMPTY_FEAT_DESC_ARRAY = new FeatureDescription[0];
   
   /**
+   * Creates a new CAS instance by delegating to the three-argument {@code createCas} overload
+   * with a {@code null} type system description and {@code null} for the two other arguments.
+   * 
+   * @return a new CAS instance
+   * @throws ResourceInitializationException
+   *                 if CAS creation fails
+   */
+  public static CAS createCas() throws ResourceInitializationException {
+    return createCas((TypeSystemDescription) null, null, null);
+  }
+  
+  /**
    * Creates a new CAS instance. Note this method does not work for Aggregate Analysis Engine
    * descriptors -- use {@link #createCas(AnalysisEngineDescription)} instead.
    * 
diff --git a/uimaj-core/src/test/java/org/apache/uima/jcas/tcas/AnnotationTest.java b/uimaj-core/src/test/java/org/apache/uima/jcas/tcas/AnnotationTest.java
new file mode 100644
index 0000000..b4336d2
--- /dev/null
+++ b/uimaj-core/src/test/java/org/apache/uima/jcas/tcas/AnnotationTest.java
@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.uima.jcas.tcas;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.util.CasCreationUtils;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests for the {@code trim()} methods of {@link AnnotationFS}, run against annotations that
+ * are created on a fresh {@link CAS} for every test.
+ */
+public class AnnotationTest {
+  private CAS cas;
+
+  @Before
+  public void setup() throws Exception {
+    cas = CasCreationUtils.createCas();
+  }
+
+  @Test
+  public void thatEmptySpanIsTrimmedToEmptySpan() throws Exception {
+    cas.setDocumentText("    ");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 2, 2);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(2, 2, "");
+  }
+
+  @Test
+  public void thatSpanIsTrimmedToEmptySpanStartingAtOriginalStart() {
+    // [2,3) covers a single blank, so the span collapses onto its original begin offset
+    cas.setDocumentText("    ");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 2, 3);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(2, 2, "");
+  }
+
+  @Test
+  public void thatLeadingAndTrailingWhitespaceIsRemoved() {
+    cas.setDocumentText(" ab ");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 4);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(1, 3, "ab");
+  }
+
+  @Test
+  public void thatInnerWhitespaceIsRemoved1() {
+    // [0,2) covers " a": only the leading blank is stripped
+    cas.setDocumentText(" a b ");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 2);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(1, 2, "a");
+  }
+
+  @Test
+  public void thatInnerWhitespaceIsRemoved2() {
+    // [2,5) covers " b ": one blank is stripped on either side
+    cas.setDocumentText(" a b ");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 2, 5);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(3, 4, "b");
+  }
+
+  @Test
+  public void testSingleCharacter() {
+    cas.setDocumentText(".");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 1);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(0, 1, ".");
+  }
+
+  @Test
+  public void testLeadingWhitespace() {
+    cas.setDocumentText(" \t\n\r.");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 5);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(4, 5, ".");
+  }
+
+  @Test
+  public void testLeadingWhitespaceWithSurrogates() {
+    // U+1F600 GRINNING FACE is written as its surrogate pair so the source stays pure ASCII
+    cas.setDocumentText(" \t\n\r\uD83D\uDE00");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 6);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(4, 6, "\uD83D\uDE00");
+  }
+
+  @Test
+  public void testTrailingWhitespace() {
+    cas.setDocumentText(". \n\r\t");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 5);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(0, 1, ".");
+  }
+
+  @Test
+  public void testTrailingWhitespaceWithSurrogates() {
+    // U+1F600 GRINNING FACE is written as its surrogate pair so the source stays pure ASCII
+    cas.setDocumentText("\uD83D\uDE00 \n\r\t");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 6);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(0, 2, "\uD83D\uDE00");
+  }
+
+  @Test
+  public void testLeadingTrailingWhitespace() {
+    cas.setDocumentText(" \t\n\r. \n\r\t");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 9);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(4, 5, ".");
+  }
+
+  @Test
+  public void testLeadingTrailingWhitespaceWithSurrogatesAndCustomPredicate() {
+    // U+1DA80 is the SIGNWRITING LOCATION-FLOORPLANE SPACE. It is not recognized by
+    // Character.isWhitespace(...), so we use a custom predicate to filter it out. It is written
+    // as its surrogate pair here so the source stays pure ASCII.
+    cas.setDocumentText(" \t\uD836\uDE80\n\r. \n\uD836\uDE80\r\t");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 9);
+    ann.trim(codepoint -> Character.isWhitespace(codepoint) || 0x1DA80 == codepoint);
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(6, 7, ".");
+  }
+
+  @Test
+  public void testBlankString() {
+    // [1,2) covers a single blank; trimming leaves an empty span at offset 1
+    cas.setDocumentText("   ");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 1, 2);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(1, 1, "");
+  }
+}
diff --git a/uimaj-parent/pom.xml b/uimaj-parent/pom.xml
index e585666..8d6d89b 100644
--- a/uimaj-parent/pom.xml
+++ b/uimaj-parent/pom.xml
@@ -168,6 +168,12 @@
         <version>4.12</version>
         <scope>test</scope>
       </dependency>
+      <dependency>
+        <groupId>org.assertj</groupId>
+        <artifactId>assertj-core</artifactId>
+        <version>3.14.0</version>
+        <scope>test</scope>
+      </dependency>
       <!-- set dependency versions for logger parts -->
       <dependency>
         <groupId>org.slf4j</groupId>