You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by re...@apache.org on 2020/01/06 22:19:14 UTC

[uima-uimaj] branch feature/UIMA-6152-trim-method-for-AnnotationFS created (now 827f4f6)

This is an automated email from the ASF dual-hosted git repository.

rec pushed a change to branch feature/UIMA-6152-trim-method-for-AnnotationFS
in repository https://gitbox.apache.org/repos/asf/uima-uimaj.git.


      at 827f4f6  [UIMA-6152] "trim" method for AnnotationFS

This branch includes the following new commits:

     new 827f4f6  [UIMA-6152] "trim" method for AnnotationFS

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  Revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[uima-uimaj] 01/01: [UIMA-6152] "trim" method for AnnotationFS

Posted by re...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

rec pushed a commit to branch feature/UIMA-6152-trim-method-for-AnnotationFS
in repository https://gitbox.apache.org/repos/asf/uima-uimaj.git

commit 827f4f695e64b04ffb9621e2047190977f869739
Author: Richard Eckart de Castilho <re...@apache.org>
AuthorDate: Mon Jan 6 23:19:05 2020 +0100

    [UIMA-6152] "trim" method for AnnotationFS
    
    - Introduce a Unicode-aware trim() method on AnnotationFS
    - Added unit tests
    - Added CasCreationUtils.createCas() no-args convenience method
---
 uimaj-core/pom.xml                                 |   9 +-
 .../org/apache/uima/cas/text/AnnotationFS.java     |  22 ++-
 .../java/org/apache/uima/jcas/tcas/Annotation.java |  36 ++++
 .../org/apache/uima/util/CasCreationUtils.java     |  12 ++
 .../org/apache/uima/jcas/tcas/AnnotationTest.java  | 205 +++++++++++++++++++++
 uimaj-parent/pom.xml                               |   6 +
 6 files changed, 285 insertions(+), 5 deletions(-)

diff --git a/uimaj-core/pom.xml b/uimaj-core/pom.xml
index 958c911..eb12807 100644
--- a/uimaj-core/pom.xml
+++ b/uimaj-core/pom.xml
@@ -204,8 +204,13 @@
       <artifactId>asm-tree</artifactId>
       <version>5.0.4</version>
     </dependency>
- -->		
-
+ -->
+ 
+    <dependency>
+      <groupId>org.assertj</groupId>
+      <artifactId>assertj-core</artifactId>
+      <scope>test</scope>
+    </dependency>
 	</dependencies>
 	
 	<build>
diff --git a/uimaj-core/src/main/java/org/apache/uima/cas/text/AnnotationFS.java b/uimaj-core/src/main/java/org/apache/uima/cas/text/AnnotationFS.java
index d281c34..2f48c01 100644
--- a/uimaj-core/src/main/java/org/apache/uima/cas/text/AnnotationFS.java
+++ b/uimaj-core/src/main/java/org/apache/uima/cas/text/AnnotationFS.java
@@ -19,9 +19,9 @@
 
 package org.apache.uima.cas.text;
 
+import java.util.function.IntPredicate;
+
 import org.apache.uima.cas.AnnotationBaseFS;
-import org.apache.uima.cas.CASRuntimeException;
-import org.apache.uima.cas.impl.FeatureStructureImplC;
 
 /**
  * Interface for Annotation Feature Structures.
@@ -74,4 +74,20 @@ public interface AnnotationFS extends AnnotationBaseFS {
    */
   String getCoveredText();
 
- }
+  /**
+   * Strips leading and trailing whitespace, as defined by {@link Character#isWhitespace(int)}, by
+   * increasing the begin offset and decreasing the end offset. Works on Unicode code points, not
+   * chars; the begin/end offsets are expected to lie on code point boundaries.
+   */
+  default void trim() {
+    trim(Character::isWhitespace);
+  }
+  
+  /**
+   * Strips leading and trailing code points matching the given predicate by increasing the begin
+   * offset and decreasing the end offset.
+   * @param aPredicate returns {@code true} for each Unicode code point that is to be trimmed
+   * @see #trim()
+   */
+  void trim(IntPredicate aPredicate);
+}
diff --git a/uimaj-core/src/main/java/org/apache/uima/jcas/tcas/Annotation.java b/uimaj-core/src/main/java/org/apache/uima/jcas/tcas/Annotation.java
index d9cf124..b424321 100644
--- a/uimaj-core/src/main/java/org/apache/uima/jcas/tcas/Annotation.java
+++ b/uimaj-core/src/main/java/org/apache/uima/jcas/tcas/Annotation.java
@@ -21,6 +21,7 @@ package org.apache.uima.jcas.tcas;
 
 import java.lang.invoke.CallSite;
 import java.lang.invoke.MethodHandle;
+import java.util.function.IntPredicate;
 
 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.admin.LinearTypeOrder;
@@ -244,4 +245,39 @@ public class Annotation extends AnnotationBase implements AnnotationImpl {
     return Integer.compare(_id,  other._id);
   }
 
+  /**
+   * Trims code points accepted by the predicate from both ends of this annotation by moving its
+   * begin/end offsets. A span made up only of such code points collapses to an empty span at the
+   * original begin offset. Does nothing if the span is empty or the view has no document text.
+   *
+   * @param aIsTrimChar
+   *          returns {@code true} for each Unicode code point that is to be trimmed
+   */
+  @Override
+  public void trim(IntPredicate aIsTrimChar) {
+    int begin = getBegin();
+    int end = getEnd();
+    String text = _casView.getDocumentText();
+
+    // An empty span has nothing to trim. Without document text there are no characters to
+    // inspect, and dereferencing the null text below would throw a NullPointerException.
+    if (begin == end || text == null) {
+      return;
+    }
+
+    // Trim at the end first: if every code point in the span is trimmed, end stops at the
+    // original begin, which then becomes the begin/end of the resulting empty span
+    int codepoint;
+    while (end > begin && aIsTrimChar.test(codepoint = text.codePointBefore(end))) {
+      end -= Character.charCount(codepoint);
+    }
+
+    // Then trim at the start; with end inside the text, begin < end keeps codePointAt in range
+    while (begin < end && aIsTrimChar.test(codepoint = text.codePointAt(begin))) {
+      begin += Character.charCount(codepoint);
+    }
+
+    setBegin(begin);
+    setEnd(end);
+  }
 }
diff --git a/uimaj-core/src/main/java/org/apache/uima/util/CasCreationUtils.java b/uimaj-core/src/main/java/org/apache/uima/util/CasCreationUtils.java
index 30dda98..e4246a7 100644
--- a/uimaj-core/src/main/java/org/apache/uima/util/CasCreationUtils.java
+++ b/uimaj-core/src/main/java/org/apache/uima/util/CasCreationUtils.java
@@ -92,6 +92,18 @@ public class CasCreationUtils {
   private final static FeatureDescription[] EMPTY_FEAT_DESC_ARRAY = new FeatureDescription[0];
   
   /**
+   * Creates a new CAS instance by delegating to the three-argument {@code createCas} overload
+   * with {@code null} passed for every argument.
+   *
+   * @return a new CAS instance
+   * @throws ResourceInitializationException if CAS creation fails
+   */
+  public static CAS createCas() throws ResourceInitializationException {
+    final TypeSystemDescription noTypeSystem = null;
+    return createCas(noTypeSystem, null, null);
+  }
+  
+  /**
    * Creates a new CAS instance. Note this method does not work for Aggregate Analysis Engine
    * descriptors -- use {@link #createCas(AnalysisEngineDescription)} instead.
    * 
diff --git a/uimaj-core/src/test/java/org/apache/uima/jcas/tcas/AnnotationTest.java b/uimaj-core/src/test/java/org/apache/uima/jcas/tcas/AnnotationTest.java
new file mode 100644
index 0000000..b4336d2
--- /dev/null
+++ b/uimaj-core/src/test/java/org/apache/uima/jcas/tcas/AnnotationTest.java
@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.uima.jcas.tcas;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.util.CasCreationUtils;
+import org.junit.Before;
+import org.junit.Test;
+
+/** Tests the Unicode-aware {@code trim} methods of {@link AnnotationFS} on CAS annotations. */
+public class AnnotationTest {
+  private CAS cas;
+
+  @Before
+  public void setup() throws Exception {
+    cas = CasCreationUtils.createCas();
+  }
+
+  @Test
+  public void thatEmptySpanIsTrimmedToEmptySpan() {
+    cas.setDocumentText("    ");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 2, 2);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(2, 2, "");
+  }
+
+  @Test
+  public void thatSpanIsTrimmedToEmptySpanStartingAtOriginalStart() {
+    cas.setDocumentText("    ");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 2, 3);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(2, 2, "");
+  }
+
+  @Test
+  public void thatLeadingAndTrailingWhitespaceIsRemoved() {
+    cas.setDocumentText(" ab ");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 4);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(1, 3, "ab");
+  }
+
+  /** Only the edges of the span are trimmed: the span " a" shrinks to "a". */
+  @Test
+  public void thatInnerWhitespaceIsRemoved1() {
+    cas.setDocumentText(" a b ");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 2);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(1, 2, "a");
+  }
+
+  /** Only the edges of the span are trimmed: the span " b " shrinks to "b". */
+  @Test
+  public void thatInnerWhitespaceIsRemoved2() {
+    cas.setDocumentText(" a b ");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 2, 5);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(3, 4, "b");
+  }
+
+  /** A span without any whitespace is left untouched. */
+  @Test
+  public void testSingleCharacter() {
+    cas.setDocumentText(".");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 1);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(0, 1, ".");
+  }
+
+  /** Space, tab, line feed and carriage return are all trimmed from the start. */
+  @Test
+  public void testLeadingWhitespace() {
+    cas.setDocumentText(" \t\n\r.");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 5);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(4, 5, ".");
+  }
+
+  /** Leading whitespace is trimmed up to a supplementary character (U+1F600, two chars). */
+  @Test
+  public void testLeadingWhitespaceWithSurrogates() {
+    cas.setDocumentText(" \t\n\r\uD83D\uDE00");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 6);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(4, 6, "\uD83D\uDE00");
+  }
+
+  /** Space, line feed, carriage return and tab are all trimmed from the end. */
+  @Test
+  public void testTrailingWhitespace() {
+    cas.setDocumentText(". \n\r\t");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 5);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(0, 1, ".");
+  }
+
+  /** Trailing whitespace is trimmed back to a supplementary character (U+1F600, two chars). */
+  @Test
+  public void testTrailingWhitespaceWithSurrogates() {
+    cas.setDocumentText("\uD83D\uDE00 \n\r\t");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 6);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(0, 2, "\uD83D\uDE00");
+  }
+
+  /** Whitespace on both sides of the span is trimmed in a single call. */
+  @Test
+  public void testLeadingTrailingWhitespace() {
+    cas.setDocumentText(" \t\n\r. \n\r\t");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 9);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(4, 5, ".");
+  }
+
+  @Test
+  public void testLeadingTrailingWhitespaceWithSurrogatesAndCustomPredicate() {
+    // U+1DA80 (SIGNWRITING LOCATION-FLOORPLANE SPACE, a surrogate pair in UTF-16) is not recognized
+    // by Character.isWhitespace(...), so a custom predicate trims it. The span covers the whole
+    // 13-char text so that the supplementary character is also crossed when trimming the end.
+    cas.setDocumentText(" \t\uD836\uDE80\n\r. \n\uD836\uDE80\r\t");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 13);
+    ann.trim(codepoint -> Character.isWhitespace(codepoint) || 0x1DA80 == codepoint);
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(6, 7, ".");
+  }
+
+  /** An all-whitespace span inside blank text collapses to an empty span at its begin. */
+  @Test
+  public void testBlankString() {
+    cas.setDocumentText("   ");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 1, 2);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(1, 1, "");
+  }
+}
diff --git a/uimaj-parent/pom.xml b/uimaj-parent/pom.xml
index e585666..8d6d89b 100644
--- a/uimaj-parent/pom.xml
+++ b/uimaj-parent/pom.xml
@@ -168,6 +168,12 @@
         <version>4.12</version>
         <scope>test</scope>
       </dependency>
+      <dependency>
+        <groupId>org.assertj</groupId>
+        <artifactId>assertj-core</artifactId>
+        <version>3.14.0</version>
+        <scope>test</scope>
+      </dependency>
       <!-- set dependency versions for logger parts -->
       <dependency>
         <groupId>org.slf4j</groupId>