You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by re...@apache.org on 2020/01/06 22:19:15 UTC
[uima-uimaj] 01/01: [UIMA-6152] "trim" method for AnnotationFS
This is an automated email from the ASF dual-hosted git repository.
rec pushed a commit to branch feature/UIMA-6152-trim-method-for-AnnotationFS
in repository https://gitbox.apache.org/repos/asf/uima-uimaj.git
commit 827f4f695e64b04ffb9621e2047190977f869739
Author: Richard Eckart de Castilho <re...@apache.org>
AuthorDate: Mon Jan 6 23:19:05 2020 +0100
[UIMA-6152] "trim" method for AnnotationFS
- Introduce a Unicode-aware trim() method on AnnotationFS
- Added unit tests
- Added CasCreationUtils.createCas() no-args convenience method
---
uimaj-core/pom.xml | 9 +-
.../org/apache/uima/cas/text/AnnotationFS.java | 22 ++-
.../java/org/apache/uima/jcas/tcas/Annotation.java | 36 ++++
.../org/apache/uima/util/CasCreationUtils.java | 12 ++
.../org/apache/uima/jcas/tcas/AnnotationTest.java | 205 +++++++++++++++++++++
uimaj-parent/pom.xml | 6 +
6 files changed, 285 insertions(+), 5 deletions(-)
diff --git a/uimaj-core/pom.xml b/uimaj-core/pom.xml
index 958c911..eb12807 100644
--- a/uimaj-core/pom.xml
+++ b/uimaj-core/pom.xml
@@ -204,8 +204,13 @@
<artifactId>asm-tree</artifactId>
<version>5.0.4</version>
</dependency>
- -->
-
+ -->
+
+ <dependency>
+ <groupId>org.assertj</groupId>
+ <artifactId>assertj-core</artifactId>
+ <scope>test</scope>
+ </dependency>
</dependencies>
<build>
diff --git a/uimaj-core/src/main/java/org/apache/uima/cas/text/AnnotationFS.java b/uimaj-core/src/main/java/org/apache/uima/cas/text/AnnotationFS.java
index d281c34..2f48c01 100644
--- a/uimaj-core/src/main/java/org/apache/uima/cas/text/AnnotationFS.java
+++ b/uimaj-core/src/main/java/org/apache/uima/cas/text/AnnotationFS.java
@@ -19,9 +19,9 @@
package org.apache.uima.cas.text;
+import java.util.function.IntPredicate;
+
import org.apache.uima.cas.AnnotationBaseFS;
-import org.apache.uima.cas.CASRuntimeException;
-import org.apache.uima.cas.impl.FeatureStructureImplC;
/**
* Interface for Annotation Feature Structures.
@@ -74,4 +74,20 @@ public interface AnnotationFS extends AnnotationBaseFS {
*/
String getCoveredText();
- }
+  /**
+   * Removes whitespace from both ends of this annotation's span by moving the begin offset
+   * forward and the end offset backward. Whitespace is decided per Unicode code point using
+   * {@link Character#isWhitespace(int)}. The begin/end offsets are expected to point to valid
+   * code points.
+   *
+   * @see #trim(IntPredicate)
+   */
+  default void trim() {
+    IntPredicate isWhitespace = codePoint -> Character.isWhitespace(codePoint);
+    trim(isWhitespace);
+  }
+
+ /**
+ * Strips leading and trailing characters matching the given predicate by increasing/decreasing
+ * the begin/end offsets.
+ *
+ * @param aPredicate
+ * tested against the code points at the edges of the span; code points for which it
+ * returns {@code true} are stripped. The no-argument {@link #trim()} passes
+ * {@code Character::isWhitespace}.
+ *
+ * @see #trim()
+ */
+ void trim(IntPredicate aPredicate);
+}
diff --git a/uimaj-core/src/main/java/org/apache/uima/jcas/tcas/Annotation.java b/uimaj-core/src/main/java/org/apache/uima/jcas/tcas/Annotation.java
index d9cf124..b424321 100644
--- a/uimaj-core/src/main/java/org/apache/uima/jcas/tcas/Annotation.java
+++ b/uimaj-core/src/main/java/org/apache/uima/jcas/tcas/Annotation.java
@@ -21,6 +21,7 @@ package org.apache.uima.jcas.tcas;
import java.lang.invoke.CallSite;
import java.lang.invoke.MethodHandle;
+import java.util.function.IntPredicate;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.admin.LinearTypeOrder;
@@ -244,4 +245,39 @@ public class Annotation extends AnnotationBase implements AnnotationImpl {
return Integer.compare(_id, other._id);
}
+  /**
+   * Strips leading and trailing code points matching the given predicate by moving the begin
+   * offset forward and the end offset backward. The end is trimmed first, so a span that
+   * consists only of matching code points collapses to an empty span located at the original
+   * begin offset.
+   * <p>
+   * Nothing happens if the span is already empty or if the view has no document text.
+   *
+   * @param aIsTrimChar
+   *          returns {@code true} for every Unicode code point that should be stripped
+   */
+  @Override
+  public void trim(IntPredicate aIsTrimChar) {
+    int begin = getBegin();
+    int end = getEnd();
+    String text = _casView.getDocumentText();
+
+    // If the span is empty, or the view has no document text (null), there is nothing to trim.
+    // Without the null check, a non-empty span on a text-less view would fail with a
+    // NullPointerException below.
+    if (begin == end || text == null) {
+      return;
+    }
+
+    // First we trim at the end. If a trimmed span is empty, we want to return the original
+    // begin as the begin/end of the trimmed span
+    while (end > 0 && end > begin) {
+      int codePointBeforeEnd = text.codePointBefore(end);
+      if (!aIsTrimChar.test(codePointBeforeEnd)) {
+        break;
+      }
+      // A supplementary code point occupies two chars, so step back by its actual width
+      end -= Character.charCount(codePointBeforeEnd);
+    }
+
+    // Then, trim at the start. The bound mirrors the "end > 0" guard above: every index below
+    // text.length() is a valid argument for codePointAt
+    while (begin < text.length() && begin < end) {
+      int codePointAtBegin = text.codePointAt(begin);
+      if (!aIsTrimChar.test(codePointAtBegin)) {
+        break;
+      }
+      begin += Character.charCount(codePointAtBegin);
+    }
+
+    setBegin(begin);
+    setEnd(end);
+  }
}
diff --git a/uimaj-core/src/main/java/org/apache/uima/util/CasCreationUtils.java b/uimaj-core/src/main/java/org/apache/uima/util/CasCreationUtils.java
index 30dda98..e4246a7 100644
--- a/uimaj-core/src/main/java/org/apache/uima/util/CasCreationUtils.java
+++ b/uimaj-core/src/main/java/org/apache/uima/util/CasCreationUtils.java
@@ -92,6 +92,18 @@ public class CasCreationUtils {
private final static FeatureDescription[] EMPTY_FEAT_DESC_ARRAY = new FeatureDescription[0];
/**
+ * Creates a new CAS instance.
+ *
+ * @return a new CAS instance
+ *
+ * @throws ResourceInitializationException
+ * if CAS creation fails
+ */
+ public static CAS createCas() throws ResourceInitializationException {
+ return createCas((TypeSystemDescription) null, null, null);
+ }
+
+ /**
* Creates a new CAS instance. Note this method does not work for Aggregate Analysis Engine
* descriptors -- use {@link #createCas(AnalysisEngineDescription)} instead.
*
diff --git a/uimaj-core/src/test/java/org/apache/uima/jcas/tcas/AnnotationTest.java b/uimaj-core/src/test/java/org/apache/uima/jcas/tcas/AnnotationTest.java
new file mode 100644
index 0000000..b4336d2
--- /dev/null
+++ b/uimaj-core/src/test/java/org/apache/uima/jcas/tcas/AnnotationTest.java
@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.uima.jcas.tcas;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.util.CasCreationUtils;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link AnnotationFS#trim()} and its predicate-taking variant. Every test sets a
+ * document text, creates an annotation over part of it, trims the annotation and then checks the
+ * resulting begin offset, end offset and covered text.
+ */
+public class AnnotationTest {
+  /**
+   * U+1F600, a non-whitespace supplementary character. It is written as an explicit surrogate
+   * pair so that its length of two UTF-16 code units does not depend on the file encoding.
+   */
+  private static final String SUPPLEMENTARY_CHAR = "\uD83D\uDE00";
+
+  /**
+   * U+1DA80 SIGNWRITING LOCATION-FLOORPLANE SPACE, written as an explicit surrogate pair. It is
+   * not recognized by Character.isWhitespace(...).
+   */
+  private static final String SIGNWRITING_SPACE = "\uD836\uDE80";
+
+  private CAS cas;
+
+  @Before
+  public void setup() throws Exception {
+    cas = CasCreationUtils.createCas();
+  }
+
+  @Test
+  public void thatEmptySpanIsTrimmedToEmptySpan() throws Exception {
+    // Four blanks, so that the offsets used below lie inside the text
+    cas.setDocumentText("    ");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 2, 2);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(2, 2, "");
+  }
+
+  @Test
+  public void thatSpanIsTrimmedToEmptySpanStartingAtOriginalStart() {
+    // Four blanks, so that the offsets used below lie inside the text
+    cas.setDocumentText("    ");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 2, 3);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(2, 2, "");
+  }
+
+  @Test
+  public void thatLeadingAndTrailingWhitespaceIsRemoved() {
+    cas.setDocumentText(" ab ");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 4);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(1, 3, "ab");
+  }
+
+  @Test
+  public void thatInnerWhitespaceIsRemoved1() {
+    cas.setDocumentText(" a b ");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 2);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(1, 2, "a");
+  }
+
+  @Test
+  public void thatInnerWhitespaceIsRemoved2() {
+    cas.setDocumentText(" a b ");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 2, 5);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(3, 4, "b");
+  }
+
+  @Test
+  public void testSingleCharacter() {
+    cas.setDocumentText(".");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 1);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(0, 1, ".");
+  }
+
+  @Test
+  public void testLeadingWhitespace() {
+    cas.setDocumentText(" \t\n\r.");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 5);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(4, 5, ".");
+  }
+
+  @Test
+  public void testLeadingWhitespaceWithSurrogates() {
+    // Four whitespace chars followed by one surrogate pair: six UTF-16 code units in total
+    cas.setDocumentText(" \t\n\r" + SUPPLEMENTARY_CHAR);
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 6);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(4, 6, SUPPLEMENTARY_CHAR);
+  }
+
+  @Test
+  public void testTrailingWhitespace() {
+    cas.setDocumentText(". \n\r\t");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 5);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(0, 1, ".");
+  }
+
+  @Test
+  public void testTrailingWhitespaceWithSurrogates() {
+    // One surrogate pair followed by four whitespace chars: six UTF-16 code units in total
+    cas.setDocumentText(SUPPLEMENTARY_CHAR + " \n\r\t");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 6);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(0, 2, SUPPLEMENTARY_CHAR);
+  }
+
+  @Test
+  public void testLeadingTrailingWhitespace() {
+    cas.setDocumentText(" \t\n\r. \n\r\t");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 9);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(4, 5, ".");
+  }
+
+  @Test
+  public void testLeadingTrailingWhitespaceWithSurrogatesAndCustomPredicate() {
+    // U+1DA80 is the SIGNWRITING LOCATION-FLOORPLANE SPACE. It is not recognized by
+    // Character.isWhitespace(...), so we use a custom predicate to filter it out
+    cas.setDocumentText(" \t" + SIGNWRITING_SPACE + "\n\r. \n" + SIGNWRITING_SPACE + "\r\t");
+
+    // The text is 13 UTF-16 code units long (each U+1DA80 takes two). The annotation spans all
+    // of it so that the surrogate pair is trimmed at the end as well as at the start.
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 0, 13);
+    ann.trim(codepoint -> Character.isWhitespace(codepoint) || 0x1DA80 == codepoint);
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(6, 7, ".");
+  }
+
+  @Test
+  public void testBlankString() {
+    // Three blanks, so that the offsets used below lie inside the text
+    cas.setDocumentText("   ");
+
+    AnnotationFS ann = cas.createAnnotation(cas.getAnnotationType(), 1, 2);
+    ann.trim();
+
+    assertThat(ann)
+        .extracting(AnnotationFS::getBegin, AnnotationFS::getEnd, AnnotationFS::getCoveredText)
+        .containsExactly(1, 1, "");
+  }
+}
diff --git a/uimaj-parent/pom.xml b/uimaj-parent/pom.xml
index e585666..8d6d89b 100644
--- a/uimaj-parent/pom.xml
+++ b/uimaj-parent/pom.xml
@@ -168,6 +168,12 @@
<version>4.12</version>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>org.assertj</groupId>
+ <artifactId>assertj-core</artifactId>
+ <version>3.14.0</version>
+ <scope>test</scope>
+ </dependency>
<!-- set dependency versions for logger parts -->
<dependency>
<groupId>org.slf4j</groupId>