You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2015/05/27 22:09:37 UTC

svn commit: r1682114 - in /lucene/dev/trunk/lucene: ./ highlighter/src/java/org/apache/lucene/search/postingshighlight/ highlighter/src/test/org/apache/lucene/search/postingshighlight/

Author: rmuir
Date: Wed May 27 20:09:37 2015
New Revision: 1682114

URL: http://svn.apache.org/r1682114
Log:
LUCENE-6485: Add CustomSeparatorBreakIterator

Added:
    lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/CustomSeparatorBreakIterator.java   (with props)
    lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestCustomSeparatorBreakIterator.java   (with props)
Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestWholeBreakIterator.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1682114&r1=1682113&r2=1682114&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Wed May 27 20:09:37 2015
@@ -34,6 +34,13 @@ API Changes
 
 ======================= Lucene 5.3.0 =======================
 
+New Features
+
+* LUCENE-6485: Add CustomSeparatorBreakIterator to postings
+  highlighter which splits on any character. For example, it 
+  can be used with getMultiValueSeparator to render whole field
+  values.  (Luca Cavanna via Robert Muir)
+
 Bug fixes
 
 * LUCENE-6500: ParallelCompositeReader did not always call

Added: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/CustomSeparatorBreakIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/CustomSeparatorBreakIterator.java?rev=1682114&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/CustomSeparatorBreakIterator.java (added)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/CustomSeparatorBreakIterator.java Wed May 27 20:09:37 2015
@@ -0,0 +1,151 @@
+package org.apache.lucene.search.postingshighlight;
+
+import java.text.BreakIterator;
+import java.text.CharacterIterator;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A {@link BreakIterator} that breaks the text whenever a certain separator, provided as a constructor argument, is found.
+ */
+public final class CustomSeparatorBreakIterator extends BreakIterator {
+
+  private final char separator;
+  private CharacterIterator text;
+  private int current;
+
+  public CustomSeparatorBreakIterator(char separator) {
+    this.separator = separator;
+  }
+
+  @Override
+  public int current() {
+    return current;
+  }
+
+  @Override
+  public int first() {
+    text.setIndex(text.getBeginIndex());
+    return current = text.getIndex();
+  }
+
+  @Override
+  public int last() {
+    text.setIndex(text.getEndIndex());
+    return current = text.getIndex();
+  }
+
+  @Override
+  public int next() {
+    if (text.getIndex() == text.getEndIndex()) {
+      return DONE;
+    } else {
+      return advanceForward();
+    }
+  }
+
+  private int advanceForward() {
+    char c;
+    while ((c = text.next()) != CharacterIterator.DONE) {
+      if (c == separator) {
+        return current = text.getIndex() + 1;
+      }
+    }
+    assert text.getIndex() == text.getEndIndex();
+    return current = text.getIndex();
+  }
+
+  @Override
+  public int following(int pos) {
+    if (pos < text.getBeginIndex() || pos > text.getEndIndex()) {
+      throw new IllegalArgumentException("offset out of bounds");
+    } else if (pos == text.getEndIndex()) {
+      // this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
+      // http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=9000909
+      text.setIndex(text.getEndIndex());
+      current = text.getIndex();
+      return DONE;
+    } else {
+      text.setIndex(pos);
+      current = text.getIndex();
+      return advanceForward();
+    }
+  }
+
+  @Override
+  public int previous() {
+    if (text.getIndex() == text.getBeginIndex()) {
+      return DONE;
+    } else {
+      return advanceBackward();
+    }
+  }
+
+  private int advanceBackward() {
+    char c;
+    while ((c = text.previous()) != CharacterIterator.DONE) {
+      if (c == separator) {
+        return current = text.getIndex() + 1;
+      }
+    }
+    assert text.getIndex() == text.getBeginIndex();
+    return current = text.getIndex();
+  }
+
+  @Override
+  public int preceding(int pos) {
+    if (pos < text.getBeginIndex() || pos > text.getEndIndex()) {
+      throw new IllegalArgumentException("offset out of bounds");
+    } else if (pos == text.getBeginIndex()) {
+      // this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
+      // http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=9000909
+      text.setIndex(text.getBeginIndex());
+      current = text.getIndex();
+      return DONE;
+    } else {
+      text.setIndex(pos);
+      current = text.getIndex();
+      return advanceBackward();
+    }
+  }
+
+  @Override
+  public int next(int n) {
+    if (n < 0) {
+      for (int i = 0; i < -n; i++) {
+        previous();
+      }
+    } else {
+      for (int i = 0; i < n; i++) {
+        next();
+      }
+    }
+    return current();
+  }
+
+  @Override
+  public CharacterIterator getText() {
+    return text;
+  }
+
+  @Override
+  public void setText(CharacterIterator newText) {
+    text = newText;
+    current = text.getBeginIndex();
+  }
+}

Added: lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestCustomSeparatorBreakIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestCustomSeparatorBreakIterator.java?rev=1682114&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestCustomSeparatorBreakIterator.java (added)
+++ lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestCustomSeparatorBreakIterator.java Wed May 27 20:09:37 2015
@@ -0,0 +1,115 @@
+package org.apache.lucene.search.postingshighlight;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
+import org.apache.lucene.util.LuceneTestCase;
+
+import java.text.BreakIterator;
+import java.util.Locale;
+
+import static org.apache.lucene.search.postingshighlight.TestWholeBreakIterator.assertSameBreaks;
+import static org.hamcrest.CoreMatchers.equalTo;
+
+public class TestCustomSeparatorBreakIterator extends LuceneTestCase {
+
+  private static final Character[] SEPARATORS = new Character[]{' ', '\u0000', 8233};
+
+  public void testBreakOnCustomSeparator() throws Exception {
+    Character separator = randomSeparator();
+    BreakIterator bi = new CustomSeparatorBreakIterator(separator);
+    String source = "this" + separator + "is" + separator + "the" + separator + "first" + separator + "sentence";
+    bi.setText(source);
+    assertThat(bi.current(), equalTo(0));
+    assertThat(bi.first(), equalTo(0));
+    assertThat(source.substring(bi.current(), bi.next()), equalTo("this" + separator));
+    assertThat(source.substring(bi.current(), bi.next()), equalTo("is" + separator));
+    assertThat(source.substring(bi.current(), bi.next()), equalTo("the" + separator));
+    assertThat(source.substring(bi.current(), bi.next()), equalTo("first" + separator));
+    assertThat(source.substring(bi.current(), bi.next()), equalTo("sentence"));
+    assertThat(bi.next(), equalTo(BreakIterator.DONE));
+
+    assertThat(bi.last(), equalTo(source.length()));
+    int current = bi.current();
+    assertThat(source.substring(bi.previous(), current), equalTo("sentence"));
+    current = bi.current();
+    assertThat(source.substring(bi.previous(), current), equalTo("first" + separator));
+    current = bi.current();
+    assertThat(source.substring(bi.previous(), current), equalTo("the" + separator));
+    current = bi.current();
+    assertThat(source.substring(bi.previous(), current), equalTo("is" + separator));
+    current = bi.current();
+    assertThat(source.substring(bi.previous(), current), equalTo("this" + separator));
+    assertThat(bi.previous(), equalTo(BreakIterator.DONE));
+    assertThat(bi.current(), equalTo(0));
+
+    assertThat(source.substring(0, bi.following(9)), equalTo("this" + separator + "is" + separator + "the" + separator));
+
+    assertThat(source.substring(0, bi.preceding(9)), equalTo("this" + separator + "is" + separator));
+
+    assertThat(bi.first(), equalTo(0));
+    assertThat(source.substring(0, bi.next(3)), equalTo("this" + separator + "is" + separator + "the" + separator));
+  }
+
+  public void testSingleSentences() throws Exception {
+    BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
+    BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
+    assertSameBreaks("a", expected, actual);
+    assertSameBreaks("ab", expected, actual);
+    assertSameBreaks("abc", expected, actual);
+    assertSameBreaks("", expected, actual);
+  }
+
+  public void testSliceEnd() throws Exception {
+    BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
+    BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
+    assertSameBreaks("a000", 0, 1, expected, actual);
+    assertSameBreaks("ab000", 0, 1, expected, actual);
+    assertSameBreaks("abc000", 0, 1, expected, actual);
+    assertSameBreaks("000", 0, 0, expected, actual);
+  }
+
+  public void testSliceStart() throws Exception {
+    BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
+    BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
+    assertSameBreaks("000a", 3, 1, expected, actual);
+    assertSameBreaks("000ab", 3, 2, expected, actual);
+    assertSameBreaks("000abc", 3, 3, expected, actual);
+    assertSameBreaks("000", 3, 0, expected, actual);
+  }
+
+  public void testSliceMiddle() throws Exception {
+    BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
+    BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
+    assertSameBreaks("000a000", 3, 1, expected, actual);
+    assertSameBreaks("000ab000", 3, 2, expected, actual);
+    assertSameBreaks("000abc000", 3, 3, expected, actual);
+    assertSameBreaks("000000", 3, 0, expected, actual);
+  }
+
+  /** the current position must be ignored, initial position is always first() */
+  public void testFirstPosition() throws Exception {
+    BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
+    BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
+    assertSameBreaks("000ab000", 3, 2, 4, expected, actual);
+  }
+
+  private static char randomSeparator() {
+    return RandomPicks.randomFrom(random(), SEPARATORS);
+  }
+}

Modified: lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestWholeBreakIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestWholeBreakIterator.java?rev=1682114&r1=1682113&r2=1682114&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestWholeBreakIterator.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestWholeBreakIterator.java Wed May 27 20:09:37 2015
@@ -17,13 +17,13 @@ package org.apache.lucene.search.posting
  * limitations under the License.
  */
 
+import org.apache.lucene.util.LuceneTestCase;
+
 import java.text.BreakIterator;
 import java.text.CharacterIterator;
 import java.text.StringCharacterIterator;
 import java.util.Locale;
 
-import org.apache.lucene.util.LuceneTestCase;
-
 public class TestWholeBreakIterator extends LuceneTestCase {
   
   /** For single sentences, we know WholeBreakIterator should break the same as a sentence iterator */
@@ -70,18 +70,18 @@ public class TestWholeBreakIterator exte
     assertSameBreaks("000ab000", 3, 2, 4, expected, actual);
   }
 
-  public void assertSameBreaks(String text, BreakIterator expected, BreakIterator actual) {
+  public static void assertSameBreaks(String text, BreakIterator expected, BreakIterator actual) {
     assertSameBreaks(new StringCharacterIterator(text), 
                      new StringCharacterIterator(text), 
                      expected, 
                      actual);
   }
   
-  public void assertSameBreaks(String text, int offset, int length, BreakIterator expected, BreakIterator actual) {
+  public static void assertSameBreaks(String text, int offset, int length, BreakIterator expected, BreakIterator actual) {
     assertSameBreaks(text, offset, length, offset, expected, actual);
   }
   
-  public void assertSameBreaks(String text, int offset, int length, int current, BreakIterator expected, BreakIterator actual) {
+  public static void assertSameBreaks(String text, int offset, int length, int current, BreakIterator expected, BreakIterator actual) {
     assertSameBreaks(new StringCharacterIterator(text, offset, offset+length, current), 
                      new StringCharacterIterator(text, offset, offset+length, current), 
                      expected, 
@@ -89,7 +89,7 @@ public class TestWholeBreakIterator exte
   }
 
   /** Asserts that two breakiterators break the text the same way */
-  public void assertSameBreaks(CharacterIterator one, CharacterIterator two, BreakIterator expected, BreakIterator actual) {
+  public static void assertSameBreaks(CharacterIterator one, CharacterIterator two, BreakIterator expected, BreakIterator actual) {
     expected.setText(one);
     actual.setText(two);