You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2015/05/27 22:09:37 UTC
svn commit: r1682114 - in /lucene/dev/trunk/lucene: ./
highlighter/src/java/org/apache/lucene/search/postingshighlight/
highlighter/src/test/org/apache/lucene/search/postingshighlight/
Author: rmuir
Date: Wed May 27 20:09:37 2015
New Revision: 1682114
URL: http://svn.apache.org/r1682114
Log:
LUCENE-6485: Add CustomSeparatorBreakIterator
Added:
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/CustomSeparatorBreakIterator.java (with props)
lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestCustomSeparatorBreakIterator.java (with props)
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestWholeBreakIterator.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1682114&r1=1682113&r2=1682114&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Wed May 27 20:09:37 2015
@@ -34,6 +34,13 @@ API Changes
======================= Lucene 5.3.0 =======================
+New Features
+
+* LUCENE-6485: Add CustomSeparatorBreakIterator to the postings
+ highlighter, which splits on a caller-supplied separator character.
+ For example, it can be used with getMultiValueSeparator to render
+ whole field values. (Luca Cavanna via Robert Muir)
+
Bug fixes
* LUCENE-6500: ParallelCompositeReader did not always call
Added: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/CustomSeparatorBreakIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/CustomSeparatorBreakIterator.java?rev=1682114&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/CustomSeparatorBreakIterator.java (added)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/CustomSeparatorBreakIterator.java Wed May 27 20:09:37 2015
@@ -0,0 +1,151 @@
+package org.apache.lucene.search.postingshighlight;
+
+import java.text.BreakIterator;
+import java.text.CharacterIterator;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
/**
 * A {@link BreakIterator} that breaks the text whenever a certain separator, provided as a
 * constructor argument, is found.
 *
 * <p>The boundaries reported are the begin index of the text, the end index of the text, and
 * every index immediately following an occurrence of the separator. Each segment therefore
 * includes its trailing separator; a separator that is the last character of the text yields
 * the end index only once.
 *
 * <p>Instances hold mutable position state without any synchronization. The index of the
 * {@link CharacterIterator} handed to {@link #setText(CharacterIterator)} is moved by this class
 * and is kept equal to {@link #current()} after every call.
 */
public final class CustomSeparatorBreakIterator extends BreakIterator {

  /** Character after which a boundary is reported. */
  private final char separator;

  /** Text being scanned; starts out empty so the iterator is usable before {@code setText}. */
  private CharacterIterator text = new java.text.StringCharacterIterator("");

  /** Most recently reported boundary. */
  private int current;

  /**
   * Creates an iterator that reports a boundary after every occurrence of {@code separator}.
   *
   * @param separator the character that terminates a segment
   */
  public CustomSeparatorBreakIterator(char separator) {
    this.separator = separator;
  }

  /** Returns the boundary most recently returned by a positioning method. */
  @Override
  public int current() {
    return current;
  }

  /** Moves to the begin index of the text and returns it. */
  @Override
  public int first() {
    return moveTo(text.getBeginIndex());
  }

  /** Moves to the end index of the text and returns it. */
  @Override
  public int last() {
    return moveTo(text.getEndIndex());
  }

  /**
   * Advances to the boundary after the current one.
   *
   * @return the new boundary, or {@link #DONE} if the current boundary is already the end index
   */
  @Override
  public int next() {
    if (current == text.getEndIndex()) {
      return DONE;
    }
    return advanceForward(current);
  }

  /**
   * Scans forward from {@code from} (inclusive) and stops right after the first separator, or at
   * the end index when there is none. The character at {@code from} itself is examined, so a
   * separator sitting exactly there produces the boundary {@code from + 1}.
   */
  private int advanceForward(int from) {
    final int end = text.getEndIndex();
    text.setIndex(from);
    // Loop on the index rather than on the DONE character so that a literal U+FFFF in the
    // text cannot terminate the scan early.
    while (text.getIndex() < end) {
      final char c = text.current();
      text.next();
      if (c == separator) {
        break;
      }
    }
    return current = text.getIndex();
  }

  /**
   * Moves to the first boundary strictly after {@code pos}.
   *
   * @param pos an offset between the begin and end index of the text, inclusive
   * @return the boundary found, or {@link #DONE} when {@code pos} is the end index
   * @throws IllegalArgumentException if {@code pos} lies outside the text
   */
  @Override
  public int following(int pos) {
    if (pos < text.getBeginIndex() || pos > text.getEndIndex()) {
      throw new IllegalArgumentException("offset out of bounds");
    } else if (pos == text.getEndIndex()) {
      // this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
      // http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=9000909
      moveTo(pos);
      return DONE;
    } else {
      return advanceForward(pos);
    }
  }

  /**
   * Moves back to the boundary before the current one.
   *
   * @return the new boundary, or {@link #DONE} if the current boundary is already the begin index
   */
  @Override
  public int previous() {
    if (current == text.getBeginIndex()) {
      return DONE;
    }
    return advanceBackward(current);
  }

  /**
   * Scans backward from {@code from} (which must be greater than the begin index) and stops at
   * the closest boundary strictly before {@code from}: either right after the nearest separator
   * located before index {@code from - 1}, or the begin index when there is none.
   */
  private int advanceBackward(int from) {
    final int begin = text.getBeginIndex();
    text.setIndex(from);
    // The character at from - 1 can only define the boundary at `from` itself, so step over it
    // without testing it; otherwise the scan would hand back `from` again.
    text.previous();
    while (text.getIndex() > begin) {
      if (text.previous() == separator) {
        // the boundary sits right after the separator
        text.next();
        break;
      }
    }
    return current = text.getIndex();
  }

  /**
   * Moves to the last boundary strictly before {@code pos}.
   *
   * @param pos an offset between the begin and end index of the text, inclusive
   * @return the boundary found, or {@link #DONE} when {@code pos} is the begin index
   * @throws IllegalArgumentException if {@code pos} lies outside the text
   */
  @Override
  public int preceding(int pos) {
    if (pos < text.getBeginIndex() || pos > text.getEndIndex()) {
      throw new IllegalArgumentException("offset out of bounds");
    } else if (pos == text.getBeginIndex()) {
      // this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
      // http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=9000909
      moveTo(pos);
      return DONE;
    } else {
      return advanceBackward(pos);
    }
  }

  /**
   * Moves {@code n} boundaries forward (positive {@code n}) or backward (negative {@code n}).
   *
   * @param n number of boundaries to move; {@code 0} leaves the position untouched
   * @return the boundary reached, or {@link #DONE} if the first or last boundary was hit before
   *     {@code n} steps could be taken, in which case the position stays on that boundary
   */
  @Override
  public int next(int n) {
    int result = current;
    // Counting toward zero (instead of negating n) stays correct for Integer.MIN_VALUE, and
    // stopping at DONE avoids spinning through the remaining steps.
    for (int i = n; i > 0 && result != DONE; i--) {
      result = next();
    }
    for (int i = n; i < 0 && result != DONE; i++) {
      result = previous();
    }
    return result;
  }

  /** Returns the text being scanned; its index is kept equal to {@link #current()}. */
  @Override
  public CharacterIterator getText() {
    return text;
  }

  /**
   * Replaces the text to scan and positions this iterator on its begin index. The supplied
   * iterator's own index is reset to its begin index as well; whatever position it carried
   * is ignored.
   *
   * @param newText the text to scan
   * @throws NullPointerException if {@code newText} is null; the previous text is retained
   */
  @Override
  public void setText(CharacterIterator newText) {
    if (newText == null) {
      throw new NullPointerException("newText must not be null");
    }
    text = newText;
    moveTo(text.getBeginIndex());
  }

  /** Positions both the underlying iterator and {@code current} on {@code index}. */
  private int moveTo(int index) {
    text.setIndex(index);
    return current = index;
  }
}
Added: lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestCustomSeparatorBreakIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestCustomSeparatorBreakIterator.java?rev=1682114&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestCustomSeparatorBreakIterator.java (added)
+++ lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestCustomSeparatorBreakIterator.java Wed May 27 20:09:37 2015
@@ -0,0 +1,115 @@
+package org.apache.lucene.search.postingshighlight;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
+import org.apache.lucene.util.LuceneTestCase;
+
+import java.text.BreakIterator;
+import java.util.Locale;
+
+import static org.apache.lucene.search.postingshighlight.TestWholeBreakIterator.assertSameBreaks;
+import static org.hamcrest.CoreMatchers.equalTo;
+
+public class TestCustomSeparatorBreakIterator extends LuceneTestCase {
+
+ private static final Character[] SEPARATORS = new Character[]{' ', '\u0000', 8233};
+
+ public void testBreakOnCustomSeparator() throws Exception {
+ Character separator = randomSeparator();
+ BreakIterator bi = new CustomSeparatorBreakIterator(separator);
+ String source = "this" + separator + "is" + separator + "the" + separator + "first" + separator + "sentence";
+ bi.setText(source);
+ assertThat(bi.current(), equalTo(0));
+ assertThat(bi.first(), equalTo(0));
+ assertThat(source.substring(bi.current(), bi.next()), equalTo("this" + separator));
+ assertThat(source.substring(bi.current(), bi.next()), equalTo("is" + separator));
+ assertThat(source.substring(bi.current(), bi.next()), equalTo("the" + separator));
+ assertThat(source.substring(bi.current(), bi.next()), equalTo("first" + separator));
+ assertThat(source.substring(bi.current(), bi.next()), equalTo("sentence"));
+ assertThat(bi.next(), equalTo(BreakIterator.DONE));
+
+ assertThat(bi.last(), equalTo(source.length()));
+ int current = bi.current();
+ assertThat(source.substring(bi.previous(), current), equalTo("sentence"));
+ current = bi.current();
+ assertThat(source.substring(bi.previous(), current), equalTo("first" + separator));
+ current = bi.current();
+ assertThat(source.substring(bi.previous(), current), equalTo("the" + separator));
+ current = bi.current();
+ assertThat(source.substring(bi.previous(), current), equalTo("is" + separator));
+ current = bi.current();
+ assertThat(source.substring(bi.previous(), current), equalTo("this" + separator));
+ assertThat(bi.previous(), equalTo(BreakIterator.DONE));
+ assertThat(bi.current(), equalTo(0));
+
+ assertThat(source.substring(0, bi.following(9)), equalTo("this" + separator + "is" + separator + "the" + separator));
+
+ assertThat(source.substring(0, bi.preceding(9)), equalTo("this" + separator + "is" + separator));
+
+ assertThat(bi.first(), equalTo(0));
+ assertThat(source.substring(0, bi.next(3)), equalTo("this" + separator + "is" + separator + "the" + separator));
+ }
+
+ public void testSingleSentences() throws Exception {
+ BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
+ BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
+ assertSameBreaks("a", expected, actual);
+ assertSameBreaks("ab", expected, actual);
+ assertSameBreaks("abc", expected, actual);
+ assertSameBreaks("", expected, actual);
+ }
+
+ public void testSliceEnd() throws Exception {
+ BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
+ BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
+ assertSameBreaks("a000", 0, 1, expected, actual);
+ assertSameBreaks("ab000", 0, 1, expected, actual);
+ assertSameBreaks("abc000", 0, 1, expected, actual);
+ assertSameBreaks("000", 0, 0, expected, actual);
+ }
+
+ public void testSliceStart() throws Exception {
+ BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
+ BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
+ assertSameBreaks("000a", 3, 1, expected, actual);
+ assertSameBreaks("000ab", 3, 2, expected, actual);
+ assertSameBreaks("000abc", 3, 3, expected, actual);
+ assertSameBreaks("000", 3, 0, expected, actual);
+ }
+
+ public void testSliceMiddle() throws Exception {
+ BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
+ BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
+ assertSameBreaks("000a000", 3, 1, expected, actual);
+ assertSameBreaks("000ab000", 3, 2, expected, actual);
+ assertSameBreaks("000abc000", 3, 3, expected, actual);
+ assertSameBreaks("000000", 3, 0, expected, actual);
+ }
+
+ /** the current position must be ignored, initial position is always first() */
+ public void testFirstPosition() throws Exception {
+ BreakIterator expected = BreakIterator.getSentenceInstance(Locale.ROOT);
+ BreakIterator actual = new CustomSeparatorBreakIterator(randomSeparator());
+ assertSameBreaks("000ab000", 3, 2, 4, expected, actual);
+ }
+
+ private static char randomSeparator() {
+ return RandomPicks.randomFrom(random(), SEPARATORS);
+ }
+}
Modified: lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestWholeBreakIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestWholeBreakIterator.java?rev=1682114&r1=1682113&r2=1682114&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestWholeBreakIterator.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestWholeBreakIterator.java Wed May 27 20:09:37 2015
@@ -17,13 +17,13 @@ package org.apache.lucene.search.posting
* limitations under the License.
*/
+import org.apache.lucene.util.LuceneTestCase;
+
import java.text.BreakIterator;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.Locale;
-import org.apache.lucene.util.LuceneTestCase;
-
public class TestWholeBreakIterator extends LuceneTestCase {
/** For single sentences, we know WholeBreakIterator should break the same as a sentence iterator */
@@ -70,18 +70,18 @@ public class TestWholeBreakIterator exte
assertSameBreaks("000ab000", 3, 2, 4, expected, actual);
}
- public void assertSameBreaks(String text, BreakIterator expected, BreakIterator actual) {
+ public static void assertSameBreaks(String text, BreakIterator expected, BreakIterator actual) {
assertSameBreaks(new StringCharacterIterator(text),
new StringCharacterIterator(text),
expected,
actual);
}
- public void assertSameBreaks(String text, int offset, int length, BreakIterator expected, BreakIterator actual) {
+ public static void assertSameBreaks(String text, int offset, int length, BreakIterator expected, BreakIterator actual) {
assertSameBreaks(text, offset, length, offset, expected, actual);
}
- public void assertSameBreaks(String text, int offset, int length, int current, BreakIterator expected, BreakIterator actual) {
+ public static void assertSameBreaks(String text, int offset, int length, int current, BreakIterator expected, BreakIterator actual) {
assertSameBreaks(new StringCharacterIterator(text, offset, offset+length, current),
new StringCharacterIterator(text, offset, offset+length, current),
expected,
@@ -89,7 +89,7 @@ public class TestWholeBreakIterator exte
}
/** Asserts that two breakiterators break the text the same way */
- public void assertSameBreaks(CharacterIterator one, CharacterIterator two, BreakIterator expected, BreakIterator actual) {
+ public static void assertSameBreaks(CharacterIterator one, CharacterIterator two, BreakIterator expected, BreakIterator actual) {
expected.setText(one);
actual.setText(two);