You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/10/23 16:55:26 UTC
svn commit: r1187900 - in /lucene/dev/trunk: lucene/contrib/
modules/analysis/common/src/java/org/apache/lucene/analysis/th/
modules/analysis/common/src/java/org/apache/lucene/analysis/util/
modules/analysis/common/src/test/org/apache/lucene/analysis/util/
Author: rmuir
Date: Sun Oct 23 14:55:25 2011
New Revision: 1187900
URL: http://svn.apache.org/viewvc?rev=1187900&view=rev
Log:
LUCENE-3301: add workaround for jre breakiterator bugs
Added:
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayIterator.java (with props)
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayIterator.java (with props)
Modified:
lucene/dev/trunk/lucene/contrib/CHANGES.txt
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=1187900&r1=1187899&r2=1187900&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Sun Oct 23 14:55:25 2011
@@ -125,6 +125,10 @@ Bug Fixes
to retrieve top groups for any BlockJoinQuery after the first (Mark
Harwood, Mike McCandless)
+ * LUCENE-3301: Added a workaround for buggy BreakIterator implementations in
+ Java that crash on certain inputs containing supplementary characters.
+ (Robert Muir)
+
API Changes
* LUCENE-3436: Add SuggestMode to the spellchecker, so you can specify the strategy
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java?rev=1187900&r1=1187899&r2=1187900&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java Sun Oct 23 14:55:25 2011
@@ -20,7 +20,6 @@ import java.io.IOException;
import java.lang.Character.UnicodeBlock;
import java.text.BreakIterator;
import java.util.Locale;
-import javax.swing.text.Segment;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -28,6 +27,7 @@ import org.apache.lucene.analysis.core.L
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.util.CharArrayIterator;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;
@@ -56,7 +56,7 @@ public final class ThaiWordFilter extend
DBBI_AVAILABLE = proto.isBoundary(4);
}
private final BreakIterator breaker = (BreakIterator) proto.clone();
- private final Segment charIterator = new Segment();
+ private final CharArrayIterator charIterator = CharArrayIterator.newWordInstance();
private final boolean handlePosIncr;
@@ -113,9 +113,7 @@ public final class ThaiWordFilter extend
}
// reinit CharacterIterator
- charIterator.array = clonedTermAtt.buffer();
- charIterator.offset = 0;
- charIterator.count = clonedTermAtt.length();
+ charIterator.setText(clonedTermAtt.buffer(), 0, clonedTermAtt.length());
breaker.setText(charIterator);
int end = breaker.next();
if (end != BreakIterator.DONE) {
Added: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayIterator.java?rev=1187900&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayIterator.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayIterator.java Sun Oct 23 14:55:25 2011
@@ -0,0 +1,175 @@
+package org.apache.lucene.analysis.util;
+
+import java.text.BreakIterator; // javadoc
+import java.text.CharacterIterator;
+import java.util.Locale;
+
+/**
+ * A {@link CharacterIterator} over a region of a char[], used internally to feed text to a {@link BreakIterator}.
+ * @lucene.internal
+ */
+public abstract class CharArrayIterator implements CharacterIterator {
+ private char array[]; // backing buffer, stored by reference (not copied)
+ private int start; // offset of the region within array
+ private int index; // absolute position within array, kept in [start, limit]
+ private int length;
+ private int limit; // start + length, one past the last char of the region
+
+ public char [] getText() {
+ return array;
+ }
+
+ public int getStart() {
+ return start;
+ }
+
+ public int getLength() {
+ return length;
+ }
+
+ /**
+ * Sets a new region of text to be examined by this iterator and resets the position to the region's beginning.
+ *
+ * @param array text buffer to examine (stored by reference, not copied)
+ * @param start offset into the buffer at which the region begins
+ * @param length number of chars in the region; the region ends at start + length
+ */
+ public void setText(final char array[], int start, int length) {
+ this.array = array;
+ this.start = start;
+ this.index = start;
+ this.length = length;
+ this.limit = start + length;
+ }
+
+ public char current() {
+ return (index == limit) ? DONE : jreBugWorkaround(array[index]); // every reported char passes through the workaround hook
+ }
+
+ protected abstract char jreBugWorkaround(char ch); // hook: maps each char before it is returned to the caller
+
+ public char first() {
+ index = start;
+ return current();
+ }
+
+ public int getBeginIndex() {
+ return 0; // indices exposed to callers are relative to start
+ }
+
+ public int getEndIndex() {
+ return length;
+ }
+
+ public int getIndex() {
+ return index - start;
+ }
+
+ public char last() {
+ index = (limit == start) ? limit : limit - 1; // empty region: stay at limit so current() returns DONE
+ return current();
+ }
+
+ public char next() {
+ if (++index >= limit) {
+ index = limit; // clamp at the end of the region
+ return DONE;
+ } else {
+ return current();
+ }
+ }
+
+ public char previous() {
+ if (--index < start) {
+ index = start; // clamp at the beginning of the region
+ return DONE;
+ } else {
+ return current();
+ }
+ }
+
+ public char setIndex(int position) {
+ if (position < getBeginIndex() || position > getEndIndex())
+ throw new IllegalArgumentException("Illegal Position: " + position);
+ index = start + position; // convert the relative position to an absolute one
+ return current();
+ }
+
+ @Override
+ public Object clone() {
+ try {
+ return super.clone();
+ } catch (CloneNotSupportedException e) {
+ // CharacterIterator.clone() declares no checked exception, so rethrow it unchecked
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * Creates a new CharArrayIterator that works around JRE bugs (when {@link #HAS_BUGGY_BREAKITERATORS} is true)
+ * in a manner suitable for {@link BreakIterator#getSentenceInstance()}
+ */
+ public static CharArrayIterator newSentenceInstance() {
+ if (HAS_BUGGY_BREAKITERATORS) {
+ return new CharArrayIterator() {
+ // Work around the bug for now by lying to the sentence tokenizer about
+ // all surrogates: each one (U+D800..U+DFFF) is reported as U+002C (comma),
+ // which is treated as SContinue, so we won't break around them.
+ @Override
+ protected char jreBugWorkaround(char ch) {
+ return ch >= 0xD800 && ch <= 0xDFFF ? 0x002C : ch;
+ }
+ };
+ } else {
+ return new CharArrayIterator() {
+ // no bugs detected: every char is passed through unchanged
+ @Override
+ protected char jreBugWorkaround(char ch) {
+ return ch;
+ }
+ };
+ }
+ }
+
+ /**
+ * Creates a new CharArrayIterator that works around JRE bugs (when {@link #HAS_BUGGY_BREAKITERATORS} is true)
+ * in a manner suitable for {@link BreakIterator#getWordInstance()}
+ */
+ public static CharArrayIterator newWordInstance() {
+ if (HAS_BUGGY_BREAKITERATORS) {
+ return new CharArrayIterator() {
+ // Work around the bug for now by lying to the word tokenizer about all surrogates:
+ // each one (U+D800..U+DFFF) is reported as U+0041 ('A'), i.e. ALetter, so we won't break around them.
+ @Override
+ protected char jreBugWorkaround(char ch) {
+ return ch >= 0xD800 && ch <= 0xDFFF ? 0x0041 : ch;
+ }
+ };
+ } else {
+ return new CharArrayIterator() {
+ // no bugs detected: every char is passed through unchanged
+ @Override
+ protected char jreBugWorkaround(char ch) {
+ return ch;
+ }
+ };
+ }
+ }
+
+ /**
+ * True if this JRE's sentence BreakIterator threw an exception when probed at class-load time with a supplementary character
+ */
+ public static final boolean HAS_BUGGY_BREAKITERATORS;
+ static {
+ boolean v;
+ try {
+ BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
+ bi.setText("\udb40\udc53"); // a surrogate pair, i.e. one supplementary character
+ bi.next();
+ v = false;
+ } catch (Exception e) {
+ v = true; // any exception from the probe marks this JRE as buggy
+ }
+ HAS_BUGGY_BREAKITERATORS = v;
+ }
+}
Added: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayIterator.java?rev=1187900&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayIterator.java (added)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayIterator.java Sun Oct 23 14:55:25 2011
@@ -0,0 +1,162 @@
+package org.apache.lucene.analysis.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.BreakIterator;
+import java.text.CharacterIterator;
+import java.util.Locale;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util._TestUtil;
+
+public class TestCharArrayIterator extends LuceneTestCase {
+
+ public void testWordInstance() { // CharacterIterator contract checks on the word-instance iterator
+ doTests(CharArrayIterator.newWordInstance());
+ }
+
+ public void testConsumeWordInstance() { // runs a word BreakIterator over 10000 random Unicode strings; passes if nothing throws
+ BreakIterator bi = BreakIterator.getWordInstance();
+ CharArrayIterator ci = CharArrayIterator.newWordInstance();
+ for (int i = 0; i < 10000; i++) {
+ char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
+ ci.setText(text, 0, text.length);
+ consume(bi, ci);
+ }
+ }
+
+ /* Enable this to check whether your JRE is buggy (it uses javax.swing.text.Segment, which this file does not import)
+ public void testWordInstanceJREBUG() {
+ BreakIterator bi = BreakIterator.getWordInstance();
+ Segment ci = new Segment();
+ for (int i = 0; i < 10000; i++) {
+ char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
+ ci.array = text;
+ ci.offset = 0;
+ ci.count = text.length;
+ consume(bi, ci);
+ }
+ }
+ */
+
+ public void testSentenceInstance() { // CharacterIterator contract checks on the sentence-instance iterator
+ doTests(CharArrayIterator.newSentenceInstance());
+ }
+
+ public void testConsumeSentenceInstance() { // runs a sentence BreakIterator over 10000 random Unicode strings; passes if nothing throws
+ BreakIterator bi = BreakIterator.getSentenceInstance();
+ CharArrayIterator ci = CharArrayIterator.newSentenceInstance();
+ for (int i = 0; i < 10000; i++) {
+ char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
+ ci.setText(text, 0, text.length);
+ consume(bi, ci);
+ }
+ }
+
+ /* Enable this to check whether your JRE is buggy (it uses javax.swing.text.Segment, which this file does not import)
+ public void testSentenceInstanceJREBUG() {
+ BreakIterator bi = BreakIterator.getSentenceInstance();
+ Segment ci = new Segment();
+ for (int i = 0; i < 10000; i++) {
+ char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
+ ci.array = text;
+ ci.offset = 0;
+ ci.count = text.length;
+ consume(bi, ci);
+ }
+ }
+ */
+
+ private void doTests(CharArrayIterator ci) { // exercises the CharacterIterator contract on the given iterator
+ // basics: index bounds and simple navigation over "testing"
+ ci.setText("testing".toCharArray(), 0, "testing".length());
+ assertEquals(0, ci.getBeginIndex());
+ assertEquals(7, ci.getEndIndex());
+ assertEquals(0, ci.getIndex());
+ assertEquals('t', ci.current());
+ assertEquals('e', ci.next());
+ assertEquals('g', ci.last());
+ assertEquals('n', ci.previous());
+ assertEquals('t', ci.first());
+ assertEquals(CharacterIterator.DONE, ci.previous());
+
+ // first()
+ ci.setText("testing".toCharArray(), 0, "testing".length());
+ ci.next();
+ // Sets the position to getBeginIndex() and returns the character at that position.
+ assertEquals('t', ci.first());
+ assertEquals(ci.getBeginIndex(), ci.getIndex());
+ // or DONE if the text is empty
+ ci.setText(new char[] {}, 0, 0);
+ assertEquals(CharacterIterator.DONE, ci.first());
+
+ // last()
+ ci.setText("testing".toCharArray(), 0, "testing".length());
+ // Sets the position to getEndIndex()-1 (getEndIndex() if the text is empty)
+ // and returns the character at that position.
+ assertEquals('g', ci.last());
+ assertEquals(ci.getIndex(), ci.getEndIndex() - 1); // NOTE(review): arguments are in (actual, expected) order
+ // or DONE if the text is empty
+ ci.setText(new char[] {}, 0, 0);
+ assertEquals(CharacterIterator.DONE, ci.last());
+ assertEquals(ci.getEndIndex(), ci.getIndex());
+
+ // current()
+ // Gets the character at the current position (as returned by getIndex()).
+ ci.setText("testing".toCharArray(), 0, "testing".length());
+ assertEquals('t', ci.current());
+ ci.last();
+ ci.next();
+ // or DONE if the current position is off the end of the text.
+ assertEquals(CharacterIterator.DONE, ci.current());
+
+ // next()
+ ci.setText("te".toCharArray(), 0, 2);
+ // Increments the iterator's index by one and returns the character at the new index.
+ assertEquals('e', ci.next());
+ assertEquals(1, ci.getIndex());
+ // or DONE if the new position is off the end of the text range.
+ assertEquals(CharacterIterator.DONE, ci.next());
+ assertEquals(ci.getEndIndex(), ci.getIndex());
+
+ // setIndex(): a position past getEndIndex() must be rejected
+ ci.setText("test".toCharArray(), 0, "test".length());
+ try {
+ ci.setIndex(5);
+ fail();
+ } catch (Exception e) {
+ assertTrue(e instanceof IllegalArgumentException);
+ }
+
+ // clone(): the copy starts at the same position and then advances independently of the original
+ char text[] = "testing".toCharArray();
+ ci.setText(text, 0, text.length);
+ ci.next();
+ CharArrayIterator ci2 = (CharArrayIterator) ci.clone();
+ assertEquals(ci.getIndex(), ci2.getIndex());
+ assertEquals(ci.next(), ci2.next());
+ assertEquals(ci.last(), ci2.last());
+ }
+
+ private void consume(BreakIterator bi, CharacterIterator ci) { // advances bi over the text of ci until it reports DONE
+ bi.setText(ci);
+ while (bi.next() != BreakIterator.DONE)
+ ;
+ }
+}