You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2009/11/23 21:26:59 UTC
svn commit: r883485 - in /lucene/java/branches/flex_1458/src:
java/org/apache/lucene/index/SegmentReader.java
java/org/apache/lucene/index/TermRef.java
test/org/apache/lucene/index/TestTermEnumSurrogate.java
Author: rmuir
Date: Mon Nov 23 20:26:49 2009
New Revision: 883485
URL: http://svn.apache.org/viewvc?rev=883485&view=rev
Log:
LUCENE-1458: better back compat for unpaired surrogates in TermEnum
Added:
lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestTermEnumSurrogate.java (with props)
Modified:
lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentReader.java
lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentReader.java?rev=883485&r1=883484&r2=883485&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentReader.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentReader.java Mon Nov 23 20:26:49 2009
@@ -1386,7 +1386,18 @@
if (currentField == t.field) {
// Field matches -- get terms
terms = fields.terms();
- TermRef tr = new TermRef(t.text());
+ String text = t.text();
+ TermRef tr;
+ // this is a hack only for backwards compatibility.
+ // previously you could supply a term ending with a lead surrogate,
+ // and it would return the next Term.
+ // if someone does this, tack on the lowest possible trail surrogate.
+ // this emulates the old behavior, and forms "valid UTF-8" unicode.
+ if (text.length() > 0
+ && Character.isHighSurrogate(text.charAt(text.length() - 1)))
+ tr = new TermRef(t.text() + "\uDC00");
+ else
+ tr = new TermRef(t.text());
TermsEnum.SeekStatus status = terms.seek(tr);
if (status == TermsEnum.SeekStatus.END) {
// leave currentTerm null
Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java?rev=883485&r1=883484&r2=883485&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java Mon Nov 23 20:26:49 2009
@@ -32,12 +32,18 @@
public TermRef() {
}
+ /**
+ * @param text Well-formed unicode text, with no unpaired surrogates or U+FFFF.
+ */
public TermRef(String text) {
copy(text);
}
// nocommit: we could do this w/ UnicodeUtil w/o requiring
// allocation of new bytes[]?
+ /**
+ * @param text Well-formed unicode text, with no unpaired surrogates or U+FFFF.
+ */
public void copy(String text) {
try {
bytes = text.getBytes("UTF-8");
@@ -97,7 +103,7 @@
}
public boolean startsWith(TermRef other) {
- // nocommit: is this correct?
+ // nocommit: is this correct? Yes this is correct.
if (length < other.length) {
return false;
}
Added: lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestTermEnumSurrogate.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestTermEnumSurrogate.java?rev=883485&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestTermEnumSurrogate.java (added)
+++ lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestTermEnumSurrogate.java Mon Nov 23 20:26:49 2009
@@ -0,0 +1,52 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.KeywordAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Back-compat test that you can seek to a lead surrogate in the term
+ * dictionary. With the old lucene API, this worked, due to the fact that the
+ * Term itself did not need to be converted into proper UTF-8 bytes.
+ *
+ * With the new API the provided Term text must be encodeable into UTF-8.
+ *
+ * @deprecated Remove this when the old API is no longer supported.
+ */
+public class TestTermEnumSurrogate extends LuceneTestCase {
+ public void testSeekSurrogate() throws Exception {
+ RAMDirectory dir = new RAMDirectory();
+ IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(),
+ IndexWriter.MaxFieldLength.UNLIMITED);
+ Document d = new Document();
+ Field f = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED);
+ d.add(f);
+ f.setValue("abacadaba");
+ writer.addDocument(d);
+ f.setValue("𩬅");
+ writer.addDocument(d);
+ writer.close();
+ IndexReader reader = IndexReader.open(dir, true);
+ TermEnum te = reader.terms(new Term("field", "𩬅".substring(0, 1)));
+ assertEquals(new Term("field", "𩬅"), te.term());
+ }
+}
Propchange: lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestTermEnumSurrogate.java
------------------------------------------------------------------------------
svn:eol-style = native