You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2009/11/23 21:26:59 UTC
svn commit: r883485 - in /lucene/java/branches/flex_1458/src: java/org/apache/lucene/index/SegmentReader.java java/org/apache/lucene/index/TermRef.java test/org/apache/lucene/index/TestTermEnumSurrogate.java

Author: rmuir
Date: Mon Nov 23 20:26:49 2009
New Revision: 883485

URL: http://svn.apache.org/viewvc?rev=883485&view=rev
Log:
LUCENE-1458: better back compat for unpaired surrogates in TermEnum

Added:
    lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestTermEnumSurrogate.java   (with props)
Modified:
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentReader.java
    lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentReader.java?rev=883485&r1=883484&r2=883485&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentReader.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/SegmentReader.java Mon Nov 23 20:26:49 2009
@@ -1386,7 +1386,18 @@
           if (currentField == t.field) {
             // Field matches -- get terms
             terms = fields.terms();
-            TermRef tr = new TermRef(t.text());
+            String text = t.text();
+            TermRef tr;
+            // this is a hack only for backwards compatibility.
+            // previously you could supply a term ending with a lead surrogate,
+            // and it would return the next Term.
+            // if someone does this, tack on the lowest possible trail surrogate.
+            // this emulates the old behavior, and forms "valid UTF-8" unicode.
+            if (text.length() > 0 
+                && Character.isHighSurrogate(text.charAt(text.length() - 1)))
+              tr = new TermRef(t.text() + "\uDC00");
+            else
+              tr = new TermRef(t.text());
             TermsEnum.SeekStatus status = terms.seek(tr);
             if (status == TermsEnum.SeekStatus.END) {
               // leave currentTerm null

Modified: lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java?rev=883485&r1=883484&r2=883485&view=diff
==============================================================================
--- lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java (original)
+++ lucene/java/branches/flex_1458/src/java/org/apache/lucene/index/TermRef.java Mon Nov 23 20:26:49 2009
@@ -32,12 +32,18 @@
   public TermRef() {
   }
 
+  /**
+   * @param text Well-formed unicode text, with no unpaired surrogates or U+FFFF.
+   */
   public TermRef(String text) {
     copy(text);
   }
 
   // nocommit: we could do this w/ UnicodeUtil w/o requiring
   // allocation of new bytes[]?
+  /**
+   * @param text Well-formed unicode text, with no unpaired surrogates or U+FFFF.
+   */
   public void copy(String text) {
     try {
       bytes = text.getBytes("UTF-8");
@@ -97,7 +103,7 @@
   }
 
   public boolean startsWith(TermRef other) {
-    // nocommit: is this correct?
+    // nocommit: is this correct? Yes this is correct.
     if (length < other.length) {
       return false;
     }

Added: lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestTermEnumSurrogate.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestTermEnumSurrogate.java?rev=883485&view=auto
==============================================================================
--- lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestTermEnumSurrogate.java (added)
+++ lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestTermEnumSurrogate.java Mon Nov 23 20:26:49 2009
@@ -0,0 +1,52 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.KeywordAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Back-compat test that you can seek to a lead surrogate in the term
+ * dictionary. With the old lucene API, this worked, due to the fact that the
+ * Term itself did not need to be converted into proper UTF-8 bytes.
+ * 
+ * With the new API the provided Term text must be encodeable into UTF-8.
+ * 
+ * @deprecated Remove this when the old API is no longer supported.
+ */
+public class TestTermEnumSurrogate extends LuceneTestCase {
+  public void testSeekSurrogate() throws Exception {
+    RAMDirectory dir = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(),
+        IndexWriter.MaxFieldLength.UNLIMITED);
+    Document d = new Document();
+    Field f = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED);
+    d.add(f);
+    f.setValue("abacadaba");
+    writer.addDocument(d);
+    f.setValue("𩬅");
+    writer.addDocument(d);
+    writer.close();
+    IndexReader reader = IndexReader.open(dir, true);
+    TermEnum te = reader.terms(new Term("field", "𩬅".substring(0, 1)));
+    assertEquals(new Term("field", "𩬅"), te.term());
+  }
+}

Propchange: lucene/java/branches/flex_1458/src/test/org/apache/lucene/index/TestTermEnumSurrogate.java
------------------------------------------------------------------------------
    svn:eol-style = native