You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/03/21 15:30:35 UTC
svn commit: r1303407 - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/contrib/facet/ lucene/core/src/java/org/apache/lucene/util/CharacterUtils.java solr/ solr/core/
Author: rmuir
Date: Wed Mar 21 14:30:34 2012
New Revision: 1303407
URL: http://svn.apache.org/viewvc?rev=1303407&view=rev
Log:
LUCENE-3896: CharacterUtils.fill must call Reader.read again if it only got a single high surrogate char on the first read
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/lucene/ (props changed)
lucene/dev/branches/branch_3x/lucene/contrib/facet/ (props changed)
lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/util/CharacterUtils.java
lucene/dev/branches/branch_3x/solr/ (props changed)
lucene/dev/branches/branch_3x/solr/core/ (props changed)
Modified: lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/util/CharacterUtils.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/util/CharacterUtils.java?rev=1303407&r1=1303406&r2=1303407&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/util/CharacterUtils.java (original)
+++ lucene/dev/branches/branch_3x/lucene/core/src/java/org/apache/lucene/util/CharacterUtils.java Wed Mar 21 14:30:34 2012
@@ -1,8 +1,5 @@
package org.apache.lucene.util;
-import java.io.IOException;
-import java.io.Reader;
-
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -20,6 +17,11 @@ import java.io.Reader;
* limitations under the License.
*/
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.util.Version;
+
/**
* {@link CharacterUtils} provides a unified interface to Character-related
* operations to implement backwards compatible character operations based on a
@@ -119,8 +121,9 @@ public abstract class CharacterUtils {
* @return a new {@link CharacterBuffer} instance.
*/
public static CharacterBuffer newCharacterBuffer(final int bufferSize) {
- if(bufferSize < 2)
+ if (bufferSize < 2) {
throw new IllegalArgumentException("buffersize must be >= 2");
+ }
return new CharacterBuffer(new char[bufferSize], 0, 0);
}
@@ -157,7 +160,7 @@ public abstract class CharacterUtils {
}
@Override
- public final int codePointAt(final char[] chars, final int offset) {
+ public int codePointAt(final char[] chars, final int offset) {
return Character.codePointAt(chars, offset);
}
@@ -175,21 +178,51 @@ public abstract class CharacterUtils {
public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException {
final char[] charBuffer = buffer.buffer;
buffer.offset = 0;
- charBuffer[0] = buffer.lastTrailingHighSurrogate;
- final int offset = buffer.lastTrailingHighSurrogate == 0 ? 0 : 1;
- buffer.lastTrailingHighSurrogate = 0;
- final int read = reader.read(charBuffer, offset, charBuffer.length
- - offset);
+ final int offset;
+
+ // Install the previously saved ending high surrogate:
+ if (buffer.lastTrailingHighSurrogate != 0) {
+ charBuffer[0] = buffer.lastTrailingHighSurrogate;
+ offset = 1;
+ } else {
+ offset = 0;
+ }
+
+ final int read = reader.read(charBuffer,
+ offset,
+ charBuffer.length - offset);
if (read == -1) {
buffer.length = offset;
+ buffer.lastTrailingHighSurrogate = 0;
return offset != 0;
}
+ assert read > 0;
buffer.length = read + offset;
- // special case if the read returns 0 and the lastTrailingHighSurrogate was set
+
+ // If we read only a single char, and that char was a
+ // high surrogate, read again:
+ if (buffer.length == 1
+ && Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
+ final int read2 = reader.read(charBuffer,
+ 1,
+ charBuffer.length - 1);
+ if (read2 == -1) {
+ // NOTE: mal-formed input (ended on a high
+ // surrogate)! Consumer must deal with it...
+ return true;
+ }
+ assert read2 > 0;
+
+ buffer.length += read2;
+ }
+
if (buffer.length > 1
&& Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
+ } else {
+ buffer.lastTrailingHighSurrogate = 0;
}
+
return true;
}
}
@@ -199,7 +232,7 @@ public abstract class CharacterUtils {
}
@Override
- public final int codePointAt(final char[] chars, final int offset) {
+ public int codePointAt(final char[] chars, final int offset) {
return chars[offset];
}
@@ -236,7 +269,9 @@ public abstract class CharacterUtils {
private final char[] buffer;
private int offset;
private int length;
- private char lastTrailingHighSurrogate = 0;
+ // NOTE: not private so outer class can access without
+ // $access methods:
+ char lastTrailingHighSurrogate;
CharacterBuffer(char[] buffer, int offset, int length) {
this.buffer = buffer;