You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by yo...@apache.org on 2006/11/19 02:34:11 UTC

svn commit: r476679 - in /lucene/java/trunk: CHANGES.txt src/java/org/apache/lucene/queryParser/QueryParser.java src/java/org/apache/lucene/queryParser/QueryParser.jj src/test/org/apache/lucene/queryParser/TestQueryParser.java

Author: yonik
Date: Sat Nov 18 17:34:10 2006
New Revision: 476679

URL: http://svn.apache.org/viewvc?view=rev&rev=476679
Log:
unicode escapes for QueryParser: LUCENE-716

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/queryParser/QueryParser.java
    lucene/java/trunk/src/java/org/apache/lucene/queryParser/QueryParser.jj
    lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestQueryParser.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?view=diff&rev=476679&r1=476678&r2=476679
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Sat Nov 18 17:34:10 2006
@@ -58,6 +58,10 @@
  7. LUCENE-573: QueryParser now allows backslash escaping in
     quoted terms and phrases. (Michael Busch via Yonik Seeley)
 
+ 8. LUCENE-716: QueryParser now allows specification of unicode
+    characters in terms via a unicode escape of the form \uXXXX
+    (Michael Busch via Yonik Seeley)
+
 API Changes
 
  1. LUCENE-438: Remove "final" from Token, implement Cloneable, allow

Modified: lucene/java/trunk/src/java/org/apache/lucene/queryParser/QueryParser.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/queryParser/QueryParser.java?view=diff&rev=476679&r1=476678&r2=476679
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/queryParser/QueryParser.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/queryParser/QueryParser.java Sat Nov 18 17:34:10 2006
@@ -621,6 +621,9 @@
    * Returns a String where the escape char has been
    * removed, or kept only once if there was a double escape.
    * 
+   * Supports escaped unicode characters, e. g. translates
+   * <code>\u0041</code> to <code>A</code>.
+   * 
    */
   private String discardEscapeChar(String input) throws ParseException {
     // Create char array to hold unescaped char sequence
@@ -635,12 +638,31 @@
     // an escape character
     boolean lastCharWasEscapeChar = false;
 
+    // The multiplier the current unicode digit must be multiplied with.
+    // E. g. the first digit must be multiplied with 16^3, the second with 16^2...
+    int codePointMultiplier = 0;
+
+    // Used to calculate the codepoint of the escaped unicode character
+    int codePoint = 0;
+
     for (int i = 0; i < input.length(); i++) {
       char curChar = input.charAt(i);
-      if (lastCharWasEscapeChar) {
-        // this character was escaped
-        output[length] = curChar;
-        length++;
+      if (codePointMultiplier > 0) {
+        codePoint += hexToInt(curChar) * codePointMultiplier;
+        codePointMultiplier >>>= 4;
+        if (codePointMultiplier == 0) {
+          length += Character.toChars(codePoint, output, length);
+          codePoint = 0;
+        }
+      } else if (lastCharWasEscapeChar) {
+        if (curChar == 'u') {
+          // found an escaped unicode character
+          codePointMultiplier = 16 * 16 * 16;
+        } else {
+          // this character was escaped
+          output[length] = curChar;
+          length++;
+        }
         lastCharWasEscapeChar = false;
       } else {
         if (curChar == '\\') {
@@ -652,12 +674,30 @@
       }
     }
 
+    if (codePointMultiplier > 0) {
+      throw new ParseException("Truncated unicode escape sequence.");
+    }
+
     if (lastCharWasEscapeChar) {
       throw new ParseException("Term can not end with escape character.");
     }
 
     return new String(output, 0, length);
   }
+
+  /** Returns the numeric value of the hexadecimal character */
+  private static final int hexToInt(char c) throws ParseException {
+    if ('0' <= c && c <= '9') {
+      return c - '0';
+    } else if ('a' <= c && c <= 'f'){
+      return c - 'a' + 10;
+    } else if ('A' <= c && c <= 'F') {
+      return c - 'A' + 10;
+    } else {
+      throw new ParseException("None-hex character in unicode escape sequence: " + c);
+    }
+  }
+
   /**
    * Returns a String where those characters that QueryParser
    * expects to be escaped are escaped by a preceding <code>\</code>.

Modified: lucene/java/trunk/src/java/org/apache/lucene/queryParser/QueryParser.jj
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/queryParser/QueryParser.jj?view=diff&rev=476679&r1=476678&r2=476679
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/queryParser/QueryParser.jj (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/queryParser/QueryParser.jj Sat Nov 18 17:34:10 2006
@@ -644,6 +644,10 @@
   /**
    * Returns a String where the escape char has been
    * removed, or kept only once if there was a double escape.
+   * 
+   * Supports escaped unicode characters, e. g. translates
+   * <code>\u0041</code> to <code>A</code>.
+   * 
    */
   private String discardEscapeChar(String input) throws ParseException {
     // Create char array to hold unescaped char sequence
@@ -658,12 +662,31 @@
     // an escape character
     boolean lastCharWasEscapeChar = false;
       
+    // The multiplier the current unicode digit must be multiplied with.
+    // E. g. the first digit must be multiplied with 16^3, the second with 16^2...
+    int codePointMultiplier = 0;
+      
+    // Used to calculate the codepoint of the escaped unicode character
+    int codePoint = 0;
+      
     for (int i = 0; i < input.length(); i++) {
       char curChar = input.charAt(i);
-      if (lastCharWasEscapeChar) {
-        // this character was escaped
-        output[length] = curChar;    
-        length++;
+      if (codePointMultiplier > 0) {
+        codePoint += hexToInt(curChar) * codePointMultiplier;
+        codePointMultiplier >>>= 4;
+        if (codePointMultiplier == 0) {
+          length += Character.toChars(codePoint, output, length);
+          codePoint = 0;
+        }
+      } else if (lastCharWasEscapeChar) {
+        if (curChar == 'u') {
+          // found an escaped unicode character
+          codePointMultiplier = 16 * 16 * 16;
+        } else { 
+          // this character was escaped
+          output[length] = curChar;    
+          length++;
+        }
         lastCharWasEscapeChar = false;
       } else {
         if (curChar == '\\') {
@@ -675,12 +698,30 @@
       }
     }
       
+    if (codePointMultiplier > 0) {
+      throw new ParseException("Truncated unicode escape sequence.");
+    }
+    
     if (lastCharWasEscapeChar) {
       throw new ParseException("Term can not end with escape character.");
     }
       
     return new String(output, 0, length);
   }
+  
+  /** Returns the numeric value of the hexadecimal character */
+  private static final int hexToInt(char c) throws ParseException {
+    if ('0' <= c && c <= '9') {
+      return c - '0';
+    } else if ('a' <= c && c <= 'f'){
+      return c - 'a' + 10;
+    } else if ('A' <= c && c <= 'F') {
+      return c - 'A' + 10;
+    } else {
+      throw new ParseException("None-hex character in unicode escape sequence: " + c);
+    }
+  }
+  
   /**
    * Returns a String where those characters that QueryParser
    * expects to be escaped are escaped by a preceding <code>\</code>.

Modified: lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestQueryParser.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestQueryParser.java?view=diff&rev=476679&r1=476678&r2=476679
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestQueryParser.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/queryParser/TestQueryParser.java Sat Nov 18 17:34:10 2006
@@ -486,6 +486,28 @@
     assertQueryEquals("\"a \\+b c d\"", a, "\"a +b c d\"");
     
     assertQueryEquals("c\\:\\\\temp\\\\\\~foo.txt", a, "c:\\temp\\~foo.txt");
+    
+
+    try {
+        assertQueryEquals("XY\\", a, "XYZ");
+        fail("ParseException expected, not thrown");
+    } catch (ParseException expected) {}
+    
+    // test unicode escaping
+    assertQueryEquals("a\\u0062c", a, "abc");
+    assertQueryEquals("XY\\u005a", a, "XYZ");
+    assertQueryEquals("XY\\u005A", a, "XYZ");
+    assertQueryEquals("\"a \\\\\\u0028\\u0062\\\" c\"", a, "\"a \\(b\" c\"");
+    
+    try {
+        assertQueryEquals("XY\\u005G", a, "XYZ");
+        fail("ParseException expected, not thrown");
+    } catch (ParseException expected) {}
+
+    try {
+        assertQueryEquals("XY\\u005", a, "XYZ");
+        fail("ParseException expected, not thrown");
+    } catch (ParseException expected) {}
   }
 
   public void testQueryStringEscaping() throws Exception {