You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@shindig.apache.org by jo...@apache.org on 2009/11/24 07:56:49 UTC

svn commit: r883607 - in /incubator/shindig/trunk/java/gadgets/src: main/java/org/apache/shindig/gadgets/encoding/ main/java/org/apache/shindig/gadgets/http/ test/java/org/apache/shindig/gadgets/encoding/

Author: johnh
Date: Tue Nov 24 06:56:47 2009
New Revision: 883607

URL: http://svn.apache.org/viewvc?rev=883607&view=rev
Log:
Refactoring to allow injection of a better (or simply profiled) encoding detector than the ICU
or simple UTF8 detection.

Thanks to Ziv Horesh for the patch!


Modified:
    incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java
    incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java
    incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java

Modified: incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java?rev=883607&r1=883606&r2=883607&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java Tue Nov 24 06:56:47 2009
@@ -20,6 +20,7 @@
 
 import java.nio.charset.Charset;
 
+import com.google.inject.Inject;
 import com.ibm.icu.text.CharsetDetector;
 import com.ibm.icu.text.CharsetMatch;
 
@@ -32,6 +33,17 @@
   private static final Charset UTF_8 = Charset.forName("UTF-8");
   private static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
 
+ 
+  public static class FallbackEncodingDetector {
+    public Charset detectEncoding(byte[] input) {
+      // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
+      CharsetDetector detector = new CharsetDetector();
+      detector.setText(input);
+      CharsetMatch match = detector.detect();
+      return Charset.forName(match.getName().toUpperCase());
+    }
+  }
+
   /**
    * Returns the detected encoding of the given byte array.
    *
@@ -40,9 +52,12 @@
    *     encoding for HTTP) if the bytes are not valid UTF-8. Only recommended if you can reasonably
    *     expect that other encodings are going to be specified. Full encoding detection is very
    *     expensive!
+   * @param alternateDecoder specify a fallback encoding detection. 
+   *     Only used if assume88591IfNotUtf8 is false.
    * @return The detected encoding.
    */
-  public static Charset detectEncoding(byte[] input, boolean assume88591IfNotUtf8) {
+  public static Charset detectEncoding(byte[] input, boolean assume88591IfNotUtf8,
+      FallbackEncodingDetector alternateDecoder) {
     if (looksLikeValidUtf8(input)) {
       return UTF_8;
     }
@@ -51,11 +66,8 @@
       return ISO_8859_1;
     }
 
-    // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
-    CharsetDetector detector = new CharsetDetector();
-    detector.setText(input);
-    CharsetMatch match = detector.detect();
-    return Charset.forName(match.getName().toUpperCase());
+    // Fall back encoding:
+    return alternateDecoder.detectEncoding(input);
   }
 
   /**

Modified: incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java?rev=883607&r1=883606&r2=883607&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java Tue Nov 24 06:56:47 2009
@@ -134,6 +134,11 @@
   @Inject(optional = true) @Named("shindig.http.fast-encoding-detection")
   private static boolean fastEncodingDetection = true;
 
+  // Support injection of smarter encoding detection
+  @Inject(optional = true)
+  private static EncodingDetector.FallbackEncodingDetector customEncodingDetector =
+      new EncodingDetector.FallbackEncodingDetector();
+  
   // Holds character sets for fast conversion
   private static final Map<String, Charset> encodingToCharset = new MapMaker().makeMap();
 
@@ -472,7 +477,8 @@
           }
         }
       }
-      Charset encoding = EncodingDetector.detectEncoding(body, fastEncodingDetection);
+      Charset encoding = EncodingDetector.detectEncoding(body, fastEncodingDetection,
+          customEncodingDetector);
       // Record the charset in the content-type header so that its value can be cached
       // and re-used. This is a BIG performance win.
       values.clear();

Modified: incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java?rev=883607&r1=883606&r2=883607&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java Tue Nov 24 06:56:47 2009
@@ -18,42 +18,76 @@
  */
 package org.apache.shindig.gadgets.encoding;
 
+import static org.easymock.classextension.EasyMock.expect;
+import static org.easymock.classextension.EasyMock.replay;
+import static org.easymock.classextension.EasyMock.verify;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
 
+import org.easymock.classextension.EasyMock;
 import org.junit.Test;
 
+import java.nio.charset.Charset;
+
 public class EncodingDetectorTest {
 
+  private EncodingDetector.FallbackEncodingDetector newMockFallbackEncoding(byte[] input,
+      String charset) {
+    EncodingDetector.FallbackEncodingDetector detector =
+      EasyMock.createNiceMock(EncodingDetector.FallbackEncodingDetector.class);
+    expect(detector.detectEncoding(input)).andReturn(Charset.forName(charset)).once();
+    replay(detector);
+    return detector;
+  }
+
   @Test
   public void asciiAssumesUtf8() throws Exception {
     byte[] data = "Hello, world".getBytes("US-ASCII");
-    assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true).name());
+    assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true, null).name());
   }
-
+ 
   @Test
   public void detectedUtf8WithByteOrderMark() {
     byte[] data = {
         (byte)0xEF, (byte)0xBB, (byte)0xBF, 'h', 'e', 'l', 'l', 'o'
     };
 
-    assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true).name());
+    assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true, null).name());
   }
 
   @Test
   public void assumeLatin1OnInvalidUtf8() throws Exception {
     byte[] data = "\u4F60\u597D".getBytes("BIG5");
-
-    assertEquals("ISO-8859-1", EncodingDetector.detectEncoding(data, true).name());
+    
+    assertEquals("ISO-8859-1", EncodingDetector.detectEncoding(data, true, null).name());
   }
 
   @Test
+  public void testFallbackDetectorIsUsed() throws Exception {
+    byte[] data = ("\u6211\u662F\u4E00\u4E2A\u4E0D\u5584\u4E8E\u8BB2\u8BDD\u7684\u4EBA\uFF0C" +
+                   "\u552F\u5176\u4E0D\u5584\u4E8E\u8BB2\u8BDD\uFF0C\u6709\u601D\u60F3\u8868" +
+                   "\u8FBE\u4E0D\u51FA\uFF0C\u6709\u611F\u60C5\u65E0\u6CD5\u503E\u5410")
+                   .getBytes("GB18030");
+
+    EncodingDetector.FallbackEncodingDetector detector = 
+      newMockFallbackEncoding(data, "GB18030");
+
+    assertEquals("GB18030", EncodingDetector.detectEncoding(data, false, detector).name());
+    verify(detector);
+  }
+  
+  // Test the fallback detector:
+  @Test
   public void doNotAssumeLatin1OnInvalidUtf8() throws Exception {
     byte[] data = ("\u6211\u662F\u4E00\u4E2A\u4E0D\u5584\u4E8E\u8BB2\u8BDD\u7684\u4EBA\uFF0C" +
                    "\u552F\u5176\u4E0D\u5584\u4E8E\u8BB2\u8BDD\uFF0C\u6709\u601D\u60F3\u8868" +
                    "\u8FBE\u4E0D\u51FA\uFF0C\u6709\u611F\u60C5\u65E0\u6CD5\u503E\u5410")
                    .getBytes("GB18030");
 
-    assertEquals("GB18030", EncodingDetector.detectEncoding(data, false).name());
+    EncodingDetector.FallbackEncodingDetector detector =
+        new EncodingDetector.FallbackEncodingDetector();
+
+    assertEquals("GB18030", EncodingDetector.detectEncoding(data, false, detector).name());
   }
 
   @Test
@@ -63,13 +97,32 @@
                    "\u8FBE\u4E0D\u51FA\uFF0C\u6709\u611F\u60C5\u65E0\u6CD5\u503E\u5410")
                    .getBytes("UTF-8");
 
-    assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true).name());
+    EncodingDetector.FallbackEncodingDetector detector =
+        new EncodingDetector.FallbackEncodingDetector();
+
+    assertEquals("UTF-8", detector.detectEncoding(data).name());
   }
 
   @Test
   public void shortUtf8StringIsUtf8() throws Exception {
     byte[] data = "Games, HQ, Mang\u00E1, Anime e tudo que um bom nerd ama".getBytes("UTF-8");
 
-    assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true).name());
+    EncodingDetector.FallbackEncodingDetector detector =
+        new EncodingDetector.FallbackEncodingDetector();
+
+    assertEquals("UTF-8", detector.detectEncoding(data).name());
   }
+  
+  @Test
+  public void nullCustomDetector() throws Exception {
+    byte[] data = "\u4F60\u597D".getBytes("BIG5");
+
+    try {
+      assertEquals("ISO-8859-1", EncodingDetector.detectEncoding(data, false, null).name());
+      fail("Null Custom encoder is not supported");
+    } catch (NullPointerException e) {
+      // Expected!
+    }
+  }
+
 }