You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@shindig.apache.org by jo...@apache.org on 2009/11/24 07:56:49 UTC
svn commit: r883607 - in /incubator/shindig/trunk/java/gadgets/src:
main/java/org/apache/shindig/gadgets/encoding/
main/java/org/apache/shindig/gadgets/http/
test/java/org/apache/shindig/gadgets/encoding/
Author: johnh
Date: Tue Nov 24 06:56:47 2009
New Revision: 883607
URL: http://svn.apache.org/viewvc?rev=883607&view=rev
Log:
Refactoring to allow injection of a better (or simply profiled) encoding detector than the ICU
or simple UTF8 detection.
Thanks to Ziv Horesh for the patch!
Modified:
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java
incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java
incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java
Modified: incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java?rev=883607&r1=883606&r2=883607&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/encoding/EncodingDetector.java Tue Nov 24 06:56:47 2009
@@ -20,6 +20,7 @@
import java.nio.charset.Charset;
+import com.google.inject.Inject;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
@@ -32,6 +33,17 @@
private static final Charset UTF_8 = Charset.forName("UTF-8");
private static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
+
+ public static class FallbackEncodingDetector {
+ public Charset detectEncoding(byte[] input) {
+ // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
+ CharsetDetector detector = new CharsetDetector();
+ detector.setText(input);
+ CharsetMatch match = detector.detect();
+ return Charset.forName(match.getName().toUpperCase());
+ }
+ }
+
/**
* Returns the detected encoding of the given byte array.
*
@@ -40,9 +52,12 @@
* encoding for HTTP) if the bytes are not valid UTF-8. Only recommended if you can reasonably
* expect that other encodings are going to be specified. Full encoding detection is very
* expensive!
+ * @param alternateDecoder specify a fallback encoding detection.
+ * Only used if assume88591IfNotUtf8 is false.
* @return The detected encoding.
*/
- public static Charset detectEncoding(byte[] input, boolean assume88591IfNotUtf8) {
+ public static Charset detectEncoding(byte[] input, boolean assume88591IfNotUtf8,
+ FallbackEncodingDetector alternateDecoder) {
if (looksLikeValidUtf8(input)) {
return UTF_8;
}
@@ -51,11 +66,8 @@
return ISO_8859_1;
}
- // Fall back to the incredibly slow ICU. It might be better to just skip this entirely.
- CharsetDetector detector = new CharsetDetector();
- detector.setText(input);
- CharsetMatch match = detector.detect();
- return Charset.forName(match.getName().toUpperCase());
+ // Fall back encoding:
+ return alternateDecoder.detectEncoding(input);
}
/**
Modified: incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java?rev=883607&r1=883606&r2=883607&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/main/java/org/apache/shindig/gadgets/http/HttpResponse.java Tue Nov 24 06:56:47 2009
@@ -134,6 +134,11 @@
@Inject(optional = true) @Named("shindig.http.fast-encoding-detection")
private static boolean fastEncodingDetection = true;
+ // Support injection of smarter encoding detection
+ @Inject(optional = true)
+ private static EncodingDetector.FallbackEncodingDetector customEncodingDetector =
+ new EncodingDetector.FallbackEncodingDetector();
+
// Holds character sets for fast conversion
private static final Map<String, Charset> encodingToCharset = new MapMaker().makeMap();
@@ -472,7 +477,8 @@
}
}
}
- Charset encoding = EncodingDetector.detectEncoding(body, fastEncodingDetection);
+ Charset encoding = EncodingDetector.detectEncoding(body, fastEncodingDetection,
+ customEncodingDetector);
// Record the charset in the content-type header so that its value can be cached
// and re-used. This is a BIG performance win.
values.clear();
Modified: incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java
URL: http://svn.apache.org/viewvc/incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java?rev=883607&r1=883606&r2=883607&view=diff
==============================================================================
--- incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java (original)
+++ incubator/shindig/trunk/java/gadgets/src/test/java/org/apache/shindig/gadgets/encoding/EncodingDetectorTest.java Tue Nov 24 06:56:47 2009
@@ -18,42 +18,76 @@
*/
package org.apache.shindig.gadgets.encoding;
+import static org.easymock.classextension.EasyMock.expect;
+import static org.easymock.classextension.EasyMock.replay;
+import static org.easymock.classextension.EasyMock.verify;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+import org.easymock.classextension.EasyMock;
import org.junit.Test;
+import java.nio.charset.Charset;
+
public class EncodingDetectorTest {
+ private EncodingDetector.FallbackEncodingDetector newMockFallbackEncoding(byte[] input,
+ String charset) {
+ EncodingDetector.FallbackEncodingDetector detector =
+ EasyMock.createNiceMock(EncodingDetector.FallbackEncodingDetector.class);
+ expect(detector.detectEncoding(input)).andReturn(Charset.forName(charset)).once();
+ replay(detector);
+ return detector;
+ }
+
@Test
public void asciiAssumesUtf8() throws Exception {
byte[] data = "Hello, world".getBytes("US-ASCII");
- assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true).name());
+ assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true, null).name());
}
-
+
@Test
public void detectedUtf8WithByteOrderMark() {
byte[] data = {
(byte)0xEF, (byte)0xBB, (byte)0xBF, 'h', 'e', 'l', 'l', 'o'
};
- assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true).name());
+ assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true, null).name());
}
@Test
public void assumeLatin1OnInvalidUtf8() throws Exception {
byte[] data = "\u4F60\u597D".getBytes("BIG5");
-
- assertEquals("ISO-8859-1", EncodingDetector.detectEncoding(data, true).name());
+
+ assertEquals("ISO-8859-1", EncodingDetector.detectEncoding(data, true, null).name());
}
@Test
+ public void testFallbackDetectorIsUsed() throws Exception {
+ byte[] data = ("\u6211\u662F\u4E00\u4E2A\u4E0D\u5584\u4E8E\u8BB2\u8BDD\u7684\u4EBA\uFF0C" +
+ "\u552F\u5176\u4E0D\u5584\u4E8E\u8BB2\u8BDD\uFF0C\u6709\u601D\u60F3\u8868" +
+ "\u8FBE\u4E0D\u51FA\uFF0C\u6709\u611F\u60C5\u65E0\u6CD5\u503E\u5410")
+ .getBytes("GB18030");
+
+ EncodingDetector.FallbackEncodingDetector detector =
+ newMockFallbackEncoding(data, "GB18030");
+
+ assertEquals("GB18030", EncodingDetector.detectEncoding(data, false, detector).name());
+ verify(detector);
+ }
+
+ // Test the fallback detector:
+ @Test
public void doNotAssumeLatin1OnInvalidUtf8() throws Exception {
byte[] data = ("\u6211\u662F\u4E00\u4E2A\u4E0D\u5584\u4E8E\u8BB2\u8BDD\u7684\u4EBA\uFF0C" +
"\u552F\u5176\u4E0D\u5584\u4E8E\u8BB2\u8BDD\uFF0C\u6709\u601D\u60F3\u8868" +
"\u8FBE\u4E0D\u51FA\uFF0C\u6709\u611F\u60C5\u65E0\u6CD5\u503E\u5410")
.getBytes("GB18030");
- assertEquals("GB18030", EncodingDetector.detectEncoding(data, false).name());
+ EncodingDetector.FallbackEncodingDetector detector =
+ new EncodingDetector.FallbackEncodingDetector();
+
+ assertEquals("GB18030", EncodingDetector.detectEncoding(data, false, detector).name());
}
@Test
@@ -63,13 +97,32 @@
"\u8FBE\u4E0D\u51FA\uFF0C\u6709\u611F\u60C5\u65E0\u6CD5\u503E\u5410")
.getBytes("UTF-8");
- assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true).name());
+ EncodingDetector.FallbackEncodingDetector detector =
+ new EncodingDetector.FallbackEncodingDetector();
+
+ assertEquals("UTF-8", detector.detectEncoding(data).name());
}
@Test
public void shortUtf8StringIsUtf8() throws Exception {
byte[] data = "Games, HQ, Mang\u00E1, Anime e tudo que um bom nerd ama".getBytes("UTF-8");
- assertEquals("UTF-8", EncodingDetector.detectEncoding(data, true).name());
+ EncodingDetector.FallbackEncodingDetector detector =
+ new EncodingDetector.FallbackEncodingDetector();
+
+ assertEquals("UTF-8", detector.detectEncoding(data).name());
}
+
+ @Test
+ public void nullCustomDetector() throws Exception {
+ byte[] data = "\u4F60\u597D".getBytes("BIG5");
+
+ try {
+ assertEquals("ISO-8859-1", EncodingDetector.detectEncoding(data, false, null).name());
+ fail("Null Custom encoder is not supported");
+ } catch (NullPointerException e) {
+ // Expected!
+ }
+ }
+
}