You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@abdera.apache.org by jm...@apache.org on 2006/07/12 03:57:42 UTC
svn commit: r421073 -
/incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/SniffingInputStream.java
Author: jmsnell
Date: Tue Jul 11 18:57:41 2006
New Revision: 421073
URL: http://svn.apache.org/viewvc?rev=421073&view=rev
Log:
A character encoding sniffing inputstream implementation.
Later on this evening I'll plug this into the parser so that things just work correctly.
Added:
incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/SniffingInputStream.java
Added: incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/SniffingInputStream.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/SniffingInputStream.java?rev=421073&view=auto
==============================================================================
--- incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/SniffingInputStream.java (added)
+++ incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/SniffingInputStream.java Tue Jul 11 18:57:41 2006
@@ -0,0 +1,86 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. The ASF licenses this file to You
+* under the Apache License, Version 2.0 (the "License"); you may not
+* use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License. For additional information regarding
+* copyright in this work, please see the NOTICE file in the top level
+* directory of this distribution.
+*/
+package org.apache.abdera.util;
+
+import java.io.FilterInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+
+/**
+ * Will attempt to autodetect the character encoding from the stream
+ * This will preserve the BOM if it exists
+ */
+public class SniffingInputStream
+ extends FilterInputStream {
+
+ private String encoding = null;
+ private boolean bomset = false;
+
+ protected SniffingInputStream(InputStream in) throws IOException {
+ super(new PushbackInputStream(in,4));
+ encoding = detectEncoding();
+ }
+
+ public boolean isBomSet() {
+ return bomset;
+ }
+
+ public String getEncoding() {
+ return encoding;
+ }
+
+ private String detectEncoding() throws IOException {
+ PushbackInputStream pin = (PushbackInputStream) this.in;
+ byte[] bom = new byte[4];
+ pin.read(bom);
+ pin.unread(bom);
+ if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == 0xFFFFFFFE && bom[3] == 0xFFFFFFFF) {
+ bomset = true;
+ return "utf-32be";
+ } else if (bom[0] == 0xFFFFFFFF && bom[1] == 0xFFFFFFFE && bom[2] == 0x00 && bom[3] == 0x00) {
+ bomset = true;
+ return "utf-32le";
+ } else if ((bom[0] == 0xFFFFFFFE && bom[1] == 0xFFFFFFFF && bom[2] == 0x00 && bom[3] == 0x00) ||
+ (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == 0xFFFFFFFF && bom[3] == 0xFFFFFFFE)) {
+ bomset = true;
+ return null;
+ } else if (bom[0] == 0xFFFFFFFE && bom[1] == 0xFFFFFFFF) {
+ bomset = true;
+ return "utf-16be";
+ } else if (bom[0] == 0xFFFFFFFF && bom[1] == 0xFFFFFFFE) {
+ bomset = true;
+ return "utf-16le";
+ } else if (bom[0] == 0xFFFFFFEF && bom[1] == 0xFFFFFFBB && bom[2] == 0xFFFFFFBF) {
+ bomset = true;
+ return "utf-8";
+ } else if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == 0x00 && bom[3] == 0x3C) {
+ return "utf-32be";
+ } else if (bom[0] == 0x3C && bom[1] == 0x00 && bom[2] == 0x00 && bom[3] == 0x00) {
+ return "utf-32le";
+ } else if (bom[0] == 0x00 && bom[1] == 0x3C && bom[2] == 0x00 && bom[3] == 0x3F) {
+ return "utf-16be";
+ } else if (bom[0] == 0x3C && bom[1] == 0x00 && bom[2] == 0x3F && bom[3] == 0x00) {
+ return "utf-16le";
+ } else if (bom[0] == 0x4C && bom[1] == 0x6F && bom[2] == 0xA7 && bom[3] == 0x94) {
+ return "edbdic";
+ }
+ return null;
+ }
+
+}