You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@abdera.apache.org by jm...@apache.org on 2006/07/12 08:06:03 UTC
svn commit: r421149 - in /incubator/abdera/java/trunk: core/src/main/java/org/apache/abdera/parser/ core/src/main/java/org/apache/abdera/util/ parser/src/main/java/org/apache/abdera/parser/stax/

Author: jmsnell
Date: Tue Jul 11 23:06:02 2006
New Revision: 421149

URL: http://svn.apache.org/viewvc?rev=421149&view=rev
Log:
OK, we now have better autodetection of charset encoding.
The SniffingInputStream first looks for a BOM to determine the charset.  Failing that, it looks at the XML prolog
for the declared encoding.  A charset passed in via ParserOptions takes precedence and skips detection; otherwise the
detected charset is used to initialize the InputStreamReader that the parser reads from.

To turn off charset autodetection, call the new ParserOptions.setAutodetectCharset(false).  Autodetection is on by default.

This gives us broader support for charsets, but still isn't as broad as, say, Mark Pilgrim's feedparser.

I'm sure there are ways this could be improved, but it seems to work.

Added:
    incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/SniffingInputStream.java
Removed:
    incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/SniffingInputStream.java
Modified:
    incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/parser/ParserOptions.java
    incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/AbstractParserOptions.java
    incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/FOMParser.java

Modified: incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/parser/ParserOptions.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/parser/ParserOptions.java?rev=421149&r1=421148&r2=421149&view=diff
==============================================================================
--- incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/parser/ParserOptions.java (original)
+++ incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/parser/ParserOptions.java Tue Jul 11 23:06:02 2006
@@ -76,4 +76,8 @@
    * character strings encountered during the parse process.
    */
   void setTextFilter(TextFilter textFilter);
+  
+  boolean getAutodetectCharset();
+  
+  void setAutodetectCharset(boolean detect);
 }

Modified: incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/AbstractParserOptions.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/AbstractParserOptions.java?rev=421149&r1=421148&r2=421149&view=diff
==============================================================================
--- incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/AbstractParserOptions.java (original)
+++ incubator/abdera/java/trunk/core/src/main/java/org/apache/abdera/util/AbstractParserOptions.java Tue Jul 11 23:06:02 2006
@@ -32,6 +32,7 @@
   protected String charset = null;
   protected ParseFilter parseFilter = null;
   protected TextFilter textFilter = null;
+  protected boolean detect = true;
 
   protected abstract void initFactory();
   protected abstract void checkFactory(Factory factory);
@@ -67,5 +68,13 @@
   
   public void setTextFilter(TextFilter textFilter) {
     this.textFilter = textFilter;
+  }
+  
+  public boolean getAutodetectCharset() {
+    return this.detect;
+  }
+  
+  public void setAutodetectCharset(boolean detect) {
+    this.detect = detect;
   }
 }

Modified: incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/FOMParser.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/FOMParser.java?rev=421149&r1=421148&r2=421149&view=diff
==============================================================================
--- incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/FOMParser.java (original)
+++ incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/FOMParser.java Tue Jul 11 23:06:02 2006
@@ -18,6 +18,7 @@
 package org.apache.abdera.parser.stax;
 
 import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.io.Reader;
 import java.net.URI;
 
@@ -31,6 +32,7 @@
 import org.apache.abdera.parser.Parser;
 import org.apache.abdera.parser.ParserOptions;
 import org.apache.abdera.util.AbstractParser;
+//import org.apache.abdera.util.SniffingInputStream;
 import org.apache.axiom.om.OMDocument;
 
 public class FOMParser 
@@ -53,26 +55,43 @@
     ParserOptions options) {
       Document<T> document = builder.getFomDocument();
       document.setBaseUri(base);
-      if (options.getCharset() != null) {
-        ((OMDocument)document).setCharsetEncoding(options.getCharset());
-      }
       return document;
   }
   
+  /** Calls doc.setCharset with the parsed charset, then setCharsetEncoding with any options charset; null options are tolerated. */
+  private void setCharset(ParserOptions options, String charset, Document doc) {
+    if (charset != null) doc.setCharset(charset);
+    if (options != null && options.getCharset() != null)
+      ((OMDocument)doc).setCharsetEncoding(options.getCharset());
+  }
+  
   public <T extends Element>Document<T> parse(
     InputStream in, 
     URI base, 
     ParserOptions options)
       throws ParseException {
     Document<T> document = null;
-
     if (in == null)
       throw new IllegalArgumentException("InputStream must not be null");
-
     try {
-      FOMFactory factory = getFomFactory(options);
-      FOMBuilder builder = new FOMBuilder(factory, in, options);
-      document = getDocument(builder, base, options);
+      String charset = (options != null) ? options.getCharset() : null;
+      boolean detect = (options != null) ? options.getAutodetectCharset() : true;
+      if (charset == null && detect) {
+        SniffingInputStream sin = 
+          (in instanceof SniffingInputStream) ? 
+            (SniffingInputStream)in : 
+            new SniffingInputStream(in);
+        charset = sin.getEncoding();
+        in = sin;
+      }
+      Reader isr = null;
+      if (charset == null) {
+        isr = new InputStreamReader(in);
+      } else {
+        isr = new InputStreamReader(in,charset);
+      }
+      if (options != null && charset != null) options.setCharset(charset);
+      document = parse(isr, base, options);
     } catch (Exception e) {
       if (!(e instanceof ParseException))
         e = new ParseException(e);
@@ -87,16 +106,15 @@
     ParserOptions options) 
       throws ParseException {
     Document<T> document = null;
-
     if (in == null)
       throw new IllegalArgumentException("Reader must not be null");
-
     try {
       FOMFactory factory = getFomFactory(options);
       XMLStreamReader xmlreader = 
         XMLInputFactory.newInstance().createXMLStreamReader(in);
       FOMBuilder builder = new FOMBuilder(factory, xmlreader, options);
       document = getDocument(builder, base, options);
+      setCharset(options, xmlreader.getCharacterEncodingScheme(), document);
     } catch (Exception e) {
       if (!(e instanceof ParseException))
         e = new ParseException(e);

Added: incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/SniffingInputStream.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/SniffingInputStream.java?rev=421149&view=auto
==============================================================================
--- incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/SniffingInputStream.java (added)
+++ incubator/abdera/java/trunk/parser/src/main/java/org/apache/abdera/parser/stax/SniffingInputStream.java Tue Jul 11 23:06:02 2006
@@ -0,0 +1,200 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  The ASF licenses this file to You
+* under the Apache License, Version 2.0 (the "License"); you may not
+* use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.  For additional information regarding
+* copyright in this work, please see the NOTICE file in the top level
+* directory of this distribution.
+*/
+package org.apache.abdera.parser.stax;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.FilterInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamReader;
+
+/**
+ * Stream wrapper that tries to autodetect the character encoding of an XML
+ * document at construction time. A byte order mark decides on its own;
+ * otherwise the byte layout of an unmarked XML declaration gives a guess,
+ * and an encoding named in the XML declaration takes precedence over that
+ * guess. Detection only peeks at the stream: every byte, including any BOM,
+ * is still delivered to whoever reads from this stream afterwards.
+ */
+public class SniffingInputStream 
+  extends FilterInputStream {
+
+  /** Maximum number of leading bytes buffered and inspected while sniffing. */
+  private static final int SNIFF_LIMIT = 4096;
+
+  private String encoding = null;
+  private boolean bomset = false;
+
+  /**
+   * Wraps the given stream and runs the detection immediately.
+   *
+   * @param in the stream to sniff; it is buffered internally so that the
+   *   inspected bytes can be replayed to the reader
+   */
+  public SniffingInputStream(InputStream in) {
+    super(new BufferedInputStream(in, SNIFF_LIMIT));
+    try {
+      encoding = detectEncoding();
+    } catch (IOException ignored) {
+      // Best effort: the constructor declares no checked exception, so a
+      // failed sniff just leaves the encoding unknown (null).
+    }
+  }
+
+  /**
+   * @return true if a byte order mark was found at the start of the stream
+   */
+  public boolean isBomSet() {
+    return bomset;
+  }
+
+  /**
+   * @return the detected charset name, or null if none could be determined
+   */
+  public String getEncoding() {
+    return encoding;
+  }
+
+  /**
+   * Peeks at the head of the stream and derives the charset from it.
+   *
+   * @return the detected charset name, or null if undetermined
+   * @throws IOException if the stream cannot be read or rewound
+   */
+  private String detectEncoding() throws IOException {
+    byte[] head = new byte[SNIFF_LIMIT];
+    int len = peek(head);
+
+    // Byte order marks. The four-byte UTF-32 forms must be tested before the
+    // two-byte UTF-16 forms because they share a prefix.
+    bomset = true;
+    if (startsWith(head, len, 0x00, 0x00, 0xFE, 0xFF)) {
+      return "utf-32be";
+    }
+    if (startsWith(head, len, 0xFF, 0xFE, 0x00, 0x00)) {
+      return "utf-32le";
+    }
+    if (startsWith(head, len, 0xFE, 0xFF, 0x00, 0x00) ||
+        startsWith(head, len, 0x00, 0x00, 0xFF, 0xFE)) {
+      // Unusual four-byte orderings: a BOM is present but no charset name
+      // is reported for it.
+      return null;
+    }
+    if (startsWith(head, len, 0xFE, 0xFF)) {
+      return "utf-16be";
+    }
+    if (startsWith(head, len, 0xFF, 0xFE)) {
+      return "utf-16le";
+    }
+    if (startsWith(head, len, 0xEF, 0xBB, 0xBF)) {
+      return "utf-8";
+    }
+    bomset = false;
+
+    // No BOM: guess from how the leading "<?" (or "<") is laid out in bytes.
+    String charset = null;
+    if (startsWith(head, len, 0x00, 0x00, 0x00, 0x3C)) {
+      charset = "utf-32be";
+    } else if (startsWith(head, len, 0x3C, 0x00, 0x00, 0x00)) {
+      charset = "utf-32le";
+    } else if (startsWith(head, len, 0x00, 0x3C, 0x00, 0x3F)) {
+      charset = "utf-16be";
+    } else if (startsWith(head, len, 0x3C, 0x00, 0x3F, 0x00)) {
+      charset = "utf-16le";
+    } else if (startsWith(head, len, 0x4C, 0x6F, 0xA7, 0x94) &&
+               Charset.isSupported("IBM037")) {
+      // "<?xm" in EBCDIC. IBM037 is used as a starting point when this
+      // runtime provides it; a declared encoding below still overrides it.
+      charset = "IBM037";
+    }
+
+    String declared = readDeclaredEncoding(head, len);
+    return (declared != null) ? declared : charset;
+  }
+
+  /**
+   * Copies up to buf.length leading bytes of the stream into buf and rewinds,
+   * so that nothing is consumed. The mark limit equals the number of bytes
+   * that can be read here, so the reset stays within the marked range.
+   *
+   * @return the number of bytes copied; less than buf.length only at EOF
+   */
+  private int peek(byte[] buf) throws IOException {
+    int len = 0;
+    in.mark(buf.length);
+    try {
+      while (len < buf.length) {
+        int n = in.read(buf, len, buf.length - len);
+        if (n < 0) {
+          break;
+        }
+        len += n;
+      }
+    } finally {
+      in.reset();
+    }
+    return len;
+  }
+
+  /**
+   * Reads the encoding pseudo-attribute of the XML declaration, if any, from
+   * the sniffed bytes.
+   *
+   * @return the declared encoding, or null if absent or unparseable
+   */
+  private static String readDeclaredEncoding(byte[] buf, int len) {
+    XMLStreamReader xmlreader = null;
+    try {
+      xmlreader = XMLInputFactory.newInstance().createXMLStreamReader(
+        new ByteArrayInputStream(buf, 0, len));
+      return xmlreader.getCharacterEncodingScheme();
+    } catch (Exception e) {
+      // Not XML, or not decodable by the StAX reader: no declared encoding.
+      return null;
+    } finally {
+      if (xmlreader != null) {
+        try {
+          xmlreader.close();
+        } catch (Exception ignored) {
+          // Nothing useful can be done if closing the throwaway reader fails.
+        }
+      }
+    }
+  }
+
+  /**
+   * Tests whether the first bytes of buf equal the given unsigned values.
+   * Bytes are masked to 0..255 before comparing because Java bytes are signed.
+   */
+  private static boolean startsWith(byte[] buf, int len, int... prefix) {
+    if (len < prefix.length) {
+      return false;
+    }
+    for (int i = 0; i < prefix.length; i++) {
+      if ((buf[i] & 0xFF) != prefix[i]) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+}