You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by eb...@apache.org on 2012/03/07 16:58:12 UTC
svn commit: r1298001 - in /commons/sandbox/csv/trunk/src/main/java/org/apache/commons/csv: CSVParser.java UnicodeUnescapeReader.java

Author: ebourg
Date: Wed Mar  7 15:58:12 2012
New Revision: 1298001

URL: http://svn.apache.org/viewvc?rev=1298001&view=rev
Log:
Replaced the unicode escaping code from the parser with a class implementing java.io.Reader

Added:
    commons/sandbox/csv/trunk/src/main/java/org/apache/commons/csv/UnicodeUnescapeReader.java   (with props)
Modified:
    commons/sandbox/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java

Modified: commons/sandbox/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java
URL: http://svn.apache.org/viewvc/commons/sandbox/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java?rev=1298001&r1=1298000&r2=1298001&view=diff
==============================================================================
--- commons/sandbox/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java (original)
+++ commons/sandbox/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java Wed Mar  7 15:58:12 2012
@@ -76,8 +76,6 @@ public class CSVParser implements Iterab
     private final List<String> record = new ArrayList<String>();
     private final Token reusableToken = new Token();
     private final CharBuffer wsBuf = new CharBuffer();
-    private final CharBuffer code = new CharBuffer(4);
-
 
     /**
      * Token is an internal token representation.
@@ -137,6 +135,10 @@ public class CSVParser implements Iterab
      * @param format the CSVFormat used for CSV parsing
      */
     public CSVParser(Reader input, CSVFormat format) {
+        if (format.isUnicodeEscapesInterpreted()) {
+            input = new UnicodeUnescapeReader(input);
+        }
+        
         this.in = new ExtendedBufferedReader(input);
         this.format = format;
     }
@@ -404,9 +406,6 @@ public class CSVParser implements Iterab
                 tkn.type = TOKEN;
                 tkn.isReady = true;
                 break;
-            } else if (c == '\\' && format.isUnicodeEscapesInterpreted() && in.lookAhead() == 'u') {
-                // interpret unicode escaped chars (like \u0070 -> p)
-                tkn.content.append((char) unicodeEscapeLexer(c));
             } else if (c == format.getEscape()) {
                 tkn.content.append((char) readEscape(c));
             } else {
@@ -444,10 +443,8 @@ public class CSVParser implements Iterab
         // assert c == delimiter;
         for (; ;) {
             c = in.read();
-
-            if (c == '\\' && format.isUnicodeEscapesInterpreted() && in.lookAhead() == 'u') {
-                tkn.content.append((char) unicodeEscapeLexer(c));
-            } else if (c == format.getEscape()) {
+            
+            if (c == format.getEscape()) {
                 tkn.content.append((char) readEscape(c));
             } else if (c == format.getEncapsulator()) {
                 if (in.lookAhead() == format.getEncapsulator()) {
@@ -487,62 +484,23 @@ public class CSVParser implements Iterab
         }
     }
 
-
-    /**
-     * Decodes Unicode escapes.
-     * <p/>
-     * Interpretation of "\\uXXXX" escape sequences where XXXX is a hex-number.
-     *
-     * @param c current char which is discarded because it's the "\\" of "\\uXXXX"
-     * @return the decoded character
-     * @throws IOException on wrong unicode escape sequence or read error
-     */
-    private int unicodeEscapeLexer(int c) throws IOException {
-        int ret = 0;
-        // ignore 'u' (assume c==\ now) and read 4 hex digits
-        c = in.read();
-        code.clear();
-        try {
-            for (int i = 0; i < 4; i++) {
-                c = in.read();
-                if (isEndOfFile(c) || isEndOfLine(c)) {
-                    throw new NumberFormatException("number too short");
-                }
-                code.append((char) c);
-            }
-            ret = Integer.parseInt(code.toString(), 16);
-        } catch (NumberFormatException e) {
-            throw new IOException(
-                    "(line " + getLineNumber() + ") Wrong unicode escape sequence found '"
-                            + code.toString() + "'" + e.toString());
-        }
-        return ret;
-    }
-
     private int readEscape(int c) throws IOException {
         // assume c is the escape char (normally a backslash)
         c = in.read();
-        int out;
         switch (c) {
             case 'r':
-                out = '\r';
-                break;
+                return '\r';
             case 'n':
-                out = '\n';
-                break;
+                return '\n';
             case 't':
-                out = '\t';
-                break;
+                return '\t';
             case 'b':
-                out = '\b';
-                break;
+                return '\b';
             case 'f':
-                out = '\f';
-                break;
+                return '\f';
             default:
-                out = c;
+                return c;
         }
-        return out;
     }
 
     /**

Added: commons/sandbox/csv/trunk/src/main/java/org/apache/commons/csv/UnicodeUnescapeReader.java
URL: http://svn.apache.org/viewvc/commons/sandbox/csv/trunk/src/main/java/org/apache/commons/csv/UnicodeUnescapeReader.java?rev=1298001&view=auto
==============================================================================
--- commons/sandbox/csv/trunk/src/main/java/org/apache/commons/csv/UnicodeUnescapeReader.java (added)
+++ commons/sandbox/csv/trunk/src/main/java/org/apache/commons/csv/UnicodeUnescapeReader.java Wed Mar  7 15:58:12 2012
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.csv;
+
+import java.io.IOException;
+import java.io.PushbackReader;
+import java.io.Reader;
+
+/**
+ * Reader transforming Unicode escape sequences (e.g. \u0065) in the provided
+ * stream into the corresponding Unicode characters.
+ * 
+ * @author Emmanuel Bourg
+ * @version $Revision$, $Date$
+ */
+class UnicodeUnescapeReader extends Reader {
+    private PushbackReader reader;
+
+    /** The buffer used to read unicode escape sequences. */
+    private final char[] sequence = new char[5];
+
+    UnicodeUnescapeReader(Reader reader) {
+        this.reader = new PushbackReader(reader, sequence.length);
+    }
+
+    public int read(char[] cbuf, int off, int len) throws IOException {
+        int count = 0;
+        for (int i = 0; i < len; i++) {
+            int c = reader.read();
+            
+            if (c == -1) {
+                return count == 0 ? -1 : count;
+            }
+            
+            if (c == '\\') {
+                int l = reader.read(sequence);
+                if (l == sequence.length 
+                        && 'u' == sequence[0]
+                        && isHexadecimal(sequence[1])
+                        && isHexadecimal(sequence[2])
+                        && isHexadecimal(sequence[3])
+                        && isHexadecimal(sequence[4])) {
+                    // unicode escape found
+                    c = Integer.parseInt(new String(sequence, 1, 4), 16);
+                    
+                } else if (l > 0) {
+                    // put the characters back in the stream
+                    reader.unread(sequence, 0, l);
+                }
+            }
+
+            cbuf[off + i] = (char) c;
+            count++;
+        }
+        
+        return count;
+    }
+    
+    private boolean isHexadecimal(char c) {
+        return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
+    }
+
+    public void close() throws IOException {
+        if (reader != null) {
+            reader.close();
+        }
+    }
+}

Propchange: commons/sandbox/csv/trunk/src/main/java/org/apache/commons/csv/UnicodeUnescapeReader.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: commons/sandbox/csv/trunk/src/main/java/org/apache/commons/csv/UnicodeUnescapeReader.java
------------------------------------------------------------------------------
    svn:keywords = Date Author Id Revision HeadURL