You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@commons.apache.org by Benedikt Ritter <br...@apache.org> on 2013/05/03 08:42:11 UTC

Re: svn commit: r1478621 - in /commons/proper/csv/trunk/src: main/java/org/apache/commons/csv/CSVLexer.java main/java/org/apache/commons/csv/Lexer.java test/java/org/apache/commons/csv/CSVLexerTest.java

2013/5/3 <se...@apache.org>

> Author: sebb
> Date: Fri May  3 01:10:34 2013
> New Revision: 1478621
>
> URL: http://svn.apache.org/r1478621
> Log:
> CSV-58 Unescape handling needs rethinking
> Fixed up most issues.
> TODO should TAB, FF and BACKSPACE be un/escaped?
>

Thanks for taking care of this sebb.


>
> Modified:
>
> commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java
>
> commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/Lexer.java
>
> commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java
>
> Modified:
> commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java
> URL:
> http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java?rev=1478621&r1=1478620&r2=1478621&view=diff
>
> ==============================================================================
> ---
> commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java
> (original)
> +++
> commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java
> Fri May  3 01:10:34 2013
> @@ -160,7 +160,12 @@ final class CSVLexer extends Lexer {
>                  tkn.type = TOKEN;
>                  break;
>              } else if (isEscape(c)) {
> -                tkn.content.append((char) readEscape());
> +                final int unescaped = readEscape();
> +                if (unescaped == Constants.END_OF_STREAM) { // unexpected
> char after escape
> +                    tkn.content.append((char) c).append((char)
> in.getLastChar());
> +                } else {
> +                    tkn.content.append((char) unescaped);
> +                }
>                  c = in.read(); // continue
>              } else {
>                  tkn.content.append((char) c);
> @@ -203,7 +208,12 @@ final class CSVLexer extends Lexer {
>              c = in.read();
>
>              if (isEscape(c)) {
> -                tkn.content.append((char) readEscape());
> +                final int unescaped = readEscape();
> +                if (unescaped == Constants.END_OF_STREAM) { // unexpected
> char after escape
> +                    tkn.content.append((char) c).append((char)
> in.getLastChar());
> +                } else {
> +                    tkn.content.append((char) unescaped);
> +                }
>              } else if (isQuoteChar(c)) {
>                  if (isQuoteChar(in.lookAhead())) {
>                      // double or escaped encapsulator -> add single
> encapsulator to token
>
> Modified:
> commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/Lexer.java
> URL:
> http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/Lexer.java?rev=1478621&r1=1478620&r2=1478621&view=diff
>
> ==============================================================================
> ---
> commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/Lexer.java
> (original)
> +++
> commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/Lexer.java
> Fri May  3 01:10:34 2013
> @@ -74,8 +74,18 @@ abstract class Lexer {
>      }
>
>      // TODO escape handling needs more work
> +    /**
> +     * Handle an escape sequence.
> +     * The current character must be the escape character.
> +     * On return, the next character is available by calling {@link
> ExtendedBufferedReader#getLastChar()}
> +     * on the input stream.
> +     *
> +     * @return the unescaped character (as an int) or {@link
> END_OF_STREAM} if char following the escape is invalid.
> +     * @throws IOException if there is a problem reading the stream or
> the end of stream is detected:
> +     * the escape character is not allowed at end of strem
> +     */
>      int readEscape() throws IOException {
> -        // assume c is the escape char (normally a backslash)
> +        // the escape char has just been read (normally a backslash)
>          final int c = in.read();
>          switch (c) {
>          case 'r':
> @@ -88,10 +98,21 @@ abstract class Lexer {
>              return BACKSPACE;
>          case 'f':
>              return FF;
> +        case CR:
> +        case LF:
> +        case FF: // TODO is this correct?
> +        case TAB: // TODO is this correct? Do tabs need to be escaped?
> +        case BACKSPACE: // TODO is this correct?
> +            return c;
>          case END_OF_STREAM:
>              throw new IOException("EOF whilst processing escape
> sequence");
>          default:
> -            return c;
> +            // Now check for meta-characters
> +            if (isDelimiter(c) || isEscape(c) || isQuoteChar(c) ||
> isCommentStart(c)) {
> +                return c;
> +            }
> +            // indicate unexpected char - available from in.getLastChar()
> +            return END_OF_STREAM;
>          }
>      }
>
>
> Modified:
> commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java
> URL:
> http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java?rev=1478621&r1=1478620&r2=1478621&view=diff
>
> ==============================================================================
> ---
> commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java
> (original)
> +++
> commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java
> Fri May  3 01:10:34 2013
> @@ -36,7 +36,6 @@ import java.io.IOException;
>  import java.io.StringReader;
>
>  import org.junit.Before;
> -import org.junit.Ignore;
>  import org.junit.Test;
>
>  /**
> @@ -311,48 +310,42 @@ public class CSVLexerTest {
>          assertThat(lexer.nextToken(new Token()), hasContent("character" +
> LF + "Escaped"));
>      }
>
> -    @Test
> +    @Test // TODO is this correct? Do we expect TAB to be un/escaped?
>      public void testEscapedTab() throws Exception {
>          final Lexer lexer = getLexer("character\\" + TAB + "Escaped",
> formatWithEscaping);
>          assertThat(lexer.nextToken(new Token()), hasContent("character" +
> TAB + "Escaped"));
>      }
>
> -    @Test
> +    @Test // TODO is this correct? Do we expect BACKSPACE to be
> un/escaped?
>      public void testEscapeBackspace() throws Exception {
>          final Lexer lexer = getLexer("character\\" + BACKSPACE +
> "Escaped", formatWithEscaping);
>          assertThat(lexer.nextToken(new Token()), hasContent("character" +
> BACKSPACE + "Escaped"));
>      }
>
> -    @Test
> +    @Test // TODO is this correct? Do we expect FF to be un/escaped?
>      public void testEscapeFF() throws Exception {
>          final Lexer lexer = getLexer("character\\" + FF + "Escaped",
> formatWithEscaping);
>          assertThat(lexer.nextToken(new Token()), hasContent("character" +
> FF + "Escaped"));
>      }
>
> -    // FIXME this should work after CSV-58 is resolved. Currently the
> result will be "charactera\NEscaped"
>      @Test
> -    @Ignore
>      public void testEscapedMySqlNullValue() throws Exception {
>          // MySQL uses \N to symbolize null values. We have to restore this
>          final Lexer lexer = getLexer("character\\NEscaped",
> formatWithEscaping);
>          assertThat(lexer.nextToken(new Token()),
> hasContent("character\\NEscaped"));
>      }
>
> -    // FIXME this should work after CSV-58 is resolved. Currently the
> result will be "characteraEscaped"
>      @Test
> -    @Ignore
>      public void testEscapedCharacter() throws Exception {
>          final Lexer lexer = getLexer("character\\aEscaped",
> formatWithEscaping);
>          assertThat(lexer.nextToken(new Token()),
> hasContent("character\\aEscaped"));
>      }
>
> -    // FIXME this should work after CSV-58 is resolved. Currently the
> result will be "characterCREscaped"
>      @Test
> -    @Ignore
>      public void testEscapedControlCharacter() throws Exception {
> -        // we are explicitly using an escape different from \ here,
> because \r is the character sequence for CR
> +        // we are explicitly using an escape different from \ here
>          final Lexer lexer = getLexer("character!rEscaped",
> CSVFormat.newBuilder().withEscape('!').build());
> -        assertThat(lexer.nextToken(new Token()),
> hasContent("character!rEscaped"));
> +        assertThat(lexer.nextToken(new Token()), hasContent("character" +
> CR + "Escaped"));
>      }
>
>      @Test
>
>
>


-- 
http://people.apache.org/~britter/
http://www.systemoutprintln.de/
http://twitter.com/BenediktRitter
http://github.com/britter