You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by se...@apache.org on 2013/08/06 17:44:42 UTC
svn commit: r1511006 - in /commons/proper/csv/trunk/src:
main/java/org/apache/commons/csv/ test/java/org/apache/commons/csv/
Author: sebb
Date: Tue Aug 6 15:44:41 2013
New Revision: 1511006
URL: http://svn.apache.org/r1511006
Log:
Merge Lexer with CSVLexer
Removed:
commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/Lexer.java
Modified:
commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java
commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java
commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java
commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/PerformanceTest.java
Modified: commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java
URL: http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java?rev=1511006&r1=1511005&r2=1511006&view=diff
==============================================================================
--- commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java (original)
+++ commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java Tue Aug 6 15:44:41 2013
@@ -17,6 +17,13 @@
package org.apache.commons.csv;
+import static org.apache.commons.csv.Constants.BACKSPACE;
+import static org.apache.commons.csv.Constants.CR;
+import static org.apache.commons.csv.Constants.END_OF_STREAM;
+import static org.apache.commons.csv.Constants.FF;
+import static org.apache.commons.csv.Constants.LF;
+import static org.apache.commons.csv.Constants.TAB;
+import static org.apache.commons.csv.Constants.UNDEFINED;
import static org.apache.commons.csv.Token.Type.COMMENT;
import static org.apache.commons.csv.Token.Type.EOF;
import static org.apache.commons.csv.Token.Type.EORECORD;
@@ -30,11 +37,38 @@ import java.io.IOException;
*
* @version $Id$
*/
-final class CSVLexer extends Lexer {
+final class CSVLexer {
+
+ /**
+ * Constant char to use for disabling comments, escapes and encapsulation. The value U+FFFE (-2 when narrowed
+ * to a char) is used because it won't be confused with an EOF signal (-1), and because U+FFFE is a Unicode
+ * noncharacter, so there should never be a collision with a real text char.
+ */
+ private static final char DISABLED = '\ufffe';
+
+ private final char delimiter;
+ private final char escape;
+ private final char quoteChar;
+ private final char commmentStart;
+
+ final boolean ignoreSurroundingSpaces;
+ final boolean ignoreEmptyLines;
+
+ final CSVFormat format;
+
+ /** The input stream */
+ final ExtendedBufferedReader in;
/** INTERNAL API. ctor needs to be public so can be called dynamically by PerformanceTest class */
CSVLexer(final CSVFormat format, final ExtendedBufferedReader in) {
- super(format, in);
+ this.format = format;
+ this.in = in;
+ this.delimiter = format.getDelimiter();
+ this.escape = mapNullToDisabled(format.getEscape());
+ this.quoteChar = mapNullToDisabled(format.getQuoteChar());
+ this.commmentStart = mapNullToDisabled(format.getCommentStart());
+ this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
+ this.ignoreEmptyLines = format.getIgnoreEmptyLines();
}
/**
@@ -48,7 +82,6 @@ final class CSVLexer extends Lexer {
* @throws java.io.IOException
* on stream access error
*/
- @Override
Token nextToken(final Token token) throws IOException {
// get the last read char (required for empty line detection)
@@ -257,4 +290,144 @@ final class CSVLexer extends Lexer {
}
}
+ private final char mapNullToDisabled(final Character c) {
+ return c == null ? DISABLED : c.charValue();
+ }
+
+ /**
+ * Returns the current line number
+ *
+ * @return the current line number
+ */
+ long getCurrentLineNumber() {
+ return in.getCurrentLineNumber();
+ }
+
+ // TODO escape handling needs more work
+ /**
+ * Handle an escape sequence.
+ * The current character must be the escape character.
+ * On return, the next character is available by calling {@link ExtendedBufferedReader#getLastChar()}
+ * on the input stream.
+ *
+ * @return the unescaped character (as an int) or {@link Constants#END_OF_STREAM} if char following the escape is invalid.
+ * @throws IOException if there is a problem reading the stream or the end of stream is detected:
+ * the escape character is not allowed at end of stream
+ */
+ int readEscape() throws IOException {
+ // the escape char has just been read (normally a backslash)
+ final int ch = in.read();
+ switch (ch) {
+ case 'r':
+ return CR;
+ case 'n':
+ return LF;
+ case 't':
+ return TAB;
+ case 'b':
+ return BACKSPACE;
+ case 'f':
+ return FF;
+ case CR:
+ case LF:
+ case FF: // TODO is this correct?
+ case TAB: // TODO is this correct? Do tabs need to be escaped?
+ case BACKSPACE: // TODO is this correct?
+ return ch;
+ case END_OF_STREAM:
+ throw new IOException("EOF whilst processing escape sequence");
+ default:
+ // Now check for meta-characters
+ if (isMetaChar(ch)) {
+ return ch;
+ }
+ // indicate unexpected char - available from in.getLastChar()
+ return END_OF_STREAM;
+ }
+ }
+
+ void trimTrailingSpaces(final StringBuilder buffer) {
+ int length = buffer.length();
+ while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) {
+ length = length - 1;
+ }
+ if (length != buffer.length()) {
+ buffer.setLength(length);
+ }
+ }
+
+ /**
+ * Greedily accepts \n, \r and \r\n. This checker silently consumes the second control-character...
+ *
+ * @return true if the given or next character is a line-terminator
+ */
+ boolean readEndOfLine(int ch) throws IOException {
+ // check if we have \r\n...
+ if (ch == CR && in.lookAhead() == LF) {
+ // note: does not change ch outside of this method!
+ ch = in.read();
+ }
+ return ch == LF || ch == CR;
+ }
+
+ boolean isClosed() {
+ return in.isClosed();
+ }
+
+ /**
+ * @return true if the given char is a whitespace character
+ */
+ boolean isWhitespace(final int ch) {
+ return !isDelimiter(ch) && Character.isWhitespace((char) ch);
+ }
+
+ /**
+ * Checks if the current character represents the start of a line: a CR, LF or is at the start of the file.
+ *
+ * @param ch the character to check
+ * @return true if the character is at the start of a line.
+ */
+ boolean isStartOfLine(final int ch) {
+ return ch == LF || ch == CR || ch == UNDEFINED;
+ }
+
+ /**
+ * @return true if the given character indicates end of file
+ */
+ boolean isEndOfFile(final int ch) {
+ return ch == END_OF_STREAM;
+ }
+
+ boolean isDelimiter(final int ch) {
+ return ch == delimiter;
+ }
+
+ boolean isEscape(final int ch) {
+ return ch == escape;
+ }
+
+ boolean isQuoteChar(final int ch) {
+ return ch == quoteChar;
+ }
+
+ boolean isCommentStart(final int ch) {
+ return ch == commmentStart;
+ }
+
+ private boolean isMetaChar(final int ch) {
+ return ch == delimiter ||
+ ch == escape ||
+ ch == quoteChar ||
+ ch == commmentStart;
+ }
+
+ /**
+ * Closes resources.
+ *
+ * @throws IOException
+ * If an I/O error occurs
+ */
+ void close() throws IOException {
+ in.close();
+ }
}
Modified: commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java
URL: http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java?rev=1511006&r1=1511005&r2=1511006&view=diff
==============================================================================
--- commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java (original)
+++ commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java Tue Aug 6 15:44:41 2013
@@ -217,7 +217,7 @@ public final class CSVParser implements
private final CSVFormat format;
private final Map<String, Integer> headerMap;
- private final Lexer lexer;
+ private final CSVLexer lexer;
/** A record buffer for getRecord(). Grows as necessary and is reused. */
private final List<String> record = new ArrayList<String>();
Modified: commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java
URL: http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java?rev=1511006&r1=1511005&r2=1511006&view=diff
==============================================================================
--- commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java (original)
+++ commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java Tue Aug 6 15:44:41 2013
@@ -52,14 +52,14 @@ public class CSVLexerTest {
formatWithEscaping = CSVFormat.DEFAULT.withEscape('\\');
}
- private Lexer getLexer(final String input, final CSVFormat format) {
+ private CSVLexer getLexer(final String input, final CSVFormat format) {
return new CSVLexer(format, new ExtendedBufferedReader(new StringReader(input)));
}
@Test
public void testSurroundingSpacesAreDeleted() throws IOException {
final String code = "noSpaces, leadingSpaces,trailingSpaces , surroundingSpaces , ,,";
- final Lexer parser = getLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true));
+ final CSVLexer parser = getLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true));
assertThat(parser.nextToken(new Token()), matches(TOKEN, "noSpaces"));
assertThat(parser.nextToken(new Token()), matches(TOKEN, "leadingSpaces"));
assertThat(parser.nextToken(new Token()), matches(TOKEN, "trailingSpaces"));
@@ -72,7 +72,7 @@ public class CSVLexerTest {
@Test
public void testSurroundingTabsAreDeleted() throws IOException {
final String code = "noTabs,\tleadingTab,trailingTab\t,\tsurroundingTabs\t,\t\t,,";
- final Lexer parser = getLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true));
+ final CSVLexer parser = getLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true));
assertThat(parser.nextToken(new Token()), matches(TOKEN, "noTabs"));
assertThat(parser.nextToken(new Token()), matches(TOKEN, "leadingTab"));
assertThat(parser.nextToken(new Token()), matches(TOKEN, "trailingTab"));
@@ -99,7 +99,7 @@ public class CSVLexerTest {
"\n"+
"\n";
final CSVFormat format = CSVFormat.DEFAULT.withIgnoreEmptyLines(true);
- final Lexer parser = getLexer(code, format);
+ final CSVLexer parser = getLexer(code, format);
assertThat(parser.nextToken(new Token()), matches(TOKEN, "first"));
assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
@@ -123,7 +123,7 @@ public class CSVLexerTest {
"# penultimate comment\n"+
"# Final comment\n";
final CSVFormat format = CSVFormat.DEFAULT.withCommentStart('#');
- final Lexer parser = getLexer(code, format);
+ final CSVLexer parser = getLexer(code, format);
assertThat(parser.nextToken(new Token()), matches(TOKEN, "first"));
assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
@@ -161,7 +161,7 @@ public class CSVLexerTest {
final CSVFormat format = CSVFormat.DEFAULT.withCommentStart('#').withIgnoreEmptyLines(false);
assertFalse("Should not ignore empty lines", format.getIgnoreEmptyLines());
- final Lexer parser = getLexer(code, format);
+ final CSVLexer parser = getLexer(code, format);
assertThat(parser.nextToken(new Token()), matches(TOKEN, "1"));
@@ -199,7 +199,7 @@ public class CSVLexerTest {
final String code = "a,\\,,b\\\n\\,,";
final CSVFormat format = CSVFormat.DEFAULT;
assertFalse(format.isEscaping());
- final Lexer parser = getLexer(code, format);
+ final CSVLexer parser = getLexer(code, format);
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
// an unquoted single backslash is not an escape char
@@ -221,7 +221,7 @@ public class CSVLexerTest {
final String code = "a,\\,,b\\\\\n\\,,\\\nc,d\\\r\ne";
final CSVFormat format = formatWithEscaping.withIgnoreEmptyLines(false);
assertTrue(format.isEscaping());
- final Lexer parser = getLexer(code, format);
+ final CSVLexer parser = getLexer(code, format);
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
assertThat(parser.nextToken(new Token()), matches(TOKEN, ","));
@@ -241,7 +241,7 @@ public class CSVLexerTest {
* a, " foo " ,b
*/
final String code = "a,\"foo\",b\na, \" foo\",b\na,\"foo \" ,b\na, \" foo \" ,b";
- final Lexer parser = getLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true));
+ final CSVLexer parser = getLexer(code, CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true));
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo"));
assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
@@ -261,7 +261,7 @@ public class CSVLexerTest {
@Test
public void testNextToken5() throws IOException {
final String code = "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t \n\"";
- final Lexer parser = getLexer(code, CSVFormat.DEFAULT);
+ final CSVLexer parser = getLexer(code, CSVFormat.DEFAULT);
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo\n"));
assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
@@ -280,7 +280,7 @@ public class CSVLexerTest {
*/
final String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
final CSVFormat format = CSVFormat.DEFAULT.withQuoteChar('\'').withCommentStart('!').withDelimiter(';');
- final Lexer parser = getLexer(code, format);
+ final CSVLexer parser = getLexer(code, format);
assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
assertThat(parser.nextToken(new Token()), matches(EORECORD, "b and ' more\n"));
}
@@ -289,7 +289,7 @@ public class CSVLexerTest {
@Test
public void testDelimiterIsWhitespace() throws IOException {
final String code = "one\ttwo\t\tfour \t five\t six";
- final Lexer parser = getLexer(code, CSVFormat.TDF);
+ final CSVLexer parser = getLexer(code, CSVFormat.TDF);
assertThat(parser.nextToken(new Token()), matches(TOKEN, "one"));
assertThat(parser.nextToken(new Token()), matches(TOKEN, "two"));
assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
@@ -300,96 +300,96 @@ public class CSVLexerTest {
@Test
public void testEscapedCR() throws Exception {
- final Lexer lexer = getLexer("character\\" + CR + "Escaped", formatWithEscaping);
+ final CSVLexer lexer = getLexer("character\\" + CR + "Escaped", formatWithEscaping);
assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
}
@Test
public void testCR() throws Exception {
- final Lexer lexer = getLexer("character" + CR + "NotEscaped", formatWithEscaping);
+ final CSVLexer lexer = getLexer("character" + CR + "NotEscaped", formatWithEscaping);
assertThat(lexer.nextToken(new Token()), hasContent("character"));
assertThat(lexer.nextToken(new Token()), hasContent("NotEscaped"));
}
@Test
public void testEscapedLF() throws Exception {
- final Lexer lexer = getLexer("character\\" + LF + "Escaped", formatWithEscaping);
+ final CSVLexer lexer = getLexer("character\\" + LF + "Escaped", formatWithEscaping);
assertThat(lexer.nextToken(new Token()), hasContent("character" + LF + "Escaped"));
}
@Test
public void testLF() throws Exception {
- final Lexer lexer = getLexer("character" + LF + "NotEscaped", formatWithEscaping);
+ final CSVLexer lexer = getLexer("character" + LF + "NotEscaped", formatWithEscaping);
assertThat(lexer.nextToken(new Token()), hasContent("character"));
assertThat(lexer.nextToken(new Token()), hasContent("NotEscaped"));
}
@Test // TODO is this correct? Do we expect <esc>TAB to be unescaped?
public void testEscapedTab() throws Exception {
- final Lexer lexer = getLexer("character\\" + TAB + "Escaped", formatWithEscaping);
+ final CSVLexer lexer = getLexer("character\\" + TAB + "Escaped", formatWithEscaping);
assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "Escaped"));
}
@Test
public void testTab() throws Exception {
- final Lexer lexer = getLexer("character" + TAB + "NotEscaped", formatWithEscaping);
+ final CSVLexer lexer = getLexer("character" + TAB + "NotEscaped", formatWithEscaping);
assertThat(lexer.nextToken(new Token()), hasContent("character" + TAB + "NotEscaped"));
}
@Test // TODO is this correct? Do we expect <esc>BACKSPACE to be unescaped?
public void testEscapedBackspace() throws Exception {
- final Lexer lexer = getLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping);
+ final CSVLexer lexer = getLexer("character\\" + BACKSPACE + "Escaped", formatWithEscaping);
assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "Escaped"));
}
@Test
public void testBackspace() throws Exception {
- final Lexer lexer = getLexer("character" + BACKSPACE + "NotEscaped", formatWithEscaping);
+ final CSVLexer lexer = getLexer("character" + BACKSPACE + "NotEscaped", formatWithEscaping);
assertThat(lexer.nextToken(new Token()), hasContent("character" + BACKSPACE + "NotEscaped"));
}
@Test // TODO is this correct? Do we expect <esc>FF to be unescaped?
public void testEscapedFF() throws Exception {
- final Lexer lexer = getLexer("character\\" + FF + "Escaped", formatWithEscaping);
+ final CSVLexer lexer = getLexer("character\\" + FF + "Escaped", formatWithEscaping);
assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "Escaped"));
}
@Test
public void testFF() throws Exception {
- final Lexer lexer = getLexer("character" + FF + "NotEscaped", formatWithEscaping);
+ final CSVLexer lexer = getLexer("character" + FF + "NotEscaped", formatWithEscaping);
assertThat(lexer.nextToken(new Token()), hasContent("character" + FF + "NotEscaped"));
}
@Test
public void testEscapedMySqlNullValue() throws Exception {
// MySQL uses \N to symbolize null values. We have to restore this
- final Lexer lexer = getLexer("character\\NEscaped", formatWithEscaping);
+ final CSVLexer lexer = getLexer("character\\NEscaped", formatWithEscaping);
assertThat(lexer.nextToken(new Token()), hasContent("character\\NEscaped"));
}
@Test
public void testEscapedCharacter() throws Exception {
- final Lexer lexer = getLexer("character\\aEscaped", formatWithEscaping);
+ final CSVLexer lexer = getLexer("character\\aEscaped", formatWithEscaping);
assertThat(lexer.nextToken(new Token()), hasContent("character\\aEscaped"));
}
@Test
public void testEscapedControlCharacter() throws Exception {
// we are explicitly using an escape different from \ here
- final Lexer lexer = getLexer("character!rEscaped", CSVFormat.DEFAULT.withEscape('!'));
+ final CSVLexer lexer = getLexer("character!rEscaped", CSVFormat.DEFAULT.withEscape('!'));
assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
}
@Test
public void testEscapedControlCharacter2() throws Exception {
- final Lexer lexer = getLexer("character\\rEscaped", CSVFormat.DEFAULT.withEscape('\\'));
+ final CSVLexer lexer = getLexer("character\\rEscaped", CSVFormat.DEFAULT.withEscape('\\'));
assertThat(lexer.nextToken(new Token()), hasContent("character" + CR + "Escaped"));
}
@Test(expected = IOException.class)
public void testEscapingAtEOF() throws Exception {
final String code = "escaping at EOF is evil\\";
- final Lexer lexer = getLexer(code, formatWithEscaping);
+ final CSVLexer lexer = getLexer(code, formatWithEscaping);
lexer.nextToken(new Token());
}
Modified: commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/PerformanceTest.java
URL: http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/PerformanceTest.java?rev=1511006&r1=1511005&r2=1511006&view=diff
==============================================================================
--- commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/PerformanceTest.java (original)
+++ commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/PerformanceTest.java Tue Aug 6 15:44:41 2013
@@ -224,9 +224,9 @@ public class PerformanceTest {
}
- private static Constructor<Lexer> getLexerCtor(final String clazz) throws Exception {
+ private static Constructor<CSVLexer> getLexerCtor(final String clazz) throws Exception {
@SuppressWarnings("unchecked")
- final Class<Lexer> lexer = (Class<Lexer>) Class.forName("org.apache.commons.csv." + clazz);
+ final Class<CSVLexer> lexer = (Class<CSVLexer>) Class.forName("org.apache.commons.csv." + clazz);
return lexer.getConstructor(new Class<?>[]{CSVFormat.class, ExtendedBufferedReader.class});
}
@@ -235,7 +235,7 @@ public class PerformanceTest {
String dynamic = "";
for (int i = 0; i < max; i++) {
final ExtendedBufferedReader input = new ExtendedBufferedReader(getReader());
- Lexer lexer = null;
+ CSVLexer lexer = null;
if (test.startsWith("CSVLexer")) {
dynamic="!";
lexer = getLexerCtor(test).newInstance(new Object[]{format, input});
Re: svn commit: r1511006 - in /commons/proper/csv/trunk/src:
main/java/org/apache/commons/csv/ test/java/org/apache/commons/csv/
Posted by Gary Gregory <ga...@gmail.com>.
It looks like now all the CSVLexer ivars can be private.
Gary
On Tue, Aug 6, 2013 at 11:44 AM, <se...@apache.org> wrote:
> Author: sebb
> Date: Tue Aug 6 15:44:41 2013
> New Revision: 1511006
>
> URL: http://svn.apache.org/r1511006
> Log:
> Merge Lexer with CSVLexer
>
> Removed:
>
> commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/Lexer.java
> Modified:
>
> commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java
>
> commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java
>
> commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java
>
> commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/PerformanceTest.java
>
> Modified:
> commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java
> URL:
> http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java?rev=1511006&r1=1511005&r2=1511006&view=diff
>
> ==============================================================================
> ---
> commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java
> (original)
> +++
> commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVLexer.java
> Tue Aug 6 15:44:41 2013
> @@ -17,6 +17,13 @@
>
> package org.apache.commons.csv;
>
> +import static org.apache.commons.csv.Constants.BACKSPACE;
> +import static org.apache.commons.csv.Constants.CR;
> +import static org.apache.commons.csv.Constants.END_OF_STREAM;
> +import static org.apache.commons.csv.Constants.FF;
> +import static org.apache.commons.csv.Constants.LF;
> +import static org.apache.commons.csv.Constants.TAB;
> +import static org.apache.commons.csv.Constants.UNDEFINED;
> import static org.apache.commons.csv.Token.Type.COMMENT;
> import static org.apache.commons.csv.Token.Type.EOF;
> import static org.apache.commons.csv.Token.Type.EORECORD;
> @@ -30,11 +37,38 @@ import java.io.IOException;
> *
> * @version $Id$
> */
> -final class CSVLexer extends Lexer {
> +final class CSVLexer {
> +
> + /**
> + * Constant char to use for disabling comments, escapes and
> encapsulation. The value -2 is used because it
> + * won't be confused with an EOF signal (-1), and because the Unicode
> value {@code FFFE} would be encoded as two
> + * chars (using surrogates) and thus there should never be a
> collision with a real text char.
> + */
> + private static final char DISABLED = '\ufffe';
> +
> + private final char delimiter;
> + private final char escape;
> + private final char quoteChar;
> + private final char commmentStart;
> +
> + final boolean ignoreSurroundingSpaces;
> + final boolean ignoreEmptyLines;
> +
> + final CSVFormat format;
> +
> + /** The input stream */
> + final ExtendedBufferedReader in;
>
> /** INTERNAL API. ctor needs to be public so can be called
> dynamically by PerformanceTest class */
> CSVLexer(final CSVFormat format, final ExtendedBufferedReader in) {
> - super(format, in);
> + this.format = format;
> + this.in = in;
> + this.delimiter = format.getDelimiter();
> + this.escape = mapNullToDisabled(format.getEscape());
> + this.quoteChar = mapNullToDisabled(format.getQuoteChar());
> + this.commmentStart = mapNullToDisabled(format.getCommentStart());
> + this.ignoreSurroundingSpaces =
> format.getIgnoreSurroundingSpaces();
> + this.ignoreEmptyLines = format.getIgnoreEmptyLines();
> }
>
> /**
> @@ -48,7 +82,6 @@ final class CSVLexer extends Lexer {
> * @throws java.io.IOException
> * on stream access error
> */
> - @Override
> Token nextToken(final Token token) throws IOException {
>
> // get the last read char (required for empty line detection)
> @@ -257,4 +290,144 @@ final class CSVLexer extends Lexer {
> }
> }
>
> + private final char mapNullToDisabled(final Character c) {
> + return c == null ? DISABLED : c.charValue();
> + }
> +
> + /**
> + * Returns the current line number
> + *
> + * @return the current line number
> + */
> + long getCurrentLineNumber() {
> + return in.getCurrentLineNumber();
> + }
> +
> + // TODO escape handling needs more work
> + /**
> + * Handle an escape sequence.
> + * The current character must be the escape character.
> + * On return, the next character is available by calling {@link
> ExtendedBufferedReader#getLastChar()}
> + * on the input stream.
> + *
> + * @return the unescaped character (as an int) or {@link
> END_OF_STREAM} if char following the escape is invalid.
> + * @throws IOException if there is a problem reading the stream or
> the end of stream is detected:
> + * the escape character is not allowed at end of stream
> + */
> + int readEscape() throws IOException {
> + // the escape char has just been read (normally a backslash)
> + final int ch = in.read();
> + switch (ch) {
> + case 'r':
> + return CR;
> + case 'n':
> + return LF;
> + case 't':
> + return TAB;
> + case 'b':
> + return BACKSPACE;
> + case 'f':
> + return FF;
> + case CR:
> + case LF:
> + case FF: // TODO is this correct?
> + case TAB: // TODO is this correct? Do tabs need to be escaped?
> + case BACKSPACE: // TODO is this correct?
> + return ch;
> + case END_OF_STREAM:
> + throw new IOException("EOF whilst processing escape
> sequence");
> + default:
> + // Now check for meta-characters
> + if (isMetaChar(ch)) {
> + return ch;
> + }
> + // indicate unexpected char - available from in.getLastChar()
> + return END_OF_STREAM;
> + }
> + }
> +
> + void trimTrailingSpaces(final StringBuilder buffer) {
> + int length = buffer.length();
> + while (length > 0 && Character.isWhitespace(buffer.charAt(length
> - 1))) {
> + length = length - 1;
> + }
> + if (length != buffer.length()) {
> + buffer.setLength(length);
> + }
> + }
> +
> + /**
> + * Greedily accepts \n, \r and \r\n This checker consumes silently
> the second control-character...
> + *
> + * @return true if the given or next character is a line-terminator
> + */
> + boolean readEndOfLine(int ch) throws IOException {
> + // check if we have \r\n...
> + if (ch == CR && in.lookAhead() == LF) {
> + // note: does not change ch outside of this method!
> + ch = in.read();
> + }
> + return ch == LF || ch == CR;
> + }
> +
> + boolean isClosed() {
> + return in.isClosed();
> + }
> +
> + /**
> + * @return true if the given char is a whitespace character
> + */
> + boolean isWhitespace(final int ch) {
> + return !isDelimiter(ch) && Character.isWhitespace((char) ch);
> + }
> +
> + /**
> + * Checks if the current character represents the start of a line: a
> CR, LF or is at the start of the file.
> + *
> + * @param ch the character to check
> + * @return true if the character is at the start of a line.
> + */
> + boolean isStartOfLine(final int ch) {
> + return ch == LF || ch == CR || ch == UNDEFINED;
> + }
> +
> + /**
> + * @return true if the given character indicates end of file
> + */
> + boolean isEndOfFile(final int ch) {
> + return ch == END_OF_STREAM;
> + }
> +
> + boolean isDelimiter(final int ch) {
> + return ch == delimiter;
> + }
> +
> + boolean isEscape(final int ch) {
> + return ch == escape;
> + }
> +
> + boolean isQuoteChar(final int ch) {
> + return ch == quoteChar;
> + }
> +
> + boolean isCommentStart(final int ch) {
> + return ch == commmentStart;
> + }
> +
> + private boolean isMetaChar(final int ch) {
> + return ch == delimiter ||
> + ch == escape ||
> + ch == quoteChar ||
> + ch == commmentStart;
> + }
> +
> + /**
> + * Closes resources.
> + *
> + * @throws IOException
> + * If an I/O error occurs
> + */
> + void close() throws IOException {
> + in.close();
> + }
> }
>
> Modified:
> commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java
> URL:
> http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java?rev=1511006&r1=1511005&r2=1511006&view=diff
>
> ==============================================================================
> ---
> commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java
> (original)
> +++
> commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java
> Tue Aug 6 15:44:41 2013
> @@ -217,7 +217,7 @@ public final class CSVParser implements
> private final CSVFormat format;
> private final Map<String, Integer> headerMap;
>
> - private final Lexer lexer;
> + private final CSVLexer lexer;
>
> /** A record buffer for getRecord(). Grows as necessary and is
> reused. */
> private final List<String> record = new ArrayList<String>();
>
> Modified:
> commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java
> URL:
> http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java?rev=1511006&r1=1511005&r2=1511006&view=diff
>
> ==============================================================================
> ---
> commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java
> (original)
> +++
> commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVLexerTest.java
> Tue Aug 6 15:44:41 2013
> @@ -52,14 +52,14 @@ public class CSVLexerTest {
> formatWithEscaping = CSVFormat.DEFAULT.withEscape('\\');
> }
>
> - private Lexer getLexer(final String input, final CSVFormat format) {
> + private CSVLexer getLexer(final String input, final CSVFormat format)
> {
> return new CSVLexer(format, new ExtendedBufferedReader(new
> StringReader(input)));
> }
>
> @Test
> public void testSurroundingSpacesAreDeleted() throws IOException {
> final String code = "noSpaces, leadingSpaces,trailingSpaces ,
> surroundingSpaces , ,,";
> - final Lexer parser = getLexer(code,
> CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true));
> + final CSVLexer parser = getLexer(code,
> CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true));
> assertThat(parser.nextToken(new Token()), matches(TOKEN,
> "noSpaces"));
> assertThat(parser.nextToken(new Token()), matches(TOKEN,
> "leadingSpaces"));
> assertThat(parser.nextToken(new Token()), matches(TOKEN,
> "trailingSpaces"));
> @@ -72,7 +72,7 @@ public class CSVLexerTest {
> @Test
> public void testSurroundingTabsAreDeleted() throws IOException {
> final String code =
> "noTabs,\tleadingTab,trailingTab\t,\tsurroundingTabs\t,\t\t,,";
> - final Lexer parser = getLexer(code,
> CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true));
> + final CSVLexer parser = getLexer(code,
> CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true));
> assertThat(parser.nextToken(new Token()), matches(TOKEN,
> "noTabs"));
> assertThat(parser.nextToken(new Token()), matches(TOKEN,
> "leadingTab"));
> assertThat(parser.nextToken(new Token()), matches(TOKEN,
> "trailingTab"));
> @@ -99,7 +99,7 @@ public class CSVLexerTest {
> "\n"+
> "\n";
> final CSVFormat format =
> CSVFormat.DEFAULT.withIgnoreEmptyLines(true);
> - final Lexer parser = getLexer(code, format);
> + final CSVLexer parser = getLexer(code, format);
>
> assertThat(parser.nextToken(new Token()), matches(TOKEN,
> "first"));
> assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
> @@ -123,7 +123,7 @@ public class CSVLexerTest {
> "# penultimate comment\n"+
> "# Final comment\n";
> final CSVFormat format = CSVFormat.DEFAULT.withCommentStart('#');
> - final Lexer parser = getLexer(code, format);
> + final CSVLexer parser = getLexer(code, format);
>
> assertThat(parser.nextToken(new Token()), matches(TOKEN,
> "first"));
> assertThat(parser.nextToken(new Token()), matches(TOKEN, "line"));
> @@ -161,7 +161,7 @@ public class CSVLexerTest {
> final CSVFormat format =
> CSVFormat.DEFAULT.withCommentStart('#').withIgnoreEmptyLines(false);
> assertFalse("Should not ignore empty lines",
> format.getIgnoreEmptyLines());
>
> - final Lexer parser = getLexer(code, format);
> + final CSVLexer parser = getLexer(code, format);
>
>
> assertThat(parser.nextToken(new Token()), matches(TOKEN, "1"));
> @@ -199,7 +199,7 @@ public class CSVLexerTest {
> final String code = "a,\\,,b\\\n\\,,";
> final CSVFormat format = CSVFormat.DEFAULT;
> assertFalse(format.isEscaping());
> - final Lexer parser = getLexer(code, format);
> + final CSVLexer parser = getLexer(code, format);
>
> assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
> // an unquoted single backslash is not an escape char
> @@ -221,7 +221,7 @@ public class CSVLexerTest {
> final String code = "a,\\,,b\\\\\n\\,,\\\nc,d\\\r\ne";
> final CSVFormat format =
> formatWithEscaping.withIgnoreEmptyLines(false);
> assertTrue(format.isEscaping());
> - final Lexer parser = getLexer(code, format);
> + final CSVLexer parser = getLexer(code, format);
>
> assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
> assertThat(parser.nextToken(new Token()), matches(TOKEN, ","));
> @@ -241,7 +241,7 @@ public class CSVLexerTest {
> * a, " foo " ,b
> */
> final String code = "a,\"foo\",b\na, \" foo\",b\na,\"foo \"
> ,b\na, \" foo \" ,b";
> - final Lexer parser = getLexer(code,
> CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true));
> + final CSVLexer parser = getLexer(code,
> CSVFormat.DEFAULT.withIgnoreSurroundingSpaces(true));
> assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
> assertThat(parser.nextToken(new Token()), matches(TOKEN, "foo"));
> assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
> @@ -261,7 +261,7 @@ public class CSVLexerTest {
> @Test
> public void testNextToken5() throws IOException {
> final String code = "a,\"foo\n\",b\n\"foo\n baar ,,,\"\n\"\n\t
> \n\"";
> - final Lexer parser = getLexer(code, CSVFormat.DEFAULT);
> + final CSVLexer parser = getLexer(code, CSVFormat.DEFAULT);
> assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
> assertThat(parser.nextToken(new Token()), matches(TOKEN,
> "foo\n"));
> assertThat(parser.nextToken(new Token()), matches(EORECORD, "b"));
> @@ -280,7 +280,7 @@ public class CSVLexerTest {
> */
> final String code = "a;'b and '' more\n'\n!comment;;;;\n;;";
> final CSVFormat format =
> CSVFormat.DEFAULT.withQuoteChar('\'').withCommentStart('!').withDelimiter(';');
> - final Lexer parser = getLexer(code, format);
> + final CSVLexer parser = getLexer(code, format);
> assertThat(parser.nextToken(new Token()), matches(TOKEN, "a"));
> assertThat(parser.nextToken(new Token()), matches(EORECORD, "b
> and ' more\n"));
> }
> @@ -289,7 +289,7 @@ public class CSVLexerTest {
> @Test
> public void testDelimiterIsWhitespace() throws IOException {
> final String code = "one\ttwo\t\tfour \t five\t six";
> - final Lexer parser = getLexer(code, CSVFormat.TDF);
> + final CSVLexer parser = getLexer(code, CSVFormat.TDF);
> assertThat(parser.nextToken(new Token()), matches(TOKEN, "one"));
> assertThat(parser.nextToken(new Token()), matches(TOKEN, "two"));
> assertThat(parser.nextToken(new Token()), matches(TOKEN, ""));
> @@ -300,96 +300,96 @@ public class CSVLexerTest {
>
> @Test
> public void testEscapedCR() throws Exception {
> - final Lexer lexer = getLexer("character\\" + CR + "Escaped",
> formatWithEscaping);
> + final CSVLexer lexer = getLexer("character\\" + CR + "Escaped",
> formatWithEscaping);
> assertThat(lexer.nextToken(new Token()), hasContent("character" +
> CR + "Escaped"));
> }
>
> @Test
> public void testCR() throws Exception {
> - final Lexer lexer = getLexer("character" + CR + "NotEscaped",
> formatWithEscaping);
> + final CSVLexer lexer = getLexer("character" + CR + "NotEscaped",
> formatWithEscaping);
> assertThat(lexer.nextToken(new Token()), hasContent("character"));
> assertThat(lexer.nextToken(new Token()),
> hasContent("NotEscaped"));
> }
>
> @Test
> public void testEscapedLF() throws Exception {
> - final Lexer lexer = getLexer("character\\" + LF + "Escaped",
> formatWithEscaping);
> + final CSVLexer lexer = getLexer("character\\" + LF + "Escaped",
> formatWithEscaping);
> assertThat(lexer.nextToken(new Token()), hasContent("character" +
> LF + "Escaped"));
> }
>
> @Test
> public void testLF() throws Exception {
> - final Lexer lexer = getLexer("character" + LF + "NotEscaped",
> formatWithEscaping);
> + final CSVLexer lexer = getLexer("character" + LF + "NotEscaped",
> formatWithEscaping);
> assertThat(lexer.nextToken(new Token()), hasContent("character"));
> assertThat(lexer.nextToken(new Token()),
> hasContent("NotEscaped"));
> }
>
> @Test // TODO is this correct? Do we expect <esc>TAB to be unescaped?
> public void testEscapedTab() throws Exception {
> - final Lexer lexer = getLexer("character\\" + TAB + "Escaped",
> formatWithEscaping);
> + final CSVLexer lexer = getLexer("character\\" + TAB + "Escaped",
> formatWithEscaping);
> assertThat(lexer.nextToken(new Token()), hasContent("character" +
> TAB + "Escaped"));
> }
>
> @Test
> public void testTab() throws Exception {
> - final Lexer lexer = getLexer("character" + TAB + "NotEscaped",
> formatWithEscaping);
> + final CSVLexer lexer = getLexer("character" + TAB + "NotEscaped",
> formatWithEscaping);
> assertThat(lexer.nextToken(new Token()), hasContent("character" +
> TAB + "NotEscaped"));
> }
>
> @Test // TODO is this correct? Do we expect <esc>BACKSPACE to be
> unescaped?
> public void testEscapedBackspace() throws Exception {
> - final Lexer lexer = getLexer("character\\" + BACKSPACE +
> "Escaped", formatWithEscaping);
> + final CSVLexer lexer = getLexer("character\\" + BACKSPACE +
> "Escaped", formatWithEscaping);
> assertThat(lexer.nextToken(new Token()), hasContent("character" +
> BACKSPACE + "Escaped"));
> }
>
> @Test
> public void testBackspace() throws Exception {
> - final Lexer lexer = getLexer("character" + BACKSPACE +
> "NotEscaped", formatWithEscaping);
> + final CSVLexer lexer = getLexer("character" + BACKSPACE +
> "NotEscaped", formatWithEscaping);
> assertThat(lexer.nextToken(new Token()), hasContent("character" +
> BACKSPACE + "NotEscaped"));
> }
>
> @Test // TODO is this correct? Do we expect <esc>FF to be unescaped?
> public void testEscapedFF() throws Exception {
> - final Lexer lexer = getLexer("character\\" + FF + "Escaped",
> formatWithEscaping);
> + final CSVLexer lexer = getLexer("character\\" + FF + "Escaped",
> formatWithEscaping);
> assertThat(lexer.nextToken(new Token()), hasContent("character" +
> FF + "Escaped"));
> }
>
> @Test
> public void testFF() throws Exception {
> - final Lexer lexer = getLexer("character" + FF + "NotEscaped",
> formatWithEscaping);
> + final CSVLexer lexer = getLexer("character" + FF + "NotEscaped",
> formatWithEscaping);
> assertThat(lexer.nextToken(new Token()), hasContent("character" +
> FF + "NotEscaped"));
> }
>
> @Test
> public void testEscapedMySqlNullValue() throws Exception {
> // MySQL uses \N to symbolize null values. We have to restore this
> - final Lexer lexer = getLexer("character\\NEscaped",
> formatWithEscaping);
> + final CSVLexer lexer = getLexer("character\\NEscaped",
> formatWithEscaping);
> assertThat(lexer.nextToken(new Token()),
> hasContent("character\\NEscaped"));
> }
>
> @Test
> public void testEscapedCharacter() throws Exception {
> - final Lexer lexer = getLexer("character\\aEscaped",
> formatWithEscaping);
> + final CSVLexer lexer = getLexer("character\\aEscaped",
> formatWithEscaping);
> assertThat(lexer.nextToken(new Token()),
> hasContent("character\\aEscaped"));
> }
>
> @Test
> public void testEscapedControlCharacter() throws Exception {
> // we are explicitly using an escape different from \ here
> - final Lexer lexer = getLexer("character!rEscaped",
> CSVFormat.DEFAULT.withEscape('!'));
> + final CSVLexer lexer = getLexer("character!rEscaped",
> CSVFormat.DEFAULT.withEscape('!'));
> assertThat(lexer.nextToken(new Token()), hasContent("character" +
> CR + "Escaped"));
> }
>
> @Test
> public void testEscapedControlCharacter2() throws Exception {
> - final Lexer lexer = getLexer("character\\rEscaped",
> CSVFormat.DEFAULT.withEscape('\\'));
> + final CSVLexer lexer = getLexer("character\\rEscaped",
> CSVFormat.DEFAULT.withEscape('\\'));
> assertThat(lexer.nextToken(new Token()), hasContent("character" +
> CR + "Escaped"));
> }
>
> @Test(expected = IOException.class)
> public void testEscapingAtEOF() throws Exception {
> final String code = "escaping at EOF is evil\\";
> - final Lexer lexer = getLexer(code, formatWithEscaping);
> + final CSVLexer lexer = getLexer(code, formatWithEscaping);
>
> lexer.nextToken(new Token());
> }
>
> Modified:
> commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/PerformanceTest.java
> URL:
> http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/PerformanceTest.java?rev=1511006&r1=1511005&r2=1511006&view=diff
>
> ==============================================================================
> ---
> commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/PerformanceTest.java
> (original)
> +++
> commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/PerformanceTest.java
> Tue Aug 6 15:44:41 2013
> @@ -224,9 +224,9 @@ public class PerformanceTest {
> }
>
>
> - private static Constructor<Lexer> getLexerCtor(final String clazz)
> throws Exception {
> + private static Constructor<CSVLexer> getLexerCtor(final String clazz)
> throws Exception {
> @SuppressWarnings("unchecked")
> - final Class<Lexer> lexer = (Class<Lexer>)
> Class.forName("org.apache.commons.csv." + clazz);
> + final Class<CSVLexer> lexer = (Class<CSVLexer>)
> Class.forName("org.apache.commons.csv." + clazz);
> return lexer.getConstructor(new Class<?>[]{CSVFormat.class,
> ExtendedBufferedReader.class});
> }
>
> @@ -235,7 +235,7 @@ public class PerformanceTest {
> String dynamic = "";
> for (int i = 0; i < max; i++) {
> final ExtendedBufferedReader input = new
> ExtendedBufferedReader(getReader());
> - Lexer lexer = null;
> + CSVLexer lexer = null;
> if (test.startsWith("CSVLexer")) {
> dynamic="!";
> lexer = getLexerCtor(test).newInstance(new
> Object[]{format, input});
>
>
>
--
E-Mail: garydgregory@gmail.com | ggregory@apache.org
Java Persistence with Hibernate, Second Edition <http://www.manning.com/bauer3/>
JUnit in Action, Second Edition <http://www.manning.com/tahchiev/>
Spring Batch in Action <http://www.manning.com/templier/>
Blog: http://garygregory.wordpress.com
Home: http://garygregory.com/
Tweet! http://twitter.com/GaryGregory