You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@metamodel.apache.org by ka...@apache.org on 2013/08/02 14:24:04 UTC
[2/9] git commit: Performance improvement to CSV reading when values
are only single line based.
Performance improvement to CSV reading when values are only single line
based.
Project: http://git-wip-us.apache.org/repos/asf/incubator-metamodel/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-metamodel/commit/72d0608b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-metamodel/tree/72d0608b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-metamodel/diff/72d0608b
Branch: refs/heads/master
Commit: 72d0608b2ba1dd3878bcef3eb51e6369c4ccdaf3
Parents: 10b5b77
Author: kaspers <ka...@kaspers-think.humaninference.com>
Authored: Fri Jul 26 10:44:29 2013 +0200
Committer: kaspers <ka...@kaspers-think.humaninference.com>
Committed: Fri Jul 26 10:44:29 2013 +0200
----------------------------------------------------------------------
.../apache/metamodel/csv/CsvConfiguration.java | 270 ++++++++++---------
.../apache/metamodel/csv/CsvDataContext.java | 43 ++-
.../org/apache/metamodel/csv/CsvDataSet.java | 2 +-
.../metamodel/csv/SingleLineCsvDataSet.java | 124 +++++++++
.../apache/metamodel/csv/SingleLineCsvRow.java | 57 ++++
.../metamodel/csv/CsvBigFileMemoryTest.java | 173 +++++++-----
6 files changed, 463 insertions(+), 206 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-metamodel/blob/72d0608b/csv/src/main/java/org/apache/metamodel/csv/CsvConfiguration.java
----------------------------------------------------------------------
diff --git a/csv/src/main/java/org/apache/metamodel/csv/CsvConfiguration.java b/csv/src/main/java/org/apache/metamodel/csv/CsvConfiguration.java
index e2cd846..1662eab 100644
--- a/csv/src/main/java/org/apache/metamodel/csv/CsvConfiguration.java
+++ b/csv/src/main/java/org/apache/metamodel/csv/CsvConfiguration.java
@@ -31,130 +31,148 @@ import org.apache.metamodel.util.FileHelper;
*/
public final class CsvConfiguration extends BaseObject implements Serializable {
- private static final long serialVersionUID = 1L;
-
- /**
- * The value is '\\uFFFF', the "not a character" value which should not
- * occur in any valid Unicode string. This special char can be used to
- * disable either quote chars or escape chars.
- */
- public static final char NOT_A_CHAR = '\uFFFF';
- public static final int NO_COLUMN_NAME_LINE = 0;
- public static final int DEFAULT_COLUMN_NAME_LINE = 1;
- public static final char DEFAULT_SEPARATOR_CHAR = ',';
- public static final char DEFAULT_QUOTE_CHAR = '"';
- public static final char DEFAULT_ESCAPE_CHAR = '\\';
-
- private final int columnNameLineNumber;
- private final String encoding;
- private final char separatorChar;
- private final char quoteChar;
- private final char escapeChar;
- private final boolean failOnInconsistentRowLength;
-
- public CsvConfiguration() {
- this(DEFAULT_COLUMN_NAME_LINE);
- }
-
- public CsvConfiguration(int columnNameLineNumber) {
- this(columnNameLineNumber, FileHelper.DEFAULT_ENCODING,
- DEFAULT_SEPARATOR_CHAR, DEFAULT_QUOTE_CHAR, DEFAULT_ESCAPE_CHAR);
- }
-
- public CsvConfiguration(int columnNameLineNumber, String encoding,
- char separatorChar, char quoteChar, char escapeChar) {
- this(columnNameLineNumber, encoding, separatorChar, quoteChar,
- escapeChar, false);
- }
-
- public CsvConfiguration(int columnNameLineNumber, String encoding,
- char separatorChar, char quoteChar, char escapeChar,
- boolean failOnInconsistentRowLength) {
- this.columnNameLineNumber = columnNameLineNumber;
- this.encoding = encoding;
- this.separatorChar = separatorChar;
- this.quoteChar = quoteChar;
- this.escapeChar = escapeChar;
- this.failOnInconsistentRowLength = failOnInconsistentRowLength;
- }
-
- /**
- * Determines whether to fail (by throwing an
- * {@link InconsistentRowLengthException}) if a line in the CSV file has
- * inconsistent amounts of columns.
- *
- * If set to false (default) MetaModel will gracefully fill in missing null
- * values in or ignore additional values in a line.
- *
- * @return a boolean indicating whether to fail or gracefully compensate for
- * inconsistent lines in the CSV files.
- */
- public boolean isFailOnInconsistentRowLength() {
- return failOnInconsistentRowLength;
- }
-
- /**
- * The line number (1 based) from which to get the names of the columns.
- *
- * @return the line number (1 based)
- */
- public int getColumnNameLineNumber() {
- return columnNameLineNumber;
- }
-
- /**
- * Gets the file encoding to use for reading the file.
- *
- * @return the text encoding of the file.
- */
- public String getEncoding() {
- return encoding;
- }
-
- /**
- * Gets the separator char (typically comma or semicolon) for separating
- * values.
- *
- * @return the separator char
- */
- public char getSeparatorChar() {
- return separatorChar;
- }
-
- /**
- * Gets the quote char, used for encapsulating values.
- *
- * @return the quote char
- */
- public char getQuoteChar() {
- return quoteChar;
- }
-
- /**
- * Gets the escape char, used for escaping eg. quote chars inside values.
- *
- * @return the escape char
- */
- public char getEscapeChar() {
- return escapeChar;
- }
-
- @Override
- protected void decorateIdentity(List<Object> identifiers) {
- identifiers.add(columnNameLineNumber);
- identifiers.add(encoding);
- identifiers.add(separatorChar);
- identifiers.add(quoteChar);
- identifiers.add(escapeChar);
- identifiers.add(failOnInconsistentRowLength);
- }
-
- @Override
- public String toString() {
- return "CsvConfiguration[columnNameLineNumber=" + columnNameLineNumber
- + ", encoding=" + encoding + ", separatorChar=" + separatorChar
- + ", quoteChar=" + quoteChar + ", escapeChar=" + escapeChar
- + ", failOnInconsistentRowLength="
- + failOnInconsistentRowLength + "]";
- }
+ private static final long serialVersionUID = 1L;
+
+ /**
+ * The value is '\\uFFFF', the "not a character" value which should not
+ * occur in any valid Unicode string. This special char can be used to
+ * disable either quote chars or escape chars.
+ */
+ public static final char NOT_A_CHAR = '\uFFFF';
+ // NOTE(review): presumably means "the file has no column name line" - confirm against callers.
+ public static final int NO_COLUMN_NAME_LINE = 0;
+ // Defaults used by the constructors that do not take the corresponding argument.
+ public static final int DEFAULT_COLUMN_NAME_LINE = 1;
+ public static final char DEFAULT_SEPARATOR_CHAR = ',';
+ public static final char DEFAULT_QUOTE_CHAR = '"';
+ public static final char DEFAULT_ESCAPE_CHAR = '\\';
+
+ // Immutable configuration state; every field is assigned once in the
+ // 7-argument constructor.
+ private final int columnNameLineNumber;
+ private final String encoding;
+ private final char separatorChar;
+ private final char quoteChar;
+ private final char escapeChar;
+ private final boolean failOnInconsistentRowLength;
+ // Whether values may span several lines; exposed through isMultilineValues().
+ private final boolean multilineValues;
+
+ /**
+ * Creates a configuration that uses the default value for every setting.
+ */
+ public CsvConfiguration() {
+ this(DEFAULT_COLUMN_NAME_LINE, FileHelper.DEFAULT_ENCODING, DEFAULT_SEPARATOR_CHAR, DEFAULT_QUOTE_CHAR,
+ DEFAULT_ESCAPE_CHAR, false, true);
+ }
+
+ /**
+ * Creates a configuration with a custom column name line and defaults for
+ * everything else.
+ *
+ * @param columnNameLineNumber
+ * the line number (1 based) holding the column names
+ */
+ public CsvConfiguration(int columnNameLineNumber) {
+ this(columnNameLineNumber, FileHelper.DEFAULT_ENCODING, DEFAULT_SEPARATOR_CHAR, DEFAULT_QUOTE_CHAR,
+ DEFAULT_ESCAPE_CHAR, false, true);
+ }
+
+ /**
+ * Creates a configuration with default encoding, separator, quote and
+ * escape chars, but explicit row length and multiline behaviour.
+ *
+ * @param columnNameLineNumber
+ * the line number (1 based) holding the column names
+ * @param failOnInconsistentRowLength
+ * whether lines with an unexpected amount of values cause a
+ * failure
+ * @param multilineValues
+ * whether values are allowed to span multiple lines
+ */
+ public CsvConfiguration(int columnNameLineNumber, boolean failOnInconsistentRowLength, boolean multilineValues) {
+ this(columnNameLineNumber, FileHelper.DEFAULT_ENCODING, DEFAULT_SEPARATOR_CHAR, DEFAULT_QUOTE_CHAR,
+ DEFAULT_ESCAPE_CHAR, failOnInconsistentRowLength, multilineValues);
+ }
+
+ /**
+ * Creates a configuration with explicit parsing characters. Inconsistent
+ * row lengths are tolerated and multiline values are enabled.
+ */
+ public CsvConfiguration(int columnNameLineNumber, String encoding, char separatorChar, char quoteChar,
+ char escapeChar) {
+ this(columnNameLineNumber, encoding, separatorChar, quoteChar, escapeChar, false, true);
+ }
+
+ /**
+ * Creates a configuration with explicit parsing characters and row length
+ * behaviour. Multiline values are enabled.
+ */
+ public CsvConfiguration(int columnNameLineNumber, String encoding, char separatorChar, char quoteChar,
+ char escapeChar, boolean failOnInconsistentRowLength) {
+ this(columnNameLineNumber, encoding, separatorChar, quoteChar, escapeChar, failOnInconsistentRowLength, true);
+ }
+
+ /**
+ * Canonical constructor; all other constructors end up here.
+ *
+ * @param columnNameLineNumber
+ * the line number (1 based) holding the column names
+ * @param encoding
+ * the text encoding of the file
+ * @param separatorChar
+ * the char separating values
+ * @param quoteChar
+ * the char encapsulating values
+ * @param escapeChar
+ * the char used for escaping inside values
+ * @param failOnInconsistentRowLength
+ * whether lines with an unexpected amount of values cause a
+ * failure
+ * @param multilineValues
+ * whether values are allowed to span multiple lines
+ */
+ public CsvConfiguration(int columnNameLineNumber, String encoding, char separatorChar, char quoteChar,
+ char escapeChar, boolean failOnInconsistentRowLength, boolean multilineValues) {
+ this.multilineValues = multilineValues;
+ this.failOnInconsistentRowLength = failOnInconsistentRowLength;
+ this.escapeChar = escapeChar;
+ this.quoteChar = quoteChar;
+ this.separatorChar = separatorChar;
+ this.encoding = encoding;
+ this.columnNameLineNumber = columnNameLineNumber;
+ }
+
+ /**
+ * Tells whether reading should fail (with an
+ * {@link InconsistentRowLengthException}) when a line of the CSV file does
+ * not hold the expected amount of values.
+ *
+ * When false (the default), MetaModel compensates gracefully: missing
+ * values become null and surplus values are ignored.
+ *
+ * @return true to fail on inconsistent lines, false to compensate for
+ * them.
+ */
+ public boolean isFailOnInconsistentRowLength() {
+ return this.failOnInconsistentRowLength;
+ }
+
+ /**
+ * Tells whether CSV files read with this configuration may contain values
+ * that span more than one line. The constructors that do not take this
+ * flag enable it.
+ *
+ * @return true if multiline values are allowed, false if every record is
+ * known to occupy exactly one line.
+ */
+ public boolean isMultilineValues() {
+ return this.multilineValues;
+ }
+
+ /**
+ * Gets the number of the line (1 based) that holds the column names.
+ *
+ * @return the line number (1 based)
+ */
+ public int getColumnNameLineNumber() {
+ return this.columnNameLineNumber;
+ }
+
+ /**
+ * Gets the text encoding used when reading the file.
+ *
+ * @return the text encoding of the file.
+ */
+ public String getEncoding() {
+ return this.encoding;
+ }
+
+ /**
+ * Gets the char that separates values from each other (typically a comma
+ * or a semicolon).
+ *
+ * @return the separator char
+ */
+ public char getSeparatorChar() {
+ return this.separatorChar;
+ }
+
+ /**
+ * Gets the char that encapsulates (quotes) values.
+ *
+ * @return the quote char
+ */
+ public char getQuoteChar() {
+ return this.quoteChar;
+ }
+
+ /**
+ * Gets the char that escapes special chars (eg. quote chars) inside
+ * values.
+ *
+ * @return the escape char
+ */
+ public char getEscapeChar() {
+ return this.escapeChar;
+ }
+
+ /**
+ * Adds every configuration field to the list of identifiers.
+ * NOTE(review): BaseObject presumably derives equals() and hashCode() from
+ * this list - confirm. The multilineValues flag is included so that two
+ * configurations which differ only in that flag are not treated as the
+ * same configuration.
+ *
+ * @param identifiers
+ * the list to append the identifying values to
+ */
+ @Override
+ protected void decorateIdentity(List<Object> identifiers) {
+ identifiers.add(columnNameLineNumber);
+ identifiers.add(encoding);
+ identifiers.add(separatorChar);
+ identifiers.add(quoteChar);
+ identifiers.add(escapeChar);
+ identifiers.add(failOnInconsistentRowLength);
+ identifiers.add(multilineValues);
+ }
+
+ /**
+ * Builds a human readable representation listing every configuration
+ * field, including the multilineValues flag.
+ *
+ * @return the string representation of this configuration
+ */
+ @Override
+ public String toString() {
+ return "CsvConfiguration[columnNameLineNumber=" + columnNameLineNumber + ", encoding=" + encoding
+ + ", separatorChar=" + separatorChar + ", quoteChar=" + quoteChar + ", escapeChar=" + escapeChar
+ + ", failOnInconsistentRowLength=" + failOnInconsistentRowLength + ", multilineValues="
+ + multilineValues + "]";
+ }
}
http://git-wip-us.apache.org/repos/asf/incubator-metamodel/blob/72d0608b/csv/src/main/java/org/apache/metamodel/csv/CsvDataContext.java
----------------------------------------------------------------------
diff --git a/csv/src/main/java/org/apache/metamodel/csv/CsvDataContext.java b/csv/src/main/java/org/apache/metamodel/csv/CsvDataContext.java
index 2b275a9..5effa2c 100644
--- a/csv/src/main/java/org/apache/metamodel/csv/CsvDataContext.java
+++ b/csv/src/main/java/org/apache/metamodel/csv/CsvDataContext.java
@@ -35,6 +35,7 @@ import org.apache.metamodel.QueryPostprocessDataContext;
import org.apache.metamodel.UpdateScript;
import org.apache.metamodel.UpdateableDataContext;
import org.apache.metamodel.data.DataSet;
+import org.apache.metamodel.data.EmptyDataSet;
import org.apache.metamodel.query.FilterItem;
import org.apache.metamodel.schema.Column;
import org.apache.metamodel.schema.Table;
@@ -46,6 +47,7 @@ import org.apache.metamodel.util.UrlResource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import au.com.bytecode.opencsv.CSVParser;
import au.com.bytecode.opencsv.CSVReader;
/**
@@ -336,23 +338,50 @@ public final class CsvDataContext extends QueryPostprocessDataContext implements
+ /**
+ * Opens the CSV resource, skips the lines up to and including the column
+ * name line and returns a data set over the remaining lines. When
+ * multiline values are enabled the lines are read through a CSVReader,
+ * otherwise a SingleLineCsvDataSet parses one physical line per row.
+ *
+ * The reader is closed here if no data set takes ownership of it, ie. if
+ * the file ends before the column name line or if skipping fails.
+ *
+ * @throws MetaModelException
+ * if an IOException occurs while skipping the header lines
+ */
@Override
public DataSet materializeMainSchemaTable(Table table, Column[] columns, int maxRows) {
final int lineNumber = _configuration.getColumnNameLineNumber();
- final CSVReader reader = createCsvReader(lineNumber);
final int columnCount = table.getColumnCount();
+
+ final BufferedReader reader = FileHelper.getBufferedReader(_resource.read(), _configuration.getEncoding());
+
+ try {
+ // skip column header lines
+ for (int i = 0; i < lineNumber; i++) {
+ String line = reader.readLine();
+ if (line == null) {
+ // nothing will consume the reader, so release it here
+ FileHelper.safeClose(reader);
+ return new EmptyDataSet(columns);
+ }
+ }
+ } catch (IOException e) {
+ // do not leak the underlying stream when skipping fails
+ FileHelper.safeClose(reader);
+ throw new MetaModelException("IOException occurred while reading from CSV resource: " + _resource, e);
+ }
+
final boolean failOnInconsistentRowLength = _configuration.isFailOnInconsistentRowLength();
- if (maxRows < 0) {
- return new CsvDataSet(reader, columns, null, columnCount, failOnInconsistentRowLength);
- } else {
- return new CsvDataSet(reader, columns, maxRows, columnCount, failOnInconsistentRowLength);
+
+ // NOTE(review): maxRows == 0 now means "no limit", whereas the removed
+ // code passed 0 on as a limit of zero rows - confirm this is intended.
+ final Integer maxRowsOrNull = (maxRows > 0 ? maxRows : null);
+
+ if (_configuration.isMultilineValues()) {
+ final CSVReader csvReader = createCsvReader(reader);
+ return new CsvDataSet(csvReader, columns, maxRowsOrNull, columnCount, failOnInconsistentRowLength);
}
+
+ final CSVParser csvParser = new CSVParser(_configuration.getSeparatorChar(), _configuration.getQuoteChar(),
+ _configuration.getEscapeChar());
+ return new SingleLineCsvDataSet(reader, csvParser, columns, maxRowsOrNull, columnCount,
+ failOnInconsistentRowLength);
}
+ /**
+ * Opens a new reader on the resource (using the configured encoding) and
+ * wraps it in a CSVReader built with the configured separator, quote and
+ * escape chars.
+ *
+ * @param skipLines
+ * passed on to the CSVReader constructor as its skip-lines
+ * argument
+ * @return a new CSVReader; the visible code never closes it, so closing is
+ * left to the caller
+ */
protected CSVReader createCsvReader(int skipLines) {
- final Reader fileReader = FileHelper.getReader(_resource.read(), _configuration.getEncoding());
- final CSVReader csvReader = new CSVReader(fileReader, _configuration.getSeparatorChar(),
+ final Reader reader = FileHelper.getReader(_resource.read(), _configuration.getEncoding());
+ final CSVReader csvReader = new CSVReader(reader, _configuration.getSeparatorChar(),
_configuration.getQuoteChar(), _configuration.getEscapeChar(), skipLines);
return csvReader;
}
+ /**
+ * Wraps an already opened reader in a CSVReader that uses the configured
+ * separator, quote and escape chars. No lines are skipped by the returned
+ * reader.
+ *
+ * @param reader
+ * the reader to wrap, positioned at the first line to parse
+ * @return a new CSVReader on top of the given reader
+ */
+ protected CSVReader createCsvReader(BufferedReader reader) {
+ final char separator = _configuration.getSeparatorChar();
+ final char quote = _configuration.getQuoteChar();
+ final char escape = _configuration.getEscapeChar();
+ return new CSVReader(reader, separator, quote, escape);
+ }
+
@Override
protected CsvSchema getMainSchema() throws MetaModelException {
CsvSchema schema = new CsvSchema(getMainSchemaName(), this);
http://git-wip-us.apache.org/repos/asf/incubator-metamodel/blob/72d0608b/csv/src/main/java/org/apache/metamodel/csv/CsvDataSet.java
----------------------------------------------------------------------
diff --git a/csv/src/main/java/org/apache/metamodel/csv/CsvDataSet.java b/csv/src/main/java/org/apache/metamodel/csv/CsvDataSet.java
index b69ab13..4bfb18c 100644
--- a/csv/src/main/java/org/apache/metamodel/csv/CsvDataSet.java
+++ b/csv/src/main/java/org/apache/metamodel/csv/CsvDataSet.java
@@ -83,7 +83,7 @@ final class CsvDataSet extends AbstractDataSet {
return false;
}
}
-
+
private boolean nextInternal() {
if (_reader == null) {
return false;
http://git-wip-us.apache.org/repos/asf/incubator-metamodel/blob/72d0608b/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvDataSet.java
----------------------------------------------------------------------
diff --git a/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvDataSet.java b/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvDataSet.java
new file mode 100644
index 0000000..2d8a800
--- /dev/null
+++ b/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvDataSet.java
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.metamodel.csv;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+
+import org.apache.metamodel.MetaModelException;
+import org.apache.metamodel.data.AbstractDataSet;
+import org.apache.metamodel.data.DataSetHeader;
+import org.apache.metamodel.data.Row;
+import org.apache.metamodel.schema.Column;
+import org.apache.metamodel.util.FileHelper;
+
+import au.com.bytecode.opencsv.CSVParser;
+
+/**
+ * A specialized DataSet implementation for the CSV module under circumstances
+ * where multiline values are disabled. In this case we can use an optimized
+ * CSVParser and also lazy evaluate lines read from the file.
+ *
+ * Once the end of the file has been reached or {@link #close()} has been
+ * called, {@link #next()} keeps returning false.
+ */
+final class SingleLineCsvDataSet extends AbstractDataSet {
+
+ private final BufferedReader _reader;
+ private final CSVParser _csvParser;
+ private final int _columnsInTable;
+ private final boolean _failOnInconsistentRowLength;
+
+ private volatile int _rowNumber;
+ private volatile Integer _rowsRemaining;
+ private volatile Row _row;
+ // _reader is final and therefore never becomes null on close, so the
+ // closed state is tracked explicitly.
+ private volatile boolean _closed;
+
+ /**
+ * @param reader
+ * the reader to take lines from, positioned at the first data
+ * line; it is closed by this data set
+ * @param csvParser
+ * the parser used by the rows to split a line into values
+ * @param columns
+ * the columns that make up the header of this data set
+ * @param maxRows
+ * the maximum amount of rows to produce, or null for no limit
+ * @param columnsInTable
+ * the amount of columns in the table
+ * @param failOnInconsistentRowLength
+ * whether rows with an unexpected amount of values should fail
+ */
+ public SingleLineCsvDataSet(BufferedReader reader, CSVParser csvParser, Column[] columns, Integer maxRows,
+ int columnsInTable, boolean failOnInconsistentRowLength) {
+ super(columns);
+ _reader = reader;
+ _csvParser = csvParser;
+ _columnsInTable = columnsInTable;
+ _failOnInconsistentRowLength = failOnInconsistentRowLength;
+ _rowNumber = 0;
+ _rowsRemaining = maxRows;
+ _closed = false;
+ }
+
+ /**
+ * Closes the underlying reader and releases the current row. Safe to call
+ * more than once.
+ */
+ @Override
+ public void close() {
+ _closed = true;
+ FileHelper.safeClose(_reader);
+ _row = null;
+ _rowsRemaining = null;
+ }
+
+ /**
+ * Moves to the next row, honouring the max rows limit if there is one.
+ *
+ * @return true if a new row is available through {@link #getRow()}, false
+ * if the limit or the end of the file has been reached or the data
+ * set has been closed
+ */
+ @Override
+ public boolean next() {
+ if (_closed) {
+ // close() resets _rowsRemaining to null, which must not be mistaken
+ // for "no limit" and lead to a read on the closed reader
+ return false;
+ }
+ if (_rowsRemaining != null && _rowsRemaining > 0) {
+ _rowsRemaining--;
+ return nextInternal();
+ } else if (_rowsRemaining == null) {
+ return nextInternal();
+ } else {
+ return false;
+ }
+ }
+
+ @Override
+ protected DataSetHeader getHeader() {
+ // re-make this method protected so that it's visible for
+ // SingleLineCsvRow.
+ return super.getHeader();
+ }
+
+ protected boolean isFailOnInconsistentRowLength() {
+ return _failOnInconsistentRowLength;
+ }
+
+ protected int getColumnsInTable() {
+ return _columnsInTable;
+ }
+
+ protected CSVParser getCsvParser() {
+ return _csvParser;
+ }
+
+ /**
+ * Reads the next line and wraps it in a lazily parsed row, ignoring the
+ * max rows limit. Closes the data set when the end of the file is reached
+ * or reading fails.
+ *
+ * @return true if a line was read, false if there are no more lines or the
+ * data set is closed
+ * @throws MetaModelException
+ * if reading the line fails
+ */
+ public boolean nextInternal() {
+ if (_reader == null || _closed) {
+ return false;
+ }
+
+ try {
+ final String line = _reader.readLine();
+ if (line == null) {
+ close();
+ return false;
+ }
+
+ _rowNumber++;
+ _row = new SingleLineCsvRow(this, line, _columnsInTable, _failOnInconsistentRowLength, _rowNumber);
+ return true;
+ } catch (IOException e) {
+ close();
+ throw new MetaModelException("IOException occurred while reading next line of CSV resource", e);
+ }
+ }
+
+ /**
+ * @return the row read by the latest successful call to {@link #next()},
+ * or null if there is none or the data set has been closed
+ */
+ @Override
+ public Row getRow() {
+ return _row;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-metamodel/blob/72d0608b/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvRow.java
----------------------------------------------------------------------
diff --git a/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvRow.java b/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvRow.java
new file mode 100644
index 0000000..ee93974
--- /dev/null
+++ b/csv/src/main/java/org/apache/metamodel/csv/SingleLineCsvRow.java
@@ -0,0 +1,57 @@
+package org.apache.metamodel.csv;
+
+import org.apache.metamodel.data.AbstractRow;
+import org.apache.metamodel.data.DataSetHeader;
+import org.apache.metamodel.data.Style;
+import org.apache.metamodel.util.LazyRef;
+
+import au.com.bytecode.opencsv.CSVParser;
+
+/**
+ * Specialized row implementation for single-line CSV values. The line is kept
+ * as-is and only split into values the first time a value is requested.
+ *
+ * NOTE(review): values are looked up by the index passed to
+ * {@link #getValue(int)} directly. If the data set header holds only a subset
+ * of the table's columns, that index may not match the position of the value
+ * in the line - confirm and map through the column number if needed.
+ */
+final class SingleLineCsvRow extends AbstractRow {
+
+ private static final long serialVersionUID = 1L;
+
+ private final SingleLineCsvDataSet _dataSet;
+ private final LazyRef<String[]> _valuesRef;
+ private final int _columnsInTable;
+
+ /**
+ * @param dataSet
+ * the data set that produced this row; provides the parser and
+ * the header
+ * @param line
+ * the raw, unparsed line of the CSV file
+ * @param columnsInTable
+ * the amount of values a consistent line holds
+ * @param failOnInconsistentRowLength
+ * whether to throw an {@link InconsistentRowLengthException}
+ * from the parsing step when the line holds another amount of
+ * values
+ * @param rowNumber
+ * the number of this row, used for error reporting
+ */
+ public SingleLineCsvRow(SingleLineCsvDataSet dataSet, final String line, final int columnsInTable,
+ final boolean failOnInconsistentRowLength, final int rowNumber) {
+ _dataSet = dataSet;
+ _columnsInTable = columnsInTable;
+ _valuesRef = new LazyRef<String[]>() {
+ @Override
+ protected String[] fetch() throws Throwable {
+ final CSVParser parser = _dataSet.getCsvParser();
+ final String[] values = parser.parseLine(line);
+
+ if (failOnInconsistentRowLength) {
+ if (columnsInTable != values.length) {
+ throw new InconsistentRowLengthException(columnsInTable, SingleLineCsvRow.this, values,
+ rowNumber);
+ }
+ }
+
+ return values;
+ }
+ };
+ }
+
+ /**
+ * Gets the value at the given index, parsing the line on first access.
+ *
+ * @return the value, or null if the line is shorter than the table and
+ * the index lies within the missing part
+ * @throws IndexOutOfBoundsException
+ * if the index is negative or beyond both the parsed values and
+ * the amount of columns in the table
+ */
+ @Override
+ public Object getValue(int index) throws IndexOutOfBoundsException {
+ // NOTE(review): confirm how LazyRef.get() surfaces an exception thrown
+ // by fetch(); a null result is not handled here.
+ final String[] values = _valuesRef.get();
+ if (index >= values.length && index < _columnsInTable) {
+ // short line: gracefully fill in a missing value as null instead of
+ // failing with an ArrayIndexOutOfBoundsException
+ return null;
+ }
+ return values[index];
+ }
+
+ @Override
+ public Style getStyle(int index) throws IndexOutOfBoundsException {
+ return Style.NO_STYLE;
+ }
+
+ @Override
+ protected DataSetHeader getHeader() {
+ return _dataSet.getHeader();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-metamodel/blob/72d0608b/csv/src/test/java/org/apache/metamodel/csv/CsvBigFileMemoryTest.java
----------------------------------------------------------------------
diff --git a/csv/src/test/java/org/apache/metamodel/csv/CsvBigFileMemoryTest.java b/csv/src/test/java/org/apache/metamodel/csv/CsvBigFileMemoryTest.java
index 54c6987..c7243bd 100644
--- a/csv/src/test/java/org/apache/metamodel/csv/CsvBigFileMemoryTest.java
+++ b/csv/src/test/java/org/apache/metamodel/csv/CsvBigFileMemoryTest.java
@@ -19,85 +19,114 @@
package org.apache.metamodel.csv;
import java.io.File;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import junit.framework.TestCase;
import org.apache.metamodel.DataContext;
-import org.apache.metamodel.csv.CsvConfiguration;
-import org.apache.metamodel.csv.CsvDataContext;
import org.apache.metamodel.data.DataSet;
+import org.apache.metamodel.data.Row;
import org.apache.metamodel.query.Query;
import org.apache.metamodel.query.SelectItem;
import org.apache.metamodel.schema.Table;
-import junit.framework.TestCase;
-
public class CsvBigFileMemoryTest extends TestCase {
- private final int hugeFileRows = 3000;
- private final int hugeFileCols = 2000;
-
- private File getHugeFile() {
- final File file = new File("target/huge_csv.csv");
- if (!file.exists()) {
-
- final ExampleDataGenerator exampleDataGenerator = new ExampleDataGenerator(
- hugeFileRows, hugeFileCols);
- exampleDataGenerator.createFile(file);
- }
- return file;
- }
-
- /**
- * Runs a performance test based on the data created by the
- * ExampleDataCreator utility.
- *
- * @see ExampleDataGenerator
- * @throws Exception
- */
- public void testHugeFile() throws Exception {
- final File file = getHugeFile();
-
- final long timeAtStart = System.currentTimeMillis();
- System.out.println("time at start: " + timeAtStart);
-
- final DataContext dc = new CsvDataContext(file, new CsvConfiguration());
- final Table t = dc.getDefaultSchema().getTables()[0];
-
- final long timeAfterDataContext = System.currentTimeMillis();
- System.out.println("time after DataContext: " + timeAfterDataContext);
-
- final Query q = new Query().select(t.getColumns()).from(t);
- DataSet ds = dc.executeQuery(q);
-
- long timeAfterQuery = System.currentTimeMillis();
- System.out.println("time after query: " + timeAfterQuery);
-
- while (ds.next()) {
- assertEquals(hugeFileCols, ds.getRow().getValues().length);
- }
- ds.close();
-
- long timeAfterDataSet = System.currentTimeMillis();
- System.out.println("time after dataSet: " + timeAfterDataSet);
-
- if (!file.delete()) {
- file.deleteOnExit();
- }
- }
-
- public void testApproximatedCountHugeFile() throws Exception {
- DataContext dc = new CsvDataContext(getHugeFile());
-
- Table table = dc.getDefaultSchema().getTables()[0];
- Query q = dc.query().from(table).selectCount().toQuery();
- SelectItem selectItem = q.getSelectClause().getItem(0);
- selectItem.setFunctionApproximationAllowed(true);
-
- DataSet ds = dc.executeQuery(q);
- assertTrue(ds.next());
- Object[] values = ds.getRow().getValues();
- assertEquals(1, values.length);
- assertEquals(3332, ((Long) ds.getRow().getValue(selectItem)).intValue());
- assertEquals(3332, ((Long) values[0]).intValue());
- assertFalse(ds.next());
- }
+ private final int hugeFileRows = 30000;
+ private final int hugeFileCols = 2000;
+
+ /**
+ * Gets the big example file, generating it with hugeFileRows rows and
+ * hugeFileCols columns if it does not exist yet.
+ */
+ private File getHugeFile() {
+ final File hugeFile = new File("target/huge_csv.csv");
+ if (hugeFile.exists()) {
+ return hugeFile;
+ }
+
+ final ExampleDataGenerator generator = new ExampleDataGenerator(hugeFileRows, hugeFileCols);
+ generator.createFile(hugeFile);
+ return hugeFile;
+ }
+
+ /**
+ * Runs a performance test based on the data created by the
+ * ExampleDataCreator utility. Rows are read on the test thread and
+ * validated on a thread pool, which forces the lazy parsing of each row to
+ * happen on the worker threads.
+ *
+ * The wait for the workers is bounded, and the data set, the thread pool
+ * and the generated file are cleaned up even when the test fails.
+ *
+ * @see ExampleDataGenerator
+ * @throws Exception
+ */
+ public void testHugeFile() throws Exception {
+ final File file = getHugeFile();
+ final ExecutorService executorService = Executors.newFixedThreadPool(30);
+
+ try {
+ final long timeAtStart = System.currentTimeMillis();
+ System.out.println("time at start: " + timeAtStart);
+
+ final DataContext dc = new CsvDataContext(file, new CsvConfiguration(1, false, false));
+ final Table t = dc.getDefaultSchema().getTables()[0];
+
+ final long timeAfterDataContext = System.currentTimeMillis();
+ System.out.println("time after DataContext: " + timeAfterDataContext);
+
+ final Query q = new Query().select(t.getColumns()).from(t);
+ final DataSet ds = dc.executeQuery(q);
+
+ long timeAfterQuery = System.currentTimeMillis();
+ System.out.println("time after query: " + timeAfterQuery);
+
+ final CountDownLatch countDown = new CountDownLatch(hugeFileRows);
+ final AtomicBoolean success = new AtomicBoolean(true);
+
+ try {
+ while (ds.next()) {
+ final Row row = ds.getRow();
+ executorService.submit(new Runnable() {
+ @Override
+ public void run() {
+ try {
+ if (hugeFileCols != row.getValues().length) {
+ System.out.println("Weird row: " + row);
+ success.set(false);
+ }
+ } catch (RuntimeException e) {
+ // submit() would otherwise swallow the failure
+ System.out.println("Failed to read row: " + e);
+ success.set(false);
+ } finally {
+ // always count down, or a failing row would block
+ // the test thread
+ countDown.countDown();
+ }
+ }
+ });
+ }
+ } finally {
+ ds.close();
+ }
+
+ // bounded wait: a file with fewer rows than expected must fail the
+ // test instead of hanging it
+ assertTrue("Timed out waiting for " + hugeFileRows + " rows to be processed",
+ countDown.await(10, java.util.concurrent.TimeUnit.MINUTES));
+ assertTrue(success.get());
+
+ executorService.shutdown();
+
+ long timeAfterDataSet = System.currentTimeMillis();
+ System.out.println("time after dataSet: " + timeAfterDataSet);
+
+ long totalTime = timeAfterDataSet - timeAfterDataContext;
+ System.out.println("Total time to process large file: " + totalTime + " millis");
+
+ // results with old impl: [13908, 13827, 14577]
+
+ // results with new impl: [8567, 8965, 8154]
+ } finally {
+ if (!executorService.isShutdown()) {
+ // failure path: do not leave worker threads behind
+ executorService.shutdownNow();
+ }
+ if (!file.delete()) {
+ file.deleteOnExit();
+ }
+ }
+ }
+
+ /**
+ * Verifies that a COUNT(*) query with approximation allowed yields a
+ * single row holding the approximated row count of the big example file.
+ *
+ * NOTE(review): the expected value 3332 predates the change of
+ * hugeFileRows from 3000 to 30000 - confirm that it still matches the
+ * generated file.
+ */
+ public void testApproximatedCountHugeFile() throws Exception {
+ DataContext dc = new CsvDataContext(getHugeFile());
+
+ Table table = dc.getDefaultSchema().getTables()[0];
+ Query q = dc.query().from(table).selectCount().toQuery();
+ SelectItem selectItem = q.getSelectClause().getItem(0);
+ selectItem.setFunctionApproximationAllowed(true);
+
+ DataSet ds = dc.executeQuery(q);
+ try {
+ assertTrue(ds.next());
+ Object[] values = ds.getRow().getValues();
+ assertEquals(1, values.length);
+ assertEquals(3332, ((Long) ds.getRow().getValue(selectItem)).intValue());
+ assertEquals(3332, ((Long) values[0]).intValue());
+ assertFalse(ds.next());
+ } finally {
+ // release the data set even when an assertion fails
+ ds.close();
+ }
+ }
}