You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@orc.apache.org by wgtmac <gi...@git.apache.org> on 2017/12/07 06:21:34 UTC
[GitHub] orc pull request #199: ORC-276: [C++] Create a simple tool to import CSV fil...
GitHub user wgtmac opened a pull request:
https://github.com/apache/orc/pull/199
ORC-276: [C++] Create a simple tool to import CSV files
A simple tool for users to create ORC files. It only supports importing primitive type values from CSV files.
You can merge this pull request into a Git repository by running:
$ git pull https://github.com/wgtmac/orc ORC-276
Alternatively you can review and apply these changes as the patch at:
https://github.com/apache/orc/pull/199.patch
To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:
This closes #199
----
commit a54d134b71282e43eb6fc06fa388f66fe10af336
Author: Gang Wu <ga...@alibaba-inc.com>
Date: 2017-12-06T22:55:33Z
ORC-276: [C++] Create a simple tool to import CSV files
----
---
[GitHub] orc issue #199: ORC-276: [C++] Create a simple tool to import CSV files
Posted by wgtmac <gi...@git.apache.org>.
Github user wgtmac commented on the issue:
https://github.com/apache/orc/pull/199
Thanks @xndai. Please see the last commit for the fixes that you have mentioned.
@omalley @majetideepak Please take a look when you have time. Thanks!
---
[GitHub] orc issue #199: ORC-276: [C++] Create a simple tool to import CSV files
Posted by wgtmac <gi...@git.apache.org>.
Github user wgtmac commented on the issue:
https://github.com/apache/orc/pull/199
@majetideepak Please take a look at the updated patch. Thanks!
---
[GitHub] orc pull request #199: ORC-276: [C++] Create a simple tool to import CSV fil...
Posted by majetideepak <gi...@git.apache.org>.
Github user majetideepak commented on a diff in the pull request:
https://github.com/apache/orc/pull/199#discussion_r158096524
--- Diff: tools/src/CSVFileImport.cc ---
@@ -0,0 +1,476 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "orc/OrcFile.hh"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <getopt.h>
+#include <string>
+#include <sys/time.h>
+#include <time.h>
+
+static char gDelimiter = ',';
+
+// extract one column raw text from one line
+std::string extractColumn(std::string s, uint64_t colIndex) {
+ uint64_t col = 0;
+ size_t start = 0;
+ size_t end = s.find(gDelimiter);
+ while (col < colIndex && end != std::string::npos) {
+ start = end + 1;
+ end = s.find(gDelimiter, start);
+ ++col;
+ }
+ return col == colIndex ? s.substr(start, end - start) : "";
+}
+
+static const char* GetDate(void) {
+ static char buf[200];
+ time_t t = time(NULL);
+ struct tm* p = localtime(&t);
+ strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", p);
+ return buf;
+}
+
+void fillLongValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* longBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ longBatch->data[i] = atoll(col.c_str());
+ }
+ }
+ longBatch->hasNulls = hasNull;
+ longBatch->numElements = numValues;
+}
+
+void fillStringValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex,
+ orc::DataBuffer<char>& buffer,
+ uint64_t& offset) {
+ orc::StringVectorBatch* stringBatch =
+ dynamic_cast<orc::StringVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ if (buffer.size() - offset < col.size()) {
+ buffer.reserve(buffer.size() * 2);
+ }
+ memcpy(buffer.data() + offset,
+ col.c_str(),
+ col.size());
+ stringBatch->data[i] = buffer.data() + offset;
+ stringBatch->length[i] = static_cast<int64_t>(col.size());
+ offset += col.size();
+ }
+ }
+ stringBatch->hasNulls = hasNull;
+ stringBatch->numElements = numValues;
+}
+
+void fillDoubleValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::DoubleVectorBatch* dblBatch =
+ dynamic_cast<orc::DoubleVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ dblBatch->data[i] = atof(col.c_str());
+ }
+ }
+ dblBatch->hasNulls = hasNull;
+ dblBatch->numElements = numValues;
+}
+
+// parse fixed point decimal numbers
+void fillDecimalValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex,
+ size_t scale,
+ size_t precision) {
+
+
+ orc::Decimal128VectorBatch* d128Batch = ORC_NULLPTR;
+ orc::Decimal64VectorBatch* d64Batch = ORC_NULLPTR;
+ if (precision <= 18) {
+ d64Batch = dynamic_cast<orc::Decimal64VectorBatch*>(batch);
+ d64Batch->scale = static_cast<int32_t>(scale);
+ } else {
+ d128Batch = dynamic_cast<orc::Decimal128VectorBatch*>(batch);
+ d128Batch->scale = static_cast<int32_t>(scale);
+ }
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ size_t ptPos = col.find('.');
+ size_t curScale = 0;
+ std::string num = col;
+ if (ptPos != std::string::npos) {
+ curScale = col.length() - ptPos - 1;
+ num = col.substr(0, ptPos) + col.substr(ptPos + 1);
+ }
+ orc::Int128 decimal(num);
+ while (curScale != scale) {
+ curScale++;
+ decimal *= 10;
+ }
+ if (precision <= 18) {
+ d64Batch->values[i] = decimal.toLong();
+ } else {
+ d128Batch->values[i] = decimal;
+ }
+ }
+ }
+ batch->hasNulls = hasNull;
+ batch->numElements = numValues;
+}
+
+void fillBoolValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* boolBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ std::transform(col.begin(), col.end(), col.begin(), ::tolower);
+ if (col == "true" || col == "t") {
+ boolBatch->data[i] = true;
+ } else {
+ boolBatch->data[i] = false;
+ }
+ }
+ }
+ boolBatch->hasNulls = hasNull;
+ boolBatch->numElements = numValues;
+}
+
+// parse date string from format YYYY-mm-dd
+void fillDateValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* longBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ struct tm tm;
+ memset(&tm, 0, sizeof(struct tm));
+ strptime(col.c_str(), "%Y-%m-%d", &tm);
+ time_t t = mktime(&tm);
+ time_t t1970 = 0;
+ double seconds = difftime(t, t1970);
+ int64_t days = static_cast<int64_t>(seconds / (60*60*24));
+ longBatch->data[i] = days;
+ }
+ }
+ longBatch->hasNulls = hasNull;
+ longBatch->numElements = numValues;
+}
+
+// parse timestamp values in seconds
+void fillTimestampValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::TimestampVectorBatch* tsBatch =
+ dynamic_cast<orc::TimestampVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ tsBatch->data[i] = atoll(col.c_str());
+ tsBatch->nanoseconds[i] = 0;
+ }
+ }
+ tsBatch->hasNulls = hasNull;
+ tsBatch->numElements = numValues;
+}
+
+void usage() {
+ std::cout << "Usage: csv-import --input <input file> --output <output> "
--- End diff --
Can you include the options in the usage here?
The `input`, `output`, and `schema` are not options. They have to be specified at the end, after the option arguments.
Example from `FileScan.cc`.
```
if (argc < 1 || helpFlag) {
std::cerr << "Usage: orc-scan [-h] [--help]\n"
<< " [-b<size>] [--batch=<size>] <filename>\n";
return 1;
}
```
---
[GitHub] orc pull request #199: ORC-276: [C++] Create a simple tool to import CSV fil...
Posted by xndai <gi...@git.apache.org>.
Github user xndai commented on a diff in the pull request:
https://github.com/apache/orc/pull/199#discussion_r155593796
--- Diff: tools/src/CSVFileImport.cc ---
@@ -0,0 +1,411 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "orc/OrcFile.hh"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <sys/time.h>
+#include <time.h>
+
+#define DELIMITER ','
+
+std::string extractColumn(std::string s, uint64_t colIndex) {
+ uint64_t col = 0;
+ size_t start = 0;
+ size_t end = s.find(DELIMITER);
+ while (col < colIndex && end != std::string::npos) {
+ start = end + 1;
+ end = s.find(DELIMITER, start);
+ ++col;
+ }
+ return s.substr(start, end - start);
+}
+
+static const char* GetDate(void)
+{
+ static char buf[200];
+ time_t t = time(NULL);
+ struct tm* p = localtime(&t);
+ strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", p);
+ return buf;
+}
+
+void fillLongValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* longBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ longBatch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ longBatch->data[i] = atoll(col.c_str());
+ }
+ }
+ longBatch->hasNulls = hasNull;
+ longBatch->numElements = numValues;
+}
+
+void fillStringValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex,
+ orc::DataBuffer<char>& buffer,
+ uint64_t& offset) {
+ orc::StringVectorBatch* stringBatch =
+ dynamic_cast<orc::StringVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ stringBatch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ memcpy(buffer.data() + offset,
+ col.c_str(),
+ col.size());
+ stringBatch->data[i] = buffer.data() + offset;
+ stringBatch->length[i] = static_cast<int64_t>(col.size());
+ offset += col.size();
+ }
+ }
+ stringBatch->hasNulls = hasNull;
+ stringBatch->numElements = numValues;
+}
+
+void fillDoubleValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::DoubleVectorBatch* dblBatch =
+ dynamic_cast<orc::DoubleVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ dblBatch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ dblBatch->data[i] = atof(col.c_str());
+ }
+ }
+ dblBatch->hasNulls = hasNull;
+ dblBatch->numElements = numValues;
+}
+
+// parse fixed point decimal numbers
+void fillDecimalValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex,
+ size_t scale,
+ size_t precision) {
+
+
+ orc::Decimal128VectorBatch* d128Batch = ORC_NULLPTR;
+ orc::Decimal64VectorBatch* d64Batch = ORC_NULLPTR;
+ if (precision <= 18) {
+ d64Batch = dynamic_cast<orc::Decimal64VectorBatch*>(batch);
+ d64Batch->scale = static_cast<int32_t>(scale);
+ } else {
+ d128Batch = dynamic_cast<orc::Decimal128VectorBatch*>(batch);
+ d128Batch->scale = static_cast<int32_t>(scale);
+ }
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ size_t ptPos = col.find('.');
+ size_t curScale = 0;
+ std::string num = col;
+ if (ptPos != std::string::npos) {
+ curScale = col.length() - ptPos - 1;
+ num = col.substr(0, ptPos) + col.substr(ptPos + 1);
+ }
+ orc::Int128 decimal(num);
+ while (curScale != scale) {
+ curScale++;
+ decimal *= 10;
+ }
+ if (precision <= 18) {
+ d64Batch->values[i] = decimal.toLong();
+ } else {
+ d128Batch->values[i] = decimal;
+ }
+ }
+ }
+ batch->hasNulls = hasNull;
+ batch->numElements = numValues;
+}
+
+void fillBoolValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* boolBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ boolBatch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ std::transform(col.begin(), col.end(), col.begin(), ::tolower);
+ if (col == "true" || col == "t") {
+ boolBatch->data[i] = true;
+ } else {
+ boolBatch->data[i] = false;
+ }
+ }
+ }
+ boolBatch->hasNulls = hasNull;
+ boolBatch->numElements = numValues;
+}
+
+// parse date string from format YYYY-MM-dd
+void fillDateValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* longBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ longBatch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ struct tm tm;
+ memset(&tm, 0, sizeof(struct tm));
+ strptime(col.c_str(), "%Y-%m-%d", &tm);
+ time_t t = mktime(&tm);
+ time_t t1970 = 0;
+ double seconds = difftime(t, t1970);
+ int64_t days = static_cast<int64_t>(seconds / (60*60*24));
+ longBatch->data[i] = days;
+ }
+ }
+ longBatch->hasNulls = hasNull;
+ longBatch->numElements = numValues;
+}
+
+// parse timestamp values in seconds
+void fillTimestampValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::TimestampVectorBatch* tsBatch =
+ dynamic_cast<orc::TimestampVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ tsBatch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ tsBatch->data[i] = atoll(col.c_str());
+ tsBatch->nanoseconds[i] = 0;
+ }
+ }
+ tsBatch->hasNulls = hasNull;
+ tsBatch->numElements = numValues;
+}
+
+void usage() {
+ std::cout << "Usage: csv-import <input> <output> --schema=<file schema>"
+ << "Import CSV file into an Orc file using the specified schema.\n";
+}
+
+int main(int argc, char* argv[]) {
+ if (argc != 4) {
+ std::cout << "Invalid number of arguments." << std::endl;
+ usage();
+ return 1;
+ }
+
+ std::string input = argv[1];
+ std::string output = argv[2];
+ std::string schema = argv[3];
+
+ const std::string SCHEMA_PREFIX = "--schema=";
+ ORC_UNIQUE_PTR<orc::Type> fileType = ORC_NULLPTR;
+ if (schema.find(SCHEMA_PREFIX) != 0) {
+ std::cout << "Cannot find " << SCHEMA_PREFIX << " argument." << std::endl;
+ usage();
+ return 1;
+ } else {
+ fileType = orc::Type::buildTypeFromString(schema.substr(SCHEMA_PREFIX.size()));
+ }
+
+ std::cout << GetDate() << "Start importing Orc file..." << std::endl;
+
+ double totalElapsedTime = 0.0;
+ double totalCPUTime = 0.0;
+
+ orc::DataBuffer<char> buffer(*orc::getDefaultPool());
+ buffer.resize(4 * 1024 * 1024);
+
+ // set ORC writer options here
+ uint64_t stripeSize = (36 << 20); // 36M
+ uint64_t blockSize = 64 * 1024; // 64K
+ uint64_t batchSize = 1024;
+ orc::CompressionKind compression = orc::CompressionKind_ZLIB;
+
+ orc::WriterOptions options;
+ options.setStripeSize(stripeSize);
+ options.setCompressionBlockSize(blockSize);
+ options.setCompression(compression);
+
+ ORC_UNIQUE_PTR<orc::OutputStream> outStream = orc::writeLocalFile(output);
+ ORC_UNIQUE_PTR<orc::Writer> writer =
+ orc::createWriter(*fileType, outStream.get(), options);
+ ORC_UNIQUE_PTR<orc::ColumnVectorBatch> rowBatch =
+ writer->createRowBatch(batchSize);
+
+ bool eof = false;
+ std::string line;
+ std::vector<std::string> data;
+ std::ifstream finput(input.c_str());
+ while (!eof) {
+ uint64_t numValues = 0;
+ uint64_t bufferOffset = 0;
+
+ data.clear();
+ memset(rowBatch->notNull.data(), 1, batchSize);
+
+ for (uint64_t i = 0; i < batchSize; ++i) {
+ if (!std::getline(finput, line)) {
+ eof = true;
+ break;
+ }
+ data.push_back(line);
+ ++numValues;
+ }
+
+ if (numValues != 0) {
+ orc::StructVectorBatch* structBatch =
+ dynamic_cast<orc::StructVectorBatch*>(rowBatch.get());
+ structBatch->numElements = numValues;
+
+ for (uint64_t i = 0; i < structBatch->fields.size(); ++i) {
+ orc::TypeKind subTypeKind = fileType->getSubtype(i)->getKind();
+ switch (subTypeKind) {
+ case orc::BYTE:
+ case orc::INT:
+ case orc::SHORT:
+ case orc::LONG:
+ fillLongValues(data,
+ structBatch->fields[i],
+ numValues,
+ i);
+ break;
+ case orc::STRING:
+ case orc::CHAR:
+ case orc::VARCHAR:
+ case orc::BINARY:
+ fillStringValues(data,
+ structBatch->fields[i],
+ numValues,
+ i,
+ buffer,
+ bufferOffset);
+ break;
+ case orc::FLOAT:
+ case orc::DOUBLE:
+ fillDoubleValues(data,
+ structBatch->fields[i],
+ numValues,
+ i);
+ break;
+ case orc::DECIMAL:
+ fillDecimalValues(data,
+ structBatch->fields[i],
+ numValues,
+ i,
+ fileType->getSubtype(i)->getScale(),
+ fileType->getSubtype(i)->getPrecision());
+ break;
+ case orc::BOOLEAN:
+ fillBoolValues(data,
+ structBatch->fields[i],
+ numValues,
+ i);
+ break;
+ case orc::DATE:
+ fillDateValues(data,
+ structBatch->fields[i],
+ numValues,
+ i);
+ break;
+ case orc::TIMESTAMP:
+ fillTimestampValues(data,
+ structBatch->fields[i],
+ numValues,
+ i);
+ break;
+ case orc::STRUCT:
+ case orc::LIST:
+ case orc::MAP:
+ case orc::UNION:
+ throw std::runtime_error("Type is not supported yet.");
--- End diff --
Add type name in your exception message.
---
[GitHub] orc pull request #199: ORC-276: [C++] Create a simple tool to import CSV fil...
Posted by majetideepak <gi...@git.apache.org>.
Github user majetideepak commented on a diff in the pull request:
https://github.com/apache/orc/pull/199#discussion_r155629518
--- Diff: tools/src/CSVFileImport.cc ---
@@ -0,0 +1,436 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "orc/OrcFile.hh"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <sys/time.h>
+#include <time.h>
+
+static char gDelimiter = ',';
+
+std::string extractColumn(std::string s, uint64_t colIndex) {
+ uint64_t col = 0;
+ size_t start = 0;
+ size_t end = s.find(gDelimiter);
+ while (col < colIndex && end != std::string::npos) {
+ start = end + 1;
+ end = s.find(gDelimiter, start);
+ ++col;
+ }
+ return col == colIndex ? s.substr(start, end - start) : "";
+}
+
+static const char* GetDate(void)
+{
+ static char buf[200];
+ time_t t = time(NULL);
+ struct tm* p = localtime(&t);
+ strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", p);
+ return buf;
+}
+
+void fillLongValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* longBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ longBatch->data[i] = atoll(col.c_str());
+ }
+ }
+ longBatch->hasNulls = hasNull;
+ longBatch->numElements = numValues;
+}
+
+void fillStringValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex,
+ orc::DataBuffer<char>& buffer,
+ uint64_t& offset) {
+ orc::StringVectorBatch* stringBatch =
+ dynamic_cast<orc::StringVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ if (buffer.size() - offset < col.size()) {
+ buffer.reserve(buffer.size() * 2);
+ }
+ memcpy(buffer.data() + offset,
+ col.c_str(),
+ col.size());
+ stringBatch->data[i] = buffer.data() + offset;
+ stringBatch->length[i] = static_cast<int64_t>(col.size());
+ offset += col.size();
+ }
+ }
+ stringBatch->hasNulls = hasNull;
+ stringBatch->numElements = numValues;
+}
+
+void fillDoubleValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::DoubleVectorBatch* dblBatch =
+ dynamic_cast<orc::DoubleVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ dblBatch->data[i] = atof(col.c_str());
+ }
+ }
+ dblBatch->hasNulls = hasNull;
+ dblBatch->numElements = numValues;
+}
+
+// parse fixed point decimal numbers
+void fillDecimalValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex,
+ size_t scale,
+ size_t precision) {
+
+
+ orc::Decimal128VectorBatch* d128Batch = ORC_NULLPTR;
+ orc::Decimal64VectorBatch* d64Batch = ORC_NULLPTR;
+ if (precision <= 18) {
+ d64Batch = dynamic_cast<orc::Decimal64VectorBatch*>(batch);
+ d64Batch->scale = static_cast<int32_t>(scale);
+ } else {
+ d128Batch = dynamic_cast<orc::Decimal128VectorBatch*>(batch);
+ d128Batch->scale = static_cast<int32_t>(scale);
+ }
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ size_t ptPos = col.find('.');
+ size_t curScale = 0;
+ std::string num = col;
+ if (ptPos != std::string::npos) {
+ curScale = col.length() - ptPos - 1;
+ num = col.substr(0, ptPos) + col.substr(ptPos + 1);
+ }
+ orc::Int128 decimal(num);
+ while (curScale != scale) {
+ curScale++;
+ decimal *= 10;
+ }
+ if (precision <= 18) {
+ d64Batch->values[i] = decimal.toLong();
+ } else {
+ d128Batch->values[i] = decimal;
+ }
+ }
+ }
+ batch->hasNulls = hasNull;
+ batch->numElements = numValues;
+}
+
+void fillBoolValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* boolBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ std::transform(col.begin(), col.end(), col.begin(), ::tolower);
+ if (col == "true" || col == "t") {
+ boolBatch->data[i] = true;
+ } else {
+ boolBatch->data[i] = false;
+ }
+ }
+ }
+ boolBatch->hasNulls = hasNull;
+ boolBatch->numElements = numValues;
+}
+
+// parse date string from format YYYY-MM-dd
+void fillDateValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* longBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ struct tm tm;
+ memset(&tm, 0, sizeof(struct tm));
+ strptime(col.c_str(), "%Y-%m-%d", &tm);
+ time_t t = mktime(&tm);
+ time_t t1970 = 0;
+ double seconds = difftime(t, t1970);
+ int64_t days = static_cast<int64_t>(seconds / (60*60*24));
+ longBatch->data[i] = days;
+ }
+ }
+ longBatch->hasNulls = hasNull;
+ longBatch->numElements = numValues;
+}
+
+// parse timestamp values in seconds
+void fillTimestampValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::TimestampVectorBatch* tsBatch =
+ dynamic_cast<orc::TimestampVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ tsBatch->data[i] = atoll(col.c_str());
+ tsBatch->nanoseconds[i] = 0;
+ }
+ }
+ tsBatch->hasNulls = hasNull;
+ tsBatch->numElements = numValues;
+}
+
+void usage() {
+ std::cout << "Usage: csv-import <input> <output> --schema=<file schema>"
+ << " [--delimiter=<delimiter character>]\n"
+ << "Import CSV file into an Orc file using the specified schema.\n"
+ << "Compound types are not supported at the moment.\n";
+}
+
+int main(int argc, char* argv[]) {
+ if (argc < 4) {
+ std::cout << "Invalid number of arguments." << std::endl;
+ usage();
+ return 1;
+ }
+
+ std::string input = argv[1];
+ std::string output = argv[2];
+ std::string schema = argv[3];
+
+ const std::string SCHEMA_PREFIX = "--schema=";
+ ORC_UNIQUE_PTR<orc::Type> fileType = ORC_NULLPTR;
+ if (schema.find(SCHEMA_PREFIX) != 0) {
+ std::cout << "Cannot find " << SCHEMA_PREFIX << " argument." << std::endl;
+ usage();
+ return 1;
+ } else {
+ fileType = orc::Type::buildTypeFromString(schema.substr(SCHEMA_PREFIX.size()));
+ }
+
+ if (argc > 4) {
+ std::string delimiter = argv[4];
+ const std::string DELIMITER_PREFIX = "--delimiter=";
+ if (delimiter.find(DELIMITER_PREFIX) != 0) {
+ std::cout << "Cannot find " << DELIMITER_PREFIX << " argument." << std::endl;
+ usage();
+ return 1;
+ } else {
+ gDelimiter = delimiter.substr(DELIMITER_PREFIX.size())[0];
+ }
+ }
+
+ std::cout << GetDate() << "Start importing Orc file..." << std::endl;
+
+ double totalElapsedTime = 0.0;
+ clock_t totalCPUTime = 0;
+
+ orc::DataBuffer<char> buffer(*orc::getDefaultPool());
+ buffer.resize(4 * 1024 * 1024);
+
+ // set ORC writer options here
+ uint64_t stripeSize = (128 << 20); // 128M
+ uint64_t blockSize = 64 << 10; // 64K
+ uint64_t batchSize = 1024;
+ orc::CompressionKind compression = orc::CompressionKind_ZLIB;
--- End diff --
Why not make these input options as well?
---
[GitHub] orc issue #199: ORC-276: [C++] Create a simple tool to import CSV files
Posted by majetideepak <gi...@git.apache.org>.
Github user majetideepak commented on the issue:
https://github.com/apache/orc/pull/199
I just realized we need to add tests for each tool as well. `/tools/test` has some examples.
Some of the tools are missing tests as well. I will file a JIRA to cover those.
Sorry for not noticing this earlier.
---
[GitHub] orc pull request #199: ORC-276: [C++] Create a simple tool to import CSV fil...
Posted by majetideepak <gi...@git.apache.org>.
Github user majetideepak commented on a diff in the pull request:
https://github.com/apache/orc/pull/199#discussion_r158422184
--- Diff: tools/src/CSVFileImport.cc ---
@@ -0,0 +1,476 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "orc/OrcFile.hh"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <getopt.h>
+#include <string>
+#include <sys/time.h>
+#include <time.h>
+
+static char gDelimiter = ',';
+
+// extract one column raw text from one line
+std::string extractColumn(std::string s, uint64_t colIndex) {
+ uint64_t col = 0;
+ size_t start = 0;
+ size_t end = s.find(gDelimiter);
+ while (col < colIndex && end != std::string::npos) {
+ start = end + 1;
+ end = s.find(gDelimiter, start);
+ ++col;
+ }
+ return col == colIndex ? s.substr(start, end - start) : "";
+}
+
+static const char* GetDate(void) {
+ static char buf[200];
+ time_t t = time(NULL);
+ struct tm* p = localtime(&t);
+ strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", p);
+ return buf;
+}
+
+void fillLongValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* longBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ longBatch->data[i] = atoll(col.c_str());
+ }
+ }
+ longBatch->hasNulls = hasNull;
+ longBatch->numElements = numValues;
+}
+
+void fillStringValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex,
+ orc::DataBuffer<char>& buffer,
+ uint64_t& offset) {
+ orc::StringVectorBatch* stringBatch =
+ dynamic_cast<orc::StringVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ if (buffer.size() - offset < col.size()) {
+ buffer.reserve(buffer.size() * 2);
+ }
+ memcpy(buffer.data() + offset,
+ col.c_str(),
+ col.size());
+ stringBatch->data[i] = buffer.data() + offset;
+ stringBatch->length[i] = static_cast<int64_t>(col.size());
+ offset += col.size();
+ }
+ }
+ stringBatch->hasNulls = hasNull;
+ stringBatch->numElements = numValues;
+}
+
+void fillDoubleValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::DoubleVectorBatch* dblBatch =
+ dynamic_cast<orc::DoubleVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ dblBatch->data[i] = atof(col.c_str());
+ }
+ }
+ dblBatch->hasNulls = hasNull;
+ dblBatch->numElements = numValues;
+}
+
+// parse fixed point decimal numbers
+void fillDecimalValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex,
+ size_t scale,
+ size_t precision) {
+
+
+ orc::Decimal128VectorBatch* d128Batch = ORC_NULLPTR;
+ orc::Decimal64VectorBatch* d64Batch = ORC_NULLPTR;
+ if (precision <= 18) {
+ d64Batch = dynamic_cast<orc::Decimal64VectorBatch*>(batch);
+ d64Batch->scale = static_cast<int32_t>(scale);
+ } else {
+ d128Batch = dynamic_cast<orc::Decimal128VectorBatch*>(batch);
+ d128Batch->scale = static_cast<int32_t>(scale);
+ }
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ size_t ptPos = col.find('.');
+ size_t curScale = 0;
+ std::string num = col;
+ if (ptPos != std::string::npos) {
+ curScale = col.length() - ptPos - 1;
+ num = col.substr(0, ptPos) + col.substr(ptPos + 1);
+ }
+ orc::Int128 decimal(num);
+ while (curScale != scale) {
+ curScale++;
+ decimal *= 10;
+ }
+ if (precision <= 18) {
+ d64Batch->values[i] = decimal.toLong();
+ } else {
+ d128Batch->values[i] = decimal;
+ }
+ }
+ }
+ batch->hasNulls = hasNull;
+ batch->numElements = numValues;
+}
+
+void fillBoolValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* boolBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ std::transform(col.begin(), col.end(), col.begin(), ::tolower);
+ if (col == "true" || col == "t") {
+ boolBatch->data[i] = true;
+ } else {
+ boolBatch->data[i] = false;
+ }
+ }
+ }
+ boolBatch->hasNulls = hasNull;
+ boolBatch->numElements = numValues;
+}
+
+// parse date string from format YYYY-mm-dd
+void fillDateValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* longBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ struct tm tm;
+ memset(&tm, 0, sizeof(struct tm));
+ strptime(col.c_str(), "%Y-%m-%d", &tm);
+ time_t t = mktime(&tm);
+ time_t t1970 = 0;
+ double seconds = difftime(t, t1970);
+ int64_t days = static_cast<int64_t>(seconds / (60*60*24));
+ longBatch->data[i] = days;
+ }
+ }
+ longBatch->hasNulls = hasNull;
+ longBatch->numElements = numValues;
+}
+
+// parse timestamp values in seconds
+void fillTimestampValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::TimestampVectorBatch* tsBatch =
+ dynamic_cast<orc::TimestampVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ tsBatch->data[i] = atoll(col.c_str());
+ tsBatch->nanoseconds[i] = 0;
+ }
+ }
+ tsBatch->hasNulls = hasNull;
+ tsBatch->numElements = numValues;
+}
+
+void usage() {
+ std::cout << "Usage: csv-import --input <input file> --output <output> "
--- End diff --
We need to include the short option names too:
```
std::cout << "Usage: csv-import [-h] [--help]\n"
<< " [-d<character>] [--delimiter=<character>]\n"
<< " [-s<size>] [--stripe=<size>]\n"
<< " [-c<size>] [--block=<size>]\n"
<< " [-b<size>] [--batch=<size>]\n"
<< " <schema> <input> <output>\n"
<< "Import CSV file into an Orc file using the specified schema.\n"
<< "Compound types are not yet supported.\n";
```
---
[GitHub] orc pull request #199: ORC-276: [C++] Create a simple tool to import CSV fil...
Posted by xndai <gi...@git.apache.org>.
Github user xndai commented on a diff in the pull request:
https://github.com/apache/orc/pull/199#discussion_r155594426
--- Diff: tools/src/CSVFileImport.cc ---
@@ -0,0 +1,411 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "orc/OrcFile.hh"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <sys/time.h>
+#include <time.h>
+
+#define DELIMITER ','
--- End diff --
I think it will be very helpful to parameterize the delimiter.
---
[GitHub] orc pull request #199: ORC-276: [C++] Create a simple tool to import CSV fil...
Posted by wgtmac <gi...@git.apache.org>.
Github user wgtmac commented on a diff in the pull request:
https://github.com/apache/orc/pull/199#discussion_r158342083
--- Diff: tools/src/CSVFileImport.cc ---
@@ -0,0 +1,476 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "orc/OrcFile.hh"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <getopt.h>
+#include <string>
+#include <sys/time.h>
+#include <time.h>
+
+static char gDelimiter = ',';
+
+// extract one column raw text from one line
+std::string extractColumn(std::string s, uint64_t colIndex) {
+ uint64_t col = 0;
+ size_t start = 0;
+ size_t end = s.find(gDelimiter);
+ while (col < colIndex && end != std::string::npos) {
+ start = end + 1;
+ end = s.find(gDelimiter, start);
+ ++col;
+ }
+ return col == colIndex ? s.substr(start, end - start) : "";
+}
+
+static const char* GetDate(void) {
+ static char buf[200];
+ time_t t = time(NULL);
+ struct tm* p = localtime(&t);
+ strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", p);
+ return buf;
+}
+
+void fillLongValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* longBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ longBatch->data[i] = atoll(col.c_str());
+ }
+ }
+ longBatch->hasNulls = hasNull;
+ longBatch->numElements = numValues;
+}
+
+void fillStringValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex,
+ orc::DataBuffer<char>& buffer,
+ uint64_t& offset) {
+ orc::StringVectorBatch* stringBatch =
+ dynamic_cast<orc::StringVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ if (buffer.size() - offset < col.size()) {
+ buffer.reserve(buffer.size() * 2);
+ }
+ memcpy(buffer.data() + offset,
+ col.c_str(),
+ col.size());
+ stringBatch->data[i] = buffer.data() + offset;
+ stringBatch->length[i] = static_cast<int64_t>(col.size());
+ offset += col.size();
+ }
+ }
+ stringBatch->hasNulls = hasNull;
+ stringBatch->numElements = numValues;
+}
+
+void fillDoubleValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::DoubleVectorBatch* dblBatch =
+ dynamic_cast<orc::DoubleVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ dblBatch->data[i] = atof(col.c_str());
+ }
+ }
+ dblBatch->hasNulls = hasNull;
+ dblBatch->numElements = numValues;
+}
+
+// parse fixed point decimal numbers
+void fillDecimalValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex,
+ size_t scale,
+ size_t precision) {
+
+
+ orc::Decimal128VectorBatch* d128Batch = ORC_NULLPTR;
+ orc::Decimal64VectorBatch* d64Batch = ORC_NULLPTR;
+ if (precision <= 18) {
+ d64Batch = dynamic_cast<orc::Decimal64VectorBatch*>(batch);
+ d64Batch->scale = static_cast<int32_t>(scale);
+ } else {
+ d128Batch = dynamic_cast<orc::Decimal128VectorBatch*>(batch);
+ d128Batch->scale = static_cast<int32_t>(scale);
+ }
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ size_t ptPos = col.find('.');
+ size_t curScale = 0;
+ std::string num = col;
+ if (ptPos != std::string::npos) {
+ curScale = col.length() - ptPos - 1;
+ num = col.substr(0, ptPos) + col.substr(ptPos + 1);
+ }
+ orc::Int128 decimal(num);
+ while (curScale != scale) {
+ curScale++;
+ decimal *= 10;
+ }
+ if (precision <= 18) {
+ d64Batch->values[i] = decimal.toLong();
+ } else {
+ d128Batch->values[i] = decimal;
+ }
+ }
+ }
+ batch->hasNulls = hasNull;
+ batch->numElements = numValues;
+}
+
+void fillBoolValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* boolBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ std::transform(col.begin(), col.end(), col.begin(), ::tolower);
+ if (col == "true" || col == "t") {
+ boolBatch->data[i] = true;
+ } else {
+ boolBatch->data[i] = false;
+ }
+ }
+ }
+ boolBatch->hasNulls = hasNull;
+ boolBatch->numElements = numValues;
+}
+
+// parse date string from format YYYY-mm-dd
+void fillDateValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* longBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ struct tm tm;
+ memset(&tm, 0, sizeof(struct tm));
+ strptime(col.c_str(), "%Y-%m-%d", &tm);
+ time_t t = mktime(&tm);
+ time_t t1970 = 0;
+ double seconds = difftime(t, t1970);
+ int64_t days = static_cast<int64_t>(seconds / (60*60*24));
+ longBatch->data[i] = days;
+ }
+ }
+ longBatch->hasNulls = hasNull;
+ longBatch->numElements = numValues;
+}
+
+// parse timestamp values in seconds
+void fillTimestampValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::TimestampVectorBatch* tsBatch =
+ dynamic_cast<orc::TimestampVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ tsBatch->data[i] = atoll(col.c_str());
+ tsBatch->nanoseconds[i] = 0;
+ }
+ }
+ tsBatch->hasNulls = hasNull;
+ tsBatch->numElements = numValues;
+}
+
+void usage() {
+ std::cout << "Usage: csv-import --input <input file> --output <output> "
--- End diff --
Done. Thanks!
---
[GitHub] orc pull request #199: ORC-276: [C++] Create a simple tool to import CSV fil...
Posted by xndai <gi...@git.apache.org>.
Github user xndai commented on a diff in the pull request:
https://github.com/apache/orc/pull/199#discussion_r155593186
--- Diff: tools/src/CSVFileImport.cc ---
@@ -0,0 +1,411 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "orc/OrcFile.hh"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <sys/time.h>
+#include <time.h>
+
+#define DELIMITER ','
+
+std::string extractColumn(std::string s, uint64_t colIndex) {
+ uint64_t col = 0;
+ size_t start = 0;
+ size_t end = s.find(DELIMITER);
+ while (col < colIndex && end != std::string::npos) {
+ start = end + 1;
+ end = s.find(DELIMITER, start);
+ ++col;
+ }
+ return s.substr(start, end - start);
+}
+
+static const char* GetDate(void)
+{
+ static char buf[200];
+ time_t t = time(NULL);
+ struct tm* p = localtime(&t);
+ strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", p);
+ return buf;
+}
+
+void fillLongValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* longBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ longBatch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ longBatch->data[i] = atoll(col.c_str());
+ }
+ }
+ longBatch->hasNulls = hasNull;
+ longBatch->numElements = numValues;
+}
+
+void fillStringValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex,
+ orc::DataBuffer<char>& buffer,
+ uint64_t& offset) {
+ orc::StringVectorBatch* stringBatch =
+ dynamic_cast<orc::StringVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ stringBatch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ memcpy(buffer.data() + offset,
+ col.c_str(),
+ col.size());
+ stringBatch->data[i] = buffer.data() + offset;
+ stringBatch->length[i] = static_cast<int64_t>(col.size());
+ offset += col.size();
+ }
+ }
+ stringBatch->hasNulls = hasNull;
+ stringBatch->numElements = numValues;
+}
+
+void fillDoubleValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::DoubleVectorBatch* dblBatch =
+ dynamic_cast<orc::DoubleVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ dblBatch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ dblBatch->data[i] = atof(col.c_str());
+ }
+ }
+ dblBatch->hasNulls = hasNull;
+ dblBatch->numElements = numValues;
+}
+
+// parse fixed point decimal numbers
+void fillDecimalValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex,
+ size_t scale,
+ size_t precision) {
+
+
+ orc::Decimal128VectorBatch* d128Batch = ORC_NULLPTR;
+ orc::Decimal64VectorBatch* d64Batch = ORC_NULLPTR;
+ if (precision <= 18) {
+ d64Batch = dynamic_cast<orc::Decimal64VectorBatch*>(batch);
+ d64Batch->scale = static_cast<int32_t>(scale);
+ } else {
+ d128Batch = dynamic_cast<orc::Decimal128VectorBatch*>(batch);
+ d128Batch->scale = static_cast<int32_t>(scale);
+ }
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ size_t ptPos = col.find('.');
+ size_t curScale = 0;
+ std::string num = col;
+ if (ptPos != std::string::npos) {
+ curScale = col.length() - ptPos - 1;
+ num = col.substr(0, ptPos) + col.substr(ptPos + 1);
+ }
+ orc::Int128 decimal(num);
+ while (curScale != scale) {
+ curScale++;
+ decimal *= 10;
+ }
+ if (precision <= 18) {
+ d64Batch->values[i] = decimal.toLong();
+ } else {
+ d128Batch->values[i] = decimal;
+ }
+ }
+ }
+ batch->hasNulls = hasNull;
+ batch->numElements = numValues;
+}
+
+void fillBoolValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* boolBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ boolBatch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ std::transform(col.begin(), col.end(), col.begin(), ::tolower);
+ if (col == "true" || col == "t") {
+ boolBatch->data[i] = true;
+ } else {
+ boolBatch->data[i] = false;
+ }
+ }
+ }
+ boolBatch->hasNulls = hasNull;
+ boolBatch->numElements = numValues;
+}
+
+// parse date string from format YYYY-MM-dd
+void fillDateValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* longBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ longBatch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ struct tm tm;
+ memset(&tm, 0, sizeof(struct tm));
+ strptime(col.c_str(), "%Y-%m-%d", &tm);
+ time_t t = mktime(&tm);
+ time_t t1970 = 0;
+ double seconds = difftime(t, t1970);
+ int64_t days = static_cast<int64_t>(seconds / (60*60*24));
+ longBatch->data[i] = days;
+ }
+ }
+ longBatch->hasNulls = hasNull;
+ longBatch->numElements = numValues;
+}
+
+// parse timestamp values in seconds
+void fillTimestampValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::TimestampVectorBatch* tsBatch =
+ dynamic_cast<orc::TimestampVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ tsBatch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ tsBatch->data[i] = atoll(col.c_str());
+ tsBatch->nanoseconds[i] = 0;
+ }
+ }
+ tsBatch->hasNulls = hasNull;
+ tsBatch->numElements = numValues;
+}
+
+void usage() {
+ std::cout << "Usage: csv-import <input> <output> --schema=<file schema>"
+ << "Import CSV file into an Orc file using the specified schema.\n";
+}
+
+int main(int argc, char* argv[]) {
+ if (argc != 4) {
+ std::cout << "Invalid number of arguments." << std::endl;
+ usage();
+ return 1;
+ }
+
+ std::string input = argv[1];
+ std::string output = argv[2];
+ std::string schema = argv[3];
+
+ const std::string SCHEMA_PREFIX = "--schema=";
+ ORC_UNIQUE_PTR<orc::Type> fileType = ORC_NULLPTR;
+ if (schema.find(SCHEMA_PREFIX) != 0) {
+ std::cout << "Cannot find " << SCHEMA_PREFIX << " argument." << std::endl;
+ usage();
+ return 1;
+ } else {
+ fileType = orc::Type::buildTypeFromString(schema.substr(SCHEMA_PREFIX.size()));
+ }
+
+ std::cout << GetDate() << "Start importing Orc file..." << std::endl;
+
+ double totalElapsedTime = 0.0;
+ double totalCPUTime = 0.0;
+
+ orc::DataBuffer<char> buffer(*orc::getDefaultPool());
+ buffer.resize(4 * 1024 * 1024);
+
+ // set ORC writer options here
+ uint64_t stripeSize = (36 << 20); // 36M
--- End diff --
36M as the default stripe size seems too small. We should probably use 128M, 196M or 256M.
---
[GitHub] orc issue #199: ORC-276: [C++] Create a simple tool to import CSV files
Posted by xndai <gi...@git.apache.org>.
Github user xndai commented on the issue:
https://github.com/apache/orc/pull/199
LGTM. Thanks!
---
[GitHub] orc pull request #199: ORC-276: [C++] Create a simple tool to import CSV fil...
Posted by xndai <gi...@git.apache.org>.
Github user xndai commented on a diff in the pull request:
https://github.com/apache/orc/pull/199#discussion_r155597298
--- Diff: tools/src/CSVFileImport.cc ---
@@ -0,0 +1,411 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "orc/OrcFile.hh"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <sys/time.h>
+#include <time.h>
+
+#define DELIMITER ','
+
+std::string extractColumn(std::string s, uint64_t colIndex) {
+ uint64_t col = 0;
+ size_t start = 0;
+ size_t end = s.find(DELIMITER);
+ while (col < colIndex && end != std::string::npos) {
+ start = end + 1;
+ end = s.find(DELIMITER, start);
+ ++col;
+ }
+ return s.substr(start, end - start);
+}
+
+static const char* GetDate(void)
+{
+ static char buf[200];
+ time_t t = time(NULL);
+ struct tm* p = localtime(&t);
+ strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", p);
+ return buf;
+}
+
+void fillLongValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* longBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ longBatch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ longBatch->data[i] = atoll(col.c_str());
+ }
+ }
+ longBatch->hasNulls = hasNull;
+ longBatch->numElements = numValues;
+}
+
+void fillStringValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex,
+ orc::DataBuffer<char>& buffer,
+ uint64_t& offset) {
+ orc::StringVectorBatch* stringBatch =
+ dynamic_cast<orc::StringVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ stringBatch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ memcpy(buffer.data() + offset,
--- End diff --
We might want to resize the buffer again when we run out of the initial 4M of space. Or at least we should raise an exception so we don't overflow the buffer.
---
[GitHub] orc pull request #199: ORC-276: [C++] Create a simple tool to import CSV fil...
Posted by xndai <gi...@git.apache.org>.
Github user xndai commented on a diff in the pull request:
https://github.com/apache/orc/pull/199#discussion_r155592825
--- Diff: tools/src/CSVFileImport.cc ---
@@ -0,0 +1,411 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "orc/OrcFile.hh"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <sys/time.h>
+#include <time.h>
+
+#define DELIMITER ','
+
+std::string extractColumn(std::string s, uint64_t colIndex) {
+ uint64_t col = 0;
+ size_t start = 0;
+ size_t end = s.find(DELIMITER);
+ while (col < colIndex && end != std::string::npos) {
+ start = end + 1;
+ end = s.find(DELIMITER, start);
+ ++col;
+ }
+ return s.substr(start, end - start);
+}
+
+static const char* GetDate(void)
+{
+ static char buf[200];
+ time_t t = time(NULL);
+ struct tm* p = localtime(&t);
+ strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", p);
+ return buf;
+}
+
+void fillLongValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* longBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ longBatch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ longBatch->data[i] = atoll(col.c_str());
+ }
+ }
+ longBatch->hasNulls = hasNull;
+ longBatch->numElements = numValues;
+}
+
+void fillStringValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex,
+ orc::DataBuffer<char>& buffer,
+ uint64_t& offset) {
+ orc::StringVectorBatch* stringBatch =
+ dynamic_cast<orc::StringVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ stringBatch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ memcpy(buffer.data() + offset,
+ col.c_str(),
+ col.size());
+ stringBatch->data[i] = buffer.data() + offset;
+ stringBatch->length[i] = static_cast<int64_t>(col.size());
+ offset += col.size();
+ }
+ }
+ stringBatch->hasNulls = hasNull;
+ stringBatch->numElements = numValues;
+}
+
+void fillDoubleValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::DoubleVectorBatch* dblBatch =
+ dynamic_cast<orc::DoubleVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ dblBatch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ dblBatch->data[i] = atof(col.c_str());
+ }
+ }
+ dblBatch->hasNulls = hasNull;
+ dblBatch->numElements = numValues;
+}
+
+// parse fixed point decimal numbers
+void fillDecimalValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex,
+ size_t scale,
+ size_t precision) {
+
+
+ orc::Decimal128VectorBatch* d128Batch = ORC_NULLPTR;
+ orc::Decimal64VectorBatch* d64Batch = ORC_NULLPTR;
+ if (precision <= 18) {
+ d64Batch = dynamic_cast<orc::Decimal64VectorBatch*>(batch);
+ d64Batch->scale = static_cast<int32_t>(scale);
+ } else {
+ d128Batch = dynamic_cast<orc::Decimal128VectorBatch*>(batch);
+ d128Batch->scale = static_cast<int32_t>(scale);
+ }
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ size_t ptPos = col.find('.');
+ size_t curScale = 0;
+ std::string num = col;
+ if (ptPos != std::string::npos) {
+ curScale = col.length() - ptPos - 1;
+ num = col.substr(0, ptPos) + col.substr(ptPos + 1);
+ }
+ orc::Int128 decimal(num);
+ while (curScale != scale) {
+ curScale++;
+ decimal *= 10;
+ }
+ if (precision <= 18) {
+ d64Batch->values[i] = decimal.toLong();
+ } else {
+ d128Batch->values[i] = decimal;
+ }
+ }
+ }
+ batch->hasNulls = hasNull;
+ batch->numElements = numValues;
+}
+
+void fillBoolValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* boolBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ boolBatch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ std::transform(col.begin(), col.end(), col.begin(), ::tolower);
+ if (col == "true" || col == "t") {
+ boolBatch->data[i] = true;
+ } else {
+ boolBatch->data[i] = false;
+ }
+ }
+ }
+ boolBatch->hasNulls = hasNull;
+ boolBatch->numElements = numValues;
+}
+
+// parse date string from format YYYY-MM-dd
+void fillDateValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* longBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ longBatch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ struct tm tm;
+ memset(&tm, 0, sizeof(struct tm));
+ strptime(col.c_str(), "%Y-%m-%d", &tm);
+ time_t t = mktime(&tm);
+ time_t t1970 = 0;
+ double seconds = difftime(t, t1970);
+ int64_t days = static_cast<int64_t>(seconds / (60*60*24));
+ longBatch->data[i] = days;
+ }
+ }
+ longBatch->hasNulls = hasNull;
+ longBatch->numElements = numValues;
+}
+
+// parse timestamp values in seconds
+void fillTimestampValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::TimestampVectorBatch* tsBatch =
+ dynamic_cast<orc::TimestampVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ tsBatch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ tsBatch->data[i] = atoll(col.c_str());
+ tsBatch->nanoseconds[i] = 0;
+ }
+ }
+ tsBatch->hasNulls = hasNull;
+ tsBatch->numElements = numValues;
+}
+
+void usage() {
+ std::cout << "Usage: csv-import <input> <output> --schema=<file schema>"
+ << "Import CSV file into an Orc file using the specified schema.\n";
--- End diff --
Better to call out that complex types are not currently supported.
---
[GitHub] orc pull request #199: ORC-276: [C++] Create a simple tool to import CSV fil...
Posted by asfgit <gi...@git.apache.org>.
Github user asfgit closed the pull request at:
https://github.com/apache/orc/pull/199
---
[GitHub] orc pull request #199: ORC-276: [C++] Create a simple tool to import CSV fil...
Posted by majetideepak <gi...@git.apache.org>.
Github user majetideepak commented on a diff in the pull request:
https://github.com/apache/orc/pull/199#discussion_r155629237
--- Diff: tools/src/CSVFileImport.cc ---
@@ -0,0 +1,436 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "orc/OrcFile.hh"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <sys/time.h>
+#include <time.h>
+
+static char gDelimiter = ',';
+
+std::string extractColumn(std::string s, uint64_t colIndex) {
+ uint64_t col = 0;
+ size_t start = 0;
+ size_t end = s.find(gDelimiter);
+ while (col < colIndex && end != std::string::npos) {
+ start = end + 1;
+ end = s.find(gDelimiter, start);
+ ++col;
+ }
+ return col == colIndex ? s.substr(start, end - start) : "";
+}
+
+static const char* GetDate(void)
+{
+ static char buf[200];
+ time_t t = time(NULL);
+ struct tm* p = localtime(&t);
+ strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", p);
+ return buf;
+}
+
+void fillLongValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* longBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ longBatch->data[i] = atoll(col.c_str());
+ }
+ }
+ longBatch->hasNulls = hasNull;
+ longBatch->numElements = numValues;
+}
+
+void fillStringValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex,
+ orc::DataBuffer<char>& buffer,
+ uint64_t& offset) {
+ orc::StringVectorBatch* stringBatch =
+ dynamic_cast<orc::StringVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ if (buffer.size() - offset < col.size()) {
+ buffer.reserve(buffer.size() * 2);
+ }
+ memcpy(buffer.data() + offset,
+ col.c_str(),
+ col.size());
+ stringBatch->data[i] = buffer.data() + offset;
+ stringBatch->length[i] = static_cast<int64_t>(col.size());
+ offset += col.size();
+ }
+ }
+ stringBatch->hasNulls = hasNull;
+ stringBatch->numElements = numValues;
+}
+
+void fillDoubleValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::DoubleVectorBatch* dblBatch =
+ dynamic_cast<orc::DoubleVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ dblBatch->data[i] = atof(col.c_str());
+ }
+ }
+ dblBatch->hasNulls = hasNull;
+ dblBatch->numElements = numValues;
+}
+
+// parse fixed point decimal numbers
+void fillDecimalValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex,
+ size_t scale,
+ size_t precision) {
+
+
+ orc::Decimal128VectorBatch* d128Batch = ORC_NULLPTR;
+ orc::Decimal64VectorBatch* d64Batch = ORC_NULLPTR;
+ if (precision <= 18) {
+ d64Batch = dynamic_cast<orc::Decimal64VectorBatch*>(batch);
+ d64Batch->scale = static_cast<int32_t>(scale);
+ } else {
+ d128Batch = dynamic_cast<orc::Decimal128VectorBatch*>(batch);
+ d128Batch->scale = static_cast<int32_t>(scale);
+ }
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ size_t ptPos = col.find('.');
+ size_t curScale = 0;
+ std::string num = col;
+ if (ptPos != std::string::npos) {
+ curScale = col.length() - ptPos - 1;
+ num = col.substr(0, ptPos) + col.substr(ptPos + 1);
+ }
+ orc::Int128 decimal(num);
+ while (curScale != scale) {
+ curScale++;
+ decimal *= 10;
+ }
+ if (precision <= 18) {
+ d64Batch->values[i] = decimal.toLong();
+ } else {
+ d128Batch->values[i] = decimal;
+ }
+ }
+ }
+ batch->hasNulls = hasNull;
+ batch->numElements = numValues;
+}
+
+void fillBoolValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* boolBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ std::transform(col.begin(), col.end(), col.begin(), ::tolower);
+ if (col == "true" || col == "t") {
+ boolBatch->data[i] = true;
+ } else {
+ boolBatch->data[i] = false;
+ }
+ }
+ }
+ boolBatch->hasNulls = hasNull;
+ boolBatch->numElements = numValues;
+}
+
+// parse date string from format YYYY-MM-dd
+void fillDateValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* longBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ struct tm tm;
+ memset(&tm, 0, sizeof(struct tm));
+ strptime(col.c_str(), "%Y-%m-%d", &tm);
+ time_t t = mktime(&tm);
+ time_t t1970 = 0;
+ double seconds = difftime(t, t1970);
+ int64_t days = static_cast<int64_t>(seconds / (60*60*24));
+ longBatch->data[i] = days;
+ }
+ }
+ longBatch->hasNulls = hasNull;
+ longBatch->numElements = numValues;
+}
+
+// parse timestamp values in seconds
+void fillTimestampValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::TimestampVectorBatch* tsBatch =
+ dynamic_cast<orc::TimestampVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ tsBatch->data[i] = atoll(col.c_str());
+ tsBatch->nanoseconds[i] = 0;
+ }
+ }
+ tsBatch->hasNulls = hasNull;
+ tsBatch->numElements = numValues;
+}
+
+void usage() {
+ std::cout << "Usage: csv-import <input> <output> --schema=<file schema>"
+ << " [--delimiter=<delimiter character>]\n"
+ << "Import CSV file into an Orc file using the specified schema.\n"
+ << "Compound types are not supported at the moment.\n";
+}
+
+int main(int argc, char* argv[]) {
+ if (argc < 4) {
+ std::cout << "Invalid number of arguments." << std::endl;
+ usage();
+ return 1;
+ }
+
+ std::string input = argv[1];
+ std::string output = argv[2];
+ std::string schema = argv[3];
+
+ const std::string SCHEMA_PREFIX = "--schema=";
+ ORC_UNIQUE_PTR<orc::Type> fileType = ORC_NULLPTR;
+ if (schema.find(SCHEMA_PREFIX) != 0) {
+ std::cout << "Cannot find " << SCHEMA_PREFIX << " argument." << std::endl;
+ usage();
+ return 1;
+ } else {
+ fileType = orc::Type::buildTypeFromString(schema.substr(SCHEMA_PREFIX.size()));
+ }
+
+ if (argc > 4) {
--- End diff --
Can you handle options like other tools via the `static struct option longOptions[]`?
---
[GitHub] orc issue #199: ORC-276: [C++] Create a simple tool to import CSV files
Posted by wgtmac <gi...@git.apache.org>.
Github user wgtmac commented on the issue:
https://github.com/apache/orc/pull/199
@majetideepak Tests have been added; please review them when you get the chance. Thanks!
---
[GitHub] orc pull request #199: ORC-276: [C++] Create a simple tool to import CSV fil...
Posted by xndai <gi...@git.apache.org>.
Github user xndai commented on a diff in the pull request:
https://github.com/apache/orc/pull/199#discussion_r155596136
--- Diff: tools/src/CSVFileImport.cc ---
@@ -0,0 +1,411 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "orc/OrcFile.hh"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <sys/time.h>
+#include <time.h>
+
+#define DELIMITER ','
+
+std::string extractColumn(std::string s, uint64_t colIndex) {
+ uint64_t col = 0;
+ size_t start = 0;
+ size_t end = s.find(DELIMITER);
+ while (col < colIndex && end != std::string::npos) {
+ start = end + 1;
+ end = s.find(DELIMITER, start);
+ ++col;
+ }
+ return s.substr(start, end - start);
--- End diff --
You will subtract from string::npos when the last column doesn't end with a delimiter or the number of columns in the CSV is less than what is specified in the schema. We need to handle these cases better.
---
[GitHub] orc issue #199: ORC-276: [C++] Create a simple tool to import CSV files
Posted by majetideepak <gi...@git.apache.org>.
Github user majetideepak commented on the issue:
https://github.com/apache/orc/pull/199
Looks good! Thank you!
---
[GitHub] orc issue #199: ORC-276: [C++] Create a simple tool to import CSV files
Posted by xndai <gi...@git.apache.org>.
Github user xndai commented on the issue:
https://github.com/apache/orc/pull/199
Hi @majetideepak, Gang is on vacation and will look into your feedback after he's back next week. Thx.
---
[GitHub] orc pull request #199: ORC-276: [C++] Create a simple tool to import CSV fil...
Posted by majetideepak <gi...@git.apache.org>.
Github user majetideepak commented on a diff in the pull request:
https://github.com/apache/orc/pull/199#discussion_r155995744
--- Diff: tools/src/CSVFileImport.cc ---
@@ -0,0 +1,436 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "orc/OrcFile.hh"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <sys/time.h>
+#include <time.h>
+
+static char gDelimiter = ',';
+
+std::string extractColumn(std::string s, uint64_t colIndex) {
+ uint64_t col = 0;
+ size_t start = 0;
+ size_t end = s.find(gDelimiter);
+ while (col < colIndex && end != std::string::npos) {
+ start = end + 1;
+ end = s.find(gDelimiter, start);
+ ++col;
+ }
+ return col == colIndex ? s.substr(start, end - start) : "";
+}
+
+static const char* GetDate(void)
+{
+ static char buf[200];
+ time_t t = time(NULL);
+ struct tm* p = localtime(&t);
+ strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", p);
+ return buf;
+}
+
+void fillLongValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* longBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ longBatch->data[i] = atoll(col.c_str());
+ }
+ }
+ longBatch->hasNulls = hasNull;
+ longBatch->numElements = numValues;
+}
+
+void fillStringValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex,
+ orc::DataBuffer<char>& buffer,
+ uint64_t& offset) {
+ orc::StringVectorBatch* stringBatch =
+ dynamic_cast<orc::StringVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ if (buffer.size() - offset < col.size()) {
+ buffer.reserve(buffer.size() * 2);
+ }
+ memcpy(buffer.data() + offset,
+ col.c_str(),
+ col.size());
+ stringBatch->data[i] = buffer.data() + offset;
+ stringBatch->length[i] = static_cast<int64_t>(col.size());
+ offset += col.size();
+ }
+ }
+ stringBatch->hasNulls = hasNull;
+ stringBatch->numElements = numValues;
+}
+
+void fillDoubleValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::DoubleVectorBatch* dblBatch =
+ dynamic_cast<orc::DoubleVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ dblBatch->data[i] = atof(col.c_str());
+ }
+ }
+ dblBatch->hasNulls = hasNull;
+ dblBatch->numElements = numValues;
+}
+
+// parse fixed point decimal numbers
+void fillDecimalValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex,
+ size_t scale,
+ size_t precision) {
+
+
+ orc::Decimal128VectorBatch* d128Batch = ORC_NULLPTR;
+ orc::Decimal64VectorBatch* d64Batch = ORC_NULLPTR;
+ if (precision <= 18) {
+ d64Batch = dynamic_cast<orc::Decimal64VectorBatch*>(batch);
+ d64Batch->scale = static_cast<int32_t>(scale);
+ } else {
+ d128Batch = dynamic_cast<orc::Decimal128VectorBatch*>(batch);
+ d128Batch->scale = static_cast<int32_t>(scale);
+ }
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ size_t ptPos = col.find('.');
+ size_t curScale = 0;
+ std::string num = col;
+ if (ptPos != std::string::npos) {
+ curScale = col.length() - ptPos - 1;
+ num = col.substr(0, ptPos) + col.substr(ptPos + 1);
+ }
+ orc::Int128 decimal(num);
+ while (curScale != scale) {
+ curScale++;
+ decimal *= 10;
+ }
+ if (precision <= 18) {
+ d64Batch->values[i] = decimal.toLong();
+ } else {
+ d128Batch->values[i] = decimal;
+ }
+ }
+ }
+ batch->hasNulls = hasNull;
+ batch->numElements = numValues;
+}
+
+void fillBoolValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* boolBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ std::transform(col.begin(), col.end(), col.begin(), ::tolower);
+ if (col == "true" || col == "t") {
+ boolBatch->data[i] = true;
+ } else {
+ boolBatch->data[i] = false;
+ }
+ }
+ }
+ boolBatch->hasNulls = hasNull;
+ boolBatch->numElements = numValues;
+}
+
+// parse date string from format YYYY-MM-dd
+void fillDateValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::LongVectorBatch* longBatch =
+ dynamic_cast<orc::LongVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ struct tm tm;
+ memset(&tm, 0, sizeof(struct tm));
+ strptime(col.c_str(), "%Y-%m-%d", &tm);
+ time_t t = mktime(&tm);
+ time_t t1970 = 0;
+ double seconds = difftime(t, t1970);
+ int64_t days = static_cast<int64_t>(seconds / (60*60*24));
+ longBatch->data[i] = days;
+ }
+ }
+ longBatch->hasNulls = hasNull;
+ longBatch->numElements = numValues;
+}
+
+// parse timestamp values in seconds
+void fillTimestampValues(const std::vector<std::string>& data,
+ orc::ColumnVectorBatch* batch,
+ uint64_t numValues,
+ uint64_t colIndex) {
+ orc::TimestampVectorBatch* tsBatch =
+ dynamic_cast<orc::TimestampVectorBatch*>(batch);
+ bool hasNull = false;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ std::string col = extractColumn(data[i], colIndex);
+ if (col.empty()) {
+ batch->notNull[i] = 0;
+ hasNull = true;
+ } else {
+ batch->notNull[i] = 1;
+ tsBatch->data[i] = atoll(col.c_str());
+ tsBatch->nanoseconds[i] = 0;
+ }
+ }
+ tsBatch->hasNulls = hasNull;
+ tsBatch->numElements = numValues;
+}
+
+void usage() {
+ std::cout << "Usage: csv-import <input> <output> --schema=<file schema>"
+ << " [--delimiter=<delimiter character>]\n"
+ << "Import CSV file into an Orc file using the specified schema.\n"
+ << "Compound types are not supported at the moment.\n";
+}
+
+int main(int argc, char* argv[]) {
+ if (argc < 4) {
+ std::cout << "Invalid number of arguments." << std::endl;
+ usage();
+ return 1;
+ }
+
+ std::string input = argv[1];
+ std::string output = argv[2];
+ std::string schema = argv[3];
+
+ const std::string SCHEMA_PREFIX = "--schema=";
+ ORC_UNIQUE_PTR<orc::Type> fileType = ORC_NULLPTR;
+ if (schema.find(SCHEMA_PREFIX) != 0) {
+ std::cout << "Cannot find " << SCHEMA_PREFIX << " argument." << std::endl;
+ usage();
+ return 1;
+ } else {
+ fileType = orc::Type::buildTypeFromString(schema.substr(SCHEMA_PREFIX.size()));
+ }
+
+ if (argc > 4) {
+ std::string delimiter = argv[4];
+ const std::string DELIMITER_PREFIX = "--delimiter=";
+ if (delimiter.find(DELIMITER_PREFIX) != 0) {
+ std::cout << "Cannot find " << DELIMITER_PREFIX << " argument." << std::endl;
+ usage();
+ return 1;
+ } else {
+ gDelimiter = delimiter.substr(DELIMITER_PREFIX.size())[0];
+ }
+ }
+
+ std::cout << GetDate() << "Start importing Orc file..." << std::endl;
+
+ double totalElapsedTime = 0.0;
+ clock_t totalCPUTime = 0;
+
+ orc::DataBuffer<char> buffer(*orc::getDefaultPool());
+ buffer.resize(4 * 1024 * 1024);
+
+ // set ORC writer options here
+ uint64_t stripeSize = (128 << 20); // 128M
+ uint64_t blockSize = 64 << 10; // 64K
+ uint64_t batchSize = 1024;
+ orc::CompressionKind compression = orc::CompressionKind_ZLIB;
+
+ orc::WriterOptions options;
+ options.setStripeSize(stripeSize);
+ options.setCompressionBlockSize(blockSize);
+ options.setCompression(compression);
+
+ ORC_UNIQUE_PTR<orc::OutputStream> outStream = orc::writeLocalFile(output);
+ ORC_UNIQUE_PTR<orc::Writer> writer =
+ orc::createWriter(*fileType, outStream.get(), options);
+ ORC_UNIQUE_PTR<orc::ColumnVectorBatch> rowBatch =
+ writer->createRowBatch(batchSize);
+
+ bool eof = false;
+ std::string line;
+ std::vector<std::string> data;
+ std::ifstream finput(input.c_str());
+ while (!eof) {
--- End diff --
Some code comments will definitely help to extend this in the future.
---
[GitHub] orc pull request #199: ORC-276: [C++] Create a simple tool to import CSV fil...
Posted by majetideepak <gi...@git.apache.org>.
Github user majetideepak commented on a diff in the pull request:
https://github.com/apache/orc/pull/199#discussion_r155629814
--- Diff: tools/src/CSVFileImport.cc ---
@@ -0,0 +1,436 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "orc/OrcFile.hh"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <sys/time.h>
+#include <time.h>
+
+static char gDelimiter = ',';
+
+// Returns the colIndex-th (0-based) field of the line `s`, split on the
+// global gDelimiter. An empty string is returned when the line has fewer
+// than colIndex + 1 fields. Quoted or escaped delimiters are not recognized.
+// The line is taken by const reference so that it is not copied on every
+// call.
+std::string extractColumn(const std::string& s, uint64_t colIndex) {
+  size_t start = 0;
+  size_t end = s.find(gDelimiter);
+  uint64_t col = 0;
+  for (; col < colIndex && end != std::string::npos; ++col) {
+    start = end + 1;
+    end = s.find(gDelimiter, start);
+  }
+  if (col != colIndex) {
+    return std::string();
+  }
+  // When end is npos, substr() clamps the length to the rest of the line.
+  return s.substr(start, end - start);
+}
+
+// Formats the current local time as "[YYYY-MM-DD HH:MM:SS]" for log output.
+// The returned pointer refers to a function-local static buffer that is
+// overwritten by every call. An empty string is returned when the local time
+// cannot be determined or formatted.
+static const char* GetDate(void)
+{
+  static char buf[200];
+  time_t t = time(NULL);
+  struct tm* p = localtime(&t);
+  // localtime() may return NULL and strftime() returns 0 on failure; in both
+  // cases hand back a well-defined empty string.
+  if (p == NULL ||
+      strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", p) == 0) {
+    buf[0] = '\0';
+  }
+  return buf;
+}
+
+// Fills an integer column of the row batch from the CSV lines in `data`.
+//   data       raw CSV lines, one per row
+//   batch      column batch; cast to orc::LongVectorBatch
+//   numValues  number of rows to fill
+//   colIndex   0-based index of the CSV field to read
+// An empty field is recorded as NULL; any other field is converted with
+// atoll().
+void fillLongValues(const std::vector<std::string>& data,
+                    orc::ColumnVectorBatch* batch,
+                    uint64_t numValues,
+                    uint64_t colIndex) {
+  orc::LongVectorBatch* longs = dynamic_cast<orc::LongVectorBatch*>(batch);
+  bool sawNull = false;
+  for (uint64_t row = 0; row < numValues; ++row) {
+    const std::string field = extractColumn(data[row], colIndex);
+    if (field.empty()) {
+      batch->notNull[row] = 0;
+      sawNull = true;
+      continue;
+    }
+    batch->notNull[row] = 1;
+    longs->data[row] = atoll(field.c_str());
+  }
+  longs->hasNulls = sawNull;
+  longs->numElements = numValues;
+}
+
+// Copies the string values of CSV field colIndex for the first numValues
+// lines of `data` into `buffer` and points the string batch at them.
+//   buffer  backing storage for the string bytes; grown when too small
+//   offset  in/out: first unused byte of buffer, advanced past the bytes
+//           written by this call
+// An empty field is recorded as NULL.
+// NOTE(review): growing the buffer may move its storage, which would
+// invalidate pointers stored by earlier calls that used the same buffer
+// (e.g. another string column of the same batch) - confirm how the caller
+// sizes and resets the buffer.
+void fillStringValues(const std::vector<std::string>& data,
+                      orc::ColumnVectorBatch* batch,
+                      uint64_t numValues,
+                      uint64_t colIndex,
+                      orc::DataBuffer<char>& buffer,
+                      uint64_t& offset) {
+  orc::StringVectorBatch* stringBatch =
+    dynamic_cast<orc::StringVectorBatch*>(batch);
+
+  // First pass: extract the fields and add up the bytes they need, so that
+  // the buffer is grown at most once and before any pointer into it is
+  // stored by this call.
+  std::vector<std::string> cols;
+  cols.reserve(static_cast<size_t>(numValues));
+  uint64_t required = offset;
+  for (uint64_t i = 0; i < numValues; ++i) {
+    cols.push_back(extractColumn(data[i], colIndex));
+    required += cols.back().size();
+  }
+  if (buffer.size() < required) {
+    // resize() is used instead of reserve(): reserve() leaves size()
+    // unchanged, so a size()-based free-space check would never see the
+    // growth.
+    uint64_t newSize = std::max<uint64_t>(buffer.size(), 1);
+    while (newSize < required) {
+      newSize *= 2;
+    }
+    buffer.resize(newSize);
+  }
+
+  // Second pass: copy the bytes and record pointer and length per row.
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+    const std::string& col = cols[static_cast<size_t>(i)];
+    if (col.empty()) {
+      batch->notNull[i] = 0;
+      hasNull = true;
+    } else {
+      batch->notNull[i] = 1;
+      char* dest = buffer.data() + offset;
+      memcpy(dest, col.c_str(), col.size());
+      stringBatch->data[i] = dest;
+      stringBatch->length[i] = static_cast<int64_t>(col.size());
+      offset += col.size();
+    }
+  }
+  stringBatch->hasNulls = hasNull;
+  stringBatch->numElements = numValues;
+}
+
+// Fills a floating point column of the row batch from the CSV lines in
+// `data`.
+//   data       raw CSV lines, one per row
+//   batch      column batch; cast to orc::DoubleVectorBatch
+//   numValues  number of rows to fill
+//   colIndex   0-based index of the CSV field to read
+// An empty field is recorded as NULL; any other field is converted with
+// atof().
+void fillDoubleValues(const std::vector<std::string>& data,
+                      orc::ColumnVectorBatch* batch,
+                      uint64_t numValues,
+                      uint64_t colIndex) {
+  orc::DoubleVectorBatch* doubles =
+    dynamic_cast<orc::DoubleVectorBatch*>(batch);
+  bool sawNull = false;
+  for (uint64_t row = 0; row < numValues; ++row) {
+    const std::string field = extractColumn(data[row], colIndex);
+    if (field.empty()) {
+      batch->notNull[row] = 0;
+      sawNull = true;
+      continue;
+    }
+    batch->notNull[row] = 1;
+    doubles->data[row] = atof(field.c_str());
+  }
+  doubles->hasNulls = sawNull;
+  doubles->numElements = numValues;
+}
+
+// Parses fixed point decimal numbers of CSV field colIndex and stores them
+// as unscaled integers: in a Decimal64VectorBatch when precision <= 18, in a
+// Decimal128VectorBatch otherwise.
+//   scale      number of fractional digits stored per value
+//   precision  precision of the column; selects the batch type
+// An empty field is recorded as NULL. Values with fewer fractional digits
+// than `scale` are padded with zeros; surplus fractional digits are
+// truncated.
+void fillDecimalValues(const std::vector<std::string>& data,
+                       orc::ColumnVectorBatch* batch,
+                       uint64_t numValues,
+                       uint64_t colIndex,
+                       size_t scale,
+                       size_t precision) {
+  orc::Decimal128VectorBatch* d128Batch = ORC_NULLPTR;
+  orc::Decimal64VectorBatch* d64Batch = ORC_NULLPTR;
+  if (precision <= 18) {
+    d64Batch = dynamic_cast<orc::Decimal64VectorBatch*>(batch);
+    d64Batch->scale = static_cast<int32_t>(scale);
+  } else {
+    d128Batch = dynamic_cast<orc::Decimal128VectorBatch*>(batch);
+    d128Batch->scale = static_cast<int32_t>(scale);
+  }
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+    std::string col = extractColumn(data[i], colIndex);
+    if (col.empty()) {
+      batch->notNull[i] = 0;
+      hasNull = true;
+    } else {
+      batch->notNull[i] = 1;
+      size_t ptPos = col.find('.');
+      size_t curScale = 0;
+      std::string num = col;
+      if (ptPos != std::string::npos) {
+        // Drop the decimal point and remember how many digits followed it.
+        curScale = col.length() - ptPos - 1;
+        num = col.substr(0, ptPos) + col.substr(ptPos + 1);
+      }
+      if (curScale > scale) {
+        // More fractional digits than the column stores: cut the surplus.
+        // Counting curScale up until it equals scale would not terminate
+        // in any reasonable time here.
+        num.erase(num.length() - (curScale - scale));
+        curScale = scale;
+        if (num.empty() || num == "-" || num == "+") {
+          num = "0";
+        }
+      }
+      orc::Int128 decimal(num);
+      // Pad with zeros up to the scale of the column.
+      for (; curScale < scale; ++curScale) {
+        decimal *= 10;
+      }
+      if (precision <= 18) {
+        d64Batch->values[i] = decimal.toLong();
+      } else {
+        d128Batch->values[i] = decimal;
+      }
+    }
+  }
+  batch->hasNulls = hasNull;
+  batch->numElements = numValues;
+}
+
+// Fills a boolean column of the row batch from the CSV lines in `data`.
+// The batch is cast to orc::LongVectorBatch; "true" and "t" (compared
+// case-insensitively) are stored as 1, every other non-empty field as 0.
+// An empty field is recorded as NULL.
+void fillBoolValues(const std::vector<std::string>& data,
+                    orc::ColumnVectorBatch* batch,
+                    uint64_t numValues,
+                    uint64_t colIndex) {
+  orc::LongVectorBatch* boolBatch =
+    dynamic_cast<orc::LongVectorBatch*>(batch);
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+    std::string col = extractColumn(data[i], colIndex);
+    if (col.empty()) {
+      batch->notNull[i] = 0;
+      hasNull = true;
+    } else {
+      batch->notNull[i] = 1;
+      // tolower() is only defined for values representable as unsigned char
+      // (or EOF), so convert each char before the call.
+      for (std::string::iterator it = col.begin(); it != col.end(); ++it) {
+        *it = static_cast<char>(::tolower(static_cast<unsigned char>(*it)));
+      }
+      boolBatch->data[i] = (col == "true" || col == "t") ? 1 : 0;
+    }
+  }
+  boolBatch->hasNulls = hasNull;
+  boolBatch->numElements = numValues;
+}
+
+// Number of days between 1970-01-01 and the given proleptic Gregorian date
+// (month 1-12, day 1-31); negative for earlier dates. Independent of the
+// local time zone.
+static int64_t daysFromCivil(int64_t year, int64_t month, int64_t day) {
+  // Count years from March so that the leap day is the last day of a year.
+  if (month <= 2) {
+    --year;
+  }
+  const int64_t era = (year >= 0 ? year : year - 399) / 400;
+  const int64_t yearOfEra = year - era * 400;
+  const int64_t dayOfYear =
+    (153 * (month + (month > 2 ? -3 : 9)) + 2) / 5 + day - 1;
+  const int64_t dayOfEra =
+    yearOfEra * 365 + yearOfEra / 4 - yearOfEra / 100 + dayOfYear;
+  // 146097 days per 400-year era; 719468 days from 0000-03-01 to 1970-01-01.
+  return era * 146097 + dayOfEra - 719468;
+}
+
+// Parses date strings of the format YYYY-MM-dd from CSV field colIndex and
+// stores them as days since 1970-01-01 in a LongVectorBatch.
+// An empty field, or one that strptime() cannot parse, is recorded as NULL.
+// The day number is computed from the calendar fields directly instead of
+// going through mktime(), whose result depends on the local time zone.
+void fillDateValues(const std::vector<std::string>& data,
+                    orc::ColumnVectorBatch* batch,
+                    uint64_t numValues,
+                    uint64_t colIndex) {
+  orc::LongVectorBatch* longBatch =
+    dynamic_cast<orc::LongVectorBatch*>(batch);
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+    std::string col = extractColumn(data[i], colIndex);
+    struct tm tm;
+    memset(&tm, 0, sizeof(struct tm));
+    if (col.empty() || strptime(col.c_str(), "%Y-%m-%d", &tm) == NULL) {
+      batch->notNull[i] = 0;
+      hasNull = true;
+    } else {
+      batch->notNull[i] = 1;
+      // struct tm counts years from 1900 and months from 0.
+      longBatch->data[i] =
+        daysFromCivil(tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday);
+    }
+  }
+  longBatch->hasNulls = hasNull;
+  longBatch->numElements = numValues;
+}
+
+// Fills a timestamp column of the row batch from the CSV lines in `data`.
+// Each non-empty field is converted with atoll() and stored in the batch's
+// `data` array (whole seconds, per the original comment on this function);
+// the nanoseconds part is always set to 0. An empty field is recorded as
+// NULL.
+void fillTimestampValues(const std::vector<std::string>& data,
+                         orc::ColumnVectorBatch* batch,
+                         uint64_t numValues,
+                         uint64_t colIndex) {
+  orc::TimestampVectorBatch* timestamps =
+    dynamic_cast<orc::TimestampVectorBatch*>(batch);
+  bool sawNull = false;
+  for (uint64_t row = 0; row < numValues; ++row) {
+    const std::string field = extractColumn(data[row], colIndex);
+    if (field.empty()) {
+      batch->notNull[row] = 0;
+      sawNull = true;
+      continue;
+    }
+    batch->notNull[row] = 1;
+    timestamps->data[row] = atoll(field.c_str());
+    timestamps->nanoseconds[row] = 0;
+  }
+  timestamps->hasNulls = sawNull;
+  timestamps->numElements = numValues;
+}
+
+// Prints the command line synopsis of the tool to standard output.
+void usage() {
+  static const char* const helpText =
+    "Usage: csv-import <input> <output> --schema=<file schema>"
+    " [--delimiter=<delimiter character>]\n"
+    "Import CSV file into an Orc file using the specified schema.\n"
+    "Compound types are not supported at the moment.\n";
+  std::cout << helpText;
+}
+
+int main(int argc, char* argv[]) {
+ if (argc < 4) {
+ std::cout << "Invalid number of arguments." << std::endl;
+ usage();
+ return 1;
+ }
+
+ std::string input = argv[1];
+ std::string output = argv[2];
+ std::string schema = argv[3];
+
+ const std::string SCHEMA_PREFIX = "--schema=";
+ ORC_UNIQUE_PTR<orc::Type> fileType = ORC_NULLPTR;
+ if (schema.find(SCHEMA_PREFIX) != 0) {
+ std::cout << "Cannot find " << SCHEMA_PREFIX << " argument." << std::endl;
+ usage();
+ return 1;
+ } else {
+ fileType = orc::Type::buildTypeFromString(schema.substr(SCHEMA_PREFIX.size()));
+ }
+
+ if (argc > 4) {
+ std::string delimiter = argv[4];
+ const std::string DELIMITER_PREFIX = "--delimiter=";
+ if (delimiter.find(DELIMITER_PREFIX) != 0) {
+ std::cout << "Cannot find " << DELIMITER_PREFIX << " argument." << std::endl;
+ usage();
+ return 1;
+ } else {
+ gDelimiter = delimiter.substr(DELIMITER_PREFIX.size())[0];
+ }
+ }
+
+ std::cout << GetDate() << "Start importing Orc file..." << std::endl;
+
+ double totalElapsedTime = 0.0;
+ clock_t totalCPUTime = 0;
+
+ orc::DataBuffer<char> buffer(*orc::getDefaultPool());
+ buffer.resize(4 * 1024 * 1024);
--- End diff --
This is redundant since the resize is done lazily.
---
[GitHub] orc pull request #199: ORC-276: [C++] Create a simple tool to import CSV fil...
Posted by wgtmac <gi...@git.apache.org>.
Github user wgtmac commented on a diff in the pull request:
https://github.com/apache/orc/pull/199#discussion_r156215585
--- Diff: tools/src/CSVFileImport.cc ---
@@ -0,0 +1,436 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Exceptions.hh"
+#include "orc/OrcFile.hh"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <sys/time.h>
+#include <time.h>
+
+static char gDelimiter = ',';
+
+// Returns the colIndex-th (0-based) field of the line `s`, split on the
+// global gDelimiter. An empty string is returned when the line has fewer
+// than colIndex + 1 fields. Quoted or escaped delimiters are not recognized.
+// The line is taken by const reference so that it is not copied on every
+// call.
+std::string extractColumn(const std::string& s, uint64_t colIndex) {
+  size_t start = 0;
+  size_t end = s.find(gDelimiter);
+  uint64_t col = 0;
+  for (; col < colIndex && end != std::string::npos; ++col) {
+    start = end + 1;
+    end = s.find(gDelimiter, start);
+  }
+  if (col != colIndex) {
+    return std::string();
+  }
+  // When end is npos, substr() clamps the length to the rest of the line.
+  return s.substr(start, end - start);
+}
+
+// Formats the current local time as "[YYYY-MM-DD HH:MM:SS]" for log output.
+// The returned pointer refers to a function-local static buffer that is
+// overwritten by every call. An empty string is returned when the local time
+// cannot be determined or formatted.
+static const char* GetDate(void)
+{
+  static char buf[200];
+  time_t t = time(NULL);
+  struct tm* p = localtime(&t);
+  // localtime() may return NULL and strftime() returns 0 on failure; in both
+  // cases hand back a well-defined empty string.
+  if (p == NULL ||
+      strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", p) == 0) {
+    buf[0] = '\0';
+  }
+  return buf;
+}
+
+// Fills an integer column of the row batch from the CSV lines in `data`.
+//   data       raw CSV lines, one per row
+//   batch      column batch; cast to orc::LongVectorBatch
+//   numValues  number of rows to fill
+//   colIndex   0-based index of the CSV field to read
+// An empty field is recorded as NULL; any other field is converted with
+// atoll().
+void fillLongValues(const std::vector<std::string>& data,
+                    orc::ColumnVectorBatch* batch,
+                    uint64_t numValues,
+                    uint64_t colIndex) {
+  orc::LongVectorBatch* longs = dynamic_cast<orc::LongVectorBatch*>(batch);
+  bool sawNull = false;
+  for (uint64_t row = 0; row < numValues; ++row) {
+    const std::string field = extractColumn(data[row], colIndex);
+    if (field.empty()) {
+      batch->notNull[row] = 0;
+      sawNull = true;
+      continue;
+    }
+    batch->notNull[row] = 1;
+    longs->data[row] = atoll(field.c_str());
+  }
+  longs->hasNulls = sawNull;
+  longs->numElements = numValues;
+}
+
+// Copies the string values of CSV field colIndex for the first numValues
+// lines of `data` into `buffer` and points the string batch at them.
+//   buffer  backing storage for the string bytes; grown when too small
+//   offset  in/out: first unused byte of buffer, advanced past the bytes
+//           written by this call
+// An empty field is recorded as NULL.
+// NOTE(review): growing the buffer may move its storage, which would
+// invalidate pointers stored by earlier calls that used the same buffer
+// (e.g. another string column of the same batch) - confirm how the caller
+// sizes and resets the buffer.
+void fillStringValues(const std::vector<std::string>& data,
+                      orc::ColumnVectorBatch* batch,
+                      uint64_t numValues,
+                      uint64_t colIndex,
+                      orc::DataBuffer<char>& buffer,
+                      uint64_t& offset) {
+  orc::StringVectorBatch* stringBatch =
+    dynamic_cast<orc::StringVectorBatch*>(batch);
+
+  // First pass: extract the fields and add up the bytes they need, so that
+  // the buffer is grown at most once and before any pointer into it is
+  // stored by this call.
+  std::vector<std::string> cols;
+  cols.reserve(static_cast<size_t>(numValues));
+  uint64_t required = offset;
+  for (uint64_t i = 0; i < numValues; ++i) {
+    cols.push_back(extractColumn(data[i], colIndex));
+    required += cols.back().size();
+  }
+  if (buffer.size() < required) {
+    // resize() is used instead of reserve(): reserve() leaves size()
+    // unchanged, so a size()-based free-space check would never see the
+    // growth.
+    uint64_t newSize = std::max<uint64_t>(buffer.size(), 1);
+    while (newSize < required) {
+      newSize *= 2;
+    }
+    buffer.resize(newSize);
+  }
+
+  // Second pass: copy the bytes and record pointer and length per row.
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+    const std::string& col = cols[static_cast<size_t>(i)];
+    if (col.empty()) {
+      batch->notNull[i] = 0;
+      hasNull = true;
+    } else {
+      batch->notNull[i] = 1;
+      char* dest = buffer.data() + offset;
+      memcpy(dest, col.c_str(), col.size());
+      stringBatch->data[i] = dest;
+      stringBatch->length[i] = static_cast<int64_t>(col.size());
+      offset += col.size();
+    }
+  }
+  stringBatch->hasNulls = hasNull;
+  stringBatch->numElements = numValues;
+}
+
+// Fills a floating point column of the row batch from the CSV lines in
+// `data`.
+//   data       raw CSV lines, one per row
+//   batch      column batch; cast to orc::DoubleVectorBatch
+//   numValues  number of rows to fill
+//   colIndex   0-based index of the CSV field to read
+// An empty field is recorded as NULL; any other field is converted with
+// atof().
+void fillDoubleValues(const std::vector<std::string>& data,
+                      orc::ColumnVectorBatch* batch,
+                      uint64_t numValues,
+                      uint64_t colIndex) {
+  orc::DoubleVectorBatch* doubles =
+    dynamic_cast<orc::DoubleVectorBatch*>(batch);
+  bool sawNull = false;
+  for (uint64_t row = 0; row < numValues; ++row) {
+    const std::string field = extractColumn(data[row], colIndex);
+    if (field.empty()) {
+      batch->notNull[row] = 0;
+      sawNull = true;
+      continue;
+    }
+    batch->notNull[row] = 1;
+    doubles->data[row] = atof(field.c_str());
+  }
+  doubles->hasNulls = sawNull;
+  doubles->numElements = numValues;
+}
+
+// Parses fixed point decimal numbers of CSV field colIndex and stores them
+// as unscaled integers: in a Decimal64VectorBatch when precision <= 18, in a
+// Decimal128VectorBatch otherwise.
+//   scale      number of fractional digits stored per value
+//   precision  precision of the column; selects the batch type
+// An empty field is recorded as NULL. Values with fewer fractional digits
+// than `scale` are padded with zeros; surplus fractional digits are
+// truncated.
+void fillDecimalValues(const std::vector<std::string>& data,
+                       orc::ColumnVectorBatch* batch,
+                       uint64_t numValues,
+                       uint64_t colIndex,
+                       size_t scale,
+                       size_t precision) {
+  orc::Decimal128VectorBatch* d128Batch = ORC_NULLPTR;
+  orc::Decimal64VectorBatch* d64Batch = ORC_NULLPTR;
+  if (precision <= 18) {
+    d64Batch = dynamic_cast<orc::Decimal64VectorBatch*>(batch);
+    d64Batch->scale = static_cast<int32_t>(scale);
+  } else {
+    d128Batch = dynamic_cast<orc::Decimal128VectorBatch*>(batch);
+    d128Batch->scale = static_cast<int32_t>(scale);
+  }
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+    std::string col = extractColumn(data[i], colIndex);
+    if (col.empty()) {
+      batch->notNull[i] = 0;
+      hasNull = true;
+    } else {
+      batch->notNull[i] = 1;
+      size_t ptPos = col.find('.');
+      size_t curScale = 0;
+      std::string num = col;
+      if (ptPos != std::string::npos) {
+        // Drop the decimal point and remember how many digits followed it.
+        curScale = col.length() - ptPos - 1;
+        num = col.substr(0, ptPos) + col.substr(ptPos + 1);
+      }
+      if (curScale > scale) {
+        // More fractional digits than the column stores: cut the surplus.
+        // Counting curScale up until it equals scale would not terminate
+        // in any reasonable time here.
+        num.erase(num.length() - (curScale - scale));
+        curScale = scale;
+        if (num.empty() || num == "-" || num == "+") {
+          num = "0";
+        }
+      }
+      orc::Int128 decimal(num);
+      // Pad with zeros up to the scale of the column.
+      for (; curScale < scale; ++curScale) {
+        decimal *= 10;
+      }
+      if (precision <= 18) {
+        d64Batch->values[i] = decimal.toLong();
+      } else {
+        d128Batch->values[i] = decimal;
+      }
+    }
+  }
+  batch->hasNulls = hasNull;
+  batch->numElements = numValues;
+}
+
+// Fills a boolean column of the row batch from the CSV lines in `data`.
+// The batch is cast to orc::LongVectorBatch; "true" and "t" (compared
+// case-insensitively) are stored as 1, every other non-empty field as 0.
+// An empty field is recorded as NULL.
+void fillBoolValues(const std::vector<std::string>& data,
+                    orc::ColumnVectorBatch* batch,
+                    uint64_t numValues,
+                    uint64_t colIndex) {
+  orc::LongVectorBatch* boolBatch =
+    dynamic_cast<orc::LongVectorBatch*>(batch);
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+    std::string col = extractColumn(data[i], colIndex);
+    if (col.empty()) {
+      batch->notNull[i] = 0;
+      hasNull = true;
+    } else {
+      batch->notNull[i] = 1;
+      // tolower() is only defined for values representable as unsigned char
+      // (or EOF), so convert each char before the call.
+      for (std::string::iterator it = col.begin(); it != col.end(); ++it) {
+        *it = static_cast<char>(::tolower(static_cast<unsigned char>(*it)));
+      }
+      boolBatch->data[i] = (col == "true" || col == "t") ? 1 : 0;
+    }
+  }
+  boolBatch->hasNulls = hasNull;
+  boolBatch->numElements = numValues;
+}
+
+// Number of days between 1970-01-01 and the given proleptic Gregorian date
+// (month 1-12, day 1-31); negative for earlier dates. Independent of the
+// local time zone.
+static int64_t daysFromCivil(int64_t year, int64_t month, int64_t day) {
+  // Count years from March so that the leap day is the last day of a year.
+  if (month <= 2) {
+    --year;
+  }
+  const int64_t era = (year >= 0 ? year : year - 399) / 400;
+  const int64_t yearOfEra = year - era * 400;
+  const int64_t dayOfYear =
+    (153 * (month + (month > 2 ? -3 : 9)) + 2) / 5 + day - 1;
+  const int64_t dayOfEra =
+    yearOfEra * 365 + yearOfEra / 4 - yearOfEra / 100 + dayOfYear;
+  // 146097 days per 400-year era; 719468 days from 0000-03-01 to 1970-01-01.
+  return era * 146097 + dayOfEra - 719468;
+}
+
+// Parses date strings of the format YYYY-MM-dd from CSV field colIndex and
+// stores them as days since 1970-01-01 in a LongVectorBatch.
+// An empty field, or one that strptime() cannot parse, is recorded as NULL.
+// The day number is computed from the calendar fields directly instead of
+// going through mktime(), whose result depends on the local time zone.
+void fillDateValues(const std::vector<std::string>& data,
+                    orc::ColumnVectorBatch* batch,
+                    uint64_t numValues,
+                    uint64_t colIndex) {
+  orc::LongVectorBatch* longBatch =
+    dynamic_cast<orc::LongVectorBatch*>(batch);
+  bool hasNull = false;
+  for (uint64_t i = 0; i < numValues; ++i) {
+    std::string col = extractColumn(data[i], colIndex);
+    struct tm tm;
+    memset(&tm, 0, sizeof(struct tm));
+    if (col.empty() || strptime(col.c_str(), "%Y-%m-%d", &tm) == NULL) {
+      batch->notNull[i] = 0;
+      hasNull = true;
+    } else {
+      batch->notNull[i] = 1;
+      // struct tm counts years from 1900 and months from 0.
+      longBatch->data[i] =
+        daysFromCivil(tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday);
+    }
+  }
+  longBatch->hasNulls = hasNull;
+  longBatch->numElements = numValues;
+}
+
+// Fills a timestamp column of the row batch from the CSV lines in `data`.
+// Each non-empty field is converted with atoll() and stored in the batch's
+// `data` array (whole seconds, per the original comment on this function);
+// the nanoseconds part is always set to 0. An empty field is recorded as
+// NULL.
+void fillTimestampValues(const std::vector<std::string>& data,
+                         orc::ColumnVectorBatch* batch,
+                         uint64_t numValues,
+                         uint64_t colIndex) {
+  orc::TimestampVectorBatch* timestamps =
+    dynamic_cast<orc::TimestampVectorBatch*>(batch);
+  bool sawNull = false;
+  for (uint64_t row = 0; row < numValues; ++row) {
+    const std::string field = extractColumn(data[row], colIndex);
+    if (field.empty()) {
+      batch->notNull[row] = 0;
+      sawNull = true;
+      continue;
+    }
+    batch->notNull[row] = 1;
+    timestamps->data[row] = atoll(field.c_str());
+    timestamps->nanoseconds[row] = 0;
+  }
+  timestamps->hasNulls = sawNull;
+  timestamps->numElements = numValues;
+}
+
+// Prints the command line synopsis of the tool to standard output.
+void usage() {
+  static const char* const helpText =
+    "Usage: csv-import <input> <output> --schema=<file schema>"
+    " [--delimiter=<delimiter character>]\n"
+    "Import CSV file into an Orc file using the specified schema.\n"
+    "Compound types are not supported at the moment.\n";
+  std::cout << helpText;
+}
+
+int main(int argc, char* argv[]) {
+ if (argc < 4) {
+ std::cout << "Invalid number of arguments." << std::endl;
+ usage();
+ return 1;
+ }
+
+ std::string input = argv[1];
+ std::string output = argv[2];
+ std::string schema = argv[3];
+
+ const std::string SCHEMA_PREFIX = "--schema=";
+ ORC_UNIQUE_PTR<orc::Type> fileType = ORC_NULLPTR;
+ if (schema.find(SCHEMA_PREFIX) != 0) {
+ std::cout << "Cannot find " << SCHEMA_PREFIX << " argument." << std::endl;
+ usage();
+ return 1;
+ } else {
+ fileType = orc::Type::buildTypeFromString(schema.substr(SCHEMA_PREFIX.size()));
+ }
+
+ if (argc > 4) {
--- End diff --
Done! Good suggestion. Please review the new commit again. Thanks!
---