You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@parquet.apache.org by "Mark Schaefer (JIRA)" <ji...@apache.org> on 2016/08/12 15:55:21 UTC
[jira] [Created] (PARQUET-676) MAX_VALUES_PER_LITERAL_RUN causes
RLE encoding failure
Mark Schaefer created PARQUET-676:
-------------------------------------
Summary: MAX_VALUES_PER_LITERAL_RUN causes RLE encoding failure
Key: PARQUET-676
URL: https://issues.apache.org/jira/browse/PARQUET-676
Project: Parquet
Issue Type: Bug
Components: parquet-cpp
Environment: Mac OSX
Reporter: Mark Schaefer
The following code works for NUM_TO_ENCODE <= 400, but for values greater than that it fails with the error:
Check failed: (encoded) == (num_buffered_values_)
It appears to be related to the size of the RLE buffer that is allocated for buffering, which causes Put to fail at levels.cc:78; there doesn't seem to be any recovery from that failure, nor any error indicating what the problem is. I'm assuming MAX_VALUES_PER_LITERAL_RUN is somehow derived from the Parquet spec, but if so, it seems that an exception or some other error ought to be generated. This could also be the basis of a writer example.
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <cstdint>
#include <iostream>
#include <list>
#include <memory>
#include <string>
#include <vector>

#include <parquet/api/writer.h>
using namespace parquet;
// Reproduction program for PARQUET-676.
//
// Writes a single row group with two columns to the file named on the command
// line:
//   * "id"   - REQUIRED int32, holding 0 .. NUM_TO_ENCODE-1
//   * "name" - OPTIONAL byte array, holding the decimal text of the row index;
//              every tenth row (i % 10 == 0) is written as null
//              (definition level 0).
//
// Returns 0 on success and -1 on a usage error or when a std::exception is
// caught while writing.
int main(int argc, char** argv) {
  if (argc != 2) {
    std::cerr << "Usage: " << argv[0] << " <file>"
              << std::endl;
    return -1;
  }
  const std::string filename = argv[1];
  try {
    // Number of rows written; the report states that values above 400 trigger
    // the failure being demonstrated.
    const int NUM_TO_ENCODE = 400;

    // Owns the bytes that the ByteArray values point at. It is declared before
    // the writer so that it is destroyed after it, and it is reserved up front
    // so that no reallocation moves the strings once pointers into them have
    // been handed out.
    std::vector<std::string> name_storage;
    name_storage.reserve(NUM_TO_ENCODE);

    std::shared_ptr<OutputStream> ostream(new LocalFileOutputStream(filename));

    parquet::schema::NodeVector fields;
    fields.push_back(parquet::schema::Int32("id", Repetition::REQUIRED));
    fields.push_back(parquet::schema::ByteArray("name", Repetition::OPTIONAL));
    parquet::schema::NodePtr schema =
        parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields);

    std::unique_ptr<ParquetFileWriter> writer = ParquetFileWriter::Open(
        ostream, std::dynamic_pointer_cast<parquet::schema::GroupNode>(schema));
    RowGroupWriter* rgBlock = writer->AppendRowGroup(NUM_TO_ENCODE);

    // First column: "id".
    ColumnWriter* colBlock = rgBlock->NextColumn();
    Int32Writer* intWriter = static_cast<Int32Writer*>(colBlock);

    std::vector<int32_t> intbuf;
    std::vector<int16_t> defbuf;
    std::vector<ByteArray> strbuf;
    for (int i = 0; i < NUM_TO_ENCODE; ++i) {
      intbuf.push_back(i);
      if (i % 10 == 0) {
        // Null entry: only a definition level is recorded, no value.
        defbuf.push_back(0);
      } else {
        defbuf.push_back(1);
        // std::to_string sizes the text to fit, so any NUM_TO_ENCODE is safe
        // (a fixed 4-byte buffer overflows once i reaches 1000).
        name_storage.push_back(std::to_string(i));
        const std::string& text = name_storage.back();
        ByteArray ba;
        ba.ptr = reinterpret_cast<const uint8_t*>(text.data());
        ba.len = static_cast<uint32_t>(text.size());
        strbuf.push_back(ba);
      }
    }

    // The "id" column is REQUIRED, so no definition or repetition levels are
    // passed.
    intWriter->WriteBatch(intbuf.size(), nullptr, nullptr, intbuf.data());
    intWriter->Close();

    // Second column: "name".
    colBlock = rgBlock->NextColumn();
    ByteArrayWriter* strWriter = static_cast<ByteArrayWriter*>(colBlock);
    std::cerr << "sizes: strings:" << strbuf.size() << " definitions: " << defbuf.size() << std::endl;
    strWriter->WriteBatch(defbuf.size(), defbuf.data(), nullptr, strbuf.data());
    strWriter->Close();
    // NOTE(review): neither the row group nor the file writer is closed
    // explicitly here; presumably the ParquetFileWriter destructor finalizes
    // the file - confirm against the parquet-cpp version in use.
  } catch (const std::exception& e) {
    std::cerr << "Parquet error: "
              << e.what()
              << std::endl;
    return -1;
  }
  return 0;
}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)