You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@parquet.apache.org by "Mark Schaefer (JIRA)" <ji...@apache.org> on 2016/08/12 15:55:21 UTC

[jira] [Created] (PARQUET-676) MAX_VALUES_PER_LITERAL_RUN causes RLE encoding failure

Mark Schaefer created PARQUET-676:
-------------------------------------

             Summary: MAX_VALUES_PER_LITERAL_RUN causes RLE encoding failure
                 Key: PARQUET-676
                 URL: https://issues.apache.org/jira/browse/PARQUET-676
             Project: Parquet
          Issue Type: Bug
          Components: parquet-cpp
         Environment: Mac OSX
            Reporter: Mark Schaefer


The following code works for NUM_TO_ENCODE <= 400, but fails for larger values with the error:

Check failed: (encoded) == (num_buffered_values_)

It appears to be related to the size of the RLE buffer that is allocated for buffering, which causes Put to fail at levels.cc:78; there doesn't seem to be any recovery from that failure, or any error indicating what the problem is. I'm assuming MAX_VALUES_PER_LITERAL_RUN is somehow derived from the Parquet spec, but if so, it seems that an exception or some other error ought to be generated. This code could also serve as the basis of a writer example.

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include <iostream>
#include <list>
#include <memory>
#include <string>
#include <vector>

#include <parquet/api/writer.h>

using namespace parquet;

int main(int argc, char** argv) {
  if (argc != 2) {
    std::cerr << "Usage: " << argv[0] << " <file>"
              << std::endl;
    return -1;
  }

  std::string filename = argv[1];

  try {
    const int NUM_TO_ENCODE = 400;
    std::shared_ptr<OutputStream> ostream(new LocalFileOutputStream(filename));
    parquet::schema::NodeVector fields;
    parquet::schema::NodePtr schema;

    fields.push_back(parquet::schema::Int32("id", Repetition::REQUIRED));
    fields.push_back(parquet::schema::ByteArray("name", Repetition::OPTIONAL));

    schema = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields);

    std::unique_ptr<ParquetFileWriter> writer = ParquetFileWriter::Open(ostream, std::dynamic_pointer_cast<parquet::schema::GroupNode>(schema));

    RowGroupWriter* rgBlock = writer->AppendRowGroup(NUM_TO_ENCODE);
    ColumnWriter* colBlock = rgBlock->NextColumn();
    Int32Writer* intWriter = static_cast<Int32Writer*>(colBlock);
    std::vector<int32_t> intbuf;
    std::vector<int16_t> defbuf;
    std::vector<ByteArray> strbuf;
    for (int i = 0; i < NUM_TO_ENCODE; ++i) {
        intbuf.push_back(i);
        if (i % 10 == 0) {
            defbuf.push_back(0);
        } else {
            defbuf.push_back(1);
            uint8_t* buf = new uint8_t[4];
            ByteArray ba;
            sprintf((char*)buf,"%d",i);
            ba.ptr = buf;
            ba.len = strlen((const char*)ba.ptr);
            strbuf.push_back(ba);
        }
    }
    intWriter->WriteBatch(intbuf.size(), nullptr, nullptr, intbuf.data());
    intWriter->Close();
    colBlock = rgBlock->NextColumn();
    ByteArrayWriter* strWriter = static_cast<ByteArrayWriter*>(colBlock);
    std::cerr << "sizes: strings:" << strbuf.size() << " definitions: " << defbuf.size() << std::endl;
    strWriter->WriteBatch(defbuf.size(), defbuf.data(), nullptr, strbuf.data());
    strWriter->Close();
  } catch (const std::exception& e) {
    std::cerr << "Parquet error: "
              << e.what()
              << std::endl;
    return -1;
  }

  return 0;
}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)