You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@parquet.apache.org by "Mark Schaefer (JIRA)" <ji...@apache.org> on 2016/08/12 15:56:20 UTC
[jira] [Updated] (PARQUET-676) MAX_VALUES_PER_LITERAL_RUN causes RLE encoding failure

     [ https://issues.apache.org/jira/browse/PARQUET-676?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Mark Schaefer updated PARQUET-676:
----------------------------------
    Description: 
The following code works for NUM_TO_ENCODE <= 400, but fails for larger values with the error:

Check failed: (encoded) == (num_buffered_values_)

It appears to be related to the size of the RLE buffer that is allocated for buffering, which causes Put to fail in levels.cc:78, but there doesn't seem to be any recovery from that failure, nor any error indicating what the problem is. I'm assuming MAX_VALUES_PER_LITERAL_RUN is somehow derived from the Parquet spec, but if so, it seems that an exception or some other error ought to be generated. This could also be the basis of a writer example.

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include <cstdint>
#include <iostream>
#include <list>
#include <memory>
#include <string>
#include <vector>

#include <parquet/api/writer.h>

using namespace parquet;

int main(int argc, char** argv) {
  if (argc != 2) {
    std::cerr << "Usage: " << argv[0] << " <file>"
              << std::endl;
    return -1;
  }

  std::string filename = argv[1];

  try {
    const int NUM_TO_ENCODE = 400;
    std::shared_ptr<OutputStream> ostream(new LocalFileOutputStream(filename));
    parquet::schema::NodeVector fields;
    parquet::schema::NodePtr schema;

    fields.push_back(parquet::schema::Int32("id", Repetition::REQUIRED));
    fields.push_back(parquet::schema::ByteArray("name", Repetition::OPTIONAL));

    schema = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields);

    std::unique_ptr<ParquetFileWriter> writer = ParquetFileWriter::Open(ostream, std::dynamic_pointer_cast<parquet::schema::GroupNode>(schema));

    RowGroupWriter* rgBlock = writer->AppendRowGroup(NUM_TO_ENCODE);
    ColumnWriter* colBlock = rgBlock->NextColumn();
    Int32Writer* intWriter = static_cast<Int32Writer*>(colBlock);
    std::vector<int32_t> intbuf;
    std::vector<int16_t> defbuf;
    std::vector<ByteArray> strbuf;
    for (int i = 0; i < NUM_TO_ENCODE; ++i) {
        intbuf.push_back( i );
        if (i % 10 == 0) {
            defbuf.push_back(0);
        } else {
            defbuf.push_back(1);
            uint8_t* buf = new uint8_t[4];
            ByteArray ba;
            sprintf((char*)buf,"%d",i);
            ba.ptr = buf;
            ba.len = strlen((const char*)ba.ptr);
            strbuf.push_back(ba);
        }
    }
    intWriter->WriteBatch(intbuf.size(), nullptr, nullptr, intbuf.data());
    intWriter->Close();
    colBlock = rgBlock->NextColumn();
    ByteArrayWriter* strWriter = static_cast<ByteArrayWriter*>(colBlock);
    std::cerr << "sizes: strings:" << strbuf.size() << " definitions: " << defbuf.size() << std::endl;
    strWriter->WriteBatch(defbuf.size(), defbuf.data(), nullptr, strbuf.data());
    strWriter->Close();
  } catch (const std::exception& e) {
    std::cerr << "Parquet error: "
              << e.what()
              << std::endl;
    return -1;
  }

  return 0;
}

  was:
The following code works for NUM_TO_ENCODE <= 400, but fails for larger values with the error:

Check failed: (encoded) == (num_buffered_values_)

It appears to be related to the size of the RLE buffer that is allocated for buffering, which causes Put to fail in levels.cc:78, but there doesn't seem to be any recovery from that failure, nor any error indicating what the problem is. I'm assuming MAX_VALUES_PER_LITERAL_RUN is somehow derived from the Parquet spec, but if so, it seems that an exception or some other error ought to be generated. This could also be the basis of a writer example.

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include <iostream>
#include <memory>
#include <list>

#include <parquet/api/writer.h>

using namespace parquet;

int main(int argc, char** argv) {
  if (argc != 2) {
    std::cerr << "Usage: " << argv[0] << " <file>"
              << std::endl;
    return -1;
  }

  std::string filename = argv[1];

  try {
    const int NUM_TO_ENCODE = 400;
    std::shared_ptr<OutputStream> ostream(new LocalFileOutputStream(filename));
    parquet::schema::NodeVector fields;
    parquet::schema::NodePtr schema;

    fields.push_back(parquet::schema::Int32("id", Repetition::REQUIRED));
    fields.push_back(parquet::schema::ByteArray("name", Repetition::OPTIONAL));

    schema = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields);

    std::unique_ptr<ParquetFileWriter> writer = ParquetFileWriter::Open(ostream, std::dynamic_pointer_cast<parquet::schema::GroupNode>(schema));

    RowGroupWriter* rgBlock = writer->AppendRowGroup(NUM_TO_ENCODE);
    ColumnWriter* colBlock = rgBlock->NextColumn();
    Int32Writer* intWriter = static_cast<Int32Writer*>(colBlock);
    std::vector<int32_t> intbuf;
    std::vector<int16_t> defbuf;
    std::vector<ByteArray> strbuf;
    for (int i = 0; i < NUM_TO_ENCODE; ++i) {
        intbuf.push_back(i);
        if (i % 10 == 0) {
            defbuf.push_back(0);
        } else {
            defbuf.push_back(1);
            uint8_t* buf = new uint8_t[4];
            ByteArray ba;
            sprintf((char*)buf,"%d",i);
            ba.ptr = buf;
            ba.len = strlen((const char*)ba.ptr);
            strbuf.push_back(ba);
        }
    }
    intWriter->WriteBatch(intbuf.size(), nullptr, nullptr, intbuf.data());
    intWriter->Close();
    colBlock = rgBlock->NextColumn();
    ByteArrayWriter* strWriter = static_cast<ByteArrayWriter*>(colBlock);
    std::cerr << "sizes: strings:" << strbuf.size() << " definitions: " << defbuf.size() << std::endl;
    strWriter->WriteBatch(defbuf.size(), defbuf.data(), nullptr, strbuf.data());
    strWriter->Close();
  } catch (const std::exception& e) {
    std::cerr << "Parquet error: "
              << e.what()
              << std::endl;
    return -1;
  }

  return 0;
}


> MAX_VALUES_PER_LITERAL_RUN causes RLE encoding failure
> ------------------------------------------------------
>
>                 Key: PARQUET-676
>                 URL: https://issues.apache.org/jira/browse/PARQUET-676
>             Project: Parquet
>          Issue Type: Bug
>          Components: parquet-cpp
>         Environment: Mac OSX
>            Reporter: Mark Schaefer
>
> The following code works for NUM_TO_ENCODE <= 400, but fails for larger values with the error:
> Check failed: (encoded) == (num_buffered_values_)
> It appears to be related to the size of the RLE buffer that is allocated for buffering, which causes Put to fail in levels.cc:78, but there doesn't seem to be any recovery from that failure, nor any error indicating what the problem is. I'm assuming MAX_VALUES_PER_LITERAL_RUN is somehow derived from the Parquet spec, but if so, it seems that an exception or some other error ought to be generated. This could also be the basis of a writer example.
> // Licensed to the Apache Software Foundation (ASF) under one
> // or more contributor license agreements.  See the NOTICE file
> // distributed with this work for additional information
> // regarding copyright ownership.  The ASF licenses this file
> // to you under the Apache License, Version 2.0 (the
> // "License"); you may not use this file except in compliance
> // with the License.  You may obtain a copy of the License at
> //
> //   http://www.apache.org/licenses/LICENSE-2.0
> //
> // Unless required by applicable law or agreed to in writing,
> // software distributed under the License is distributed on an
> // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
> // KIND, either express or implied.  See the License for the
> // specific language governing permissions and limitations
> // under the License.
> #include <iostream>
> #include <memory>
> #include <list>
> #include <parquet/api/writer.h>
> using namespace parquet;
> int main(int argc, char** argv) {
>   if (argc != 2) {
>     std::cerr << "Usage: " << argv[0] << " <file>"
>               << std::endl;
>     return -1;
>   }
>   std::string filename = argv[1];
>   try {
>     const int NUM_TO_ENCODE = 400;
>     std::shared_ptr<OutputStream> ostream(new LocalFileOutputStream(filename));
>     parquet::schema::NodeVector fields;
>     parquet::schema::NodePtr schema;
>     fields.push_back(parquet::schema::Int32("id", Repetition::REQUIRED));
>     fields.push_back(parquet::schema::ByteArray("name", Repetition::OPTIONAL));
>     schema = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields);
>     std::unique_ptr<ParquetFileWriter> writer = ParquetFileWriter::Open(ostream, std::dynamic_pointer_cast<parquet::schema::GroupNode>(schema));
>     RowGroupWriter* rgBlock = writer->AppendRowGroup(NUM_TO_ENCODE);
>     ColumnWriter* colBlock = rgBlock->NextColumn();
>     Int32Writer* intWriter = static_cast<Int32Writer*>(colBlock);
>     std::vector<int32_t> intbuf;
>     std::vector<int16_t> defbuf;
>     std::vector<ByteArray> strbuf;
>     for (int i = 0; i < NUM_TO_ENCODE; ++i) {
>         intbuf.push_back( i );
>         if (i % 10 == 0) {
>             defbuf.push_back(0);
>         } else {
>             defbuf.push_back(1);
>             uint8_t* buf = new uint8_t[4];
>             ByteArray ba;
>             sprintf((char*)buf,"%d",i);
>             ba.ptr = buf;
>             ba.len = strlen((const char*)ba.ptr);
>             strbuf.push_back(ba);
>         }
>     }
>     intWriter->WriteBatch(intbuf.size(), nullptr, nullptr, intbuf.data());
>     intWriter->Close();
>     colBlock = rgBlock->NextColumn();
>     ByteArrayWriter* strWriter = static_cast<ByteArrayWriter*>(colBlock);
>     std::cerr << "sizes: strings:" << strbuf.size() << " definitions: " << defbuf.size() << std::endl;
>     strWriter->WriteBatch(defbuf.size(), defbuf.data(), nullptr, strbuf.data());
>     strWriter->Close();
>   } catch (const std::exception& e) {
>     std::cerr << "Parquet error: "
>               << e.what()
>               << std::endl;
>     return -1;
>   }
>   return 0;
> }



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)