You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@parquet.apache.org by "Wes McKinney (JIRA)" <ji...@apache.org> on 2016/09/03 15:12:20 UTC

[jira] [Resolved] (PARQUET-676) MAX_VALUES_PER_LITERAL_RUN causes RLE encoding failure

     [ https://issues.apache.org/jira/browse/PARQUET-676?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Wes McKinney resolved PARQUET-676.
----------------------------------
       Resolution: Fixed
    Fix Version/s: cpp-0.1

Issue resolved by pull request 150
[https://github.com/apache/parquet-cpp/pull/150]

> MAX_VALUES_PER_LITERAL_RUN causes RLE encoding failure
> ------------------------------------------------------
>
>                 Key: PARQUET-676
>                 URL: https://issues.apache.org/jira/browse/PARQUET-676
>             Project: Parquet
>          Issue Type: Bug
>          Components: parquet-cpp
>         Environment: Mac OSX
>            Reporter: Mark Schaefer
>            Assignee: Wes McKinney
>             Fix For: cpp-0.1
>
>
> The following code works for NUM_TO_ENCODE <= 400, but fails for larger values with the error:
> Check failed: (encoded) == (num_buffered_values_)
> It appears to be related to the size of the RLE buffer that is allocated for buffering, which causes Put to fail in levels.cc:78; there doesn't seem to be any recovery from that failure, or any error indicating what the problem is. I'm assuming MAX_VALUES_PER_LITERAL_RUN is somehow derived from the Parquet spec, but if so, it seems that an exception (or some other error) ought to be raised. This code could also be the basis of a writer example.
> // Licensed to the Apache Software Foundation (ASF) under one
> // or more contributor license agreements.  See the NOTICE file
> // distributed with this work for additional information
> // regarding copyright ownership.  The ASF licenses this file
> // to you under the Apache License, Version 2.0 (the
> // "License"); you may not use this file except in compliance
> // with the License.  You may obtain a copy of the License at
> //
> //   http://www.apache.org/licenses/LICENSE-2.0
> //
> // Unless required by applicable law or agreed to in writing,
> // software distributed under the License is distributed on an
> // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
> // KIND, either express or implied.  See the License for the
> // specific language governing permissions and limitations
> // under the License.
> #include <iostream>
> #include <memory>
> #include <list>
> #include <parquet/api/writer.h>
> using namespace parquet;
> int main(int argc, char** argv) {
>   if (argc != 2) {
>     std::cerr << "Usage: " << argv[0] << " <file>"
>               << std::endl;
>     return -1;
>   }
>   std::string filename = argv[1];
>   try {
>     const int NUM_TO_ENCODE = 400;
>     std::shared_ptr<OutputStream> ostream(new LocalFileOutputStream(filename));
>     parquet::schema::NodeVector fields;
>     parquet::schema::NodePtr schema;
>     fields.push_back(parquet::schema::Int32("id", Repetition::REQUIRED));
>     fields.push_back(parquet::schema::ByteArray("name", Repetition::OPTIONAL));
>     schema = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields);
>     std::unique_ptr<ParquetFileWriter> writer = ParquetFileWriter::Open(ostream, std::dynamic_pointer_cast<parquet::schema::GroupNode>(schema));
>     RowGroupWriter* rgBlock = writer->AppendRowGroup(NUM_TO_ENCODE);
>     ColumnWriter* colBlock = rgBlock->NextColumn();
>     Int32Writer* intWriter = static_cast<Int32Writer*>(colBlock);
>     std::vector<int32_t> intbuf;
>     std::vector<int16_t> defbuf;
>     std::vector<ByteArray> strbuf;
>     for (int i = 0; i < NUM_TO_ENCODE; ++i) {
>         intbuf.push_back( i );
>         if (i % 10 == 0) {
>             defbuf.push_back(0);
>         } else {
>             defbuf.push_back(1);
>             uint8_t* buf = new uint8_t[4];
>             ByteArray ba;
>             sprintf((char*)buf,"%d",i);
>             ba.ptr = buf;
>             ba.len = strlen((const char*)ba.ptr);
>             strbuf.push_back(ba);
>         }
>     }
>     intWriter->WriteBatch(intbuf.size(), nullptr, nullptr, intbuf.data());
>     intWriter->Close();
>     colBlock = rgBlock->NextColumn();
>     ByteArrayWriter* strWriter = static_cast<ByteArrayWriter*>(colBlock);
>     std::cerr << "sizes: strings:" << strbuf.size() << " definitions: " << defbuf.size() << std::endl;
>     strWriter->WriteBatch(defbuf.size(), defbuf.data(), nullptr, strbuf.data());
>     strWriter->Close();
>   } catch (const std::exception& e) {
>     std::cerr << "Parquet error: "
>               << e.what()
>               << std::endl;
>     return -1;
>   }
>   return 0;
> }



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)