You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@orc.apache.org by omalley <gi...@git.apache.org> on 2016/09/14 18:19:49 UTC

[GitHub] orc pull request #60: ORC-101 fix broken encoding for string and decimal blo...

GitHub user omalley opened a pull request:

    https://github.com/apache/orc/pull/60

    ORC-101 fix broken encoding for string and decimal bloom filters

    

You can merge this pull request into a Git repository by running:

    $ git pull https://github.com/omalley/orc orc-101

Alternatively you can review and apply these changes as the patch at:

    https://github.com/apache/orc/pull/60.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

    This closes #60
    
----
commit 8c1ff67421340f4ad2f9b2e9738d3378d220bdcb
Author: Owen O'Malley <om...@apache.org>
Date:   2016-09-13T20:28:44Z

    ORC-101 Correct bloom filters for strings and decimals to use utf8 encoding.

----


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] orc pull request #60: ORC-101 fix broken encoding for string and decimal blo...

Posted by prasanthj <gi...@git.apache.org>.
Github user prasanthj commented on a diff in the pull request:

    https://github.com/apache/orc/pull/60#discussion_r79095035
  
    --- Diff: java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java ---
    @@ -106,49 +198,58 @@ public OrcIndex readRowIndex(StripeInformation stripe,
           if (indexes == null) {
             indexes = new OrcProto.RowIndex[typeCount];
           }
    +      if (bloomFilterKinds == null) {
    +        bloomFilterKinds = new OrcProto.Stream.Kind[typeCount];
    +      }
           if (bloomFilterIndices == null) {
             bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
           }
    -      long offset = stripe.getOffset();
    -      List<OrcProto.Stream> streams = footer.getStreamsList();
    -      for (int i = 0; i < streams.size(); i++) {
    -        OrcProto.Stream stream = streams.get(i);
    -        OrcProto.Stream nextStream = null;
    -        if (i < streams.size() - 1) {
    -          nextStream = streams.get(i+1);
    +      DiskRangeList ranges = planIndexReading(fileSchema, footer,
    +          ignoreNonUtf8BloomFilter, included, sargColumns, bloomFilterKinds);
    +      ranges = readDiskRanges(file, zcr, stripe.getOffset(), ranges, false);
    +      long offset = 0;
    +      DiskRangeList range = ranges;
    +      for(OrcProto.Stream stream: footer.getStreamsList()) {
    +        // advance to find the next range
    +        while (range != null && range.getEnd() <= offset) {
    +          range = range.next;
             }
    -        int col = stream.getColumn();
    -        int len = (int) stream.getLength();
    -        // row index stream and bloom filter are interlaced, check if the sarg column contains bloom
    -        // filter and combine the io to read row index and bloom filters for that column together
    -        if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.ROW_INDEX)) {
    -          boolean readBloomFilter = false;
    -          if (sargColumns != null && sargColumns[col] &&
    -              nextStream.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER) {
    -            len += nextStream.getLength();
    -            i += 1;
    -            readBloomFilter = true;
    -          }
    -          if ((included == null || included[col]) && indexes[col] == null) {
    -            byte[] buffer = new byte[len];
    -            file.readFully(offset, buffer, 0, buffer.length);
    -            ByteBuffer bb = ByteBuffer.wrap(buffer);
    -            indexes[col] = OrcProto.RowIndex.parseFrom(InStream.create("index",
    -                ReaderImpl.singleton(new BufferChunk(bb, 0)), stream.getLength(),
    -                codec, bufferSize));
    -            if (readBloomFilter) {
    -              bb.position((int) stream.getLength());
    -              bloomFilterIndices[col] = OrcProto.BloomFilterIndex.parseFrom(InStream.create(
    -                  "bloom_filter", ReaderImpl.singleton(new BufferChunk(bb, 0)),
    -                  nextStream.getLength(), codec, bufferSize));
    -            }
    +        // no more ranges, so we are done
    +        if (range == null) {
    +          break;
    +        }
    +        int column = stream.getColumn();
    +        if (stream.hasKind() && range.getOffset() <= offset) {
    +          switch (stream.getKind()) {
    +            case ROW_INDEX:
    +              if (included == null || included[column]) {
    +                ByteBuffer bb = range.getData().duplicate();
    +                bb.position((int) (offset - range.getOffset()));
    +                bb.limit((int) (bb.position() + stream.getLength()));
    +                indexes[column] = OrcProto.RowIndex.parseFrom(InStream.create("index",
    +                    ReaderImpl.singleton(new BufferChunk(bb, 0)), stream.getLength(),
    +                    codec, bufferSize));
    +              }
    +              break;
    +            case BLOOM_FILTER:
    +            case BLOOM_FILTER_UTF8:
    +              if (sargColumns != null && sargColumns[column]) {
    +                ByteBuffer bb = range.getData().duplicate();
    +                bb.position((int) (offset - range.getOffset()));
    +                bb.limit((int) (bb.position() + stream.getLength()));
    +                bloomFilterIndices[column] = OrcProto.BloomFilterIndex.parseFrom
    +                    (InStream.create("bloom_filter",
    --- End diff --
    
    same here.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] orc pull request #60: ORC-101 fix broken encoding for string and decimal blo...

Posted by omalley <gi...@git.apache.org>.
Github user omalley commented on a diff in the pull request:

    https://github.com/apache/orc/pull/60#discussion_r79101181
  
    --- Diff: java/core/src/java/org/apache/orc/OrcFile.java ---
    @@ -231,6 +232,33 @@ public static Reader createReader(Path path,
         void preFooterWrite(WriterContext context) throws IOException;
       }
     
    +  public static enum BloomFilterVersion {
    +    // Include both the BLOOM_FILTER and BLOOM_FILTER_UTF8 streams for string
    +    // and decimal columns.
    +    ORIGINAL("original"),
    +    // Only include the BLOOM_FILTER_UTF8 for string and decimal columns.
    +    // See ORC-101
    +    UTF8("utf8");
    +
    +    private final String id;
    +    private BloomFilterVersion(String id) {
    +      this.id = id;
    +    }
    +
    +    public String toString() {
    +      return id;
    +    }
    +
    +    public static BloomFilterVersion fromString(String s) {
    +      for (BloomFilterVersion version: values()) {
    +        if (version.id.equals(s)) {
    +          return version;
    +        }
    +      }
    +      throw new IllegalArgumentException("Unknown BloomFilterVersion " + s);
    --- End diff --
    
    I don't see how. To get the value, the code will always do this conversion after getting the string from OrcConf, so the error will be caught coming out.



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] orc pull request #60: ORC-101 fix broken encoding for string and decimal blo...

Posted by prasanthj <gi...@git.apache.org>.
Github user prasanthj commented on a diff in the pull request:

    https://github.com/apache/orc/pull/60#discussion_r79093764
  
    --- Diff: java/core/src/java/org/apache/orc/OrcConf.java ---
    @@ -105,6 +105,12 @@
               "dictionary or not will be retained thereafter."),
       BLOOM_FILTER_COLUMNS("orc.bloom.filter.columns", "orc.bloom.filter.columns",
           "", "List of columns to create bloom filters for when writing."),
    +  BLOOM_FILTER_WRITE_VERSION("orc.bloom.filter.write.version",
    +      "orc.bloom.filter.write.version", OrcFile.BloomFilterVersion.UTF8.toString(),
    +      "Which version of the bloom filter should we write."),
    --- End diff --
    
    I see from below to provide the option of writing both default and utf-8 we are using this config. Can you use enum of "default","utf-8" and add a description saying default will write both versions. 


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] orc pull request #60: ORC-101 fix broken encoding for string and decimal blo...

Posted by prasanthj <gi...@git.apache.org>.
Github user prasanthj commented on a diff in the pull request:

    https://github.com/apache/orc/pull/60#discussion_r79097969
  
    --- Diff: java/core/src/java/org/apache/orc/util/BloomFilterIO.java ---
    @@ -0,0 +1,71 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.orc.util;
    +
    +import org.apache.orc.OrcProto;
    +
    +public class BloomFilterIO  {
    +
    +  private BloomFilterIO() {
    +    // never called
    +  }
    +
    +  /**
    +   * Deserialize a bloom filter from the ORC file.
    +   */
    +  public static BloomFilter deserialize(OrcProto.Stream.Kind kind,
    +                                        OrcProto.BloomFilter bloomFilter) {
    +    if (bloomFilter == null) {
    +      return null;
    +    }
    +    long values[] = new long[bloomFilter.getBitsetCount()];
    +    for(int i=0; i < values.length; ++i) {
    +      values[i] = bloomFilter.getBitset(i);
    +    }
    +    int numFuncs = bloomFilter.getNumHashFunctions();
    +    switch (kind) {
    +      case BLOOM_FILTER:
    +        return new BloomFilter(values, numFuncs);
    +      case BLOOM_FILTER_UTF8:
    +        return new BloomFilterUtf8(values, numFuncs);
    +    }
    +    throw new IllegalArgumentException("Unknown bloom filter kind " + kind);
    +  }
    +
    +  /**
    +   * Serialize the BloomFilter to the ORC file.
    +   * @param builder the builder to write to
    +   * @param bloomFilter the bloom filter to serialize
    +   */
    +  public static void serialize(OrcProto.BloomFilter.Builder builder,
    +                               BloomFilter bloomFilter) {
    +    long[] bitset = bloomFilter.getBitSet();
    +    if (builder.getBitsetCount() != bitset.length) {
    +      builder.clear();
    +      for(int i=0; i < bitset.length; ++i) {
    +        builder.addBitset(bitset[i]);
    --- End diff --
    
    can we make this byte[]? Protobuf deserializes this as List<Long> IIRC (Sergey and Dain reported). 


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] orc pull request #60: ORC-101 fix broken encoding for string and decimal blo...

Posted by prasanthj <gi...@git.apache.org>.
Github user prasanthj commented on a diff in the pull request:

    https://github.com/apache/orc/pull/60#discussion_r79095023
  
    --- Diff: java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java ---
    @@ -106,49 +198,58 @@ public OrcIndex readRowIndex(StripeInformation stripe,
           if (indexes == null) {
             indexes = new OrcProto.RowIndex[typeCount];
           }
    +      if (bloomFilterKinds == null) {
    +        bloomFilterKinds = new OrcProto.Stream.Kind[typeCount];
    +      }
           if (bloomFilterIndices == null) {
             bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
           }
    -      long offset = stripe.getOffset();
    -      List<OrcProto.Stream> streams = footer.getStreamsList();
    -      for (int i = 0; i < streams.size(); i++) {
    -        OrcProto.Stream stream = streams.get(i);
    -        OrcProto.Stream nextStream = null;
    -        if (i < streams.size() - 1) {
    -          nextStream = streams.get(i+1);
    +      DiskRangeList ranges = planIndexReading(fileSchema, footer,
    +          ignoreNonUtf8BloomFilter, included, sargColumns, bloomFilterKinds);
    +      ranges = readDiskRanges(file, zcr, stripe.getOffset(), ranges, false);
    +      long offset = 0;
    +      DiskRangeList range = ranges;
    +      for(OrcProto.Stream stream: footer.getStreamsList()) {
    +        // advance to find the next range
    +        while (range != null && range.getEnd() <= offset) {
    +          range = range.next;
             }
    -        int col = stream.getColumn();
    -        int len = (int) stream.getLength();
    -        // row index stream and bloom filter are interlaced, check if the sarg column contains bloom
    -        // filter and combine the io to read row index and bloom filters for that column together
    -        if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.ROW_INDEX)) {
    -          boolean readBloomFilter = false;
    -          if (sargColumns != null && sargColumns[col] &&
    -              nextStream.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER) {
    -            len += nextStream.getLength();
    -            i += 1;
    -            readBloomFilter = true;
    -          }
    -          if ((included == null || included[col]) && indexes[col] == null) {
    -            byte[] buffer = new byte[len];
    -            file.readFully(offset, buffer, 0, buffer.length);
    -            ByteBuffer bb = ByteBuffer.wrap(buffer);
    -            indexes[col] = OrcProto.RowIndex.parseFrom(InStream.create("index",
    -                ReaderImpl.singleton(new BufferChunk(bb, 0)), stream.getLength(),
    -                codec, bufferSize));
    -            if (readBloomFilter) {
    -              bb.position((int) stream.getLength());
    -              bloomFilterIndices[col] = OrcProto.BloomFilterIndex.parseFrom(InStream.create(
    -                  "bloom_filter", ReaderImpl.singleton(new BufferChunk(bb, 0)),
    -                  nextStream.getLength(), codec, bufferSize));
    -            }
    +        // no more ranges, so we are done
    +        if (range == null) {
    +          break;
    +        }
    +        int column = stream.getColumn();
    +        if (stream.hasKind() && range.getOffset() <= offset) {
    +          switch (stream.getKind()) {
    +            case ROW_INDEX:
    +              if (included == null || included[column]) {
    +                ByteBuffer bb = range.getData().duplicate();
    +                bb.position((int) (offset - range.getOffset()));
    +                bb.limit((int) (bb.position() + stream.getLength()));
    +                indexes[column] = OrcProto.RowIndex.parseFrom(InStream.create("index",
    --- End diff --
    
    use InStream.createCodedInputStream() instead? to void metadata explosion.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] orc pull request #60: ORC-101 fix broken encoding for string and decimal blo...

Posted by omalley <gi...@git.apache.org>.
Github user omalley commented on a diff in the pull request:

    https://github.com/apache/orc/pull/60#discussion_r79100160
  
    --- Diff: java/core/src/java/org/apache/orc/util/BloomFilter.java ---
    @@ -130,7 +125,7 @@ public void addString(String val) {
         if (val == null) {
           add(null);
         } else {
    -      add(val.getBytes());
    +      add(val.getBytes(Charset.defaultCharset()));
    --- End diff --
    
    No, this one is explicitly the default charset to be compatible. BloomFilterUtf8 is the fixed one.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] orc pull request #60: ORC-101 fix broken encoding for string and decimal blo...

Posted by prasanthj <gi...@git.apache.org>.
Github user prasanthj commented on a diff in the pull request:

    https://github.com/apache/orc/pull/60#discussion_r79093894
  
    --- Diff: java/core/src/java/org/apache/orc/OrcFile.java ---
    @@ -231,6 +232,33 @@ public static Reader createReader(Path path,
         void preFooterWrite(WriterContext context) throws IOException;
       }
     
    +  public static enum BloomFilterVersion {
    +    // Include both the BLOOM_FILTER and BLOOM_FILTER_UTF8 streams for string
    +    // and decimal columns.
    +    ORIGINAL("original"),
    +    // Only include the BLOOM_FILTER_UTF8 for string and decimal columns.
    +    // See ORC-101
    +    UTF8("utf8");
    +
    +    private final String id;
    +    private BloomFilterVersion(String id) {
    +      this.id = id;
    +    }
    +
    +    public String toString() {
    +      return id;
    +    }
    +
    +    public static BloomFilterVersion fromString(String s) {
    +      for (BloomFilterVersion version: values()) {
    +        if (version.id.equals(s)) {
    +          return version;
    +        }
    +      }
    +      throw new IllegalArgumentException("Unknown BloomFilterVersion " + s);
    --- End diff --
    
    Can we do this validate in OrcConf? so that we don't wrong value here


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] orc pull request #60: ORC-101 fix broken encoding for string and decimal blo...

Posted by omalley <gi...@git.apache.org>.
Github user omalley commented on a diff in the pull request:

    https://github.com/apache/orc/pull/60#discussion_r79093773
  
    --- Diff: java/core/src/java/org/apache/orc/OrcConf.java ---
    @@ -105,6 +105,12 @@
               "dictionary or not will be retained thereafter."),
       BLOOM_FILTER_COLUMNS("orc.bloom.filter.columns", "orc.bloom.filter.columns",
           "", "List of columns to create bloom filters for when writing."),
    +  BLOOM_FILTER_WRITE_VERSION("orc.bloom.filter.write.version",
    +      "orc.bloom.filter.write.version", OrcFile.BloomFilterVersion.UTF8.toString(),
    +      "Which version of the bloom filter should we write."),
    --- End diff --
    
    If orc.bloom.filter.write.version is set to original, it will put both streams so that the files are usable by old versions of orc.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] orc pull request #60: ORC-101 fix broken encoding for string and decimal blo...

Posted by prasanthj <gi...@git.apache.org>.
Github user prasanthj commented on a diff in the pull request:

    https://github.com/apache/orc/pull/60#discussion_r79097639
  
    --- Diff: java/core/src/java/org/apache/orc/util/BloomFilter.java ---
    @@ -130,7 +125,7 @@ public void addString(String val) {
         if (val == null) {
           add(null);
         } else {
    -      add(val.getBytes());
    +      add(val.getBytes(Charset.defaultCharset()));
    --- End diff --
    
    Should we make the default "UTF-8" and provide an alternate addString() that accepts Charset?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] orc pull request #60: ORC-101 fix broken encoding for string and decimal blo...

Posted by omalley <gi...@git.apache.org>.
Github user omalley commented on a diff in the pull request:

    https://github.com/apache/orc/pull/60#discussion_r80114710
  
    --- Diff: java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java ---
    @@ -1168,8 +1193,9 @@ public OrcIndex readRowIndex(int stripeIndex, boolean[] included,
           sargColumns = sargColumns == null ?
               (sargApp == null ? null : sargApp.sargColumns) : sargColumns;
         }
    -    return dataReader.readRowIndex(stripe, stripeFooter, included, indexes, sargColumns,
    -        bloomFilterIndex);
    +    return dataReader.readRowIndex(stripe, evolution.getFileType(0), stripeFooter,
    --- End diff --
    
    Actually RecordReaderImpl.mapSargColumnsToOrcInternalColIdx already maps column names to the physical file column ids. It handles ACID files by looking for the field names in row field instead of the root. So at this level, it should be consistently using the file id.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] orc issue #60: ORC-101 fix broken encoding for string and decimal bloom filt...

Posted by omalley <gi...@git.apache.org>.
Github user omalley commented on the issue:

    https://github.com/apache/orc/pull/60
  
    Ok, the latest push has a few changes:
    * bloom_filter_utf8 streams use a new encoding with bytes instead of long[]. This is much more efficient for performance and storage size.
    * all column types now have bloom_filter_utf8 streams (largely to get the new representation)
    * the default is to just write the new bloom_filter_utf8 streams that old readers will ignore. There is an option to write both bloom_filter and bloom_filter_utf8 streams to support old readers.
    * there is an option for new readers to ignore the old bloom filters.
    * files generated after hive-12055 will correctly use the utf8 encoding even for the bloom_filter stream.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] orc pull request #60: ORC-101 fix broken encoding for string and decimal blo...

Posted by prasanthj <gi...@git.apache.org>.
Github user prasanthj commented on a diff in the pull request:

    https://github.com/apache/orc/pull/60#discussion_r79093595
  
    --- Diff: java/core/src/java/org/apache/orc/OrcConf.java ---
    @@ -105,6 +105,12 @@
               "dictionary or not will be retained thereafter."),
       BLOOM_FILTER_COLUMNS("orc.bloom.filter.columns", "orc.bloom.filter.columns",
           "", "List of columns to create bloom filters for when writing."),
    +  BLOOM_FILTER_WRITE_VERSION("orc.bloom.filter.write.version",
    +      "orc.bloom.filter.write.version", OrcFile.BloomFilterVersion.UTF8.toString(),
    +      "Which version of the bloom filter should we write."),
    --- End diff --
    
    Any reason to have this config? If we are going to default to UTF8 anyways we don't need this right?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] orc pull request #60: ORC-101 fix broken encoding for string and decimal blo...

Posted by omalley <gi...@git.apache.org>.
Github user omalley commented on a diff in the pull request:

    https://github.com/apache/orc/pull/60#discussion_r79100075
  
    --- Diff: java/core/src/java/org/apache/orc/OrcConf.java ---
    @@ -105,6 +105,12 @@
               "dictionary or not will be retained thereafter."),
       BLOOM_FILTER_COLUMNS("orc.bloom.filter.columns", "orc.bloom.filter.columns",
           "", "List of columns to create bloom filters for when writing."),
    +  BLOOM_FILTER_WRITE_VERSION("orc.bloom.filter.write.version",
    +      "orc.bloom.filter.write.version", OrcFile.BloomFilterVersion.UTF8.toString(),
    +      "Which version of the bloom filter should we write."),
    --- End diff --
    
    The default is "utf8". Maybe we should use "both" and "utf8"?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] orc pull request #60: ORC-101 fix broken encoding for string and decimal blo...

Posted by omalley <gi...@git.apache.org>.
Github user omalley commented on a diff in the pull request:

    https://github.com/apache/orc/pull/60#discussion_r79101245
  
    --- Diff: java/core/src/java/org/apache/orc/impl/RecordReaderUtils.java ---
    @@ -106,49 +198,58 @@ public OrcIndex readRowIndex(StripeInformation stripe,
           if (indexes == null) {
             indexes = new OrcProto.RowIndex[typeCount];
           }
    +      if (bloomFilterKinds == null) {
    +        bloomFilterKinds = new OrcProto.Stream.Kind[typeCount];
    +      }
           if (bloomFilterIndices == null) {
             bloomFilterIndices = new OrcProto.BloomFilterIndex[typeCount];
           }
    -      long offset = stripe.getOffset();
    -      List<OrcProto.Stream> streams = footer.getStreamsList();
    -      for (int i = 0; i < streams.size(); i++) {
    -        OrcProto.Stream stream = streams.get(i);
    -        OrcProto.Stream nextStream = null;
    -        if (i < streams.size() - 1) {
    -          nextStream = streams.get(i+1);
    +      DiskRangeList ranges = planIndexReading(fileSchema, footer,
    +          ignoreNonUtf8BloomFilter, included, sargColumns, bloomFilterKinds);
    +      ranges = readDiskRanges(file, zcr, stripe.getOffset(), ranges, false);
    +      long offset = 0;
    +      DiskRangeList range = ranges;
    +      for(OrcProto.Stream stream: footer.getStreamsList()) {
    +        // advance to find the next range
    +        while (range != null && range.getEnd() <= offset) {
    +          range = range.next;
             }
    -        int col = stream.getColumn();
    -        int len = (int) stream.getLength();
    -        // row index stream and bloom filter are interlaced, check if the sarg column contains bloom
    -        // filter and combine the io to read row index and bloom filters for that column together
    -        if (stream.hasKind() && (stream.getKind() == OrcProto.Stream.Kind.ROW_INDEX)) {
    -          boolean readBloomFilter = false;
    -          if (sargColumns != null && sargColumns[col] &&
    -              nextStream.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER) {
    -            len += nextStream.getLength();
    -            i += 1;
    -            readBloomFilter = true;
    -          }
    -          if ((included == null || included[col]) && indexes[col] == null) {
    -            byte[] buffer = new byte[len];
    -            file.readFully(offset, buffer, 0, buffer.length);
    -            ByteBuffer bb = ByteBuffer.wrap(buffer);
    -            indexes[col] = OrcProto.RowIndex.parseFrom(InStream.create("index",
    -                ReaderImpl.singleton(new BufferChunk(bb, 0)), stream.getLength(),
    -                codec, bufferSize));
    -            if (readBloomFilter) {
    -              bb.position((int) stream.getLength());
    -              bloomFilterIndices[col] = OrcProto.BloomFilterIndex.parseFrom(InStream.create(
    -                  "bloom_filter", ReaderImpl.singleton(new BufferChunk(bb, 0)),
    -                  nextStream.getLength(), codec, bufferSize));
    -            }
    +        // no more ranges, so we are done
    +        if (range == null) {
    +          break;
    +        }
    +        int column = stream.getColumn();
    +        if (stream.hasKind() && range.getOffset() <= offset) {
    +          switch (stream.getKind()) {
    +            case ROW_INDEX:
    +              if (included == null || included[column]) {
    +                ByteBuffer bb = range.getData().duplicate();
    +                bb.position((int) (offset - range.getOffset()));
    +                bb.limit((int) (bb.position() + stream.getLength()));
    +                indexes[column] = OrcProto.RowIndex.parseFrom(InStream.create("index",
    --- End diff --
    
    I just copied the code that was there, but sure.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] orc issue #60: ORC-101 fix broken encoding for string and decimal bloom filt...

Posted by prasanthj <gi...@git.apache.org>.
Github user prasanthj commented on the issue:

    https://github.com/apache/orc/pull/60
  
    lgtm, +1


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] orc pull request #60: ORC-101 fix broken encoding for string and decimal blo...

Posted by asfgit <gi...@git.apache.org>.
Github user asfgit closed the pull request at:

    https://github.com/apache/orc/pull/60


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] orc pull request #60: ORC-101 fix broken encoding for string and decimal blo...

Posted by prasanthj <gi...@git.apache.org>.
Github user prasanthj commented on a diff in the pull request:

    https://github.com/apache/orc/pull/60#discussion_r79097833
  
    --- Diff: java/core/src/java/org/apache/orc/impl/WriterImpl.java ---
    @@ -1901,7 +2014,11 @@ void writeBatch(ColumnVector vector, int offset,
               HiveDecimal value = vec.vector[0].getHiveDecimal();
               indexStatistics.updateDecimal(value);
               if (createBloomFilter) {
    -            bloomFilter.addString(value.toString());
    +            String str = value.toString();
    +            if (bloomFilter != null) {
    +              bloomFilter.addString(str);
    --- End diff --
    
    Can you plz add a comment here for not using UTF-8 for decimals?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] orc pull request #60: ORC-101 fix broken encoding for string and decimal blo...

Posted by prasanthj <gi...@git.apache.org>.
Github user prasanthj commented on a diff in the pull request:

    https://github.com/apache/orc/pull/60#discussion_r79916836
  
    --- Diff: java/core/src/java/org/apache/orc/impl/RecordReaderImpl.java ---
    @@ -1168,8 +1193,9 @@ public OrcIndex readRowIndex(int stripeIndex, boolean[] included,
           sargColumns = sargColumns == null ?
               (sargApp == null ? null : sargApp.sargColumns) : sargColumns;
         }
    -    return dataReader.readRowIndex(stripe, stripeFooter, included, indexes, sargColumns,
    -        bloomFilterIndex);
    +    return dataReader.readRowIndex(stripe, evolution.getFileType(0), stripeFooter,
    --- End diff --
    
    Should the index change based on ACID file or not?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] orc pull request #60: ORC-101 fix broken encoding for string and decimal blo...

Posted by omalley <gi...@git.apache.org>.
Github user omalley commented on a diff in the pull request:

    https://github.com/apache/orc/pull/60#discussion_r79101377
  
    --- Diff: java/core/src/java/org/apache/orc/util/BloomFilterIO.java ---
    @@ -0,0 +1,71 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.orc.util;
    +
    +import org.apache.orc.OrcProto;
    +
    +public class BloomFilterIO  {
    +
    +  private BloomFilterIO() {
    +    // never called
    +  }
    +
    +  /**
    +   * Deserialize a bloom filter from the ORC file.
    +   */
    +  public static BloomFilter deserialize(OrcProto.Stream.Kind kind,
    +                                        OrcProto.BloomFilter bloomFilter) {
    +    if (bloomFilter == null) {
    +      return null;
    +    }
    +    long values[] = new long[bloomFilter.getBitsetCount()];
    +    for(int i=0; i < values.length; ++i) {
    +      values[i] = bloomFilter.getBitset(i);
    +    }
    +    int numFuncs = bloomFilter.getNumHashFunctions();
    +    switch (kind) {
    +      case BLOOM_FILTER:
    +        return new BloomFilter(values, numFuncs);
    +      case BLOOM_FILTER_UTF8:
    +        return new BloomFilterUtf8(values, numFuncs);
    +    }
    +    throw new IllegalArgumentException("Unknown bloom filter kind " + kind);
    +  }
    +
    +  /**
    +   * Serialize the BloomFilter to the ORC file.
    +   * @param builder the builder to write to
    +   * @param bloomFilter the bloom filter to serialize
    +   */
    +  public static void serialize(OrcProto.BloomFilter.Builder builder,
    +                               BloomFilter bloomFilter) {
    +    long[] bitset = bloomFilter.getBitSet();
    +    if (builder.getBitsetCount() != bitset.length) {
    +      builder.clear();
    +      for(int i=0; i < bitset.length; ++i) {
    +        builder.addBitset(bitset[i]);
    --- End diff --
    
    We'd need to change the protobuf, which won't be backwards compatible. I guess we could do that for the new bloom_filter_utf8 stream.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---