You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@parquet.apache.org by "ASF GitHub Bot (JIRA)" <ji...@apache.org> on 2018/06/24 01:32:00 UTC
[jira] [Commented] (PARQUET-1336) BinaryComparator should implements Serializable

    [ https://issues.apache.org/jira/browse/PARQUET-1336?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16521326#comment-16521326 ] 

ASF GitHub Bot commented on PARQUET-1336:
-----------------------------------------

wangyum opened a new pull request #497: PARQUET-1336: BinaryComparator should implements Serializable
URL: https://github.com/apache/parquet-mr/pull/497
 
 
   `BinaryComparator` should implement `Serializable`. Otherwise, the following `UserDefinedPredicate`, which holds the comparator in a field, will throw a `NotSerializableException` when it is serialized:
   ```scala
   new UserDefinedPredicate[Binary] with Serializable {
     private val strToBinary = Binary.fromReusedByteArray(v.getBytes)
     private val size = strToBinary.length
     val comparator = PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR
   
     override def canDrop(statistics: Statistics[Binary]): Boolean = {
       val max = statistics.getMax
       val min = statistics.getMin
       comparator.compare(max.slice(0, math.min(size, max.length)), strToBinary) < 0 ||
         comparator.compare(min.slice(0, math.min(size, min.length)), strToBinary) > 0
     }
   
     override def inverseCanDrop(statistics: Statistics[Binary]): Boolean = false
   
     override def keep(value: Binary): Boolean =
       UTF8String.fromBytes(value.getBytes).startsWith(UTF8String.fromString(v))
   }
   ```

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> BinaryComparator should implements Serializable 
> ------------------------------------------------
>
>                 Key: PARQUET-1336
>                 URL: https://issues.apache.org/jira/browse/PARQUET-1336
>             Project: Parquet
>          Issue Type: Improvement
>          Components: parquet-mr
>    Affects Versions: 1.10.0
>            Reporter: Yuming Wang
>            Priority: Major
>              Labels: pull-request-available
>
> {code:java}
> [info] Cause: java.lang.RuntimeException: java.io.NotSerializableException: org.apache.parquet.schema.PrimitiveComparator$8
> [info] at org.apache.parquet.hadoop.ParquetInputFormat.setFilterPredicate(ParquetInputFormat.java:211)
> [info] at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReaderWithPartitionValues$1.apply(ParquetFileFormat.scala:399)
> [info] at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReaderWithPartitionValues$1.apply(ParquetFileFormat.scala:349)
> [info] at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:128)
> [info] at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:182)
> [info] at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:109)
> [info] at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
> [info] at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
> [info] at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
> [info] at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
> [info] at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
> [info] at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1791)
> [info] at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1162)
> [info] at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1162)
> [info] at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2071)
> [info] at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2071)
> [info] at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
> [info] at org.apache.spark.scheduler.Task.run(Task.scala:109)
> [info] at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:367)
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)