You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@parquet.apache.org by "ASF GitHub Bot (JIRA)" <ji...@apache.org> on 2018/06/24 01:32:00 UTC
[jira] [Commented] (PARQUET-1336) BinaryComparator should
implements Serializable
[ https://issues.apache.org/jira/browse/PARQUET-1336?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16521326#comment-16521326 ]
ASF GitHub Bot commented on PARQUET-1336:
-----------------------------------------
wangyum opened a new pull request #497: PARQUET-1336: BinaryComparator should implements Serializable
URL: https://github.com/apache/parquet-mr/pull/497
`BinaryComparator` should implement `Serializable`. Otherwise, the following `UserDefinedPredicate` will throw `NotSerializableException`:
```scala
// Anonymous Parquet predicate over Binary values, mixed with Serializable.
// `v` is defined outside this snippet; presumably it is the String prefix being
// matched (it is passed to UTF8String.fromString below) -- confirm against caller.
new UserDefinedPredicate[Binary] with Serializable {
// Bytes of `v` wrapped as a Binary, and the byte length of that Binary.
private val strToBinary = Binary.fromReusedByteArray(v.getBytes)
private val size = strToBinary.length
// Comparator held as a field of this predicate.
// NOTE(review): the accompanying report shows NotSerializableException for a
// PrimitiveComparator inner class; this field is presumably the cause -- confirm.
val comparator = PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR
// Returns true when the statistics' max, cut to at most `size` bytes, compares
// below strToBinary, or the min, cut the same way, compares above it.
override def canDrop(statistics: Statistics[Binary]): Boolean = {
val max = statistics.getMax
val min = statistics.getMin
comparator.compare(max.slice(0, math.min(size, max.length)), strToBinary) < 0 ||
comparator.compare(min.slice(0, math.min(size, min.length)), strToBinary) > 0
}
// Always returns false.
override def inverseCanDrop(statistics: Statistics[Binary]): Boolean = false
// Returns true when the value's bytes, read as a UTF8String, start with `v`.
override def keep(value: Binary): Boolean =
UTF8String.fromBytes(value.getBytes).startsWith(UTF8String.fromString(v))
}
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
> BinaryComparator should implements Serializable
> ------------------------------------------------
>
> Key: PARQUET-1336
> URL: https://issues.apache.org/jira/browse/PARQUET-1336
> Project: Parquet
> Issue Type: Improvement
> Components: parquet-mr
> Affects Versions: 1.10.0
> Reporter: Yuming Wang
> Priority: Major
> Labels: pull-request-available
>
> {code:java}
> [info] Cause: java.lang.RuntimeException: java.io.NotSerializableException: org.apache.parquet.schema.PrimitiveComparator$8
> [info] at org.apache.parquet.hadoop.ParquetInputFormat.setFilterPredicate(ParquetInputFormat.java:211)
> [info] at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReaderWithPartitionValues$1.apply(ParquetFileFormat.scala:399)
> [info] at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$buildReaderWithPartitionValues$1.apply(ParquetFileFormat.scala:349)
> [info] at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:128)
> [info] at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:182)
> [info] at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:109)
> [info] at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
> [info] at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
> [info] at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
> [info] at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
> [info] at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
> [info] at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1791)
> [info] at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1162)
> [info] at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1162)
> [info] at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2071)
> [info] at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2071)
> [info] at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
> [info] at org.apache.spark.scheduler.Task.run(Task.scala:109)
> [info] at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:367)
> {code}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)