You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Yuming Wang (JIRA)" <ji...@apache.org> on 2018/07/18 01:28:00 UTC

[jira] [Comment Edited] (SPARK-24828) Incompatible parquet formats - java.lang.UnsupportedOperationException: org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainLongDictionary

    [ https://issues.apache.org/jira/browse/SPARK-24828?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16547264#comment-16547264 ] 

Yuming Wang edited comment on SPARK-24828 at 7/18/18 1:27 AM:
--------------------------------------------------------------

The {{label}} column is sometimes of {{integer}} type and sometimes of {{long}} type:

{"name":"label","type":"integer","nullable":false,"metadata":{}}

{"name":"label","type":"long","nullable":true,"metadata":{}}

 
{noformat}
$ java -jar ./parquet-tools/target/parquet-tools-1.10.1-SNAPSHOT.jar meta file:///Users/data/a2_m2.parquet/
file: file:/Users/yumwang/data/a2_m2.parquet/part-00000-1ff92a81-68c8-446b-a54e-a042a8fd7f1e.snappy.parquet
creator: parquet-mr version 1.8.1 (build 4aba4dae7bb0d4edbcf7923ae1339f28fd3f7fcf)
extra: org.apache.spark.sql.parquet.row.metadata = {"type":"struct","fields":[{"name":"label","type":"integer","nullable":false,"metadata":{}},{"name":"CLASS","type":"long","nullable":true,"metadata":{}},{"name":"SENSORID","type":"string","nullable":true,"metadata":{}},{"name":"X","type":"double","nullable":true,"metadata":{}},{"name":"Y","type":"double","nullable":true,"metadata":{}},{"name":"Z","type":"double","nullable":true,"metadata":{}},{"name":"_id","type":"string","nullable":true,"metadata":{}},{"name":"_rev","type":"string","nullable":true,"metadata":{}},{"name":"features","type":{"type":"udt","class":"org.apache.spark.ml.linalg.VectorUDT","pyClass":"pyspark.ml.linalg.VectorUDT","sqlType":{"type":"struct","fields":[{"name":"type","type":"byte","nullable":false,"metadata":{}},{"name":"size","type":"integer","nullable":true,"metadata":{}},{"name":"indices","type":{"type":"array","elementType":"integer","containsNull":false},"nullable":true,"metadata":{}},{"name":"values","type":{"type":"array","elementType":"double","containsNull":false},"nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{"ml_attr":{"attrs":{"numeric":[{"idx":0,"name":"X"},{"idx":1,"name":"Y"},{"idx":2,"name":"Z"}]},"num_attrs":3}}},{"name":"prediction","type":"double","nullable":true,"metadata":{}}]}

file schema: spark_schema
--------------------------------------------------------------------------------
label: REQUIRED INT32 R:0 D:0
CLASS: OPTIONAL INT64 R:0 D:1
SENSORID: OPTIONAL BINARY O:UTF8 R:0 D:1
X: OPTIONAL DOUBLE R:0 D:1
Y: OPTIONAL DOUBLE R:0 D:1
Z: OPTIONAL DOUBLE R:0 D:1
_id: OPTIONAL BINARY O:UTF8 R:0 D:1
_rev: OPTIONAL BINARY O:UTF8 R:0 D:1
features: OPTIONAL F:4
.type: REQUIRED INT32 O:INT_8 R:0 D:1
.size: OPTIONAL INT32 R:0 D:2
.indices: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED INT32 R:1 D:3
.values: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED DOUBLE R:1 D:3
prediction: OPTIONAL DOUBLE R:0 D:1

row group 1: RC:893 TS:41499 OFFSET:4
--------------------------------------------------------------------------------
label: INT32 SNAPPY DO:0 FPO:4 SZ:108/104/0.96 VC:893 ENC:BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 1, num_nulls: 0]
CLASS: INT64 SNAPPY DO:0 FPO:112 SZ:131/127/0.97 VC:893 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 1, max: 2, num_nulls: 0]
SENSORID: BINARY SNAPPY DO:0 FPO:243 SZ:81/77/0.95 VC:893 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: aaaaaaaa, max: aaaaaaaa, num_nulls: 0]
X: DOUBLE SNAPPY DO:0 FPO:324 SZ:957/1045/1.09 VC:893 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -0.85, max: 1.59, num_nulls: 0]
Y: DOUBLE SNAPPY DO:0 FPO:1281 SZ:957/1045/1.09 VC:893 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -0.85, max: 1.59, num_nulls: 0]
Z: DOUBLE SNAPPY DO:0 FPO:2238 SZ:957/1045/1.09 VC:893 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -0.85, max: 1.59, num_nulls: 0]
_id: BINARY SNAPPY DO:0 FPO:3195 SZ:7964/32248/4.05 VC:893 ENC:BIT_PACKED,RLE,PLAIN ST:[no stats for this column]
_rev: BINARY SNAPPY DO:0 FPO:11159 SZ:2340/2503/1.07 VC:893 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[no stats for this column]
features:
.type: INT32 SNAPPY DO:0 FPO:13499 SZ:199/193/0.97 VC:893 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 1, num_nulls: 0]
.size: INT32 SNAPPY DO:0 FPO:13698 SZ:296/290/0.98 VC:893 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 3, max: 3, num_nulls: 620]
.indices:
..list:
...element: INT32 SNAPPY DO:0 FPO:13994 SZ:269/265/0.99 VC:893 ENC:RLE,PLAIN ST:[num_nulls: 893, min/max not defined]
.values:
..list:
...element: DOUBLE SNAPPY DO:0 FPO:14263 SZ:1772/2406/1.36 VC:2133 ENC:RLE,PLAIN_DICTIONARY ST:[min: -0.85, max: 1.59, num_nulls: 273]
prediction: DOUBLE SNAPPY DO:0 FPO:16035 SZ:156/151/0.97 VC:893 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -0.0, max: 1.0, num_nulls: 0]
file: file:/Users/yumwang/data/a2_m2.parquet/part-00000-50be1622-7a2c-43d2-b3ce-f0703ab8f458.snappy.parquet
creator: parquet-mr version 1.8.1 (build 4aba4dae7bb0d4edbcf7923ae1339f28fd3f7fcf)
extra: org.apache.spark.sql.parquet.row.metadata = {"type":"struct","fields":[{"name":"CLASS","type":"long","nullable":true,"metadata":{}},{"name":"SENSORID","type":"string","nullable":true,"metadata":{}},{"name":"X","type":"double","nullable":true,"metadata":{}},{"name":"Y","type":"double","nullable":true,"metadata":{}},{"name":"Z","type":"double","nullable":true,"metadata":{}},{"name":"_id","type":"string","nullable":true,"metadata":{}},{"name":"_rev","type":"string","nullable":true,"metadata":{}},{"name":"label","type":"long","nullable":true,"metadata":{}},{"name":"features","type":{"type":"udt","class":"org.apache.spark.ml.linalg.VectorUDT","pyClass":"pyspark.ml.linalg.VectorUDT","sqlType":{"type":"struct","fields":[{"name":"type","type":"byte","nullable":false,"metadata":{}},{"name":"size","type":"integer","nullable":true,"metadata":{}},{"name":"indices","type":{"type":"array","elementType":"integer","containsNull":false},"nullable":true,"metadata":{}},{"name":"values","type":{"type":"array","elementType":"double","containsNull":false},"nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{"ml_attr":{"attrs":{"numeric":[{"idx":0,"name":"X"},{"idx":1,"name":"Y"},{"idx":2,"name":"Z"}]},"num_attrs":3}}},{"name":"rawPrediction","type":{"type":"udt","class":"org.apache.spark.ml.linalg.VectorUDT","pyClass":"pyspark.ml.linalg.VectorUDT","sqlType":{"type":"struct","fields":[{"name":"type","type":"byte","nullable":false,"metadata":{}},{"name":"size","type":"integer","nullable":true,"metadata":{}},{"name":"indices","type":{"type":"array","elementType":"integer","containsNull":false},"nullable":true,"metadata":{}},{"name":"values","type":{"type":"array","elementType":"double","containsNull":false},"nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{}},{"name":"probability","type":{"type":"udt","class":"org.apache.spark.ml.linalg.VectorUDT","pyClass":"pyspark.ml.linalg.VectorUDT","sqlType":{"type":"struct","fields":[{"name":"type","type":"byte","nullable":
false,"metadata":{}},{"name":"size","type":"integer","nullable":true,"metadata":{}},{"name":"indices","type":{"type":"array","elementType":"integer","containsNull":false},"nullable":true,"metadata":{}},{"name":"values","type":{"type":"array","elementType":"double","containsNull":false},"nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{}},{"name":"prediction","type":"double","nullable":true,"metadata":{}}]}

file schema: spark_schema
--------------------------------------------------------------------------------
CLASS: OPTIONAL INT64 R:0 D:1
SENSORID: OPTIONAL BINARY O:UTF8 R:0 D:1
X: OPTIONAL DOUBLE R:0 D:1
Y: OPTIONAL DOUBLE R:0 D:1
Z: OPTIONAL DOUBLE R:0 D:1
_id: OPTIONAL BINARY O:UTF8 R:0 D:1
_rev: OPTIONAL BINARY O:UTF8 R:0 D:1
label: OPTIONAL INT64 R:0 D:1
features: OPTIONAL F:4
.type: REQUIRED INT32 O:INT_8 R:0 D:1
.size: OPTIONAL INT32 R:0 D:2
.indices: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED INT32 R:1 D:3
.values: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED DOUBLE R:1 D:3
rawPrediction: OPTIONAL F:4
.type: REQUIRED INT32 O:INT_8 R:0 D:1
.size: OPTIONAL INT32 R:0 D:2
.indices: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED INT32 R:1 D:3
.values: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED DOUBLE R:1 D:3
probability: OPTIONAL F:4
.type: REQUIRED INT32 O:INT_8 R:0 D:1
.size: OPTIONAL INT32 R:0 D:2
.indices: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED INT32 R:1 D:3
.values: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED DOUBLE R:1 D:3
prediction: OPTIONAL DOUBLE R:0 D:1

row group 1: RC:1521 TS:161228 OFFSET:4
--------------------------------------------------------------------------------
CLASS: INT64 SNAPPY DO:0 FPO:4 SZ:144/142/0.99 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 1, num_nulls: 0]
SENSORID: BINARY SNAPPY DO:0 FPO:148 SZ:81/77/0.95 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: aaaaaaaa, max: aaaaaaaa, num_nulls: 0]
X: DOUBLE SNAPPY DO:0 FPO:229 SZ:3833/5559/1.45 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -40.06, max: 22.1, num_nulls: 0]
Y: DOUBLE SNAPPY DO:0 FPO:4062 SZ:3271/4933/1.51 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -21.26, max: 15.12, num_nulls: 0]
Z: DOUBLE SNAPPY DO:0 FPO:7333 SZ:4298/6110/1.42 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -25.29, max: 39.48, num_nulls: 0]
_id: BINARY SNAPPY DO:0 FPO:11631 SZ:13589/54857/4.04 VC:1521 ENC:BIT_PACKED,RLE,PLAIN ST:[no stats for this column]
_rev: BINARY SNAPPY DO:0 FPO:25220 SZ:31685/36274/1.14 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[no stats for this column]
label: INT64 SNAPPY DO:0 FPO:56905 SZ:144/142/0.99 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 1, num_nulls: 0]
features:
.type: INT32 SNAPPY DO:0 FPO:57049 SZ:85/81/0.95 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 1, num_nulls: 0]
.size: INT32 SNAPPY DO:0 FPO:57134 SZ:83/79/0.95 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 3, max: 3, num_nulls: 1518]
.indices:
..list:
...element: INT32 SNAPPY DO:0 FPO:57217 SZ:57/55/0.96 VC:1521 ENC:RLE,PLAIN ST:[num_nulls: 1521, min/max not defined]
.values:
..list:
...element: DOUBLE SNAPPY DO:0 FPO:57274 SZ:10708/15527/1.45 VC:4557 ENC:RLE,PLAIN_DICTIONARY ST:[min: -40.06, max: 39.48, num_nulls: 3]
rawPrediction:
.type: INT32 SNAPPY DO:0 FPO:67982 SZ:65/61/0.94 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 1, max: 1, num_nulls: 0]
.size: INT32 SNAPPY DO:0 FPO:68047 SZ:32/30/0.94 VC:1521 ENC:BIT_PACKED,RLE,PLAIN ST:[num_nulls: 1521, min/max not defined]
.indices:
..list:
...element: INT32 SNAPPY DO:0 FPO:68079 SZ:39/37/0.95 VC:1521 ENC:RLE,PLAIN ST:[num_nulls: 1521, min/max not defined]
.values:
..list:
...element: DOUBLE SNAPPY DO:0 FPO:68118 SZ:14426/18436/1.28 VC:3042 ENC:RLE,PLAIN_DICTIONARY ST:[min: -12.804798928798698, max: 12.804798928798698, num_nulls: 0]
probability:
.type: INT32 SNAPPY DO:0 FPO:82544 SZ:65/61/0.94 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 1, max: 1, num_nulls: 0]
.size: INT32 SNAPPY DO:0 FPO:82609 SZ:32/30/0.94 VC:1521 ENC:BIT_PACKED,RLE,PLAIN ST:[num_nulls: 1521, min/max not defined]
.indices:
..list:
...element: INT32 SNAPPY DO:0 FPO:82641 SZ:39/37/0.95 VC:1521 ENC:RLE,PLAIN ST:[num_nulls: 1521, min/max not defined]
.values:
..list:
...element: DOUBLE SNAPPY DO:0 FPO:82680 SZ:18059/18436/1.02 VC:3042 ENC:RLE,PLAIN_DICTIONARY ST:[min: 2.7475480111085987E-6, max: 0.9999972524519889, num_nulls: 0]
prediction: DOUBLE SNAPPY DO:0 FPO:100739 SZ:259/264/1.02 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -0.0, max: 1.0, num_nulls: 0]
file: file:/Users/yumwang/data/a2_m2.parquet/part-00000-7a79fc78-7d1f-4897-883e-addc3aff5158.snappy.parquet
creator: parquet-mr version 1.8.1 (build 4aba4dae7bb0d4edbcf7923ae1339f28fd3f7fcf)
extra: org.apache.spark.sql.parquet.row.metadata = {"type":"struct","fields":[{"name":"CLASS","type":"long","nullable":true,"metadata":{}},{"name":"SENSORID","type":"string","nullable":true,"metadata":{}},{"name":"X","type":"double","nullable":true,"metadata":{}},{"name":"Y","type":"double","nullable":true,"metadata":{}},{"name":"Z","type":"double","nullable":true,"metadata":{}},{"name":"_id","type":"string","nullable":true,"metadata":{}},{"name":"_rev","type":"string","nullable":true,"metadata":{}},{"name":"features","type":{"type":"udt","class":"org.apache.spark.ml.linalg.VectorUDT","pyClass":"pyspark.ml.linalg.VectorUDT","sqlType":{"type":"struct","fields":[{"name":"type","type":"byte","nullable":false,"metadata":{}},{"name":"size","type":"integer","nullable":true,"metadata":{}},{"name":"indices","type":{"type":"array","elementType":"integer","containsNull":false},"nullable":true,"metadata":{}},{"name":"values","type":{"type":"array","elementType":"double","containsNull":false},"nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{"ml_attr":{"attrs":{"numeric":[{"idx":0,"name":"X"},{"idx":1,"name":"Y"},{"idx":2,"name":"Z"}]},"num_attrs":3}}},{"name":"rawPrediction","type":{"type":"udt","class":"org.apache.spark.ml.linalg.VectorUDT","pyClass":"pyspark.ml.linalg.VectorUDT","sqlType":{"type":"struct","fields":[{"name":"type","type":"byte","nullable":false,"metadata":{}},{"name":"size","type":"integer","nullable":true,"metadata":{}},{"name":"indices","type":{"type":"array","elementType":"integer","containsNull":false},"nullable":true,"metadata":{}},{"name":"values","type":{"type":"array","elementType":"double","containsNull":false},"nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{}},{"name":"probability","type":{"type":"udt","class":"org.apache.spark.ml.linalg.VectorUDT","pyClass":"pyspark.ml.linalg.VectorUDT","sqlType":{"type":"struct","fields":[{"name":"type","type":"byte","nullable":false,"metadata":{}},{"name":"size","type":"integer","nullabl
e":true,"metadata":{}},{"name":"indices","type":{"type":"array","elementType":"integer","containsNull":false},"nullable":true,"metadata":{}},{"name":"values","type":{"type":"array","elementType":"double","containsNull":false},"nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{}},{"name":"prediction","type":"double","nullable":true,"metadata":{}}]}

file schema: spark_schema
--------------------------------------------------------------------------------
CLASS: OPTIONAL INT64 R:0 D:1
SENSORID: OPTIONAL BINARY O:UTF8 R:0 D:1
X: OPTIONAL DOUBLE R:0 D:1
Y: OPTIONAL DOUBLE R:0 D:1
Z: OPTIONAL DOUBLE R:0 D:1
_id: OPTIONAL BINARY O:UTF8 R:0 D:1
_rev: OPTIONAL BINARY O:UTF8 R:0 D:1
features: OPTIONAL F:4
.type: REQUIRED INT32 O:INT_8 R:0 D:1
.size: OPTIONAL INT32 R:0 D:2
.indices: OPTIONAL F:1
..array: REPEATED INT32 R:1 D:3
.values: OPTIONAL F:1
..array: REPEATED DOUBLE R:1 D:3
rawPrediction: OPTIONAL F:4
.type: REQUIRED INT32 O:INT_8 R:0 D:1
.size: OPTIONAL INT32 R:0 D:2
.indices: OPTIONAL F:1
..array: REPEATED INT32 R:1 D:3
.values: OPTIONAL F:1
..array: REPEATED DOUBLE R:1 D:3
probability: OPTIONAL F:4
.type: REQUIRED INT32 O:INT_8 R:0 D:1
.size: OPTIONAL INT32 R:0 D:2
.indices: OPTIONAL F:1
..array: REPEATED INT32 R:1 D:3
.values: OPTIONAL F:1
..array: REPEATED DOUBLE R:1 D:3
prediction: OPTIONAL DOUBLE R:0 D:1

row group 1: RC:1521 TS:161075 OFFSET:4
--------------------------------------------------------------------------------
CLASS: INT64 SNAPPY DO:0 FPO:4 SZ:150/148/0.99 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 1, num_nulls: 0]
SENSORID: BINARY SNAPPY DO:0 FPO:154 SZ:81/77/0.95 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: aaaaaaaa, max: aaaaaaaa, num_nulls: 0]
X: DOUBLE SNAPPY DO:0 FPO:235 SZ:3805/5559/1.46 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -40.06, max: 22.1, num_nulls: 0]
Y: DOUBLE SNAPPY DO:0 FPO:4040 SZ:3314/4924/1.49 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -21.26, max: 15.12, num_nulls: 0]
Z: DOUBLE SNAPPY DO:0 FPO:7354 SZ:4303/6110/1.42 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -25.29, max: 39.48, num_nulls: 0]
_id: BINARY SNAPPY DO:0 FPO:11657 SZ:13562/54857/4.04 VC:1521 ENC:BIT_PACKED,RLE,PLAIN ST:[no stats for this column]
_rev: BINARY SNAPPY DO:0 FPO:25219 SZ:31661/36274/1.15 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[no stats for this column]
features:
.type: INT32 SNAPPY DO:0 FPO:56880 SZ:83/79/0.95 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 1, num_nulls: 0]
.size: INT32 SNAPPY DO:0 FPO:56963 SZ:81/77/0.95 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 3, max: 3, num_nulls: 1518]
.indices:
..array: INT32 SNAPPY DO:0 FPO:57044 SZ:55/53/0.96 VC:1521 ENC:RLE,PLAIN ST:[num_nulls: 1521, min/max not defined]
.values:
..array: DOUBLE SNAPPY DO:0 FPO:57099 SZ:10526/15527/1.48 VC:4557 ENC:RLE,PLAIN_DICTIONARY ST:[min: -40.06, max: 39.48, num_nulls: 3]
rawPrediction:
.type: INT32 SNAPPY DO:0 FPO:67625 SZ:65/61/0.94 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 1, max: 1, num_nulls: 0]
.size: INT32 SNAPPY DO:0 FPO:67690 SZ:32/30/0.94 VC:1521 ENC:BIT_PACKED,RLE,PLAIN ST:[num_nulls: 1521, min/max not defined]
.indices:
..array: INT32 SNAPPY DO:0 FPO:67722 SZ:39/37/0.95 VC:1521 ENC:RLE,PLAIN ST:[num_nulls: 1521, min/max not defined]
.values:
..array: DOUBLE SNAPPY DO:0 FPO:67761 SZ:14442/18436/1.28 VC:3042 ENC:RLE,PLAIN_DICTIONARY ST:[min: -12.804798928798698, max: 12.804798928798698, num_nulls: 0]
probability:
.type: INT32 SNAPPY DO:0 FPO:82203 SZ:65/61/0.94 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 1, max: 1, num_nulls: 0]
.size: INT32 SNAPPY DO:0 FPO:82268 SZ:32/30/0.94 VC:1521 ENC:BIT_PACKED,RLE,PLAIN ST:[num_nulls: 1521, min/max not defined]
.indices:
..array: INT32 SNAPPY DO:0 FPO:82300 SZ:39/37/0.95 VC:1521 ENC:RLE,PLAIN ST:[num_nulls: 1521, min/max not defined]
.values:
..array: DOUBLE SNAPPY DO:0 FPO:82339 SZ:18078/18436/1.02 VC:3042 ENC:RLE,PLAIN_DICTIONARY ST:[min: 2.7475480111085987E-6, max: 0.9999972524519889, num_nulls: 0]
prediction: DOUBLE SNAPPY DO:0 FPO:100417 SZ:264/262/0.99 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -0.0, max: 1.0, num_nulls: 0]
file: file:/Users/yumwang/data/a2_m2.parquet/part-00000-abb75ae3-1889-4ca4-9972-bc483673f71f.snappy.parquet
creator: parquet-mr version 1.8.1 (build 4aba4dae7bb0d4edbcf7923ae1339f28fd3f7fcf)
extra: org.apache.spark.sql.parquet.row.metadata = {"type":"struct","fields":[{"name":"CLASS","type":"long","nullable":true,"metadata":{}},{"name":"SENSORID","type":"string","nullable":true,"metadata":{}},{"name":"X","type":"double","nullable":true,"metadata":{}},{"name":"Y","type":"double","nullable":true,"metadata":{}},{"name":"Z","type":"double","nullable":true,"metadata":{}},{"name":"_id","type":"string","nullable":true,"metadata":{}},{"name":"_rev","type":"string","nullable":true,"metadata":{}},{"name":"features","type":{"type":"udt","class":"org.apache.spark.ml.linalg.VectorUDT","pyClass":"pyspark.ml.linalg.VectorUDT","sqlType":{"type":"struct","fields":[{"name":"type","type":"byte","nullable":false,"metadata":{}},{"name":"size","type":"integer","nullable":true,"metadata":{}},{"name":"indices","type":{"type":"array","elementType":"integer","containsNull":false},"nullable":true,"metadata":{}},{"name":"values","type":{"type":"array","elementType":"double","containsNull":false},"nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{"ml_attr":{"attrs":{"numeric":[{"idx":0,"name":"X"},{"idx":1,"name":"Y"},{"idx":2,"name":"Z"}]},"num_attrs":3}}},{"name":"rawPrediction","type":{"type":"udt","class":"org.apache.spark.ml.linalg.VectorUDT","pyClass":"pyspark.ml.linalg.VectorUDT","sqlType":{"type":"struct","fields":[{"name":"type","type":"byte","nullable":false,"metadata":{}},{"name":"size","type":"integer","nullable":true,"metadata":{}},{"name":"indices","type":{"type":"array","elementType":"integer","containsNull":false},"nullable":true,"metadata":{}},{"name":"values","type":{"type":"array","elementType":"double","containsNull":false},"nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{}},{"name":"probability","type":{"type":"udt","class":"org.apache.spark.ml.linalg.VectorUDT","pyClass":"pyspark.ml.linalg.VectorUDT","sqlType":{"type":"struct","fields":[{"name":"type","type":"byte","nullable":false,"metadata":{}},{"name":"size","type":"integer","nullabl
e":true,"metadata":{}},{"name":"indices","type":{"type":"array","elementType":"integer","containsNull":false},"nullable":true,"metadata":{}},{"name":"values","type":{"type":"array","elementType":"double","containsNull":false},"nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{}},{"name":"prediction","type":"double","nullable":true,"metadata":{}}]}

file schema: spark_schema
--------------------------------------------------------------------------------
CLASS: OPTIONAL INT64 R:0 D:1
SENSORID: OPTIONAL BINARY O:UTF8 R:0 D:1
X: OPTIONAL DOUBLE R:0 D:1
Y: OPTIONAL DOUBLE R:0 D:1
Z: OPTIONAL DOUBLE R:0 D:1
_id: OPTIONAL BINARY O:UTF8 R:0 D:1
_rev: OPTIONAL BINARY O:UTF8 R:0 D:1
features: OPTIONAL F:4
.type: REQUIRED INT32 O:INT_8 R:0 D:1
.size: OPTIONAL INT32 R:0 D:2
.indices: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED INT32 R:1 D:3
.values: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED DOUBLE R:1 D:3
rawPrediction: OPTIONAL F:4
.type: REQUIRED INT32 O:INT_8 R:0 D:1
.size: OPTIONAL INT32 R:0 D:2
.indices: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED INT32 R:1 D:3
.values: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED DOUBLE R:1 D:3
probability: OPTIONAL F:4
.type: REQUIRED INT32 O:INT_8 R:0 D:1
.size: OPTIONAL INT32 R:0 D:2
.indices: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED INT32 R:1 D:3
.values: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED DOUBLE R:1 D:3
prediction: OPTIONAL DOUBLE R:0 D:1

row group 1: RC:1521 TS:161081 OFFSET:4
--------------------------------------------------------------------------------
CLASS: INT64 SNAPPY DO:0 FPO:4 SZ:153/148/0.97 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 1, num_nulls: 0]
SENSORID: BINARY SNAPPY DO:0 FPO:157 SZ:81/77/0.95 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: aaaaaaaa, max: aaaaaaaa, num_nulls: 0]
X: DOUBLE SNAPPY DO:0 FPO:238 SZ:3793/5559/1.47 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -40.06, max: 22.1, num_nulls: 0]
Y: DOUBLE SNAPPY DO:0 FPO:4031 SZ:3259/4923/1.51 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -21.26, max: 15.12, num_nulls: 0]
Z: DOUBLE SNAPPY DO:0 FPO:7290 SZ:4279/6110/1.43 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -25.29, max: 39.48, num_nulls: 0]
_id: BINARY SNAPPY DO:0 FPO:11569 SZ:13579/54857/4.04 VC:1521 ENC:BIT_PACKED,RLE,PLAIN ST:[no stats for this column]
_rev: BINARY SNAPPY DO:0 FPO:25148 SZ:31671/36274/1.15 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[no stats for this column]
features:
.type: INT32 SNAPPY DO:0 FPO:56819 SZ:84/80/0.95 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 1, num_nulls: 0]
.size: INT32 SNAPPY DO:0 FPO:56903 SZ:82/78/0.95 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 3, max: 3, num_nulls: 1518]
.indices:
..list:
...element: INT32 SNAPPY DO:0 FPO:56985 SZ:56/54/0.96 VC:1521 ENC:RLE,PLAIN ST:[num_nulls: 1521, min/max not defined]
.values:
..list:
...element: DOUBLE SNAPPY DO:0 FPO:57041 SZ:10507/15527/1.48 VC:4557 ENC:RLE,PLAIN_DICTIONARY ST:[min: -40.06, max: 39.48, num_nulls: 3]
rawPrediction:
.type: INT32 SNAPPY DO:0 FPO:67548 SZ:65/61/0.94 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 1, max: 1, num_nulls: 0]
.size: INT32 SNAPPY DO:0 FPO:67613 SZ:32/30/0.94 VC:1521 ENC:BIT_PACKED,RLE,PLAIN ST:[num_nulls: 1521, min/max not defined]
.indices:
..list:
...element: INT32 SNAPPY DO:0 FPO:67645 SZ:39/37/0.95 VC:1521 ENC:RLE,PLAIN ST:[num_nulls: 1521, min/max not defined]
.values:
..list:
...element: DOUBLE SNAPPY DO:0 FPO:67684 SZ:14415/18436/1.28 VC:3042 ENC:RLE,PLAIN_DICTIONARY ST:[min: -12.804798928798698, max: 12.804798928798698, num_nulls: 0]
probability:
.type: INT32 SNAPPY DO:0 FPO:82099 SZ:65/61/0.94 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 1, max: 1, num_nulls: 0]
.size: INT32 SNAPPY DO:0 FPO:82164 SZ:32/30/0.94 VC:1521 ENC:BIT_PACKED,RLE,PLAIN ST:[num_nulls: 1521, min/max not defined]
.indices:
..list:
...element: INT32 SNAPPY DO:0 FPO:82196 SZ:39/37/0.95 VC:1521 ENC:RLE,PLAIN ST:[num_nulls: 1521, min/max not defined]
.values:
..list:
...element: DOUBLE SNAPPY DO:0 FPO:82235 SZ:18051/18436/1.02 VC:3042 ENC:RLE,PLAIN_DICTIONARY ST:[min: 2.7475480111085987E-6, max: 0.9999972524519889, num_nulls: 0]
prediction: DOUBLE SNAPPY DO:0 FPO:100286 SZ:269/266/0.99 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -0.0, max: 1.0, num_nulls: 0]
file: file:/Users/yumwang/data/a2_m2.parquet/part-00000-c70dd288-1a4d-4dc6-a3da-44b2ec303e53.snappy.parquet
creator: parquet-mr version 1.8.1 (build 4aba4dae7bb0d4edbcf7923ae1339f28fd3f7fcf)
extra: org.apache.spark.sql.parquet.row.metadata = {"type":"struct","fields":[{"name":"label","type":"integer","nullable":false,"metadata":{}},{"name":"CLASS","type":"long","nullable":true,"metadata":{}},{"name":"SENSORID","type":"string","nullable":true,"metadata":{}},{"name":"X","type":"double","nullable":true,"metadata":{}},{"name":"Y","type":"double","nullable":true,"metadata":{}},{"name":"Z","type":"double","nullable":true,"metadata":{}},{"name":"_id","type":"string","nullable":true,"metadata":{}},{"name":"_rev","type":"string","nullable":true,"metadata":{}},{"name":"features","type":{"type":"udt","class":"org.apache.spark.ml.linalg.VectorUDT","pyClass":"pyspark.ml.linalg.VectorUDT","sqlType":{"type":"struct","fields":[{"name":"type","type":"byte","nullable":false,"metadata":{}},{"name":"size","type":"integer","nullable":true,"metadata":{}},{"name":"indices","type":{"type":"array","elementType":"integer","containsNull":false},"nullable":true,"metadata":{}},{"name":"values","type":{"type":"array","elementType":"double","containsNull":false},"nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{"ml_attr":{"attrs":{"numeric":[{"idx":0,"name":"X"},{"idx":1,"name":"Y"},{"idx":2,"name":"Z"}]},"num_attrs":3}}},{"name":"prediction","type":"double","nullable":true,"metadata":{}}]}

file schema: spark_schema
--------------------------------------------------------------------------------
label: REQUIRED INT32 R:0 D:0
CLASS: OPTIONAL INT64 R:0 D:1
SENSORID: OPTIONAL BINARY O:UTF8 R:0 D:1
X: OPTIONAL DOUBLE R:0 D:1
Y: OPTIONAL DOUBLE R:0 D:1
Z: OPTIONAL DOUBLE R:0 D:1
_id: OPTIONAL BINARY O:UTF8 R:0 D:1
_rev: OPTIONAL BINARY O:UTF8 R:0 D:1
features: OPTIONAL F:4
.type: REQUIRED INT32 O:INT_8 R:0 D:1
.size: OPTIONAL INT32 R:0 D:2
.indices: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED INT32 R:1 D:3
.values: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED DOUBLE R:1 D:3
prediction: OPTIONAL DOUBLE R:0 D:1

row group 1: RC:1521 TS:123998 OFFSET:4
--------------------------------------------------------------------------------
label: INT32 SNAPPY DO:0 FPO:4 SZ:58/54/0.93 VC:1521 ENC:BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 0, num_nulls: 0]
CLASS: INT64 SNAPPY DO:0 FPO:62 SZ:149/147/0.99 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 1, num_nulls: 0]
SENSORID: BINARY SNAPPY DO:0 FPO:211 SZ:81/77/0.95 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: aaaaaaaa, max: aaaaaaaa, num_nulls: 0]
X: DOUBLE SNAPPY DO:0 FPO:292 SZ:3850/5559/1.44 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -40.06, max: 22.1, num_nulls: 0]
Y: DOUBLE SNAPPY DO:0 FPO:4142 SZ:3278/4920/1.50 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -21.26, max: 15.12, num_nulls: 0]
Z: DOUBLE SNAPPY DO:0 FPO:7420 SZ:4324/6110/1.41 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -25.29, max: 39.48, num_nulls: 0]
_id: BINARY SNAPPY DO:0 FPO:11744 SZ:13568/54857/4.04 VC:1521 ENC:BIT_PACKED,RLE,PLAIN ST:[no stats for this column]
_rev: BINARY SNAPPY DO:0 FPO:25312 SZ:31619/36274/1.15 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[no stats for this column]
features:
.type: INT32 SNAPPY DO:0 FPO:56931 SZ:84/80/0.95 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 1, num_nulls: 0]
.size: INT32 SNAPPY DO:0 FPO:57015 SZ:82/78/0.95 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 3, max: 3, num_nulls: 1518]
.indices:
..list:
...element: INT32 SNAPPY DO:0 FPO:57097 SZ:56/54/0.96 VC:1521 ENC:RLE,PLAIN ST:[num_nulls: 1521, min/max not defined]
.values:
..list:
...element: DOUBLE SNAPPY DO:0 FPO:57153 SZ:10609/15527/1.46 VC:4557 ENC:RLE,PLAIN_DICTIONARY ST:[min: -40.06, max: 39.48, num_nulls: 3]
prediction: DOUBLE SNAPPY DO:0 FPO:67762 SZ:265/261/0.98 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -0.0, max: 1.0, num_nulls: 0]
file: file:/Users/yumwang/data/a2_m2.parquet/part-00000-f05dff0f-20be-4087-9897-268f7e86927a.snappy.parquet
creator: parquet-mr version 1.8.1 (build 4aba4dae7bb0d4edbcf7923ae1339f28fd3f7fcf)
extra: org.apache.spark.sql.parquet.row.metadata = {"type":"struct","fields":[{"name":"CLASS","type":"long","nullable":true,"metadata":{}},{"name":"SENSORID","type":"string","nullable":true,"metadata":{}},{"name":"X","type":"double","nullable":true,"metadata":{}},{"name":"Y","type":"double","nullable":true,"metadata":{}},{"name":"Z","type":"double","nullable":true,"metadata":{}},{"name":"_id","type":"string","nullable":true,"metadata":{}},{"name":"_rev","type":"string","nullable":true,"metadata":{}},{"name":"features","type":{"type":"udt","class":"org.apache.spark.ml.linalg.VectorUDT","pyClass":"pyspark.ml.linalg.VectorUDT","sqlType":{"type":"struct","fields":[{"name":"type","type":"byte","nullable":false,"metadata":{}},{"name":"size","type":"integer","nullable":true,"metadata":{}},{"name":"indices","type":{"type":"array","elementType":"integer","containsNull":false},"nullable":true,"metadata":{}},{"name":"values","type":{"type":"array","elementType":"double","containsNull":false},"nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{"ml_attr":{"attrs":{"numeric":[{"idx":0,"name":"X"},{"idx":1,"name":"Y"},{"idx":2,"name":"Z"}]},"num_attrs":3}}},{"name":"rawPrediction","type":{"type":"udt","class":"org.apache.spark.ml.linalg.VectorUDT","pyClass":"pyspark.ml.linalg.VectorUDT","sqlType":{"type":"struct","fields":[{"name":"type","type":"byte","nullable":false,"metadata":{}},{"name":"size","type":"integer","nullable":true,"metadata":{}},{"name":"indices","type":{"type":"array","elementType":"integer","containsNull":false},"nullable":true,"metadata":{}},{"name":"values","type":{"type":"array","elementType":"double","containsNull":false},"nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{}},{"name":"probability","type":{"type":"udt","class":"org.apache.spark.ml.linalg.VectorUDT","pyClass":"pyspark.ml.linalg.VectorUDT","sqlType":{"type":"struct","fields":[{"name":"type","type":"byte","nullable":false,"metadata":{}},{"name":"size","type":"integer","nullabl
e":true,"metadata":{}},{"name":"indices","type":{"type":"array","elementType":"integer","containsNull":false},"nullable":true,"metadata":{}},{"name":"values","type":{"type":"array","elementType":"double","containsNull":false},"nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{}},{"name":"prediction","type":"double","nullable":true,"metadata":{}}]}

file schema: spark_schema
--------------------------------------------------------------------------------
CLASS: OPTIONAL INT64 R:0 D:1
SENSORID: OPTIONAL BINARY O:UTF8 R:0 D:1
X: OPTIONAL DOUBLE R:0 D:1
Y: OPTIONAL DOUBLE R:0 D:1
Z: OPTIONAL DOUBLE R:0 D:1
_id: OPTIONAL BINARY O:UTF8 R:0 D:1
_rev: OPTIONAL BINARY O:UTF8 R:0 D:1
features: OPTIONAL F:4
.type: REQUIRED INT32 O:INT_8 R:0 D:1
.size: OPTIONAL INT32 R:0 D:2
.indices: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED INT32 R:1 D:3
.values: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED DOUBLE R:1 D:3
rawPrediction: OPTIONAL F:4
.type: REQUIRED INT32 O:INT_8 R:0 D:1
.size: OPTIONAL INT32 R:0 D:2
.indices: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED INT32 R:1 D:3
.values: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED DOUBLE R:1 D:3
probability: OPTIONAL F:4
.type: REQUIRED INT32 O:INT_8 R:0 D:1
.size: OPTIONAL INT32 R:0 D:2
.indices: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED INT32 R:1 D:3
.values: OPTIONAL F:1
..list: REPEATED F:1
...element: REQUIRED DOUBLE R:1 D:3
prediction: OPTIONAL DOUBLE R:0 D:1

row group 1: RC:1521 TS:161083 OFFSET:4
--------------------------------------------------------------------------------
CLASS: INT64 SNAPPY DO:0 FPO:4 SZ:148/145/0.98 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 1, num_nulls: 0]
SENSORID: BINARY SNAPPY DO:0 FPO:152 SZ:81/77/0.95 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: aaaaaaaa, max: aaaaaaaa, num_nulls: 0]
X: DOUBLE SNAPPY DO:0 FPO:233 SZ:3750/5559/1.48 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -40.06, max: 22.1, num_nulls: 0]
Y: DOUBLE SNAPPY DO:0 FPO:3983 SZ:3284/4924/1.50 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -21.26, max: 15.12, num_nulls: 0]
Z: DOUBLE SNAPPY DO:0 FPO:7267 SZ:4277/6110/1.43 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -25.29, max: 39.48, num_nulls: 0]
_id: BINARY SNAPPY DO:0 FPO:11544 SZ:13583/54857/4.04 VC:1521 ENC:BIT_PACKED,RLE,PLAIN ST:[no stats for this column]
_rev: BINARY SNAPPY DO:0 FPO:25127 SZ:31669/36274/1.15 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[no stats for this column]
features:
.type: INT32 SNAPPY DO:0 FPO:56796 SZ:85/81/0.95 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 0, max: 1, num_nulls: 0]
.size: INT32 SNAPPY DO:0 FPO:56881 SZ:83/79/0.95 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 3, max: 3, num_nulls: 1518]
.indices:
..list:
...element: INT32 SNAPPY DO:0 FPO:56964 SZ:57/55/0.96 VC:1521 ENC:RLE,PLAIN ST:[num_nulls: 1521, min/max not defined]
.values:
..list:
...element: DOUBLE SNAPPY DO:0 FPO:57021 SZ:10443/15527/1.49 VC:4557 ENC:RLE,PLAIN_DICTIONARY ST:[min: -40.06, max: 39.48, num_nulls: 3]
rawPrediction:
.type: INT32 SNAPPY DO:0 FPO:67464 SZ:65/61/0.94 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 1, max: 1, num_nulls: 0]
.size: INT32 SNAPPY DO:0 FPO:67529 SZ:32/30/0.94 VC:1521 ENC:BIT_PACKED,RLE,PLAIN ST:[num_nulls: 1521, min/max not defined]
.indices:
..list:
...element: INT32 SNAPPY DO:0 FPO:67561 SZ:39/37/0.95 VC:1521 ENC:RLE,PLAIN ST:[num_nulls: 1521, min/max not defined]
.values:
..list:
...element: DOUBLE SNAPPY DO:0 FPO:67600 SZ:14425/18436/1.28 VC:3042 ENC:RLE,PLAIN_DICTIONARY ST:[min: -12.804798928798698, max: 12.804798928798698, num_nulls: 0]
probability:
.type: INT32 SNAPPY DO:0 FPO:82025 SZ:65/61/0.94 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: 1, max: 1, num_nulls: 0]
.size: INT32 SNAPPY DO:0 FPO:82090 SZ:32/30/0.94 VC:1521 ENC:BIT_PACKED,RLE,PLAIN ST:[num_nulls: 1521, min/max not defined]
.indices:
..list:
...element: INT32 SNAPPY DO:0 FPO:82122 SZ:39/37/0.95 VC:1521 ENC:RLE,PLAIN ST:[num_nulls: 1521, min/max not defined]
.values:
..list:
...element: DOUBLE SNAPPY DO:0 FPO:82161 SZ:18061/18436/1.02 VC:3042 ENC:RLE,PLAIN_DICTIONARY ST:[min: 2.7475480111085987E-6, max: 0.9999972524519889, num_nulls: 0]
prediction: DOUBLE SNAPPY DO:0 FPO:100222 SZ:268/267/1.00 VC:1521 ENC:RLE,BIT_PACKED,PLAIN_DICTIONARY ST:[min: -0.0, max: 1.0, num_nulls: 0]{noformat}


was (Author: q79969786):
I'm working on

> Incompatible parquet formats - java.lang.UnsupportedOperationException: org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainLongDictionary
> -------------------------------------------------------------------------------------------------------------------------------------------------------------
>
>                 Key: SPARK-24828
>                 URL: https://issues.apache.org/jira/browse/SPARK-24828
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 2.3.0
>         Environment: Environment for creating the parquet file:
> IBM Watson Studio Apache Spark Service, V2.1.2
> Environment for reading the parquet file:
> java version "1.8.0_144"
> Java(TM) SE Runtime Environment (build 1.8.0_144-b01)
> Java HotSpot(TM) 64-Bit Server VM (build 25.144-b01, mixed mode)
> MacOSX 10.13.3 (17D47)
> Spark spark-2.1.2-bin-hadoop2.7 directly obtained from http://spark.apache.org/downloads.html
>            Reporter: Romeo Kienzer
>            Priority: Minor
>         Attachments: a2_m2.parquet.zip
>
>
> As requested by [~hyukjin.kwon], here is a new issue - the related issue can be found here
>  
> Using the attached parquet file from one Spark installation, reading it using an installation directly obtained from [http://spark.apache.org/downloads.html] yields the following exception:
>  
> 18/07/17 07:40:38 ERROR Executor: Exception in task 3.0 in stage 1.0 (TID 4)
>  scala.MatchError: [1.0,null] (of class org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema)
>      at org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator$$anonfun$1.apply(MulticlassClassificationEvaluator.scala:79)
>      at org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator$$anonfun$1.apply(MulticlassClassificationEvaluator.scala:79)
>      at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
>      at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
>      at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:193)
>      at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
>      at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
>      at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
>      at org.apache.spark.scheduler.Task.run(Task.scala:99)
>      at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:325)
>      at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>      at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>      at java.lang.Thread.run(Thread.java:748)
>  18/07/17 07:40:38 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1)
>  java.lang.UnsupportedOperationException: org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainLongDictionary
>      at org.apache.parquet.column.Dictionary.decodeToInt(Dictionary.java:48)
>      at org.apache.spark.sql.execution.vectorized.OnHeapColumnVector.getInt(OnHeapColumnVector.java:233)
>      at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
>      at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
>      at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
>      at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
>      at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
>      at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
>      at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
>      at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:191)
>      at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
>      at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
>      at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
>      at org.apache.spark.scheduler.Task.run(Task.scala:99)
>      at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:325)
>      at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>      at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>      at java.lang.Thread.run(Thread.java:748)
>  
> The file is attached [^a2_m2.parquet.zip]
>  
> The following code reproduces the error:
> df = spark.read.parquet('a2_m2.parquet')
> from pyspark.ml.evaluation import MulticlassClassificationEvaluator
> binEval = MulticlassClassificationEvaluator().setMetricName("accuracy") .setPredictionCol("prediction").setLabelCol("label")
> accuracy = binEval.evaluate(df)



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org