You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Josh Rosen (JIRA)" <ji...@apache.org> on 2016/09/19 20:23:20 UTC
[jira] [Assigned] (SPARK-17160) GetExternalRowField does not
properly escape field names, causing generated code not to compile
[ https://issues.apache.org/jira/browse/SPARK-17160?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Josh Rosen reassigned SPARK-17160:
----------------------------------
Assignee: Josh Rosen
> GetExternalRowField does not properly escape field names, causing generated code not to compile
> -----------------------------------------------------------------------------------------------
>
> Key: SPARK-17160
> URL: https://issues.apache.org/jira/browse/SPARK-17160
> Project: Spark
> Issue Type: Bug
> Components: SQL
> Affects Versions: 2.0.0
> Reporter: Josh Rosen
> Assignee: Josh Rosen
> Priority: Critical
>
> The following end-to-end test uncovered a bug in {{GetExternalRowField}}:
> {code}
> import org.apache.spark.sql.functions._
> import org.apache.spark.sql.catalyst.encoders._
> spark.sql("set spark.sql.codegen.fallback=false")
> val df = Seq(("100-200", "1", "300")).toDF("a", "b", "c")
> val df2 = df.select(regexp_replace($"a", "(\\d+)", "num"))
> df2.mapPartitions(x => x)(RowEncoder(df2.schema)).collect()
> {code}
> Running this produces the following compilation failure:
> {code}
> java.lang.Exception: failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 55, Column 64: Invalid escape sequence
> {code}
> The generated code is
> {code}
> /* 001 */ public Object generate(Object[] references) {
> /* 002 */ return new GeneratedIterator(references);
> /* 003 */ }
> /* 004 */
> /* 005 */ final class GeneratedIterator extends org.apache.spark.sql.execution.BufferedRowIterator {
> /* 006 */ private Object[] references;
> /* 007 */ private scala.collection.Iterator inputadapter_input;
> /* 008 */ private java.lang.String serializefromobject_errMsg;
> /* 009 */ private java.lang.String serializefromobject_errMsg1;
> /* 010 */ private UnsafeRow serializefromobject_result;
> /* 011 */ private org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder serializefromobject_holder;
> /* 012 */ private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter serializefromobject_rowWriter;
> /* 013 */
> /* 014 */ public GeneratedIterator(Object[] references) {
> /* 015 */ this.references = references;
> /* 016 */ }
> /* 017 */
> /* 018 */ public void init(int index, scala.collection.Iterator inputs[]) {
> /* 019 */ partitionIndex = index;
> /* 020 */ inputadapter_input = inputs[0];
> /* 021 */ this.serializefromobject_errMsg = (java.lang.String) references[0];
> /* 022 */ this.serializefromobject_errMsg1 = (java.lang.String) references[1];
> /* 023 */ serializefromobject_result = new UnsafeRow(1);
> /* 024 */ this.serializefromobject_holder = new org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder(serializefromobject_result, 32);
> /* 025 */ this.serializefromobject_rowWriter = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(serializefromobject_holder, 1);
> /* 026 */ }
> /* 027 */
> /* 028 */ protected void processNext() throws java.io.IOException {
> /* 029 */ while (inputadapter_input.hasNext()) {
> /* 030 */ InternalRow inputadapter_row = (InternalRow) inputadapter_input.next();
> /* 031 */ org.apache.spark.sql.Row inputadapter_value = (org.apache.spark.sql.Row)inputadapter_row.get(0, null);
> /* 032 */
> /* 033 */ if (false) {
> /* 034 */ throw new RuntimeException(serializefromobject_errMsg);
> /* 035 */ }
> /* 036 */
> /* 037 */ boolean serializefromobject_isNull1 = false || false;
> /* 038 */ final boolean serializefromobject_value1 = serializefromobject_isNull1 ? false : inputadapter_value.isNullAt(0);
> /* 039 */ boolean serializefromobject_isNull = false;
> /* 040 */ UTF8String serializefromobject_value = null;
> /* 041 */ if (!serializefromobject_isNull1 && serializefromobject_value1) {
> /* 042 */ final UTF8String serializefromobject_value5 = null;
> /* 043 */ serializefromobject_isNull = true;
> /* 044 */ serializefromobject_value = serializefromobject_value5;
> /* 045 */ } else {
> /* 046 */ if (false) {
> /* 047 */ throw new RuntimeException(serializefromobject_errMsg1);
> /* 048 */ }
> /* 049 */
> /* 050 */ if (false) {
> /* 051 */ throw new RuntimeException("The input external row cannot be null.");
> /* 052 */ }
> /* 053 */
> /* 054 */ if (inputadapter_value.isNullAt(0)) {
> /* 055 */ throw new RuntimeException("The 0th field 'regexp_replace(a, (\d+), num)' of input row " +
> /* 056 */ "cannot be null.");
> /* 057 */ }
> /* 058 */
> /* 059 */ final Object serializefromobject_value8 = inputadapter_value.get(0);
> /* 060 */ java.lang.String serializefromobject_value7 = null;
> /* 061 */ if (!false) {
> /* 062 */ if (serializefromobject_value8 instanceof java.lang.String) {
> /* 063 */ serializefromobject_value7 = (java.lang.String) serializefromobject_value8;
> /* 064 */ } else {
> /* 065 */ throw new RuntimeException(serializefromobject_value8.getClass().getName() + " is not a valid " +
> /* 066 */ "external type for schema of string");
> /* 067 */ }
> /* 068 */ }
> /* 069 */ boolean serializefromobject_isNull6 = false;
> /* 070 */ final UTF8String serializefromobject_value6 = serializefromobject_isNull6 ? null : org.apache.spark.unsafe.types.UTF8String.fromString(serializefromobject_value7);
> /* 071 */ serializefromobject_isNull6 = serializefromobject_value6 == null;
> /* 072 */ serializefromobject_isNull = serializefromobject_isNull6;
> /* 073 */ serializefromobject_value = serializefromobject_value6;
> /* 074 */ }
> /* 075 */ serializefromobject_holder.reset();
> /* 076 */
> /* 077 */ serializefromobject_rowWriter.zeroOutNullBytes();
> /* 078 */
> /* 079 */ if (serializefromobject_isNull) {
> /* 080 */ serializefromobject_rowWriter.setNullAt(0);
> /* 081 */ } else {
> /* 082 */ serializefromobject_rowWriter.write(0, serializefromobject_value);
> /* 083 */ }
> /* 084 */ serializefromobject_result.setTotalSize(serializefromobject_holder.totalSize());
> /* 085 */ append(serializefromobject_result);
> /* 086 */ if (shouldStop()) return;
> /* 087 */ }
> /* 088 */ }
> /* 089 */ }
> {code}
> Here, the problem is that the auto-generated field name contains special characters (including backslashes), and these are not escaped when interpolated into the generated code. This causes the invalid string literal
> {code}
> "The 0th field 'regexp_replace(a, (\d+), num)' of input row "
> {code}
> to appear in the generated code.
> We need to update {{GetExternalRowField}} to escape field names, and we should also audit other expressions to make sure we are not making the same mistake elsewhere.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org