You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Josh Rosen (JIRA)" <ji...@apache.org> on 2014/11/21 22:08:34 UTC
[jira] [Comment Edited] (SPARK-2775) HiveContext does not support
dots in column names.
[ https://issues.apache.org/jira/browse/SPARK-2775?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14205829#comment-14205829 ]
Josh Rosen edited comment on SPARK-2775 at 11/21/14 9:08 PM:
-------------------------------------------------------------
Chatted with [~marmbrus] about this and he pointed me to a workaround: use {{applySchema}} to replace the dots with some other separator (underscore, in this case):
{code}
import sqlContext.applySchema
/**
 * Recursively rewrites a schema so that no struct field name contains a dot.
 *
 * - Struct: every field is renamed (each "." becomes "_") and its type is cleaned in turn.
 * - Array: the element type is cleaned; the second ArrayType component is passed through.
 * - NullType: mapped to StringType.
 * - Anything else: returned unchanged.
 */
protected def cleanSchema(dataType: DataType): DataType = dataType match {
  case StructType(fields) =>
    val renamedFields = fields.map { field =>
      // Literal (non-regex) replacement of the dot.
      val dotFreeName = field.name.replaceAllLiterally(".", "_")
      field.copy(name = dotFreeName, dataType = cleanSchema(field.dataType))
    }
    StructType(renamedFields)
  case ArrayType(elementType, containsNull) =>
    ArrayType(cleanSchema(elementType), containsNull)
  case NullType => StringType
  case untouched => untouched
}
/**
 * Re-applies a dot-free schema to the given SchemaRDD: its schema is run through the
 * DataType overload of cleanSchema and handed to applySchema together with the RDD.
 */
protected def cleanSchema(schemaRDD: SchemaRDD): SchemaRDD = {
  // The DataType overload is declared to return DataType, hence the cast back to StructType.
  val dotFreeSchema = cleanSchema(schemaRDD.schema).asInstanceOf[StructType]
  applySchema(schemaRDD, dotFreeSchema)
}
{code}
I've translated this to Python:
{code}
from pyspark.sql import *
from copy import copy
def cleanSchema(schemaRDD):
    """Return the result of applying a dot-free copy of the schema to ``schemaRDD``.

    Every struct field name has "." replaced by "_"; struct fields and array
    element types are processed recursively. Each schema node is shallow-copied
    before it is modified.
    """

    def _clean_type(data_type):
        # Work on a shallow copy so attributes are reassigned on the copy only.
        cleaned = copy(data_type)
        if isinstance(cleaned, StructType):
            cleaned.fields = [_clean_field(child) for child in cleaned.fields]
        elif isinstance(cleaned, ArrayType):
            cleaned.elementType = _clean_type(cleaned.elementType)
        return cleaned

    def _clean_field(field):
        renamed = copy(field)
        renamed.name = renamed.name.replace(".", "_")
        renamed.dataType = _clean_type(renamed.dataType)
        return renamed

    # Identity map over the input; presumably this yields a plain RDD of rows
    # for applySchema -- TODO confirm against the pyspark.sql API.
    plain_rows = schemaRDD.map(lambda row: row)
    return sqlContext.applySchema(plain_rows, _clean_type(schemaRDD.schema()))
# Example usage; resultsRDD is not defined in this snippet and is presumably an
# existing SchemaRDD whose column names contain dots.
print cleanSchema(resultsRDD)
{code}
was (Author: joshrosen):
Chatted with [~marmbrus] about this and he pointed me to a workaround: use {{applySchema}} to replace the dots with some other separator (underscore, in this case):
{code}
import sqlContext.applySchema
/**
 * Replaces . in column names with _ (underscore).
 *
 * Recurses into struct fields and array element types, and maps NullType to StringType.
 *
 * NOTE(review): String.replaceAll interprets its first argument as a regular expression,
 * so the pattern "." used below matches every character rather than only a literal dot.
 * A literal replacement is needed to get the behaviour described here.
 */
protected def cleanSchema(dataType: DataType): DataType = dataType match {
case StructType(fields) =>
StructType(
fields.map(f =>
f.copy(name = f.name.replaceAll(".", "_"), dataType = cleanSchema(f.dataType))))
case ArrayType(elemType, nullable) => ArrayType(cleanSchema(elemType), nullable)
case NullType => StringType
// Every other type is returned as-is.
case other => other
}
/**
 * Replaces . signs in column names with _.
 *
 * Passes the RDD's schema through the DataType overload of cleanSchema and hands the
 * result, together with the RDD itself, to applySchema.
 */
protected def cleanSchema(schemaRDD: SchemaRDD): SchemaRDD = {
// The DataType overload is declared to return DataType, hence the cast to StructType.
applySchema(schemaRDD, cleanSchema(schemaRDD.schema).asInstanceOf[StructType])
}
{code}
I've translated this to Python:
{code}
from pyspark.sql import *
from copy import copy
# Applies a copy of schemaRDD's schema in which "." in field names is replaced by "_".
# NOTE(review): the indentation of this snippet was lost in the plain-text archive.
def cleanSchema(schemaRDD):
# Returns a shallow copy of the field with a dot-free name and a cleaned data type.
def renameField(field):
field = copy(field)
field.name = field.name.replace(".", "_")
field.dataType = doCleanSchema(field.dataType)
return field
# Returns a shallow copy of the data type, recursing into struct fields and array elements.
def doCleanSchema(dataType):
dataType = copy(dataType)
if isinstance(dataType, StructType):
dataType.fields = [renameField(f) for f in dataType.fields]
elif isinstance(dataType, ArrayType):
dataType.elementType = doCleanSchema(dataType.elementType )
return dataType
# The identity map presumably yields a plain RDD of rows for applySchema -- TODO confirm.
return sqlContext.applySchema(schemaRDD.map(lambda x: x), doCleanSchema(schemaRDD.schema()))
# Example usage; resultsRDD is not defined in this snippet and is presumably an
# existing SchemaRDD whose column names contain dots.
print cleanSchema(resultsRDD)
{code}
> HiveContext does not support dots in column names.
> ---------------------------------------------------
>
> Key: SPARK-2775
> URL: https://issues.apache.org/jira/browse/SPARK-2775
> Project: Spark
> Issue Type: Bug
> Components: SQL
> Reporter: Yin Huai
>
> Try the following snippet in hive/console:
> {code}
> val data = sc.parallelize(Seq("""{"key.number1": "value1", "key.number2": "value2"}"""))
> jsonRDD(data).registerAsTable("jt")
> hql("select `key.number1` from jt")
> {code}
> You will find that the column name key.number1 cannot be resolved:
> {code}
> org.apache.spark.sql.catalyst.errors.package$TreeNodeException: Unresolved attributes: 'key.number1, tree:
> Project ['key.number1]
> LowerCaseSchema
> Subquery jt
> SparkLogicalPlan (ExistingRdd [key.number1#8,key.number2#9], MappedRDD[17] at map at JsonRDD.scala:37)
> {code}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org