You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by Apache Wiki <wi...@apache.org> on 2009/04/16 23:23:49 UTC

[Hadoop Wiki] Update of "Hive/LanguageManual/Transform" by ZhengShao

Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Hadoop Wiki" for change notification.

The following page has been changed by ZhengShao:
http://wiki.apache.org/hadoop/Hive/LanguageManual/Transform

------------------------------------------------------------------------------
  
  Note that columns will be converted to ''STRING'' and delimited by TAB before being fed to the user script, and the standard output of the user script will be treated as TAB-separated ''STRING'' columns. User scripts can write debug information to standard error, which will be shown on the task detail page in Hadoop.
  
- In the syntax, both ''MAP'' and ''REDUCE'' can be also written as ''SELECT TRANSFORM''.  There are actually no difference between these three.
+ In the syntax, both ''MAP ...'' and ''REDUCE ...'' can also be written as ''SELECT TRANSFORM ( ... )''.  There is actually no difference between these three forms.
  Hive runs the reduce script in the reduce task (instead of the map task) because of the ''clusterBy''/''distributeBy''/''sortBy'' clause in the inner query.
  
  Please also see [wiki:Self:Hive/LanguageManual/SortBy Sort By / Cluster By / Distribute By].
@@ -23, +23 @@

  query:
    FROM (
      FROM src
-     MAP '(' expression (',' expression)* ')'
+     MAP expression (',' expression)*
      USING 'my_map_script'
      ( AS colName (',' colName)* )?
      ( clusterBy? | distributeBy? sortBy? ) src_alias
    )
-   REDUCE '(' expression (, expression)* ')'
+   REDUCE expression (',' expression)*
+     USING 'my_reduce_script'
+     ( AS colName (',' colName)* )?
+ 
+   FROM (
+     FROM src
+     SELECT TRANSFORM '(' expression (',' expression)* ')'
+     USING 'my_map_script'
+     ( AS colName (',' colName)* )?
+     ( clusterBy? | distributeBy? sortBy? ) src_alias
+   )
+   SELECT TRANSFORM '(' expression (',' expression)* ')'
      USING 'my_reduce_script'
      ( AS colName (',' colName)* )?
  }}}
@@ -37, +48 @@

  {{{
    FROM (
      FROM pv_users
-     MAP ( pv_users.userid, pv_users.date )
+     MAP pv_users.userid, pv_users.date
      USING 'map_script'
      AS dt, uid
      CLUSTER BY dt) map_output
    INSERT OVERWRITE TABLE pv_users_reduced
-     REDUCE ( map_output.dt, map_output.uid )
+     REDUCE map_output.dt, map_output.uid
+     USING 'reduce_script'
+     AS date, count;
+   FROM (
+     FROM pv_users
+     SELECT TRANSFORM(pv_users.userid, pv_users.date)
+     USING 'map_script'
+     AS dt, uid
+     CLUSTER BY dt) map_output
+   INSERT OVERWRITE TABLE pv_users_reduced
+     SELECT TRANSFORM(map_output.dt, map_output.uid)
      USING 'reduce_script'
      AS date, count;
  }}}
@@ -54, +75 @@

  {{{
    FROM (
      FROM pv_users
-     MAP ( pv_users.userid, pv_users.date )
+     MAP pv_users.userid, pv_users.date
      USING 'map_script'
      CLUSTER BY key) map_output
    INSERT OVERWRITE TABLE pv_users_reduced
-     REDUCE ( map_output.key, map_output.value )
+     REDUCE map_output.key, map_output.value
      USING 'reduce_script'
      AS date, count;
  }}}