You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by sa...@apache.org on 2018/05/15 17:53:33 UTC

incubator-airflow git commit: [AIRFLOW-2452] Document field_dict must be OrderedDict

Repository: incubator-airflow
Updated Branches:
  refs/heads/master 7c233179e -> 648b14b4d


[AIRFLOW-2452] Document field_dict must be OrderedDict

HiveCliHook.load_file has a parameter called
field_dict, which defines name-type pairs
for columns. It must be an OrderedDict so as to
keep the columns' order, but this is undocumented.
This PR adds a note about that, and fixes the
HiveCliHook.load_df function, which calls
load_file internally.

Closes #3347 from sekikn/AIRFLOW-2452


Project: http://git-wip-us.apache.org/repos/asf/incubator-airflow/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-airflow/commit/648b14b4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-airflow/tree/648b14b4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-airflow/diff/648b14b4

Branch: refs/heads/master
Commit: 648b14b4d95bf3aca26e8b54ffe8585b52efc8fd
Parents: 7c23317
Author: Kengo Seki <se...@apache.org>
Authored: Tue May 15 10:53:29 2018 -0700
Committer: r39132 <si...@yahoo.com>
Committed: Tue May 15 10:53:29 2018 -0700

----------------------------------------------------------------------
 airflow/hooks/hive_hooks.py   | 16 +++++++++++-----
 tests/hooks/test_hive_hook.py |  2 ++
 2 files changed, 13 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-airflow/blob/648b14b4/airflow/hooks/hive_hooks.py
----------------------------------------------------------------------
diff --git a/airflow/hooks/hive_hooks.py b/airflow/hooks/hive_hooks.py
index 6c9a907..92958c0 100644
--- a/airflow/hooks/hive_hooks.py
+++ b/airflow/hooks/hive_hooks.py
@@ -27,6 +27,7 @@ import itertools
 import re
 import subprocess
 import time
+from collections import OrderedDict
 from tempfile import NamedTemporaryFile
 import hmsclient
 
@@ -303,8 +304,9 @@ class HiveCliHook(BaseHook):
         :param recreate: whether to drop and recreate the table at every
             execution
         :type recreate: bool
-        :param field_dict: mapping from column name to hive data type
-        :type field_dict: dict
+        :param field_dict: mapping from column name to hive data type.
+            Note that it must be OrderedDict so as to keep columns' order.
+        :type field_dict: OrderedDict
         :param encoding: string encoding to use when writing DataFrame to file
         :type encoding: str
         :param pandas_kwargs: passed to DataFrame.to_csv
@@ -325,7 +327,10 @@ class HiveCliHook(BaseHook):
                 'V': 'STRING'    # void
             }
 
-            return dict((col, DTYPE_KIND_HIVE_TYPE[dtype.kind]) for col, dtype in df.dtypes.iteritems())
+            d = OrderedDict()
+            for col, dtype in df.dtypes.iteritems():
+                d[col] = DTYPE_KIND_HIVE_TYPE[dtype.kind]
+            return d
 
         if pandas_kwargs is None:
             pandas_kwargs = {}
@@ -378,8 +383,9 @@ class HiveCliHook(BaseHook):
         :param delimiter: field delimiter in the file
         :type delimiter: str
         :param field_dict: A dictionary of the fields name in the file
-            as keys and their Hive types as values
-        :type field_dict: dict
+            as keys and their Hive types as values.
+            Note that it must be OrderedDict so as to keep columns' order.
+        :type field_dict: OrderedDict
         :param create: whether to create the table if it doesn't exist
         :type create: bool
         :param overwrite: whether to overwrite the data in table or partition

http://git-wip-us.apache.org/repos/asf/incubator-airflow/blob/648b14b4/tests/hooks/test_hive_hook.py
----------------------------------------------------------------------
diff --git a/tests/hooks/test_hive_hook.py b/tests/hooks/test_hive_hook.py
index 9eda9e3..3c278e3 100644
--- a/tests/hooks/test_hive_hook.py
+++ b/tests/hooks/test_hive_hook.py
@@ -25,6 +25,7 @@ import random
 import mock
 import unittest
 
+from collections import OrderedDict
 from hmsclient import HMSClient
 
 from airflow.exceptions import AirflowException
@@ -130,6 +131,7 @@ class TestHiveCliHook(unittest.TestCase):
         kwargs = mock_load_file.call_args[1]
         self.assertEqual(kwargs["delimiter"], delimiter)
         self.assertEqual(kwargs["field_dict"], {"c": u"STRING"})
+        self.assertTrue(isinstance(kwargs["field_dict"], OrderedDict))
         self.assertEqual(kwargs["table"], table)