You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airflow.apache.org by sa...@apache.org on 2018/05/15 17:53:33 UTC
incubator-airflow git commit: [AIRFLOW-2452] Document field_dict must
be OrderedDict
Repository: incubator-airflow
Updated Branches:
refs/heads/master 7c233179e -> 648b14b4d
[AIRFLOW-2452] Document field_dict must be OrderedDict
HiveCliHook.load_file has a parameter called
field_dict, which defines name-type pairs
for columns. It must be an OrderedDict so as to
keep the columns' order, but this is undocumented.
This PR adds a note about that, and fixes
the HiveCliHook.load_df function, which calls
load_file internally.
Closes #3347 from sekikn/AIRFLOW-2452
Project: http://git-wip-us.apache.org/repos/asf/incubator-airflow/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-airflow/commit/648b14b4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-airflow/tree/648b14b4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-airflow/diff/648b14b4
Branch: refs/heads/master
Commit: 648b14b4d95bf3aca26e8b54ffe8585b52efc8fd
Parents: 7c23317
Author: Kengo Seki <se...@apache.org>
Authored: Tue May 15 10:53:29 2018 -0700
Committer: r39132 <si...@yahoo.com>
Committed: Tue May 15 10:53:29 2018 -0700
----------------------------------------------------------------------
airflow/hooks/hive_hooks.py | 16 +++++++++++-----
tests/hooks/test_hive_hook.py | 2 ++
2 files changed, 13 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-airflow/blob/648b14b4/airflow/hooks/hive_hooks.py
----------------------------------------------------------------------
diff --git a/airflow/hooks/hive_hooks.py b/airflow/hooks/hive_hooks.py
index 6c9a907..92958c0 100644
--- a/airflow/hooks/hive_hooks.py
+++ b/airflow/hooks/hive_hooks.py
@@ -27,6 +27,7 @@ import itertools
import re
import subprocess
import time
+from collections import OrderedDict
from tempfile import NamedTemporaryFile
import hmsclient
@@ -303,8 +304,9 @@ class HiveCliHook(BaseHook):
:param recreate: whether to drop and recreate the table at every
execution
:type recreate: bool
- :param field_dict: mapping from column name to hive data type
- :type field_dict: dict
+ :param field_dict: mapping from column name to hive data type.
+ Note that it must be OrderedDict so as to keep columns' order.
+ :type field_dict: OrderedDict
:param encoding: string encoding to use when writing DataFrame to file
:type encoding: str
:param pandas_kwargs: passed to DataFrame.to_csv
@@ -325,7 +327,10 @@ class HiveCliHook(BaseHook):
'V': 'STRING' # void
}
- return dict((col, DTYPE_KIND_HIVE_TYPE[dtype.kind]) for col, dtype in df.dtypes.iteritems())
+ d = OrderedDict()
+ for col, dtype in df.dtypes.iteritems():
+ d[col] = DTYPE_KIND_HIVE_TYPE[dtype.kind]
+ return d
if pandas_kwargs is None:
pandas_kwargs = {}
@@ -378,8 +383,9 @@ class HiveCliHook(BaseHook):
:param delimiter: field delimiter in the file
:type delimiter: str
:param field_dict: A dictionary of the fields name in the file
- as keys and their Hive types as values
- :type field_dict: dict
+ as keys and their Hive types as values.
+ Note that it must be OrderedDict so as to keep columns' order.
+ :type field_dict: OrderedDict
:param create: whether to create the table if it doesn't exist
:type create: bool
:param overwrite: whether to overwrite the data in table or partition
http://git-wip-us.apache.org/repos/asf/incubator-airflow/blob/648b14b4/tests/hooks/test_hive_hook.py
----------------------------------------------------------------------
diff --git a/tests/hooks/test_hive_hook.py b/tests/hooks/test_hive_hook.py
index 9eda9e3..3c278e3 100644
--- a/tests/hooks/test_hive_hook.py
+++ b/tests/hooks/test_hive_hook.py
@@ -25,6 +25,7 @@ import random
import mock
import unittest
+from collections import OrderedDict
from hmsclient import HMSClient
from airflow.exceptions import AirflowException
@@ -130,6 +131,7 @@ class TestHiveCliHook(unittest.TestCase):
kwargs = mock_load_file.call_args[1]
self.assertEqual(kwargs["delimiter"], delimiter)
self.assertEqual(kwargs["field_dict"], {"c": u"STRING"})
+ self.assertTrue(isinstance(kwargs["field_dict"], OrderedDict))
self.assertEqual(kwargs["table"], table)