You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iotdb.apache.org by ro...@apache.org on 2022/04/08 06:18:57 UTC

[iotdb] branch master updated: [IOTDB-2838] Check and auto correct endian type for NumpyTablet (#5448)

This is an automated email from the ASF dual-hosted git repository.

rong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iotdb.git


The following commit(s) were added to refs/heads/master by this push:
     new 519293b078 [IOTDB-2838] Check and auto correct endian type for NumpyTablet (#5448)
519293b078 is described below

commit 519293b0783f7c1f7c3505da724ffbe3569e246a
Author: Haonan <hh...@outlook.com>
AuthorDate: Fri Apr 8 14:18:52 2022 +0800

    [IOTDB-2838] Check and auto correct endian type for NumpyTablet (#5448)
---
 client-py/README.md                                |  19 +--
 client-py/SessionExample.py                        |  14 +-
 client-py/iotdb/utils/IoTDBConstants.py            |  11 ++
 client-py/iotdb/utils/NumpyTablet.py               |  15 ++-
 client-py/tests/test_numpy_tablet.py               | 147 +++++++++++++++++++++
 .../UserGuide/API/Programming-Python-Native-API.md |  19 +--
 .../UserGuide/API/Programming-Python-Native-API.md |  18 +--
 7 files changed, 206 insertions(+), 37 deletions(-)

diff --git a/client-py/README.md b/client-py/README.md
index 6c0acd62e7..41c0a113b8 100644
--- a/client-py/README.md
+++ b/client-py/README.md
@@ -182,8 +182,9 @@ Comparing with Tablet, Numpy Tablet is using [numpy.ndarray](https://numpy.org/d
 With less memory footprint and time cost of serialization, the insert performance will be better.
 
 **Notice**
-1. time and numerical value columns in Tablet is ndarray
-2. ndarray should be big-endian, see the example below
+1. time and value columns in Tablet are ndarray.
+2. It is recommended to give each ndarray the specific dtype shown in the example below
+(if not, the default dtypes are also accepted and are converted automatically).
 
 ```python
 data_types_ = [
@@ -195,14 +196,14 @@ data_types_ = [
     TSDataType.TEXT,
 ]
 np_values_ = [
-  np.array([False, True, False, True], np.dtype(">?")),
-  np.array([10, 100, 100, 0], np.dtype(">i4")),
-  np.array([11, 11111, 1, 0], np.dtype(">i8")),
-  np.array([1.1, 1.25, 188.1, 0], np.dtype(">f4")),
-  np.array([10011.1, 101.0, 688.25, 6.25], np.dtype(">f8")),
-  np.array(["test01", "test02", "test03", "test04"]),
+    np.array([False, True, False, True], TSDataType.BOOLEAN.np_dtype()),
+    np.array([10, 100, 100, 0], TSDataType.INT32.np_dtype()),
+    np.array([11, 11111, 1, 0], TSDataType.INT64.np_dtype()),
+    np.array([1.1, 1.25, 188.1, 0], TSDataType.FLOAT.np_dtype()),
+    np.array([10011.1, 101.0, 688.25, 6.25], TSDataType.DOUBLE.np_dtype()),
+    np.array(["test01", "test02", "test03", "test04"], TSDataType.TEXT.np_dtype()),
 ]
-np_timestamps_ = np.array([1, 2, 3, 4], np.dtype(">i8"))
+np_timestamps_ = np.array([1, 2, 3, 4], TSDataType.INT64.np_dtype())
 np_tablet_ = NumpyTablet(
   "root.sg_test_01.d_02", measurements_, data_types_, np_values_, np_timestamps_
 )
diff --git a/client-py/SessionExample.py b/client-py/SessionExample.py
index 75897a44fb..bbc9669527 100644
--- a/client-py/SessionExample.py
+++ b/client-py/SessionExample.py
@@ -185,14 +185,14 @@ session.insert_tablet(tablet_)
 
 # insert one numpy tablet into the database.
 np_values_ = [
-    np.array([False, True, False, True], np.dtype(">?")),
-    np.array([10, 100, 100, 0], np.dtype(">i4")),
-    np.array([11, 11111, 1, 0], np.dtype(">i8")),
-    np.array([1.1, 1.25, 188.1, 0], np.dtype(">f4")),
-    np.array([10011.1, 101.0, 688.25, 6.25], np.dtype(">f8")),
-    np.array(["test01", "test02", "test03", "test04"]),
+    np.array([False, True, False, True], TSDataType.BOOLEAN.np_dtype()),
+    np.array([10, 100, 100, 0], TSDataType.INT32.np_dtype()),
+    np.array([11, 11111, 1, 0], TSDataType.INT64.np_dtype()),
+    np.array([1.1, 1.25, 188.1, 0], TSDataType.FLOAT.np_dtype()),
+    np.array([10011.1, 101.0, 688.25, 6.25], TSDataType.DOUBLE.np_dtype()),
+    np.array(["test01", "test02", "test03", "test04"], TSDataType.TEXT.np_dtype()),
 ]
-np_timestamps_ = np.array([1, 2, 3, 4], np.dtype(">i8"))
+np_timestamps_ = np.array([1, 2, 3, 4], TSDataType.INT64.np_dtype())
 np_tablet_ = NumpyTablet(
     "root.sg_test_01.d_02", measurements_, data_types_, np_values_, np_timestamps_
 )
diff --git a/client-py/iotdb/utils/IoTDBConstants.py b/client-py/iotdb/utils/IoTDBConstants.py
index 7b992e3d7a..ef66741fec 100644
--- a/client-py/iotdb/utils/IoTDBConstants.py
+++ b/client-py/iotdb/utils/IoTDBConstants.py
@@ -17,6 +17,7 @@
 #
 
 from enum import Enum, unique
+import numpy as np
 
 
 @unique
@@ -36,6 +37,16 @@ class TSDataType(Enum):
     def __hash__(self):
         return self.value
 
+    def np_dtype(self):
+        return {
+            TSDataType.BOOLEAN: np.dtype(">?"),
+            TSDataType.FLOAT: np.dtype(">f4"),
+            TSDataType.DOUBLE: np.dtype(">f8"),
+            TSDataType.INT32: np.dtype(">i4"),
+            TSDataType.INT64: np.dtype(">i8"),
+            TSDataType.TEXT: np.dtype("str"),
+        }[self]
+
 
 @unique
 class TSEncoding(Enum):
diff --git a/client-py/iotdb/utils/NumpyTablet.py b/client-py/iotdb/utils/NumpyTablet.py
index 8dd8457459..b81a172a40 100644
--- a/client-py/iotdb/utils/NumpyTablet.py
+++ b/client-py/iotdb/utils/NumpyTablet.py
@@ -17,7 +17,6 @@
 #
 
 import struct
-
 from iotdb.utils.IoTDBConstants import TSDataType
 from iotdb.utils.BitMap import BitMap
 
@@ -41,15 +40,25 @@ class NumpyTablet(object):
         """
         if len(values) > 0 and len(values[0]) != len(timestamps):
             raise RuntimeError(
-                "Input error! len(timestamps) does not equal to len(values)!"
+                "Input error! len(timestamps) does not equal to len(values[0])!"
+            )
+        if len(values) != len(data_types):
+            raise RuntimeError(
+                "Input error! len(values) does not equal to len(data_types)!"
             )
 
-        if not NumpyTablet.check_sorted(timestamps):
+        if not self.check_sorted(timestamps):
             index = timestamps.argsort()
             timestamps = timestamps[index]
             for i in range(len(values)):
                 values[i] = values[i][index]
 
+        if timestamps.dtype != TSDataType.INT64.np_dtype():
+            timestamps = timestamps.astype(TSDataType.INT64.np_dtype())
+        for i in range(len(values)):
+            if values[i].dtype != data_types[i].np_dtype():
+                values[i] = values[i].astype(data_types[i].np_dtype())
+
         self.__values = values
         self.__timestamps = timestamps
         self.__device_id = device_id
diff --git a/client-py/tests/test_numpy_tablet.py b/client-py/tests/test_numpy_tablet.py
new file mode 100644
index 0000000000..b984193975
--- /dev/null
+++ b/client-py/tests/test_numpy_tablet.py
@@ -0,0 +1,147 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+import numpy as np
+from iotdb.utils.IoTDBConstants import TSDataType
+from iotdb.utils.NumpyTablet import NumpyTablet
+from iotdb.utils.Tablet import Tablet
+
+
+def test_numpy_tablet_serialization():
+
+    measurements_ = ["s_01", "s_02", "s_03", "s_04", "s_05", "s_06"]
+    data_types_ = [
+        TSDataType.BOOLEAN,
+        TSDataType.INT32,
+        TSDataType.INT64,
+        TSDataType.FLOAT,
+        TSDataType.DOUBLE,
+        TSDataType.TEXT,
+    ]
+    values_ = [
+        [False, 10, 11, 1.1, 10011.1, "test01"],
+        [True, 100, 11111, 1.25, 101.0, "test02"],
+        [False, 100, 1, 188.1, 688.25, "test03"],
+        [True, 0, 0, 0, 6.25, "test04"],
+    ]
+    timestamps_ = [16, 17, 18, 19]
+    tablet_ = Tablet(
+        "root.sg_test_01.d_01", measurements_, data_types_, values_, timestamps_
+    )
+    np_values_ = [
+        np.array([False, True, False, True], np.dtype(">?")),
+        np.array([10, 100, 100, 0], np.dtype(">i4")),
+        np.array([11, 11111, 1, 0], np.dtype(">i8")),
+        np.array([1.1, 1.25, 188.1, 0], np.dtype(">f4")),
+        np.array([10011.1, 101.0, 688.25, 6.25], np.dtype(">f8")),
+        np.array(["test01", "test02", "test03", "test04"]),
+    ]
+    np_timestamps_ = np.array([16, 17, 18, 19], np.dtype(">i8"))
+    np_tablet_ = NumpyTablet(
+        "root.sg_test_01.d_01", measurements_, data_types_, np_values_, np_timestamps_
+    )
+    assert tablet_.get_binary_timestamps() == np_tablet_.get_binary_timestamps()
+    assert tablet_.get_binary_values() == np_tablet_.get_binary_values()
+
+
+def test_sort_numpy_tablet():
+
+    measurements_ = ["s_01", "s_02", "s_03", "s_04", "s_05", "s_06"]
+    data_types_ = [
+        TSDataType.BOOLEAN,
+        TSDataType.INT32,
+        TSDataType.INT64,
+        TSDataType.FLOAT,
+        TSDataType.DOUBLE,
+        TSDataType.TEXT,
+    ]
+    values_ = [
+        [True, 10000, 11111, 8.999, 776, "test05"],
+        [True, 1000, 1111, 0, 6.25, "test06"],
+        [False, 100, 111, 188.1, 688.25, "test07"],
+        [False, 10, 11, 1.25, 101.0, "test08"],
+        [False, 0, 1, 1.1, 10011.1, "test09"],
+    ]
+    timestamps_ = [5, 6, 7, 8, 9]
+    tablet_ = Tablet(
+        "root.sg_test_01.d_01", measurements_, data_types_, values_, timestamps_
+    )
+    np_values_unsorted = [
+        np.array([False, False, False, True, True], np.dtype(">?")),
+        np.array([0, 10, 100, 1000, 10000], np.dtype(">i4")),
+        np.array([1, 11, 111, 1111, 11111], np.dtype(">i8")),
+        np.array([1.1, 1.25, 188.1, 0, 8.999], np.dtype(">f4")),
+        np.array([10011.1, 101.0, 688.25, 6.25, 776], np.dtype(">f8")),
+        np.array(["test09", "test08", "test07", "test06", "test05"]),
+    ]
+    np_timestamps_unsorted = np.array([9, 8, 7, 6, 5], np.dtype(">i8"))
+    np_tablet_ = NumpyTablet(
+        "root.sg_test_01.d_01",
+        measurements_,
+        data_types_,
+        np_values_unsorted,
+        np_timestamps_unsorted,
+    )
+    assert tablet_.get_binary_timestamps() == np_tablet_.get_binary_timestamps()
+    assert tablet_.get_binary_values() == np_tablet_.get_binary_values()
+
+
+def test_numpy_tablet_auto_correct_datatype():
+
+    measurements_ = ["s_01", "s_02", "s_03", "s_04", "s_05", "s_06"]
+    data_types_ = [
+        TSDataType.BOOLEAN,
+        TSDataType.INT32,
+        TSDataType.INT64,
+        TSDataType.FLOAT,
+        TSDataType.DOUBLE,
+        TSDataType.TEXT,
+    ]
+    values_ = [
+        [True, 10000, 11111, 8.999, 776, "test05"],
+        [True, 1000, 1111, 0, 6.25, "test06"],
+        [False, 100, 111, 188.1, 688.25, "test07"],
+        [False, 10, 11, 1.25, 101.0, "test08"],
+        [False, 0, 1, 1.1, 10011.1, "test09"],
+    ]
+    timestamps_ = [5, 6, 7, 8, 9]
+    tablet_ = Tablet(
+        "root.sg_test_01.d_01", measurements_, data_types_, values_, timestamps_
+    )
+    np_values_unsorted = [
+        np.array([False, False, False, True, True]),
+        np.array([0, 10, 100, 1000, 10000]),
+        np.array([1, 11, 111, 1111, 11111]),
+        np.array([1.1, 1.25, 188.1, 0, 8.999]),
+        np.array([10011.1, 101.0, 688.25, 6.25, 776]),
+        np.array(["test09", "test08", "test07", "test06", "test05"]),
+    ]
+    np_timestamps_unsorted = np.array([9, 8, 7, 6, 5])
+    # numpy.dtype of int and float should be little endian by default
+    assert np_timestamps_unsorted.dtype != np.dtype(">i8")
+    for i in range(1, 4):
+        assert np_values_unsorted[i].dtype != data_types_[i].np_dtype()
+    np_tablet_ = NumpyTablet(
+        "root.sg_test_01.d_01",
+        measurements_,
+        data_types_,
+        np_values_unsorted,
+        np_timestamps_unsorted,
+    )
+    assert tablet_.get_binary_timestamps() == np_tablet_.get_binary_timestamps()
+    assert tablet_.get_binary_values() == np_tablet_.get_binary_values()
diff --git a/docs/UserGuide/API/Programming-Python-Native-API.md b/docs/UserGuide/API/Programming-Python-Native-API.md
index eab4a477ac..c200586bbf 100644
--- a/docs/UserGuide/API/Programming-Python-Native-API.md
+++ b/docs/UserGuide/API/Programming-Python-Native-API.md
@@ -161,7 +161,8 @@ With less memory footprint and time cost of serialization, the insert performanc
 
 **Notice**
 1. time and numerical value columns in Tablet is ndarray
-2. ndarray should be big-endian, see the example below
+2. It is recommended to give each ndarray the specific dtype shown in the example below
+   (if not, the default dtypes are also accepted and are converted automatically).
 
 ```python
 data_types_ = [
@@ -173,14 +174,14 @@ data_types_ = [
     TSDataType.TEXT,
 ]
 np_values_ = [
-  np.array([False, True, False, True], np.dtype(">?")),
-  np.array([10, 100, 100, 0], np.dtype(">i4")),
-  np.array([11, 11111, 1, 0], np.dtype(">i8")),
-  np.array([1.1, 1.25, 188.1, 0], np.dtype(">f4")),
-  np.array([10011.1, 101.0, 688.25, 6.25], np.dtype(">f8")),
-  np.array(["test01", "test02", "test03", "test04"]),
+    np.array([False, True, False, True], TSDataType.BOOLEAN.np_dtype()),
+    np.array([10, 100, 100, 0], TSDataType.INT32.np_dtype()),
+    np.array([11, 11111, 1, 0], TSDataType.INT64.np_dtype()),
+    np.array([1.1, 1.25, 188.1, 0], TSDataType.FLOAT.np_dtype()),
+    np.array([10011.1, 101.0, 688.25, 6.25], TSDataType.DOUBLE.np_dtype()),
+    np.array(["test01", "test02", "test03", "test04"], TSDataType.TEXT.np_dtype()),
 ]
-np_timestamps_ = np.array([1, 2, 3, 4], np.dtype(">i8"))
+np_timestamps_ = np.array([1, 2, 3, 4], TSDataType.INT64.np_dtype())
 np_tablet_ = NumpyTablet(
   "root.sg_test_01.d_02", measurements_, data_types_, np_values_, np_timestamps_
 )
@@ -204,7 +205,7 @@ session.insert_record(device_id, timestamp, measurements_, data_types_, values_)
 ```python
 session.insert_records(
     device_ids_, time_list_, measurements_list_, data_type_list_, values_list_
-    )
+)
 ```
 
 * Insert multiple Records that belong to the same device.
diff --git a/docs/zh/UserGuide/API/Programming-Python-Native-API.md b/docs/zh/UserGuide/API/Programming-Python-Native-API.md
index 9cb92e74ed..55fc59d031 100644
--- a/docs/zh/UserGuide/API/Programming-Python-Native-API.md
+++ b/docs/zh/UserGuide/API/Programming-Python-Native-API.md
@@ -160,8 +160,8 @@ session.insert_tablet(tablet_)
 内存占用和序列化耗时会降低很多,写入效率也会有很大提升。
 
 **注意**
-1. Tablet 中的每一列值记录为一个 ndarray
-2. ndarray 需要为大端类型的数据类型,具体可参考下面的例子
+1. Tablet 中的每一列时间戳和值记录为一个 ndarray
+2. ndarray 推荐使用如下面例子中的特定的 dtype,如果不使用,不会影响正确性。
 
 ```python
 data_types_ = [
@@ -173,14 +173,14 @@ data_types_ = [
     TSDataType.TEXT,
 ]
 np_values_ = [
-  np.array([False, True, False, True], np.dtype(">?")),
-  np.array([10, 100, 100, 0], np.dtype(">i4")),
-  np.array([11, 11111, 1, 0], np.dtype(">i8")),
-  np.array([1.1, 1.25, 188.1, 0], np.dtype(">f4")),
-  np.array([10011.1, 101.0, 688.25, 6.25], np.dtype(">f8")),
-  np.array(["test01", "test02", "test03", "test04"]),
+    np.array([False, True, False, True], TSDataType.BOOLEAN.np_dtype()),
+    np.array([10, 100, 100, 0], TSDataType.INT32.np_dtype()),
+    np.array([11, 11111, 1, 0], TSDataType.INT64.np_dtype()),
+    np.array([1.1, 1.25, 188.1, 0], TSDataType.FLOAT.np_dtype()),
+    np.array([10011.1, 101.0, 688.25, 6.25], TSDataType.DOUBLE.np_dtype()),
+    np.array(["test01", "test02", "test03", "test04"], TSDataType.TEXT.np_dtype()),
 ]
-np_timestamps_ = np.array([1, 2, 3, 4], np.dtype(">i8"))
+np_timestamps_ = np.array([1, 2, 3, 4], TSDataType.INT64.np_dtype())
 np_tablet_ = NumpyTablet(
   "root.sg_test_01.d_02", measurements_, data_types_, np_values_, np_timestamps_
 )