You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iotdb.apache.org by ro...@apache.org on 2022/04/08 06:18:57 UTC
[iotdb] branch master updated: [IOTDB-2838] Check and auto correct endian type for NumpyTablet (#5448)
This is an automated email from the ASF dual-hosted git repository.
rong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iotdb.git
The following commit(s) were added to refs/heads/master by this push:
new 519293b078 [IOTDB-2838] Check and auto correct endian type for NumpyTablet (#5448)
519293b078 is described below
commit 519293b0783f7c1f7c3505da724ffbe3569e246a
Author: Haonan <hh...@outlook.com>
AuthorDate: Fri Apr 8 14:18:52 2022 +0800
[IOTDB-2838] Check and auto correct endian type for NumpyTablet (#5448)
---
client-py/README.md | 19 +--
client-py/SessionExample.py | 14 +-
client-py/iotdb/utils/IoTDBConstants.py | 11 ++
client-py/iotdb/utils/NumpyTablet.py | 15 ++-
client-py/tests/test_numpy_tablet.py | 147 +++++++++++++++++++++
.../UserGuide/API/Programming-Python-Native-API.md | 19 +--
.../UserGuide/API/Programming-Python-Native-API.md | 18 +--
7 files changed, 206 insertions(+), 37 deletions(-)
diff --git a/client-py/README.md b/client-py/README.md
index 6c0acd62e7..41c0a113b8 100644
--- a/client-py/README.md
+++ b/client-py/README.md
@@ -182,8 +182,9 @@ Comparing with Tablet, Numpy Tablet is using [numpy.ndarray](https://numpy.org/d
With less memory footprint and time cost of serialization, the insert performance will be better.
**Notice**
-1. time and numerical value columns in Tablet is ndarray
-2. ndarray should be big-endian, see the example below
+1. time and value columns in Tablet are ndarray.
+2. It is recommended to create each ndarray with the specific dtype shown in the example below
+(if not, the default dtypes are also OK; NumpyTablet converts them automatically).
```python
data_types_ = [
@@ -195,14 +196,14 @@ data_types_ = [
TSDataType.TEXT,
]
np_values_ = [
- np.array([False, True, False, True], np.dtype(">?")),
- np.array([10, 100, 100, 0], np.dtype(">i4")),
- np.array([11, 11111, 1, 0], np.dtype(">i8")),
- np.array([1.1, 1.25, 188.1, 0], np.dtype(">f4")),
- np.array([10011.1, 101.0, 688.25, 6.25], np.dtype(">f8")),
- np.array(["test01", "test02", "test03", "test04"]),
+ np.array([False, True, False, True], TSDataType.BOOLEAN.np_dtype()),
+ np.array([10, 100, 100, 0], TSDataType.INT32.np_dtype()),
+ np.array([11, 11111, 1, 0], TSDataType.INT64.np_dtype()),
+ np.array([1.1, 1.25, 188.1, 0], TSDataType.FLOAT.np_dtype()),
+ np.array([10011.1, 101.0, 688.25, 6.25], TSDataType.DOUBLE.np_dtype()),
+ np.array(["test01", "test02", "test03", "test04"], TSDataType.TEXT.np_dtype()),
]
-np_timestamps_ = np.array([1, 2, 3, 4], np.dtype(">i8"))
+np_timestamps_ = np.array([1, 2, 3, 4], TSDataType.INT64.np_dtype())
np_tablet_ = NumpyTablet(
"root.sg_test_01.d_02", measurements_, data_types_, np_values_, np_timestamps_
)
diff --git a/client-py/SessionExample.py b/client-py/SessionExample.py
index 75897a44fb..bbc9669527 100644
--- a/client-py/SessionExample.py
+++ b/client-py/SessionExample.py
@@ -185,14 +185,14 @@ session.insert_tablet(tablet_)
# insert one numpy tablet into the database.
np_values_ = [
- np.array([False, True, False, True], np.dtype(">?")),
- np.array([10, 100, 100, 0], np.dtype(">i4")),
- np.array([11, 11111, 1, 0], np.dtype(">i8")),
- np.array([1.1, 1.25, 188.1, 0], np.dtype(">f4")),
- np.array([10011.1, 101.0, 688.25, 6.25], np.dtype(">f8")),
- np.array(["test01", "test02", "test03", "test04"]),
+ np.array([False, True, False, True], TSDataType.BOOLEAN.np_dtype()),
+ np.array([10, 100, 100, 0], TSDataType.INT32.np_dtype()),
+ np.array([11, 11111, 1, 0], TSDataType.INT64.np_dtype()),
+ np.array([1.1, 1.25, 188.1, 0], TSDataType.FLOAT.np_dtype()),
+ np.array([10011.1, 101.0, 688.25, 6.25], TSDataType.DOUBLE.np_dtype()),
+ np.array(["test01", "test02", "test03", "test04"], TSDataType.TEXT.np_dtype()),
]
-np_timestamps_ = np.array([1, 2, 3, 4], np.dtype(">i8"))
+np_timestamps_ = np.array([1, 2, 3, 4], TSDataType.INT64.np_dtype())
np_tablet_ = NumpyTablet(
"root.sg_test_01.d_02", measurements_, data_types_, np_values_, np_timestamps_
)
diff --git a/client-py/iotdb/utils/IoTDBConstants.py b/client-py/iotdb/utils/IoTDBConstants.py
index 7b992e3d7a..ef66741fec 100644
--- a/client-py/iotdb/utils/IoTDBConstants.py
+++ b/client-py/iotdb/utils/IoTDBConstants.py
@@ -17,6 +17,7 @@
#
from enum import Enum, unique
+import numpy as np
@unique
@@ -36,6 +37,16 @@ class TSDataType(Enum):
def __hash__(self):
return self.value
+ def np_dtype(self):
+ return {
+ TSDataType.BOOLEAN: np.dtype(">?"),
+ TSDataType.FLOAT: np.dtype(">f4"),
+ TSDataType.DOUBLE: np.dtype(">f8"),
+ TSDataType.INT32: np.dtype(">i4"),
+ TSDataType.INT64: np.dtype(">i8"),
+ TSDataType.TEXT: np.dtype("str"),
+ }[self]
+
@unique
class TSEncoding(Enum):
diff --git a/client-py/iotdb/utils/NumpyTablet.py b/client-py/iotdb/utils/NumpyTablet.py
index 8dd8457459..b81a172a40 100644
--- a/client-py/iotdb/utils/NumpyTablet.py
+++ b/client-py/iotdb/utils/NumpyTablet.py
@@ -17,7 +17,6 @@
#
import struct
-
from iotdb.utils.IoTDBConstants import TSDataType
from iotdb.utils.BitMap import BitMap
@@ -41,15 +40,25 @@ class NumpyTablet(object):
"""
if len(values) > 0 and len(values[0]) != len(timestamps):
raise RuntimeError(
- "Input error! len(timestamps) does not equal to len(values)!"
+ "Input error! len(timestamps) does not equal to len(values[0])!"
+ )
+ if len(values) != len(data_types):
+ raise RuntimeError(
+ "Input error! len(values) does not equal to len(data_types)!"
)
- if not NumpyTablet.check_sorted(timestamps):
+ if not self.check_sorted(timestamps):
index = timestamps.argsort()
timestamps = timestamps[index]
for i in range(len(values)):
values[i] = values[i][index]
+ if timestamps.dtype != TSDataType.INT64.np_dtype():
+ timestamps = timestamps.astype(TSDataType.INT64.np_dtype())
+ for i in range(len(values)):
+ if values[i].dtype != data_types[i].np_dtype():
+ values[i] = values[i].astype(data_types[i].np_dtype())
+
self.__values = values
self.__timestamps = timestamps
self.__device_id = device_id
diff --git a/client-py/tests/test_numpy_tablet.py b/client-py/tests/test_numpy_tablet.py
new file mode 100644
index 0000000000..b984193975
--- /dev/null
+++ b/client-py/tests/test_numpy_tablet.py
@@ -0,0 +1,147 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+import numpy as np
+from iotdb.utils.IoTDBConstants import TSDataType
+from iotdb.utils.NumpyTablet import NumpyTablet
+from iotdb.utils.Tablet import Tablet
+
+
+def test_numpy_tablet_serialization():
+
+ measurements_ = ["s_01", "s_02", "s_03", "s_04", "s_05", "s_06"]
+ data_types_ = [
+ TSDataType.BOOLEAN,
+ TSDataType.INT32,
+ TSDataType.INT64,
+ TSDataType.FLOAT,
+ TSDataType.DOUBLE,
+ TSDataType.TEXT,
+ ]
+ values_ = [
+ [False, 10, 11, 1.1, 10011.1, "test01"],
+ [True, 100, 11111, 1.25, 101.0, "test02"],
+ [False, 100, 1, 188.1, 688.25, "test03"],
+ [True, 0, 0, 0, 6.25, "test04"],
+ ]
+ timestamps_ = [16, 17, 18, 19]
+ tablet_ = Tablet(
+ "root.sg_test_01.d_01", measurements_, data_types_, values_, timestamps_
+ )
+ np_values_ = [
+ np.array([False, True, False, True], np.dtype(">?")),
+ np.array([10, 100, 100, 0], np.dtype(">i4")),
+ np.array([11, 11111, 1, 0], np.dtype(">i8")),
+ np.array([1.1, 1.25, 188.1, 0], np.dtype(">f4")),
+ np.array([10011.1, 101.0, 688.25, 6.25], np.dtype(">f8")),
+ np.array(["test01", "test02", "test03", "test04"]),
+ ]
+ np_timestamps_ = np.array([16, 17, 18, 19], np.dtype(">i8"))
+ np_tablet_ = NumpyTablet(
+ "root.sg_test_01.d_01", measurements_, data_types_, np_values_, np_timestamps_
+ )
+ assert tablet_.get_binary_timestamps() == np_tablet_.get_binary_timestamps()
+ assert tablet_.get_binary_values() == np_tablet_.get_binary_values()
+
+
+def test_sort_numpy_tablet():
+
+ measurements_ = ["s_01", "s_02", "s_03", "s_04", "s_05", "s_06"]
+ data_types_ = [
+ TSDataType.BOOLEAN,
+ TSDataType.INT32,
+ TSDataType.INT64,
+ TSDataType.FLOAT,
+ TSDataType.DOUBLE,
+ TSDataType.TEXT,
+ ]
+ values_ = [
+ [True, 10000, 11111, 8.999, 776, "test05"],
+ [True, 1000, 1111, 0, 6.25, "test06"],
+ [False, 100, 111, 188.1, 688.25, "test07"],
+ [False, 10, 11, 1.25, 101.0, "test08"],
+ [False, 0, 1, 1.1, 10011.1, "test09"],
+ ]
+ timestamps_ = [5, 6, 7, 8, 9]
+ tablet_ = Tablet(
+ "root.sg_test_01.d_01", measurements_, data_types_, values_, timestamps_
+ )
+ np_values_unsorted = [
+ np.array([False, False, False, True, True], np.dtype(">?")),
+ np.array([0, 10, 100, 1000, 10000], np.dtype(">i4")),
+ np.array([1, 11, 111, 1111, 11111], np.dtype(">i8")),
+ np.array([1.1, 1.25, 188.1, 0, 8.999], np.dtype(">f4")),
+ np.array([10011.1, 101.0, 688.25, 6.25, 776], np.dtype(">f8")),
+ np.array(["test09", "test08", "test07", "test06", "test05"]),
+ ]
+ np_timestamps_unsorted = np.array([9, 8, 7, 6, 5], np.dtype(">i8"))
+ np_tablet_ = NumpyTablet(
+ "root.sg_test_01.d_01",
+ measurements_,
+ data_types_,
+ np_values_unsorted,
+ np_timestamps_unsorted,
+ )
+ assert tablet_.get_binary_timestamps() == np_tablet_.get_binary_timestamps()
+ assert tablet_.get_binary_values() == np_tablet_.get_binary_values()
+
+
+def test_numpy_tablet_auto_correct_datatype():
+
+ measurements_ = ["s_01", "s_02", "s_03", "s_04", "s_05", "s_06"]
+ data_types_ = [
+ TSDataType.BOOLEAN,
+ TSDataType.INT32,
+ TSDataType.INT64,
+ TSDataType.FLOAT,
+ TSDataType.DOUBLE,
+ TSDataType.TEXT,
+ ]
+ values_ = [
+ [True, 10000, 11111, 8.999, 776, "test05"],
+ [True, 1000, 1111, 0, 6.25, "test06"],
+ [False, 100, 111, 188.1, 688.25, "test07"],
+ [False, 10, 11, 1.25, 101.0, "test08"],
+ [False, 0, 1, 1.1, 10011.1, "test09"],
+ ]
+ timestamps_ = [5, 6, 7, 8, 9]
+ tablet_ = Tablet(
+ "root.sg_test_01.d_01", measurements_, data_types_, values_, timestamps_
+ )
+ np_values_unsorted = [
+ np.array([False, False, False, True, True]),
+ np.array([0, 10, 100, 1000, 10000]),
+ np.array([1, 11, 111, 1111, 11111]),
+ np.array([1.1, 1.25, 188.1, 0, 8.999]),
+ np.array([10011.1, 101.0, 688.25, 6.25, 776]),
+ np.array(["test09", "test08", "test07", "test06", "test05"]),
+ ]
+ np_timestamps_unsorted = np.array([9, 8, 7, 6, 5])
+ # numpy.dtype of int and float should be little endian by default
+ assert np_timestamps_unsorted.dtype != np.dtype(">i8")
+ for i in range(1, 4):
+ assert np_values_unsorted[i].dtype != data_types_[i].np_dtype()
+ np_tablet_ = NumpyTablet(
+ "root.sg_test_01.d_01",
+ measurements_,
+ data_types_,
+ np_values_unsorted,
+ np_timestamps_unsorted,
+ )
+ assert tablet_.get_binary_timestamps() == np_tablet_.get_binary_timestamps()
+ assert tablet_.get_binary_values() == np_tablet_.get_binary_values()
diff --git a/docs/UserGuide/API/Programming-Python-Native-API.md b/docs/UserGuide/API/Programming-Python-Native-API.md
index eab4a477ac..c200586bbf 100644
--- a/docs/UserGuide/API/Programming-Python-Native-API.md
+++ b/docs/UserGuide/API/Programming-Python-Native-API.md
@@ -161,7 +161,8 @@ With less memory footprint and time cost of serialization, the insert performanc
**Notice**
1. time and numerical value columns in Tablet is ndarray
-2. ndarray should be big-endian, see the example below
+2. It is recommended to create each ndarray with the specific dtype shown in the example below
+ (if not, the default dtypes are also OK; NumpyTablet converts them automatically).
```python
data_types_ = [
@@ -173,14 +174,14 @@ data_types_ = [
TSDataType.TEXT,
]
np_values_ = [
- np.array([False, True, False, True], np.dtype(">?")),
- np.array([10, 100, 100, 0], np.dtype(">i4")),
- np.array([11, 11111, 1, 0], np.dtype(">i8")),
- np.array([1.1, 1.25, 188.1, 0], np.dtype(">f4")),
- np.array([10011.1, 101.0, 688.25, 6.25], np.dtype(">f8")),
- np.array(["test01", "test02", "test03", "test04"]),
+ np.array([False, True, False, True], TSDataType.BOOLEAN.np_dtype()),
+ np.array([10, 100, 100, 0], TSDataType.INT32.np_dtype()),
+ np.array([11, 11111, 1, 0], TSDataType.INT64.np_dtype()),
+ np.array([1.1, 1.25, 188.1, 0], TSDataType.FLOAT.np_dtype()),
+ np.array([10011.1, 101.0, 688.25, 6.25], TSDataType.DOUBLE.np_dtype()),
+ np.array(["test01", "test02", "test03", "test04"], TSDataType.TEXT.np_dtype()),
]
-np_timestamps_ = np.array([1, 2, 3, 4], np.dtype(">i8"))
+np_timestamps_ = np.array([1, 2, 3, 4], TSDataType.INT64.np_dtype())
np_tablet_ = NumpyTablet(
"root.sg_test_01.d_02", measurements_, data_types_, np_values_, np_timestamps_
)
@@ -204,7 +205,7 @@ session.insert_record(device_id, timestamp, measurements_, data_types_, values_)
```python
session.insert_records(
device_ids_, time_list_, measurements_list_, data_type_list_, values_list_
- )
+)
```
* Insert multiple Records that belong to the same device.
diff --git a/docs/zh/UserGuide/API/Programming-Python-Native-API.md b/docs/zh/UserGuide/API/Programming-Python-Native-API.md
index 9cb92e74ed..55fc59d031 100644
--- a/docs/zh/UserGuide/API/Programming-Python-Native-API.md
+++ b/docs/zh/UserGuide/API/Programming-Python-Native-API.md
@@ -160,8 +160,8 @@ session.insert_tablet(tablet_)
内存占用和序列化耗时会降低很多,写入效率也会有很大提升。
**注意**
-1. Tablet 中的每一列值记录为一个 ndarray
-2. ndarray 需要为大端类型的数据类型,具体可参考下面的例子
+1. Tablet 中的每一列时间戳和值记录为一个 ndarray
+2. ndarray 推荐使用如下面例子中的特定的 dtype,如果不使用,不会影响正确性。
```python
data_types_ = [
@@ -173,14 +173,14 @@ data_types_ = [
TSDataType.TEXT,
]
np_values_ = [
- np.array([False, True, False, True], np.dtype(">?")),
- np.array([10, 100, 100, 0], np.dtype(">i4")),
- np.array([11, 11111, 1, 0], np.dtype(">i8")),
- np.array([1.1, 1.25, 188.1, 0], np.dtype(">f4")),
- np.array([10011.1, 101.0, 688.25, 6.25], np.dtype(">f8")),
- np.array(["test01", "test02", "test03", "test04"]),
+ np.array([False, True, False, True], TSDataType.BOOLEAN.np_dtype()),
+ np.array([10, 100, 100, 0], TSDataType.INT32.np_dtype()),
+ np.array([11, 11111, 1, 0], TSDataType.INT64.np_dtype()),
+ np.array([1.1, 1.25, 188.1, 0], TSDataType.FLOAT.np_dtype()),
+ np.array([10011.1, 101.0, 688.25, 6.25], TSDataType.DOUBLE.np_dtype()),
+ np.array(["test01", "test02", "test03", "test04"], TSDataType.TEXT.np_dtype()),
]
-np_timestamps_ = np.array([1, 2, 3, 4], np.dtype(">i8"))
+np_timestamps_ = np.array([1, 2, 3, 4], TSDataType.INT64.np_dtype())
np_tablet_ = NumpyTablet(
"root.sg_test_01.d_02", measurements_, data_types_, np_values_, np_timestamps_
)