You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@ignite.apache.org by GitBox <gi...@apache.org> on 2021/02/17 12:26:03 UTC

[GitHub] [ignite-python-thin-client] isapego commented on a change in pull request #17: IGNITE-14186 Implement c module to speedup hashcode.

isapego commented on a change in pull request #17:
URL: https://github.com/apache/ignite-python-thin-client/pull/17#discussion_r577531139



##########
File path: tests/test_cutils.py
##########
@@ -0,0 +1,122 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+from collections import OrderedDict
+
+import pytest
+
+import pyignite.utils as _putils
+from pyignite.datatypes import IntObject
+
+try:
+    from pyignite import _cutils
+
+    _cutils_hashcode = _cutils.hashcode
+    _cutils_schema_id = _cutils.schema_id
+except ImportError:
+    _cutils_hashcode = lambda x: None
+    _cutils_schema_id = lambda x: None
+    pass
+
+
+@pytest.mark.skip_if_no_cext
+def test_bytes_hashcode():
+    assert _cutils_hashcode(None) == 0
+    assert _cutils_hashcode(b'') == 1
+    assert _cutils_hashcode(bytearray()) == 1
+    assert _cutils_hashcode(memoryview(b'')) == 1
+
+    for i in range(1000):
+        rnd_bytes = bytearray([random.randint(0, 255) for _ in range(random.randint(1, 1024))])
+
+        fallback_val = _putils.__hashcode_fallback(rnd_bytes)
+        assert _cutils_hashcode(rnd_bytes) == fallback_val
+        assert _cutils_hashcode(bytes(rnd_bytes)) == fallback_val
+        assert _cutils_hashcode(memoryview(rnd_bytes)) == fallback_val
+
+
+@pytest.mark.skip_if_no_cext
+def test_string_hashcode():
+    assert _cutils_hashcode(None) == 0
+    assert _cutils_hashcode('') == 0
+
+    for i in range(1000):
+        rnd_str = get_random_unicode(random.randint(1, 128))

Review comment:
       Using random values is not very good thing for tests. Tests should always aim for repeatable result

##########
File path: setup.py
##########
@@ -13,92 +13,117 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from collections import defaultdict
+from distutils.command.build_ext import build_ext
+from distutils.errors import CCompilerError, DistutilsExecError, DistutilsPlatformError
+
 import setuptools
 import sys
 
 
-PYTHON_REQUIRED = (3, 4)
-PYTHON_INSTALLED = sys.version_info[:2]
+cext = setuptools.Extension(
+    "pyignite._cutils",
+    sources=[
+        "./cext/cutils.c"
+    ],
+    include_dirs=["./cext"]
+)
+
+if sys.platform == 'win32':
+    ext_errors = (CCompilerError, DistutilsExecError, DistutilsPlatformError, IOError, ValueError)
+else:
+    ext_errors = (CCompilerError, DistutilsExecError, DistutilsPlatformError)
+
+
+class BuildFailed(Exception):
+    pass
+
+
+class ve_build_ext(build_ext):
+    # This class allows C extension building to fail.
 
-if PYTHON_INSTALLED < PYTHON_REQUIRED:
-    sys.stderr.write('''
+    def run(self):
+        try:
+            build_ext.run(self)
+        except DistutilsPlatformError:
+            raise BuildFailed()
 
-`pyignite` is not compatible with Python {}.{}!
-Use Python {}.{} or above.
+    def build_extension(self, ext):
+        try:
+            build_ext.build_extension(self, ext)
+        except ext_errors:
+            raise BuildFailed()
 
 
-'''.format(
-            PYTHON_INSTALLED[0],
-            PYTHON_INSTALLED[1],
-            PYTHON_REQUIRED[0],
-            PYTHON_REQUIRED[1],
+def run_setup(with_binary=True):
+    if with_binary:
+        kw = dict(
+            ext_modules=[cext],
+            cmdclass=dict(build_ext=ve_build_ext),
         )
+    else:
+        kw = dict()
+
+    setuptools.setup(
+        name='pyignite',
+        version='0.4.0',
+        python_requires='>=3.6',
+        author='The Apache Software Foundation',
+        author_email='dev@ignite.apache.org',
+        description='Apache Ignite binary client Python API',
+        url='https://github.com/apache/ignite-python-thin-client',
+        packages=setuptools.find_packages(),
+        install_requires=[
+            "attrs==18.1.0"
+        ],
+        tests_require=[
+            'pytest==3.6.1',
+            'pytest-cov==2.5.1',
+            'teamcity-messages==1.21',
+            'psutil==5.6.5',
+            'jinja2==2.11.3'
+        ],
+        setup_requires=[
+            'pytest-runner==4.2'
+        ],
+        extras_require={
+            'docs': [
+                'wheel==0.36.2',
+                'Sphinx==1.7.5',
+                'sphinxcontrib-fulltoc==1.2.0'
+            ],
+        },

Review comment:
       We need a test that would check that requrements in `setup.py` are the same as in `requrements/*.txt`

##########
File path: cext/cutils.c
##########
@@ -0,0 +1,186 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <Python.h>
+
+#ifdef _MSC_VER
+
+typedef __int32 int32_t;
+typedef unsigned __int32 uint32_t;
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+
+#else
+#include <stdint.h>
+#endif
+
+static int32_t FNV1_OFFSET_BASIS = 0x811c9dc5;
+static int32_t FNV1_PRIME = 0x01000193;
+
+
+PyObject* hashcode(PyObject* self, PyObject *args);
+PyObject* schema_id(PyObject* self, PyObject *args);
+
+PyObject* str_hashcode(PyObject* data);
+int32_t str_hashcode_(PyObject* data, int lower);
+PyObject* b_hashcode(PyObject* data);
+
+static PyMethodDef methods[] = {
+    {"hashcode", (PyCFunction) hashcode, METH_VARARGS, ""},
+    {"schema_id", (PyCFunction) schema_id, METH_VARARGS, ""},
+    {NULL, NULL, 0, NULL}       /* Sentinel */
+};
+
+static struct PyModuleDef moduledef = {
+    PyModuleDef_HEAD_INIT,
+    "_cutils",
+    0,                    /* m_doc */
+    -1,                   /* m_size */
+    methods,              /* m_methods */
+    NULL,                 /* m_slots */
+    NULL,                 /* m_traverse */
+    NULL,                 /* m_clear */
+    NULL,                 /* m_free */
+};
+
+static char* hashcode_input_err = "supported only strings, bytearrays, bytes and memoryview";
+static char* schema_id_input_err = "input argument must be dict or int";
+static char* schema_field_type_err = "schema keys must be strings";
+
+PyMODINIT_FUNC PyInit__cutils(void) {
+	return PyModule_Create(&moduledef);
+}
+
+PyObject* hashcode(PyObject* self, PyObject *args) {
+    PyObject* data;
+
+    if (!PyArg_ParseTuple(args, "O", &data)) {
+        return NULL;
+    }
+
+    if (data == Py_None) {
+        return PyLong_FromLong(0);
+    }
+    else if (PyUnicode_CheckExact(data)) {
+        return str_hashcode(data);
+    }
+    else {
+        return b_hashcode(data);
+    }
+}
+
+PyObject* str_hashcode(PyObject* data) {
+    return PyLong_FromLong(str_hashcode_(data, 0));
+}
+
+int32_t str_hashcode_(PyObject *data, int lower) {
+    int32_t res = 0;
+    Py_ssize_t sz = PyUnicode_GET_LENGTH(data);
+
+    Py_ssize_t i;
+    for (i = 0; i < sz; i++) {
+        Py_UCS4 ch = PyUnicode_READ_CHAR(data, i);
+
+        if (lower) {
+            ch = Py_UNICODE_TOLOWER(ch);
+        }
+
+        res = 31 * res + ch;
+    }
+
+    return res;
+}
+
+PyObject* b_hashcode(PyObject* data) {
+    int32_t res = 1;
+    Py_ssize_t sz; char* buf;
+
+    if (PyBytes_CheckExact(data)) {
+        sz = PyBytes_GET_SIZE(data);
+        buf = PyBytes_AS_STRING(data);
+    }
+    else if (PyByteArray_CheckExact(data)) {
+        sz = PyByteArray_GET_SIZE(data);
+        buf = PyByteArray_AS_STRING(data);
+    }
+    else if (PyMemoryView_Check(data)) {
+        Py_buffer* pyBuf = PyMemoryView_GET_BUFFER(data);
+        sz = pyBuf->len;
+        buf = (char*)pyBuf->buf;
+    }
+    else {
+        PyErr_SetString(PyExc_ValueError, hashcode_input_err);
+        return NULL;
+    }
+
+    Py_ssize_t i;
+    for (i = 0; i < sz; i++) {
+        res = 31 * res + buf[i];

Review comment:
       I believe we should use `signed char` or `unsigned char` here. Currently we use `char` which has implementation-dependent signedness so the code above could result in different hashcode calculation on different platforms.

##########
File path: cext/cutils.c
##########
@@ -0,0 +1,186 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <Python.h>
+
+#ifdef _MSC_VER
+
+typedef __int32 int32_t;
+typedef unsigned __int32 uint32_t;
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+
+#else
+#include <stdint.h>
+#endif
+
+static int32_t FNV1_OFFSET_BASIS = 0x811c9dc5;
+static int32_t FNV1_PRIME = 0x01000193;
+
+
+PyObject* hashcode(PyObject* self, PyObject *args);
+PyObject* schema_id(PyObject* self, PyObject *args);
+
+PyObject* str_hashcode(PyObject* data);
+int32_t str_hashcode_(PyObject* data, int lower);
+PyObject* b_hashcode(PyObject* data);
+
+static PyMethodDef methods[] = {
+    {"hashcode", (PyCFunction) hashcode, METH_VARARGS, ""},
+    {"schema_id", (PyCFunction) schema_id, METH_VARARGS, ""},
+    {NULL, NULL, 0, NULL}       /* Sentinel */
+};
+
+static struct PyModuleDef moduledef = {
+    PyModuleDef_HEAD_INIT,
+    "_cutils",
+    0,                    /* m_doc */
+    -1,                   /* m_size */
+    methods,              /* m_methods */
+    NULL,                 /* m_slots */
+    NULL,                 /* m_traverse */
+    NULL,                 /* m_clear */
+    NULL,                 /* m_free */
+};
+
+static char* hashcode_input_err = "supported only strings, bytearrays, bytes and memoryview";
+static char* schema_id_input_err = "input argument must be dict or int";
+static char* schema_field_type_err = "schema keys must be strings";
+
+PyMODINIT_FUNC PyInit__cutils(void) {
+	return PyModule_Create(&moduledef);
+}
+
+PyObject* hashcode(PyObject* self, PyObject *args) {
+    PyObject* data;
+
+    if (!PyArg_ParseTuple(args, "O", &data)) {
+        return NULL;
+    }
+
+    if (data == Py_None) {
+        return PyLong_FromLong(0);
+    }
+    else if (PyUnicode_CheckExact(data)) {
+        return str_hashcode(data);
+    }
+    else {
+        return b_hashcode(data);
+    }
+}
+
+PyObject* str_hashcode(PyObject* data) {
+    return PyLong_FromLong(str_hashcode_(data, 0));
+}
+
+int32_t str_hashcode_(PyObject *data, int lower) {
+    int32_t res = 0;
+    Py_ssize_t sz = PyUnicode_GET_LENGTH(data);
+
+    Py_ssize_t i;
+    for (i = 0; i < sz; i++) {
+        Py_UCS4 ch = PyUnicode_READ_CHAR(data, i);
+
+        if (lower) {
+            ch = Py_UNICODE_TOLOWER(ch);
+        }

Review comment:
       I'm not sure how fast `PyUnicode_READ_CHAR` and `Py_UNICODE_TOLOWER` are, but I think it may be faster to get the full string in one call using `PyUnicode_DATA` and work with C representation after that. Or at least we may want to use `PyUnicode_READ` instead of `PyUnicode_READ_CHAR`, as documentation states it is more efficient.

##########
File path: cext/cutils.c
##########
@@ -0,0 +1,186 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <Python.h>
+
+#ifdef _MSC_VER
+
+typedef __int32 int32_t;
+typedef unsigned __int32 uint32_t;
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+
+#else
+#include <stdint.h>
+#endif
+
+static int32_t FNV1_OFFSET_BASIS = 0x811c9dc5;
+static int32_t FNV1_PRIME = 0x01000193;
+
+
+PyObject* hashcode(PyObject* self, PyObject *args);
+PyObject* schema_id(PyObject* self, PyObject *args);
+
+PyObject* str_hashcode(PyObject* data);
+int32_t str_hashcode_(PyObject* data, int lower);
+PyObject* b_hashcode(PyObject* data);
+
+static PyMethodDef methods[] = {
+    {"hashcode", (PyCFunction) hashcode, METH_VARARGS, ""},
+    {"schema_id", (PyCFunction) schema_id, METH_VARARGS, ""},
+    {NULL, NULL, 0, NULL}       /* Sentinel */
+};
+
+static struct PyModuleDef moduledef = {
+    PyModuleDef_HEAD_INIT,
+    "_cutils",
+    0,                    /* m_doc */
+    -1,                   /* m_size */
+    methods,              /* m_methods */
+    NULL,                 /* m_slots */
+    NULL,                 /* m_traverse */
+    NULL,                 /* m_clear */
+    NULL,                 /* m_free */
+};
+
+static char* hashcode_input_err = "supported only strings, bytearrays, bytes and memoryview";
+static char* schema_id_input_err = "input argument must be dict or int";
+static char* schema_field_type_err = "schema keys must be strings";
+
+PyMODINIT_FUNC PyInit__cutils(void) {
+	return PyModule_Create(&moduledef);
+}
+
+PyObject* hashcode(PyObject* self, PyObject *args) {
+    PyObject* data;
+
+    if (!PyArg_ParseTuple(args, "O", &data)) {
+        return NULL;

Review comment:
       What happens when we return NULL from c-module? Does it convert to None?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org