You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by wi...@apache.org on 2023/10/26 13:30:09 UTC

[couchdb] 01/01: Fix lucene support

This is an automated email from the ASF dual-hosted git repository.

willholley pushed a commit to branch mango-beginswith
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit cdfb26d8d891f995ca739f448b60f8ef5ab760cb
Author: Will Holley <wi...@uk.ibm.com>
AuthorDate: Thu Oct 26 12:42:02 2023 +0000

    Fix lucene support
---
 src/docs/src/api/database/find.rst    | 136 ++++++++++++++++++----------------
 src/mango/src/mango_selector_text.erl |   3 +-
 src/mango/test/03-operator-test.py    |  41 +++++++---
 3 files changed, 103 insertions(+), 77 deletions(-)

diff --git a/src/docs/src/api/database/find.rst b/src/docs/src/api/database/find.rst
index d25350708..153aa9a09 100644
--- a/src/docs/src/api/database/find.rst
+++ b/src/docs/src/api/database/find.rst
@@ -673,68 +673,74 @@ In addition, some 'meta' condition operators are available. Some condition
 operators accept any valid JSON content as the argument.  Other condition
 operators require the argument to be in a specific JSON format.
 
-+---------------+-------------+------------+-----------------------------------+
-| Operator type | Operator    | Argument   | Purpose                           |
-+===============+=============+============+===================================+
-| (In)equality  | ``$lt``     | Any JSON   | The field is less than the        |
-|               |             |            | argument.                         |
-+---------------+-------------+------------+-----------------------------------+
-|               | ``$lte``    | Any JSON   | The field is less than or equal to|
-|               |             |            | the argument.                     |
-+---------------+-------------+------------+-----------------------------------+
-|               | ``$eq``     | Any JSON   | The field is equal to the argument|
-+---------------+-------------+------------+-----------------------------------+
-|               | ``$ne``     | Any JSON   | The field is not equal to the     |
-|               |             |            | argument.                         |
-+---------------+-------------+------------+-----------------------------------+
-|               | ``$gte``    | Any JSON   | The field is greater than or equal|
-|               |             |            | to the argument.                  |
-+---------------+-------------+------------+-----------------------------------+
-|               | ``$gt``     | Any JSON   | The field is greater than the     |
-|               |             |            | to the argument.                  |
-+---------------+-------------+------------+-----------------------------------+
-| Object        | ``$exists`` | Boolean    | Check whether the field exists or |
-|               |             |            | not, regardless of its value.     |
-+---------------+-------------+------------+-----------------------------------+
-|               | ``$type``   | String     | Check the document field's type.  |
-|               |             |            | Valid values are ``"null"``,      |
-|               |             |            | ``"boolean"``, ``"number"``,      |
-|               |             |            | ``"string"``, ``"array"``, and    |
-|               |             |            | ``"object"``.                     |
-+---------------+-------------+------------+-----------------------------------+
-| Array         | ``$in``     | Array of   | The document field must exist in  |
-|               |             | JSON values| the list provided.                |
-+---------------+-------------+------------+-----------------------------------+
-|               | ``$nin``    | Array of   | The document field not must exist |
-|               |             | JSON values| in the list provided.             |
-+---------------+-------------+------------+-----------------------------------+
-|               | ``$size``   | Integer    | Special condition to match the    |
-|               |             |            | length of an array field in a     |
-|               |             |            | document. Non-array fields cannot |
-|               |             |            | match this condition.             |
-+---------------+-------------+------------+-----------------------------------+
-| Miscellaneous | ``$mod``    | [Divisor,  | Divisor is a non-zero integer,    |
-|               |             | Remainder] | Remainder is any integer.         |
-|               |             |            | Non-integer values result in a    |
-|               |             |            | 404. Matches documents where      |
-|               |             |            | ``field % Divisor == Remainder``  |
-|               |             |            | is true, and only when the        |
-|               |             |            | document field is an integer.     |
-+---------------+-------------+------------+-----------------------------------+
-|               | ``$regex``  | String     | A regular expression pattern to   |
-|               |             |            | match against the document field. |
-|               |             |            | Only matches when the field is a  |
-|               |             |            | string value and matches the      |
-|               |             |            | supplied regular expression. The  |
-|               |             |            | matching algorithms are based on  |
-|               |             |            | the Perl Compatible Regular       |
-|               |             |            | Expression (PCRE) library. For    |
-|               |             |            | more information about what is    |
-|               |             |            | implemented, see the see the      |
-|               |             |            | `Erlang Regular Expression        |
-|               |             |            | <http://erlang.org/doc            |
-|               |             |            | /man/re.html>`_.                  |
-+---------------+-------------+------------+-----------------------------------+
++---------------+-----------------+-------------+------------------------------------+
+| Operator type |    Operator     |  Argument   |              Purpose               |
++===============+=================+=============+====================================+
+| (In)equality  | ``$lt``         | Any JSON    | The field is less than the         |
+|               |                 |             | argument.                          |
++---------------+-----------------+-------------+------------------------------------+
+|               | ``$lte``        | Any JSON    | The field is less than or equal to |
+|               |                 |             | the argument.                      |
++---------------+-----------------+-------------+------------------------------------+
+|               | ``$eq``         | Any JSON    | The field is equal to the argument |
++---------------+-----------------+-------------+------------------------------------+
+|               | ``$ne``         | Any JSON    | The field is not equal to the      |
+|               |                 |             | argument.                          |
++---------------+-----------------+-------------+------------------------------------+
+|               | ``$gte``        | Any JSON    | The field is greater than or equal |
+|               |                 |             | to the argument.                   |
++---------------+-----------------+-------------+------------------------------------+
+|               | ``$gt``         | Any JSON    | The field is greater than the      |
+|               |                 |             | argument.                          |
++---------------+-----------------+-------------+------------------------------------+
+| Object        | ``$exists``     | Boolean     | Check whether the field exists or  |
+|               |                 |             | not, regardless of its value.      |
++---------------+-----------------+-------------+------------------------------------+
+|               | ``$type``       | String      | Check the document field's type.   |
+|               |                 |             | Valid values are ``"null"``,       |
+|               |                 |             | ``"boolean"``, ``"number"``,       |
+|               |                 |             | ``"string"``, ``"array"``, and     |
+|               |                 |             | ``"object"``.                      |
++---------------+-----------------+-------------+------------------------------------+
+| Array         | ``$in``         | Array of    | The document field must exist in   |
+|               |                 | JSON values | the list provided.                 |
++---------------+-----------------+-------------+------------------------------------+
+|               | ``$nin``        | Array of    | The document field must not exist  |
+|               |                 | JSON values | in the list provided.              |
++---------------+-----------------+-------------+------------------------------------+
+|               | ``$size``       | Integer     | Special condition to match the     |
+|               |                 |             | length of an array field in a      |
+|               |                 |             | document. Non-array fields cannot  |
+|               |                 |             | match this condition.              |
++---------------+-----------------+-------------+------------------------------------+
+| Miscellaneous | ``$mod``        | [Divisor,   | Divisor is a non-zero integer,     |
+|               |                 | Remainder]  | Remainder is any integer.          |
+|               |                 |             | Non-integer values result in a     |
+|               |                 |             | 404. Matches documents where       |
+|               |                 |             | ``field % Divisor == Remainder``   |
+|               |                 |             | is true, and only when the         |
+|               |                 |             | document field is an integer.      |
++---------------+-----------------+-------------+------------------------------------+
+|               | ``$regex``      | String      | A regular expression pattern to    |
+|               |                 |             | match against the document field.  |
+|               |                 |             | Only matches when the field is a   |
+|               |                 |             | string value and matches the       |
+|               |                 |             | supplied regular expression. The   |
+|               |                 |             | matching algorithms are based on   |
+|               |                 |             | the Perl Compatible Regular        |
+|               |                 |             | Expression (PCRE) library. For     |
+|               |                 |             | more information about what is     |
+|               |                 |             | implemented, see the               |
+|               |                 |             | `Erlang Regular Expression         |
+|               |                 |             | <http://erlang.org/doc             |
+|               |                 |             | /man/re.html>`_.                   |
++---------------+-----------------+-------------+------------------------------------+
+|               | ``$beginsWith`` | String      | Matches where the document field   |
+|               |                 |             | begins with the specified prefix   |
+|               |                 |             | (case-sensitive). If the document  |
+|               |                 |             | field contains a non-string value, |
+|               |                 |             | the document is not matched.       |
++---------------+-----------------+-------------+------------------------------------+
 
 .. warning::
     Regular expressions do not work with indexes, so they should not be used to
@@ -754,8 +760,10 @@ can itself be another operator with arguments of its own. This enables us to
 build up more complex selector expressions.
 
 However, only equality operators such as ``$eq``, ``$gt``, ``$gte``, ``$lt``,
-and ``$lte`` (but not ``$ne``) can be used as the basis of a query. You should
-include at least one of these in a selector.
+``$lte`` and ``$beginsWith`` (but not ``$ne``) can be used as the basis
+of a query that can make efficient use of a ``json`` index. You should
+include at least one of these in a selector, or consider using
+a ``text`` index if more flexibility is required.
 
 For example, if you try to perform a query that attempts to match all documents
 that have a field called `afieldname` containing a value that begins with the
diff --git a/src/mango/src/mango_selector_text.erl b/src/mango/src/mango_selector_text.erl
index 4a50ff9ba..7d8f73923 100644
--- a/src/mango/src/mango_selector_text.erl
+++ b/src/mango/src/mango_selector_text.erl
@@ -143,8 +143,9 @@ convert(Path, {[{<<"$exists">>, ShouldExist}]}) ->
         false -> {op_not, {FieldExists, false}}
     end;
 convert(Path, {[{<<"$beginsWith">>, Arg}]}) when is_binary(Arg) ->
+    Prefix = mango_util:lucene_escape_query_value(Arg),
     Suffix = <<"*">>,
-    PrefixSearch = value_str(<<Arg/binary, Suffix/binary>>),
+    PrefixSearch = <<Prefix/binary, Suffix/binary>>,
     {op_field, {make_field(Path, Arg), PrefixSearch}};
 % We're not checking the actual type here, just looking for
 % anything that has a possibility of matching by checking
diff --git a/src/mango/test/03-operator-test.py b/src/mango/test/03-operator-test.py
index b43aacf5f..3b1a46565 100644
--- a/src/mango/test/03-operator-test.py
+++ b/src/mango/test/03-operator-test.py
@@ -10,12 +10,13 @@
 # License for the specific language governing permissions and limitations under
 # the License.
 
+from requests.exceptions import HTTPError
 import mango
 import unittest
 
 
 class BaseOperatorTests:
-    class Common(object):
+    class Common(unittest.TestCase):
         def assertUserIds(self, user_ids, docs):
             user_ids_returned = list(d["user_id"] for d in docs)
             user_ids.sort()
@@ -142,20 +143,36 @@ class BaseOperatorTests:
                 self.assertNotIn("twitter", d)
 
         def test_beginswith(self):
-            docs = self.db.find({"location.state": {"$beginsWith": "New"}})
-            self.assertEqual(len(docs), 2)
-            self.assertUserIds([2, 10], docs)
+            cases = [
+                {"prefix": "New", "user_ids": [2, 10]},
+                {
+                    # test escaped characters - note the space in the test string
+                    "prefix": "New ",
+                    "user_ids": [2, 10],
+                },
+                {
+                    # non-string values in documents should not match the prefix,
+                    # but should not error
+                    "prefix": "Foo",
+                    "user_ids": [],
+                },
+                {"prefix": " New", "user_ids": []},
+            ]
 
-        # non-string prefixes should return an error
-        def test_beginswith_invalid_prefix(self):
-            docs = self.db.find({"location.state": {"$beginsWith": 123}})
-            self.assertEqual(len(docs), 2)
+            for case in cases:
+                with self.subTest(prefix=case["prefix"]):
+                    selector = {"location.state": {"$beginsWith": case["prefix"]}}
+                    docs = self.db.find(selector)
+                    self.assertEqual(len(docs), len(case["user_ids"]))
+                    self.assertUserIds(case["user_ids"], docs)
 
-        # non-string values in documents should not match the prefix,
-        # but should not error
+        # non-string prefixes should return an error
         def test_beginswith_invalid_prefix(self):
-            docs = self.db.find({"user_id": {"$beginsWith": "Foo"}})
-            self.assertEqual(len(docs), 0)
+            cases = [123, True, [], {}]
+            for prefix in cases:
+                with self.subTest(prefix=prefix):
+                    with self.assertRaises(HTTPError):
+                        self.db.find({"location.state": {"$beginsWith": prefix}})
 
 
 class OperatorJSONTests(mango.UserDocsTests, BaseOperatorTests.Common):