You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@subversion.apache.org by br...@apache.org on 2012/11/12 16:36:48 UTC

svn commit: r1408325 - /subversion/branches/wc-collate-path/subversion/libsvn_subr/sqlite.c

Author: brane
Date: Mon Nov 12 15:36:47 2012
New Revision: 1408325

URL: http://svn.apache.org/viewvc?rev=1408325&view=rev
Log:
On the wc-collate-path branch: Enable GLOB and LIKE operator replacements.

* subversion/libsvn_subr/sqlite.c
  (SQLITE_MAX_LIKE_PATTERN_LENGTH): Limit the pattern lengths.
  (svn_sqlite__db_t): Add third buffer, sqlext_buf3, for glob/like;
  (glob_like_ucs_nfd_common): Common wrapper for svn_utf__glob.
  (glob_ucs_nfd, like_ucs_nfd): GLOB and LIKE user functions.
  (svn_sqlite__open): Initialize all sqlext_* buffers to 800 bytes.
   Register GLOB and LIKE operator replacements.

Modified:
    subversion/branches/wc-collate-path/subversion/libsvn_subr/sqlite.c

Modified: subversion/branches/wc-collate-path/subversion/libsvn_subr/sqlite.c
URL: http://svn.apache.org/viewvc/subversion/branches/wc-collate-path/subversion/libsvn_subr/sqlite.c?rev=1408325&r1=1408324&r2=1408325&view=diff
==============================================================================
--- subversion/branches/wc-collate-path/subversion/libsvn_subr/sqlite.c (original)
+++ subversion/branches/wc-collate-path/subversion/libsvn_subr/sqlite.c Mon Nov 12 15:36:47 2012
@@ -57,6 +57,11 @@ extern const sqlite3_api_routines *const
 #error SQLite is too old -- version 3.7.12 is the minimum required version
 #endif
 
+/* Limit the length of a GLOB or LIKE pattern. */
+#ifndef SQLITE_MAX_LIKE_PATTERN_LENGTH
+# define SQLITE_MAX_LIKE_PATTERN_LENGTH 50000
+#endif
+
 const char *
 svn_sqlite__compiled_version(void)
 {
@@ -105,6 +110,7 @@ struct svn_sqlite__db_t
   /* Buffers for SQLite extensoins. */
   svn_stringbuf_t *sqlext_buf1;
   svn_stringbuf_t *sqlext_buf2;
+  svn_stringbuf_t *sqlext_buf3;
 };
 
 struct svn_sqlite__stmt_t
@@ -881,6 +887,74 @@ collate_ucs_nfd(void *baton,
   return result;
 }
 
+static void
+glob_like_ucs_nfd_common(sqlite3_context *context,
+                         int argc, sqlite3_value **argv,
+                         svn_boolean_t sql_like)
+{
+  svn_sqlite__db_t *const db = sqlite3_user_data(context);
+
+  const char *const pattern = (void*)sqlite3_value_text(argv[0]);
+  const apr_size_t pattern_len = sqlite3_value_bytes(argv[0]);
+  const char *const string = (void*)sqlite3_value_text(argv[1]);
+  const apr_size_t string_len = sqlite3_value_bytes(argv[1]);
+
+  const char *escape = NULL;
+  apr_size_t escape_len = 0;
+
+  svn_boolean_t match;
+  svn_error_t *err;
+
+  if (pattern_len > SQLITE_MAX_LIKE_PATTERN_LENGTH)
+    {
+      sqlite3_result_error(context, "LIKE or GLOB pattern too complex", -1);
+      return;
+    }
+
+  if (argc == 3 && sql_like)
+    {
+      escape = (void*)sqlite3_value_text(argv[2]);
+      escape_len = sqlite3_value_bytes(argv[2]);
+    }
+
+  if (pattern && string)
+    {
+      err = svn_utf__glob(pattern, pattern_len, string, string_len,
+                          escape, escape_len, sql_like,
+                          db->sqlext_buf1, db->sqlext_buf2, db->sqlext_buf3,
+                          &match);
+
+      if (err)
+        {
+          const char *errmsg;
+          svn_stringbuf_ensure(db->sqlext_buf1, 511);
+          errmsg = svn_err_best_message(err,
+                                        db->sqlext_buf1->data,
+                                        db->sqlext_buf1->blocksize);
+          svn_error_clear(err);
+          sqlite3_result_error(context, errmsg, -1);
+          return;
+        }
+
+      sqlite3_result_int(context, match);
+    }
+}
+
+/* Unicode normalizing implementation of GLOB */
+static void
+glob_ucs_nfd(sqlite3_context *context,
+             int argc, sqlite3_value **argv)
+{
+  glob_like_ucs_nfd_common(context, argc, argv, FALSE);
+}
+
+/* Unicode normalizing implementation of LIKE */
+static void
+like_ucs_nfd(sqlite3_context *context,
+             int argc, sqlite3_value **argv)
+{
+  glob_like_ucs_nfd_common(context, argc, argv, TRUE);
+}
 
 svn_error_t *
 svn_sqlite__open(svn_sqlite__db_t **db, const char *path,
@@ -895,12 +969,25 @@ svn_sqlite__open(svn_sqlite__db_t **db, 
 
   SVN_ERR(internal_open(&(*db)->db3, path, mode, scratch_pool));
 
-  (*db)->sqlext_buf1 = svn_stringbuf_create_ensure(4096, result_pool);
-  (*db)->sqlext_buf2 = svn_stringbuf_create_ensure(4096, result_pool);
+  /* Create extension buffers with space for 200 UCS-4 characters. */
+  (*db)->sqlext_buf1 = svn_stringbuf_create_ensure(799, result_pool);
+  (*db)->sqlext_buf2 = svn_stringbuf_create_ensure(799, result_pool);
+  (*db)->sqlext_buf3 = svn_stringbuf_create_ensure(799, result_pool);
+
+  /* Register collation and LIKE and GLOB operator replacements. */
   SQLITE_ERR(sqlite3_create_collation((*db)->db3,
                                       "svn-ucs-nfd", SQLITE_UTF8,
                                       *db, collate_ucs_nfd),
              *db);
+  SQLITE_ERR(sqlite3_create_function((*db)->db3, "glob", 2, SQLITE_UTF8,
+                                     *db, glob_ucs_nfd, NULL, NULL),
+             *db);
+  SQLITE_ERR(sqlite3_create_function((*db)->db3, "like", 2, SQLITE_UTF8,
+                                     *db, like_ucs_nfd, NULL, NULL),
+             *db);
+  SQLITE_ERR(sqlite3_create_function((*db)->db3, "like", 3, SQLITE_UTF8,
+                                     *db, like_ucs_nfd, NULL, NULL),
+             *db);
 
 #ifdef SQLITE3_DEBUG
   sqlite3_trace((*db)->db3, sqlite_tracer, (*db)->db3);



Re: svn commit: r1408325 - /subversion/branches/wc-collate-path/subversion/libsvn_subr/sqlite.c

Posted by Branko Čibej <br...@wandisco.com>.
On 23.01.2013 02:32, Ben Reser wrote:
> On Sun, Jan 20, 2013 at 12:15 PM, Thomas Åkesson <thomas@akesson.cc
> <ma...@akesson.cc>> wrote:
>
>     I have spent quite a bit of time writing the wiki pages,
>     experimenting, and discussing with the people who have shown
>     interest (Branko, Julian, Ben and a couple of others in person at
>     Subversion Live, London). I am very saddened to see this negative
>     attitude towards resolving this long standing issue. No doubt,
>     most Mac OS X users with non-ASCII languages are too. 
>
>
> We actually ended up spending a fair amount of time discussing the
> solution to this issue after Subversion Live over dinner one night.
>  Branko had some ideas he wanted to try and that's what his branch is.
>  Don't take his statement that he views the branch at this point as
> something that will never be merged back to trunk as a negative
> attitude towards resolving the issue.

Indeed, I probably have more empathy towards accented Latin characters
than most devs hereabouts. :)

I simply don't have time right now to keep working on a client-side
solution. It's quite close to the top of my list for 1.9, however.

-- Brane

-- 
Branko Čibej
Director of Subversion | WANdisco | www.wandisco.com


Re: svn commit: r1408325 - /subversion/branches/wc-collate-path/subversion/libsvn_subr/sqlite.c

Posted by Thomas Åkesson <th...@akesson.cc>.
On 23 jan 2013, at 02:32, Ben Reser <be...@reser.org> wrote:

> On Sun, Jan 20, 2013 at 12:15 PM, Thomas Åkesson <th...@akesson.cc> wrote:
>> I have spent quite a bit of time writing the wiki pages, experimenting, and discussing with the people who have shown interest (Branko, Julian, Ben and a couple of others in person at Subversion Live, London). I am very saddened to see this negative attitude towards resolving this long standing issue. No doubt, most Mac OS X users with non-ASCII languages are too. 
> 
> We actually ended up spending a fair amount of time discussing the solution to this issue after Subversion Live over dinner one night.  Branko had some ideas he wanted to try and that's what his branch is.  Don't take his statement that he views the branch at this point as something that will never be merged back to trunk as a negative attitude towards resolving the issue.

I think Branko's work on the issue is very promising. It was the attitude of another participant in the thread that I had issues with.

I am continuing work in the wiki, doing some restructuring to better describe the collation approach. I will post when it is ready for review. 

Thanks,
Thomas Å. 

Re: svn commit: r1408325 - /subversion/branches/wc-collate-path/subversion/libsvn_subr/sqlite.c

Posted by Ben Reser <be...@reser.org>.
On Sun, Jan 20, 2013 at 12:15 PM, Thomas Åkesson <th...@akesson.cc> wrote:

> I have spent quite a bit of time writing the wiki pages, experimenting,
> and discussing with the people who have shown interest (Branko, Julian, Ben
> and a couple of others in person at Subversion Live, London). I
> am very saddened to see this negative attitude towards resolving this long
> standing issue. No doubt, most Mac OS X users with non-ASCII languages are
> too.
>

We actually ended up spending a fair amount of time discussing the solution
to this issue after Subversion Live over dinner one night.  Branko had some
ideas he wanted to try and that's what his branch is.  Don't take his
statement that he views the branch at this point as something that will
never be merged back to trunk as a negative attitude towards resolving the
issue.

Re: svn commit: r1408325 - /subversion/branches/wc-collate-path/subversion/libsvn_subr/sqlite.c

Posted by Thomas Åkesson <th...@akesson.cc>.
First of all, I am really sorry that I did not notice this thread while it was ongoing. Due to time constraints, my contributions to Subversion happen only now and then.

I have spent quite a bit of time writing the wiki pages, experimenting, and discussing with the people who have shown interest (Branko, Julian, Ben and a couple of others in person at Subversion Live, London). I am very saddened to see this negative attitude towards resolving this long standing issue. No doubt, most Mac OS X users with non-ASCII languages are too. 

Branko, if you can summarize your findings in the collation experiments and what parts actually made it back to trunk, I would find that very interesting. I would like to do further experiments if possible. 

Responding to some of Bert's concerns below. 

On 12 nov 2012, at 18:34, "Bert Huijben" <be...@qqmail.nl> wrote:

>> -----Original Message-----
>> From: Branko Čibej [mailto:brane@wandisco.com]
>> Sent: maandag 12 november 2012 17:49
>> To: dev@subversion.apache.org
>> Subject: Re: svn commit: r1408325 - /subversion/branches/wc-collate-
>> path/subversion/libsvn_subr/sqlite.c
>> 
>> It's all described and discussed here:
>> 
>> http://wiki.apache.org/subversion/UnicodeComposition
>> 
>> This branch is only exploring the client-side effects. The server needs
>> to adjust to make the whole thing bullet-proof.
> 
> I don't see a discussion of and/or answers to many questions in http://svn.apache.org/repos/asf/subversion/trunk/notes/unicode-composition-for-filenames in there.

Please add a reference to the wiki page in this note. The wiki to a large extent supersedes the note, and refers back to it.

> 
> The most important: How are you going to handle the current hashtable approach in performance critical things like 'svn status'?
> 
> [I don't think a WIKI is the right place to discuss such topics, but that is a different topic]

I think the wiki is a great place for collaborative design. I wrote stuff in the wiki, and then posted to the list for feedback. Some people did respond with feedback... 

Most of the information is in the page linked from the wiki page mentioned above:
http://wiki.apache.org/subversion/NonNormalizingUnicodeCompositionAwareness

It discusses pros and cons of repository normalization. The design predates the collation idea. 

> 
> This involves a solution for how you are going to handle duplicate names. Many existing users only find these problems after committing a problematic file. In many cases they will remove that file and maybe add the same name with a different encoding. A mixed revision working copy (or an svn up from one to the other) can then have both files.

The wiki page actually does discuss this. It cannot be fully resolved for Mac OS X users without Svn 1.x compatibility issues, but we can move Subversion from "completely unusable" to "usable from a certain revision and forward". That would be a great step forward.

Please do provide feedback on which cases are not covered in the wiki. 

> 
> A normalization library and the right collate indexes won't resolve those problems.
> I don't think we can just apply a UNIQUE constraint or something without breaking compatibility?

The wiki article proposes that we introduce "normalization-uniqueness". I think very contrived use cases are needed to oppose that. 

> I would have hoped to see an explanation on what you are trying to resolve in a BRANCH-README or in the Wiki. 
> And given the information in 'unicode-composition-for-filenames' I don't see a libsvn_wc only solution to these issues.

No, as noted in wiki. 

Again, sorry for not noticing the thread earlier,
/Thomas Å.

Re: svn commit: r1408325 - /subversion/branches/wc-collate-path/subversion/libsvn_subr/sqlite.c

Posted by Greg Stein <gs...@gmail.com>.
On Nov 13, 2012 4:00 AM, "Branko Čibej" <br...@wandisco.com> wrote:
>...
> extract a detailed design. In the meantime, if it makes you feel better,
> you can treat that branch as my private playground that's never intended
> to be merged back to trunk as-is.

Could you add a branch readme, stating such? I think that would have
prevented the whole thread :-)

Thx,
-g

Re: svn commit: r1408325 - /subversion/branches/wc-collate-path/subversion/libsvn_subr/sqlite.c

Posted by Branko Čibej <br...@wandisco.com>.
On 12.11.2012 18:34, Bert Huijben wrote:
>> -----Original Message-----
>> From: Branko Čibej [mailto:brane@wandisco.com]
>> Sent: maandag 12 november 2012 17:49
>> To: dev@subversion.apache.org
>> Subject: Re: svn commit: r1408325 - /subversion/branches/wc-collate-
>> path/subversion/libsvn_subr/sqlite.c
>>
>> It's all described and discussed here:
>>
>> http://wiki.apache.org/subversion/UnicodeComposition
>>
>> This branch is only exploring the client-side effects. The server needs
>> to adjust to make the whole thing bullet-proof.
> I don't see a discussion of and/or answers to many questions in http://svn.apache.org/repos/asf/subversion/trunk/notes/unicode-composition-for-filenames in there. 
>
> The most important: How are you going to handle the current hashtable approach in performance critical things like 'svn status'?
>
> [I don't think a WIKI is the right place to discuss such topics, but that is a different topic]
>
>
> This involves a solution for how you are going to handle duplicate names. Many existing users only find these problems after committing a problematic file. In many cases they will remove that file and maybe add the same name with a different encoding. A mixed revision working copy (or an svn up from one to the other) can then have both files.
>
> A normalization library and the right collate indexes won't resolve those problems.
> I don't think we can just apply a UNIQUE constraint or something without breaking compatibility?
>
>
> I would have hoped to see an explanation on what you are trying to resolve in a BRANCH-README or in the Wiki. 
> And given the information in 'unicode-composition-for-filenames' I don't see a libsvn_wc only solution to these issues.
>
>
> [Patching libsvn_subr/sqlite.c -used by the repository and client- to only support a single new collate on opening, might by itself break upgrade scenarios from 1.7.]
>
>
>
> I really think some design is necessary if this branch tries to be more than some experiment that we don't intend to merge back to trunk.
> And whether or not it is such an experiment, a branch readme should document that.
>
> See http://subversion.apache.org/docs/community-guide/general.html#branch-readme-files.
>
>
>
> Re: your other mail with the same subject.
>
> I don't think we as a project like patch bombs. 
>
>
> I started asking about the design, to avoid the pain of a late review of huge changes all over the place with unknown impact. 
>
>
> I don't want to turn a patch for such an important issue into something which we won't be able to apply later because nobody is able to review it.

Bert, I hear you. I'm well aware of the issues. I'll be able to say more
about the design when I have a better idea of what's needed. Personally
I like to explore these kinds of dependencies in code, and only later
extract a detailed design. In the meantime, if it makes you feel better,
you can treat that branch as my private playground that's never intended
to be merged back to trunk as-is.


-- Brane

RE: svn commit: r1408325 - /subversion/branches/wc-collate-path/subversion/libsvn_subr/sqlite.c

Posted by Bert Huijben <be...@qqmail.nl>.
> -----Original Message-----
> From: Branko Čibej [mailto:brane@wandisco.com]
> Sent: maandag 12 november 2012 17:49
> To: dev@subversion.apache.org
> Subject: Re: svn commit: r1408325 - /subversion/branches/wc-collate-
> path/subversion/libsvn_subr/sqlite.c
> 
> It's all described and discussed here:
> 
> http://wiki.apache.org/subversion/UnicodeComposition
> 
> This branch is only exploring the client-side effects. The server needs
> to adjust to make the whole thing bullet-proof.

I don't see a discussion of and/or answers to many questions in http://svn.apache.org/repos/asf/subversion/trunk/notes/unicode-composition-for-filenames in there. 

The most important: How are you going to handle the current hashtable approach in performance critical things like 'svn status'?

[I don't think a WIKI is the right place to discuss such topics, but that is a different topic]


This involves a solution for how you are going to handle duplicate names. Many existing users only find these problems after committing a problematic file. In many cases they will remove that file and maybe add the same name with a different encoding. A mixed revision working copy (or an svn up from one to the other) can then have both files.

A normalization library and the right collate indexes won't resolve those problems.
I don't think we can just apply a UNIQUE constraint or something without breaking compatibility?


I would have hoped to see an explanation on what you are trying to resolve in a BRANCH-README or in the Wiki. 
And given the information in 'unicode-composition-for-filenames' I don't see a libsvn_wc only solution to these issues.


[Patching libsvn_subr/sqlite.c -used by the repository and client- to only support a single new collate on opening, might by itself break upgrade scenarios from 1.7.]



I really think some design is necessary if this branch tries to be more than some experiment that we don't intend to merge back to trunk.
And whether or not it is such an experiment, a branch readme should document that.

See http://subversion.apache.org/docs/community-guide/general.html#branch-readme-files.



Re: your other mail with the same subject.

I don't think we as a project like patch bombs. 


I started asking about the design, to avoid the pain of a late review of huge changes all over the place with unknown impact. 


I don't want to turn a patch for such an important issue into something which we won't be able to apply later because nobody is able to review it.

	Bert


Re: svn commit: r1408325 - /subversion/branches/wc-collate-path/subversion/libsvn_subr/sqlite.c

Posted by Branko Čibej <br...@wandisco.com>.
It's all described and discussed here:

http://wiki.apache.org/subversion/UnicodeComposition

This branch is only exploring the client-side effects. The server needs
to adjust to make the whole thing bullet-proof.

-- Brane

On 12.11.2012 17:11, Bert Huijben wrote:
>
>> -----Original Message-----
>> From: brane@apache.org [mailto:brane@apache.org]
>> Sent: maandag 12 november 2012 16:37
>> To: commits@subversion.apache.org
>> Subject: svn commit: r1408325 - /subversion/branches/wc-collate-
>> path/subversion/libsvn_subr/sqlite.c
>>
>> Author: brane
>> Date: Mon Nov 12 15:36:47 2012
>> New Revision: 1408325
>>
>> URL: http://svn.apache.org/viewvc?rev=1408325&view=rev
>> Log:
>> On the wc-collate-path branch: Enable GLOB and LIKE operator
>> replacements.
> Completely unrelated to this patch, but I'm still wondering what your total approach/plan on this branch will be.
>
> I can see that we handle this collate in sqlite (even though this breaks using a plain sqlite3 as tool on wc.db, etc.), but the notes/unicode-composition-for-filenames describes several other problems that need a fix at the same time in order not to break at least some current subversion users.
>
> One of these things is that we use hashtables to represent all nodes in a directory in several places. In some cases we get this from the working copy, in some cases from the db and in even other cases from the repository. Some of these may be normalized in some way, while others are not (especially with our compatibility guarantees within 1.X)
>
> I'm afraid that just getting wc.db compatible with normalization will just shift the problem one layer, while still not fixing the real problem. Erik Huelsmann thoroughly investigated this problem space some years ago and he documented that fixing the wc library is not enough for fixing the generic case. And if we are not fixing the generic case, I'm wondering if we should really work on a major slowdown of every common operation.
>
> We currently have a binary format, that can be used as a hash key, so many comparison and lookup operations are constant time.
> I'm not sure how they are after installing the collate handling.
>
>
> If we leave the generic case, there are easier ways to resolve this issue. One such thing would be to make apr (or a wrapper in Subversion) normalize the on disk paths in the other direction and deny (on the server) the non-normalized paths. This would eliminate the slowdown on most use cases that don't have a problem right now, and keep the code clean for future problems.
>
> If we have to check for collate handling everywhere in libsvn_wc and libsvn_client we make it much harder for outside developers to create patches and even fewer core subversion developers would dare touch these layers.
>
>
>
> I'm glad somebody is finally looking into these issues, but I think we should look at the full picture before we can talk about getting this back on trunk.
>
> 	Bert
>
>


-- 
Branko Čibej
Director of Subversion | WANdisco | www.wandisco.com


RE: svn commit: r1408325 - /subversion/branches/wc-collate-path/subversion/libsvn_subr/sqlite.c

Posted by Bert Huijben <be...@qqmail.nl>.

> -----Original Message-----
> From: brane@apache.org [mailto:brane@apache.org]
> Sent: maandag 12 november 2012 16:37
> To: commits@subversion.apache.org
> Subject: svn commit: r1408325 - /subversion/branches/wc-collate-
> path/subversion/libsvn_subr/sqlite.c
> 
> Author: brane
> Date: Mon Nov 12 15:36:47 2012
> New Revision: 1408325
> 
> URL: http://svn.apache.org/viewvc?rev=1408325&view=rev
> Log:
> On the wc-collate-path branch: Enable GLOB and LIKE operator
> replacements.

Completely unrelated to this patch, but I'm still wondering what your total approach/plan on this branch will be.

I can see that we handle this collate in sqlite (even though this breaks using a plain sqlite3 as tool on wc.db, etc.), but the notes/unicode-composition-for-filenames describes several other problems that need a fix at the same time in order not to break at least some current subversion users.

One of these things is that we use hashtables to represent all nodes in a directory in several places. In some cases we get this from the working copy, in some cases from the db and in even other cases from the repository. Some of these may be normalized in some way, while others are not (especially with our compatibility guarantees within 1.X)

I'm afraid that just getting wc.db compatible with normalization will just shift the problem one layer, while still not fixing the real problem. Erik Huelsmann thoroughly investigated this problem space some years ago and he documented that fixing the wc library is not enough for fixing the generic case. And if we are not fixing the generic case, I'm wondering if we should really work on a major slowdown of every common operation.

We currently have a binary format, that can be used as a hash key, so many comparison and lookup operations are constant time.
I'm not sure how they are after installing the collate handling.


If we leave the generic case, there are easier ways to resolve this issue. One such thing would be to make apr (or a wrapper in Subversion) normalize the on disk paths in the other direction and deny (on the server) the non-normalized paths. This would eliminate the slowdown on most use cases that don't have a problem right now, and keep the code clean for future problems.

If we have to check for collate handling everywhere in libsvn_wc and libsvn_client we make it much harder for outside developers to create patches and even fewer core subversion developers would dare touch these layers.



I'm glad somebody is finally looking into these issues, but I think we should look at the full picture before we can talk about getting this back on trunk.

	Bert



Re: svn commit: r1408325 - /subversion/branches/wc-collate-path/subversion/libsvn_subr/sqlite.c

Posted by Branko Čibej <br...@wandisco.com>.
On 12.11.2012 17:17, Bert Huijben wrote:
> +  SQLITE_ERR(sqlite3_create_function((*db)->db3, "glob", 2, SQLITE_UTF8,
> +                                     *db, glob_ucs_nfd, NULL, NULL),
> +             *db);
> +  SQLITE_ERR(sqlite3_create_function((*db)->db3, "like", 2, SQLITE_UTF8,
> +                                     *db, like_ucs_nfd, NULL, NULL),
> +             *db);
> +  SQLITE_ERR(sqlite3_create_function((*db)->db3, "like", 3, SQLITE_UTF8,
> +                                     *db, like_ucs_nfd, NULL, NULL),
> +             *db);
> How does this affect indexes?
>
> The usual like and glob code can use indexes in some cases?
>
> Do we still use LIKE and GLOB, or did we already replace all invocations for performance reasons?
> (I know I removed a lot of them to get queries to use indexes properly)

Please don't jump up and down on a half-finished implementation. :)

-- Brane

-- 
Branko Čibej
Director of Subversion | WANdisco | www.wandisco.com


RE: svn commit: r1408325 - /subversion/branches/wc-collate-path/subversion/libsvn_subr/sqlite.c

Posted by Bert Huijben <be...@qqmail.nl>.

> -----Original Message-----
> From: brane@apache.org [mailto:brane@apache.org]
> Sent: maandag 12 november 2012 16:37
> To: commits@subversion.apache.org
> Subject: svn commit: r1408325 - /subversion/branches/wc-collate-
> path/subversion/libsvn_subr/sqlite.c
> 
> Author: brane
> Date: Mon Nov 12 15:36:47 2012
> New Revision: 1408325
> 
> URL: http://svn.apache.org/viewvc?rev=1408325&view=rev
> Log:
> On the wc-collate-path branch: Enable GLOB and LIKE operator
> replacements.
> 
> * subversion/libsvn_subr/sqlite.c
>   (SQLITE_MAX_LIKE_PATTERN_LENGTH): Limit the pattern lengths.
>   (svn_sqlite__db_t): Add third buffer, sqlext_buf3, for glob/like;
>   (glob_like_ucs_nfd_common): Common wrapper for svn_utf__glob.
>   (glob_ucs_nfd, like_ucs_nfd): GLOB and LIKE user functions.
>   (svn_sqlite__open): Initialize all sqlext_* buffers to 800 bytes.
>    Register GLOB and LIKE operator replacements.
> 
> Modified:
>     subversion/branches/wc-collate-path/subversion/libsvn_subr/sqlite.c
> 
> Modified: subversion/branches/wc-collate-
> path/subversion/libsvn_subr/sqlite.c
> URL: http://svn.apache.org/viewvc/subversion/branches/wc-collate-
> path/subversion/libsvn_subr/sqlite.c?rev=1408325&r1=1408324&r2=1408325
> &view=diff
> ==========================================================
> ====================
> --- subversion/branches/wc-collate-path/subversion/libsvn_subr/sqlite.c
> (original)
> +++ subversion/branches/wc-collate-path/subversion/libsvn_subr/sqlite.c
> Mon Nov 12 15:36:47 2012
> @@ -57,6 +57,11 @@ extern const sqlite3_api_routines *const
>  #error SQLite is too old -- version 3.7.12 is the minimum required version
>  #endif
> 
> +/* Limit the length of a GLOB or LIKE pattern. */
> +#ifndef SQLITE_MAX_LIKE_PATTERN_LENGTH
> +# define SQLITE_MAX_LIKE_PATTERN_LENGTH 50000
> +#endif
> +
>  const char *
>  svn_sqlite__compiled_version(void)
>  {
> @@ -105,6 +110,7 @@ struct svn_sqlite__db_t
>    /* Buffers for SQLite extensoins. */
>    svn_stringbuf_t *sqlext_buf1;
>    svn_stringbuf_t *sqlext_buf2;
> +  svn_stringbuf_t *sqlext_buf3;
>  };
> 
>  struct svn_sqlite__stmt_t
> @@ -881,6 +887,74 @@ collate_ucs_nfd(void *baton,
>    return result;
>  }
> 
> +static void
> +glob_like_ucs_nfd_common(sqlite3_context *context,
> +                         int argc, sqlite3_value **argv,
> +                         svn_boolean_t sql_like)
> +{
> +  svn_sqlite__db_t *const db = sqlite3_user_data(context);
> +
> +  const char *const pattern = (void*)sqlite3_value_text(argv[0]);
> +  const apr_size_t pattern_len = sqlite3_value_bytes(argv[0]);
> +  const char *const string = (void*)sqlite3_value_text(argv[1]);
> +  const apr_size_t string_len = sqlite3_value_bytes(argv[1]);
> +
> +  const char *escape = NULL;
> +  apr_size_t escape_len = 0;
> +
> +  svn_boolean_t match;
> +  svn_error_t *err;
> +
> +  if (pattern_len > SQLITE_MAX_LIKE_PATTERN_LENGTH)
> +    {
> +      sqlite3_result_error(context, "LIKE or GLOB pattern too complex", -1);
> +      return;
> +    }
> +
> +  if (argc == 3 && sql_like)
> +    {
> +      escape = (void*)sqlite3_value_text(argv[2]);
> +      escape_len = sqlite3_value_bytes(argv[2]);
> +    }
> +
> +  if (pattern && string)
> +    {
> +      err = svn_utf__glob(pattern, pattern_len, string, string_len,
> +                          escape, escape_len, sql_like,
> +                          db->sqlext_buf1, db->sqlext_buf2, db->sqlext_buf3,
> +                          &match);
> +
> +      if (err)
> +        {
> +          const char *errmsg;
> +          svn_stringbuf_ensure(db->sqlext_buf1, 511);
> +          errmsg = svn_err_best_message(err,
> +                                        db->sqlext_buf1->data,
> +                                        db->sqlext_buf1->blocksize);
> +          svn_error_clear(err);
> +          sqlite3_result_error(context, errmsg, -1);
> +          return;
> +        }
> +
> +      sqlite3_result_int(context, match);
> +    }
> +}
> +
> +/* Unicode normalizing implementation of GLOB */
> +static void
> +glob_ucs_nfd(sqlite3_context *context,
> +             int argc, sqlite3_value **argv)
> +{
> +  glob_like_ucs_nfd_common(context, argc, argv, FALSE);
> +}
> +
> +/* Unicode normalizing implementation of LIKE */
> +static void
> +like_ucs_nfd(sqlite3_context *context,
> +             int argc, sqlite3_value **argv)
> +{
> +  glob_like_ucs_nfd_common(context, argc, argv, TRUE);
> +}
> 
>  svn_error_t *
>  svn_sqlite__open(svn_sqlite__db_t **db, const char *path,
> @@ -895,12 +969,25 @@ svn_sqlite__open(svn_sqlite__db_t **db,
> 
>    SVN_ERR(internal_open(&(*db)->db3, path, mode, scratch_pool));
> 
> -  (*db)->sqlext_buf1 = svn_stringbuf_create_ensure(4096, result_pool);
> -  (*db)->sqlext_buf2 = svn_stringbuf_create_ensure(4096, result_pool);
> +  /* Create extension buffers with space for 200 UCS-4 characters. */
> +  (*db)->sqlext_buf1 = svn_stringbuf_create_ensure(799, result_pool);
> +  (*db)->sqlext_buf2 = svn_stringbuf_create_ensure(799, result_pool);
> +  (*db)->sqlext_buf3 = svn_stringbuf_create_ensure(799, result_pool);
> +
> +  /* Register collation and LIKE and GLOB operator replacements. */
>    SQLITE_ERR(sqlite3_create_collation((*db)->db3,
>                                        "svn-ucs-nfd", SQLITE_UTF8,
>                                        *db, collate_ucs_nfd),
>               *db);
> +  SQLITE_ERR(sqlite3_create_function((*db)->db3, "glob", 2, SQLITE_UTF8,
> +                                     *db, glob_ucs_nfd, NULL, NULL),
> +             *db);
> +  SQLITE_ERR(sqlite3_create_function((*db)->db3, "like", 2, SQLITE_UTF8,
> +                                     *db, like_ucs_nfd, NULL, NULL),
> +             *db);
> +  SQLITE_ERR(sqlite3_create_function((*db)->db3, "like", 3, SQLITE_UTF8,
> +                                     *db, like_ucs_nfd, NULL, NULL),
> +             *db);

How does this affect indexes?

The usual like and glob code can use indexes in some cases?

Do we still use LIKE and GLOB, or did we already replace all invocations for performance reasons?
(I know I removed a lot of them to get queries to use indexes properly)

	Bert