You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nuttx.apache.org by xi...@apache.org on 2022/04/14 13:46:19 UTC
[incubator-nuttx] 01/03: libc/string: Use Byte-Shift algorithm for very long needles
This is an automated email from the ASF dual-hosted git repository.
xiaoxiang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nuttx.git
commit c3cefe6da22d794b03ec130190ad9bf023b23eb8
Author: chao.an <an...@xiaomi.com>
AuthorDate: Thu Mar 25 23:37:30 2021 +0800
libc/string: Use Byte-Shift algorithm for very long needles
Reference here:
https://github.com/taleinat/byteshift_strstr
Signed-off-by: chao.an <an...@xiaomi.com>
Signed-off-by: anjiahao <an...@xiaomi.com>
---
libs/libc/string/lib_strstr.c | 227 +++++++++++++++++++++++++++++++++++-------
1 file changed, 192 insertions(+), 35 deletions(-)
diff --git a/libs/libc/string/lib_strstr.c b/libs/libc/string/lib_strstr.c
index d8ee9f2768..ab7bfd7cc1 100644
--- a/libs/libc/string/lib_strstr.c
+++ b/libs/libc/string/lib_strstr.c
@@ -25,67 +25,224 @@
#include <nuttx/config.h>
#include <string.h>
+#include <stdbool.h>
+#include <arpa/inet.h>
+
+/****************************************************************************
+ * Pre-processor Definitions
+ ****************************************************************************/
+
+#if LONG_MAX == INT32_MAX
+ #define LONG_INT_IS_4_BYTES 1
+ #define LONG_INT_N_BYTES 4
+#else
+ #define LONG_INT_N_BYTES sizeof(long)
+#endif
+
+/* Determine the size, in bytes, of a long integer. */
+
+#if defined (CONFIG_ENDIAN_BIG)
+ #define ULONG_BIGENDIAN(x) (x)
+#elif defined (LONG_INT_IS_4_BYTES)
+ #define ULONG_BIGENDIAN(x) htonl(x)
+#else
+ #undef ULONG_BIGENDIAN
+#endif
/****************************************************************************
* Public Functions
****************************************************************************/
-#undef strstr /* See mm/README.txt */
-FAR char *strstr(FAR const char *str, FAR const char *substr)
+/* Finds the first occurrence of the sub-string needle in the
+ * string haystack. Returns NULL if needle was not found.
+ */
+
+FAR char *strstr(FAR const char *haystack, FAR const char *needle)
{
- FAR const char *candidate; /* Candidate in str with matching start character */
- char ch; /* First character of the substring */
- size_t len; /* The length of the substring */
+#ifndef ULONG_BIGENDIAN
+ FAR const unsigned char *needle_cmp_end;
+#endif
+ FAR const unsigned char *i_haystack;
+ const char needle_first = *needle;
+ FAR const unsigned char *i_needle;
+ unsigned long last_haystack_chars;
+ unsigned long last_needle_chars;
+ FAR const char *sub_start;
+ size_t needle_cmp_len;
+ bool identical = true;
+ unsigned long mask;
+ size_t compare_len;
+ size_t needle_len;
+
+ if (!*needle)
+ {
+ return (FAR char *)haystack;
+ }
+
+ /* Runs strchr() on the first section of the haystack as it has a lower
+ * algorithmic complexity for discarding the first non-matching characters.
+ */
+
+ haystack = strchr(haystack, needle_first);
+ if (!haystack) /* First character of needle is not in the haystack. */
+ {
+ return NULL;
+ }
- /* Special case the empty substring */
+ /* First characters of haystack and needle are the same now. Both are
+ * guaranteed to be at least one character long.
+ * Now computes the sum of the first needle_len characters of haystack
+ * minus the sum of characters values of needle.
+ */
- len = strlen(substr);
- ch = *substr;
+ i_haystack = (FAR const unsigned char *)haystack + 1;
+ i_needle = (FAR const unsigned char *)needle + 1;
- if (!ch)
+ while (*i_haystack && *i_needle)
{
- /* We'll say that an empty substring matches at the beginning of
- * the string
- */
+ identical &= *i_haystack++ == *i_needle++;
+ }
+
+ /* i_haystack now references the (needle_len + 1)-th character. */
+
+ if (*i_needle) /* haystack is smaller than needle. */
+ {
+ return NULL;
+ }
+ else if (identical)
+ {
+ return (FAR char *)haystack;
+ }
+
+ needle_len = i_needle - (FAR const unsigned char *)needle;
- return (FAR char *)str;
+ /* Note:
+ * needle_len > 1, because we checked that it isn't zero, and if it
+ * is 1 then identical must be true because the first strchr() ensured
+ * that the first characters are identical
+ */
+
+ sub_start = haystack;
+ needle_cmp_len = (needle_len < LONG_INT_N_BYTES) ?
+ needle_len : LONG_INT_N_BYTES;
+
+#ifdef ULONG_BIGENDIAN
+ last_needle_chars =
+ ULONG_BIGENDIAN(*((FAR unsigned long *)(i_needle - needle_cmp_len)))
+ >> (8 * (LONG_INT_N_BYTES - needle_cmp_len));
+ last_haystack_chars =
+ ULONG_BIGENDIAN(*((FAR unsigned long *)(i_haystack - needle_cmp_len)))
+ >> (8 * (LONG_INT_N_BYTES - needle_cmp_len));
+#else
+ needle_cmp_end = i_needle;
+
+ i_needle -= needle_cmp_len;
+ i_haystack -= needle_cmp_len;
+ last_needle_chars = 0;
+ last_haystack_chars = 0;
+
+ while (i_needle != needle_cmp_end)
+ {
+ last_needle_chars <<= 8;
+ last_needle_chars ^= *i_needle++;
+ last_haystack_chars <<= 8;
+ last_haystack_chars ^= *i_haystack++;
}
+#endif
- /* Search for the substring */
+ /* At this point:
+ * needle is at least two characters long
+ * haystack is at least needle_len characters long (also at least two)
+ * the first characters of needle and haystack are identical
+ */
- candidate = str;
- for (; ; )
+ if (needle_len > LONG_INT_N_BYTES + 1)
{
- /* strchr() will return a pointer to the next occurrence of the
- * character ch in the string
+ /* we will call memcmp() only once we know that the LONG_INT_N_BYTES
+ * last chars are equal, so it will be enough to compare all but the
+ * last LONG_INT_N_BYTES characters
*/
- candidate = strchr(candidate, ch);
- if (!candidate || strlen(candidate) < len)
- {
- /* First character of the substring does not appear in the string
- * or the remainder of the string is not long enough to contain the
- * substring.
- */
+ compare_len = needle_len - LONG_INT_N_BYTES;
- return NULL;
- }
+ /* iterate through the remainder of haystack while checking for
+ * identity of the last LONG_INT_N_BYTES, and checking the rest
+ * with memcmp()
+ */
- /* Check if this is the beginning of a matching substring */
+ while (*i_haystack)
+ {
+ last_haystack_chars <<= 8;
+ last_haystack_chars ^= *i_haystack++;
+
+ sub_start++;
+ if (last_haystack_chars == last_needle_chars &&
+ memcmp(sub_start, needle, compare_len) == 0)
+ {
+ return (FAR char *)sub_start;
+ }
+ }
+ }
+ else if (needle_len == LONG_INT_N_BYTES + 1)
+ {
+ /* iterate through the remainder of haystack while checking for
+ * identity of the last LONG_INT_N_BYTES as well as the single
+ * additional character, which is the first one
+ */
- if (strncmp(candidate, substr, len) == 0)
+ while (*i_haystack)
{
- return (FAR char *)candidate;
+ last_haystack_chars <<= 8;
+ last_haystack_chars ^= *i_haystack++;
+
+ sub_start++;
+ if (last_haystack_chars == last_needle_chars &&
+ *sub_start == needle_first)
+ {
+ return (FAR char *)sub_start;
+ }
}
+ }
+ else if (needle_len == LONG_INT_N_BYTES)
+ {
+ /* iterate through the remainder of haystack while checking for
+ * identity of the last LONG_INT_N_BYTES characters, which
+ * should exactly match the entire needle
+ */
- /* No, find the next candidate after this one */
+ while (*i_haystack)
+ {
+ last_haystack_chars <<= 8;
+ last_haystack_chars ^= *i_haystack++;
- candidate++;
+ if (last_haystack_chars == last_needle_chars)
+ {
+ return (FAR char *)(i_haystack - needle_len);
+ }
+ }
}
+ else /* needle_len < LONG_INT_N_BYTES */
+ {
+ mask = (((unsigned long)1) << (needle_len * 8)) - 1;
+ last_needle_chars &= mask;
- /* Won't get here, but some compilers might complain. Other compilers
- * might complain about this code being unreachable too.
- */
+ /* iterate through the remainder of haystack, updating the sums'
+ * difference and checking for identity whenever the difference
+ * is zero
+ */
+
+ while (*i_haystack)
+ {
+ last_haystack_chars <<= 8;
+ last_haystack_chars ^= *i_haystack++;
+ last_haystack_chars &= mask;
+
+ if (last_haystack_chars == last_needle_chars)
+ {
+ return (FAR char *)(i_haystack - needle_len);
+ }
+ }
+ }
return NULL;
}