Posted to commits@nuttx.apache.org by xi...@apache.org on 2021/11/06 12:39:39 UTC
[incubator-nuttx] 02/03: libc:machine:xtensa: add xtensa libc implementation
This is an automated email from the ASF dual-hosted git repository.
xiaoxiang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nuttx.git
commit cfcff5f570319192af6f64d4e7407c991488cff8
Author: zhuyanlin <zh...@xiaomi.com>
AuthorDate: Thu Oct 28 11:56:18 2021 +0800
libc:machine:xtensa: add xtensa libc implementation
N/A
Signed-off-by: zhuyanlin <zh...@xiaomi.com>
---
libs/libc/machine/xtensa/Kconfig | 43 ++
libs/libc/machine/xtensa/Make.defs | 31 +-
libs/libc/machine/xtensa/arch_memcpy.S | 281 ++++++++++++
libs/libc/machine/xtensa/arch_memmove.S | 480 ++++++++++++++++++++
libs/libc/machine/xtensa/arch_memset.S | 179 ++++++++
libs/libc/machine/xtensa/arch_strcmp.S | 767 ++++++++++++++++++++++++++++++++
libs/libc/machine/xtensa/arch_strcpy.S | 243 ++++++++++
libs/libc/machine/xtensa/arch_strlen.S | 123 +++++
libs/libc/machine/xtensa/arch_strncpy.S | 265 +++++++++++
libs/libc/machine/xtensa/xtensa_asm.h | 62 +++
10 files changed, 2472 insertions(+), 2 deletions(-)
diff --git a/libs/libc/machine/xtensa/Kconfig b/libs/libc/machine/xtensa/Kconfig
index f72f3c0..232fb73 100644
--- a/libs/libc/machine/xtensa/Kconfig
+++ b/libs/libc/machine/xtensa/Kconfig
@@ -2,3 +2,46 @@
# For a description of the syntax of this configuration file,
# see the file kconfig-language.txt in the NuttX tools repository.
#
+
+config XTENSA_MEMCPY
+ bool "Enable optimized memcpy() for XTENSA"
+ select LIBC_ARCH_MEMCPY
+ ---help---
Enable the optimized XTENSA-specific memcpy() library function
+
+config XTENSA_MEMMOVE
+ bool "Enable optimized memmove() for XTENSA"
+ select LIBC_ARCH_MEMMOVE
+ ---help---
Enable the optimized XTENSA-specific memmove() library function
+
+config XTENSA_MEMSET
+ bool "Enable optimized memset() for XTENSA"
+ select LIBC_ARCH_MEMSET
+ ---help---
Enable the optimized XTENSA-specific memset() library function
+
+config XTENSA_STRCMP
+ bool "Enable optimized strcmp() for XTENSA"
+ select LIBC_ARCH_STRCMP
+ ---help---
Enable the optimized XTENSA-specific strcmp() library function
+
+config XTENSA_STRCPY
+ bool "Enable optimized strcpy() for XTENSA"
+ select LIBC_ARCH_STRCPY
+ ---help---
Enable the optimized XTENSA-specific strcpy() library function
+
+config XTENSA_STRLEN
+ bool "Enable optimized strlen() for XTENSA"
+ select LIBC_ARCH_STRLEN
+ ---help---
Enable the optimized XTENSA-specific strlen() library function
+
+config XTENSA_STRNCPY
+ bool "Enable optimized strncpy() for XTENSA"
+ select LIBC_ARCH_STRNCPY
+ ---help---
Enable the optimized XTENSA-specific strncpy() library function
+
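
These options are all off by default. A board that wants the optimized routines enables them in its configuration; a hypothetical defconfig fragment (option names taken from the Kconfig above):

CONFIG_XTENSA_MEMCPY=y
CONFIG_XTENSA_MEMMOVE=y
CONFIG_XTENSA_MEMSET=y
CONFIG_XTENSA_STRCMP=y
CONFIG_XTENSA_STRCPY=y
CONFIG_XTENSA_STRLEN=y
CONFIG_XTENSA_STRNCPY=y

Each option selects the corresponding LIBC_ARCH_* symbol, which is what steers the common libc build to the machine-specific implementation instead of the generic C version.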
diff --git a/libs/libc/machine/xtensa/Make.defs b/libs/libc/machine/xtensa/Make.defs
index 8f33a82..379c7da 100644
--- a/libs/libc/machine/xtensa/Make.defs
+++ b/libs/libc/machine/xtensa/Make.defs
@@ -19,10 +19,37 @@
############################################################################
ifeq ($(CONFIG_LIBC_ARCH_ELF),y)
-
CSRCS += arch_elf.c
+endif
+
+ifeq ($(CONFIG_XTENSA_MEMCPY),y)
+ASRCS += arch_memcpy.S
+endif
+
+ifeq ($(CONFIG_XTENSA_MEMMOVE),y)
+ASRCS += arch_memmove.S
+endif
+
+ifeq ($(CONFIG_XTENSA_MEMSET),y)
+ASRCS += arch_memset.S
+endif
+
+ifeq ($(CONFIG_XTENSA_STRCPY),y)
+ASRCS += arch_strcpy.S
+endif
+
+ifeq ($(CONFIG_XTENSA_STRLEN),y)
+ASRCS += arch_strlen.S
+endif
+
+ifeq ($(CONFIG_XTENSA_STRNCPY),y)
+ASRCS += arch_strncpy.S
+endif
+
+ifeq ($(CONFIG_XTENSA_STRCMP),y)
+ASRCS += arch_strcmp.S
+endif
DEPPATH += --dep-path machine/xtensa
VPATH += :machine/xtensa
-endif
diff --git a/libs/libc/machine/xtensa/arch_memcpy.S b/libs/libc/machine/xtensa/arch_memcpy.S
new file mode 100644
index 0000000..47de6dd
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_memcpy.S
@@ -0,0 +1,281 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_memcpy.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership. The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Pre-processor Macros
+ ****************************************************************************/
+
+/* set to 1 when running on ISS (simulator) with the
+ lint or ferret client, or 0 to save a few cycles */
+
+#define SIM_CHECKS_ALIGNMENT 0
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+ .section .text
+ .begin schedule
+ .literal_position
+
+ .local .Ldst1mod2
+ .local .Ldst2mod4
+ .local .Lbytecopy
+
+ .align 4
+ .global memcpy
+ .type memcpy, @function
+memcpy:
+ ENTRY(16)
+ /* a2 = dst, a3 = src, a4 = len */
+
+ mov a5, a2 # copy dst so that a2 is return value
+ bbsi.l a2, 0, .Ldst1mod2
+ bbsi.l a2, 1, .Ldst2mod4
+.Ldstaligned:
+
+ /* Get number of loop iterations with 16B per iteration. */
+ srli a7, a4, 4
+
+ /* Check if source is aligned. */
+ slli a8, a3, 30
+ bnez a8, .Lsrcunaligned
+
+ /* Destination and source are word-aligned, use word copy. */
+#if XCHAL_HAVE_LOOPS
+ loopnez a7, 2f
+#else
+ beqz a7, 2f
+ slli a8, a7, 4
+ add a8, a8, a3 # a8 = end of last 16B source chunk
+#endif
+1: l32i a6, a3, 0
+ l32i a7, a3, 4
+ s32i a6, a5, 0
+ l32i a6, a3, 8
+
+ s32i a7, a5, 4
+ l32i a7, a3, 12
+ s32i a6, a5, 8
+ addi a3, a3, 16
+ s32i a7, a5, 12
+ addi a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+ bltu a3, a8, 1b
+#endif
+
+ /* Copy any leftover pieces smaller than 16B. */
+2: bbci.l a4, 3, 3f
+
+ /* Copy 8 bytes. */
+ l32i a6, a3, 0
+ l32i a7, a3, 4
+ addi a3, a3, 8
+ s32i a6, a5, 0
+ s32i a7, a5, 4
+ addi a5, a5, 8
+
+3: bbsi.l a4, 2, 4f
+ bbsi.l a4, 1, 5f
+ bbsi.l a4, 0, 6f
+ RET(16)
+
+ # .align 4
+ /* Copy 4 bytes. */
+4: l32i a6, a3, 0
+ addi a3, a3, 4
+ s32i a6, a5, 0
+ addi a5, a5, 4
+ bbsi.l a4, 1, 5f
+ bbsi.l a4, 0, 6f
+ RET(16)
+
+ /* Copy 2 bytes. */
+5: l16ui a6, a3, 0
+ addi a3, a3, 2
+ s16i a6, a5, 0
+ addi a5, a5, 2
+ bbsi.l a4, 0, 6f
+ RET(16)
+
+ /* Copy 1 byte. */
+6: l8ui a6, a3, 0
+ s8i a6, a5, 0
+
+.Ldone:
+ RET(16)
+
+
+/* Destination is aligned; source is unaligned. */
+
+ # .align 4
+.Lsrcunaligned:
+ /* Avoid loading anything for zero-length copies. */
+ beqz a4, .Ldone
+
+ /* Copy 16 bytes per iteration for word-aligned dst and
+ unaligned src. */
+ ssa8 a3 # set shift amount from byte offset
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+ srli a11, a8, 30 # save unalignment offset for below
+ sub a3, a3, a11 # align a3
+#endif
+ l32i a6, a3, 0 # load first word
+#if XCHAL_HAVE_LOOPS
+ loopnez a7, 2f
+#else
+ beqz a7, 2f
+ slli a10, a7, 4
+ add a10, a10, a3 # a10 = end of last 16B source chunk
+#endif
+1: l32i a7, a3, 4
+ l32i a8, a3, 8
+ src_b a6, a6, a7
+ s32i a6, a5, 0
+ l32i a9, a3, 12
+ src_b a7, a7, a8
+ s32i a7, a5, 4
+ l32i a6, a3, 16
+ src_b a8, a8, a9
+ s32i a8, a5, 8
+ addi a3, a3, 16
+ src_b a9, a9, a6
+ s32i a9, a5, 12
+ addi a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+ bltu a3, a10, 1b
+#endif
+
+2: bbci.l a4, 3, 3f
+
+ /* Copy 8 bytes. */
+ l32i a7, a3, 4
+ l32i a8, a3, 8
+ src_b a6, a6, a7
+ s32i a6, a5, 0
+ addi a3, a3, 8
+ src_b a7, a7, a8
+ s32i a7, a5, 4
+ addi a5, a5, 8
+ mov a6, a8
+
+3: bbci.l a4, 2, 4f
+
+ /* Copy 4 bytes. */
+ l32i a7, a3, 4
+ addi a3, a3, 4
+ src_b a6, a6, a7
+ s32i a6, a5, 0
+ addi a5, a5, 4
+ mov a6, a7
+4:
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+ add a3, a3, a11 # readjust a3 with correct misalignment
+#endif
+ bbsi.l a4, 1, 5f
+ bbsi.l a4, 0, 6f
+ RET(16)
+
+ /* Copy 2 bytes. */
+5: l8ui a6, a3, 0
+ l8ui a7, a3, 1
+ addi a3, a3, 2
+ s8i a6, a5, 0
+ s8i a7, a5, 1
+ addi a5, a5, 2
+ bbsi.l a4, 0, 6f
+ RET(16)
+
+ /* Copy 1 byte. */
+6: l8ui a6, a3, 0
+ s8i a6, a5, 0
+ RET(16)
+
+
+ # .align XCHAL_INST_FETCH_WIDTH
+__memcpy_aux:
+
+ /* Skip bytes to get proper alignment for three-byte loop */
+# .skip XCHAL_INST_FETCH_WIDTH - 3
+
+.Lbytecopy:
+#if XCHAL_HAVE_LOOPS
+ loopnez a4, 2f
+#else
+ beqz a4, 2f
+ add a7, a3, a4 # a7 = end address for source
+#endif
+1: l8ui a6, a3, 0
+ addi a3, a3, 1
+ s8i a6, a5, 0
+ addi a5, a5, 1
+#if !XCHAL_HAVE_LOOPS
+ bltu a3, a7, 1b
+#endif
+2: RET(16)
+
+
+/* Destination is unaligned. */
+
+ # .align 4
+.Ldst1mod2: # dst is only byte aligned
+
+ /* Do short copies byte-by-byte. */
+ bltui a4, 7, .Lbytecopy
+
+ /* Copy 1 byte. */
+ l8ui a6, a3, 0
+ addi a3, a3, 1
+ addi a4, a4, -1
+ s8i a6, a5, 0
+ addi a5, a5, 1
+
+ /* Return to main algorithm if dst is now aligned. */
+ bbci.l a5, 1, .Ldstaligned
+
+.Ldst2mod4: # dst has 16-bit alignment
+
+ /* Do short copies byte-by-byte. */
+ bltui a4, 6, .Lbytecopy
+
+ /* Copy 2 bytes. */
+ l8ui a6, a3, 0
+ l8ui a7, a3, 1
+ addi a3, a3, 2
+ addi a4, a4, -2
+ s8i a6, a5, 0
+ s8i a7, a5, 1
+ addi a5, a5, 2
+
+ /* dst is now aligned; return to main algorithm. */
+ j .Ldstaligned
+
+ .end schedule
+
+ .size memcpy, . - memcpy
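
For reference, a hypothetical C sketch of the ssa8/src_b "shifting copy" used above for a word-aligned destination with an unaligned source (little-endian assumed; all names are illustrative and not part of the commit):

#include <stddef.h>
#include <stdint.h>

/* Copy 4 bytes per iteration from a misaligned src to a word-aligned
 * dst, assuming 0 < off < 4 (the fully aligned case is handled
 * separately, as in the assembly).  The sub-word tail is omitted
 * for brevity.
 */

static void copy_shifted(uint32_t *d, const uint8_t *src, size_t len)
{
  uintptr_t off = (uintptr_t)src & 3;              /* byte misalignment */
  const uint32_t *s = (const uint32_t *)(src - off);
  uint32_t lo = *s++;                              /* first partial word */

  while (len >= 4)
    {
      uint32_t hi = *s++;

      /* Funnel shift: low bytes come from lo, high bytes from hi */

      *d++ = (lo >> (8 * off)) | (hi << (32 - 8 * off));
      lo = hi;
      len -= 4;
    }
}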
diff --git a/libs/libc/machine/xtensa/arch_memmove.S b/libs/libc/machine/xtensa/arch_memmove.S
new file mode 100644
index 0000000..7ce56c4
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_memmove.S
@@ -0,0 +1,480 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_memmove.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership. The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Pre-processor Macros
+ ****************************************************************************/
+
+/* set to 1 when running on ISS (simulator) with the
+ lint or ferret client, or 0 to save a few cycles */
+
+#define SIM_CHECKS_ALIGNMENT 0
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+ .text
+ .begin schedule
+ .global memmove
+
+/*
+ * Byte by byte copy
+ */
+ .align 4
+ .byte 0 # 1 mod 4 alignment for LOOPNEZ
+ # (0 mod 4 alignment for LBEG)
+.Lbytecopy:
+#if XCHAL_HAVE_LOOPS
+ loopnez a4, .Lbytecopydone
+#else /* !XCHAL_HAVE_LOOPS */
+ beqz a4, .Lbytecopydone
+ add a7, a3, a4 # a7 = end address for source
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lnextbyte:
+ l8ui a6, a3, 0
+ addi a3, a3, 1
+ s8i a6, a5, 0
+ addi a5, a5, 1
+#if !XCHAL_HAVE_LOOPS
+ bne a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lbytecopydone:
+ RET(16)
+
+/*
+ * Destination is unaligned
+ */
+
+ .align 4
+.Ldst1mod2: # dst is only byte aligned
+ _bltui a4, 7, .Lbytecopy # do short copies byte by byte
+
+ # copy 1 byte
+ l8ui a6, a3, 0
+ addi a3, a3, 1
+ addi a4, a4, -1
+ s8i a6, a5, 0
+ addi a5, a5, 1
+ _bbci.l a5, 1, .Ldstaligned # if dst is now aligned, then
+ # return to main algorithm
+.Ldst2mod4: # dst 16-bit aligned
+ # copy 2 bytes
+ _bltui a4, 6, .Lbytecopy # do short copies byte by byte
+ l8ui a6, a3, 0
+ l8ui a7, a3, 1
+ addi a3, a3, 2
+ addi a4, a4, -2
+ s8i a6, a5, 0
+ s8i a7, a5, 1
+ addi a5, a5, 2
+ j .Ldstaligned # dst is now aligned, return to main algorithm
+
+.Lcommon:
+ bbsi.l a2, 0, .Ldst1mod2 # if dst is 1 mod 2
+ bbsi.l a2, 1, .Ldst2mod4 # if dst is 2 mod 4
+.Ldstaligned: # return here from .Ldst?mod? once dst is aligned
+ srli a7, a4, 4 # number of loop iterations with 16B
+ # per iteration
+ movi a8, 3 # if source is not aligned,
+ bany a3, a8, .Lsrcunaligned # then use shifting copy
+ /*
+ * Destination and source are word-aligned, use word copy.
+ */
+ # copy 16 bytes per iteration for word-aligned dst and word-aligned src
+#if XCHAL_HAVE_LOOPS
+ loopnez a7, .Loop1done
+#else /* !XCHAL_HAVE_LOOPS */
+ beqz a7, .Loop1done
+ slli a8, a7, 4
+ add a8, a8, a3 # a8 = end of last 16B source chunk
+#endif /* !XCHAL_HAVE_LOOPS */
+.Loop1:
+ l32i a6, a3, 0
+ l32i a7, a3, 4
+ s32i a6, a5, 0
+ l32i a6, a3, 8
+ s32i a7, a5, 4
+ l32i a7, a3, 12
+ s32i a6, a5, 8
+ addi a3, a3, 16
+ s32i a7, a5, 12
+ addi a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+ bne a3, a8, .Loop1 # continue loop if a3:src != a8:src_end
+#endif /* !XCHAL_HAVE_LOOPS */
+.Loop1done:
+ bbci.l a4, 3, .L2
+ # copy 8 bytes
+ l32i a6, a3, 0
+ l32i a7, a3, 4
+ addi a3, a3, 8
+ s32i a6, a5, 0
+ s32i a7, a5, 4
+ addi a5, a5, 8
+.L2:
+ bbsi.l a4, 2, .L3
+ bbsi.l a4, 1, .L4
+ bbsi.l a4, 0, .L5
+ RET(16)
+.L3:
+ # copy 4 bytes
+ l32i a6, a3, 0
+ addi a3, a3, 4
+ s32i a6, a5, 0
+ addi a5, a5, 4
+ bbsi.l a4, 1, .L4
+ bbsi.l a4, 0, .L5
+ RET(16)
+.L4:
+ # copy 2 bytes
+ l16ui a6, a3, 0
+ addi a3, a3, 2
+ s16i a6, a5, 0
+ addi a5, a5, 2
+ bbsi.l a4, 0, .L5
+ RET(16)
+.L5:
+ # copy 1 byte
+ l8ui a6, a3, 0
+ s8i a6, a5, 0
+ RET(16)
+
+/*
+ * Destination is aligned, Source is unaligned
+ */
+
+ .align 4
+.Lsrcunaligned:
+ _beqz a4, .Ldone # avoid loading anything for zero-length copies
+ # copy 16 bytes per iteration for word-aligned dst and unaligned src
+ ssa8 a3 # set shift amount from byte offset
+
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+ and a11, a3, a8 # save unalignment offset for below
+ sub a3, a3, a11 # align a3
+#endif
+ l32i a6, a3, 0 # load first word
+#if XCHAL_HAVE_LOOPS
+ loopnez a7, .Loop2done
+#else /* !XCHAL_HAVE_LOOPS */
+ beqz a7, .Loop2done
+ slli a10, a7, 4
+ add a10, a10, a3 # a10 = end of last 16B source chunk
+#endif /* !XCHAL_HAVE_LOOPS */
+.Loop2:
+ l32i a7, a3, 4
+ l32i a8, a3, 8
+ src_b a6, a6, a7
+ s32i a6, a5, 0
+ l32i a9, a3, 12
+ src_b a7, a7, a8
+ s32i a7, a5, 4
+ l32i a6, a3, 16
+ src_b a8, a8, a9
+ s32i a8, a5, 8
+ addi a3, a3, 16
+ src_b a9, a9, a6
+ s32i a9, a5, 12
+ addi a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+ bne a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
+#endif /* !XCHAL_HAVE_LOOPS */
+.Loop2done:
+ bbci.l a4, 3, .L12
+ # copy 8 bytes
+ l32i a7, a3, 4
+ l32i a8, a3, 8
+ src_b a6, a6, a7
+ s32i a6, a5, 0
+ addi a3, a3, 8
+ src_b a7, a7, a8
+ s32i a7, a5, 4
+ addi a5, a5, 8
+ mov a6, a8
+.L12:
+ bbci.l a4, 2, .L13
+ # copy 4 bytes
+ l32i a7, a3, 4
+ addi a3, a3, 4
+ src_b a6, a6, a7
+ s32i a6, a5, 0
+ addi a5, a5, 4
+ mov a6, a7
+.L13:
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+ add a3, a3, a11 # readjust a3 with correct misalignment
+#endif
+ bbsi.l a4, 1, .L14
+ bbsi.l a4, 0, .L15
+.Ldone: RET(16)
+.L14:
+ # copy 2 bytes
+ l8ui a6, a3, 0
+ l8ui a7, a3, 1
+ addi a3, a3, 2
+ s8i a6, a5, 0
+ s8i a7, a5, 1
+ addi a5, a5, 2
+ bbsi.l a4, 0, .L15
+ RET(16)
+.L15:
+ # copy 1 byte
+ l8ui a6, a3, 0
+ s8i a6, a5, 0
+ RET(16)
+
+/*
+ * Byte by byte copy
+ */
+ .align 4
+ .byte 0 # 1 mod 4 alignment for LOOPNEZ
+ # (0 mod 4 alignment for LBEG)
+.Lbackbytecopy:
+#if XCHAL_HAVE_LOOPS
+ loopnez a4, .Lbackbytecopydone
+#else /* !XCHAL_HAVE_LOOPS */
+ beqz a4, .Lbackbytecopydone
+ sub a7, a3, a4 # a7 = start address for source
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lbacknextbyte:
+ addi a3, a3, -1
+ l8ui a6, a3, 0
+ addi a5, a5, -1
+ s8i a6, a5, 0
+#if !XCHAL_HAVE_LOOPS
+ bne a3, a7, .Lbacknextbyte # continue loop if
+ # $a3:src != $a7:src_start
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lbackbytecopydone:
+ RET(16)
+
+/*
+ * Destination is unaligned
+ */
+
+ .align 4
+.Lbackdst1mod2: # dst is only byte aligned
+ _bltui a4, 7, .Lbackbytecopy # do short copies byte by byte
+
+ # copy 1 byte
+ addi a3, a3, -1
+ l8ui a6, a3, 0
+ addi a5, a5, -1
+ s8i a6, a5, 0
+ addi a4, a4, -1
+ _bbci.l a5, 1, .Lbackdstaligned # if dst is now aligned, then
+ # return to main algorithm
+.Lbackdst2mod4: # dst 16-bit aligned
+ # copy 2 bytes
+ _bltui a4, 6, .Lbackbytecopy # do short copies byte by byte
+ addi a3, a3, -2
+ l8ui a6, a3, 0
+ l8ui a7, a3, 1
+ addi a5, a5, -2
+ s8i a6, a5, 0
+ s8i a7, a5, 1
+ addi a4, a4, -2
+ j .Lbackdstaligned # dst is now aligned,
+ # return to main algorithm
+
+ .align 4
+memmove:
+
+ ENTRY(16)
+ # a2/ dst, a3/ src, a4/ len
+ mov a5, a2 # copy dst so that a2 is return value
+.Lmovecommon:
+ sub a6, a5, a3
+ bgeu a6, a4, .Lcommon
+
+ add a5, a5, a4
+ add a3, a3, a4
+
+ bbsi.l a5, 0, .Lbackdst1mod2 # if dst is 1 mod 2
+ bbsi.l a5, 1, .Lbackdst2mod4 # if dst is 2 mod 4
+.Lbackdstaligned: # return here from .Lbackdst?mod? once dst is aligned
+ srli a7, a4, 4 # number of loop iterations with 16B
+ # per iteration
+ movi a8, 3 # if source is not aligned,
+ bany a3, a8, .Lbacksrcunaligned # then use shifting copy
+ /*
+ * Destination and source are word-aligned, use word copy.
+ */
+ # copy 16 bytes per iteration for word-aligned dst and word-aligned src
+#if XCHAL_HAVE_LOOPS
+ loopnez a7, .backLoop1done
+#else /* !XCHAL_HAVE_LOOPS */
+ beqz a7, .backLoop1done
+ slli a8, a7, 4
+ sub a8, a3, a8 # a8 = start of first 16B source chunk
+#endif /* !XCHAL_HAVE_LOOPS */
+.backLoop1:
+ addi a3, a3, -16
+ l32i a7, a3, 12
+ l32i a6, a3, 8
+ addi a5, a5, -16
+ s32i a7, a5, 12
+ l32i a7, a3, 4
+ s32i a6, a5, 8
+ l32i a6, a3, 0
+ s32i a7, a5, 4
+ s32i a6, a5, 0
+#if !XCHAL_HAVE_LOOPS
+ bne a3, a8, .backLoop1 # continue loop if a3:src != a8:src_start
+#endif /* !XCHAL_HAVE_LOOPS */
+.backLoop1done:
+ bbci.l a4, 3, .Lback2
+ # copy 8 bytes
+ addi a3, a3, -8
+ l32i a6, a3, 0
+ l32i a7, a3, 4
+ addi a5, a5, -8
+ s32i a6, a5, 0
+ s32i a7, a5, 4
+.Lback2:
+ bbsi.l a4, 2, .Lback3
+ bbsi.l a4, 1, .Lback4
+ bbsi.l a4, 0, .Lback5
+ RET(16)
+.Lback3:
+ # copy 4 bytes
+ addi a3, a3, -4
+ l32i a6, a3, 0
+ addi a5, a5, -4
+ s32i a6, a5, 0
+ bbsi.l a4, 1, .Lback4
+ bbsi.l a4, 0, .Lback5
+ RET(16)
+.Lback4:
+ # copy 2 bytes
+ addi a3, a3, -2
+ l16ui a6, a3, 0
+ addi a5, a5, -2
+ s16i a6, a5, 0
+ bbsi.l a4, 0, .Lback5
+ RET(16)
+.Lback5:
+ # copy 1 byte
+ addi a3, a3, -1
+ l8ui a6, a3, 0
+ addi a5, a5, -1
+ s8i a6, a5, 0
+ RET(16)
+
+/*
+ * Destination is aligned, Source is unaligned
+ */
+
+ .align 4
+.Lbacksrcunaligned:
+ _beqz a4, .Lbackdone # avoid loading anything for zero-length copies
+ # copy 16 bytes per iteration for word-aligned dst and unaligned src
+ ssa8 a3 # set shift amount from byte offset
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+ and a11, a3, a8 # save unalignment offset for below
+ sub a3, a3, a11 # align a3
+#endif
+ l32i a6, a3, 0 # load first word
+#if XCHAL_HAVE_LOOPS
+ loopnez a7, .backLoop2done
+#else /* !XCHAL_HAVE_LOOPS */
+ beqz a7, .backLoop2done
+ slli a10, a7, 4
+ sub a10, a3, a10 # a10 = start of first 16B source chunk
+#endif /* !XCHAL_HAVE_LOOPS */
+.backLoop2:
+ addi a3, a3, -16
+ l32i a7, a3, 12
+ l32i a8, a3, 8
+ addi a5, a5, -16
+ src_b a6, a7, a6
+ s32i a6, a5, 12
+ l32i a9, a3, 4
+ src_b a7, a8, a7
+ s32i a7, a5, 8
+ l32i a6, a3, 0
+ src_b a8, a9, a8
+ s32i a8, a5, 4
+ src_b a9, a6, a9
+ s32i a9, a5, 0
+#if !XCHAL_HAVE_LOOPS
+ bne a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
+#endif /* !XCHAL_HAVE_LOOPS */
+.backLoop2done:
+ bbci.l a4, 3, .Lback12
+ # copy 8 bytes
+ addi a3, a3, -8
+ l32i a7, a3, 4
+ l32i a8, a3, 0
+ addi a5, a5, -8
+ src_b a6, a7, a6
+ s32i a6, a5, 4
+ src_b a7, a8, a7
+ s32i a7, a5, 0
+ mov a6, a8
+.Lback12:
+ bbci.l a4, 2, .Lback13
+ # copy 4 bytes
+ addi a3, a3, -4
+ l32i a7, a3, 0
+ addi a5, a5, -4
+ src_b a6, a7, a6
+ s32i a6, a5, 0
+ mov a6, a7
+.Lback13:
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+ add a3, a3, a11 # readjust a3 with correct misalignment
+#endif
+ bbsi.l a4, 1, .Lback14
+ bbsi.l a4, 0, .Lback15
+.Lbackdone:
+ RET(16)
+.Lback14:
+ # copy 2 bytes
+ addi a3, a3, -2
+ l8ui a6, a3, 0
+ l8ui a7, a3, 1
+ addi a5, a5, -2
+ s8i a6, a5, 0
+ s8i a7, a5, 1
+ bbsi.l a4, 0, .Lback15
+ RET(16)
+.Lback15:
+ # copy 1 byte
+ addi a3, a3, -1
+ addi a5, a5, -1
+ l8ui a6, a3, 0
+ s8i a6, a5, 0
+ RET(16)
+
+ .end schedule
+ .size memmove, . - memmove
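
The direction test at .Lmovecommon above (sub a6, a5, a3 followed by bgeu a6, a4) is the classic unsigned overlap check. A hypothetical C sketch of the same logic, with plain byte loops standing in for the optimized copies:

#include <stddef.h>
#include <stdint.h>

void *sketch_memmove(void *dst, const void *src, size_t n)
{
  uint8_t *d = dst;
  const uint8_t *s = src;

  if ((uintptr_t)d - (uintptr_t)s >= n)
    {
      /* If dst is below src the unsigned difference wraps to a large
       * value; either way a result >= n means a forward copy cannot
       * clobber source bytes before they are read.
       */

      while (n-- > 0)
        {
          *d++ = *s++;
        }
    }
  else
    {
      /* dst overlaps the tail of src: copy backward from the end */

      d += n;
      s += n;
      while (n-- > 0)
        {
          *--d = *--s;
        }
    }

  return dst;
}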
diff --git a/libs/libc/machine/xtensa/arch_memset.S b/libs/libc/machine/xtensa/arch_memset.S
new file mode 100644
index 0000000..488172f
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_memset.S
@@ -0,0 +1,179 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_memset.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership. The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+/* void *memset (void *dst, int c, size_t length)
+
+ The algorithm is as follows:
+
+ Create a word with c in all byte positions.
+
+ If the destination is aligned, set 16B chunks with a loop, and then
+ finish up with 8B, 4B, 2B, and 1B stores conditional on the length.
+
+ If the destination is unaligned, align it by conditionally
+ setting 1B and/or 2B and then go to aligned case.
+
+ This code tries to use fall-through branches for the common
+ case of an aligned destination (except for the branches to
+ the alignment labels). */
+
+
+/* Byte-by-byte set. */
+
+ .section .text
+ .begin schedule
+ .literal_position
+
+ .local .Lbyteset
+ .local .Ldst1mod2
+ .local .Ldst2mod4
+
+ .align 4
+ .global memset
+ .type memset, @function
+memset:
+ ENTRY(16)
+ /* a2 = dst, a3 = c, a4 = length */
+
+ /* Duplicate character into all bytes of word. */
+ extui a3, a3, 0, 8
+ slli a7, a3, 8
+ or a3, a3, a7
+ slli a7, a3, 16
+ or a3, a3, a7
+
+ mov a5, a2 // copy dst so that a2 is return value
+
+ /* Check if dst is unaligned. */
+ bbsi.l a2, 0, .Ldst1mod2
+ bbsi.l a2, 1, .Ldst2mod4
+ j .Ldstaligned
+
+.Ldst1mod2: // dst is only byte aligned
+
+ /* Do short sizes byte-by-byte. */
+ bltui a4, 8, .Lbyteset
+
+ /* Set 1 byte. */
+ s8i a3, a5, 0
+ addi a5, a5, 1
+ addi a4, a4, -1
+
+ /* Now retest if dst is aligned. */
+ bbci.l a5, 1, .Ldstaligned
+
+.Ldst2mod4: // dst has 16-bit alignment
+
+ /* Do short sizes byte-by-byte. */
+ bltui a4, 8, .Lbyteset
+
+ /* Set 2 bytes. */
+ s16i a3, a5, 0
+ addi a5, a5, 2
+ addi a4, a4, -2
+
+ /* dst is now aligned; fall through to main algorithm */
+
+.Ldstaligned:
+
+ /* Get number of loop iterations with 16B per iteration. */
+ srli a7, a4, 4
+
+ /* Destination is word-aligned. */
+#if XCHAL_HAVE_LOOPS
+ loopnez a7, 2f
+#else
+ beqz a7, 2f
+ slli a6, a7, 4
+ add a6, a6, a5 // a6 = end of last 16B chunk
+#endif
+ /* Set 16 bytes per iteration. */
+1: s32i a3, a5, 0
+ s32i a3, a5, 4
+ s32i a3, a5, 8
+ s32i a3, a5, 12
+ addi a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+ bltu a5, a6, 1b
+#endif
+
+ /* Set any leftover pieces smaller than 16B. */
+2: bbci.l a4, 3, 3f
+
+ /* Set 8 bytes. */
+ s32i a3, a5, 0
+ s32i a3, a5, 4
+ addi a5, a5, 8
+
+3: bbci.l a4, 2, 4f
+
+ /* Set 4 bytes. */
+ s32i a3, a5, 0
+ addi a5, a5, 4
+
+4: bbci.l a4, 1, 5f
+
+ /* Set 2 bytes. */
+ s16i a3, a5, 0
+ addi a5, a5, 2
+
+5: bbci.l a4, 0, 6f
+
+ /* Set 1 byte. */
+ s8i a3, a5, 0
+6: RET(16)
+
+
+ // .align XCHAL_INST_FETCH_WIDTH
+__memset_aux:
+
+ /* Skip bytes to get proper alignment for three-byte loop */
+// .skip XCHAL_INST_FETCH_WIDTH - 3
+
+.Lbyteset:
+#if XCHAL_HAVE_LOOPS
+ loopnez a4, 2f
+#else
+ beqz a4, 2f
+ add a6, a5, a4 // a6 = ending address
+#endif
+1: s8i a3, a5, 0
+ addi a5, a5, 1
+#if !XCHAL_HAVE_LOOPS
+ bltu a5, a6, 1b
+#endif
+2: RET(16)
+
+ .end schedule
+
+ .size memset, . - memset
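
A hypothetical C sketch of the structure used above: duplicate the fill byte into a word (the extui/slli/or sequence), align the head byte-by-byte, store words, then finish the tail. Illustrative only, not part of the commit:

#include <stddef.h>
#include <stdint.h>

void *sketch_memset(void *dst, int c, size_t n)
{
  uint8_t *d = dst;
  uint32_t w = (uint8_t)c;

  w |= w << 8;                         /* c in all four byte lanes */
  w |= w << 16;

  while (n > 0 && ((uintptr_t)d & 3))  /* align destination head */
    {
      *d++ = (uint8_t)w;
      n--;
    }

  while (n >= 4)                       /* aligned word stores */
    {
      *(uint32_t *)d = w;
      d += 4;
      n -= 4;
    }

  while (n-- > 0)                      /* sub-word tail */
    {
      *d++ = (uint8_t)w;
    }

  return dst;
}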
diff --git a/libs/libc/machine/xtensa/arch_strcmp.S b/libs/libc/machine/xtensa/arch_strcmp.S
new file mode 100644
index 0000000..aab50be
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_strcmp.S
@@ -0,0 +1,767 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_strcmp.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership. The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Pre-processor Macros
+ ****************************************************************************/
+
+#define MASK4 0x40404040
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+ .section .text
+ .begin schedule
+ .align 4
+ .literal_position
+
+ .global strcmp
+ .type strcmp,@function
+ .align 4
+
+strcmp:
+
+#if XCHAL_HAVE_LOOPS && XCHAL_HAVE_DENSITY && !XCHAL_HAVE_BE && XCHAL_HAVE_FLIX3
+/* Fast version for FLIX3 Little Endian */
+
+
+ ENTRY(16)
+ /* a2 = s1, a3 = s2 */
+
+ l8ui a8, a2, 0 # byte 0 from s1
+ l8ui a9, a3, 0 # byte 0 from s2
+ movi a10, 3 # mask
+ movi a5, 0xfffffffc
+ or a11, a2, a3
+ movi a4, MASK0 # mask for byte 0
+ movi a7, MASK4
+ addi a3, a3, -8
+ addi a2, a2, -8
+ and a5, a5, a2
+ bne.w18 a8, a9, .Lretdiff
+ l32i a8, a5, 8 # get word from aligned variant of s1
+
+ bany.w18 a11, a10, .Lnot_aligned
+
+/* s1 is word-aligned; s2 is word-aligned.
+
+ If the zero-overhead loop option is available, use an (almost)
+ infinite zero-overhead loop with conditional exits so we only pay
+ for taken branches when exiting the loop. */
+
+/* New algorithm, relying on the fact that all normal ASCII is between
+ 32 and 127.
+
+ Rather than check all bytes for zero:
+ Take one word (4 bytes). Call it w1.
+ Shift w1 left by one into w1'.
+ Or w1 and w1'. For all normal ASCII bit 6 will be 1; for zero it won't.
+ Check that all 4 bit 6's (one for each byte) are one:
+ If they are, we are definitely not done.
+ If they are not, we are probably done, but need to check for zero. */
+
+.Laligned:
+ /* Loop forever */
+1:
+ loop a0, .Laligned_done
+
+ /* First unrolled loop body. */
+ l32i a9, a3, 8 # get word from s2
+ addi a3, a3, 8 # advance s2 pointer
+ slli a5, a8, 1
+ or a10, a8, a5
+ {l32i a11, a2, 12 # get word from s1+4
+ bne.w18 a8, a9, .Lwne2}
+ l32i a9, a3, 4 # get word from s2+4
+ bnall.w18 a10, a7, .Lprobeq
+
+ /* Second unrolled loop body. */
+ slli a5, a11, 1
+ or a10, a11, a5
+ addi a2, a2, 8 # advance s1 pointer
+ mov a8, a11
+ bne.w18 a11, a9, .Lwne2
+ l32i a8, a2, 8 # get word from s1
+ bnall.w18 a10, a7, .Lprobeq2
+
+.Laligned_done:
+ l32i a8, a2, 8 # get word from s1
+ j 1b
+
+.Lnot_aligned:
+ xor a11, a2, a3 # compare low two bits of s1 and s2
+ bany a11, a10, .Lunaligned # if they have different alignment
+
+ /* s1/s2 are not word-aligned. */
+ movi a5, 0xfffffffc
+ addi a2, a2, 1 # advance s1
+ beqz a9, .Leq # bytes equal, if zero, strings are equal
+ addi a3, a3, 1 # advance s2
+ and a6, a2, a5
+ l32i a8, a6, 8 # get word from s1
+ bnone a2, a10, .Laligned # if s1/s2 now aligned
+ l8ui a8, a2, 8 # byte 1 from s1
+ l8ui a9, a3, 8 # byte 1 from s2
+ addi a2, a2, 1 # advance s1
+ bne a8, a9, .Lretdiff # if different, return difference
+ beqz a8, .Leq # bytes equal, if zero, strings are equal
+ addi a3, a3, 1 # advance s2
+ and a6, a2, a5
+ l32i a8, a6, 8 # get word from s1
+ bnone a2, a10, .Laligned # if s1/s2 now aligned
+ l8ui a8, a2, 8 # byte 2 from s1
+ l8ui a9, a3, 8 # byte 2 from s2
+ addi a2, a2, 1 # advance s1
+ bne a8, a9, .Lretdiff # if different, return difference
+ beqz a8, .Leq # bytes equal, if zero, strings are equal
+ addi a3, a3, 1 # advance s2
+ l32i a8, a2, 8 # get word from s1
+ j .Laligned
+
+/* s1 and s2 have different alignment.
+
+ If the zero-overhead loop option is available, use an (almost)
+ infinite zero-overhead loop with conditional exits so we only pay
+ for taken branches when exiting the loop.
+
+ Note: It is important for this unaligned case to come before the
+ code for aligned strings, because otherwise some of the branches
+ above cannot reach and have to be transformed to branches around
+ jumps. The unaligned code is smaller and the branches can reach
+ over it. */
+
+.Lunaligned:
+ movi.n a8, 0 # set up for the maximum loop count
+ loop a8, .Lretdiff # loop forever (almost anyway)
+ l8ui a8, a2, 8
+ l8ui a9, a3, 8
+ addi a2, a2, 1
+ bne a8, a9, .Lretdiff
+ addi a3, a3, 1
+ beqz a8, .Lretdiff
+.Lretdiff:
+ sub a2, a8, a9
+ RET(16)
+
+
+.Lprobeq2:
+ /* Adjust pointers to account for the loop unrolling. */
+ mov a8, a11
+ addi a2, a2, -4
+ addi a3, a3, 4
+
+ /* align (0 mod 4) */
+.Lprobeq:
+ /* Words are probably equal, but check for sure.
+ If not, loop over the rest of string using normal algorithm. */
+
+ bnone a8, a4, .Leq # if byte 0 is zero
+ movi a5, MASK1 # mask for byte 1
+ movi a6, MASK2 # mask for byte 2
+ bnone a8, a5, .Leq # if byte 1 is zero
+ movi a7, MASK3 # mask for byte 3
+ bnone a8, a6, .Leq # if byte 2 is zero
+ bnone a8, a7, .Leq # if byte 3 is zero
+ /* align (1 mod 4) */
+ addi.n a2, a2, 12 # advance s1 pointer
+ addi.n a3, a3, 4 # advance s2 pointer
+ /* align (1 mod 4) or (2 mod 4) */
+1:
+ loop a0, .Lend # loop forever (a4 is bigger than max iters)
+
+ l32i a8, a2, 0 # get word from s1
+ l32i a9, a3, 0 # get word from s2
+ addi a2, a2, 4 # advance s1 pointer
+ bne a8, a9, .Lwne
+ bnone a8, a4, .Leq # if byte 0 is zero
+ bnone a8, a5, .Leq # if byte 1 is zero
+ bnone a8, a6, .Leq # if byte 2 is zero
+ bnone a8, a7, .Leq # if byte 3 is zero
+ addi a3, a3, 4 # advance s2 pointer
+.Lend:
+ j 1b
+
+ /* Words are equal; some byte is zero. */
+.Leq: movi a2, 0 # return equal
+ RET(16)
+
+.Lwne2: /* Words are not equal. On big-endian processors, if none of the
+ bytes are zero, the return value can be determined by a simple
+ comparison. */
+.Lwne: /* Words are not equal. */
+ xor a2, a8, a9 # get word with nonzero in byte that differs
+ extui a10, a8, 0, 8
+ extui a11, a9, 0, 8
+ movi a5, MASK1 # mask for byte 1
+ bany.w18 a2, a4, .Ldiff0 # if byte 0 differs
+
+ bnone.w18 a8, a4, .Leq # if byte 0 is zero
+ movi a6, MASK2 # mask for byte 2
+ bany.w18 a2, a5, .Ldiff1 # if byte 1 differs
+ extui a10, a8, 24, 8
+ bnone.w18 a8, a5, .Leq # if byte 1 is zero
+ extui a11, a9, 24, 8
+ bany.w18 a2, a6, .Ldiff2 # if byte 2 differs
+ sub a2, a10, a11
+ bnone.w18 a8, a6, .Leq # if byte 2 is zero
+ /* Little-endian is a little more difficult because we can't subtract
+ whole words. */
+.Ldiff3:
+ /* Bytes 0-2 are equal; byte 3 is different.
+ For little-endian we need a sign bit for the difference. */
+ RET(16)
+.Ldiff0:
+ /* Byte 0 is different. */
+ sub a2, a10, a11
+ RET(16)
+
+.Ldiff1:
+ /* Byte 0 is equal; byte 1 is different. */
+ extui a10, a8, 8, 8
+ extui a11, a9, 8, 8
+ sub a2, a10, a11
+ RET(16)
+
+.Ldiff2:
+ /* Bytes 0-1 are equal; byte 2 is different. */
+ extui a10, a8, 16, 8
+ extui a11, a9, 16, 8
+ sub a2, a10, a11
+ RET(16)
+
+#else
+#if XCHAL_HAVE_LOOPS && XCHAL_HAVE_DENSITY && !XCHAL_HAVE_BE && XCHAL_HAVE_PDX4
+/* Fast version for PDX4 Little Endian */
+
+
+ ENTRY(16)
+ /* a2 = s1, a3 = s2 */
+
+ l8ui a8, a2, 0 # byte 0 from s1
+ l8ui a9, a3, 0 # byte 0 from s2
+ movi a10, 3 # mask
+ movi a5, 0xfffffffc
+ or a11, a2, a3
+ movi a4, MASK0 # mask for byte 0
+ movi a7, MASK4
+ addi a3, a3, -8
+ addi a2, a2, -8
+ and a5, a5, a2
+ bne.w15 a8, a9, .Lretdiff
+ l32i a8, a5, 8 # get word from aligned variant of s1
+
+ bany.w15 a11, a10, .Lnot_aligned
+
+/* s1 is word-aligned; s2 is word-aligned.
+
+ If the zero-overhead loop option is available, use an (almost)
+ infinite zero-overhead loop with conditional exits so we only pay
+ for taken branches when exiting the loop. */
+
+/* New algorithm, relying on the fact that all normal ASCII is between
+ 32 and 127.
+
+ Rather than check all bytes for zero:
+ Take one word (4 bytes). Call it w1.
+ Shift w1 left by one into w1'.
+ Or w1 and w1'. For all normal ASCII bit 6 will be 1; for zero it won't.
+ Check that all 4 bit 6's (one for each byte) are one:
+ If they are, we are definitely not done.
+ If they are not, we are probably done, but need to check for zero. */
+
+.Laligned:
+ /* Loop forever */
+1:
+ loop a0, .Laligned_done
+
+ /* First unrolled loop body. */
+ l32i a9, a3, 8 # get word from s2
+ addi a3, a3, 8 # advance s2 pointer
+ slli a5, a8, 1
+ or a10, a8, a5
+ {
+ bne.w15 a8, a9, .Lwne2
+ l32i a11, a2, 12 # get word from s1+4
+ nop
+ nop
+ }
+ l32i a9, a3, 4 # get word from s2+4
+ bnall.w15 a10, a7, .Lprobeq
+
+ /* Second unrolled loop body. */
+ slli a5, a11, 1
+ or a10, a11, a5
+ addi a2, a2, 8 # advance s1 pointer
+ mov a8, a11
+ bne.w15 a11, a9, .Lwne2
+ l32i a8, a2, 8 # get word from s1
+ bnall.w15 a10, a7, .Lprobeq2
+
+.Laligned_done:
+ l32i a8, a2, 8 # get word from s1
+ j 1b
+
+.Lnot_aligned:
+ xor a11, a2, a3 # compare low two bits of s1 and s2
+ bany a11, a10, .Lunaligned # if they have different alignment
+
+ /* s1/s2 are not word-aligned. */
+ movi a5, 0xfffffffc
+ addi a2, a2, 1 # advance s1
+ beqz a9, .Leq # bytes equal, if zero, strings are equal
+ addi a3, a3, 1 # advance s2
+ and a6, a2, a5
+ l32i a8, a6, 8 # get word from s1
+ bnone a2, a10, .Laligned # if s1/s2 now aligned
+ l8ui a8, a2, 8 # byte 1 from s1
+ l8ui a9, a3, 8 # byte 1 from s2
+ addi a2, a2, 1 # advance s1
+ bne a8, a9, .Lretdiff # if different, return difference
+ beqz a8, .Leq # bytes equal, if zero, strings are equal
+ addi a3, a3, 1 # advance s2
+ and a6, a2, a5
+ l32i a8, a6, 8 # get word from s1
+ bnone a2, a10, .Laligned # if s1/s2 now aligned
+ l8ui a8, a2, 8 # byte 2 from s1
+ l8ui a9, a3, 8 # byte 2 from s2
+ addi a2, a2, 1 # advance s1
+ bne a8, a9, .Lretdiff # if different, return difference
+ beqz a8, .Leq # bytes equal, if zero, strings are equal
+ addi a3, a3, 1 # advance s2
+ l32i a8, a2, 8 # get word from s1
+ j .Laligned
+
+/* s1 and s2 have different alignment.
+
+ If the zero-overhead loop option is available, use an (almost)
+ infinite zero-overhead loop with conditional exits so we only pay
+ for taken branches when exiting the loop.
+
+ Note: It is important for this unaligned case to come before the
+ code for aligned strings, because otherwise some of the branches
+ above cannot reach and have to be transformed to branches around
+ jumps. The unaligned code is smaller and the branches can reach
+ over it. */
+
+.Lunaligned:
+ movi.n a8, 0 # set up for the maximum loop count
+ loop a8, .Lretdiff # loop forever (almost anyway)
+ l8ui a8, a2, 8
+ l8ui a9, a3, 8
+ addi a2, a2, 1
+ bne a8, a9, .Lretdiff
+ addi a3, a3, 1
+ beqz a8, .Lretdiff
+.Lretdiff:
+ sub a2, a8, a9
+ RET(16)
+
+
+.Lprobeq2:
+ /* Adjust pointers to account for the loop unrolling. */
+ mov a8, a11
+ addi a2, a2, -4
+ addi a3, a3, 4
+
+ /* align (0 mod 4) */
+.Lprobeq:
+ /* Words are probably equal, but check for sure.
+ If not, loop over the rest of string using normal algorithm. */
+
+ bnone a8, a4, .Leq # if byte 0 is zero
+ movi a5, MASK1 # mask for byte 1
+ movi a6, MASK2 # mask for byte 2
+ bnone a8, a5, .Leq # if byte 1 is zero
+ movi a7, MASK3 # mask for byte 3
+ bnone a8, a6, .Leq # if byte 2 is zero
+ bnone a8, a7, .Leq # if byte 3 is zero
+ /* align (1 mod 4) */
+ addi.n a2, a2, 12 # advance s1 pointer
+ addi.n a3, a3, 4 # advance s2 pointer
+ /* align (1 mod 4) or (2 mod 4) */
+1:
+ loop a0, .Lend # loop forever (a4 is bigger than max iters)
+
+ l32i a8, a2, 0 # get word from s1
+ l32i a9, a3, 0 # get word from s2
+ addi a2, a2, 4 # advance s1 pointer
+ bne a8, a9, .Lwne
+ bnone a8, a4, .Leq # if byte 0 is zero
+ bnone a8, a5, .Leq # if byte 1 is zero
+ bnone a8, a6, .Leq # if byte 2 is zero
+ bnone a8, a7, .Leq # if byte 3 is zero
+ addi a3, a3, 4 # advance s2 pointer
+.Lend:
+ j 1b
+
+ /* Words are equal; some byte is zero. */
+.Leq: movi a2, 0 # return equal
+ RET(16)
+
+.Lwne2: /* Words are not equal. On big-endian processors, if none of the
+ bytes are zero, the return value can be determined by a simple
+ comparison. */
+.Lwne: /* Words are not equal. */
+ xor a2, a8, a9 # get word with nonzero in byte that differs
+ extui a10, a8, 0, 8
+ extui a11, a9, 0, 8
+ movi a5, MASK1 # mask for byte 1
+ bany.w15 a2, a4, .Ldiff0 # if byte 0 differs
+
+ bnone.w15 a8, a4, .Leq # if byte 0 is zero
+ movi a6, MASK2 # mask for byte 2
+ bany.w15 a2, a5, .Ldiff1 # if byte 1 differs
+ extui a10, a8, 24, 8
+ bnone.w15 a8, a5, .Leq # if byte 1 is zero
+ extui a11, a9, 24, 8
+ bany.w15 a2, a6, .Ldiff2 # if byte 2 differs
+ sub a2, a10, a11
+ bnone.w15 a8, a6, .Leq # if byte 2 is zero
+ /* Little-endian is a little more difficult because we can't subtract
+ whole words. */
+.Ldiff3:
+ /* Bytes 0-2 are equal; byte 3 is different.
+ For little-endian we need a sign bit for the difference. */
+ RET(16)
+.Ldiff0:
+ /* Byte 0 is different. */
+ sub a2, a10, a11
+ RET(16)
+
+.Ldiff1:
+ /* Byte 0 is equal; byte 1 is different. */
+ extui a10, a8, 8, 8
+ extui a11, a9, 8, 8
+ sub a2, a10, a11
+ RET(16)
+
+.Ldiff2:
+ /* Bytes 0-1 are equal; byte 2 is different. */
+ extui a10, a8, 16, 8
+ extui a11, a9, 16, 8
+ sub a2, a10, a11
+ RET(16)
+
+
+#else /* Not FLIX3/PDX4 */
+ ENTRY(16)
+ /* a2 = s1, a3 = s2 */
+
+ l8ui a8, a2, 0 # byte 0 from s1
+ l8ui a9, a3, 0 # byte 0 from s2
+ movi a10, 3 # mask
+ bne a8, a9, .Lretdiff
+
+ or a11, a2, a3
+ bnone a11, a10, .Laligned
+
+ xor a11, a2, a3 # compare low two bits of s1 and s2
+ bany a11, a10, .Lunaligned # if they have different alignment
+
+ /* s1/s2 are not word-aligned. */
+ addi a2, a2, 1 # advance s1
+ beqz a8, .Leq # bytes equal, if zero, strings are equal
+ addi a3, a3, 1 # advance s2
+ bnone a2, a10, .Laligned # if s1/s2 now aligned
+ l8ui a8, a2, 0 # byte 1 from s1
+ l8ui a9, a3, 0 # byte 1 from s2
+ addi a2, a2, 1 # advance s1
+ bne a8, a9, .Lretdiff # if different, return difference
+ beqz a8, .Leq # bytes equal, if zero, strings are equal
+ addi a3, a3, 1 # advance s2
+ bnone a2, a10, .Laligned # if s1/s2 now aligned
+ l8ui a8, a2, 0 # byte 2 from s1
+ l8ui a9, a3, 0 # byte 2 from s2
+ addi a2, a2, 1 # advance s1
+ bne a8, a9, .Lretdiff # if different, return difference
+ beqz a8, .Leq # bytes equal, if zero, strings are equal
+ addi a3, a3, 1 # advance s2
+ j .Laligned
+
+/* s1 and s2 have different alignment.
+
+ If the zero-overhead loop option is available, use an (almost)
+ infinite zero-overhead loop with conditional exits so we only pay
+ for taken branches when exiting the loop.
+
+ Note: It is important for this unaligned case to come before the
+ code for aligned strings, because otherwise some of the branches
+ above cannot reach and have to be transformed to branches around
+ jumps. The unaligned code is smaller and the branches can reach
+ over it. */
+
+ .align 4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+ /* (2 mod 4) alignment for loop instruction */
+#else
+ /* (1 mod 4) alignment for loop instruction */
+ .byte 0
+ .byte 0
+#endif
+#endif
+.Lunaligned:
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+ _movi.n a8, 0 # set up for the maximum loop count
+#else
+ _movi a8, 0 # set up for the maximum loop count
+#endif
+ loop a8, .Lretdiff # loop forever (almost anyway)
+#endif
+.Lnextbyte:
+ l8ui a8, a2, 0
+ l8ui a9, a3, 0
+ addi a2, a2, 1
+ bne a8, a9, .Lretdiff
+ addi a3, a3, 1
+#if XCHAL_HAVE_LOOPS
+ beqz a8, .Lretdiff
+#else
+ bnez a8, .Lnextbyte
+#endif
+.Lretdiff:
+ sub a2, a8, a9
+ RET(16)
+
+/* s1 is word-aligned; s2 is word-aligned.
+
+ If the zero-overhead loop option is available, use an (almost)
+ infinite zero-overhead loop with conditional exits so we only pay
+ for taken branches when exiting the loop. */
+
+/* New algorithm, relying on the fact that all normal ASCII is between
+ 32 and 127.
+
+ Rather than check all bytes for zero:
+ Take one word (4 bytes). Call it w1.
+ Shift w1 left by one into w1'.
+ Or w1 and w1'. For all normal ASCII bit 6 will be 1; for zero it won't.
+ Check that all 4 bit 6's (one for each byte) are one:
+ If they are, we are definitely not done.
+ If they are not, we are probably done, but need to check for zero. */
+
+ .align 4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_CONST16
+ /* (2 mod 4) alignment for loop instruction */
+ .byte 0
+#endif
+.Laligned:
+ movi a4, MASK0 # mask for byte 0
+ movi a7, MASK4
+
+ /* Loop forever */
+1:
+ loop a0, .Laligned_done
+
+ /* First unrolled loop body. */
+ l32i a8, a2, 0 # get word from s1
+ l32i a9, a3, 0 # get word from s2
+ slli a5, a8, 1
+ bne a8, a9, .Lwne2
+ or a9, a8, a5
+ bnall a9, a7, .Lprobeq
+
+ /* Second unrolled loop body. */
+ l32i a8, a2, 4 # get word from s1+4
+ l32i a9, a3, 4 # get word from s2+4
+ slli a5, a8, 1
+ bne a8, a9, .Lwne2
+ or a9, a8, a5
+ bnall a9, a7, .Lprobeq2
+
+ addi a2, a2, 8 # advance s1 pointer
+ addi a3, a3, 8 # advance s2 pointer
+.Laligned_done:
+ j 1b
+
+.Lprobeq2:
+ /* Adjust pointers to account for the loop unrolling. */
+ addi a2, a2, 4
+ addi a3, a3, 4
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+.Laligned:
+ movi a4, MASK0 # mask for byte 0
+ movi a7, MASK4
+ j .Lfirstword
+.Lnextword:
+ addi a2, a2, 4 # advance s1 pointer
+ addi a3, a3, 4 # advance s2 pointer
+.Lfirstword:
+ l32i a8, a2, 0 # get word from s1
+ l32i a9, a3, 0 # get word from s2
+ slli a5, a8, 1
+ bne a8, a9, .Lwne2
+ or a9, a8, a5
+ ball a9, a7, .Lnextword
+#endif /* !XCHAL_HAVE_LOOPS */
+
+ /* align (0 mod 4) */
+.Lprobeq:
+ /* Words are probably equal, but check for sure.
+ If not, loop over the rest of string using normal algorithm. */
+
+ bnone a8, a4, .Leq # if byte 0 is zero
+ movi a5, MASK1 # mask for byte 1
+ movi a6, MASK2 # mask for byte 2
+ bnone a8, a5, .Leq # if byte 1 is zero
+ movi a7, MASK3 # mask for byte 3
+ bnone a8, a6, .Leq # if byte 2 is zero
+ bnone a8, a7, .Leq # if byte 3 is zero
+ /* align (1 mod 4) */
+#if XCHAL_HAVE_DENSITY
+ addi.n a2, a2, 4 # advance s1 pointer
+ addi.n a3, a3, 4 # advance s2 pointer
+ /* align (1 mod 4) or (2 mod 4) */
+#else
+ addi a2, a2, 4 # advance s1 pointer
+ addi a3, a3, 4 # advance s2 pointer
+ or a1, a1, a1 # nop
+#if XCHAL_HAVE_CONST16
+ or a1, a1, a1 # nop
+#endif
+ /* align (2 mod 4) */
+#endif /* XCHAL_HAVE_DENSITY */
+#if XCHAL_HAVE_LOOPS
+1:
+ loop a0, .Leq # loop forever (a4 is bigger than max iters)
+ l32i a8, a2, 0 # get word from s1
+ l32i a9, a3, 0 # get word from s2
+ addi a2, a2, 4 # advance s1 pointer
+ bne a8, a9, .Lwne
+ bnone a8, a4, .Leq # if byte 0 is zero
+ bnone a8, a5, .Leq # if byte 1 is zero
+ bnone a8, a6, .Leq # if byte 2 is zero
+ bnone a8, a7, .Leq # if byte 3 is zero
+ addi a3, a3, 4 # advance s2 pointer
+ j 1b
+#else /* !XCHAL_HAVE_LOOPS */
+
+ j .Lfirstword2
+.Lnextword2:
+ addi a3, a3, 4 # advance s2 pointer
+.Lfirstword2:
+ l32i a8, a2, 0 # get word from s1
+ l32i a9, a3, 0 # get word from s2
+ addi a2, a2, 4 # advance s1 pointer
+ bne a8, a9, .Lwne
+ bnone a8, a4, .Leq # if byte 0 is zero
+ bnone a8, a5, .Leq # if byte 1 is zero
+ bnone a8, a6, .Leq # if byte 2 is zero
bany a8, a7, .Lnextword2 # loop if byte 3 is nonzero
+#endif /* !XCHAL_HAVE_LOOPS */
+
+ /* Words are equal; some byte is zero. */
+.Leq: movi a2, 0 # return equal
+ RET(16)
+
+.Lwne2: /* Words are not equal. On big-endian processors, if none of the
+ bytes are zero, the return value can be determined by a simple
+ comparison. */
+#if XCHAL_HAVE_BE
+ or a10, a8, a5
+ bnall a10, a7, .Lsomezero
+ bgeu a8, a9, .Lposreturn
+ movi a2, -1
+ RET(16)
+.Lposreturn:
+ movi a2, 1
+ RET(16)
+.Lsomezero: # There is probably some zero byte.
+#endif /* XCHAL_HAVE_BE */
+.Lwne: /* Words are not equal. */
+ xor a2, a8, a9 # get word with nonzero in byte that differs
+ bany a2, a4, .Ldiff0 # if byte 0 differs
+ movi a5, MASK1 # mask for byte 1
+ bnone a8, a4, .Leq # if byte 0 is zero
+ bany a2, a5, .Ldiff1 # if byte 1 differs
+ movi a6, MASK2 # mask for byte 2
+ bnone a8, a5, .Leq # if byte 1 is zero
+ bany a2, a6, .Ldiff2 # if byte 2 differs
+ bnone a8, a6, .Leq # if byte 2 is zero
+#if XCHAL_HAVE_BE
+.Ldiff3:
+.Ldiff2:
+.Ldiff1:
+ /* Byte 0 is equal (at least) and there is a difference before a zero
+ byte. Just subtract words to get the return value.
+ The high order equal bytes cancel, leaving room for the sign. */
+ sub a2, a8, a9
+ RET(16)
+
+.Ldiff0:
+ /* Need to make room for the sign, so can't subtract whole words. */
+ extui a10, a8, 24, 8
+ extui a11, a9, 24, 8
+ sub a2, a10, a11
+ RET(16)
+
+#else /* !XCHAL_HAVE_BE */
+ /* Little-endian is a little more difficult because we can't subtract
+ whole words. */
+.Ldiff3:
+ /* Bytes 0-2 are equal; byte 3 is different.
+ For little-endian we need a sign bit for the difference. */
+ extui a10, a8, 24, 8
+ extui a11, a9, 24, 8
+ sub a2, a10, a11
+ RET(16)
+
+.Ldiff0:
+ /* Byte 0 is different. */
+ extui a10, a8, 0, 8
+ extui a11, a9, 0, 8
+ sub a2, a10, a11
+ RET(16)
+
+.Ldiff1:
+ /* Byte 0 is equal; byte 1 is different. */
+ extui a10, a8, 8, 8
+ extui a11, a9, 8, 8
+ sub a2, a10, a11
+ RET(16)
+
+.Ldiff2:
+ /* Bytes 0-1 are equal; byte 2 is different. */
+ extui a10, a8, 16, 8
+ extui a11, a9, 16, 8
+ sub a2, a10, a11
+ RET(16)
+
+#endif /* !XCHAL_HAVE_BE */
+#endif /* PDX4 */
+#endif /* FLIX3 */
+
+ .end schedule
+ .size strcmp, . - strcmp
+
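
The "bit 6" zero probe that the comments above describe can be written in C as follows; a hypothetical sketch, with MASK4 = 0x40404040 as defined in this file:

#include <stdbool.h>
#include <stdint.h>

#define MASK4 0x40404040u     /* bit 6 of each byte lane */

/* True when the word definitely contains no zero byte, assuming
 * "normal" ASCII (32..127): for such bytes at least one of bits 5
 * and 6 is set, so bit 6 of (w | (w << 1)) is set in every lane.
 * A false result only means a zero byte is possible (byte values
 * with bits 5 and 6 both clear also fail the probe), so the caller
 * rechecks each byte, as the .Lprobeq path does.
 */

static bool probably_no_zero(uint32_t w)
{
  return ((w | (w << 1)) & MASK4) == MASK4;
}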
diff --git a/libs/libc/machine/xtensa/arch_strcpy.S b/libs/libc/machine/xtensa/arch_strcpy.S
new file mode 100644
index 0000000..b062d87
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_strcpy.S
@@ -0,0 +1,243 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_strcpy.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership. The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+ .section .text
+ .begin schedule
+ .align 4
+ .literal_position
+ .global strcpy
+ .type strcpy, @function
+strcpy:
+ ENTRY(16)
+ /* a2 = dst, a3 = src */
+
+ mov a10, a2 # leave dst in return value register
+ movi a4, MASK0
+ movi a5, MASK1
+ movi a6, MASK2
+ movi a7, MASK3
+ bbsi.l a3, 0, .Lsrc1mod2
+ bbsi.l a3, 1, .Lsrc2mod4
+.Lsrcaligned:
+
+ /* Check if the destination is aligned. */
+ movi a8, 3
+ bnone a10, a8, .Laligned
+
+ j .Ldstunaligned
+
+.Lsrc1mod2: # src address is odd
+ l8ui a8, a3, 0 # get byte 0
+ addi a3, a3, 1 # advance src pointer
+ s8i a8, a10, 0 # store byte 0
+ beqz a8, 1f # if byte 0 is zero
+ addi a10, a10, 1 # advance dst pointer
+ bbci.l a3, 1, .Lsrcaligned # if src is now word-aligned
+
+.Lsrc2mod4: # src address is 2 mod 4
+ l8ui a8, a3, 0 # get byte 0
+ /* 1-cycle interlock */
+ s8i a8, a10, 0 # store byte 0
+ beqz a8, 1f # if byte 0 is zero
l8ui a8, a3, 1 # get byte 1
addi a3, a3, 2 # advance src pointer
s8i a8, a10, 1 # store byte 1
+ addi a10, a10, 2 # advance dst pointer
+ bnez a8, .Lsrcaligned
+1: RET(16)
+
+
+/* dst is word-aligned; src is word-aligned. */
+
+ .align 4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+ /* (2 mod 4) alignment for loop instruction */
+#else
+ /* (1 mod 4) alignment for loop instruction */
+ .byte 0
+ .byte 0
+#endif
+.Laligned:
+#if XCHAL_HAVE_DENSITY
+ _movi.n a8, 0 # set up for the maximum loop count
+#else
+ _movi a8, 0 # set up for the maximum loop count
+#endif
+ loop a8, .Lz3 # loop forever (almost anyway)
+ l32i a8, a3, 0 # get word from src
+ addi a3, a3, 4 # advance src pointer
+ bnone a8, a4, .Lz0 # if byte 0 is zero
+ bnone a8, a5, .Lz1 # if byte 1 is zero
+ bnone a8, a6, .Lz2 # if byte 2 is zero
+ s32i a8, a10, 0 # store word to dst
+ bnone a8, a7, .Lz3 # if byte 3 is zero
+ addi a10, a10, 4 # advance dst pointer
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+1: addi a10, a10, 4 # advance dst pointer
+.Laligned:
+ l32i a8, a3, 0 # get word from src
+ addi a3, a3, 4 # advance src pointer
+ bnone a8, a4, .Lz0 # if byte 0 is zero
+ bnone a8, a5, .Lz1 # if byte 1 is zero
+ bnone a8, a6, .Lz2 # if byte 2 is zero
+ s32i a8, a10, 0 # store word to dst
bany a8, a7, 1b # loop if byte 3 is nonzero
+#endif /* !XCHAL_HAVE_LOOPS */
+
+.Lz3: /* Byte 3 is zero. */
+ RET(16)
+
+.Lz0: /* Byte 0 is zero. */
+#if XCHAL_HAVE_BE
+ movi a8, 0
+#endif
+ s8i a8, a10, 0
+ RET(16)
+
+.Lz1: /* Byte 1 is zero. */
+#if XCHAL_HAVE_BE
+ extui a8, a8, 16, 16
+#endif
+ s16i a8, a10, 0
+ RET(16)
+
+.Lz2: /* Byte 2 is zero. */
+#if XCHAL_HAVE_BE
+ extui a8, a8, 16, 16
+#endif
+ s16i a8, a10, 0
+ movi a8, 0
+ s8i a8, a10, 2
+ RET(16)
+
+#if 1
+/* For now just use byte copy loop for the unaligned destination case. */
+
+ .align 4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+ /* (2 mod 4) alignment for loop instruction */
+#else
+ /* (1 mod 4) alignment for loop instruction */
+ .byte 0
+ .byte 0
+#endif
+#endif
+.Ldstunaligned:
+
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+ _movi.n a8, 0 # set up for the maximum loop count
+#else
+ _movi a8, 0 # set up for the maximum loop count
+#endif
+ loop a8, 2f # loop forever (almost anyway)
+#endif
+1: l8ui a8, a3, 0
+ addi a3, a3, 1
+ s8i a8, a10, 0
+ addi a10, a10, 1
+#if XCHAL_HAVE_LOOPS
+ beqz a8, 2f
+#else
+ bnez a8, 1b
+#endif
+2: RET(16)
+
+#else /* 0 */
+
+/* This code is not functional yet. */
+
+.Ldstunaligned:
+ l32i a9, a2, 0 # load word from dst
+#if XCHAL_HAVE_BE
+ ssa8b a9 # rotate by dst alignment so that
+ src a9, a9, a9 # shift in loop will put back in place
+ ssa8l a9 # shift left by byte*8
+#else
+ ssa8l a9 # rotate by dst alignment so that
+ src a9, a9, a9 # shift in loop will put back in place
+ ssa8b a9 # shift left by 32-byte*8
+#endif
+
+/* dst is word-aligned; src is unaligned. */
+
+.Ldstunalignedloop:
+ l32i a8, a3, 0 # get word from src
+ /* 1-cycle interlock */
+ bnone a8, a4, .Lu0 # if byte 0 is zero
+ bnone a8, a5, .Lu1 # if byte 1 is zero
+ bnone a8, a6, .Lu2 # if byte 2 is zero
+ src a9, a8, a9 # combine last word and this word
+ s32i a9, a10, 0 # store word to dst
bnone a8, a7, .Lu3 # if byte 3 is zero
+ l32i a9, a3, 4 # get word from src
+ addi a3, a3, 8 # advance src pointer
+ bnone a9, a4, .Lu4 # if byte 0 is zero
+ bnone a9, a5, .Lu5 # if byte 1 is zero
+ bnone a9, a6, .Lu6 # if byte 2 is zero
+ src a8, a9, a8 # combine last word and this word
+ s32i a8, a10, 4 # store word to dst
+ addi a10, a10, 8 # advance dst pointer
+ bany a8, a7, .Ldstunalignedloop # if byte 3 is nonzero, iterate
+
+ /* Byte 7 is zero. */
+.Lu7: RET(16)
+
+.Lu0: /* Byte 0 is zero. */
+#if XCHAL_HAVE_BE
+ movi a8, 0
+#endif
+ s8i a8, a10, 0
+ RET(16)
+
+.Lu1: /* Byte 1 is zero. */
+#if XCHAL_HAVE_BE
+ extui a8, a8, 16, 16
+#endif
+ s16i a8, a10, 0
+ RET(16)
+
+.Lu2: /* Byte 2 is zero. */
+ s16i a8, a10, 0
+ movi a8, 0
+ s8i a8, a10, 2
+ RET(16)
+
+#endif /* 0 */
+ .end schedule
+
+ .size strcpy, . - strcpy
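
A hypothetical C sketch of the aligned copy loop above. MASK0..MASK3 come from xtensa_asm.h, which is not shown in this message; the little-endian values below are assumed for illustration, and both pointers must be word-aligned as in the assembly path:

#include <stdint.h>

#define MASK0 0x000000ffu     /* assumed little-endian byte masks */
#define MASK1 0x0000ff00u
#define MASK2 0x00ff0000u
#define MASK3 0xff000000u

char *sketch_strcpy_aligned(char *dst, const char *src)
{
  uint32_t *d = (uint32_t *)dst;
  const uint32_t *s = (const uint32_t *)src;

  for (; ; )
    {
      uint32_t w = *s++;

      if ((w & MASK0) == 0 || (w & MASK1) == 0 || (w & MASK2) == 0)
        {
          /* Terminator in bytes 0..2: finish with byte stores, as
           * the .Lz0/.Lz1/.Lz2 cases do with s8i/s16i.
           */

          char *p = (char *)d;
          const char *q = (const char *)(s - 1);

          while ((*p++ = *q++) != '\0')
            {
            }

          break;
        }

      *d++ = w;                /* store the whole word */

      if ((w & MASK3) == 0)    /* byte 3 was the terminator */
        {
          break;
        }
    }

  return dst;
}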
diff --git a/libs/libc/machine/xtensa/arch_strlen.S b/libs/libc/machine/xtensa/arch_strlen.S
new file mode 100644
index 0000000..686268e
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_strlen.S
@@ -0,0 +1,123 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_strlen.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership. The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+ .section .text
+ .begin schedule
+ .align 4
+ .literal_position
+ .global strlen
+ .type strlen, @function
+strlen:
+ ENTRY(16)
+ /* a2 = s */
+
+ addi a3, a2, -4 # because we overincrement at the end
+ movi a4, MASK0
+ movi a5, MASK1
+ movi a6, MASK2
+ movi a7, MASK3
+ bbsi.l a2, 0, .L1mod2
+ bbsi.l a2, 1, .L2mod4
+ j .Laligned
+
+.L1mod2: # address is odd
+ l8ui a8, a3, 4 # get byte 0
+ addi a3, a3, 1 # advance string pointer
+ beqz a8, .Lz3 # if byte 0 is zero
+ bbci.l a3, 1, .Laligned # if string pointer is now word-aligned
+
+.L2mod4: # address is 2 mod 4
+ addi a3, a3, 2 # advance ptr for aligned access
+ l32i a8, a3, 0 # get word with first two bytes of string
+ bnone a8, a6, .Lz2 # if byte 2 (of word, not string) is zero
+ bany a8, a7, .Laligned # if byte 3 (of word, not string) is nonzero
+
+ /* Byte 3 is zero. */
+ addi a3, a3, 3 # point to zero byte
+ sub a2, a3, a2 # subtract to get length
+ RET(16)
+
+
+/* String is word-aligned. */
+
+ .align 4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+ /* (2 mod 4) alignment for loop instruction */
+#else
+ /* (1 mod 4) alignment for loop instruction */
+ .byte 0
+ .byte 0
+#endif
+#endif
+.Laligned:
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+ _movi.n a8, 0 # set up for the maximum loop count
+#else
+ _movi a8, 0 # set up for the maximum loop count
+#endif
+ loop a8, .Lz3 # loop forever (almost anyway)
+#endif
+1: l32i a8, a3, 4 # get next word of string
+ addi a3, a3, 4 # advance string pointer
+ bnone a8, a4, .Lz0 # if byte 0 is zero
+ bnone a8, a5, .Lz1 # if byte 1 is zero
+ bnone a8, a6, .Lz2 # if byte 2 is zero
+#if XCHAL_HAVE_LOOPS
+ bnone a8, a7, .Lz3 # if byte 3 is zero
+#else
+ bany a8, a7, 1b # repeat if byte 3 is non-zero
+#endif
+
+.Lz3: /* Byte 3 is zero. */
+ addi a3, a3, 3 # point to zero byte
+ /* Fall through.... */
+
+.Lz0: /* Byte 0 is zero. */
+ sub a2, a3, a2 # subtract to get length
+ RET(16)
+
+.Lz1: /* Byte 1 is zero. */
+ addi a3, a3, 1 # point to zero byte
+ sub a2, a3, a2 # subtract to get length
+ RET(16)
+
+.Lz2: /* Byte 2 is zero. */
+ addi a3, a3, 2 # point to zero byte
+ sub a2, a3, a2 # subtract to get length
+ RET(16)
+
+ .end schedule
+
+ .size strlen, . - strlen
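
The aligned loop above examines one 32-bit word per iteration and tests each
byte lane against the MASK0..MASK3 constants defined in xtensa_asm.h below. A
C sketch of the same word-at-a-time scan, assuming little-endian lane order
(the assembly selects masks to match the configured endianness; the name here
is illustrative):

#include <stdint.h>
#include <stddef.h>

/* Word-at-a-time strlen sketch. The word load mirrors the aligned
 * l32i in the assembly; it is safe here because the pointer has been
 * stepped to a 4-byte boundary first. */

static size_t strlen_wordwise(const char *s)
{
  const char *p = s;

  /* Handle leading bytes until p is word-aligned (.L1mod2/.L2mod4). */

  while (((uintptr_t)p & 3) != 0)
    {
      if (*p == '\0')
        {
          return (size_t)(p - s);
        }

      p++;
    }

  /* Scan one word per iteration, checking each byte lane (.Laligned). */

  for (; ; p += 4)
    {
      uint32_t w = *(const uint32_t *)p;

      if ((w & 0x000000ff) == 0) return (size_t)(p - s);      /* MASK0 */
      if ((w & 0x0000ff00) == 0) return (size_t)(p - s) + 1;  /* MASK1 */
      if ((w & 0x00ff0000) == 0) return (size_t)(p - s) + 2;  /* MASK2 */
      if ((w & 0xff000000) == 0) return (size_t)(p - s) + 3;  /* MASK3 */
    }
}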
diff --git a/libs/libc/machine/xtensa/arch_strncpy.S b/libs/libc/machine/xtensa/arch_strncpy.S
new file mode 100644
index 0000000..297f00c
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_strncpy.S
@@ -0,0 +1,265 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_strncpy.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership. The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+ .section .text
+ .begin schedule
+ .align 4
+ .literal_position
+__strncpy_aux:
+
+.Lsrc1mod2: # src address is odd
+ l8ui a8, a3, 0 # get byte 0
+ addi a3, a3, 1 # advance src pointer
+ s8i a8, a10, 0 # store byte 0
+ addi a4, a4, -1 # decrement n
+ beqz a4, .Lret # if n is zero
+ addi a10, a10, 1 # advance dst pointer
+ beqz a8, .Lfill # if byte 0 is zero
+ bbci.l a3, 1, .Lsrcaligned # if src is now word-aligned
+
+.Lsrc2mod4: # src address is 2 mod 4
+ l8ui a8, a3, 0 # get byte 0
+ addi a4, a4, -1 # decrement n
+ s8i a8, a10, 0 # store byte 0
+ beqz a4, .Lret # if n is zero
+ addi a10, a10, 1 # advance dst pointer
+ beqz a8, .Lfill # if byte 0 is zero
+ l8ui a8, a3, 1 # get byte 1
+ addi a3, a3, 2 # advance src pointer
+ s8i a8, a10, 0 # store byte 1
+ addi a4, a4, -1 # decrement n
+ beqz a4, .Lret # if n is zero
+ addi a10, a10, 1 # advance dst pointer
+ bnez a8, .Lsrcaligned
+ j .Lfill
+
+.Lret:
+ RET(16)
+
+ .align 4
+ .global strncpy
+ .type strncpy, @function
+strncpy:
+ ENTRY(16)
+ /* a2 = dst, a3 = src */
+
+ mov a10, a2 # leave dst in return value register
+ beqz a4, .Lret # if n is zero
+
+ movi a11, MASK0
+ movi a5, MASK1
+ movi a6, MASK2
+ movi a7, MASK3
+ bbsi.l a3, 0, .Lsrc1mod2
+ bbsi.l a3, 1, .Lsrc2mod4
+.Lsrcaligned:
+
+ /* Check if the destination is aligned. */
+ movi a8, 3
+ bnone a10, a8, .Laligned
+
+ j .Ldstunaligned
+
+
+/* Fill the dst with zeros -- n is at least 1. */
+
+.Lfill:
+ movi a9, 0
+ bbsi.l a10, 0, .Lfill1mod2
+ bbsi.l a10, 1, .Lfill2mod4
+.Lfillaligned:
+ blti a4, 4, .Lfillcleanup
+
+ /* Loop filling complete words with zero. */
+#if XCHAL_HAVE_LOOPS
+
+ srai a8, a4, 2
+ loop a8, 1f
+ s32i a9, a10, 0
+ addi a10, a10, 4
+
+1: slli a8, a8, 2
+ sub a4, a4, a8
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+1: s32i a9, a10, 0
+ addi a10, a10, 4
+ addi a4, a4, -4
+ bgei a4, 4, 1b
+
+#endif /* !XCHAL_HAVE_LOOPS */
+
+ beqz a4, 2f
+
+.Lfillcleanup:
+ /* Fill leftover (1 to 3) bytes with zero. */
+ s8i a9, a10, 0 # store byte 0
+ addi a4, a4, -1 # decrement n
+ addi a10, a10, 1
+ bnez a4, .Lfillcleanup
+
+2: RET(16)
+
+.Lfill1mod2: # dst address is odd
+ s8i a9, a10, 0 # store byte 0
+ addi a4, a4, -1 # decrement n
+ beqz a4, 2b # if n is zero
+ addi a10, a10, 1 # advance dst pointer
+ bbci.l a10, 1, .Lfillaligned # if dst is now word-aligned
+
+.Lfill2mod4: # dst address is 2 mod 4
+ s8i a9, a10, 0 # store byte 0
+ addi a4, a4, -1 # decrement n
+ beqz a4, 2b # if n is zero
+ s8i a9, a10, 1 # store byte 1
+ addi a4, a4, -1 # decrement n
+ beqz a4, 2b # if n is zero
+ addi a10, a10, 2 # advance dst pointer
+ j .Lfillaligned
+
+
+/* dst is word-aligned; src is word-aligned; n is at least 1. */
+
+ .align 4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+ /* (2 mod 4) alignment for loop instruction */
+#else
+ /* (1 mod 4) alignment for loop instruction */
+ .byte 0
+ .byte 0
+#endif
+#endif
+.Laligned:
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+ _movi.n a8, 0 # set up for the maximum loop count
+#else
+ _movi a8, 0 # set up for the maximum loop count
+#endif
+ loop a8, 1f # loop forever (almost anyway)
+ blti a4, 5, .Ldstunaligned # n is near limit; do one at a time
+ l32i a8, a3, 0 # get word from src
+ addi a3, a3, 4 # advance src pointer
+ bnone a8, a11, .Lz0 # if byte 0 is zero
+ bnone a8, a5, .Lz1 # if byte 1 is zero
+ bnone a8, a6, .Lz2 # if byte 2 is zero
+ s32i a8, a10, 0 # store word to dst
+ addi a4, a4, -4 # decrement n
+ addi a10, a10, 4 # advance dst pointer
+ bnone a8, a7, .Lfill # if byte 3 is zero
+1:
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+1: blti a4, 5, .Ldstunaligned # n is near limit; do one at a time
+ l32i a8, a3, 0 # get word from src
+ addi a3, a3, 4 # advance src pointer
+ bnone a8, a11, .Lz0 # if byte 0 is zero
+ bnone a8, a5, .Lz1 # if byte 1 is zero
+ bnone a8, a6, .Lz2 # if byte 2 is zero
+ s32i a8, a10, 0 # store word to dst
+ addi a4, a4, -4 # decrement n
+ addi a10, a10, 4 # advance dst pointer
+ bany a8, a7, 1b # repeat if byte 3 is non-zero
+#endif /* !XCHAL_HAVE_LOOPS */
+
+ j .Lfill
+
+.Lz0: /* Byte 0 is zero. */
+#if XCHAL_HAVE_BE
+ movi a8, 0
+#endif
+ s8i a8, a10, 0
+ addi a4, a4, -1 # decrement n
+ addi a10, a10, 1 # advance dst pointer
+ j .Lfill
+
+.Lz1: /* Byte 1 is zero. */
+#if XCHAL_HAVE_BE
+ extui a8, a8, 16, 16
+#endif
+ s16i a8, a10, 0
+ addi a4, a4, -2 # decrement n
+ addi a10, a10, 2 # advance dst pointer
+ j .Lfill
+
+.Lz2: /* Byte 2 is zero. */
+#if XCHAL_HAVE_BE
+ extui a8, a8, 16, 16
+#endif
+ s16i a8, a10, 0
+ movi a8, 0
+ s8i a8, a10, 2
+ addi a4, a4, -3 # decrement n
+ addi a10, a10, 3 # advance dst pointer
+ j .Lfill
+
+ .align 4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+ /* (2 mod 4) alignment for loop instruction */
+#else
+ /* (1 mod 4) alignment for loop instruction */
+ .byte 0
+ .byte 0
+#endif
+#endif
+.Ldstunaligned:
+
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+ _movi.n a8, 0 # set up for the maximum loop count
+#else
+ _movi a8, 0 # set up for the maximum loop count
+#endif
+ loop a8, 2f # loop forever (almost anyway)
+#endif
+1: l8ui a8, a3, 0
+ addi a3, a3, 1
+ s8i a8, a10, 0
+ addi a4, a4, -1
+ beqz a4, 3f
+ addi a10, a10, 1
+#if XCHAL_HAVE_LOOPS
+ beqz a8, 2f
+#else
+ bnez a8, 1b
+#endif
+2: j .Lfill
+
+3: RET(16)
+ .end schedule
+
+ .size strncpy, . - strncpy
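
For orientation, the routine above implements the full strncpy() contract:
copy at most n bytes from src and, if src ends before n bytes, pad the rest
of dst with zeros -- that padding is what the .Lfill paths do. A minimal C
statement of that contract (reference semantics only, not the optimized code
that builds when CONFIG_XTENSA_STRNCPY is selected):

#include <stddef.h>

/* Reference strncpy semantics: copy, then zero-fill. */

static char *strncpy_ref(char *dst, const char *src, size_t n)
{
  size_t i;

  /* Copy up to n bytes, stopping when the NUL is reached. */

  for (i = 0; i < n && src[i] != '\0'; i++)
    {
      dst[i] = src[i];
    }

  /* Zero-fill the remainder (the .Lfill paths above). */

  for (; i < n; i++)
    {
      dst[i] = '\0';
    }

  return dst;
}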
diff --git a/libs/libc/machine/xtensa/xtensa_asm.h b/libs/libc/machine/xtensa/xtensa_asm.h
new file mode 100644
index 0000000..9913763
--- /dev/null
+++ b/libs/libc/machine/xtensa/xtensa_asm.h
@@ -0,0 +1,62 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/xtensa_asm.h
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership. The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include <arch/chip/core-isa.h>
+
+/****************************************************************************
+ * Assembly Language Macros
+ ****************************************************************************/
+
+ .macro src_b r, w0, w1
+#if XCHAL_HAVE_BE
+ src \r, \w0, \w1
+#else
+ src \r, \w1, \w0
+#endif
+ .endm
+
+ .macro ssa8 r
+#if XCHAL_HAVE_BE
+ ssa8b \r
+#else
+ ssa8l \r
+#endif
+ .endm
+
+/****************************************************************************
+ * Pre-processor Macros
+ ****************************************************************************/
+
+#if XCHAL_HAVE_BE
+# define MASK0 0xff000000
+# define MASK1 0x00ff0000
+# define MASK2 0x0000ff00
+# define MASK3 0x000000ff
+#else
+# define MASK0 0x000000ff
+# define MASK1 0x0000ff00
+# define MASK2 0x00ff0000
+# define MASK3 0xff000000
+#endif
+
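
These masks always select string bytes 0..3 in memory order, so the branch
logic in the routines above is endian-neutral; only the constants change. A
small self-check of that mapping (illustrative; assumes the GCC/Clang-style
__BYTE_ORDER__ predefined macro):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Shows that (word & MASK1) tests byte 1 in memory order on either
 * endianness, matching how the assembly uses bnone/bany. */

int main(void)
{
  uint32_t w;
  uint32_t mask1;

  memcpy(&w, "a\0cd", 4);        /* byte 1 (memory order) is zero */

#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  mask1 = 0x00ff0000;            /* MASK1 when XCHAL_HAVE_BE */
#else
  mask1 = 0x0000ff00;            /* MASK1 otherwise */
#endif

  printf("byte 1 zero: %s\n", (w & mask1) == 0 ? "yes" : "no");
  return 0;
}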