Posted to commits@nuttx.apache.org by xi...@apache.org on 2021/11/06 12:39:37 UTC

[incubator-nuttx] branch master updated (4b96c28 -> 5a4140f)

This is an automated email from the ASF dual-hosted git repository.

xiaoxiang pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nuttx.git.


    from 4b96c28  stm32h7:Support SPI SPI_DELAY_CONTROL
     new 580d17c  arch:xtensa: make xtensa_abi.h a global include and update usage
     new cfcff5f  libc:machine:xtensa: add xtensa libc implementation
     new 5a4140f  arch:xtensa: add xtensa setjmp function

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 arch/Kconfig                                       |   1 +
 .../{src/common => include/xtensa}/xtensa_abi.h    |   0
 arch/xtensa/src/common/xtensa_context.S            |   3 +-
 arch/xtensa/src/common/xtensa_coproc.S             |   3 +-
 arch/xtensa/src/common/xtensa_cpuint.S             |   2 +-
 arch/xtensa/src/common/xtensa_int_handlers.S       |   2 +-
 arch/xtensa/src/common/xtensa_loadstore.S          |   2 +-
 arch/xtensa/src/common/xtensa_sigtramp.S           |   2 +-
 arch/xtensa/src/common/xtensa_simcall.S            |   2 +-
 arch/xtensa/src/common/xtensa_vectors.S            |   2 +-
 arch/xtensa/src/common/xtensa_windowspill.S        |   3 +-
 arch/xtensa/src/esp32/esp32_cpuindex.S             |   2 +-
 libs/libc/machine/xtensa/Kconfig                   |  43 ++
 libs/libc/machine/xtensa/Make.defs                 |  35 +-
 libs/libc/machine/xtensa/arch_memcpy.S             | 281 ++++++++
 libs/libc/machine/xtensa/arch_memmove.S            | 480 +++++++++++++
 libs/libc/machine/xtensa/arch_memset.S             | 179 +++++
 libs/libc/machine/xtensa/arch_setjmp.S             | 374 ++++++++++
 libs/libc/machine/xtensa/arch_strcmp.S             | 767 +++++++++++++++++++++
 libs/libc/machine/xtensa/arch_strcpy.S             | 243 +++++++
 libs/libc/machine/xtensa/arch_strlen.S             | 123 ++++
 libs/libc/machine/xtensa/arch_strncpy.S            | 265 +++++++
 .../libc/machine/xtensa/xtensa_asm.h               |  49 +-
 23 files changed, 2829 insertions(+), 34 deletions(-)
 rename arch/xtensa/{src/common => include/xtensa}/xtensa_abi.h (100%)
 create mode 100644 libs/libc/machine/xtensa/arch_memcpy.S
 create mode 100644 libs/libc/machine/xtensa/arch_memmove.S
 create mode 100644 libs/libc/machine/xtensa/arch_memset.S
 create mode 100644 libs/libc/machine/xtensa/arch_setjmp.S
 create mode 100644 libs/libc/machine/xtensa/arch_strcmp.S
 create mode 100644 libs/libc/machine/xtensa/arch_strcpy.S
 create mode 100644 libs/libc/machine/xtensa/arch_strlen.S
 create mode 100644 libs/libc/machine/xtensa/arch_strncpy.S
 copy include/nuttx/rc/dummy.h => libs/libc/machine/xtensa/xtensa_asm.h (73%)

[incubator-nuttx] 01/03: arch:xtensa: make xtensa_abi.h a global include and update usage

Posted by xi...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

xiaoxiang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nuttx.git

commit 580d17cc02db481c3a8e8006398a73d7d2cdd603
Author: zhuyanlin <zh...@xiaomi.com>
AuthorDate: Thu Oct 28 20:22:09 2021 +0800

    arch:xtensa: make xtensa_abi.h a global include and update usage
    
    N/A
    
    Signed-off-by: zhuyanlin <zh...@xiaomi.com>
---
 arch/xtensa/{src/common => include/xtensa}/xtensa_abi.h | 0
 arch/xtensa/src/common/xtensa_context.S                 | 3 +--
 arch/xtensa/src/common/xtensa_coproc.S                  | 3 +--
 arch/xtensa/src/common/xtensa_cpuint.S                  | 2 +-
 arch/xtensa/src/common/xtensa_int_handlers.S            | 2 +-
 arch/xtensa/src/common/xtensa_loadstore.S               | 2 +-
 arch/xtensa/src/common/xtensa_sigtramp.S                | 2 +-
 arch/xtensa/src/common/xtensa_simcall.S                 | 2 +-
 arch/xtensa/src/common/xtensa_vectors.S                 | 2 +-
 arch/xtensa/src/common/xtensa_windowspill.S             | 3 +--
 arch/xtensa/src/esp32/esp32_cpuindex.S                  | 2 +-
 11 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/arch/xtensa/src/common/xtensa_abi.h b/arch/xtensa/include/xtensa/xtensa_abi.h
similarity index 100%
rename from arch/xtensa/src/common/xtensa_abi.h
rename to arch/xtensa/include/xtensa/xtensa_abi.h
diff --git a/arch/xtensa/src/common/xtensa_context.S b/arch/xtensa/src/common/xtensa_context.S
index 3ac5b77..08c13fb 100644
--- a/arch/xtensa/src/common/xtensa_context.S
+++ b/arch/xtensa/src/common/xtensa_context.S
@@ -63,10 +63,9 @@
 #include <arch/irq.h>
 #include <arch/chip/core-isa.h>
 #include <arch/chip/tie.h>
+#include <arch/xtensa/xtensa_abi.h>
 #include <arch/xtensa/xtensa_specregs.h>
 
-#include "xtensa_abi.h"
-
 /****************************************************************************
  * Public Functions
  ****************************************************************************/
diff --git a/arch/xtensa/src/common/xtensa_coproc.S b/arch/xtensa/src/common/xtensa_coproc.S
index 3b2b6bf..0090004 100644
--- a/arch/xtensa/src/common/xtensa_coproc.S
+++ b/arch/xtensa/src/common/xtensa_coproc.S
@@ -40,14 +40,13 @@
 #include <nuttx/config.h>
 
 #include <arch/xtensa/core.h>
+#include <arch/xtensa/xtensa_abi.h>
 #include <arch/xtensa/xtensa_coproc.h>
 #include <arch/xtensa/xtensa_specregs.h>
 #include <arch/chip/core-isa.h>
 #include <arch/chip/tie.h>
 #include <arch/chip/tie-asm.h>
 
-#include "xtensa_abi.h"
-
 #if XCHAL_CP_NUM > 0
 
 /****************************************************************************
diff --git a/arch/xtensa/src/common/xtensa_cpuint.S b/arch/xtensa/src/common/xtensa_cpuint.S
index f875499..da60961 100644
--- a/arch/xtensa/src/common/xtensa_cpuint.S
+++ b/arch/xtensa/src/common/xtensa_cpuint.S
@@ -40,7 +40,7 @@
 #include <nuttx/config.h>
 #include <arch/chip/core-isa.h>
 
-#include "xtensa_abi.h"
+#include <arch/xtensa/xtensa_abi.h>
 
 #if XCHAL_HAVE_INTERRUPTS
 
diff --git a/arch/xtensa/src/common/xtensa_int_handlers.S b/arch/xtensa/src/common/xtensa_int_handlers.S
index 522c969..458cb78 100644
--- a/arch/xtensa/src/common/xtensa_int_handlers.S
+++ b/arch/xtensa/src/common/xtensa_int_handlers.S
@@ -61,11 +61,11 @@
 
 #include <arch/irq.h>
 #include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
 #include <arch/xtensa/xtensa_specregs.h>
 
 #include "chip.h"
 #include "xtensa.h"
-#include "xtensa_abi.h"
 #include "xtensa_timer.h"
 
 #if !defined(CONFIG_SMP) && CONFIG_ARCH_INTERRUPTSTACK > 15
diff --git a/arch/xtensa/src/common/xtensa_loadstore.S b/arch/xtensa/src/common/xtensa_loadstore.S
index b14fdfe..d979b4b 100644
--- a/arch/xtensa/src/common/xtensa_loadstore.S
+++ b/arch/xtensa/src/common/xtensa_loadstore.S
@@ -24,7 +24,7 @@
 
 #include <nuttx/config.h>
 
-#include "xtensa_abi.h"
+#include <arch/xtensa/xtensa_abi.h>
 
 /****************************************************************************
  * Public Functions
diff --git a/arch/xtensa/src/common/xtensa_sigtramp.S b/arch/xtensa/src/common/xtensa_sigtramp.S
index 9feebd7..6a1d330 100644
--- a/arch/xtensa/src/common/xtensa_sigtramp.S
+++ b/arch/xtensa/src/common/xtensa_sigtramp.S
@@ -26,7 +26,7 @@
 
 #include <nuttx/config.h>
 
-#include "xtensa_abi.h"
+#include <arch/xtensa/xtensa_abi.h>
 
 /****************************************************************************
  * Public Functions
diff --git a/arch/xtensa/src/common/xtensa_simcall.S b/arch/xtensa/src/common/xtensa_simcall.S
index 8361db4..7682856 100644
--- a/arch/xtensa/src/common/xtensa_simcall.S
+++ b/arch/xtensa/src/common/xtensa_simcall.S
@@ -24,7 +24,7 @@
 
 #include <nuttx/config.h>
 
-#include "xtensa_abi.h"
+#include <arch/xtensa/xtensa_abi.h>
 
 /****************************************************************************
  * Public Functions
diff --git a/arch/xtensa/src/common/xtensa_vectors.S b/arch/xtensa/src/common/xtensa_vectors.S
index fd0a662..8f1e893 100644
--- a/arch/xtensa/src/common/xtensa_vectors.S
+++ b/arch/xtensa/src/common/xtensa_vectors.S
@@ -40,10 +40,10 @@
 
 #include <arch/irq.h>
 #include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
 #include <arch/xtensa/xtensa_specregs.h>
 
 #include "xtensa.h"
-#include "xtensa_abi.h"
 
 /****************************************************************************
  * Public Functions
diff --git a/arch/xtensa/src/common/xtensa_windowspill.S b/arch/xtensa/src/common/xtensa_windowspill.S
index f2d3448..76dc81e 100644
--- a/arch/xtensa/src/common/xtensa_windowspill.S
+++ b/arch/xtensa/src/common/xtensa_windowspill.S
@@ -41,11 +41,10 @@
 
 #include <arch/chip/core-isa.h>
 #include <arch/xtensa/core.h>
+#include <arch/xtensa/xtensa_abi.h>
 #include <arch/xtensa/xtensa_specregs.h>
 #include <arch/xtensa/xtensa_corebits.h>
 
-#include "xtensa_abi.h"
-
 /****************************************************************************
  * Public Functions
  ****************************************************************************/
diff --git a/arch/xtensa/src/esp32/esp32_cpuindex.S b/arch/xtensa/src/esp32/esp32_cpuindex.S
index 981fd3b..6951ba7 100644
--- a/arch/xtensa/src/esp32/esp32_cpuindex.S
+++ b/arch/xtensa/src/esp32/esp32_cpuindex.S
@@ -24,7 +24,7 @@
  * Included Files
  ****************************************************************************/
 
-#include "xtensa_abi.h"
+#include <arch/xtensa/xtensa_abi.h>
 #include "chip_macros.h"
 
 /****************************************************************************

[incubator-nuttx] 03/03: arch:xtensa: add xtensa setjmp function

Posted by xi...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

xiaoxiang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nuttx.git

commit 5a4140f020e3f00ac45f7084ec2bdac765b26f1e
Author: zhuyanlin <zh...@xiaomi.com>
AuthorDate: Tue Nov 2 17:25:03 2021 +0800

    arch:xtensa: add xtensa setjmp function
    
    N/A
    
    Signed-off-by: zhuyanlin <zh...@xiaomi.com>
---
 arch/Kconfig                           |   1 +
 libs/libc/machine/xtensa/Make.defs     |   4 +
 libs/libc/machine/xtensa/arch_setjmp.S | 374 +++++++++++++++++++++++++++++++++
 3 files changed, 379 insertions(+)

diff --git a/arch/Kconfig b/arch/Kconfig
index 73a37d2..8720240 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -103,6 +103,7 @@ config ARCH_XTENSA
 	select ARCH_HAVE_CUSTOMOPT
 	select ARCH_HAVE_TESTSET
 	select ARCH_HAVE_STDARG_H
+	select ARCH_HAVE_SETJMP if ARCH_TOOLCHAIN_GNU
 	---help---
 		Cadence® Tensilica® Xtensa® architectures.
 
diff --git a/libs/libc/machine/xtensa/Make.defs b/libs/libc/machine/xtensa/Make.defs
index 379c7da..78fb412 100644
--- a/libs/libc/machine/xtensa/Make.defs
+++ b/libs/libc/machine/xtensa/Make.defs
@@ -34,6 +34,10 @@ ifeq ($(CONFIG_XTENSA_MEMSET),y)
 ASRCS += arch_memset.S
 endif
 
+ifeq ($(CONFIG_ARCH_SETJMP_H),y)
+ASRCS += arch_setjmp.S
+endif
+
 ifeq ($(CONFIG_XTENSA_STRCPY),y)
 ASRCS += arch_strcpy.S
 endif
diff --git a/libs/libc/machine/xtensa/arch_setjmp.S b/libs/libc/machine/xtensa/arch_setjmp.S
new file mode 100644
index 0000000..99b83af
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_setjmp.S
@@ -0,0 +1,374 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_setjmp.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+#if XCHAL_HAVE_WINDOWED && !__XTENSA_CALL0_ABI__
+
+/* Windowed ABI:
+
+   This implementation relies heavily on the Xtensa register window
+   mechanism.  Setjmp flushes all the windows except its own to the
+   stack and then copies registers from the save areas on the stack
+   into the jmp_buf structure, along with the return address of the call
+   to setjmp.  Longjmp invalidates all the windows except its own, and
+   then sets things up so that it will return to the right place,
+   using a window underflow to automatically restore the registers.
+
+   Note that it would probably be sufficient to only copy the
+   registers from setjmp's caller into jmp_buf.  However, we also copy
+   the save area located at the stack pointer of setjmp's caller.
+   This save area will typically remain intact until the longjmp call.
+   The one exception is when there is an intervening alloca in
+   setjmp's caller.  This is certainly an unusual situation and is
+   likely to cause problems in any case (the storage allocated on the
+   stack cannot be safely accessed following the longjmp).  As bad as
+   it is, on most systems this situation would not necessarily lead to
+   a catastrophic failure.  If we did not preserve the extra save area
+   on Xtensa, however, it would.  When setjmp's caller returns after a
+   longjmp, there will be a window underflow; an invalid return
+   address or stack pointer in the save area will almost certainly
+   lead to a crash.  Keeping a copy of the extra save area in the
+   jmp_buf avoids this with only a small additional cost.  If setjmp
+   and longjmp are ever time-critical, this could be removed.
+*/
+
+  .text
+  .align  4
+  .literal_position
+  .global setjmp
+  .type setjmp, @function
+setjmp:
+
+# if XCHAL_HAVE_XEA3
+/*
+  a2 points to the jmp_buf structure, 68 bytes in length:
+  8 * 4 bytes to save the register save area of setjmp, which contains
+  the caller's registers,
+  8 * 4 bytes to save the caller's register save area, which is
+  potentially clobbered by an alloca() in the caller, and
+  4 bytes to save the return address.
+*/
+
+  entry sp, 32
+
+  /* Flush all registers.  */
+  ssai  0
+  spillw
+
+  addi  a7, a1, -32   # find the destination save area
+  s32i  a0, a2, 64
+
+/* Copy the callee register save area to jmp_buf */
+  l32i  a3, a7, 0
+  l32i  a4, a7, 4
+  s32i  a3, a2, 0
+  s32i  a4, a2, 4
+  l32i  a3, a7, 8
+  l32i  a4, a7, 12
+  s32i  a3, a2, 8
+  s32i  a4, a2, 12
+  l32i  a3, a7, 16
+  l32i  a4, a7, 20
+  s32i  a3, a2, 16
+  s32i  a4, a2, 20
+  l32i  a3, a7, 24
+  l32i  a4, a7, 28
+  s32i  a3, a2, 24
+  s32i  a4, a2, 28
+
+/* Keep a copy of the callee register save area to protect against an
+   alloca() (after the setjmp) clobbering the registers needed to return
+   from the caller of setjmp.  */
+
+  l32i  a3, a1, 0
+  l32i  a4, a1, 4
+  s32i  a3, a2, 32
+  s32i  a4, a2, 36
+  l32i  a3, a1, 8
+  l32i  a4, a1, 12
+  s32i  a3, a2, 40
+  s32i  a4, a2, 44
+  l32i  a3, a1, 16
+  l32i  a4, a1, 20
+  s32i  a3, a2, 48
+  s32i  a4, a2, 52
+  l32i  a3, a1, 24
+  l32i  a4, a1, 28
+  s32i  a3, a2, 56
+  s32i  a4, a2, 60
+# else
+  entry sp, 16
+
+  /* Flush registers.  */
+  mov a4, a2      # save a2 (jmp_buf)
+  movi  a2, 0
+  syscall
+  mov a2, a4      # restore a2
+
+  /* Copy the register save area at (sp - 16).  */
+  addi  a5, a1, -16
+  l32i  a3, a5, 0
+  l32i  a4, a5, 4
+  s32i  a3, a2, 0
+  s32i  a4, a2, 4
+  l32i  a3, a5, 8
+  l32i  a4, a5, 12
+  s32i  a3, a2, 8
+  s32i  a4, a2, 12
+
+  /* Copy 0-8 words from the register overflow area.  */
+  extui a3, a0, 30, 2
+  blti  a3, 2, .Lendsj
+  l32i  a7, a1, 4
+  slli  a4, a3, 4
+  sub a5, a7, a4
+  addi  a6, a2, 16
+  addi  a7, a7, -16   # a7 = end of register overflow area
+.Lsjloop:
+  l32i  a3, a5, 0
+  l32i  a4, a5, 4
+  s32i  a3, a6, 0
+  s32i  a4, a6, 4
+  l32i  a3, a5, 8
+  l32i  a4, a5, 12
+  s32i  a3, a6, 8
+  s32i  a4, a6, 12
+  addi  a5, a5, 16
+  addi  a6, a6, 16
+  blt a5, a7, .Lsjloop
+.Lendsj:
+
+  /* Copy the register save area at sp.  */
+  l32i  a3, a1, 0
+  l32i  a4, a1, 4
+  s32i  a3, a2, 48
+  s32i  a4, a2, 52
+  l32i  a3, a1, 8
+  l32i  a4, a1, 12
+  s32i  a3, a2, 56
+  s32i  a4, a2, 60
+
+  /* Save the return address, including the window size bits.  */
+  s32i  a0, a2, 64
+# endif
+
+  movi  a2, 0
+  retw
+  .size setjmp, . - setjmp
+
+/* void longjmp (jmp_buf env, int val) */
+
+  .align  4
+  .literal_position
+  .global longjmp
+  .type longjmp, @function
+longjmp:
+  /*  a2 == &env, a3 == val  */
+#if XCHAL_HAVE_XEA3
+  entry sp, 32
+  ssai  0
+  tossw
+
+  l32i a0, a2, 64
+
+  addi  a7, a1, -32   # find the destination save area
+  l32i  a4, a2, 0
+  l32i  a5, a2, 4
+  s32i  a4, a7, 0
+  s32i  a5, a7, 4
+  l32i  a4, a2, 8
+  l32i  a5, a2, 12
+  s32i  a4, a7, 8
+  s32i  a5, a7, 12
+  l32i  a4, a2, 16
+  l32i  a5, a2, 20
+  s32i  a4, a7, 16
+  s32i  a5, a7, 20
+  l32i  a4, a2, 24
+  l32i  a5, a2, 28
+  s32i  a4, a7, 24
+  s32i  a5, a7, 28
+
+  /* The 8 words saved from the register save area at the target's
+     sp are copied back to the target procedure's save area.  The
+     only point of this is to prevent a catastrophic failure in
+     case the contents were moved by an alloca after calling
+     setjmp.  This is a bit paranoid but it doesn't cost much.
+   */
+
+  l32i  a7, a2, 4   /* get the stack pointer as it was at the call to
+                       setjmp(), before any change due to alloca() */
+  addi  a7, a7, -32
+  l32i  a4, a2, 32  /* copy the register values from the jmp_buf to the
+                       possibly clobbered register save area */
+  l32i  a5, a2, 36
+  s32i  a4, a7, 0
+  s32i  a5, a7, 4
+  l32i  a4, a2, 40
+  l32i  a5, a2, 44
+  s32i  a4, a7, 8
+  s32i  a5, a7, 12
+  l32i  a4, a2, 48
+  l32i  a5, a2, 52
+  s32i  a4, a7, 16
+  s32i  a5, a7, 20
+  l32i  a4, a2, 56
+  l32i  a5, a2, 60
+  s32i  a4, a7, 24
+  s32i  a5, a7, 28
+
+#else
+  entry sp, 16
+
+# if XCHAL_MAYHAVE_ERRATUM_XEA1KWIN
+  /* Using this register triggers early any overflow that a kernel-mode
+     level-one interrupt might otherwise cause.  */
+#  define AR_WB a15
+# else
+  /* Using this register is more efficient; it triggers fewer overflows.  */
+#  define AR_WB a5
+# endif
+  /* Invalidate all but the current window;
+     set WindowStart to (1 << WindowBase).  */
+  rsr AR_WB, WINDOWBASE
+  movi  a4, 1
+  ssl AR_WB
+  sll a4, a4
+  wsr a4, WINDOWSTART
+  rsync
+
+  /* Return to the return address of the setjmp, using the
+     window size bits from the setjmp call so that the caller
+     will be able to find the return value that we put in a2.  */
+
+  l32i  a0, a2, 64
+
+  /* Copy the first 4 saved registers from jmp_buf into the save area
+     at the current sp so that the values will be restored to registers
+     when longjmp returns.  */
+
+  addi  a7, a1, -16
+  l32i  a4, a2, 0
+  l32i  a5, a2, 4
+  s32i  a4, a7, 0
+  s32i  a5, a7, 4
+  l32i  a4, a2, 8
+  l32i  a5, a2, 12
+  s32i  a4, a7, 8
+  s32i  a5, a7, 12
+
+  /* Copy the remaining 0-8 saved registers.  */
+  extui a7, a0, 30, 2
+  blti  a7, 2, .Lendlj
+  l32i  a8, a2, 52
+  slli  a4, a7, 4
+  sub a6, a8, a4
+  addi  a5, a2, 16
+  addi  a8, a8, -16   # a8 = end of register overflow area
+.Lljloop:
+  l32i  a7, a5, 0
+  l32i  a4, a5, 4
+  s32i  a7, a6, 0
+  s32i  a4, a6, 4
+  l32i  a7, a5, 8
+  l32i  a4, a5, 12
+  s32i  a7, a6, 8
+  s32i  a4, a6, 12
+  addi  a5, a5, 16
+  addi  a6, a6, 16
+  blt a6, a8, .Lljloop
+.Lendlj:
+
+  /* The 4 words saved from the register save area at the target's
+     sp are copied back to the target procedure's save area.  The
+     only point of this is to prevent a catastrophic failure in
+     case the contents were moved by an alloca after calling
+     setjmp.  This is a bit paranoid but it doesn't cost much.  */
+
+  l32i  a7, a2, 4   # load the target stack pointer
+  addi  a7, a7, -16   # find the destination save area
+  l32i  a4, a2, 48
+  l32i  a5, a2, 52
+  s32i  a4, a7, 0
+  s32i  a5, a7, 4
+  l32i  a4, a2, 56
+  l32i  a5, a2, 60
+  s32i  a4, a7, 8
+  s32i  a5, a7, 12
+#endif
+
+  /* Return val ? val : 1.  */
+  movi  a2, 1
+  movnez  a2, a3, a3
+
+  retw
+  .size longjmp, . - longjmp
+
+#else
+
+  /*
+   Call0 ABI:
+   Much like other ABIs, this version just saves the callee-saved
+   registers into the jmp_buf and restores them later.  Much less needs
+   to be done.
+  */
+
+  .text
+  .align  4
+  .literal_position
+  .global setjmp
+  .type setjmp, @function
+setjmp:
+  s32i  a0, a2, 0
+  s32i  a1, a2, 4
+  s32i  a12, a2, 8
+  s32i  a13, a2, 12
+  s32i  a14, a2, 16
+  s32i  a15, a2, 20
+  movi  a2, 0
+  ret
+  .size setjmp, . - setjmp
+
+  .align  4
+  .literal_position
+  .global longjmp
+  .type longjmp, @function
+longjmp:
+  l32i  a0, a2, 0
+  l32i  a12, a2, 8
+  l32i  a13, a2, 12
+  l32i  a14, a2, 16
+  l32i  a15, a2, 20
+  l32i  a1, a2, 4
+  /* Return val ? val : 1.  */
+  movi  a2, 1
+  movnez  a2, a3, a3
+
+  ret
+  .size longjmp, .-longjmp
+
+#endif /* CALL0 ABI */
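
For readers unfamiliar with the contract being implemented here, a minimal
C usage sketch (illustrative only, not part of the commit) of the
semantics the assembly above provides, including the "val ? val : 1" rule:

#include <setjmp.h>
#include <stdio.h>

static jmp_buf env;

static void fail(void)
{
  longjmp(env, 42);       /* unwind back to the setjmp() call site */
}

int main(void)
{
  int rc = setjmp(env);   /* returns 0 on the direct call */

  if (rc == 0)
    {
      fail();             /* does not return here */
    }

  /* longjmp(env, 0) would still make setjmp() return 1, matching the
     "movi a2, 1 / movnez a2, a3, a3" epilogue in the assembly.  */

  printf("longjmp returned %d\n", rc);
  return 0;
}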

[incubator-nuttx] 02/03: libc:machine:xtensa: add xtensa libc implementation

Posted by xi...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

xiaoxiang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nuttx.git

commit cfcff5f570319192af6f64d4e7407c991488cff8
Author: zhuyanlin <zh...@xiaomi.com>
AuthorDate: Thu Oct 28 11:56:18 2021 +0800

    libc:machine:xtensa: add xtensa libc implementation
    
    N/A
    
    Signed-off-by: zhuyanlin <zh...@xiaomi.com>
---
 libs/libc/machine/xtensa/Kconfig        |  43 ++
 libs/libc/machine/xtensa/Make.defs      |  31 +-
 libs/libc/machine/xtensa/arch_memcpy.S  | 281 ++++++++++++
 libs/libc/machine/xtensa/arch_memmove.S | 480 ++++++++++++++++++++
 libs/libc/machine/xtensa/arch_memset.S  | 179 ++++++++
 libs/libc/machine/xtensa/arch_strcmp.S  | 767 ++++++++++++++++++++++++++++++++
 libs/libc/machine/xtensa/arch_strcpy.S  | 243 ++++++++++
 libs/libc/machine/xtensa/arch_strlen.S  | 123 +++++
 libs/libc/machine/xtensa/arch_strncpy.S | 265 +++++++++++
 libs/libc/machine/xtensa/xtensa_asm.h   |  62 +++
 10 files changed, 2472 insertions(+), 2 deletions(-)

diff --git a/libs/libc/machine/xtensa/Kconfig b/libs/libc/machine/xtensa/Kconfig
index f72f3c0..232fb73 100644
--- a/libs/libc/machine/xtensa/Kconfig
+++ b/libs/libc/machine/xtensa/Kconfig
@@ -2,3 +2,46 @@
 # For a description of the syntax of this configuration file,
 # see the file kconfig-language.txt in the NuttX tools repository.
 #
+
+config XTENSA_MEMCPY
+        bool "Enable optimized memcpy() for XTENSA"
+        select LIBC_ARCH_MEMCPY
+        ---help---
+                Enable optimized XTENSA specific memcpy() library function
+
+config XTENSA_MEMMOVE
+        bool "Enable optimized memmove() for XTENSA"
+        select LIBC_ARCH_MEMMOVE
+        ---help---
+                Enable optimized XTENSA specific memmove() library function
+
+config XTENSA_MEMSET
+        bool "Enable optimized memset() for XTENSA"
+        select LIBC_ARCH_MEMSET
+        ---help---
+                Enable optimized XTENSA specific memset() library function
+
+config XTENSA_STRCMP
+        bool "Enable optimized strcmp() for XTENSA"
+        select LIBC_ARCH_STRCMP
+        ---help---
+                Enable optimized XTENSA specific strcmp() library function
+
+config XTENSA_STRCPY
+        bool "Enable optimized strcpy() for XTENSA"
+        select LIBC_ARCH_STRCPY
+        ---help---
+                Enable optimized XTENSA specific strcpy() library function
+
+config XTENSA_STRLEN
+        bool "Enable optimized strlen() for XTENSA"
+        select LIBC_ARCH_STRLEN
+        ---help---
+                Enable optimized XTENSA specific strlen() library function
+
+config XTENSA_STRNCPY
+        bool "Enable optimized strncpy() for XTENSA"
+        select LIBC_ARCH_STRNCPY
+        ---help---
+                Enable optimized XTENSA specific strncpy() library function
+
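
All of the options above default to n.  A board defconfig that wants the
tuned routines enables them explicitly, e.g. (an illustrative fragment,
not part of this commit):

CONFIG_XTENSA_MEMCPY=y
CONFIG_XTENSA_MEMMOVE=y
CONFIG_XTENSA_MEMSET=y
CONFIG_XTENSA_STRCMP=y
CONFIG_XTENSA_STRCPY=y
CONFIG_XTENSA_STRLEN=y
CONFIG_XTENSA_STRNCPY=y

Each option selects the matching LIBC_ARCH_* symbol, which tells the C
library to use the architecture-specific routine in place of the generic
C implementation.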
diff --git a/libs/libc/machine/xtensa/Make.defs b/libs/libc/machine/xtensa/Make.defs
index 8f33a82..379c7da 100644
--- a/libs/libc/machine/xtensa/Make.defs
+++ b/libs/libc/machine/xtensa/Make.defs
@@ -19,10 +19,37 @@
 ############################################################################
 
 ifeq ($(CONFIG_LIBC_ARCH_ELF),y)
-
 CSRCS += arch_elf.c
+endif
+
+ifeq ($(CONFIG_XTENSA_MEMCPY),y)
+ASRCS += arch_memcpy.S
+endif
+
+ifeq ($(CONFIG_XTENSA_MEMMOVE),y)
+ASRCS += arch_memmove.S
+endif
+
+ifeq ($(CONFIG_XTENSA_MEMSET),y)
+ASRCS += arch_memset.S
+endif
+
+ifeq ($(CONFIG_XTENSA_STRCPY),y)
+ASRCS += arch_strcpy.S
+endif
+
+ifeq ($(CONFIG_XTENSA_STRLEN),y)
+ASRCS += arch_strlen.S
+endif
+
+ifeq ($(CONFIG_XTENSA_STRNCPY),y)
+ASRCS += arch_strncpy.S
+endif
+
+ifeq ($(CONFIG_XTENSA_STRCMP),y)
+ASRCS += arch_strcmp.S
+endif
 
 DEPPATH += --dep-path machine/xtensa
 VPATH += :machine/xtensa
 
-endif
diff --git a/libs/libc/machine/xtensa/arch_memcpy.S b/libs/libc/machine/xtensa/arch_memcpy.S
new file mode 100644
index 0000000..47de6dd
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_memcpy.S
@@ -0,0 +1,281 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_memcpy.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Pre-processor Macros
+ ****************************************************************************/
+
+/* set to 1 when running on ISS (simulator) with the
+   lint or ferret client, or 0 to save a few cycles */
+
+#define SIM_CHECKS_ALIGNMENT  0
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+  .section .text
+  .begin schedule
+  .literal_position
+
+  .local  .Ldst1mod2
+  .local  .Ldst2mod4
+  .local  .Lbytecopy
+
+  .align  4
+  .global memcpy
+  .type memcpy, @function
+memcpy:
+  ENTRY(16)
+  /* a2 = dst, a3 = src, a4 = len */
+
+  mov a5, a2    # copy dst so that a2 is return value
+  bbsi.l  a2, 0, .Ldst1mod2
+  bbsi.l  a2, 1, .Ldst2mod4
+.Ldstaligned:
+
+  /* Get number of loop iterations with 16B per iteration.  */
+  srli  a7, a4, 4
+
+  /* Check if source is aligned.  */
+  slli  a8, a3, 30
+  bnez  a8, .Lsrcunaligned
+
+  /* Destination and source are word-aligned, use word copy.  */
+#if XCHAL_HAVE_LOOPS
+  loopnez a7, 2f
+#else
+  beqz  a7, 2f
+  slli  a8, a7, 4
+  add a8, a8, a3  # a8 = end of last 16B source chunk
+#endif
+1:  l32i  a6, a3, 0
+  l32i  a7, a3, 4
+  s32i  a6, a5, 0
+  l32i  a6, a3, 8
+
+  s32i  a7, a5, 4
+  l32i  a7, a3, 12
+  s32i  a6, a5, 8
+  addi  a3, a3, 16
+  s32i  a7, a5, 12
+  addi  a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+  bltu  a3, a8, 1b
+#endif
+
+  /* Copy any leftover pieces smaller than 16B.  */
+2:  bbci.l  a4, 3, 3f
+
+  /* Copy 8 bytes.  */
+  l32i  a6, a3, 0
+  l32i  a7, a3, 4
+  addi  a3, a3, 8
+  s32i  a6, a5, 0
+  s32i  a7, a5, 4
+  addi  a5, a5, 8
+
+3:  bbsi.l  a4, 2, 4f
+  bbsi.l  a4, 1, 5f
+  bbsi.l  a4, 0, 6f
+  RET(16)
+
+  # .align 4
+  /* Copy 4 bytes.  */
+4:  l32i  a6, a3, 0
+  addi  a3, a3, 4
+  s32i  a6, a5, 0
+  addi  a5, a5, 4
+  bbsi.l  a4, 1, 5f
+  bbsi.l  a4, 0, 6f
+  RET(16)
+
+  /* Copy 2 bytes.  */
+5:  l16ui a6, a3, 0
+  addi  a3, a3, 2
+  s16i  a6, a5, 0
+  addi  a5, a5, 2
+  bbsi.l  a4, 0, 6f
+  RET(16)
+
+  /* Copy 1 byte.  */
+6:  l8ui  a6, a3, 0
+  s8i a6, a5, 0
+
+.Ldone:
+  RET(16)
+
+
+/* Destination is aligned; source is unaligned.  */
+
+  # .align 4
+.Lsrcunaligned:
+  /* Avoid loading anything for zero-length copies.  */
+  beqz  a4, .Ldone
+
+  /* Copy 16 bytes per iteration for word-aligned dst and
+     unaligned src.  */
+  ssa8  a3    # set shift amount from byte offset
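+
+  /* SSA8 computes the shift amount (the low two bits of the source
+     address, times eight) into SAR; each src_b below (presumably an
+     endian-dependent SRC macro from xtensa_asm.h) then concatenates two
+     adjacent aligned source words and extracts one destination word, so
+     a misaligned source is copied using aligned word loads only.  */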
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+  srli    a11, a8, 30     # save unalignment offset for below
+  sub a3, a3, a11 # align a3
+#endif
+  l32i  a6, a3, 0 # load first word
+#if XCHAL_HAVE_LOOPS
+  loopnez a7, 2f
+#else
+  beqz  a7, 2f
+  slli  a10, a7, 4
+  add a10, a10, a3  # a10 = end of last 16B source chunk
+#endif
+1:  l32i  a7, a3, 4
+  l32i  a8, a3, 8
+  src_b a6, a6, a7
+  s32i  a6, a5, 0
+  l32i  a9, a3, 12
+  src_b a7, a7, a8
+  s32i  a7, a5, 4
+  l32i  a6, a3, 16
+  src_b a8, a8, a9
+  s32i  a8, a5, 8
+  addi  a3, a3, 16
+  src_b a9, a9, a6
+  s32i  a9, a5, 12
+  addi  a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+  bltu  a3, a10, 1b
+#endif
+
+2:  bbci.l  a4, 3, 3f
+
+  /* Copy 8 bytes.  */
+  l32i  a7, a3, 4
+  l32i  a8, a3, 8
+  src_b a6, a6, a7
+  s32i  a6, a5, 0
+  addi  a3, a3, 8
+  src_b a7, a7, a8
+  s32i  a7, a5, 4
+  addi  a5, a5, 8
+  mov a6, a8
+
+3:  bbci.l  a4, 2, 4f
+
+  /* Copy 4 bytes.  */
+  l32i  a7, a3, 4
+  addi  a3, a3, 4
+  src_b a6, a6, a7
+  s32i  a6, a5, 0
+  addi  a5, a5, 4
+  mov a6, a7
+4:
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+  add a3, a3, a11 # readjust a3 with correct misalignment
+#endif
+  bbsi.l  a4, 1, 5f
+  bbsi.l  a4, 0, 6f
+  RET(16)
+
+  /* Copy 2 bytes.  */
+5:  l8ui  a6, a3, 0
+  l8ui  a7, a3, 1
+  addi  a3, a3, 2
+  s8i a6, a5, 0
+  s8i a7, a5, 1
+  addi  a5, a5, 2
+  bbsi.l  a4, 0, 6f
+  RET(16)
+
+  /* Copy 1 byte.  */
+6:  l8ui  a6, a3, 0
+  s8i a6, a5, 0
+  RET(16)
+
+
+  # .align XCHAL_INST_FETCH_WIDTH
+__memcpy_aux:
+
+  /* Skip bytes to get proper alignment for three-byte loop */
+# .skip XCHAL_INST_FETCH_WIDTH - 3
+
+.Lbytecopy:
+#if XCHAL_HAVE_LOOPS
+  loopnez a4, 2f
+#else
+  beqz  a4, 2f
+  add a7, a3, a4  # a7 = end address for source
+#endif
+1:  l8ui  a6, a3, 0
+  addi  a3, a3, 1
+  s8i a6, a5, 0
+  addi  a5, a5, 1
+#if !XCHAL_HAVE_LOOPS
+  bltu  a3, a7, 1b
+#endif
+2:  RET(16)
+
+
+/* Destination is unaligned.  */
+
+  # .align 4
+.Ldst1mod2: # dst is only byte aligned
+
+  /* Do short copies byte-by-byte.  */
+  bltui a4, 7, .Lbytecopy
+
+  /* Copy 1 byte.  */
+  l8ui  a6, a3, 0
+  addi  a3, a3, 1
+  addi  a4, a4, -1
+  s8i a6, a5, 0
+  addi  a5, a5, 1
+
+  /* Return to main algorithm if dst is now aligned.  */
+  bbci.l  a5, 1, .Ldstaligned
+
+.Ldst2mod4: # dst has 16-bit alignment
+
+  /* Do short copies byte-by-byte.  */
+  bltui a4, 6, .Lbytecopy
+
+  /* Copy 2 bytes.  */
+  l8ui  a6, a3, 0
+  l8ui  a7, a3, 1
+  addi  a3, a3, 2
+  addi  a4, a4, -2
+  s8i a6, a5, 0
+  s8i a7, a5, 1
+  addi  a5, a5, 2
+
+  /* dst is now aligned; return to main algorithm.  */
+  j .Ldstaligned
+
+  .end schedule
+
+  .size memcpy, . - memcpy
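
The dispatch structure above is easier to see in C.  A rough sketch of the
aligned fast path (illustrative only; the helper name is hypothetical, and
the committed assembly handles a misaligned source with the SSA8/SRC
funnel shift rather than byte copies):

#include <stddef.h>
#include <stdint.h>

void *sketch_memcpy(void *dst, const void *src, size_t len)
{
  unsigned char *d = dst;
  const unsigned char *s = src;

  /* Align the destination first, one or two bytes at a time
     (.Ldst1mod2 / .Ldst2mod4 above).  */

  while (((uintptr_t)d & 3) != 0 && len > 0)
    {
      *d++ = *s++;
      len--;
    }

  if (((uintptr_t)s & 3) == 0)
    {
      /* Source aligned too: copy 16 bytes per iteration, as in the
         word-copy loop above.  */

      while (len >= 16)
        {
          ((uint32_t *)d)[0] = ((const uint32_t *)s)[0];
          ((uint32_t *)d)[1] = ((const uint32_t *)s)[1];
          ((uint32_t *)d)[2] = ((const uint32_t *)s)[2];
          ((uint32_t *)d)[3] = ((const uint32_t *)s)[3];
          d += 16;
          s += 16;
          len -= 16;
        }
    }

  /* 8/4/2/1-byte tail (and, simplified here, the misaligned-source
     case).  */

  while (len-- > 0)
    {
      *d++ = *s++;
    }

  return dst;
}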
diff --git a/libs/libc/machine/xtensa/arch_memmove.S b/libs/libc/machine/xtensa/arch_memmove.S
new file mode 100644
index 0000000..7ce56c4
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_memmove.S
@@ -0,0 +1,480 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_memmove.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Pre-processor Macros
+ ****************************************************************************/
+
+/* set to 1 when running on ISS (simulator) with the
+   lint or ferret client, or 0 to save a few cycles */
+
+#define SIM_CHECKS_ALIGNMENT  0
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+  .text
+  .begin schedule
+  .global memmove
+
+/*
+ * Byte by byte copy
+ */
+  .align  4
+  .byte 0   # 1 mod 4 alignment for LOOPNEZ
+        # (0 mod 4 alignment for LBEG)
+.Lbytecopy:
+#if XCHAL_HAVE_LOOPS
+  loopnez a4, .Lbytecopydone
+#else /* !XCHAL_HAVE_LOOPS */
+  beqz  a4, .Lbytecopydone
+  add a7, a3, a4  # a7 = end address for source
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lnextbyte:
+  l8ui  a6, a3, 0
+  addi  a3, a3, 1
+  s8i a6, a5, 0
+  addi  a5, a5, 1
+#if !XCHAL_HAVE_LOOPS
+  bne a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lbytecopydone:
+  RET(16)
+
+/*
+ * Destination is unaligned
+ */
+
+  .align  4
+.Ldst1mod2: # dst is only byte aligned
+  _bltui  a4, 7, .Lbytecopy # do short copies byte by byte
+
+  # copy 1 byte
+  l8ui  a6, a3,  0
+  addi  a3, a3,  1
+  addi  a4, a4, -1
+  s8i a6, a5,  0
+  addi  a5, a5,  1
+  _bbci.l a5, 1, .Ldstaligned # if dst is now aligned, then
+          # return to main algorithm
+.Ldst2mod4: # dst 16-bit aligned
+  # copy 2 bytes
+  _bltui  a4, 6, .Lbytecopy # do short copies byte by byte
+  l8ui  a6, a3,  0
+  l8ui  a7, a3,  1
+  addi  a3, a3,  2
+  addi  a4, a4, -2
+  s8i a6, a5,  0
+  s8i a7, a5,  1
+  addi  a5, a5,  2
+  j .Ldstaligned  # dst is now aligned, return to main algorithm
+
+.Lcommon:
+  bbsi.l  a2, 0, .Ldst1mod2 # if dst is 1 mod 2
+  bbsi.l  a2, 1, .Ldst2mod4 # if dst is 2 mod 4
+.Ldstaligned: # return here from .Ldst?mod? once dst is aligned
+  srli  a7, a4, 4 # number of loop iterations with 16B
+        # per iteration
+  movi  a8, 3   # if source is not aligned,
+  bany  a3, a8, .Lsrcunaligned  # then use shifting copy
+  /*
+   * Destination and source are word-aligned, use word copy.
+   */
+  # copy 16 bytes per iteration for word-aligned dst and word-aligned src
+#if XCHAL_HAVE_LOOPS
+  loopnez a7, .Loop1done
+#else /* !XCHAL_HAVE_LOOPS */
+  beqz  a7, .Loop1done
+  slli  a8, a7, 4
+  add a8, a8, a3  # a8 = end of last 16B source chunk
+#endif /* !XCHAL_HAVE_LOOPS */
+.Loop1:
+  l32i  a6, a3,  0
+  l32i  a7, a3,  4
+  s32i  a6, a5,  0
+  l32i  a6, a3,  8
+  s32i  a7, a5,  4
+  l32i  a7, a3, 12
+  s32i  a6, a5,  8
+  addi  a3, a3, 16
+  s32i  a7, a5, 12
+  addi  a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+  bne a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
+#endif /* !XCHAL_HAVE_LOOPS */
+.Loop1done:
+  bbci.l  a4, 3, .L2
+  # copy 8 bytes
+  l32i  a6, a3,  0
+  l32i  a7, a3,  4
+  addi  a3, a3,  8
+  s32i  a6, a5,  0
+  s32i  a7, a5,  4
+  addi  a5, a5,  8
+.L2:
+  bbsi.l  a4, 2, .L3
+  bbsi.l  a4, 1, .L4
+  bbsi.l  a4, 0, .L5
+  RET(16)
+.L3:
+  # copy 4 bytes
+  l32i  a6, a3,  0
+  addi  a3, a3,  4
+  s32i  a6, a5,  0
+  addi  a5, a5,  4
+  bbsi.l  a4, 1, .L4
+  bbsi.l  a4, 0, .L5
+  RET(16)
+.L4:
+  # copy 2 bytes
+  l16ui a6, a3,  0
+  addi  a3, a3,  2
+  s16i  a6, a5,  0
+  addi  a5, a5,  2
+  bbsi.l  a4, 0, .L5
+  RET(16)
+.L5:
+  # copy 1 byte
+  l8ui  a6, a3,  0
+  s8i a6, a5,  0
+  RET(16)
+
+/*
+ * Destination is aligned, Source is unaligned
+ */
+
+  .align  4
+.Lsrcunaligned:
+  _beqz a4, .Ldone  # avoid loading anything for zero-length copies
+  # copy 16 bytes per iteration for word-aligned dst and unaligned src
+  ssa8  a3    # set shift amount from byte offset
+
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+  and a11, a3, a8 # save unalignment offset for below
+  sub a3, a3, a11 # align a3
+#endif
+  l32i  a6, a3, 0 # load first word
+#if XCHAL_HAVE_LOOPS
+  loopnez a7, .Loop2done
+#else /* !XCHAL_HAVE_LOOPS */
+  beqz  a7, .Loop2done
+  slli  a10, a7, 4
+  add a10, a10, a3  # a10 = end of last 16B source chunk
+#endif /* !XCHAL_HAVE_LOOPS */
+.Loop2:
+  l32i  a7, a3,  4
+  l32i  a8, a3,  8
+  src_b a6, a6, a7
+  s32i  a6, a5,  0
+  l32i  a9, a3, 12
+  src_b a7, a7, a8
+  s32i  a7, a5,  4
+  l32i  a6, a3, 16
+  src_b a8, a8, a9
+  s32i  a8, a5,  8
+  addi  a3, a3, 16
+  src_b a9, a9, a6
+  s32i  a9, a5, 12
+  addi  a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+  bne a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
+#endif /* !XCHAL_HAVE_LOOPS */
+.Loop2done:
+  bbci.l  a4, 3, .L12
+  # copy 8 bytes
+  l32i  a7, a3,  4
+  l32i  a8, a3,  8
+  src_b a6, a6, a7
+  s32i  a6, a5,  0
+  addi  a3, a3,  8
+  src_b a7, a7, a8
+  s32i  a7, a5,  4
+  addi  a5, a5,  8
+  mov a6, a8
+.L12:
+  bbci.l  a4, 2, .L13
+  # copy 4 bytes
+  l32i  a7, a3,  4
+  addi  a3, a3,  4
+  src_b a6, a6, a7
+  s32i  a6, a5,  0
+  addi  a5, a5,  4
+  mov a6, a7
+.L13:
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+  add a3, a3, a11 # readjust a3 with correct misalignment
+#endif
+  bbsi.l  a4, 1, .L14
+  bbsi.l  a4, 0, .L15
+.Ldone: RET(16)
+.L14:
+  # copy 2 bytes
+  l8ui  a6, a3,  0
+  l8ui  a7, a3,  1
+  addi  a3, a3,  2
+  s8i a6, a5,  0
+  s8i a7, a5,  1
+  addi  a5, a5,  2
+  bbsi.l  a4, 0, .L15
+  RET(16)
+.L15:
+  # copy 1 byte
+  l8ui  a6, a3,  0
+  s8i a6, a5,  0
+  RET(16)
+
+/*
+ * Byte by byte copy
+ */
+  .align  4
+  .byte 0   # 1 mod 4 alignment for LOOPNEZ
+        # (0 mod 4 alignment for LBEG)
+.Lbackbytecopy:
+#if XCHAL_HAVE_LOOPS
+  loopnez a4, .Lbackbytecopydone
+#else /* !XCHAL_HAVE_LOOPS */
+  beqz  a4, .Lbackbytecopydone
+  sub a7, a3, a4  # a7 = start address for source
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lbacknextbyte:
+  addi  a3, a3, -1
+  l8ui  a6, a3, 0
+  addi  a5, a5, -1
+  s8i a6, a5, 0
+#if !XCHAL_HAVE_LOOPS
+  bne a3, a7, .Lbacknextbyte # continue loop if
+               # $a3:src != $a7:src_start
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lbackbytecopydone:
+  RET(16)
+
+/*
+ * Destination is unaligned
+ */
+
+  .align  4
+.Lbackdst1mod2: # dst is only byte aligned
+  _bltui  a4, 7, .Lbackbytecopy # do short copies byte by byte
+
+  # copy 1 byte
+  addi  a3, a3, -1
+  l8ui  a6, a3,  0
+  addi  a5, a5, -1
+  s8i a6, a5,  0
+  addi  a4, a4, -1
+  _bbci.l a5, 1, .Lbackdstaligned # if dst is now aligned, then
+          # return to main algorithm
+.Lbackdst2mod4: # dst 16-bit aligned
+  # copy 2 bytes
+  _bltui  a4, 6, .Lbackbytecopy # do short copies byte by byte
+  addi  a3, a3, -2
+  l8ui  a6, a3,  0
+  l8ui  a7, a3,  1
+  addi  a5, a5, -2
+  s8i a6, a5,  0
+  s8i a7, a5,  1
+  addi  a4, a4, -2
+  j .Lbackdstaligned  # dst is now aligned,
+          # return to main algorithm
+
+  .align  4
+memmove:
+
+  ENTRY(16)
+  # a2/ dst, a3/ src, a4/ len
+  mov a5, a2    # copy dst so that a2 is return value
+.Lmovecommon:
+  sub a6, a5, a3
+  bgeu  a6, a4, .Lcommon
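+
+  /* If (dst - src), viewed as an unsigned value, is at least len, then
+     dst does not fall inside [src, src + len) and the forward
+     (memcpy-style) path at .Lcommon is safe; otherwise fall through
+     and copy backwards from the end.  */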
+
+  add a5, a5, a4
+  add a3, a3, a4
+
+  bbsi.l  a5, 0, .Lbackdst1mod2 # if dst is 1 mod 2
+  bbsi.l  a5, 1, .Lbackdst2mod4 # if dst is 2 mod 4
+.Lbackdstaligned: # return here from .Lbackdst?mod? once dst is aligned
+  srli  a7, a4, 4 # number of loop iterations with 16B
+        # per iteration
+  movi  a8, 3   # if source is not aligned,
+  bany  a3, a8, .Lbacksrcunaligned  # then use shifting copy
+  /*
+   * Destination and source are word-aligned, use word copy.
+   */
+  # copy 16 bytes per iteration for word-aligned dst and word-aligned src
+#if XCHAL_HAVE_LOOPS
+  loopnez a7, .backLoop1done
+#else /* !XCHAL_HAVE_LOOPS */
+  beqz  a7, .backLoop1done
+  slli  a8, a7, 4
+  sub a8, a3, a8  # a8 = start of first 16B source chunk
+#endif /* !XCHAL_HAVE_LOOPS */
+.backLoop1:
+  addi  a3, a3, -16
+  l32i  a7, a3, 12
+  l32i  a6, a3,  8
+  addi  a5, a5, -16
+  s32i  a7, a5, 12
+  l32i  a7, a3,  4
+  s32i  a6, a5,  8
+  l32i  a6, a3,  0
+  s32i  a7, a5,  4
+  s32i  a6, a5,  0
+#if !XCHAL_HAVE_LOOPS
+  bne a3, a8, .backLoop1  # continue loop if a3:src != a8:src_start
+#endif /* !XCHAL_HAVE_LOOPS */
+.backLoop1done:
+  bbci.l  a4, 3, .Lback2
+  # copy 8 bytes
+  addi  a3, a3, -8
+  l32i  a6, a3,  0
+  l32i  a7, a3,  4
+  addi  a5, a5, -8
+  s32i  a6, a5,  0
+  s32i  a7, a5,  4
+.Lback2:
+  bbsi.l  a4, 2, .Lback3
+  bbsi.l  a4, 1, .Lback4
+  bbsi.l  a4, 0, .Lback5
+  RET(16)
+.Lback3:
+  # copy 4 bytes
+  addi  a3, a3, -4
+  l32i  a6, a3,  0
+  addi  a5, a5, -4
+  s32i  a6, a5,  0
+  bbsi.l  a4, 1, .Lback4
+  bbsi.l  a4, 0, .Lback5
+  RET(16)
+.Lback4:
+  # copy 2 bytes
+  addi  a3, a3, -2
+  l16ui a6, a3,  0
+  addi  a5, a5, -2
+  s16i  a6, a5,  0
+  bbsi.l  a4, 0, .Lback5
+  RET(16)
+.Lback5:
+  # copy 1 byte
+  addi  a3, a3, -1
+  l8ui  a6, a3,  0
+  addi  a5, a5, -1
+  s8i a6, a5,  0
+  RET(16)
+
+/*
+ * Destination is aligned, Source is unaligned
+ */
+
+  .align  4
+.Lbacksrcunaligned:
+  _beqz a4, .Lbackdone  # avoid loading anything for zero-length copies
+  # copy 16 bytes per iteration for word-aligned dst and unaligned src
+  ssa8  a3    # set shift amount from byte offset
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+  and a11, a3, a8 # save unalignment offset for below
+  sub a3, a3, a11 # align a3
+#endif
+  l32i  a6, a3, 0 # load first word
+#if XCHAL_HAVE_LOOPS
+  loopnez a7, .backLoop2done
+#else /* !XCHAL_HAVE_LOOPS */
+  beqz  a7, .backLoop2done
+  slli  a10, a7, 4
+  sub a10, a3, a10  # a10 = start of first 16B source chunk
+#endif /* !XCHAL_HAVE_LOOPS */
+.backLoop2:
+  addi  a3, a3, -16
+  l32i  a7, a3, 12
+  l32i  a8, a3,  8
+  addi  a5, a5, -16
+  src_b a6, a7, a6
+  s32i  a6, a5, 12
+  l32i  a9, a3,  4
+  src_b a7, a8, a7
+  s32i  a7, a5,  8
+  l32i  a6, a3,  0
+  src_b a8, a9, a8
+  s32i  a8, a5,  4
+  src_b a9, a6, a9
+  s32i  a9, a5,  0
+#if !XCHAL_HAVE_LOOPS
+  bne a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
+#endif /* !XCHAL_HAVE_LOOPS */
+.backLoop2done:
+  bbci.l  a4, 3, .Lback12
+  # copy 8 bytes
+  addi  a3, a3, -8
+  l32i  a7, a3,  4
+  l32i  a8, a3,  0
+  addi  a5, a5, -8
+  src_b a6, a7, a6
+  s32i  a6, a5,  4
+  src_b a7, a8, a7
+  s32i  a7, a5,  0
+  mov a6, a8
+.Lback12:
+  bbci.l  a4, 2, .Lback13
+  # copy 4 bytes
+  addi  a3, a3, -4
+  l32i  a7, a3,  0
+  addi  a5, a5, -4
+  src_b a6, a7, a6
+  s32i  a6, a5,  0
+  mov a6, a7
+.Lback13:
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+  add a3, a3, a11 # readjust a3 with correct misalignment
+#endif
+  bbsi.l  a4, 1, .Lback14
+  bbsi.l  a4, 0, .Lback15
+.Lbackdone:
+  RET(16)
+.Lback14:
+  # copy 2 bytes
+  addi  a3, a3, -2
+  l8ui  a6, a3,  0
+  l8ui  a7, a3,  1
+  addi  a5, a5, -2
+  s8i a6, a5,  0
+  s8i a7, a5,  1
+  bbsi.l  a4, 0, .Lback15
+  RET(16)
+.Lback15:
+  # copy 1 byte
+  addi  a3, a3, -1
+  addi  a5, a5, -1
+  l8ui  a6, a3,  0
+  s8i a6, a5,  0
+  RET(16)
+
+  .end schedule
+  .size memmove, . - memmove
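
The forward/backward dispatch is the essence of memmove.  A minimal C
sketch using the same unsigned-difference overlap test (illustrative
only; the helper name is hypothetical):

#include <stddef.h>

void *sketch_memmove(void *dst, const void *src, size_t len)
{
  unsigned char *d = dst;
  const unsigned char *s = src;

  /* (dst - src) as an unsigned value is >= len exactly when dst is not
     inside [src, src + len), so a forward copy cannot destroy source
     bytes before they are read.  */

  if ((size_t)(d - s) >= len)
    {
      while (len-- > 0)
        {
          *d++ = *s++;          /* forward, memcpy-style */
        }
    }
  else
    {
      d += len;
      s += len;
      while (len-- > 0)
        {
          *--d = *--s;          /* backward, overlapping case */
        }
    }

  return dst;
}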
diff --git a/libs/libc/machine/xtensa/arch_memset.S b/libs/libc/machine/xtensa/arch_memset.S
new file mode 100644
index 0000000..488172f
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_memset.S
@@ -0,0 +1,179 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_memset.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+/* void *memset (void *dst, int c, size_t length)
+
+   The algorithm is as follows:
+
+   Create a word with c in all byte positions.
+
+   If the destination is aligned, set 16B chunks with a loop, and then
+   finish up with 8B, 4B, 2B, and 1B stores conditional on the length.
+
+   If the destination is unaligned, align it by conditionally
+   setting 1B and/or 2B and then go to aligned case.
+
+   This code tries to use fall-through branches for the common
+   case of an aligned destination (except for the branches to
+   the alignment labels).  */
+
+
+/* Byte-by-byte set.  */
+
+	.section .text
+	.begin schedule
+	.literal_position
+
+	.local	.Lbyteset
+	.local	.Ldst1mod2
+	.local	.Ldst2mod4
+
+	.align	4
+	.global	memset
+	.type	memset, @function
+memset:
+  ENTRY(16)
+	/* a2 = dst, a3 = c, a4 = length */
+
+	/* Duplicate character into all bytes of word.  */
+	extui	a3, a3, 0, 8
+	slli	a7, a3, 8
+	or	a3, a3, a7
+	slli	a7, a3, 16
+	or	a3, a3, a7
+
+	mov	a5, a2		// copy dst so that a2 is return value
+
+	/* Check if dst is unaligned.  */
+	bbsi.l	a2, 0, .Ldst1mod2
+	bbsi.l	a2, 1, .Ldst2mod4
+	j	.Ldstaligned
+
+.Ldst1mod2: // dst is only byte aligned
+
+	/* Do short sizes byte-by-byte.  */
+	bltui	a4, 8, .Lbyteset
+
+	/* Set 1 byte.  */
+	s8i	a3, a5, 0
+	addi	a5, a5, 1
+	addi	a4, a4, -1
+
+	/* Now retest if dst is aligned.  */
+	bbci.l	a5, 1, .Ldstaligned
+
+.Ldst2mod4: // dst has 16-bit alignment
+
+	/* Do short sizes byte-by-byte.  */
+	bltui	a4, 8, .Lbyteset
+
+	/* Set 2 bytes.  */
+	s16i	a3, a5, 0
+	addi	a5, a5, 2
+	addi	a4, a4, -2
+
+	/* dst is now aligned; fall through to main algorithm */
+
+.Ldstaligned:
+
+	/* Get number of loop iterations with 16B per iteration.  */
+	srli	a7, a4, 4
+
+	/* Destination is word-aligned.  */
+#if XCHAL_HAVE_LOOPS
+	loopnez	a7, 2f
+#else
+	beqz	a7, 2f
+	slli	a6, a7, 4
+	add	a6, a6, a5	// a6 = end of last 16B chunk
+#endif
+	/* Set 16 bytes per iteration.  */
+1:	s32i	a3, a5, 0
+	s32i	a3, a5, 4
+	s32i	a3, a5, 8
+	s32i	a3, a5, 12
+	addi	a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+	bltu	a5, a6, 1b
+#endif
+
+	/* Set any leftover pieces smaller than 16B.  */
+2:	bbci.l	a4, 3, 3f
+
+	/* Set 8 bytes.  */
+	s32i	a3, a5, 0
+	s32i	a3, a5, 4
+	addi	a5, a5, 8
+
+3:	bbci.l	a4, 2, 4f
+
+	/* Set 4 bytes.  */
+	s32i	a3, a5, 0
+	addi	a5, a5, 4
+
+4:	bbci.l	a4, 1, 5f
+
+	/* Set 2 bytes.  */
+	s16i	a3, a5, 0
+	addi	a5, a5, 2
+
+5:	bbci.l	a4, 0, 6f
+
+	/* Set 1 byte.  */
+	s8i	a3, a5, 0
+6:	RET(16)
+
+
+	// .align	XCHAL_INST_FETCH_WIDTH
+__memset_aux:
+
+	/* Skip bytes to get proper alignment for three-byte loop */
+// .skip XCHAL_INST_FETCH_WIDTH - 3
+
+.Lbyteset:
+#if XCHAL_HAVE_LOOPS
+	loopnez	a4, 2f
+#else
+	beqz	a4, 2f
+	add	a6, a5, a4	// a6 = ending address
+#endif
+1:	s8i	a3, a5, 0
+	addi	a5, a5, 1
+#if !XCHAL_HAVE_LOOPS
+	bltu	a5, a6, 1b
+#endif
+2:	RET(16)
+
+	.end schedule
+
+	.size	memset, . - memset
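
In C, the algorithm from the header comment reads roughly as follows
(illustrative sketch only; the helper name is hypothetical):

#include <stddef.h>
#include <stdint.h>

void *sketch_memset(void *dst, int c, size_t len)
{
  unsigned char *d = dst;

  /* Duplicate c into all four bytes of a word: 0x61 -> 0x61616161.  */

  uint32_t word = (unsigned char)c;
  word |= word << 8;
  word |= word << 16;

  /* Align the destination one or two bytes at a time, then store
     16 bytes per iteration.  */

  while (((uintptr_t)d & 3) != 0 && len > 0)
    {
      *d++ = (unsigned char)c;
      len--;
    }

  while (len >= 16)
    {
      ((uint32_t *)d)[0] = word;
      ((uint32_t *)d)[1] = word;
      ((uint32_t *)d)[2] = word;
      ((uint32_t *)d)[3] = word;
      d += 16;
      len -= 16;
    }

  /* 8B/4B/2B/1B tail, as in the conditional stores above.  */

  while (len-- > 0)
    {
      *d++ = (unsigned char)c;
    }

  return dst;
}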
diff --git a/libs/libc/machine/xtensa/arch_strcmp.S b/libs/libc/machine/xtensa/arch_strcmp.S
new file mode 100644
index 0000000..aab50be
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_strcmp.S
@@ -0,0 +1,767 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_strcmp.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Pre-processor Macros
+ ****************************************************************************/
+
+#define MASK4 0x40404040
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+  .section .text
+  .begin  schedule
+  .align  4
+  .literal_position
+
+  .global strcmp
+  .type strcmp,@function
+  .align  4
+
+strcmp:
+
+#if XCHAL_HAVE_LOOPS && XCHAL_HAVE_DENSITY && !XCHAL_HAVE_BE && XCHAL_HAVE_FLIX3
+/*  Fast version for FLIX3 Little Endian */
+
+
+  ENTRY(16)
+  /* a2 = s1, a3 = s2 */
+
+  l8ui  a8, a2, 0 # byte 0 from s1
+  l8ui  a9, a3, 0 # byte 0 from s2
+  movi  a10, 3    # mask
+  movi  a5, 0xfffffffc
+  or  a11, a2, a3
+  movi  a4, MASK0 # mask for byte 0
+  movi  a7, MASK4
+  addi    a3, a3, -8
+  addi    a2, a2, -8
+  and a5, a5, a2
+  bne.w18 a8, a9, .Lretdiff
+  l32i  a8, a5, 8 # get word from aligned variant of s1
+
+  bany.w18  a11, a10, .Lnot_aligned
+
+/* s1 is word-aligned; s2 is word-aligned.
+
+   If the zero-overhead loop option is available, use an (almost)
+   infinite zero-overhead loop with conditional exits so we only pay
+   for taken branches when exiting the loop.  */
+
+/* New algorithm, relying on the fact that all normal ASCII is between
+   32 and 127.
+
+   Rather than check all bytes for zero:
+   Take one word (4 bytes).  Call it w1.
+   Shift w1 left by one into w1'.
+   Or w1 and w1'.  For all normal ASCII bit 6 will be 1; for zero it won't.
+   Check that all 4 bit 6's (one for each byte) are one:
+   If they are, we are definitely not done.
+   If they are not, we are probably done, but need to check for zero.  */
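+
+/* Worked example (little-endian, hypothetical data): for "abc\0" the
+   word is w1 = 0x00636261, so w1 | (w1 << 1) = 0x00e7e6e3 and byte 3
+   has bit 6 clear; BNALL against MASK4 = 0x40404040 therefore branches
+   to the slow zero check.  For "abcd", w1 = 0x64636261 gives
+   0xece7e6e3 with bit 6 set in every byte, and the loop keeps going. */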
+
+.Laligned:
+  /* Loop forever */
+1:
+  loop  a0, .Laligned_done
+
+  /* First unrolled loop body.  */
+  l32i  a9, a3, 8 # get word from s2
+  addi  a3, a3, 8 # advance s2 pointer
+  slli  a5, a8, 1
+  or  a10, a8, a5
+  {l32i a11, a2, 12 # get word from s1+4
+  bne.w18 a8, a9, .Lwne2}
+  l32i  a9, a3, 4 # get word from s2+4
+  bnall.w18 a10, a7, .Lprobeq
+
+  /* Second unrolled loop body.  */
+  slli  a5, a11, 1
+  or  a10, a11, a5
+  addi  a2, a2, 8 # advance s1 pointer
+        mov a8, a11
+  bne.w18 a11, a9, .Lwne2
+  l32i  a8, a2, 8 # get word from s1
+  bnall.w18 a10, a7, .Lprobeq2
+
+.Laligned_done:
+  l32i  a8, a2, 8 # get word from s1
+  j       1b
+
+.Lnot_aligned:
+  xor a11, a2, a3 # compare low two bits of s1 and s2
+  bany  a11, a10, .Lunaligned # if they have different alignment
+
+  /* s1/s2 are not word-aligned.  */
+  movi  a5, 0xfffffffc
+  addi  a2, a2, 1 # advance s1
+  beqz  a9, .Leq  # bytes equal, if zero, strings are equal
+  addi  a3, a3, 1 # advance s2
+  and     a6, a2, a5
+  l32i  a8, a6, 8 # get word from s1
+  bnone a2, a10, .Laligned # if s1/s2 now aligned
+  l8ui  a8, a2, 8 # byte 1 from s1
+  l8ui  a9, a3, 8 # byte 1 from s2
+  addi  a2, a2, 1 # advance s1
+  bne a8, a9, .Lretdiff # if different, return difference
+  beqz  a8, .Leq  # bytes equal, if zero, strings are equal
+  addi  a3, a3, 1 # advance s2
+  and     a6, a2, a5
+  l32i  a8, a6, 8 # get word from s1
+  bnone a2, a10, .Laligned # if s1/s2 now aligned
+  l8ui  a8, a2, 8 # byte 2 from s1
+  l8ui  a9, a3, 8 # byte 2 from s2
+  addi  a2, a2, 1 # advance s1
+  bne a8, a9, .Lretdiff # if different, return difference
+  beqz  a8, .Leq  # bytes equal, if zero, strings are equal
+  addi  a3, a3, 1 # advance s2
+  l32i  a8, a2, 8 # get word from s1
+  j .Laligned
+
+/* s1 and s2 have different alignment.
+
+   If the zero-overhead loop option is available, use an (almost)
+   infinite zero-overhead loop with conditional exits so we only pay
+   for taken branches when exiting the loop.
+
+   Note: It is important for this unaligned case to come before the
+   code for aligned strings, because otherwise some of the branches
+   above cannot reach and have to be transformed to branches around
+   jumps.  The unaligned code is smaller and the branches can reach
+   over it.  */
+
+.Lunaligned:
+  movi.n  a8, 0   # set up for the maximum loop count
+  loop  a8, .Lretdiff # loop forever (almost anyway)
+  l8ui  a8, a2, 8
+  l8ui  a9, a3, 8
+  addi  a2, a2, 1
+  bne a8, a9, .Lretdiff
+  addi  a3, a3, 1
+  beqz  a8, .Lretdiff
+.Lretdiff:
+  sub a2, a8, a9
+  RET(16)
+
+
+.Lprobeq2:
+  /* Adjust pointers to account for the loop unrolling.  */
+        mov a8, a11
+  addi  a2, a2, -4
+  addi  a3, a3, 4
+
+  /* align (0 mod 4) */
+.Lprobeq:
+  /* Words are probably equal, but check for sure.
+     If not, loop over the rest of string using normal algorithm.  */
+
+  bnone a8, a4, .Leq  # if byte 0 is zero
+  movi  a5, MASK1 # mask for byte 1
+  movi  a6, MASK2 # mask for byte 2
+  bnone a8, a5, .Leq  # if byte 1 is zero
+  movi  a7, MASK3 # mask for byte 3
+  bnone a8, a6, .Leq  # if byte 2 is zero
+  bnone a8, a7, .Leq  # if byte 3 is zero
+  /* align (1 mod 4) */
+  addi.n  a2, a2, 12  # advance s1 pointer
+  addi.n  a3, a3, 4 # advance s2 pointer
+  /* align (1 mod 4) or (2 mod 4) */
+1:
+  loop  a0, .Lend # loop forever (a4 is bigger than max iters)
+
+  l32i  a8, a2, 0 # get word from s1
+  l32i  a9, a3, 0 # get word from s2
+  addi  a2, a2, 4 # advance s1 pointer
+  bne a8, a9, .Lwne
+  bnone a8, a4, .Leq  # if byte 0 is zero
+  bnone a8, a5, .Leq  # if byte 1 is zero
+  bnone a8, a6, .Leq  # if byte 2 is zero
+  bnone a8, a7, .Leq  # if byte 3 is zero
+  addi  a3, a3, 4 # advance s2 pointer
+.Lend:
+  j 1b
+
+  /* Words are equal; some byte is zero.  */
+.Leq: movi  a2, 0   # return equal
+  RET(16)
+
+.Lwne2: /* Words are not equal.  On big-endian processors, if none of the
+     bytes are zero, the return value can be determined by a simple
+     comparison.  */
+.Lwne:  /* Words are not equal.  */
+  xor a2, a8, a9  # get word with nonzero in byte that differs
+  extui a10, a8, 0, 8
+  extui a11, a9, 0, 8
+  movi  a5, MASK1 # mask for byte 1
+  bany.w18  a2, a4, .Ldiff0 # if byte 0 differs
+
+  bnone.w18 a8, a4, .Leq  # if byte 0 is zero
+  movi  a6, MASK2 # mask for byte 2
+  bany.w18  a2, a5, .Ldiff1 # if byte 1 differs
+  extui a10, a8, 24, 8
+  bnone.w18 a8, a5, .Leq  # if byte 1 is zero
+  extui a11, a9, 24, 8
+  bany.w18  a2, a6, .Ldiff2 # if byte 2 differs
+  sub a2, a10, a11
+  bnone.w18 a8, a6, .Leq  # if byte 2 is zero
+  /* Little-endian is a little more difficult because can't subtract
+     whole words.  */
+.Ldiff3:
+  /* Bytes 0-2 are equal; byte 3 is different.
+     For little-endian need to have a sign bit for the difference.  */
+  RET(16)
+.Ldiff0:
+  /* Byte 0 is different.  */
+  sub a2, a10, a11
+  RET(16)
+
+.Ldiff1:
+  /* Byte 0 is equal; byte 1 is different.  */
+  extui a10, a8, 8, 8
+  extui a11, a9, 8, 8
+  sub a2, a10, a11
+  RET(16)
+
+.Ldiff2:
+  /* Bytes 0-1 are equal; byte 2 is different.  */
+  extui a10, a8, 16, 8
+  extui a11, a9, 16, 8
+  sub a2, a10, a11
+  RET(16)
+
+#else
+#if XCHAL_HAVE_LOOPS && XCHAL_HAVE_DENSITY && !XCHAL_HAVE_BE && XCHAL_HAVE_PDX4
+/*  Fast version for FLIX3 Little Endian */
+
+
+  ENTRY(16)
+  /* a2 = s1, a3 = s2 */
+
+  l8ui  a8, a2, 0 # byte 0 from s1
+  l8ui  a9, a3, 0 # byte 0 from s2
+  movi  a10, 3    # mask
+  movi  a5, 0xfffffffc
+  or  a11, a2, a3
+  movi  a4, MASK0 # mask for byte 0
+  movi  a7, MASK4
+  addi    a3, a3, -8
+  addi    a2, a2, -8
+  and a5, a5, a2
+  bne.w15 a8, a9, .Lretdiff
+  l32i  a8, a5, 8 # get word from aligned variant of s1
+
+  bany.w15  a11, a10, .Lnot_aligned
+
+/* s1 is word-aligned; s2 is word-aligned.
+
+   If the zero-overhead loop option is available, use an (almost)
+   infinite zero-overhead loop with conditional exits so we only pay
+   for taken branches when exiting the loop.  */
+
+/* New algorithm, relying on the fact that all normal ASCII is between
+   32 and 127.
+
+   Rather than check all bytes for zero:
+   Take one word (4 bytes).  Call it w1.
+   Shift w1 left by one into w1'.
+   Or w1 and w1'.  For all normal ASCII bit 6 will be 1; for zero it won't.
+   Check that all 4 bit 6's (one for each byte) are one:
+   If they are, we are definitely not done.
+   If they are not, we are probably done, but need to check for zero.  */
+
+.Laligned:
+  /* Loop forever */
+1:
+  loop  a0, .Laligned_done
+
+  /* First unrolled loop body.  */
+  l32i  a9, a3, 8 # get word from s2
+  addi  a3, a3, 8 # advance s2 pointer
+  slli  a5, a8, 1
+  or  a10, a8, a5
+  {
+  bne.w15 a8, a9, .Lwne2
+  l32i  a11, a2, 12 # get word from s1+4
+  nop
+  nop
+  }
+  l32i  a9, a3, 4 # get word from s2+4
+  bnall.w15 a10, a7, .Lprobeq
+
+  /* Second unrolled loop body.  */
+  slli  a5, a11, 1
+  or  a10, a11, a5
+  addi  a2, a2, 8 # advance s1 pointer
+        mov a8, a11
+  bne.w15 a11, a9, .Lwne2
+  l32i  a8, a2, 8 # get word from s1
+  bnall.w15 a10, a7, .Lprobeq2
+
+.Laligned_done:
+  l32i  a8, a2, 8 # get word from s1
+  j       1b
+
+.Lnot_aligned:
+  xor a11, a2, a3 # compare low two bits of s1 and s2
+  bany  a11, a10, .Lunaligned # if they have different alignment
+
+  /* s1/s2 have the same alignment but are not word-aligned.  */
+  movi  a5, 0xfffffffc
+  addi  a2, a2, 1 # advance s1
+  beqz  a9, .Leq  # bytes equal, if zero, strings are equal
+  addi  a3, a3, 1 # advance s2
+  and     a6, a2, a5
+  l32i  a8, a6, 8 # get word from s1
+  bnone a2, a10, .Laligned # if s1/s2 now aligned
+  l8ui  a8, a2, 8 # byte 1 from s1
+  l8ui  a9, a3, 8 # byte 1 from s2
+  addi  a2, a2, 1 # advance s1
+  bne a8, a9, .Lretdiff # if different, return difference
+  beqz  a8, .Leq  # bytes equal, if zero, strings are equal
+  addi  a3, a3, 1 # advance s2
+  and     a6, a2, a5
+  l32i  a8, a6, 8 # get word from s1
+  bnone a2, a10, .Laligned # if s1/s2 now aligned
+  l8ui  a8, a2, 8 # byte 2 from s1
+  l8ui  a9, a3, 8 # byte 2 from s2
+  addi  a2, a2, 1 # advance s1
+  bne a8, a9, .Lretdiff # if different, return difference
+  beqz  a8, .Leq  # bytes equal, if zero, strings are equal
+  addi  a3, a3, 1 # advance s2
+  l32i  a8, a2, 8 # get word from s1
+  j .Laligned
+
+/* s1 and s2 have different alignment.
+
+   If the zero-overhead loop option is available, use an (almost)
+   infinite zero-overhead loop with conditional exits so we only pay
+   for taken branches when exiting the loop.
+
+   Note: It is important for this unaligned case to come before the
+   code for aligned strings, because otherwise some of the branches
+   above cannot reach and have to be transformed to branches around
+   jumps.  The unaligned code is smaller and the branches can reach
+   over it.  */
+
+.Lunaligned:
+  movi.n  a8, 0   # set up for the maximum loop count
+  loop  a8, .Lretdiff # loop forever (almost anyway)
+  l8ui  a8, a2, 8
+  l8ui  a9, a3, 8
+  addi  a2, a2, 1
+  bne a8, a9, .Lretdiff
+  addi  a3, a3, 1
+  beqz  a8, .Lretdiff
+.Lretdiff:
+  sub a2, a8, a9
+  RET(16)
+
+
+.Lprobeq2:
+  /* Adjust pointers to account for the loop unrolling.  */
+        mov a8, a11
+  addi  a2, a2, -4
+  addi  a3, a3, 4
+
+  /* align (0 mod 4) */
+.Lprobeq:
+  /* Words are probably equal, but check for sure.
+     If not, loop over the rest of string using normal algorithm.  */
+
+  bnone a8, a4, .Leq  # if byte 0 is zero
+  movi  a5, MASK1 # mask for byte 1
+  movi  a6, MASK2 # mask for byte 2
+  bnone a8, a5, .Leq  # if byte 1 is zero
+  movi  a7, MASK3 # mask for byte 3
+  bnone a8, a6, .Leq  # if byte 2 is zero
+  bnone a8, a7, .Leq  # if byte 3 is zero
+  /* align (1 mod 4) */
+  addi.n  a2, a2, 12  # advance s1 pointer
+  addi.n  a3, a3, 4 # advance s2 pointer
+  /* align (1 mod 4) or (2 mod 4) */
+1:
+  loop  a0, .Lend # loop forever (a0 exceeds max iters)
+
+  l32i  a8, a2, 0 # get word from s1
+  l32i  a9, a3, 0 # get word from s2
+  addi  a2, a2, 4 # advance s1 pointer
+  bne a8, a9, .Lwne
+  bnone a8, a4, .Leq  # if byte 0 is zero
+  bnone a8, a5, .Leq  # if byte 1 is zero
+  bnone a8, a6, .Leq  # if byte 2 is zero
+  bnone a8, a7, .Leq  # if byte 3 is zero
+  addi  a3, a3, 4 # advance s2 pointer
+.Lend:
+  j 1b
+
+  /* Words are equal; some byte is zero.  */
+.Leq: movi  a2, 0   # return equal
+  RET(16)
+
+.Lwne2: /* Words are not equal.  On big-endian processors, if none of the
+     bytes are zero, the return value can be determined by a simple
+     comparison.  */
+.Lwne:  /* Words are not equal.  */
+  xor a2, a8, a9  # get word with nonzero in byte that differs
+  extui a10, a8, 0, 8
+  extui a11, a9, 0, 8
+  movi  a5, MASK1 # mask for byte 1
+  bany.w15  a2, a4, .Ldiff0 # if byte 0 differs
+
+  bnone.w15 a8, a4, .Leq  # if byte 0 is zero
+  movi  a6, MASK2 # mask for byte 2
+  bany.w15  a2, a5, .Ldiff1 # if byte 1 differs
+  extui a10, a8, 24, 8
+  bnone.w15 a8, a5, .Leq  # if byte 1 is zero
+  extui a11, a9, 24, 8
+  bany.w15  a2, a6, .Ldiff2 # if byte 2 differs
+  sub a2, a10, a11
+  bnone.w15 a8, a6, .Leq  # if byte 2 is zero
+  /* Little-endian is a little more difficult because can't subtract
+     whole words.  */
+.Ldiff3:
+  /* Bytes 0-2 are equal; byte 3 is different.
+     For little-endian need to have a sign bit for the difference.  */
+  RET(16)
+.Ldiff0:
+  /* Byte 0 is different.  */
+  sub a2, a10, a11
+  RET(16)
+
+.Ldiff1:
+  /* Byte 0 is equal; byte 1 is different.  */
+  extui a10, a8, 8, 8
+  extui a11, a9, 8, 8
+  sub a2, a10, a11
+  RET(16)
+
+.Ldiff2:
+  /* Bytes 0-1 are equal; byte 2 is different.  */
+  extui a10, a8, 16, 8
+  extui a11, a9, 16, 8
+  sub a2, a10, a11
+  RET(16)
+
+
+#else /* Not FLIX3 */
+  ENTRY(16)
+  /* a2 = s1, a3 = s2 */
+
+  l8ui  a8, a2, 0 # byte 0 from s1
+  l8ui  a9, a3, 0 # byte 0 from s2
+  movi  a10, 3    # mask
+  bne a8, a9, .Lretdiff
+
+  or  a11, a2, a3
+  bnone a11, a10, .Laligned
+
+  xor a11, a2, a3 # compare low two bits of s1 and s2
+  bany  a11, a10, .Lunaligned # if they have different alignment
+
+  /* s1/s2 have the same alignment but are not word-aligned.  */
+  addi  a2, a2, 1 # advance s1
+  beqz  a8, .Leq  # bytes equal, if zero, strings are equal
+  addi  a3, a3, 1 # advance s2
+  bnone a2, a10, .Laligned # if s1/s2 now aligned
+  l8ui  a8, a2, 0 # byte 1 from s1
+  l8ui  a9, a3, 0 # byte 1 from s2
+  addi  a2, a2, 1 # advance s1
+  bne a8, a9, .Lretdiff # if different, return difference
+  beqz  a8, .Leq  # bytes equal, if zero, strings are equal
+  addi  a3, a3, 1 # advance s2
+  bnone a2, a10, .Laligned # if s1/s2 now aligned
+  l8ui  a8, a2, 0 # byte 2 from s1
+  l8ui  a9, a3, 0 # byte 2 from s2
+  addi  a2, a2, 1 # advance s1
+  bne a8, a9, .Lretdiff # if different, return difference
+  beqz  a8, .Leq  # bytes equal, if zero, strings are equal
+  addi  a3, a3, 1 # advance s2
+  j .Laligned
+
+/* s1 and s2 have different alignment.
+
+   If the zero-overhead loop option is available, use an (almost)
+   infinite zero-overhead loop with conditional exits so we only pay
+   for taken branches when exiting the loop.
+
+   Note: It is important for this unaligned case to come before the
+   code for aligned strings, because otherwise some of the branches
+   above cannot reach and have to be transformed to branches around
+   jumps.  The unaligned code is smaller and the branches can reach
+   over it.  */
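
As a rough C model of this unaligned fallback (a sketch, not part of the
patch), the loop exits on the first mismatch or terminator and returns the
byte difference, which is exactly what .Lretdiff computes:

    /* Byte-at-a-time compare used when s1 and s2 have different
     * alignment.  The zero-overhead loop only pays for the taken
     * branch that exits it.
     */

    static int strcmp_unaligned(const unsigned char *s1,
                                const unsigned char *s2)
    {
      for (; ; s1++, s2++)
        {
          if (*s1 != *s2 || *s1 == '\0')
            {
              return *s1 - *s2;   /* .Lretdiff: sub a2, a8, a9 */
            }
        }
    }
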
+
+  .align  4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  /* (2 mod 4) alignment for loop instruction */
+#else
+  /* (1 mod 4) alignment for loop instruction */
+  .byte 0
+  .byte 0
+#endif
+#endif
+.Lunaligned:
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  _movi.n a8, 0   # set up for the maximum loop count
+#else
+  _movi a8, 0   # set up for the maximum loop count
+#endif
+  loop  a8, .Lretdiff # loop forever (almost anyway)
+#endif
+.Lnextbyte:
+  l8ui  a8, a2, 0
+  l8ui  a9, a3, 0
+  addi  a2, a2, 1
+  bne a8, a9, .Lretdiff
+  addi  a3, a3, 1
+#if XCHAL_HAVE_LOOPS
+  beqz  a8, .Lretdiff
+#else
+  bnez  a8, .Lnextbyte
+#endif
+.Lretdiff:
+  sub a2, a8, a9
+  RET(16)
+
+/* s1 is word-aligned; s2 is word-aligned.
+
+   If the zero-overhead loop option is available, use an (almost)
+   infinite zero-overhead loop with conditional exits so we only pay
+   for taken branches when exiting the loop.  */
+
+/* New algorithm, relying on the fact that all normal ASCII is between
+   32 and 127.
+
+   Rather than check all bytes for zero:
+   Take one word (4 bytes).  Call it w1.
+   Shift w1 left by one into w1'.
+   Or w1 and w1'.  For all normal ASCII bit 6 will be 1; for zero it won't.
+   Check that all 4 bit 6's (one for each byte) are one:
+   If they are, we are definitely not done.
+   If they are not, we are probably done, but need to check for zero.  */
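
The probe can be written out in C to make the heuristic concrete (a sketch
under the assumption that MASK4 is the 0x40404040 constant tested by
ball/bnall, i.e. one copy of bit 6 per byte):

    #include <stdint.h>

    /* Returns nonzero when word w certainly contains no zero byte.
     * For any byte b in 32..127, (b | (b << 1)) has bit 6 set; for
     * b == 0 it does not.  A zero result only means "maybe a zero
     * byte" (e.g. for non-ASCII bytes), so the slow path .Lprobeq
     * must then check each byte individually.
     */

    static int no_zero_byte_probe(uint32_t w)
    {
      uint32_t t = w | (w << 1);               /* slli + or        */
      return (t & 0x40404040u) == 0x40404040u; /* ball a9, a7, ... */
    }
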
+
+  .align  4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_CONST16
+  /* (2 mod 4) alignment for loop instruction */
+  .byte 0
+#endif
+.Laligned:
+  movi  a4, MASK0 # mask for byte 0
+  movi  a7, MASK4
+
+  /* Loop forever */
+1:
+  loop  a0, .Laligned_done
+
+  /* First unrolled loop body.  */
+  l32i  a8, a2, 0 # get word from s1
+  l32i  a9, a3, 0 # get word from s2
+  slli  a5, a8, 1
+  bne a8, a9, .Lwne2
+  or  a9, a8, a5
+  bnall a9, a7, .Lprobeq
+
+  /* Second unrolled loop body.  */
+  l32i  a8, a2, 4 # get word from s1+4
+  l32i  a9, a3, 4 # get word from s2+4
+  slli  a5, a8, 1
+  bne a8, a9, .Lwne2
+  or  a9, a8, a5
+  bnall a9, a7, .Lprobeq2
+
+  addi  a2, a2, 8 # advance s1 pointer
+  addi  a3, a3, 8 # advance s2 pointer
+.Laligned_done:
+  j       1b
+
+.Lprobeq2:
+  /* Adjust pointers to account for the loop unrolling.  */
+  addi  a2, a2, 4
+  addi  a3, a3, 4
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+.Laligned:
+  movi  a4, MASK0 # mask for byte 0
+  movi  a7, MASK4
+  j .Lfirstword
+.Lnextword:
+  addi  a2, a2, 4 # advance s1 pointer
+  addi  a3, a3, 4 # advance s2 pointer
+.Lfirstword:
+  l32i  a8, a2, 0 # get word from s1
+  l32i  a9, a3, 0 # get word from s2
+  slli  a5, a8, 1
+  bne a8, a9, .Lwne2
+  or  a9, a8, a5
+  ball  a9, a7, .Lnextword
+#endif /* !XCHAL_HAVE_LOOPS */
+
+  /* align (0 mod 4) */
+.Lprobeq:
+  /* Words are probably equal, but check for sure.
+     If not, loop over the rest of string using normal algorithm.  */
+
+  bnone a8, a4, .Leq  # if byte 0 is zero
+  movi  a5, MASK1 # mask for byte 1
+  movi  a6, MASK2 # mask for byte 2
+  bnone a8, a5, .Leq  # if byte 1 is zero
+  movi  a7, MASK3 # mask for byte 3
+  bnone a8, a6, .Leq  # if byte 2 is zero
+  bnone a8, a7, .Leq  # if byte 3 is zero
+  /* align (1 mod 4) */
+#if XCHAL_HAVE_DENSITY
+  addi.n  a2, a2, 4 # advance s1 pointer
+  addi.n  a3, a3, 4 # advance s2 pointer
+  /* align (1 mod 4) or (2 mod 4) */
+#else
+  addi  a2, a2, 4 # advance s1 pointer
+  addi  a3, a3, 4 # advance s2 pointer
+  or  a1, a1, a1  # nop
+#if XCHAL_HAVE_CONST16
+  or  a1, a1, a1  # nop
+#endif
+  /* align (2 mod 4) */
+#endif /* XCHAL_HAVE_DENSITY */
+#if XCHAL_HAVE_LOOPS
+1:
+  loop  a0, .Leq  # loop forever (a0 exceeds max iters)
+  l32i  a8, a2, 0 # get word from s1
+  l32i  a9, a3, 0 # get word from s2
+  addi  a2, a2, 4 # advance s1 pointer
+  bne a8, a9, .Lwne
+  bnone a8, a4, .Leq  # if byte 0 is zero
+  bnone a8, a5, .Leq  # if byte 1 is zero
+  bnone a8, a6, .Leq  # if byte 2 is zero
+  bnone a8, a7, .Leq  # if byte 3 is zero
+  addi  a3, a3, 4 # advance s2 pointer
+  j 1b
+#else /* !XCHAL_HAVE_LOOPS */
+
+  j .Lfirstword2
+.Lnextword2:
+  addi  a3, a3, 4 # advance s2 pointer
+.Lfirstword2:
+  l32i  a8, a2, 0 # get word from s1
+  l32i  a9, a3, 0 # get word from s2
+  addi  a2, a2, 4 # advance s1 pointer
+  bne a8, a9, .Lwne
+  bnone a8, a4, .Leq  # if byte 0 is zero
+  bnone a8, a5, .Leq  # if byte 1 is zero
+  bnone a8, a6, .Leq  # if byte 2 is zero
+  bany  a8, a7, .Lnextword2 # continue if byte 3 is nonzero
+#endif /* !XCHAL_HAVE_LOOPS */
+
+  /* Words are equal; some byte is zero.  */
+.Leq: movi  a2, 0   # return equal
+  RET(16)
+
+.Lwne2: /* Words are not equal.  On big-endian processors, if none of the
+     bytes are zero, the return value can be determined by a simple
+     comparison.  */
+#if XCHAL_HAVE_BE
+  or  a10, a8, a5
+  bnall a10, a7, .Lsomezero
+  bgeu  a8, a9, .Lposreturn
+  movi  a2, -1
+  RET(16)
+.Lposreturn:
+  movi  a2, 1
+  RET(16)
+.Lsomezero: # There is probably some zero byte.
+#endif /* XCHAL_HAVE_BE */
+.Lwne:  /* Words are not equal.  */
+  xor a2, a8, a9  # get word with nonzero in byte that differs
+  bany  a2, a4, .Ldiff0 # if byte 0 differs
+  movi  a5, MASK1 # mask for byte 1
+  bnone a8, a4, .Leq  # if byte 0 is zero
+  bany  a2, a5, .Ldiff1 # if byte 1 differs
+  movi  a6, MASK2 # mask for byte 2
+  bnone a8, a5, .Leq  # if byte 1 is zero
+  bany  a2, a6, .Ldiff2 # if byte 2 differs
+  bnone a8, a6, .Leq  # if byte 2 is zero
+#if XCHAL_HAVE_BE
+.Ldiff3:
+.Ldiff2:
+.Ldiff1:
+  /* Byte 0 is equal (at least) and there is a difference before a zero
+     byte.  Just subtract words to get the return value.
+     The high order equal bytes cancel, leaving room for the sign.  */
+  sub a2, a8, a9
+  RET(16)
+
+.Ldiff0:
+  /* Need to make room for the sign, so can't subtract whole words.  */
+  extui a10, a8, 24, 8
+  extui a11, a9, 24, 8
+  sub a2, a10, a11
+  RET(16)
+
+#else /* !XCHAL_HAVE_BE */
+  /* Little-endian is a little more difficult because can't subtract
+     whole words.  */
+.Ldiff3:
+  /* Bytes 0-2 are equal; byte 3 is different.
+     For little-endian need to have a sign bit for the difference.  */
+  extui a10, a8, 24, 8
+  extui a11, a9, 24, 8
+  sub a2, a10, a11
+  RET(16)
+
+.Ldiff0:
+  /* Byte 0 is different.  */
+  extui a10, a8, 0, 8
+  extui a11, a9, 0, 8
+  sub a2, a10, a11
+  RET(16)
+
+.Ldiff1:
+  /* Byte 0 is equal; byte 1 is different.  */
+  extui a10, a8, 8, 8
+  extui a11, a9, 8, 8
+  sub a2, a10, a11
+  RET(16)
+
+.Ldiff2:
+  /* Bytes 0-1 are equal; byte 2 is different.  */
+  extui a10, a8, 16, 8
+  extui a11, a9, 16, 8
+  sub a2, a10, a11
+  RET(16)
+
+#endif /* !XCHAL_HAVE_BE */
+#endif /* FLIX3 */
+#endif /* FLIX3 */
+
+  .end  schedule
+  .size strcmp, . - strcmp
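
The .Ldiff0-.Ldiff3 tails all reduce to the same operation: extract the
first differing byte from each word and subtract it on its own, since on
little-endian cores whole words cannot simply be subtracted without losing
the sign. A hedged C equivalent:

    #include <stdint.h>

    /* Signed difference of byte lane n (n = string offset within the
     * word on little-endian cores), mirroring the extui/extui/sub
     * sequences at .Ldiff0-.Ldiff3.
     */

    static int byte_lane_diff(uint32_t w1, uint32_t w2, int n)
    {
      int sh = 8 * n;                              /* extui ..., sh, 8 */
      return (int)((w1 >> sh) & 0xff)
           - (int)((w2 >> sh) & 0xff);             /* sub a2, a10, a11 */
    }
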
+
diff --git a/libs/libc/machine/xtensa/arch_strcpy.S b/libs/libc/machine/xtensa/arch_strcpy.S
new file mode 100644
index 0000000..b062d87
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_strcpy.S
@@ -0,0 +1,243 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_strcpy.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+  .section .text
+  .begin schedule
+  .align  4
+  .literal_position
+  .global strcpy
+  .type strcpy, @function
+strcpy:
+  ENTRY(16)
+  /* a2 = dst, a3 = src */
+
+  mov a10, a2   # leave dst in return value register
+  movi  a4, MASK0
+  movi  a5, MASK1
+  movi  a6, MASK2
+  movi  a7, MASK3
+  bbsi.l  a3, 0, .Lsrc1mod2
+  bbsi.l  a3, 1, .Lsrc2mod4
+.Lsrcaligned:
+
+  /* Check if the destination is aligned.  */
+  movi  a8, 3
+  bnone a10, a8, .Laligned
+
+  j .Ldstunaligned
+
+.Lsrc1mod2: # src address is odd
+  l8ui  a8, a3, 0 # get byte 0
+  addi  a3, a3, 1 # advance src pointer
+  s8i a8, a10, 0  # store byte 0
+  beqz  a8, 1f    # if byte 0 is zero
+  addi  a10, a10, 1 # advance dst pointer
+  bbci.l  a3, 1, .Lsrcaligned # if src is now word-aligned
+
+.Lsrc2mod4: # src address is 2 mod 4
+  l8ui  a8, a3, 0 # get byte 0
+  /* 1-cycle interlock */
+  s8i a8, a10, 0  # store byte 0
+  beqz  a8, 1f    # if byte 0 is zero
+  l8ui  a8, a3, 1 # get byte 1
+  addi  a3, a3, 2 # advance src pointer
+  s8i a8, a10, 1  # store byte 1
+  addi  a10, a10, 2 # advance dst pointer
+  bnez  a8, .Lsrcaligned
+1:  RET(16)
+
+
+/* dst is word-aligned; src is word-aligned.  */
+
+  .align  4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  /* (2 mod 4) alignment for loop instruction */
+#else
+  /* (1 mod 4) alignment for loop instruction */
+  .byte 0
+  .byte 0
+#endif
+.Laligned:
+#if XCHAL_HAVE_DENSITY
+  _movi.n a8, 0   # set up for the maximum loop count
+#else
+  _movi a8, 0   # set up for the maximum loop count
+#endif
+  loop  a8, .Lz3  # loop forever (almost anyway)
+  l32i  a8, a3, 0 # get word from src
+  addi  a3, a3, 4 # advance src pointer
+  bnone a8, a4, .Lz0  # if byte 0 is zero
+  bnone a8, a5, .Lz1  # if byte 1 is zero
+  bnone a8, a6, .Lz2  # if byte 2 is zero
+  s32i  a8, a10, 0  # store word to dst
+  bnone a8, a7, .Lz3  # if byte 3 is zero
+  addi  a10, a10, 4 # advance dst pointer
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+1:  addi  a10, a10, 4 # advance dst pointer
+.Laligned:
+  l32i  a8, a3, 0 # get word from src
+  addi  a3, a3, 4 # advance src pointer
+  bnone a8, a4, .Lz0  # if byte 0 is zero
+  bnone a8, a5, .Lz1  # if byte 1 is zero
+  bnone a8, a6, .Lz2  # if byte 2 is zero
+  s32i  a8, a10, 0  # store word to dst
+  bany  a8, a7, 1b  # loop if byte 3 is nonzero
+#endif /* !XCHAL_HAVE_LOOPS */
+
+.Lz3: /* Byte 3 is zero.  */
+  RET(16)
+
+.Lz0: /* Byte 0 is zero.  */
+#if XCHAL_HAVE_BE
+  movi  a8, 0
+#endif
+  s8i a8, a10, 0
+  RET(16)
+
+.Lz1: /* Byte 1 is zero.  */
+#if XCHAL_HAVE_BE
+        extui   a8, a8, 16, 16
+#endif
+  s16i  a8, a10, 0
+  RET(16)
+
+.Lz2: /* Byte 2 is zero.  */
+#if XCHAL_HAVE_BE
+        extui   a8, a8, 16, 16
+#endif
+  s16i  a8, a10, 0
+  movi  a8, 0
+  s8i a8, a10, 2
+  RET(16)
+
+#if 1
+/* For now just use a byte-copy loop for the unaligned destination case.  */
+
+  .align  4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  /* (2 mod 4) alignment for loop instruction */
+#else
+  /* (1 mod 4) alignment for loop instruction */
+  .byte 0
+  .byte 0
+#endif
+#endif
+.Ldstunaligned:
+
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  _movi.n a8, 0   # set up for the maximum loop count
+#else
+  _movi a8, 0   # set up for the maximum loop count
+#endif
+  loop  a8, 2f    # loop forever (almost anyway)
+#endif
+1:  l8ui  a8, a3, 0
+  addi  a3, a3, 1
+  s8i a8, a10, 0
+  addi  a10, a10, 1
+#if XCHAL_HAVE_LOOPS
+  beqz  a8, 2f
+#else
+  bnez  a8, 1b
+#endif
+2:  RET(16)
+
+#else /* 0 */
+
+/* This code is not functional yet.  */
+
+.Ldstunaligned:
+  l32i  a9, a2, 0 # load word from dst
+#if XCHAL_HAVE_BE
+  ssa8b a9    # rotate by dst alignment so that
+  src a9, a9, a9  # shift in loop will put back in place
+  ssa8l a9    # shift left by byte*8
+#else
+  ssa8l a9    # rotate by dst alignment so that
+  src a9, a9, a9  # shift in loop will put back in place
+  ssa8b a9    # shift left by 32-byte*8
+#endif
+
+/* dst is word-aligned; src is unaligned.  */
+
+.Ldstunalignedloop:
+  l32i  a8, a3, 0 # get word from src
+  /* 1-cycle interlock */
+  bnone a8, a4, .Lu0  # if byte 0 is zero
+  bnone a8, a5, .Lu1  # if byte 1 is zero
+  bnone a8, a6, .Lu2  # if byte 2 is zero
+  src a9, a8, a9  # combine last word and this word
+  s32i  a9, a10, 0  # store word to dst
+  bnone a8, a7, .Lu3  # if byte 3 is zero
+  l32i  a9, a3, 4 # get word from src
+  addi  a3, a3, 8 # advance src pointer
+  bnone a9, a4, .Lu4  # if byte 0 is zero
+  bnone a9, a5, .Lu5  # if byte 1 is zero
+  bnone a9, a6, .Lu6  # if byte 2 is zero
+  src a8, a9, a8  # combine last word and this word
+  s32i  a8, a10, 4  # store word to dst
+  addi  a10, a10, 8 # advance dst pointer
+  bany  a8, a7, .Ldstunalignedloop # if byte 3 is nonzero, iterate
+
+  /* Byte 7 is zero.  */
+.Lu7: RET(16)
+
+.Lu0: /* Byte 0 is zero.  */
+#if XCHAL_HAVE_BE
+  movi  a8, 0
+#endif
+  s8i a8, a10, 0
+  RET(16)
+
+.Lu1: /* Byte 1 is zero.  */
+#if XCHAL_HAVE_BE
+  extui   a8, a8, 16, 16
+#endif
+  s16i  a8, a10, 0
+  RET(16)
+
+.Lu2: /* Byte 2 is zero.  */
+  s16i  a8, a10, 0
+  movi  a8, 0
+  s8i a8, a10, 2
+  RET(16)
+
+#endif /* 0 */
+  .end schedule
+
+  .size strcpy, . - strcpy
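
For reference, the aligned fast path above behaves like this C sketch
(little-endian masks shown; it assumes src is word-aligned and that reading
the whole terminating word is safe, which the aligned l32i guarantees on
this hardware but plain C does not):

    #include <stdint.h>
    #include <string.h>

    static char *strcpy_aligned_model(char *dst, const char *src)
    {
      char *d = dst;
      uint32_t w;

      for (;;)
        {
          memcpy(&w, src, 4);             /* l32i: word load from src */
          src += 4;
          if ((w & 0x000000ffu) == 0)     /* .Lz0: single zero byte   */
            {
              d[0] = 0;
              return dst;
            }
          if ((w & 0x0000ff00u) == 0)     /* .Lz1: s16i stores byte 0
                                           * plus the zero byte 1     */
            {
              memcpy(d, &w, 2);
              return dst;
            }
          if ((w & 0x00ff0000u) == 0)     /* .Lz2: s16i + s8i         */
            {
              memcpy(d, &w, 2);
              d[2] = 0;
              return dst;
            }
          memcpy(d, &w, 4);               /* s32i: store whole word   */
          if ((w & 0xff000000u) == 0)     /* .Lz3                     */
            {
              return dst;
            }
          d += 4;
        }
    }
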
diff --git a/libs/libc/machine/xtensa/arch_strlen.S b/libs/libc/machine/xtensa/arch_strlen.S
new file mode 100644
index 0000000..686268e
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_strlen.S
@@ -0,0 +1,123 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_strlen.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+  .section .text
+  .begin schedule
+  .align  4
+  .literal_position
+  .global strlen
+  .type strlen, @function
+strlen:
+  ENTRY(16)
+  /* a2 = s */
+
+  addi  a3, a2, -4  # because we overincrement at the end
+  movi  a4, MASK0
+  movi  a5, MASK1
+  movi  a6, MASK2
+  movi  a7, MASK3
+  bbsi.l  a2, 0, .L1mod2
+  bbsi.l  a2, 1, .L2mod4
+  j .Laligned
+
+.L1mod2: # address is odd
+  l8ui  a8, a3, 4 # get byte 0
+  addi  a3, a3, 1 # advance string pointer
+  beqz  a8, .Lz3  # if byte 0 is zero
+  bbci.l  a3, 1, .Laligned # if string pointer is now word-aligned
+
+.L2mod4: # address is 2 mod 4
+  addi  a3, a3, 2 # advance ptr for aligned access
+  l32i  a8, a3, 0 # get word with first two bytes of string
+  bnone a8, a6, .Lz2  # if byte 2 (of word, not string) is zero
+  bany  a8, a7, .Laligned # if byte 3 (of word, not string) is nonzero
+
+  /* Byte 3 is zero.  */
+  addi  a3, a3, 3 # point to zero byte
+  sub a2, a3, a2  # subtract to get length
+  RET(16)
+
+
+/* String is word-aligned.  */
+
+  .align  4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  /* (2 mod 4) alignment for loop instruction */
+#else
+  /* (1 mod 4) alignment for loop instruction */
+  .byte 0
+  .byte 0
+#endif
+#endif
+.Laligned:
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  _movi.n a8, 0   # set up for the maximum loop count
+#else
+  _movi a8, 0   # set up for the maximum loop count
+#endif
+  loop  a8, .Lz3  # loop forever (almost anyway)
+#endif
+1:  l32i  a8, a3, 4 # get next word of string
+  addi  a3, a3, 4 # advance string pointer
+  bnone a8, a4, .Lz0  # if byte 0 is zero
+  bnone a8, a5, .Lz1  # if byte 1 is zero
+  bnone a8, a6, .Lz2  # if byte 2 is zero
+#if XCHAL_HAVE_LOOPS
+  bnone a8, a7, .Lz3  # if byte 3 is zero
+#else
+  bany  a8, a7, 1b  # repeat if byte 3 is non-zero
+#endif
+
+.Lz3: /* Byte 3 is zero.  */
+  addi  a3, a3, 3 # point to zero byte
+  /* Fall through....  */
+
+.Lz0: /* Byte 0 is zero.  */
+  sub a2, a3, a2  # subtract to get length
+  RET(16)
+
+.Lz1: /* Byte 1 is zero.  */
+  addi  a3, a3, 1 # point to zero byte
+  sub a2, a3, a2  # subtract to get length
+  RET(16)
+
+.Lz2: /* Byte 2 is zero.  */
+  addi  a3, a3, 2 # point to zero byte
+  sub a2, a3, a2  # subtract to get length
+  RET(16)
+
+  .end schedule
+
+  .size strlen, . - strlen
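
The same word scan appears in strlen; a C sketch of the aligned loop
(assuming the pointer has already been word-aligned, which is what the -4
bias and the offset-4 loads arrange):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static size_t strlen_aligned_model(const char *s)
    {
      const char *p = s;     /* assume p is word-aligned here */
      uint32_t w;

      for (;; p += 4)
        {
          memcpy(&w, p, 4);                                 /* l32i a8, a3, 4 */
          if ((w & 0x000000ffu) == 0) return (size_t)(p - s);      /* .Lz0 */
          if ((w & 0x0000ff00u) == 0) return (size_t)(p - s) + 1;  /* .Lz1 */
          if ((w & 0x00ff0000u) == 0) return (size_t)(p - s) + 2;  /* .Lz2 */
          if ((w & 0xff000000u) == 0) return (size_t)(p - s) + 3;  /* .Lz3 */
        }
    }
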
diff --git a/libs/libc/machine/xtensa/arch_strncpy.S b/libs/libc/machine/xtensa/arch_strncpy.S
new file mode 100644
index 0000000..297f00c
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_strncpy.S
@@ -0,0 +1,265 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_strncpy.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+  .section .text
+  .begin schedule
+  .align  4
+  .literal_position
+__strncpy_aux:
+
+.Lsrc1mod2: # src address is odd
+  l8ui  a8, a3, 0 # get byte 0
+  addi  a3, a3, 1 # advance src pointer
+  s8i a8, a10, 0  # store byte 0
+  addi  a4, a4, -1  # decrement n
+  beqz    a4, .Lret       # if n is zero
+  addi  a10, a10, 1 # advance dst pointer
+  beqz  a8, .Lfill  # if byte 0 is zero
+  bbci.l  a3, 1, .Lsrcaligned # if src is now word-aligned
+
+.Lsrc2mod4: # src address is 2 mod 4
+  l8ui  a8, a3, 0 # get byte 0
+  addi  a4, a4, -1  # decrement n
+  s8i a8, a10, 0  # store byte 0
+  beqz    a4, .Lret       # if n is zero
+  addi  a10, a10, 1 # advance dst pointer
+  beqz  a8, .Lfill  # if byte 0 is zero
+  l8ui  a8, a3, 1 # get byte 1
+  addi  a3, a3, 2 # advance src pointer
+  s8i a8, a10, 0  # store byte 1
+  addi  a4, a4, -1  # decrement n
+  beqz    a4, .Lret       # if n is zero
+  addi  a10, a10, 1 # advance dst pointer
+  bnez  a8, .Lsrcaligned
+  j .Lfill
+
+.Lret:
+  RET(16)
+
+  .align  4
+  .global strncpy
+  .type strncpy, @function
+strncpy:
+  ENTRY(16)
+  /* a2 = dst, a3 = src */
+
+  mov a10, a2   # leave dst in return value register
+  beqz    a4, .Lret       # if n is zero
+
+  movi  a11, MASK0
+  movi  a5, MASK1
+  movi  a6, MASK2
+  movi  a7, MASK3
+  bbsi.l  a3, 0, .Lsrc1mod2
+  bbsi.l  a3, 1, .Lsrc2mod4
+.Lsrcaligned:
+
+  /* Check if the destination is aligned.  */
+  movi  a8, 3
+  bnone a10, a8, .Laligned
+
+  j .Ldstunaligned
+
+
+/* Fill the dst with zeros -- n is at least 1.  */
+
+.Lfill:
+  movi  a9, 0
+  bbsi.l  a10, 0, .Lfill1mod2
+  bbsi.l  a10, 1, .Lfill2mod4
+.Lfillaligned:
+  blti  a4, 4, .Lfillcleanup
+
+  /* Loop filling complete words with zero.  */
+#if XCHAL_HAVE_LOOPS
+
+  srai  a8, a4, 2
+  loop  a8, 1f
+  s32i  a9, a10, 0
+  addi  a10, a10, 4
+
+1:  slli  a8, a8, 2
+  sub a4, a4, a8
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+1:  s32i  a9, a10, 0
+  addi  a10, a10, 4
+  addi  a4, a4, -4
+  bgei    a4, 4, 1b
+
+#endif /* !XCHAL_HAVE_LOOPS */
+
+  beqz  a4, 2f
+
+.Lfillcleanup:
+  /* Fill leftover (1 to 3) bytes with zero.  */
+  s8i a9, a10, 0  # store byte 0
+  addi  a4, a4, -1  # decrement n
+  addi  a10, a10, 1
+  bnez    a4, .Lfillcleanup
+
+2:  RET(16)
+
+.Lfill1mod2: # dst address is odd
+  s8i a9, a10, 0  # store byte 0
+  addi  a4, a4, -1  # decrement n
+  beqz    a4, 2b    # if n is zero
+  addi    a10, a10, 1 # advance dst pointer
+  bbci.l  a10, 1, .Lfillaligned # if dst is now word-aligned
+
+.Lfill2mod4: # dst address is 2 mod 4
+  s8i a9, a10, 0  # store byte 0
+  addi  a4, a4, -1  # decrement n
+  beqz    a4, 2b    # if n is zero
+  s8i a9, a10, 1  # store byte 1
+  addi  a4, a4, -1  # decrement n
+  beqz    a4, 2b    # if n is zero
+  addi    a10, a10, 2 # advance dst pointer
+  j .Lfillaligned
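
The word-fill bookkeeping above is easy to misread: the loop count is
n / 4, and only after the word loop is n reduced by the number of bytes
that loop consumed. A C rendering (sketch only):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void fill_zeros_model(char *d, size_t n)
    {
      size_t words = n >> 2;          /* srai a8, a4, 2              */
      uint32_t zero = 0;

      while (words-- != 0)            /* loop a8, 1f                 */
        {
          memcpy(d, &zero, 4);        /* s32i a9, a10, 0             */
          d += 4;
        }

      n &= 3;                         /* slli + sub: n -= 4 * (n/4)  */

      while (n-- != 0)                /* .Lfillcleanup               */
        {
          *d++ = '\0';
        }
    }
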
+
+
+/* dst is word-aligned; src is word-aligned; n is at least 1.  */
+
+  .align  4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  /* (2 mod 4) alignment for loop instruction */
+#else
+  /* (1 mod 4) alignment for loop instruction */
+  .byte 0
+  .byte 0
+#endif
+#endif
+.Laligned:
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  _movi.n a8, 0   # set up for the maximum loop count
+#else
+  _movi a8, 0   # set up for the maximum loop count
+#endif
+  loop  a8, 1f    # loop forever (almost anyway)
+  blti  a4, 5, .Ldstunaligned # n is near limit; do one at a time
+  l32i  a8, a3, 0 # get word from src
+  addi  a3, a3, 4 # advance src pointer
+  bnone a8, a11, .Lz0 # if byte 0 is zero
+  bnone a8, a5, .Lz1  # if byte 1 is zero
+  bnone a8, a6, .Lz2  # if byte 2 is zero
+  s32i  a8, a10, 0  # store word to dst
+  addi  a4, a4, -4  # decrement n
+  addi  a10, a10, 4 # advance dst pointer
+  bnone a8, a7, .Lfill  # if byte 3 is zero
+1:
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+1:  blti  a4, 5, .Ldstunaligned # n is near limit; do one at a time
+  l32i  a8, a3, 0 # get word from src
+  addi  a3, a3, 4 # advance src pointer
+  bnone a8, a11, .Lz0 # if byte 0 is zero
+  bnone a8, a5, .Lz1  # if byte 1 is zero
+  bnone a8, a6, .Lz2  # if byte 2 is zero
+  s32i  a8, a10, 0  # store word to dst
+  addi  a4, a4, -4  # decrement n
+  addi  a10, a10, 4 # advance dst pointer
+  bany  a8, a7, 1b  # no zeroes
+#endif /* !XCHAL_HAVE_LOOPS */
+
+  j .Lfill
+
+.Lz0: /* Byte 0 is zero.  */
+#if XCHAL_HAVE_BE
+  movi  a8, 0
+#endif
+  s8i a8, a10, 0
+  addi  a4, a4, -1  # decrement n
+  addi  a10, a10, 1 # advance dst pointer
+  j .Lfill
+
+.Lz1: /* Byte 1 is zero.  */
+#if XCHAL_HAVE_BE
+        extui   a8, a8, 16, 16
+#endif
+  s16i  a8, a10, 0
+  addi  a4, a4, -2  # decrement n
+  addi  a10, a10, 2 # advance dst pointer
+  j .Lfill
+
+.Lz2: /* Byte 2 is zero.  */
+#if XCHAL_HAVE_BE
+  extui   a8, a8, 16, 16
+#endif
+  s16i  a8, a10, 0
+  movi  a8, 0
+  s8i a8, a10, 2
+  addi  a4, a4, -3  # decrement n
+  addi  a10, a10, 3 # advance dst pointer
+  j .Lfill
+
+  .align  4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  /* (2 mod 4) alignment for loop instruction */
+#else
+  /* (1 mod 4) alignment for loop instruction */
+  .byte 0
+  .byte 0
+#endif
+#endif
+.Ldstunaligned:
+
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  _movi.n a8, 0   # set up for the maximum loop count
+#else
+  _movi a8, 0   # set up for the maximum loop count
+#endif
+  loop  a8, 2f    # loop forever (almost anyway)
+#endif
+1:  l8ui  a8, a3, 0
+  addi  a3, a3, 1
+  s8i a8, a10, 0
+  addi  a4, a4, -1
+  beqz  a4, 3f
+  addi  a10, a10, 1
+#if XCHAL_HAVE_LOOPS
+  beqz  a8, 2f
+#else
+  bnez  a8, 1b
+#endif
+2:  j .Lfill
+
+3:  RET(16)
+  .end schedule
+
+  .size strncpy, . - strncpy
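
Putting the copy and fill phases together, the routine implements the
standard strncpy contract; a plain C model for comparison:

    #include <stddef.h>

    static char *strncpy_model(char *dst, const char *src, size_t n)
    {
      size_t i = 0;

      while (i < n && src[i] != '\0')   /* copy phase           */
        {
          dst[i] = src[i];
          i++;
        }

      while (i < n)                     /* .Lfill: zero padding */
        {
          dst[i++] = '\0';
        }

      return dst;
    }
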
diff --git a/libs/libc/machine/xtensa/xtensa_asm.h b/libs/libc/machine/xtensa/xtensa_asm.h
new file mode 100644
index 0000000..9913763
--- /dev/null
+++ b/libs/libc/machine/xtensa/xtensa_asm.h
@@ -0,0 +1,62 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/xtensa_asm.h
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include <arch/chip/core-isa.h>
+
+/****************************************************************************
+ * Assembly Language Macros
+ ****************************************************************************/
+
+  .macro  src_b r, w0, w1
+#if XCHAL_HAVE_BE
+  src \r, \w0, \w1
+#else
+  src \r, \w1, \w0
+#endif
+  .endm
+
+  .macro  ssa8  r
+#if XCHAL_HAVE_BE
+  ssa8b \r
+#else
+  ssa8l \r
+#endif
+  .endm
+
+/****************************************************************************
+ * Pre-processor Macros
+ ****************************************************************************/
+
+#if XCHAL_HAVE_BE
+#  define MASK0 0xff000000
+#  define MASK1 0x00ff0000
+#  define MASK2 0x0000ff00
+#  define MASK3 0x000000ff
+#else
+#  define MASK0 0x000000ff
+#  define MASK1 0x0000ff00
+#  define MASK2 0x00ff0000
+#  define MASK3 0xff000000
+#endif
+