Posted to commits@nuttx.apache.org by xi...@apache.org on 2021/11/06 12:39:39 UTC

[incubator-nuttx] 02/03: libc:machine:xtensa:add xtensa libc implement

This is an automated email from the ASF dual-hosted git repository.

xiaoxiang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nuttx.git

commit cfcff5f570319192af6f64d4e7407c991488cff8
Author: zhuyanlin <zh...@xiaomi.com>
AuthorDate: Thu Oct 28 11:56:18 2021 +0800

    libc:machine:xtensa:add xtensa libc implement
    
    N/A
    
    Signed-off-by: zhuyanlin <zh...@xiaomi.com>
---
 libs/libc/machine/xtensa/Kconfig        |  43 ++
 libs/libc/machine/xtensa/Make.defs      |  31 +-
 libs/libc/machine/xtensa/arch_memcpy.S  | 281 ++++++++++++
 libs/libc/machine/xtensa/arch_memmove.S | 480 ++++++++++++++++++++
 libs/libc/machine/xtensa/arch_memset.S  | 179 ++++++++
 libs/libc/machine/xtensa/arch_strcmp.S  | 767 ++++++++++++++++++++++++++++++++
 libs/libc/machine/xtensa/arch_strcpy.S  | 243 ++++++++++
 libs/libc/machine/xtensa/arch_strlen.S  | 123 +++++
 libs/libc/machine/xtensa/arch_strncpy.S | 265 +++++++++++
 libs/libc/machine/xtensa/xtensa_asm.h   |  62 +++
 10 files changed, 2472 insertions(+), 2 deletions(-)

diff --git a/libs/libc/machine/xtensa/Kconfig b/libs/libc/machine/xtensa/Kconfig
index f72f3c0..232fb73 100644
--- a/libs/libc/machine/xtensa/Kconfig
+++ b/libs/libc/machine/xtensa/Kconfig
@@ -2,3 +2,46 @@
 # For a description of the syntax of this configuration file,
 # see the file kconfig-language.txt in the NuttX tools repository.
 #
+
+config XTENSA_MEMCPY
+        bool "Enable optimized memcpy() for XTENSA"
+        select LIBC_ARCH_MEMCPY
+        ---help---
+                Enable optimized XTENSA specific memcpy() library function
+
+config XTENSA_MEMMOVE
+        bool "Enable optimized memmove() for XTENSA"
+        select LIBC_ARCH_MEMMOVE
+        ---help---
+                Enable optimized XTENSA specific memmove() library function
+
+config XTENSA_MEMSET
+        bool "Enable optimized memset() for XTENSA"
+        select LIBC_ARCH_MEMSET
+        ---help---
+                Enable optimized XTENSA specific memset() library function
+
+config XTENSA_STRCMP
+        bool "Enable optimized strcmp() for XTENSA"
+        select LIBC_ARCH_STRCMP
+        ---help---
+                Enable optimized XTENSA specific strcmp() library function
+
+config XTENSA_STRCPY
+        bool "Enable optimized strcpy() for XTENSA"
+        select LIBC_ARCH_STRCPY
+        ---help---
+                Enable optimized XTENSA specific strcpy() library function
+
+config XTENSA_STRLEN
+        bool "Enable optimized strlen() for XTENSA"
+        select LIBC_ARCH_STRLEN
+        ---help---
+                Enable optimized XTENSA specific strlen() library function
+
+config XTENSA_STRNCPY
+        bool "Enable optimized strncpy() for XTENSA"
+        select LIBC_ARCH_STRNCPY
+        ---help---
+                Enable optimized XTENSA specific strncpy() library function
+
diff --git a/libs/libc/machine/xtensa/Make.defs b/libs/libc/machine/xtensa/Make.defs
index 8f33a82..379c7da 100644
--- a/libs/libc/machine/xtensa/Make.defs
+++ b/libs/libc/machine/xtensa/Make.defs
@@ -19,10 +19,37 @@
 ############################################################################
 
 ifeq ($(CONFIG_LIBC_ARCH_ELF),y)
-
 CSRCS += arch_elf.c
+endif
+
+ifeq ($(CONFIG_XTENSA_MEMCPY),y)
+ASRCS += arch_memcpy.S
+endif
+
+ifeq ($(CONFIG_XTENSA_MEMMOVE),y)
+ASRCS += arch_memmove.S
+endif
+
+ifeq ($(CONFIG_XTENSA_MEMSET),y)
+ASRCS += arch_memset.S
+endif
+
+ifeq ($(CONFIG_XTENSA_STRCPY),y)
+ASRCS += arch_strcpy.S
+endif
+
+ifeq ($(CONFIG_XTENSA_STRLEN),y)
+ASRCS += arch_strlen.S
+endif
+
+ifeq ($(CONFIG_XTENSA_STRNCPY),y)
+ASRCS += arch_strncpy.S
+endif
+
+ifeq ($(CONFIG_XTENSA_STRCMP),y)
+ASRCS += arch_strcmp.S
+endif
 
 DEPPATH += --dep-path machine/xtensa
 VPATH += :machine/xtensa
 
-endif
diff --git a/libs/libc/machine/xtensa/arch_memcpy.S b/libs/libc/machine/xtensa/arch_memcpy.S
new file mode 100644
index 0000000..47de6dd
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_memcpy.S
@@ -0,0 +1,281 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_memcpy.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Pre-processor Macros
+ ****************************************************************************/
+
+/* set to 1 when running on ISS (simulator) with the
+   lint or ferret client, or 0 to save a few cycles */
+
+#define SIM_CHECKS_ALIGNMENT  0
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+  .section .text
+  .begin schedule
+  .literal_position
+
+  .local  .Ldst1mod2
+  .local  .Ldst2mod4
+  .local  .Lbytecopy
+
+  .align  4
+  .global memcpy
+  .type memcpy, @function
+memcpy:
+  ENTRY(16)
+  /* a2 = dst, a3 = src, a4 = len */
+
+  mov a5, a2    # copy dst so that a2 is return value
+  bbsi.l  a2, 0, .Ldst1mod2
+  bbsi.l  a2, 1, .Ldst2mod4
+.Ldstaligned:
+
+  /* Get number of loop iterations with 16B per iteration.  */
+  srli  a7, a4, 4
+
+  /* Check if source is aligned.  */
+  slli  a8, a3, 30
+  bnez  a8, .Lsrcunaligned
+
+  /* Destination and source are word-aligned, use word copy.  */
+#if XCHAL_HAVE_LOOPS
+  loopnez a7, 2f
+#else
+  beqz  a7, 2f
+  slli  a8, a7, 4
+  add a8, a8, a3  # a8 = end of last 16B source chunk
+#endif
+1:  l32i  a6, a3, 0
+  l32i  a7, a3, 4
+  s32i  a6, a5, 0
+  l32i  a6, a3, 8
+
+  s32i  a7, a5, 4
+  l32i  a7, a3, 12
+  s32i  a6, a5, 8
+  addi  a3, a3, 16
+  s32i  a7, a5, 12
+  addi  a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+  bltu  a3, a8, 1b
+#endif
+
+  /* Copy any leftover pieces smaller than 16B.  */
+2:  bbci.l  a4, 3, 3f
+
+  /* Copy 8 bytes.  */
+  l32i  a6, a3, 0
+  l32i  a7, a3, 4
+  addi  a3, a3, 8
+  s32i  a6, a5, 0
+  s32i  a7, a5, 4
+  addi  a5, a5, 8
+
+3:  bbsi.l  a4, 2, 4f
+  bbsi.l  a4, 1, 5f
+  bbsi.l  a4, 0, 6f
+  RET(16)
+
+  # .align 4
+  /* Copy 4 bytes.  */
+4:  l32i  a6, a3, 0
+  addi  a3, a3, 4
+  s32i  a6, a5, 0
+  addi  a5, a5, 4
+  bbsi.l  a4, 1, 5f
+  bbsi.l  a4, 0, 6f
+  RET(16)
+
+  /* Copy 2 bytes.  */
+5:  l16ui a6, a3, 0
+  addi  a3, a3, 2
+  s16i  a6, a5, 0
+  addi  a5, a5, 2
+  bbsi.l  a4, 0, 6f
+  RET(16)
+
+  /* Copy 1 byte.  */
+6:  l8ui  a6, a3, 0
+  s8i a6, a5, 0
+
+.Ldone:
+  RET(16)
+
+
+/* Destination is aligned; source is unaligned.  */
+
+  # .align 4
+.Lsrcunaligned:
+  /* Avoid loading anything for zero-length copies.  */
+  beqz  a4, .Ldone
+
+  /* Copy 16 bytes per iteration for word-aligned dst and
+     unaligned src.  */
+  ssa8  a3    # set shift amount from byte offset
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+  srli    a11, a8, 30     # save unalignment offset for below
+  sub a3, a3, a11 # align a3
+#endif
+  l32i  a6, a3, 0 # load first word
+#if XCHAL_HAVE_LOOPS
+  loopnez a7, 2f
+#else
+  beqz  a7, 2f
+  slli  a10, a7, 4
+  add a10, a10, a3  # a10 = end of last 16B source chunk
+#endif
+1:  l32i  a7, a3, 4
+  l32i  a8, a3, 8
+  src_b a6, a6, a7
+  s32i  a6, a5, 0
+  l32i  a9, a3, 12
+  src_b a7, a7, a8
+  s32i  a7, a5, 4
+  l32i  a6, a3, 16
+  src_b a8, a8, a9
+  s32i  a8, a5, 8
+  addi  a3, a3, 16
+  src_b a9, a9, a6
+  s32i  a9, a5, 12
+  addi  a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+  bltu  a3, a10, 1b
+#endif
+
+2:  bbci.l  a4, 3, 3f
+
+  /* Copy 8 bytes.  */
+  l32i  a7, a3, 4
+  l32i  a8, a3, 8
+  src_b a6, a6, a7
+  s32i  a6, a5, 0
+  addi  a3, a3, 8
+  src_b a7, a7, a8
+  s32i  a7, a5, 4
+  addi  a5, a5, 8
+  mov a6, a8
+
+3:  bbci.l  a4, 2, 4f
+
+  /* Copy 4 bytes.  */
+  l32i  a7, a3, 4
+  addi  a3, a3, 4
+  src_b a6, a6, a7
+  s32i  a6, a5, 0
+  addi  a5, a5, 4
+  mov a6, a7
+4:
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+  add a3, a3, a11 # readjust a3 with correct misalignment
+#endif
+  bbsi.l  a4, 1, 5f
+  bbsi.l  a4, 0, 6f
+  RET(16)
+
+  /* Copy 2 bytes.  */
+5:  l8ui  a6, a3, 0
+  l8ui  a7, a3, 1
+  addi  a3, a3, 2
+  s8i a6, a5, 0
+  s8i a7, a5, 1
+  addi  a5, a5, 2
+  bbsi.l  a4, 0, 6f
+  RET(16)
+
+  /* Copy 1 byte.  */
+6:  l8ui  a6, a3, 0
+  s8i a6, a5, 0
+  RET(16)
+
+
+  # .align XCHAL_INST_FETCH_WIDTH
+__memcpy_aux:
+
+  /* Skip bytes to get proper alignment for three-byte loop */
+# .skip XCHAL_INST_FETCH_WIDTH - 3
+
+.Lbytecopy:
+#if XCHAL_HAVE_LOOPS
+  loopnez a4, 2f
+#else
+  beqz  a4, 2f
+  add a7, a3, a4  # a7 = end address for source
+#endif
+1:  l8ui  a6, a3, 0
+  addi  a3, a3, 1
+  s8i a6, a5, 0
+  addi  a5, a5, 1
+#if !XCHAL_HAVE_LOOPS
+  bltu  a3, a7, 1b
+#endif
+2:  RET(16)
+
+
+/* Destination is unaligned.  */
+
+  # .align 4
+.Ldst1mod2: # dst is only byte aligned
+
+  /* Do short copies byte-by-byte.  */
+  bltui a4, 7, .Lbytecopy
+
+  /* Copy 1 byte.  */
+  l8ui  a6, a3, 0
+  addi  a3, a3, 1
+  addi  a4, a4, -1
+  s8i a6, a5, 0
+  addi  a5, a5, 1
+
+  /* Return to main algorithm if dst is now aligned.  */
+  bbci.l  a5, 1, .Ldstaligned
+
+.Ldst2mod4: # dst has 16-bit alignment
+
+  /* Do short copies byte-by-byte.  */
+  bltui a4, 6, .Lbytecopy
+
+  /* Copy 2 bytes.  */
+  l8ui  a6, a3, 0
+  l8ui  a7, a3, 1
+  addi  a3, a3, 2
+  addi  a4, a4, -2
+  s8i a6, a5, 0
+  s8i a7, a5, 1
+  addi  a5, a5, 2
+
+  /* dst is now aligned; return to main algorithm.  */
+  j .Ldstaligned
+
+  .end schedule
+
+  .size memcpy, . - memcpy
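
For reference, the word-aligned fast path above (count 16-byte chunks with "srli a7, a4, 4", then copy four 32-bit words per iteration) behaves roughly like the C sketch below. This is an illustration of the technique only; the function name and the simplified tail handling are not part of the patch.

    #include <stddef.h>
    #include <stdint.h>

    /* Minimal C model of the aligned loop in arch_memcpy.S: move len bytes
     * in 16-byte chunks of four 32-bit words, leaving the 8/4/2/1-byte tail
     * to conditional copies (the assembly selects them with bbci.l/bbsi.l
     * on the low bits of the length).
     */

    static void copy_aligned_words(uint32_t *dst, const uint32_t *src,
                                   size_t len)
    {
      size_t chunks = len >> 4;          /* like "srli a7, a4, 4" */

      while (chunks-- > 0)
        {
          dst[0] = src[0];
          dst[1] = src[1];
          dst[2] = src[2];
          dst[3] = src[3];
          dst += 4;
          src += 4;
        }
    }

The unaligned-source path reaches the same 16-bytes-per-iteration rate by loading aligned words and recombining them with the ssa8/src_b funnel shifts shown above rather than storing them directly.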
diff --git a/libs/libc/machine/xtensa/arch_memmove.S b/libs/libc/machine/xtensa/arch_memmove.S
new file mode 100644
index 0000000..7ce56c4
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_memmove.S
@@ -0,0 +1,480 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_memmove.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Pre-processor Macros
+ ****************************************************************************/
+
+/* set to 1 when running on ISS (simulator) with the
+   lint or ferret client, or 0 to save a few cycles */
+
+#define SIM_CHECKS_ALIGNMENT  0
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+  .text
+  .begin schedule
+  .global memmove
+
+/*
+ * Byte by byte copy
+ */
+  .align  4
+  .byte 0   # 1 mod 4 alignment for LOOPNEZ
+        # (0 mod 4 alignment for LBEG)
+.Lbytecopy:
+#if XCHAL_HAVE_LOOPS
+  loopnez a4, .Lbytecopydone
+#else /* !XCHAL_HAVE_LOOPS */
+  beqz  a4, .Lbytecopydone
+  add a7, a3, a4  # a7 = end address for source
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lnextbyte:
+  l8ui  a6, a3, 0
+  addi  a3, a3, 1
+  s8i a6, a5, 0
+  addi  a5, a5, 1
+#if !XCHAL_HAVE_LOOPS
+  bne a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lbytecopydone:
+  RET(16)
+
+/*
+ * Destination is unaligned
+ */
+
+  .align  4
+.Ldst1mod2: # dst is only byte aligned
+  _bltui  a4, 7, .Lbytecopy # do short copies byte by byte
+
+  # copy 1 byte
+  l8ui  a6, a3,  0
+  addi  a3, a3,  1
+  addi  a4, a4, -1
+  s8i a6, a5,  0
+  addi  a5, a5,  1
+  _bbci.l a5, 1, .Ldstaligned # if dst is now aligned, then
+          # return to main algorithm
+.Ldst2mod4: # dst 16-bit aligned
+  # copy 2 bytes
+  _bltui  a4, 6, .Lbytecopy # do short copies byte by byte
+  l8ui  a6, a3,  0
+  l8ui  a7, a3,  1
+  addi  a3, a3,  2
+  addi  a4, a4, -2
+  s8i a6, a5,  0
+  s8i a7, a5,  1
+  addi  a5, a5,  2
+  j .Ldstaligned  # dst is now aligned, return to main algorithm
+
+.Lcommon:
+  bbsi.l  a2, 0, .Ldst1mod2 # if dst is 1 mod 2
+  bbsi.l  a2, 1, .Ldst2mod4 # if dst is 2 mod 4
+.Ldstaligned: # return here from .Ldst?mod? once dst is aligned
+  srli  a7, a4, 4 # number of loop iterations with 16B
+        # per iteration
+  movi  a8, 3   # if source is not aligned,
+  bany  a3, a8, .Lsrcunaligned  # then use shifting copy
+  /*
+   * Destination and source are word-aligned, use word copy.
+   */
+  # copy 16 bytes per iteration for word-aligned dst and word-aligned src
+#if XCHAL_HAVE_LOOPS
+  loopnez a7, .Loop1done
+#else /* !XCHAL_HAVE_LOOPS */
+  beqz  a7, .Loop1done
+  slli  a8, a7, 4
+  add a8, a8, a3  # a8 = end of last 16B source chunk
+#endif /* !XCHAL_HAVE_LOOPS */
+.Loop1:
+  l32i  a6, a3,  0
+  l32i  a7, a3,  4
+  s32i  a6, a5,  0
+  l32i  a6, a3,  8
+  s32i  a7, a5,  4
+  l32i  a7, a3, 12
+  s32i  a6, a5,  8
+  addi  a3, a3, 16
+  s32i  a7, a5, 12
+  addi  a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+  bne a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
+#endif /* !XCHAL_HAVE_LOOPS */
+.Loop1done:
+  bbci.l  a4, 3, .L2
+  # copy 8 bytes
+  l32i  a6, a3,  0
+  l32i  a7, a3,  4
+  addi  a3, a3,  8
+  s32i  a6, a5,  0
+  s32i  a7, a5,  4
+  addi  a5, a5,  8
+.L2:
+  bbsi.l  a4, 2, .L3
+  bbsi.l  a4, 1, .L4
+  bbsi.l  a4, 0, .L5
+  RET(16)
+.L3:
+  # copy 4 bytes
+  l32i  a6, a3,  0
+  addi  a3, a3,  4
+  s32i  a6, a5,  0
+  addi  a5, a5,  4
+  bbsi.l  a4, 1, .L4
+  bbsi.l  a4, 0, .L5
+  RET(16)
+.L4:
+  # copy 2 bytes
+  l16ui a6, a3,  0
+  addi  a3, a3,  2
+  s16i  a6, a5,  0
+  addi  a5, a5,  2
+  bbsi.l  a4, 0, .L5
+  RET(16)
+.L5:
+  # copy 1 byte
+  l8ui  a6, a3,  0
+  s8i a6, a5,  0
+  RET(16)
+
+/*
+ * Destination is aligned, Source is unaligned
+ */
+
+  .align  4
+.Lsrcunaligned:
+  _beqz a4, .Ldone  # avoid loading anything for zero-length copies
+  # copy 16 bytes per iteration for word-aligned dst and unaligned src
+  ssa8  a3    # set shift amount from byte offset
+
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+  and a11, a3, a8 # save unalignment offset for below
+  sub a3, a3, a11 # align a3
+#endif
+  l32i  a6, a3, 0 # load first word
+#if XCHAL_HAVE_LOOPS
+  loopnez a7, .Loop2done
+#else /* !XCHAL_HAVE_LOOPS */
+  beqz  a7, .Loop2done
+  slli  a10, a7, 4
+  add a10, a10, a3  # a10 = end of last 16B source chunk
+#endif /* !XCHAL_HAVE_LOOPS */
+.Loop2:
+  l32i  a7, a3,  4
+  l32i  a8, a3,  8
+  src_b a6, a6, a7
+  s32i  a6, a5,  0
+  l32i  a9, a3, 12
+  src_b a7, a7, a8
+  s32i  a7, a5,  4
+  l32i  a6, a3, 16
+  src_b a8, a8, a9
+  s32i  a8, a5,  8
+  addi  a3, a3, 16
+  src_b a9, a9, a6
+  s32i  a9, a5, 12
+  addi  a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+  bne a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
+#endif /* !XCHAL_HAVE_LOOPS */
+.Loop2done:
+  bbci.l  a4, 3, .L12
+  # copy 8 bytes
+  l32i  a7, a3,  4
+  l32i  a8, a3,  8
+  src_b a6, a6, a7
+  s32i  a6, a5,  0
+  addi  a3, a3,  8
+  src_b a7, a7, a8
+  s32i  a7, a5,  4
+  addi  a5, a5,  8
+  mov a6, a8
+.L12:
+  bbci.l  a4, 2, .L13
+  # copy 4 bytes
+  l32i  a7, a3,  4
+  addi  a3, a3,  4
+  src_b a6, a6, a7
+  s32i  a6, a5,  0
+  addi  a5, a5,  4
+  mov a6, a7
+.L13:
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+  add a3, a3, a11 # readjust a3 with correct misalignment
+#endif
+  bbsi.l  a4, 1, .L14
+  bbsi.l  a4, 0, .L15
+.Ldone: RET(16)
+.L14:
+  # copy 2 bytes
+  l8ui  a6, a3,  0
+  l8ui  a7, a3,  1
+  addi  a3, a3,  2
+  s8i a6, a5,  0
+  s8i a7, a5,  1
+  addi  a5, a5,  2
+  bbsi.l  a4, 0, .L15
+  RET(16)
+.L15:
+  # copy 1 byte
+  l8ui  a6, a3,  0
+  s8i a6, a5,  0
+  RET(16)
+
+/*
+ * Byte by byte copy
+ */
+  .align  4
+  .byte 0   # 1 mod 4 alignment for LOOPNEZ
+        # (0 mod 4 alignment for LBEG)
+.Lbackbytecopy:
+#if XCHAL_HAVE_LOOPS
+  loopnez a4, .Lbackbytecopydone
+#else /* !XCHAL_HAVE_LOOPS */
+  beqz  a4, .Lbackbytecopydone
+  sub a7, a3, a4  # a7 = start address for source
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lbacknextbyte:
+  addi  a3, a3, -1
+  l8ui  a6, a3, 0
+  addi  a5, a5, -1
+  s8i a6, a5, 0
+#if !XCHAL_HAVE_LOOPS
+  bne a3, a7, .Lbacknextbyte # continue loop if
+               # $a3:src != $a7:src_start
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lbackbytecopydone:
+  RET(16)
+
+/*
+ * Destination is unaligned
+ */
+
+  .align  4
+.Lbackdst1mod2: # dst is only byte aligned
+  _bltui  a4, 7, .Lbackbytecopy # do short copies byte by byte
+
+  # copy 1 byte
+  addi  a3, a3, -1
+  l8ui  a6, a3,  0
+  addi  a5, a5, -1
+  s8i a6, a5,  0
+  addi  a4, a4, -1
+  _bbci.l a5, 1, .Lbackdstaligned # if dst is now aligned, then
+          # return to main algorithm
+.Lbackdst2mod4: # dst 16-bit aligned
+  # copy 2 bytes
+  _bltui  a4, 6, .Lbackbytecopy # do short copies byte by byte
+  addi  a3, a3, -2
+  l8ui  a6, a3,  0
+  l8ui  a7, a3,  1
+  addi  a5, a5, -2
+  s8i a6, a5,  0
+  s8i a7, a5,  1
+  addi  a4, a4, -2
+  j .Lbackdstaligned  # dst is now aligned,
+          # return to main algorithm
+
+  .align  4
+memmove:
+
+  ENTRY(16)
+  # a2/ dst, a3/ src, a4/ len
+  mov a5, a2    # copy dst so that a2 is return value
+.Lmovecommon:
+  sub a6, a5, a3
+  bgeu  a6, a4, .Lcommon
+
+  add a5, a5, a4
+  add a3, a3, a4
+
+  bbsi.l  a5, 0, .Lbackdst1mod2 # if dst is 1 mod 2
+  bbsi.l  a5, 1, .Lbackdst2mod4 # if dst is 2 mod 4
+.Lbackdstaligned: # return here from .Lbackdst?mod? once dst is aligned
+  srli  a7, a4, 4 # number of loop iterations with 16B
+        # per iteration
+  movi  a8, 3   # if source is not aligned,
+  bany  a3, a8, .Lbacksrcunaligned  # then use shifting copy
+  /*
+   * Destination and source are word-aligned, use word copy.
+   */
+  # copy 16 bytes per iteration for word-aligned dst and word-aligned src
+#if XCHAL_HAVE_LOOPS
+  loopnez a7, .backLoop1done
+#else /* !XCHAL_HAVE_LOOPS */
+  beqz  a7, .backLoop1done
+  slli  a8, a7, 4
+  sub a8, a3, a8  # a8 = start of first 16B source chunk
+#endif /* !XCHAL_HAVE_LOOPS */
+.backLoop1:
+  addi  a3, a3, -16
+  l32i  a7, a3, 12
+  l32i  a6, a3,  8
+  addi  a5, a5, -16
+  s32i  a7, a5, 12
+  l32i  a7, a3,  4
+  s32i  a6, a5,  8
+  l32i  a6, a3,  0
+  s32i  a7, a5,  4
+  s32i  a6, a5,  0
+#if !XCHAL_HAVE_LOOPS
+  bne a3, a8, .backLoop1  # continue loop if a3:src != a8:src_start
+#endif /* !XCHAL_HAVE_LOOPS */
+.backLoop1done:
+  bbci.l  a4, 3, .Lback2
+  # copy 8 bytes
+  addi  a3, a3, -8
+  l32i  a6, a3,  0
+  l32i  a7, a3,  4
+  addi  a5, a5, -8
+  s32i  a6, a5,  0
+  s32i  a7, a5,  4
+.Lback2:
+  bbsi.l  a4, 2, .Lback3
+  bbsi.l  a4, 1, .Lback4
+  bbsi.l  a4, 0, .Lback5
+  RET(16)
+.Lback3:
+  # copy 4 bytes
+  addi  a3, a3, -4
+  l32i  a6, a3,  0
+  addi  a5, a5, -4
+  s32i  a6, a5,  0
+  bbsi.l  a4, 1, .Lback4
+  bbsi.l  a4, 0, .Lback5
+  RET(16)
+.Lback4:
+  # copy 2 bytes
+  addi  a3, a3, -2
+  l16ui a6, a3,  0
+  addi  a5, a5, -2
+  s16i  a6, a5,  0
+  bbsi.l  a4, 0, .Lback5
+  RET(16)
+.Lback5:
+  # copy 1 byte
+  addi  a3, a3, -1
+  l8ui  a6, a3,  0
+  addi  a5, a5, -1
+  s8i a6, a5,  0
+  RET(16)
+
+/*
+ * Destination is aligned, Source is unaligned
+ */
+
+  .align  4
+.Lbacksrcunaligned:
+  _beqz a4, .Lbackdone  # avoid loading anything for zero-length copies
+  # copy 16 bytes per iteration for word-aligned dst and unaligned src
+  ssa8  a3    # set shift amount from byte offset
+#define SIM_CHECKS_ALIGNMENT  1 /* set to 1 when running on ISS with
+           * the lint or ferret client, or 0
+           * to save a few cycles */
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+  and a11, a3, a8 # save unalignment offset for below
+  sub a3, a3, a11 # align a3
+#endif
+  l32i  a6, a3, 0 # load first word
+#if XCHAL_HAVE_LOOPS
+  loopnez a7, .backLoop2done
+#else /* !XCHAL_HAVE_LOOPS */
+  beqz  a7, .backLoop2done
+  slli  a10, a7, 4
+  sub a10, a3, a10  # a10 = start of first 16B source chunk
+#endif /* !XCHAL_HAVE_LOOPS */
+.backLoop2:
+  addi  a3, a3, -16
+  l32i  a7, a3, 12
+  l32i  a8, a3,  8
+  addi  a5, a5, -16
+  src_b a6, a7, a6
+  s32i  a6, a5, 12
+  l32i  a9, a3,  4
+  src_b a7, a8, a7
+  s32i  a7, a5,  8
+  l32i  a6, a3,  0
+  src_b a8, a9, a8
+  s32i  a8, a5,  4
+  src_b a9, a6, a9
+  s32i  a9, a5,  0
+#if !XCHAL_HAVE_LOOPS
+  bne a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
+#endif /* !XCHAL_HAVE_LOOPS */
+.backLoop2done:
+  bbci.l  a4, 3, .Lback12
+  # copy 8 bytes
+  addi  a3, a3, -8
+  l32i  a7, a3,  4
+  l32i  a8, a3,  0
+  addi  a5, a5, -8
+  src_b a6, a7, a6
+  s32i  a6, a5,  4
+  src_b a7, a8, a7
+  s32i  a7, a5,  0
+  mov a6, a8
+.Lback12:
+  bbci.l  a4, 2, .Lback13
+  # copy 4 bytes
+  addi  a3, a3, -4
+  l32i  a7, a3,  0
+  addi  a5, a5, -4
+  src_b a6, a7, a6
+  s32i  a6, a5,  0
+  mov a6, a7
+.Lback13:
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+  add a3, a3, a11 # readjust a3 with correct misalignment
+#endif
+  bbsi.l  a4, 1, .Lback14
+  bbsi.l  a4, 0, .Lback15
+.Lbackdone:
+  RET(16)
+.Lback14:
+  # copy 2 bytes
+  addi  a3, a3, -2
+  l8ui  a6, a3,  0
+  l8ui  a7, a3,  1
+  addi  a5, a5, -2
+  s8i a6, a5,  0
+  s8i a7, a5,  1
+  bbsi.l  a4, 0, .Lback15
+  RET(16)
+.Lback15:
+  # copy 1 byte
+  addi  a3, a3, -1
+  addi  a5, a5, -1
+  l8ui  a6, a3,  0
+  s8i a6, a5,  0
+  RET(16)
+
+  .end schedule
+  .size memmove, . - memmove
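
The forward/backward dispatch at .Lmovecommon above ("sub a6, a5, a3" followed by "bgeu a6, a4, .Lcommon") is the standard memmove overlap test: if the unsigned difference dst - src is at least len, copying forward cannot overwrite source bytes that have not been read yet; otherwise the routine copies from the end backwards. A C sketch of that decision, for illustration only:

    #include <stddef.h>

    /* C model of the memmove dispatch above.  The unsigned compare of
     * (dst - src) against len matches the assembly's bgeu: when dst is
     * below src the subtraction wraps to a large value, so the forward
     * path is taken, which is also safe in that case.
     */

    void *memmove_sketch(void *dst, const void *src, size_t len)
    {
      char *d = (char *)dst;
      const char *s = (const char *)src;

      if ((size_t)(d - s) >= len)
        {
          for (size_t i = 0; i < len; i++)   /* forward copy is safe */
            {
              d[i] = s[i];
            }
        }
      else
        {
          while (len-- > 0)                  /* overlap: copy backwards */
            {
              d[len] = s[len];
            }
        }

      return dst;
    }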
diff --git a/libs/libc/machine/xtensa/arch_memset.S b/libs/libc/machine/xtensa/arch_memset.S
new file mode 100644
index 0000000..488172f
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_memset.S
@@ -0,0 +1,179 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_memset.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+/* void *memset (void *dst, int c, size_t length)
+
+   The algorithm is as follows:
+
+   Create a word with c in all byte positions.
+
+   If the destination is aligned, set 16B chunks with a loop, and then
+   finish up with 8B, 4B, 2B, and 1B stores conditional on the length.
+
+   If the destination is unaligned, align it by conditionally
+   setting 1B and/or 2B and then go to aligned case.
+
+   This code tries to use fall-through branches for the common
+   case of an aligned destination (except for the branches to
+   the alignment labels).  */
+
+
+/* Byte-by-byte set.  */
+
+	.section .text
+	.begin schedule
+	.literal_position
+
+	.local	.Lbyteset
+	.local	.Ldst1mod2
+	.local	.Ldst2mod4
+
+	.align	4
+	.global	memset
+	.type	memset, @function
+memset:
+  ENTRY(16)
+	/* a2 = dst, a3 = c, a4 = length */
+
+	/* Duplicate character into all bytes of word.  */
+	extui	a3, a3, 0, 8
+	slli	a7, a3, 8
+	or	a3, a3, a7
+	slli	a7, a3, 16
+	or	a3, a3, a7
+
+	mov	a5, a2		// copy dst so that a2 is return value
+
+	/* Check if dst is unaligned.  */
+	bbsi.l	a2, 0, .Ldst1mod2
+	bbsi.l	a2, 1, .Ldst2mod4
+	j	.Ldstaligned
+
+.Ldst1mod2: // dst is only byte aligned
+
+	/* Do short sizes byte-by-byte.  */
+	bltui	a4, 8, .Lbyteset
+
+	/* Set 1 byte.  */
+	s8i	a3, a5, 0
+	addi	a5, a5, 1
+	addi	a4, a4, -1
+
+	/* Now retest if dst is aligned.  */
+	bbci.l	a5, 1, .Ldstaligned
+
+.Ldst2mod4: // dst has 16-bit alignment
+
+	/* Do short sizes byte-by-byte.  */
+	bltui	a4, 8, .Lbyteset
+
+	/* Set 2 bytes.  */
+	s16i	a3, a5, 0
+	addi	a5, a5, 2
+	addi	a4, a4, -2
+
+	/* dst is now aligned; fall through to main algorithm */
+
+.Ldstaligned:
+
+	/* Get number of loop iterations with 16B per iteration.  */
+	srli	a7, a4, 4
+
+	/* Destination is word-aligned.  */
+#if XCHAL_HAVE_LOOPS
+	loopnez	a7, 2f
+#else
+	beqz	a7, 2f
+	slli	a6, a7, 4
+	add	a6, a6, a5	// a6 = end of last 16B chunk
+#endif
+	/* Set 16 bytes per iteration.  */
+1:	s32i	a3, a5, 0
+	s32i	a3, a5, 4
+	s32i	a3, a5, 8
+	s32i	a3, a5, 12
+	addi	a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+	bltu	a5, a6, 1b
+#endif
+
+	/* Set any leftover pieces smaller than 16B.  */
+2:	bbci.l	a4, 3, 3f
+
+	/* Set 8 bytes.  */
+	s32i	a3, a5, 0
+	s32i	a3, a5, 4
+	addi	a5, a5, 8
+
+3:	bbci.l	a4, 2, 4f
+
+	/* Set 4 bytes.  */
+	s32i	a3, a5, 0
+	addi	a5, a5, 4
+
+4:	bbci.l	a4, 1, 5f
+
+	/* Set 2 bytes.  */
+	s16i	a3, a5, 0
+	addi	a5, a5, 2
+
+5:	bbci.l	a4, 0, 6f
+
+	/* Set 1 byte.  */
+	s8i	a3, a5, 0
+6:	RET(16)
+
+
+	// .align	XCHAL_INST_FETCH_WIDTH
+__memset_aux:
+
+	/* Skip bytes to get proper alignment for three-byte loop */
+// .skip XCHAL_INST_FETCH_WIDTH - 3
+
+.Lbyteset:
+#if XCHAL_HAVE_LOOPS
+	loopnez	a4, 2f
+#else
+	beqz	a4, 2f
+	add	a6, a5, a4	// a6 = ending address
+#endif
+1:	s8i	a3, a5, 0
+	addi	a5, a5, 1
+#if !XCHAL_HAVE_LOOPS
+	bltu	a5, a6, 1b
+#endif
+2:	RET(16)
+
+	.end schedule
+
+	.size	memset, . - memset
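
The header comment in arch_memset.S above already states the plan: replicate the fill byte across a 32-bit word, store 16 bytes per iteration once the destination is aligned, then finish with conditional 8/4/2/1-byte stores. The replication step (the extui/slli/or sequence) and the main loop look like this in C (sketch only, not the patch's code):

    #include <stddef.h>
    #include <stdint.h>

    static void memset_sketch(uint32_t *dst, int c, size_t len)
    {
      /* Duplicate the character into all four bytes of a word,
       * as the assembly does with extui/slli/or.
       */

      uint32_t word = (uint32_t)(c & 0xff);
      word |= word << 8;
      word |= word << 16;

      for (size_t chunks = len >> 4; chunks > 0; chunks--)
        {
          dst[0] = word;
          dst[1] = word;
          dst[2] = word;
          dst[3] = word;
          dst += 4;
        }

      /* The assembly then stores 8, 4, 2 and 1 bytes depending on the
       * low bits of len; that tail is omitted here for brevity.
       */
    }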
diff --git a/libs/libc/machine/xtensa/arch_strcmp.S b/libs/libc/machine/xtensa/arch_strcmp.S
new file mode 100644
index 0000000..aab50be
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_strcmp.S
@@ -0,0 +1,767 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_strcmp.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Pre-processor Macros
+ ****************************************************************************/
+
+#define MASK4 0x40404040
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+  .section .text
+  .begin  schedule
+  .align  4
+  .literal_position
+
+  .global strcmp
+  .type strcmp,@function
+  .align  4
+
+strcmp:
+
+#if XCHAL_HAVE_LOOPS && XCHAL_HAVE_DENSITY && !XCHAL_HAVE_BE && XCHAL_HAVE_FLIX3
+/*  Fast version for FLIX3 Little Endian */
+
+
+  ENTRY(16)
+  /* a2 = s1, a3 = s2 */
+
+  l8ui  a8, a2, 0 # byte 0 from s1
+  l8ui  a9, a3, 0 # byte 0 from s2
+  movi  a10, 3    # mask
+  movi  a5, 0xfffffffc
+  or  a11, a2, a3
+  movi  a4, MASK0 # mask for byte 0
+  movi  a7, MASK4
+  addi    a3, a3, -8
+  addi    a2, a2, -8
+  and a5, a5, a2
+  bne.w18 a8, a9, .Lretdiff
+  l32i  a8, a5, 8 # get word from aligned variant of s1
+
+  bany.w18  a11, a10, .Lnot_aligned
+
+/* s1 is word-aligned; s2 is word-aligned.
+
+   If the zero-overhead loop option is available, use an (almost)
+   infinite zero-overhead loop with conditional exits so we only pay
+   for taken branches when exiting the loop.  */
+
+/* New algorithm, relying on the fact that all normal ASCII is between
+   32 and 127.
+
+   Rather than check all bytes for zero:
+   Take one word (4 bytes).  Call it w1.
+   Shift w1 left by one into w1'.
+   Or w1 and w1'.  For all normal ASCII bit 6 will be 1; for zero it won't.
+   Check that all 4 bit 6's (one for each byte) are one:
+   If they are, we are definitely not done.
+   If they are not, we are probably done, but need to check for zero.  */
+
+.Laligned:
+  /* Loop forever */
+1:
+  loop  a0, .Laligned_done
+
+  /* First unrolled loop body.  */
+  l32i  a9, a3, 8 # get word from s2
+  addi  a3, a3, 8 # advance s2 pointer
+  slli  a5, a8, 1
+  or  a10, a8, a5
+  {l32i a11, a2, 12 # get word from s1+4
+  bne.w18 a8, a9, .Lwne2}
+  l32i  a9, a3, 4 # get word from s2+4
+  bnall.w18 a10, a7, .Lprobeq
+
+  /* Second unrolled loop body.  */
+  slli  a5, a11, 1
+  or  a10, a11, a5
+  addi  a2, a2, 8 # advance s1 pointer
+        mov a8, a11
+  bne.w18 a11, a9, .Lwne2
+  l32i  a8, a2, 8 # get word from s1
+  bnall.w18 a10, a7, .Lprobeq2
+
+.Laligned_done:
+  l32i  a8, a2, 8 # get word from s1
+  j       1b
+
+.Lnot_aligned:
+  xor a11, a2, a3 # compare low two bits of s1 and s2
+  bany  a11, a10, .Lunaligned # if they have different alignment
+
+  /* s1/s2 are not word-aligned.  */
+  movi  a5, 0xfffffffc
+  addi  a2, a2, 1 # advance s1
+  beqz  a9, .Leq  # bytes equal, if zero, strings are equal
+  addi  a3, a3, 1 # advance s2
+  and     a6, a2, a5
+  l32i  a8, a6, 8 # get word from s1
+  bnone a2, a10, .Laligned # if s1/s2 now aligned
+  l8ui  a8, a2, 8 # byte 1 from s1
+  l8ui  a9, a3, 8 # byte 1 from s2
+  addi  a2, a2, 1 # advance s1
+  bne a8, a9, .Lretdiff # if different, return difference
+  beqz  a8, .Leq  # bytes equal, if zero, strings are equal
+  addi  a3, a3, 1 # advance s2
+  and     a6, a2, a5
+  l32i  a8, a6, 8 # get word from s1
+  bnone a2, a10, .Laligned # if s1/s2 now aligned
+  l8ui  a8, a2, 8 # byte 2 from s1
+  l8ui  a9, a3, 8 # byte 2 from s2
+  addi  a2, a2, 1 # advance s1
+  bne a8, a9, .Lretdiff # if different, return difference
+  beqz  a8, .Leq  # bytes equal, if zero, strings are equal
+  addi  a3, a3, 1 # advance s2
+  l32i  a8, a2, 8 # get word from s1
+  j .Laligned
+
+/* s1 and s2 have different alignment.
+
+   If the zero-overhead loop option is available, use an (almost)
+   infinite zero-overhead loop with conditional exits so we only pay
+   for taken branches when exiting the loop.
+
+   Note: It is important for this unaligned case to come before the
+   code for aligned strings, because otherwise some of the branches
+   above cannot reach and have to be transformed to branches around
+   jumps.  The unaligned code is smaller and the branches can reach
+   over it.  */
+
+.Lunaligned:
+  movi.n  a8, 0   # set up for the maximum loop count
+  loop  a8, .Lretdiff # loop forever (almost anyway)
+  l8ui  a8, a2, 8
+  l8ui  a9, a3, 8
+  addi  a2, a2, 1
+  bne a8, a9, .Lretdiff
+  addi  a3, a3, 1
+  beqz  a8, .Lretdiff
+.Lretdiff:
+  sub a2, a8, a9
+  RET(16)
+
+
+.Lprobeq2:
+  /* Adjust pointers to account for the loop unrolling.  */
+        mov a8, a11
+  addi  a2, a2, -4
+  addi  a3, a3, 4
+
+  /* align (0 mod 4) */
+.Lprobeq:
+  /* Words are probably equal, but check for sure.
+     If not, loop over the rest of string using normal algorithm.  */
+
+  bnone a8, a4, .Leq  # if byte 0 is zero
+  movi  a5, MASK1 # mask for byte 1
+  movi  a6, MASK2 # mask for byte 2
+  bnone a8, a5, .Leq  # if byte 1 is zero
+  movi  a7, MASK3 # mask for byte 3
+  bnone a8, a6, .Leq  # if byte 2 is zero
+  bnone a8, a7, .Leq  # if byte 3 is zero
+  /* align (1 mod 4) */
+  addi.n  a2, a2, 12  # advance s1 pointer
+  addi.n  a3, a3, 4 # advance s2 pointer
+  /* align (1 mod 4) or (2 mod 4) */
+1:
+  loop  a0, .Lend # loop forever (a4 is bigger than max iters)
+
+  l32i  a8, a2, 0 # get word from s1
+  l32i  a9, a3, 0 # get word from s2
+  addi  a2, a2, 4 # advance s1 pointer
+  bne a8, a9, .Lwne
+  bnone a8, a4, .Leq  # if byte 0 is zero
+  bnone a8, a5, .Leq  # if byte 1 is zero
+  bnone a8, a6, .Leq  # if byte 2 is zero
+  bnone a8, a7, .Leq  # if byte 3 is zero
+  addi  a3, a3, 4 # advance s2 pointer
+.Lend:
+  j 1b
+
+  /* Words are equal; some byte is zero.  */
+.Leq: movi  a2, 0   # return equal
+  RET(16)
+
+.Lwne2: /* Words are not equal.  On big-endian processors, if none of the
+     bytes are zero, the return value can be determined by a simple
+     comparison.  */
+.Lwne:  /* Words are not equal.  */
+  xor a2, a8, a9  # get word with nonzero in byte that differs
+  extui a10, a8, 0, 8
+  extui a11, a9, 0, 8
+  movi  a5, MASK1 # mask for byte 1
+  bany.w18  a2, a4, .Ldiff0 # if byte 0 differs
+
+  bnone.w18 a8, a4, .Leq  # if byte 0 is zero
+  movi  a6, MASK2 # mask for byte 2
+  bany.w18  a2, a5, .Ldiff1 # if byte 1 differs
+  extui a10, a8, 24, 8
+  bnone.w18 a8, a5, .Leq  # if byte 1 is zero
+  extui a11, a9, 24, 8
+  bany.w18  a2, a6, .Ldiff2 # if byte 2 differs
+  sub a2, a10, a11
+  bnone.w18 a8, a6, .Leq  # if byte 2 is zero
+  /* Little-endian is a little more difficult because we can't subtract
+     whole words.  */
+.Ldiff3:
+  /* Bytes 0-2 are equal; byte 3 is different.
+     For little-endian need to have a sign bit for the difference.  */
+  RET(16)
+.Ldiff0:
+  /* Byte 0 is different.  */
+  sub a2, a10, a11
+  RET(16)
+
+.Ldiff1:
+  /* Byte 0 is equal; byte 1 is different.  */
+  extui a10, a8, 8, 8
+  extui a11, a9, 8, 8
+  sub a2, a10, a11
+  RET(16)
+
+.Ldiff2:
+  /* Bytes 0-1 are equal; byte 2 is different.  */
+  extui a10, a8, 16, 8
+  extui a11, a9, 16, 8
+  sub a2, a10, a11
+  RET(16)
+
+#else
+#if XCHAL_HAVE_LOOPS && XCHAL_HAVE_DENSITY && !XCHAL_HAVE_BE && XCHAL_HAVE_PDX4
+/*  Fast version for FLIX3 Little Endian */
+
+
+  ENTRY(16)
+  /* a2 = s1, a3 = s2 */
+
+  l8ui  a8, a2, 0 # byte 0 from s1
+  l8ui  a9, a3, 0 # byte 0 from s2
+  movi  a10, 3    # mask
+  movi  a5, 0xfffffffc
+  or  a11, a2, a3
+  movi  a4, MASK0 # mask for byte 0
+  movi  a7, MASK4
+  addi    a3, a3, -8
+  addi    a2, a2, -8
+  and a5, a5, a2
+  bne.w15 a8, a9, .Lretdiff
+  l32i  a8, a5, 8 # get word from aligned variant of s1
+
+  bany.w15  a11, a10, .Lnot_aligned
+
+/* s1 is word-aligned; s2 is word-aligned.
+
+   If the zero-overhead loop option is available, use an (almost)
+   infinite zero-overhead loop with conditional exits so we only pay
+   for taken branches when exiting the loop.  */
+
+/* New algorithm, relying on the fact that all normal ASCII is between
+   32 and 127.
+
+   Rather than check all bytes for zero:
+   Take one word (4 bytes).  Call it w1.
+   Shift w1 left by one into w1'.
+   Or w1 and w1'.  For all normal ASCII bit 6 will be 1; for zero it won't.
+   Check that all 4 bit 6's (one for each byte) are one:
+   If they are, we are definitely not done.
+   If they are not, we are probably done, but need to check for zero.  */
+
+.Laligned:
+  /* Loop forever */
+1:
+  loop  a0, .Laligned_done
+
+  /* First unrolled loop body.  */
+  l32i  a9, a3, 8 # get word from s2
+  addi  a3, a3, 8 # advance s2 pointer
+  slli  a5, a8, 1
+  or  a10, a8, a5
+  {
+  bne.w15 a8, a9, .Lwne2
+  l32i  a11, a2, 12 # get word from s1+4
+  nop
+  nop
+  }
+  l32i  a9, a3, 4 # get word from s2+4
+  bnall.w15 a10, a7, .Lprobeq
+
+  /* Second unrolled loop body.  */
+  slli  a5, a11, 1
+  or  a10, a11, a5
+  addi  a2, a2, 8 # advance s1 pointer
+        mov a8, a11
+  bne.w15 a11, a9, .Lwne2
+  l32i  a8, a2, 8 # get word from s1
+  bnall.w15 a10, a7, .Lprobeq2
+
+.Laligned_done:
+  l32i  a8, a2, 8 # get word from s1
+  j       1b
+
+.Lnot_aligned:
+  xor a11, a2, a3 # compare low two bits of s1 and s2
+  bany  a11, a10, .Lunaligned # if they have different alignment
+
+  /* s1/s2 are not word-aligned.  */
+  movi  a5, 0xfffffffc
+  addi  a2, a2, 1 # advance s1
+  beqz  a9, .Leq  # bytes equal, if zero, strings are equal
+  addi  a3, a3, 1 # advance s2
+  and     a6, a2, a5
+  l32i  a8, a6, 8 # get word from s1
+  bnone a2, a10, .Laligned # if s1/s2 now aligned
+  l8ui  a8, a2, 8 # byte 1 from s1
+  l8ui  a9, a3, 8 # byte 1 from s2
+  addi  a2, a2, 1 # advance s1
+  bne a8, a9, .Lretdiff # if different, return difference
+  beqz  a8, .Leq  # bytes equal, if zero, strings are equal
+  addi  a3, a3, 1 # advance s2
+  and     a6, a2, a5
+  l32i  a8, a6, 8 # get word from s1
+  bnone a2, a10, .Laligned # if s1/s2 now aligned
+  l8ui  a8, a2, 8 # byte 2 from s1
+  l8ui  a9, a3, 8 # byte 2 from s2
+  addi  a2, a2, 1 # advance s1
+  bne a8, a9, .Lretdiff # if different, return difference
+  beqz  a8, .Leq  # bytes equal, if zero, strings are equal
+  addi  a3, a3, 1 # advance s2
+  l32i  a8, a2, 8 # get word from s1
+  j .Laligned
+
+/* s1 and s2 have different alignment.
+
+   If the zero-overhead loop option is available, use an (almost)
+   infinite zero-overhead loop with conditional exits so we only pay
+   for taken branches when exiting the loop.
+
+   Note: It is important for this unaligned case to come before the
+   code for aligned strings, because otherwise some of the branches
+   above cannot reach and have to be transformed to branches around
+   jumps.  The unaligned code is smaller and the branches can reach
+   over it.  */
+
+.Lunaligned:
+  movi.n  a8, 0   # set up for the maximum loop count
+  loop  a8, .Lretdiff # loop forever (almost anyway)
+  l8ui  a8, a2, 8
+  l8ui  a9, a3, 8
+  addi  a2, a2, 1
+  bne a8, a9, .Lretdiff
+  addi  a3, a3, 1
+  beqz  a8, .Lretdiff
+.Lretdiff:
+  sub a2, a8, a9
+  RET(16)
+
+
+.Lprobeq2:
+  /* Adjust pointers to account for the loop unrolling.  */
+        mov a8, a11
+  addi  a2, a2, -4
+  addi  a3, a3, 4
+
+  /* align (0 mod 4) */
+.Lprobeq:
+  /* Words are probably equal, but check for sure.
+     If not, loop over the rest of string using normal algorithm.  */
+
+  bnone a8, a4, .Leq  # if byte 0 is zero
+  movi  a5, MASK1 # mask for byte 1
+  movi  a6, MASK2 # mask for byte 2
+  bnone a8, a5, .Leq  # if byte 1 is zero
+  movi  a7, MASK3 # mask for byte 3
+  bnone a8, a6, .Leq  # if byte 2 is zero
+  bnone a8, a7, .Leq  # if byte 3 is zero
+  /* align (1 mod 4) */
+  addi.n  a2, a2, 12  # advance s1 pointer
+  addi.n  a3, a3, 4 # advance s2 pointer
+  /* align (1 mod 4) or (2 mod 4) */
+1:
+  loop  a0, .Lend # loop forever (a4 is bigger than max iters)
+
+  l32i  a8, a2, 0 # get word from s1
+  l32i  a9, a3, 0 # get word from s2
+  addi  a2, a2, 4 # advance s1 pointer
+  bne a8, a9, .Lwne
+  bnone a8, a4, .Leq  # if byte 0 is zero
+  bnone a8, a5, .Leq  # if byte 1 is zero
+  bnone a8, a6, .Leq  # if byte 2 is zero
+  bnone a8, a7, .Leq  # if byte 3 is zero
+  addi  a3, a3, 4 # advance s2 pointer
+.Lend:
+  j 1b
+
+  /* Words are equal; some byte is zero.  */
+.Leq: movi  a2, 0   # return equal
+  RET(16)
+
+.Lwne2: /* Words are not equal.  On big-endian processors, if none of the
+     bytes are zero, the return value can be determined by a simple
+     comparison.  */
+.Lwne:  /* Words are not equal.  */
+  xor a2, a8, a9  # get word with nonzero in byte that differs
+  extui a10, a8, 0, 8
+  extui a11, a9, 0, 8
+  movi  a5, MASK1 # mask for byte 1
+  bany.w15  a2, a4, .Ldiff0 # if byte 0 differs
+
+  bnone.w15 a8, a4, .Leq  # if byte 0 is zero
+  movi  a6, MASK2 # mask for byte 2
+  bany.w15  a2, a5, .Ldiff1 # if byte 1 differs
+  extui a10, a8, 24, 8
+  bnone.w15 a8, a5, .Leq  # if byte 1 is zero
+  extui a11, a9, 24, 8
+  bany.w15  a2, a6, .Ldiff2 # if byte 2 differs
+  sub a2, a10, a11
+  bnone.w15 a8, a6, .Leq  # if byte 2 is zero
+  /* Little-endian is a little more difficult because we can't subtract
+     whole words.  */
+.Ldiff3:
+  /* Bytes 0-2 are equal; byte 3 is different.
+     For little-endian need to have a sign bit for the difference.  */
+  RET(16)
+.Ldiff0:
+  /* Byte 0 is different.  */
+  sub a2, a10, a11
+  RET(16)
+
+.Ldiff1:
+  /* Byte 0 is equal; byte 1 is different.  */
+  extui a10, a8, 8, 8
+  extui a11, a9, 8, 8
+  sub a2, a10, a11
+  RET(16)
+
+.Ldiff2:
+  /* Bytes 0-1 are equal; byte 2 is different.  */
+  extui a10, a8, 16, 8
+  extui a11, a9, 16, 8
+  sub a2, a10, a11
+  RET(16)
+
+
+#else /* Not FLIX3 */
+  ENTRY(16)
+  /* a2 = s1, a3 = s2 */
+
+  l8ui  a8, a2, 0 # byte 0 from s1
+  l8ui  a9, a3, 0 # byte 0 from s2
+  movi  a10, 3    # mask
+  bne a8, a9, .Lretdiff
+
+  or  a11, a2, a3
+  bnone a11, a10, .Laligned
+
+  xor a11, a2, a3 # compare low two bits of s1 and s2
+  bany  a11, a10, .Lunaligned # if they have different alignment
+
+  /* s1/s2 are not word-aligned.  */
+  addi  a2, a2, 1 # advance s1
+  beqz  a8, .Leq  # bytes equal, if zero, strings are equal
+  addi  a3, a3, 1 # advance s2
+  bnone a2, a10, .Laligned # if s1/s2 now aligned
+  l8ui  a8, a2, 0 # byte 1 from s1
+  l8ui  a9, a3, 0 # byte 1 from s2
+  addi  a2, a2, 1 # advance s1
+  bne a8, a9, .Lretdiff # if different, return difference
+  beqz  a8, .Leq  # bytes equal, if zero, strings are equal
+  addi  a3, a3, 1 # advance s2
+  bnone a2, a10, .Laligned # if s1/s2 now aligned
+  l8ui  a8, a2, 0 # byte 2 from s1
+  l8ui  a9, a3, 0 # byte 2 from s2
+  addi  a2, a2, 1 # advance s1
+  bne a8, a9, .Lretdiff # if different, return difference
+  beqz  a8, .Leq  # bytes equal, if zero, strings are equal
+  addi  a3, a3, 1 # advance s2
+  j .Laligned
+
+/* s1 and s2 have different alignment.
+
+   If the zero-overhead loop option is available, use an (almost)
+   infinite zero-overhead loop with conditional exits so we only pay
+   for taken branches when exiting the loop.
+
+   Note: It is important for this unaligned case to come before the
+   code for aligned strings, because otherwise some of the branches
+   above cannot reach and have to be transformed to branches around
+   jumps.  The unaligned code is smaller and the branches can reach
+   over it.  */
+
+  .align  4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  /* (2 mod 4) alignment for loop instruction */
+#else
+  /* (1 mod 4) alignment for loop instruction */
+  .byte 0
+  .byte 0
+#endif
+#endif
+.Lunaligned:
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  _movi.n a8, 0   # set up for the maximum loop count
+#else
+  _movi a8, 0   # set up for the maximum loop count
+#endif
+  loop  a8, .Lretdiff # loop forever (almost anyway)
+#endif
+.Lnextbyte:
+  l8ui  a8, a2, 0
+  l8ui  a9, a3, 0
+  addi  a2, a2, 1
+  bne a8, a9, .Lretdiff
+  addi  a3, a3, 1
+#if XCHAL_HAVE_LOOPS
+  beqz  a8, .Lretdiff
+#else
+  bnez  a8, .Lnextbyte
+#endif
+.Lretdiff:
+  sub a2, a8, a9
+  RET(16)
+
+/* s1 is word-aligned; s2 is word-aligned.
+
+   If the zero-overhead loop option is available, use an (almost)
+   infinite zero-overhead loop with conditional exits so we only pay
+   for taken branches when exiting the loop.  */
+
+/* New algorithm, relying on the fact that all normal ASCII is between
+   32 and 127.
+
+   Rather than check all bytes for zero:
+   Take one word (4 bytes).  Call it w1.
+   Shift w1 left by one into w1'.
+   Or w1 and w1'.  For all normal ASCII bit 6 will be 1; for zero it won't.
+   Check that all 4 bit 6's (one for each byte) are one:
+   If they are, we are definitely not done.
+   If they are not, we are probably done, but need to check for zero.  */
+
+  .align  4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_CONST16
+  /* (2 mod 4) alignment for loop instruction */
+  .byte 0
+#endif
+.Laligned:
+  movi  a4, MASK0 # mask for byte 0
+  movi  a7, MASK4
+
+  /* Loop forever */
+1:
+  loop  a0, .Laligned_done
+
+  /* First unrolled loop body.  */
+  l32i  a8, a2, 0 # get word from s1
+  l32i  a9, a3, 0 # get word from s2
+  slli  a5, a8, 1
+  bne a8, a9, .Lwne2
+  or  a9, a8, a5
+  bnall a9, a7, .Lprobeq
+
+  /* Second unrolled loop body.  */
+  l32i  a8, a2, 4 # get word from s1+4
+  l32i  a9, a3, 4 # get word from s2+4
+  slli  a5, a8, 1
+  bne a8, a9, .Lwne2
+  or  a9, a8, a5
+  bnall a9, a7, .Lprobeq2
+
+  addi  a2, a2, 8 # advance s1 pointer
+  addi  a3, a3, 8 # advance s2 pointer
+.Laligned_done:
+  j       1b
+
+.Lprobeq2:
+  /* Adjust pointers to account for the loop unrolling.  */
+  addi  a2, a2, 4
+  addi  a3, a3, 4
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+.Laligned:
+  movi  a4, MASK0 # mask for byte 0
+  movi  a7, MASK4
+  j .Lfirstword
+.Lnextword:
+  addi  a2, a2, 4 # advance s1 pointer
+  addi  a3, a3, 4 # advance s2 pointer
+.Lfirstword:
+  l32i  a8, a2, 0 # get word from s1
+  l32i  a9, a3, 0 # get word from s2
+  slli  a5, a8, 1
+  bne a8, a9, .Lwne2
+  or  a9, a8, a5
+  ball  a9, a7, .Lnextword
+#endif /* !XCHAL_HAVE_LOOPS */
+
+  /* align (0 mod 4) */
+.Lprobeq:
+  /* Words are probably equal, but check for sure.
+     If not, loop over the rest of string using normal algorithm.  */
+
+  bnone a8, a4, .Leq  # if byte 0 is zero
+  movi  a5, MASK1 # mask for byte 1
+  movi  a6, MASK2 # mask for byte 2
+  bnone a8, a5, .Leq  # if byte 1 is zero
+  movi  a7, MASK3 # mask for byte 3
+  bnone a8, a6, .Leq  # if byte 2 is zero
+  bnone a8, a7, .Leq  # if byte 3 is zero
+  /* align (1 mod 4) */
+#if XCHAL_HAVE_DENSITY
+  addi.n  a2, a2, 4 # advance s1 pointer
+  addi.n  a3, a3, 4 # advance s2 pointer
+  /* align (1 mod 4) or (2 mod 4) */
+#else
+  addi  a2, a2, 4 # advance s1 pointer
+  addi  a3, a3, 4 # advance s2 pointer
+  or  a1, a1, a1  # nop
+#if XCHAL_HAVE_CONST16
+  or  a1, a1, a1  # nop
+#endif
+  /* align (2 mod 4) */
+#endif /* XCHAL_HAVE_DENSITY */
+#if XCHAL_HAVE_LOOPS
+1:
+  loop  a0, .Leq  # loop forever (a4 is bigger than max iters)
+  l32i  a8, a2, 0 # get word from s1
+  l32i  a9, a3, 0 # get word from s2
+  addi  a2, a2, 4 # advance s1 pointer
+  bne a8, a9, .Lwne
+  bnone a8, a4, .Leq  # if byte 0 is zero
+  bnone a8, a5, .Leq  # if byte 1 is zero
+  bnone a8, a6, .Leq  # if byte 2 is zero
+  bnone a8, a7, .Leq  # if byte 3 is zero
+  addi  a3, a3, 4 # advance s2 pointer
+  j 1b
+#else /* !XCHAL_HAVE_LOOPS */
+
+  j .Lfirstword2
+.Lnextword2:
+  addi  a3, a3, 4 # advance s2 pointer
+.Lfirstword2:
+  l32i  a8, a2, 0 # get word from s1
+  l32i  a9, a3, 0 # get word from s2
+  addi  a2, a2, 4 # advance s1 pointer
+  bne a8, a9, .Lwne
+  bnone a8, a4, .Leq  # if byte 0 is zero
+  bnone a8, a5, .Leq  # if byte 1 is zero
+  bnone a8, a6, .Leq  # if byte 2 is zero
+  bany  a8, a7, .Lnextword2 # if byte 3 is nonzero, iterate
+#endif /* !XCHAL_HAVE_LOOPS */
+
+  /* Words are equal; some byte is zero.  */
+.Leq: movi  a2, 0   # return equal
+  RET(16)
+
+.Lwne2: /* Words are not equal.  On big-endian processors, if none of the
+     bytes are zero, the return value can be determined by a simple
+     comparison.  */
+#if XCHAL_HAVE_BE
+  or  a10, a8, a5
+  bnall a10, a7, .Lsomezero
+  bgeu  a8, a9, .Lposreturn
+  movi  a2, -1
+  RET(16)
+.Lposreturn:
+  movi  a2, 1
+  RET(16)
+.Lsomezero: # There is probably some zero byte.
+#endif /* XCHAL_HAVE_BE */
+.Lwne:  /* Words are not equal.  */
+  xor a2, a8, a9  # get word with nonzero in byte that differs
+  bany  a2, a4, .Ldiff0 # if byte 0 differs
+  movi  a5, MASK1 # mask for byte 1
+  bnone a8, a4, .Leq  # if byte 0 is zero
+  bany  a2, a5, .Ldiff1 # if byte 1 differs
+  movi  a6, MASK2 # mask for byte 2
+  bnone a8, a5, .Leq  # if byte 1 is zero
+  bany  a2, a6, .Ldiff2 # if byte 2 differs
+  bnone a8, a6, .Leq  # if byte 2 is zero
+#if XCHAL_HAVE_BE
+.Ldiff3:
+.Ldiff2:
+.Ldiff1:
+  /* Byte 0 is equal (at least) and there is a difference before a zero
+     byte.  Just subtract words to get the return value.
+     The high order equal bytes cancel, leaving room for the sign.  */
+  sub a2, a8, a9
+  RET(16)
+
+.Ldiff0:
+  /* Need to make room for the sign, so can't subtract whole words.  */
+  extui a10, a8, 24, 8
+  extui a11, a9, 24, 8
+  sub a2, a10, a11
+  RET(16)
+
+#else /* !XCHAL_HAVE_BE */
+  /* Little-endian is a little more difficult because we can't subtract
+     whole words.  */
+.Ldiff3:
+  /* Bytes 0-2 are equal; byte 3 is different.
+     For little-endian need to have a sign bit for the difference.  */
+  extui a10, a8, 24, 8
+  extui a11, a9, 24, 8
+  sub a2, a10, a11
+  RET(16)
+
+.Ldiff0:
+  /* Byte 0 is different.  */
+  extui a10, a8, 0, 8
+  extui a11, a9, 0, 8
+  sub a2, a10, a11
+  RET(16)
+
+.Ldiff1:
+  /* Byte 0 is equal; byte 1 is different.  */
+  extui a10, a8, 8, 8
+  extui a11, a9, 8, 8
+  sub a2, a10, a11
+  RET(16)
+
+.Ldiff2:
+  /* Bytes 0-1 are equal; byte 2 is different.  */
+  extui a10, a8, 16, 8
+  extui a11, a9, 16, 8
+  sub a2, a10, a11
+  RET(16)
+
+#endif /* !XCHAL_HAVE_BE */
+#endif /* FLIX3 */
+#endif /* FLIX3 */
+
+  .end  schedule
+  .size strcmp, . - strcmp
+
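The "new algorithm" comment repeated in the strcmp variants above rests on the fact that every byte of ordinary ASCII text (32..127) has bit 5 or bit 6 set, so OR-ing a word with itself shifted left by one sets bit 6 of every byte unless that byte is zero. The aligned loops test those four bit-6 positions against MASK4 (0x40404040) and only drop into the exact per-byte checks at .Lprobeq when the quick test fails. A C sketch of the probe (names are illustrative, not from the patch):

    #include <stdbool.h>
    #include <stdint.h>

    #define MASK4 0x40404040u  /* bit 6 of every byte, as in the assembly */

    /* Quick filter used by the aligned strcmp loops: if every byte of
     * (w | w << 1) has bit 6 set, no byte of w can be zero as long as
     * the text is ordinary ASCII, so the loop continues without
     * per-byte checks.  A true result means "might hold a zero byte,
     * run the exact MASK0..MASK3 tests".
     */

    static bool may_contain_zero_byte(uint32_t w)
    {
      return ((w | (w << 1)) & MASK4) != MASK4;
    }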
diff --git a/libs/libc/machine/xtensa/arch_strcpy.S b/libs/libc/machine/xtensa/arch_strcpy.S
new file mode 100644
index 0000000..b062d87
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_strcpy.S
@@ -0,0 +1,243 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_strcpy.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+  .section .text
+  .begin schedule
+  .align  4
+  .literal_position
+  .global strcpy
+  .type strcpy, @function
+strcpy:
+  ENTRY(16)
+  /* a2 = dst, a3 = src */
+
+  mov a10, a2   # leave dst in return value register
+  movi  a4, MASK0
+  movi  a5, MASK1
+  movi  a6, MASK2
+  movi  a7, MASK3
+  bbsi.l  a3, 0, .Lsrc1mod2
+  bbsi.l  a3, 1, .Lsrc2mod4
+.Lsrcaligned:
+
+  /* Check if the destination is aligned.  */
+  movi  a8, 3
+  bnone a10, a8, .Laligned
+
+  j .Ldstunaligned
+
+.Lsrc1mod2: # src address is odd
+  l8ui  a8, a3, 0 # get byte 0
+  addi  a3, a3, 1 # advance src pointer
+  s8i a8, a10, 0  # store byte 0
+  beqz  a8, 1f    # if byte 0 is zero
+  addi  a10, a10, 1 # advance dst pointer
+  bbci.l  a3, 1, .Lsrcaligned # if src is now word-aligned
+
+.Lsrc2mod4: # src address is 2 mod 4
+  l8ui  a8, a3, 0 # get byte 0
+  /* 1-cycle interlock */
+  s8i a8, a10, 0  # store byte 0
+  beqz  a8, 1f    # if byte 0 is zero
+  l8ui  a8, a3, 1 # get byte 1
+  addi  a3, a3, 2 # advance src pointer
+  s8i a8, a10, 1  # store byte 1
+  addi  a10, a10, 2 # advance dst pointer
+  bnez  a8, .Lsrcaligned
+1:  RET(16)
+
+
+/* dst is word-aligned; src is word-aligned.  */
+
+  .align  4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  /* (2 mod 4) alignment for loop instruction */
+#else
+  /* (1 mod 4) alignment for loop instruction */
+  .byte 0
+  .byte 0
+#endif
+.Laligned:
+#if XCHAL_HAVE_DENSITY
+  _movi.n a8, 0   # set up for the maximum loop count
+#else
+  _movi a8, 0   # set up for the maximum loop count
+#endif
+  loop  a8, .Lz3  # loop forever (almost anyway)
+  l32i  a8, a3, 0 # get word from src
+  addi  a3, a3, 4 # advance src pointer
+  bnone a8, a4, .Lz0  # if byte 0 is zero
+  bnone a8, a5, .Lz1  # if byte 1 is zero
+  bnone a8, a6, .Lz2  # if byte 2 is zero
+  s32i  a8, a10, 0  # store word to dst
+  bnone a8, a7, .Lz3  # if byte 3 is zero
+  addi  a10, a10, 4 # advance dst pointer
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+1:  addi  a10, a10, 4 # advance dst pointer
+.Laligned:
+  l32i  a8, a3, 0 # get word from src
+  addi  a3, a3, 4 # advance src pointer
+  bnone a8, a4, .Lz0  # if byte 0 is zero
+  bnone a8, a5, .Lz1  # if byte 1 is zero
+  bnone a8, a6, .Lz2  # if byte 2 is zero
+  s32i  a8, a10, 0  # store word to dst
+  bany  a8, a7, 1b  # if byte 3 is nonzero, loop
+#endif /* !XCHAL_HAVE_LOOPS */
+
+.Lz3: /* Byte 3 is zero.  */
+  RET(16)
+
+.Lz0: /* Byte 0 is zero.  */
+#if XCHAL_HAVE_BE
+  movi  a8, 0
+#endif
+  s8i a8, a10, 0
+  RET(16)
+
+.Lz1: /* Byte 1 is zero.  */
+#if XCHAL_HAVE_BE
+        extui   a8, a8, 16, 16
+#endif
+  s16i  a8, a10, 0
+  RET(16)
+
+.Lz2: /* Byte 2 is zero.  */
+#if XCHAL_HAVE_BE
+        extui   a8, a8, 16, 16
+#endif
+  s16i  a8, a10, 0
+  movi  a8, 0
+  s8i a8, a10, 2
+  RET(16)
+
+#if 1
+/* For now just use byte copy loop for the unaligned destination case.  */
+
+  .align  4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  /* (2 mod 4) alignment for loop instruction */
+#else
+  /* (1 mod 4) alignment for loop instruction */
+  .byte 0
+  .byte 0
+#endif
+#endif
+.Ldstunaligned:
+
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  _movi.n a8, 0   # set up for the maximum loop count
+#else
+  _movi a8, 0   # set up for the maximum loop count
+#endif
+  loop  a8, 2f    # loop forever (almost anyway)
+#endif
+1:  l8ui  a8, a3, 0
+  addi  a3, a3, 1
+  s8i a8, a10, 0
+  addi  a10, a10, 1
+#if XCHAL_HAVE_LOOPS
+  beqz  a8, 2f
+#else
+  bnez  a8, 1b
+#endif
+2:  RET(16)
+
+#else /* 0 */
+
+/* This code is not functional yet.  */
+
+.Ldstunaligned:
+  l32i  a9, a2, 0 # load word from dst
+#if XCHAL_HAVE_BE
+  ssa8b a9    # rotate by dst alignment so that
+  src a9, a9, a9  # shift in loop will put back in place
+  ssa8l a9    # shift left by byte*8
+#else
+  ssa8l a9    # rotate by dst alignment so that
+  src a9, a9, a9  # shift in loop will put back in place
+  ssa8b a9    # shift left by 32-byte*8
+#endif
+
+/* dst is word-aligned; src is unaligned.  */
+
+.Ldstunalignedloop:
+  l32i  a8, a3, 0 # get word from src
+  /* 1-cycle interlock */
+  bnone a8, a4, .Lu0  # if byte 0 is zero
+  bnone a8, a5, .Lu1  # if byte 1 is zero
+  bnone a8, a6, .Lu2  # if byte 2 is zero
+  src a9, a8, a9  # combine last word and this word
+  s32i  a9, a10, 0  # store word to dst
+  bnone a8, a7, .Lu3  # if byte 3 is zero
+  l32i  a9, a3, 4 # get word from src
+  addi  a3, a3, 8 # advance src pointer
+  bnone a9, a4, .Lu4  # if byte 0 is zero
+  bnone a9, a5, .Lu5  # if byte 1 is zero
+  bnone a9, a6, .Lu6  # if byte 2 is zero
+  src a8, a9, a8  # combine last word and this word
+  s32i  a8, a10, 4  # store word to dst
+  addi  a10, a10, 8 # advance dst pointer
+  bany  a8, a7, .Ldstunalignedloop # if byte 3 is nonzero, iterate
+
+  /* Byte 7 is zero.  */
+.Lu7: RET(16)
+
+.Lu0: /* Byte 0 is zero.  */
+#if XCHAL_HAVE_BE
+  movi  a8, 0
+#endif
+  s8i a8, a10, 0
+  RET(16)
+
+.Lu1: /* Byte 1 is zero.  */
+#if XCHAL_HAVE_BE
+  extui   a8, a8, 16, 16
+#endif
+  s16i  a8, a10, 0
+  RET(16)
+
+.Lu2: /* Byte 2 is zero.  */
+  s16i  a8, a10, 0
+  movi  a8, 0
+  s8i a8, a10, 2
+  RET(16)
+
+#endif /* 0 */
+  .end schedule
+
+  .size strcpy, . - strcpy
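
The aligned loop in the strcpy code above never inspects single bytes: it loads a whole source word and uses bnone against MASK0..MASK3 to ask whether any byte lane holds the terminating NUL, storing full words until one does and then finishing through the .Lz0/.Lz1/.Lz2/.Lz3 tails. A minimal C sketch of that per-word test, assuming the little-endian mask values from xtensa_asm.h (the helper name word_has_zero_byte is hypothetical):

    #include <stdint.h>

    /* Byte-lane masks for a little-endian core, mirroring xtensa_asm.h */

    #define MASK0 0x000000ff
    #define MASK1 0x0000ff00
    #define MASK2 0x00ff0000
    #define MASK3 0xff000000

    static int word_has_zero_byte(uint32_t w)
    {
      /* bnone a8, aN, .LzN branches when (w & MASKn) == 0, i.e. when that
       * byte lane of the loaded word is the string terminator.
       */

      return (w & MASK0) == 0 || (w & MASK1) == 0 ||
             (w & MASK2) == 0 || (w & MASK3) == 0;
    }

On a big-endian core the mask table in xtensa_asm.h is reversed, so MASKn still names string byte n of the loaded word.
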
diff --git a/libs/libc/machine/xtensa/arch_strlen.S b/libs/libc/machine/xtensa/arch_strlen.S
new file mode 100644
index 0000000..686268e
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_strlen.S
@@ -0,0 +1,123 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_strlen.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+  .section .text
+  .begin schedule
+  .align  4
+  .literal_position
+  .global strlen
+  .type strlen, @function
+strlen:
+  ENTRY(16)
+  /* a2 = s */
+
+  addi  a3, a2, -4  # because we overincrement at the end of the loop
+  movi  a4, MASK0
+  movi  a5, MASK1
+  movi  a6, MASK2
+  movi  a7, MASK3
+  bbsi.l  a2, 0, .L1mod2
+  bbsi.l  a2, 1, .L2mod4
+  j .Laligned
+
+.L1mod2: # address is odd
+  l8ui  a8, a3, 4 # get byte 0
+  addi  a3, a3, 1 # advance string pointer
+  beqz  a8, .Lz3  # if byte 0 is zero
+  bbci.l  a3, 1, .Laligned # if string pointer is now word-aligned
+
+.L2mod4: # address is 2 mod 4
+  addi  a3, a3, 2 # advance ptr for aligned access
+  l32i  a8, a3, 0 # get word with first two bytes of string
+  bnone a8, a6, .Lz2  # if byte 2 (of word, not string) is zero
+  bany  a8, a7, .Laligned # if byte 3 (of word, not string) is nonzero
+
+  /* Byte 3 is zero.  */
+  addi  a3, a3, 3 # point to zero byte
+  sub a2, a3, a2  # subtract to get length
+  RET(16)
+
+
+/* String is word-aligned.  */
+
+  .align  4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  /* (2 mod 4) alignment for loop instruction */
+#else
+  /* (1 mod 4) alignment for loop instruction */
+  .byte 0
+  .byte 0
+#endif
+#endif
+.Laligned:
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  _movi.n a8, 0   # set up for the maximum loop count
+#else
+  _movi a8, 0   # set up for the maximum loop count
+#endif
+  loop  a8, .Lz3  # loop forever (almost anyway)
+#endif
+1:  l32i  a8, a3, 4 # get next word of string
+  addi  a3, a3, 4 # advance string pointer
+  bnone a8, a4, .Lz0  # if byte 0 is zero
+  bnone a8, a5, .Lz1  # if byte 1 is zero
+  bnone a8, a6, .Lz2  # if byte 2 is zero
+#if XCHAL_HAVE_LOOPS
+  bnone a8, a7, .Lz3  # if byte 3 is zero
+#else
+  bany  a8, a7, 1b  # repeat if byte 3 is non-zero
+#endif
+
+.Lz3: /* Byte 3 is zero.  */
+  addi  a3, a3, 3 # point to zero byte
+  /* Fall through....  */
+
+.Lz0: /* Byte 0 is zero.  */
+  sub a2, a3, a2  # subtract to get length
+  RET(16)
+
+.Lz1: /* Byte 1 is zero.  */
+  addi  a3, a3, 1 # point to zero byte
+  sub a2, a3, a2  # subtract to get length
+  RET(16)
+
+.Lz2: /* Byte 2 is zero.  */
+  addi  a3, a3, 2 # point to zero byte
+  sub a2, a3, a2  # subtract to get length
+  RET(16)
+
+  .end schedule
+
+  .size strlen, . - strlen
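
The routine above is the usual word-at-a-time strlen: advance byte-wise until the pointer is word-aligned, then examine one aligned word per iteration with the same MASKn lane tests, and finally add the offset of the zero lane. A rough little-endian C sketch of the same structure (strlen_sketch is a hypothetical name used only for illustration):

    #include <stddef.h>
    #include <stdint.h>

    static size_t strlen_sketch(const char *s)
    {
      const char *p = s;

      /* Byte-wise until p is word-aligned, as .L1mod2/.L2mod4 do */

      while (((uintptr_t)p & 3) != 0)
        {
          if (*p == '\0')
            {
              return (size_t)(p - s);
            }

          p++;
        }

      /* One aligned word per iteration, as the .Laligned loop does */

      for (; ; p += 4)
        {
          uint32_t w = *(const uint32_t *)p;

          if ((w & 0x000000ff) == 0) return (size_t)(p     - s); /* byte 0 */
          if ((w & 0x0000ff00) == 0) return (size_t)(p + 1 - s); /* byte 1 */
          if ((w & 0x00ff0000) == 0) return (size_t)(p + 2 - s); /* byte 2 */
          if ((w & 0xff000000) == 0) return (size_t)(p + 3 - s); /* byte 3 */
        }
    }

Reading the rest of an aligned word past the terminator is safe here because an aligned 32-bit load cannot cross a page boundary.
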
diff --git a/libs/libc/machine/xtensa/arch_strncpy.S b/libs/libc/machine/xtensa/arch_strncpy.S
new file mode 100644
index 0000000..297f00c
--- /dev/null
+++ b/libs/libc/machine/xtensa/arch_strncpy.S
@@ -0,0 +1,265 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_strncpy.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+  .section .text
+  .begin schedule
+  .align  4
+  .literal_position
+__strncpy_aux:
+
+.Lsrc1mod2: # src address is odd
+  l8ui  a8, a3, 0 # get byte 0
+  addi  a3, a3, 1 # advance src pointer
+  s8i a8, a10, 0  # store byte 0
+  addi  a4, a4, -1  # decrement n
+  beqz    a4, .Lret       # if n is zero
+  addi  a10, a10, 1 # advance dst pointer
+  beqz  a8, .Lfill  # if byte 0 is zero
+  bbci.l  a3, 1, .Lsrcaligned # if src is now word-aligned
+
+.Lsrc2mod4: # src address is 2 mod 4
+  l8ui  a8, a3, 0 # get byte 0
+  addi  a4, a4, -1  # decrement n
+  s8i a8, a10, 0  # store byte 0
+  beqz    a4, .Lret       # if n is zero
+  addi  a10, a10, 1 # advance dst pointer
+  beqz  a8, .Lfill  # if byte 0 is zero
+  l8ui  a8, a3, 1 # get byte 1
+  addi  a3, a3, 2 # advance src pointer
+  s8i a8, a10, 0  # store byte 1
+  addi  a4, a4, -1  # decrement n
+  beqz    a4, .Lret       # if n is zero
+  addi  a10, a10, 1 # advance dst pointer
+  bnez  a8, .Lsrcaligned
+  j .Lfill
+
+.Lret:
+  RET(16)
+
+  .align  4
+  .global strncpy
+  .type strncpy, @function
+strncpy:
+  ENTRY(16)
+  /* a2 = dst, a3 = src */
+
+  mov a10, a2   # leave dst in return value register
+  beqz    a4, .Lret       # if n is zero
+
+  movi  a11, MASK0
+  movi  a5, MASK1
+  movi  a6, MASK2
+  movi  a7, MASK3
+  bbsi.l  a3, 0, .Lsrc1mod2
+  bbsi.l  a3, 1, .Lsrc2mod4
+.Lsrcaligned:
+
+  /* Check if the destination is aligned.  */
+  movi  a8, 3
+  bnone a10, a8, .Laligned
+
+  j .Ldstunaligned
+
+
+/* Fill the dst with zeros -- n is at least 1.  */
+
+.Lfill:
+  movi  a9, 0
+  bbsi.l  a10, 0, .Lfill1mod2
+  bbsi.l  a10, 1, .Lfill2mod4
+.Lfillaligned:
+  blti  a4, 4, .Lfillcleanup
+
+  /* Loop filling complete words with zero.  */
+#if XCHAL_HAVE_LOOPS
+
+  srai  a8, a4, 2
+  loop  a8, 1f
+  s32i  a9, a10, 0
+  addi  a10, a10, 4
+
+1:  slli  a8, a8, 2
+  sub a4, a4, a8
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+1:  s32i  a9, a10, 0
+  addi  a10, a10, 4
+  addi  a4, a4, -4
+  bgei    a4, 4, 1b
+
+#endif /* !XCHAL_HAVE_LOOPS */
+
+  beqz  a4, 2f
+
+.Lfillcleanup:
+  /* Fill leftover (1 to 3) bytes with zero.  */
+  s8i a9, a10, 0  # store byte 0
+  addi  a4, a4, -1  # decrement n
+  addi  a10, a10, 1
+  bnez    a4, .Lfillcleanup
+
+2:  RET(16)
+
+.Lfill1mod2: # dst address is odd
+  s8i a9, a10, 0  # store byte 0
+  addi  a4, a4, -1  # decrement n
+  beqz    a4, 2b    # if n is zero
+  addi    a10, a10, 1 # advance dst pointer
+  bbci.l  a10, 1, .Lfillaligned # if dst is now word-aligned
+
+.Lfill2mod4: # dst address is 2 mod 4
+  s8i a9, a10, 0  # store byte 0
+  addi  a4, a4, -1  # decrement n
+  beqz    a4, 2b    # if n is zero
+  s8i a9, a10, 1  # store byte 1
+  addi  a4, a4, -1  # decrement n
+  beqz    a4, 2b    # if n is zero
+  addi    a10, a10, 2 # advance dst pointer
+  j .Lfillaligned
+
+
+/* dst is word-aligned; src is word-aligned; n is at least 1.  */
+
+  .align  4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  /* (2 mod 4) alignment for loop instruction */
+#else
+  /* (1 mod 4) alignment for loop instruction */
+  .byte 0
+  .byte 0
+#endif
+#endif
+.Laligned:
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  _movi.n a8, 0   # set up for the maximum loop count
+#else
+  _movi a8, 0   # set up for the maximum loop count
+#endif
+  loop  a8, 1f    # loop forever (almost anyway)
+  blti  a4, 5, .Ldstunaligned # n is near limit; do one at a time
+  l32i  a8, a3, 0 # get word from src
+  addi  a3, a3, 4 # advance src pointer
+  bnone a8, a11, .Lz0 # if byte 0 is zero
+  bnone a8, a5, .Lz1  # if byte 1 is zero
+  bnone a8, a6, .Lz2  # if byte 2 is zero
+  s32i  a8, a10, 0  # store word to dst
+  addi  a4, a4, -4  # decrement n
+  addi  a10, a10, 4 # advance dst pointer
+  bnone a8, a7, .Lfill  # if byte 3 is zero
+1:
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+1:  blti  a4, 5, .Ldstunaligned # n is near limit; do one at a time
+  l32i  a8, a3, 0 # get word from src
+  addi  a3, a3, 4 # advance src pointer
+  bnone a8, a11, .Lz0 # if byte 0 is zero
+  bnone a8, a5, .Lz1  # if byte 1 is zero
+  bnone a8, a6, .Lz2  # if byte 2 is zero
+  s32i  a8, a10, 0  # store word to dst
+  addi  a4, a4, -4  # decrement n
+  addi  a10, a10, 4 # advance dst pointer
+  bany  a8, a7, 1b  # no zeroes
+#endif /* !XCHAL_HAVE_LOOPS */
+
+  j .Lfill
+
+.Lz0: /* Byte 0 is zero.  */
+#if XCHAL_HAVE_BE
+  movi  a8, 0
+#endif
+  s8i a8, a10, 0
+  addi  a4, a4, -1  # decrement n
+  addi  a10, a10, 1 # advance dst pointer
+  j .Lfill
+
+.Lz1: /* Byte 1 is zero.  */
+#if XCHAL_HAVE_BE
+  extui   a8, a8, 16, 16
+#endif
+  s16i  a8, a10, 0
+  addi  a4, a4, -2  # decrement n
+  addi  a10, a10, 2 # advance dst pointer
+  j .Lfill
+
+.Lz2: /* Byte 2 is zero.  */
+#if XCHAL_HAVE_BE
+  extui   a8, a8, 16, 16
+#endif
+  s16i  a8, a10, 0
+  movi  a8, 0
+  s8i a8, a10, 2
+  addi  a4, a4, -3  # decrement n
+  addi  a10, a10, 3 # advance dst pointer
+  j .Lfill
+
+  .align  4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  /* (2 mod 4) alignment for loop instruction */
+#else
+  /* (1 mod 4) alignment for loop instruction */
+  .byte 0
+  .byte 0
+#endif
+#endif
+.Ldstunaligned:
+
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  _movi.n a8, 0   # set up for the maximum loop count
+#else
+  _movi a8, 0   # set up for the maximum loop count
+#endif
+  loop  a8, 2f    # loop forever (almost anyway)
+#endif
+1:  l8ui  a8, a3, 0
+  addi  a3, a3, 1
+  s8i a8, a10, 0
+  addi  a4, a4, -1
+  beqz  a4, 3f
+  addi  a10, a10, 1
+#if XCHAL_HAVE_LOOPS
+  beqz  a8, 2f
+#else
+  bnez  a8, 1b
+#endif
+2:  j .Lfill
+
+3:  RET(16)
+  .end schedule
+
+  .size strncpy, . - strncpy
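
The .Lfill paths above are what give this routine the ISO C strncpy contract: after the source terminator is reached, every remaining destination byte up to n is written as zero, and no terminator is added when the source is at least n bytes long. A plain C sketch of that contract (strncpy_sketch is a hypothetical name; the assembly does the same work in word-sized chunks where alignment allows):

    #include <stddef.h>

    static char *strncpy_sketch(char *dst, const char *src, size_t n)
    {
      size_t i = 0;

      /* Copy bytes until the terminator or until n bytes are written */

      for (; i < n && src[i] != '\0'; i++)
        {
          dst[i] = src[i];
        }

      /* Zero-fill the remainder, mirroring .Lfill */

      for (; i < n; i++)
        {
          dst[i] = '\0';
        }

      return dst;
    }
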
diff --git a/libs/libc/machine/xtensa/xtensa_asm.h b/libs/libc/machine/xtensa/xtensa_asm.h
new file mode 100644
index 0000000..9913763
--- /dev/null
+++ b/libs/libc/machine/xtensa/xtensa_asm.h
@@ -0,0 +1,62 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/xtensa_asm.h
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include <arch/chip/core-isa.h>
+
+/****************************************************************************
+ * Assembly Language Macros
+ ****************************************************************************/
+
+  .macro  src_b r, w0, w1
+#if XCHAL_HAVE_BE
+  src \r, \w0, \w1
+#else
+  src \r, \w1, \w0
+#endif
+  .endm
+
+  .macro  ssa8  r
+#if XCHAL_HAVE_BE
+  ssa8b \r
+#else
+  ssa8l \r
+#endif
+  .endm
+
+/****************************************************************************
+ * Pre-processor Macros
+ ****************************************************************************/
+
+#if XCHAL_HAVE_BE
+#  define MASK0 0xff000000
+#  define MASK1 0x00ff0000
+#  define MASK2 0x0000ff00
+#  define MASK3 0x000000ff
+#else
+#  define MASK0 0x000000ff
+#  define MASK1 0x0000ff00
+#  define MASK2 0x00ff0000
+#  define MASK3 0xff000000
+#endif
+
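
The two mask tables exist so that MASKn always selects string byte n of an aligned 32-bit load regardless of byte order, just as the src_b and ssa8 macros pick the matching shift direction for the unaligned-copy paths. A small C self-check of the MASK0 property (mask0_selects_first_byte is a hypothetical helper shown only to illustrate the layout):

    #include <stdint.h>
    #include <string.h>

    static int mask0_selects_first_byte(void)
    {
      const char buf[4] = { 'A', 'B', 'C', 'D' };
      uint32_t w;

      memcpy(&w, buf, sizeof(w));  /* word load of the bytes "ABCD" */

    #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
      return (w & 0xff000000) == ((uint32_t)'A' << 24); /* MASK0 when XCHAL_HAVE_BE */
    #else
      return (w & 0x000000ff) == (uint32_t)'A';         /* MASK0 otherwise */
    #endif
    }
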