From 2164faec8a22d2934ae775587c38fb15e5852e51 Mon Sep 17 00:00:00 2001 From: openharmony_ci <120357966@qq.com> Date: Sat, 9 Dec 2023 09:54:06 +0000 Subject: [PATCH] =?UTF-8?q?=E5=9B=9E=E9=80=80=20'Pull=20Request=20!22=20:?= =?UTF-8?q?=20=E5=B0=86optimized-routine=E4=BB=8E21.02=E5=8D=87=E7=BA=A7?= =?UTF-8?q?=E8=87=B323.01=E7=89=88=E6=9C=AC'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- LICENSE | 230 +------- Makefile | 7 +- OAT.xml | 39 +- README | 14 +- config.mk.dist | 25 +- math/Dir.mk | 17 +- math/README.contributors | 78 --- math/aarch64/v_cos.c | 87 --- math/aarch64/v_cosf.c | 82 --- math/aarch64/v_exp.c | 125 ----- math/aarch64/v_exp2f.c | 113 ---- math/aarch64/v_exp_data.c | 146 ------ math/aarch64/v_expf.c | 122 ----- math/aarch64/v_log.c | 100 ---- math/aarch64/v_log_data.c | 156 ------ math/aarch64/v_logf.c | 74 --- math/aarch64/v_math.h | 135 ----- math/aarch64/v_powf.c | 148 ------ math/aarch64/v_sin.c | 97 ---- math/aarch64/v_sinf.c | 82 --- math/cosf.c | 6 +- math/erf.c | 2 +- math/erf_data.c | 2 +- math/erff.c | 2 +- math/erff_data.c | 2 +- math/exp.c | 2 +- math/exp10.c | 129 ----- math/exp2.c | 2 +- math/exp2f.c | 2 +- math/exp2f_data.c | 2 +- math/exp_data.c | 23 +- math/expf.c | 2 +- math/include/mathlib.h | 69 ++- math/log.c | 2 +- math/log2.c | 2 +- math/log2_data.c | 2 +- math/log2f.c | 2 +- math/log2f_data.c | 2 +- math/log_data.c | 2 +- math/logf.c | 6 +- math/logf_data.c | 2 +- math/math_config.h | 34 +- math/math_err.c | 2 +- math/math_errf.c | 2 +- math/pow.c | 2 +- math/pow_log_data.c | 2 +- math/powf.c | 2 +- math/powf_log2_data.c | 2 +- math/s_cos.c | 6 + math/s_cosf.c | 6 + math/s_exp.c | 6 + math/s_exp2f.c | 6 + math/s_exp2f_1u.c | 6 + math/s_expf.c | 6 + math/s_expf_1u.c | 6 + math/s_log.c | 6 + math/s_logf.c | 6 + math/s_pow.c | 6 + math/s_powf.c | 6 + math/s_sin.c | 6 + math/s_sinf.c | 6 + math/sincosf.c | 6 +- math/sincosf.h | 6 +- math/sincosf_data.c | 2 +- math/sinf.c | 6 +- math/test/mathbench.c | 369 ++++++++----- math/test/mathbench_funcs.h | 62 --- math/test/mathbench_wrappers.h | 66 --- math/test/mathtest.c | 16 +- math/test/rtest/dotest.c | 2 +- math/test/rtest/intern.h | 2 +- math/test/rtest/main.c | 2 +- math/test/rtest/random.c | 2 +- math/test/rtest/random.h | 2 +- math/test/rtest/semi.c | 2 +- math/test/rtest/semi.h | 2 +- math/test/rtest/types.h | 2 +- math/test/rtest/wrappers.c | 2 +- math/test/rtest/wrappers.h | 2 +- math/test/runulp.sh | 127 +++-- math/test/testcases/directed/cosf.tst | 2 +- math/test/testcases/directed/erf.tst | 2 +- math/test/testcases/directed/erff.tst | 2 +- math/test/testcases/directed/exp.tst | 2 +- math/test/testcases/directed/exp10.tst | 15 - math/test/testcases/directed/exp2.tst | 2 +- math/test/testcases/directed/exp2f.tst | 2 +- math/test/testcases/directed/expf.tst | 2 +- math/test/testcases/directed/log.tst | 2 +- math/test/testcases/directed/log2.tst | 2 +- math/test/testcases/directed/log2f.tst | 2 +- math/test/testcases/directed/logf.tst | 2 +- math/test/testcases/directed/pow.tst | 2 +- math/test/testcases/directed/powf.tst | 2 +- math/test/testcases/directed/sincosf.tst | 2 +- math/test/testcases/directed/sinf.tst | 2 +- math/test/testcases/random/double.tst | 2 +- math/test/testcases/random/float.tst | 2 +- math/test/ulp.c | 245 +++++---- math/test/ulp.h | 31 +- math/test/ulp_funcs.h | 40 -- math/test/ulp_wrappers.h | 37 -- math/tgamma128.c | 351 ------------- math/tgamma128.h | 141 ----- math/tools/cos.sollya | 2 +- math/tools/exp.sollya | 2 +- 
math/tools/exp2.sollya | 2 +- math/tools/log.sollya | 2 +- math/tools/log2.sollya | 2 +- math/tools/log2_abs.sollya | 2 +- math/tools/log_abs.sollya | 2 +- math/tools/plot.py | 2 +- math/tools/remez.jl | 2 +- math/tools/sin.sollya | 2 +- math/tools/tgamma128_gen.jl | 212 -------- math/tools/v_exp.sollya | 2 +- math/tools/v_log.sollya | 2 +- math/tools/v_sin.sollya | 2 +- math/v_cos.c | 87 +++ math/v_cosf.c | 76 +++ math/v_exp.c | 94 ++++ math/v_exp.h | 14 + math/v_exp2f.c | 78 +++ math/{aarch64 => }/v_exp2f_1u.c | 61 ++- math/v_exp_data.c | 403 ++++++++++++++ math/v_expf.c | 83 +++ math/{aarch64 => }/v_expf_1u.c | 69 +-- math/v_log.c | 104 ++++ math/v_log.h | 18 + math/v_log_data.c | 158 ++++++ math/v_logf.c | 73 +++ math/v_math.h | 641 +++++++++++++++++++++++ math/{aarch64 => }/v_pow.c | 21 +- math/v_powf.c | 235 +++++++++ math/v_sin.c | 86 +++ math/v_sinf.c | 75 +++ math/vn_cos.c | 12 + math/vn_cosf.c | 12 + math/vn_exp.c | 12 + math/vn_exp2f.c | 12 + math/vn_exp2f_1u.c | 11 + math/vn_expf.c | 12 + math/vn_expf_1u.c | 11 + math/vn_log.c | 12 + math/vn_logf.c | 12 + math/vn_pow.c | 12 + math/vn_powf.c | 12 + math/vn_sin.c | 12 + math/vn_sinf.c | 12 + networking/Dir.mk | 2 +- networking/aarch64/chksum_simd.c | 2 +- networking/arm/chksum_simd.c | 2 +- networking/chksum.c | 2 +- networking/chksum_common.h | 2 +- networking/include/networking.h | 2 +- networking/test/chksum.c | 2 +- string/Dir.mk | 2 +- string/README.contributors | 30 -- string/aarch64/__mtag_tag_region.S | 6 +- string/aarch64/__mtag_tag_zero_region.S | 6 +- string/aarch64/asmdefs.h | 106 ---- string/aarch64/check-arch.S | 6 +- string/aarch64/memchr-mte.S | 58 +- string/aarch64/memchr-sve.S | 6 +- string/aarch64/memchr.S | 6 +- string/aarch64/memcmp-sve.S | 6 +- string/aarch64/memcmp.S | 239 ++++----- string/aarch64/memcpy-advsimd.S | 6 +- string/aarch64/memcpy-mops.S | 21 - string/aarch64/memcpy-sve.S | 177 ------- string/aarch64/memcpy.S | 6 +- string/aarch64/memmove-mops.S | 21 - string/aarch64/memrchr.S | 51 +- string/aarch64/memset-mops.S | 20 - string/aarch64/memset.S | 6 +- string/aarch64/stpcpy-mte.S | 10 + string/aarch64/stpcpy-sve.S | 2 +- string/aarch64/stpcpy.S | 2 +- string/aarch64/strchr-mte.S | 58 +- string/aarch64/strchr-sve.S | 6 +- string/aarch64/strchr.S | 6 +- string/aarch64/strchrnul-mte.S | 47 +- string/aarch64/strchrnul-sve.S | 2 +- string/aarch64/strchrnul.S | 6 +- string/aarch64/strcmp-mte.S | 189 +++++++ string/aarch64/strcmp-sve.S | 6 +- string/aarch64/strcmp.S | 238 ++++----- string/aarch64/strcpy-mte.S | 161 ++++++ string/aarch64/strcpy-sve.S | 6 +- string/aarch64/strcpy.S | 395 +++++++++----- string/aarch64/strlen-mte.S | 41 +- string/aarch64/strlen-sve.S | 6 +- string/aarch64/strlen.S | 21 +- string/aarch64/strncmp-mte.S | 307 +++++++++++ string/aarch64/strncmp-sve.S | 6 +- string/aarch64/strncmp.S | 238 ++++----- string/aarch64/strnlen-sve.S | 6 +- string/aarch64/strnlen.S | 60 ++- string/aarch64/strrchr-mte.S | 58 +- string/aarch64/strrchr-sve.S | 6 +- string/aarch64/strrchr.S | 6 +- string/bench/memcpy.c | 170 ++---- string/bench/memset.c | 243 --------- string/bench/strlen.c | 16 +- string/include/benchlib.h | 2 +- string/include/stringlib.h | 15 +- string/test/__mtag_tag_region.c | 2 +- string/test/__mtag_tag_zero_region.c | 2 +- string/test/memchr.c | 2 +- string/test/memcmp.c | 2 +- string/test/memcpy.c | 10 +- string/test/memmove.c | 10 +- string/test/memrchr.c | 2 +- string/test/memset.c | 5 +- string/test/mte.h | 2 +- string/test/stpcpy.c | 7 +- string/test/strchr.c | 2 +- 
string/test/strchrnul.c | 2 +- string/test/strcmp.c | 7 +- string/test/strcpy.c | 7 +- string/test/stringtest.h | 2 +- string/test/strlen.c | 5 +- string/test/strncmp.c | 7 +- string/test/strnlen.c | 2 +- string/test/strrchr.c | 2 +- string/x86_64/check-arch.S | 2 +- 226 files changed, 4783 insertions(+), 5035 deletions(-) delete mode 100644 math/README.contributors delete mode 100644 math/aarch64/v_cos.c delete mode 100644 math/aarch64/v_cosf.c delete mode 100644 math/aarch64/v_exp.c delete mode 100644 math/aarch64/v_exp2f.c delete mode 100644 math/aarch64/v_exp_data.c delete mode 100644 math/aarch64/v_expf.c delete mode 100644 math/aarch64/v_log.c delete mode 100644 math/aarch64/v_log_data.c delete mode 100644 math/aarch64/v_logf.c delete mode 100644 math/aarch64/v_math.h delete mode 100644 math/aarch64/v_powf.c delete mode 100644 math/aarch64/v_sin.c delete mode 100644 math/aarch64/v_sinf.c delete mode 100644 math/exp10.c create mode 100644 math/s_cos.c create mode 100644 math/s_cosf.c create mode 100644 math/s_exp.c create mode 100644 math/s_exp2f.c create mode 100644 math/s_exp2f_1u.c create mode 100644 math/s_expf.c create mode 100644 math/s_expf_1u.c create mode 100644 math/s_log.c create mode 100644 math/s_logf.c create mode 100644 math/s_pow.c create mode 100644 math/s_powf.c create mode 100644 math/s_sin.c create mode 100644 math/s_sinf.c delete mode 100644 math/test/mathbench_funcs.h delete mode 100644 math/test/mathbench_wrappers.h delete mode 100644 math/test/testcases/directed/exp10.tst delete mode 100644 math/test/ulp_funcs.h delete mode 100644 math/test/ulp_wrappers.h delete mode 100644 math/tgamma128.c delete mode 100644 math/tgamma128.h delete mode 100644 math/tools/tgamma128_gen.jl create mode 100644 math/v_cos.c create mode 100644 math/v_cosf.c create mode 100644 math/v_exp.c create mode 100644 math/v_exp.h create mode 100644 math/v_exp2f.c rename math/{aarch64 => }/v_exp2f_1u.c (43%) create mode 100644 math/v_exp_data.c create mode 100644 math/v_expf.c rename math/{aarch64 => }/v_expf_1u.c (39%) create mode 100644 math/v_log.c create mode 100644 math/v_log.h create mode 100644 math/v_log_data.c create mode 100644 math/v_logf.c create mode 100644 math/v_math.h rename math/{aarch64 => }/v_pow.c (35%) create mode 100644 math/v_powf.c create mode 100644 math/v_sin.c create mode 100644 math/v_sinf.c create mode 100644 math/vn_cos.c create mode 100644 math/vn_cosf.c create mode 100644 math/vn_exp.c create mode 100644 math/vn_exp2f.c create mode 100644 math/vn_exp2f_1u.c create mode 100644 math/vn_expf.c create mode 100644 math/vn_expf_1u.c create mode 100644 math/vn_log.c create mode 100644 math/vn_logf.c create mode 100644 math/vn_pow.c create mode 100644 math/vn_powf.c create mode 100644 math/vn_sin.c create mode 100644 math/vn_sinf.c delete mode 100644 string/README.contributors delete mode 100644 string/aarch64/asmdefs.h delete mode 100644 string/aarch64/memcpy-mops.S delete mode 100644 string/aarch64/memcpy-sve.S delete mode 100644 string/aarch64/memmove-mops.S delete mode 100644 string/aarch64/memset-mops.S create mode 100644 string/aarch64/stpcpy-mte.S create mode 100644 string/aarch64/strcmp-mte.S create mode 100644 string/aarch64/strcpy-mte.S create mode 100644 string/aarch64/strncmp-mte.S delete mode 100644 string/bench/memset.c diff --git a/LICENSE b/LICENSE index 20a4b77..2543b82 100644 --- a/LICENSE +++ b/LICENSE @@ -1,11 +1,6 @@ -MIT OR Apache-2.0 WITH LLVM-exception -===================================== - - MIT License ------------ -Copyright (c) 1999-2022, 
Arm Limited. +Copyright (c) 1999-2019, Arm Limited. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -24,226 +19,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -Apache-2.0 WITH LLVM-exception ------------------------------- - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. 
The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - - ---- LLVM Exceptions to the Apache 2.0 License ---- - -As an exception, if, as a result of your compiling your source code, portions -of this Software are embedded into an Object form of such source code, you -may redistribute such embedded portions in such Object form without complying -with the conditions of Sections 4(a), 4(b) and 4(d) of the License. - -In addition, if you combine or link compiled forms of this Software with -software that is licensed under the GPLv2 ("Combined Software") and if a -court of competent jurisdiction determines that the patent provision (Section -3), the indemnity provision (Section 9) or other Section of the License -conflicts with the conditions of the GPLv2, you may retroactively and -prospectively choose to deem waived or otherwise exclude such Section(s) of -the License, but only in their entirety and only with respect to the Combined -Software. diff --git a/Makefile b/Makefile index c487896..169f89e 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Makefile - requires GNU make # -# Copyright (c) 2018-2022, Arm Limited. -# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +# Copyright (c) 2018-2020, Arm Limited. +# SPDX-License-Identifier: MIT srcdir = . prefix = /usr @@ -11,7 +11,6 @@ includedir = $(prefix)/include # Configure these in config.mk, do not make changes in this file. SUBS = math string networking -PLSUBS = math HOST_CC = cc HOST_CFLAGS = -std=c99 -O2 HOST_LDFLAGS = @@ -21,7 +20,6 @@ CPPFLAGS = CFLAGS = -std=c99 -O2 CFLAGS_SHARED = -fPIC CFLAGS_ALL = -Ibuild/include $(CPPFLAGS) $(CFLAGS) -CFLAGS_PL = -Ibuild/pl/include $(CPPFLAGS) $(CFLAGS) -DPL LDFLAGS = LDLIBS = AR = $(CROSS_COMPILE)ar @@ -53,7 +51,6 @@ $(DIRS): mkdir -p $@ $(filter %.os,$(ALL_FILES)): CFLAGS_ALL += $(CFLAGS_SHARED) -$(filter %.os,$(ALL_FILES)): CFLAGS_PL += $(CFLAGS_SHARED) build/%.o: $(srcdir)/%.S $(CC) $(CFLAGS_ALL) -c -o $@ $< diff --git a/OAT.xml b/OAT.xml index ab48a78..71acb93 100644 --- a/OAT.xml +++ b/OAT.xml @@ -19,7 +19,7 @@ policylist: 1. policy: If the OAT-Default.xml policies do not meet your requirements, please add policies here. 2. policyitem: The fields type, name, path, desc is required, and the fields rule, group, filefilter is optional,the default value is: - + 3. 
policyitem type: "compatibility" is used to check license compatibility in the specified path; "license" is used to check source license header in the specified path; @@ -49,43 +49,10 @@ All configurations in this file will be merged to OAT-Default.xml, if you have a - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - - + diff --git a/README b/README index a2143a2..9e1a34f 100644 --- a/README +++ b/README @@ -2,17 +2,14 @@ Arm Optimized Routines ---------------------- This repository contains implementations of library functions -provided by Arm. The outbound license is available under a dual -license, at the user’s election, as reflected in the LICENSE file. -Contributions to this project are accepted, but Contributors have -to sign an Assignment Agreement, please follow the instructions in +provided by Arm under MIT License (See LICENSE). Contributions +to this project are accepted, but Contributors have to sign an +Assignment Agreement, please follow the instructions in contributor-agreement.pdf. This is needed so upstreaming code -to projects that require copyright assignment is possible. Further -contribution requirements are documented in README.contributors of -the appropriate subdirectory. +to projects that require copyright assignment is possible. Regular quarterly releases are tagged as vYY.MM, the latest -release is v23.01. +release is v21.02. Source code layout: @@ -27,7 +24,6 @@ networking/test/ - networking test and benchmark related sources. string/ - string routines subproject sources. string/include/ - string library public headers. string/test/ - string test and benchmark related sources. -pl/... - separately maintained performance library code. The steps to build the target libraries and run the tests: diff --git a/config.mk.dist b/config.mk.dist index c4a6dba..177e1ac 100644 --- a/config.mk.dist +++ b/config.mk.dist @@ -1,14 +1,11 @@ # Example config.mk # -# Copyright (c) 2018-2022, Arm Limited. -# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +# Copyright (c) 2018-2020, Arm Limited. +# SPDX-License-Identifier: MIT # Subprojects to build SUBS = math string networking -# Subsubprojects to build if subproject pl is built -PLSUBS = math - # Target architecture: aarch64, arm or x86_64 ARCH = aarch64 @@ -59,22 +56,8 @@ math-cflags += -ffp-contract=fast -fno-math-errno # Use with clang. #math-cflags += -ffp-contract=fast -# Disable/enable SVE vector math code and tests -WANT_SVE_MATH = 0 -ifeq ($(WANT_SVE_MATH), 1) - math-cflags += -march=armv8.2-a+sve -endif -math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH) - -# If defined to 1, set errno in math functions according to ISO C. Many math -# libraries do not set errno, so this is 0 by default. It may need to be -# set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0. -WANT_ERRNO = 0 -math-cflags += -DWANT_ERRNO=$(WANT_ERRNO) - -# If set to 1, set fenv in vector math routines. -WANT_SIMD_EXCEPT = 0 -math-cflags += -DWANT_SIMD_EXCEPT=$(WANT_SIMD_EXCEPT) +# Disable vector math code +#math-cflags += -DWANT_VMATH=0 # Disable fenv checks #math-ulpflags = -q -f diff --git a/math/Dir.mk b/math/Dir.mk index d6385d2..3b841ab 100644 --- a/math/Dir.mk +++ b/math/Dir.mk @@ -1,14 +1,12 @@ # Makefile fragment - requires GNU make # -# Copyright (c) 2019-2022, Arm Limited. -# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +# Copyright (c) 2019, Arm Limited. 
+# SPDX-License-Identifier: MIT S := $(srcdir)/math B := build/math math-lib-srcs := $(wildcard $(S)/*.[cS]) -math-lib-srcs += $(wildcard $(S)/$(ARCH)/*.[cS]) - math-test-srcs := \ $(S)/test/mathtest.c \ $(S)/test/mathbench.c \ @@ -17,7 +15,6 @@ math-test-srcs := \ math-test-host-srcs := $(wildcard $(S)/test/rtest/*.[cS]) math-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h)) -math-test-includes := $(patsubst $(S)/%,build/include/%,$(wildcard $(S)/test/*.h)) math-libs := \ build/lib/libmathlib.so \ @@ -45,11 +42,10 @@ math-files := \ $(math-tools) \ $(math-host-tools) \ $(math-includes) \ - $(math-test-includes) \ -all-math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes) +all-math: $(math-libs) $(math-tools) $(math-includes) -$(math-objs): $(math-includes) $(math-test-includes) +$(math-objs): $(math-includes) $(math-objs): CFLAGS_ALL += $(math-cflags) $(B)/test/mathtest.o: CFLAGS_ALL += -fmath-errno $(math-host-objs): CC = $(HOST_CC) @@ -87,9 +83,6 @@ build/bin/ulp: $(B)/test/ulp.o build/lib/libmathlib.a build/include/%.h: $(S)/include/%.h cp $< $@ -build/include/test/%.h: $(S)/test/%.h - cp $< $@ - build/bin/%.sh: $(S)/test/%.sh cp $< $@ @@ -103,7 +96,7 @@ check-math-rtest: $(math-host-tools) $(math-tools) cat $(math-rtests) | build/bin/rtest | $(EMULATOR) build/bin/mathtest $(math-testflags) check-math-ulp: $(math-tools) - ULPFLAGS="$(math-ulpflags)" WANT_SIMD_EXCEPT="$(WANT_SIMD_EXCEPT)" build/bin/runulp.sh $(EMULATOR) + ULPFLAGS="$(math-ulpflags)" build/bin/runulp.sh $(EMULATOR) check-math: check-math-test check-math-rtest check-math-ulp diff --git a/math/README.contributors b/math/README.contributors deleted file mode 100644 index 33e7ba3..0000000 --- a/math/README.contributors +++ /dev/null @@ -1,78 +0,0 @@ -STYLE REQUIREMENTS -================== - -1. Most code in this sub-directory is expected to be upstreamed into glibc so - the GNU Coding Standard and glibc specific conventions should be followed - to ease upstreaming. - -2. ABI and symbols: the code should be written so it is suitable for inclusion - into a libc with minimal changes. This e.g. means that internal symbols - should be hidden and in the implementation reserved namespace according to - ISO C and POSIX rules. If possible the built shared libraries and static - library archives should be usable to override libc symbols at link time (or - at runtime via LD_PRELOAD). This requires the symbols to follow the glibc ABI - (other than symbol versioning), this cannot be done reliably for static - linking so this is a best effort requirement. - -3. API: include headers should be suitable for benchmarking and testing code - and should not conflict with libc headers. - - -CONTRIBUTION GUIDELINES FOR math SUB-DIRECTORY -============================================== - -1. Math functions have quality and performance requirements. - -2. Quality: - - Worst-case ULP error should be small in the entire input domain (for most - common double precision scalar functions the target is < 0.66 ULP error, - and < 1 ULP for single precision, even performance optimized function - variant should not have > 5 ULP error if the goal is to be a drop in - replacement for a standard math function), this should be tested - statistically (or on all inputs if possible in reasonable amount of time). - The ulp tool is for this and runulp.sh should be updated for new functions. - - - All standard rounding modes need to be supported but in non-default rounding - modes the quality requirement can be relaxed. 
(Non-nearest rounded - computation can be slow and inaccurate but has to be correct for conformance - reasons.) - - - Special cases and error handling need to follow ISO C Annex F requirements, - POSIX requirements, IEEE 754-2008 requirements and Glibc requiremnts: - https://www.gnu.org/software/libc/manual/html_mono/libc.html#Errors-in-Math-Functions - this should be tested by direct tests (glibc test system may be used for it). - - - Error handling code should be decoupled from the approximation code as much - as possible. (There are helper functions, these take care of errno as well - as exception raising.) - - - Vector math code does not need to work in non-nearest rounding mode and error - handling side effects need not happen (fenv exceptions and errno), but the - result should be correct (within quality requirements, which are lower for - vector code than for scalar code). - - - Error bounds of the approximation should be clearly documented. - - - The code should build and pass tests on arm, aarch64 and x86_64 GNU linux - systems. (Routines and features can be disabled on specific targets, but - the build must complete). On aarch64, both little- and big-endian targets - are supported as well as valid combinations of architecture extensions. - The configurations that should be tested depend on the contribution. - -3. Performance: - - Common math code should be benchmarked on modern aarch64 microarchitectures - over typical inputs. - - - Performance improvements should be documented (relative numbers can be - published; it is enough to use the mathbench microbenchmark tool which should - be updated for new functions). - - - Attention should be paid to the compilation flags: for aarch64 fma - contraction should be on and math errno turned off so some builtins can be - inlined. - - - The code should be reasonably performant on x86_64 too, e.g. some rounding - instructions and fma may not be available on x86_64, such builtins turn into - libc calls with slow code. Such slowdown is not acceptable, a faster fallback - should be present: glibc and bionic use the same code on all targets. (This - does not apply to vector math code). diff --git a/math/aarch64/v_cos.c b/math/aarch64/v_cos.c deleted file mode 100644 index 9a73575..0000000 --- a/math/aarch64/v_cos.c +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Double-precision vector cos function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const struct data -{ - float64x2_t poly[7]; - float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3; -} data = { - /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. 
*/ - .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), - V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), - V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), - V2 (-0x1.9e9540300a1p-41) }, - .inv_pi = V2 (0x1.45f306dc9c883p-2), - .half_pi = V2 (0x1.921fb54442d18p+0), - .pi_1 = V2 (0x1.921fb54442d18p+1), - .pi_2 = V2 (0x1.1a62633145c06p-53), - .pi_3 = V2 (0x1.c1cd129024e09p-106), - .shift = V2 (0x1.8p52), - .range_val = V2 (0x1p23) -}; - -#define C(i) d->poly[i] - -static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) -{ - y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); - return v_call_f64 (cos, x, y, cmp); -} - -float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x) -{ - const struct data *d = ptr_barrier (&data); - float64x2_t n, r, r2, r3, r4, t1, t2, t3, y; - uint64x2_t odd, cmp; - -#if WANT_SIMD_EXCEPT - r = vabsq_f64 (x); - cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r), - vreinterpretq_u64_f64 (d->range_val)); - if (unlikely (v_any_u64 (cmp))) - /* If fenv exceptions are to be triggered correctly, set any special lanes - to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by - special-case handler later. */ - r = vbslq_f64 (cmp, v_f64 (1.0), r); -#else - cmp = vcageq_f64 (x, d->range_val); - r = x; -#endif - - /* n = rint((|x|+pi/2)/pi) - 0.5. */ - n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi)); - odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); - n = vsubq_f64 (n, d->shift); - n = vsubq_f64 (n, v_f64 (0.5)); - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ - r = vfmsq_f64 (r, d->pi_1, n); - r = vfmsq_f64 (r, d->pi_2, n); - r = vfmsq_f64 (r, d->pi_3, n); - - /* sin(r) poly approx. */ - r2 = vmulq_f64 (r, r); - r3 = vmulq_f64 (r2, r); - r4 = vmulq_f64 (r2, r2); - - t1 = vfmaq_f64 (C (4), C (5), r2); - t2 = vfmaq_f64 (C (2), C (3), r2); - t3 = vfmaq_f64 (C (0), C (1), r2); - - y = vfmaq_f64 (t1, C (6), r4); - y = vfmaq_f64 (t2, y, r4); - y = vfmaq_f64 (t3, y, r4); - y = vfmaq_f64 (r, y, r3); - - if (unlikely (v_any_u64 (cmp))) - return special_case (x, y, odd, cmp); - return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); -} diff --git a/math/aarch64/v_cosf.c b/math/aarch64/v_cosf.c deleted file mode 100644 index b9890b2..0000000 --- a/math/aarch64/v_cosf.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Single-precision vector cos function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const struct data -{ - float32x4_t poly[4]; - float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3; -} data = { - /* 1.886 ulp error. */ - .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), - V4 (0x1.5b2e76p-19f) }, - - .pi_1 = V4 (0x1.921fb6p+1f), - .pi_2 = V4 (-0x1.777a5cp-24f), - .pi_3 = V4 (-0x1.ee59dap-49f), - - .inv_pi = V4 (0x1.45f306p-2f), - .shift = V4 (0x1.8p+23f), - .half_pi = V4 (0x1.921fb6p0f), - .range_val = V4 (0x1p20f) -}; - -#define C(i) d->poly[i] - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) -{ - /* Fall back to scalar code. 
*/ - y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); - return v_call_f32 (cosf, x, y, cmp); -} - -float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x) -{ - const struct data *d = ptr_barrier (&data); - float32x4_t n, r, r2, r3, y; - uint32x4_t odd, cmp; - -#if WANT_SIMD_EXCEPT - r = vabsq_f32 (x); - cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r), - vreinterpretq_u32_f32 (d->range_val)); - if (unlikely (v_any_u32 (cmp))) - /* If fenv exceptions are to be triggered correctly, set any special lanes - to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by - special-case handler later. */ - r = vbslq_f32 (cmp, v_f32 (1.0f), r); -#else - cmp = vcageq_f32 (x, d->range_val); - r = x; -#endif - - /* n = rint((|x|+pi/2)/pi) - 0.5. */ - n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi)); - odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); - n = vsubq_f32 (n, d->shift); - n = vsubq_f32 (n, v_f32 (0.5f)); - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ - r = vfmsq_f32 (r, d->pi_1, n); - r = vfmsq_f32 (r, d->pi_2, n); - r = vfmsq_f32 (r, d->pi_3, n); - - /* y = sin(r). */ - r2 = vmulq_f32 (r, r); - r3 = vmulq_f32 (r2, r); - y = vfmaq_f32 (C (2), C (3), r2); - y = vfmaq_f32 (C (1), y, r2); - y = vfmaq_f32 (C (0), y, r2); - y = vfmaq_f32 (r, y, r3); - - if (unlikely (v_any_u32 (cmp))) - return special_case (x, y, odd, cmp); - return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); -} diff --git a/math/aarch64/v_exp.c b/math/aarch64/v_exp.c deleted file mode 100644 index bc5609f..0000000 --- a/math/aarch64/v_exp.c +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Double-precision vector e^x function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -#define N (1 << V_EXP_TABLE_BITS) -#define IndexMask (N - 1) - -const static volatile struct -{ - float64x2_t poly[3]; - float64x2_t inv_ln2, ln2_hi, ln2_lo, shift; -#if !WANT_SIMD_EXCEPT - float64x2_t special_bound, scale_thresh; -#endif -} data = { - /* maxerr: 1.88 +0.5 ulp - rel error: 1.4337*2^-53 - abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */ - .poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3), - V2 (0x1.55555da646206p-5) }, -#if !WANT_SIMD_EXCEPT - .scale_thresh = V2 (163840.0), /* 1280.0 * N. */ - .special_bound = V2 (704.0), -#endif - .inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2. */ - .ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N. */ - .ln2_lo = V2 (0x1.abc9e3b39803f3p-63), - .shift = V2 (0x1.8p+52) -}; - -#define C(i) data.poly[i] -#define Tab __v_exp_data - -#if WANT_SIMD_EXCEPT - -# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */ -# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */ -# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */ - -static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp) -{ - /* If fenv exceptions are to be triggered correctly, fall back to the scalar - routine to special lanes. */ - return v_call_f64 (exp, x, y, cmp); -} - -#else - -# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */ -/* SpecialBias1 + SpecialBias1 = asuint(1.0). */ -# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */ -# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. 
*/ - -static inline float64x2_t VPCS_ATTR -special_case (float64x2_t s, float64x2_t y, float64x2_t n) -{ - /* 2^(n/N) may overflow, break it up into s1*s2. */ - uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset); - float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b)); - float64x2_t s2 = vreinterpretq_f64_u64 ( - vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b)); - uint64x2_t cmp = vcagtq_f64 (n, data.scale_thresh); - float64x2_t r1 = vmulq_f64 (s1, s1); - float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1); - return vbslq_f64 (cmp, r1, r0); -} - -#endif - -float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x) -{ - float64x2_t n, r, r2, s, y, z; - uint64x2_t cmp, u, e; - -#if WANT_SIMD_EXCEPT - /* If any lanes are special, mask them with 1 and retain a copy of x to allow - special_case to fix special lanes later. This is only necessary if fenv - exceptions are to be triggered correctly. */ - float64x2_t xm = x; - uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x)); - cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound); - if (unlikely (v_any_u64 (cmp))) - x = vbslq_f64 (cmp, v_f64 (1), x); -#else - cmp = vcagtq_f64 (x, data.special_bound); -#endif - - /* n = round(x/(ln2/N)). */ - z = vfmaq_f64 (data.shift, x, data.inv_ln2); - u = vreinterpretq_u64_f64 (z); - n = vsubq_f64 (z, data.shift); - - /* r = x - n*ln2/N. */ - r = x; - r = vfmsq_f64 (r, data.ln2_hi, n); - r = vfmsq_f64 (r, data.ln2_lo, n); - - e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS); - - /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4. */ - r2 = vmulq_f64 (r, r); - y = vfmaq_f64 (C (0), C (1), r); - y = vfmaq_f64 (y, C (2), r2); - y = vfmaq_f64 (r, y, r2); - - /* s = 2^(n/N). */ - u = (uint64x2_t){ Tab[u[0] & IndexMask], Tab[u[1] & IndexMask] }; - s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); - - if (unlikely (v_any_u64 (cmp))) -#if WANT_SIMD_EXCEPT - return special_case (xm, vfmaq_f64 (s, y, s), cmp); -#else - return special_case (s, y, n); -#endif - - return vfmaq_f64 (s, y, s); -} diff --git a/math/aarch64/v_exp2f.c b/math/aarch64/v_exp2f.c deleted file mode 100644 index e402205..0000000 --- a/math/aarch64/v_exp2f.c +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Single-precision vector 2^x function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const struct data -{ - float32x4_t poly[5]; - uint32x4_t exponent_bias; -#if !WANT_SIMD_EXCEPT - float32x4_t special_bound, scale_thresh; -#endif -} data = { - /* maxerr: 1.962 ulp. */ - .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f), - V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) }, - .exponent_bias = V4 (0x3f800000), -#if !WANT_SIMD_EXCEPT - .special_bound = V4 (126.0f), - .scale_thresh = V4 (192.0f), -#endif -}; - -#define C(i) d->poly[i] - -#if WANT_SIMD_EXCEPT - -# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ -# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */ -# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */ - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) -{ - /* If fenv exceptions are to be triggered correctly, fall back to the scalar - routine for special lanes. 
*/ - return v_call_f32 (exp2f, x, y, cmp); -} - -#else - -# define SpecialOffset v_u32 (0x82000000) -# define SpecialBias v_u32 (0x7f000000) - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, - float32x4_t scale, const struct data *d) -{ - /* 2^n may overflow, break it up into s1*s2. */ - uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); - float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); - float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); - uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); - float32x4_t r2 = vmulq_f32 (s1, s1); - float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); - /* Similar to r1 but avoids double rounding in the subnormal range. */ - float32x4_t r0 = vfmaq_f32 (scale, poly, scale); - float32x4_t r = vbslq_f32 (cmp1, r1, r0); - return vbslq_f32 (cmp2, r2, r); -} - -#endif - -float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x) -{ - const struct data *d = ptr_barrier (&data); - float32x4_t n, r, r2, scale, p, q, poly; - uint32x4_t cmp, e; - -#if WANT_SIMD_EXCEPT - /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */ - uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); - cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); - float32x4_t xm = x; - /* If any lanes are special, mask them with 1 and retain a copy of x to allow - special_case to fix special lanes later. This is only necessary if fenv - exceptions are to be triggered correctly. */ - if (unlikely (v_any_u32 (cmp))) - x = vbslq_f32 (cmp, v_f32 (1), x); -#endif - - /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = n + r, with r in [-1/2, 1/2]. */ - n = vrndaq_f32 (x); - r = vsubq_f32 (x, n); - e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); - scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); - -#if !WANT_SIMD_EXCEPT - cmp = vcagtq_f32 (n, d->special_bound); -#endif - - r2 = vmulq_f32 (r, r); - p = vfmaq_f32 (C (1), C (0), r); - q = vfmaq_f32 (C (3), C (2), r); - q = vfmaq_f32 (q, p, r2); - p = vmulq_f32 (C (4), r); - poly = vfmaq_f32 (p, q, r2); - - if (unlikely (v_any_u32 (cmp))) -#if WANT_SIMD_EXCEPT - return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); -#else - return special_case (poly, n, e, cmp, scale, d); -#endif - - return vfmaq_f32 (scale, poly, scale); -} diff --git a/math/aarch64/v_exp_data.c b/math/aarch64/v_exp_data.c deleted file mode 100644 index 45f0848..0000000 --- a/math/aarch64/v_exp_data.c +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Lookup table for double-precision e^x vector function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" - -# define N (1 << V_EXP_TABLE_BITS) - -/* 2^(j/N), j=0..N. 
*/ -const uint64_t __v_exp_data[] = { -# if N == 128 - 0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061, - 0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de, - 0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f, - 0x3fefac922b7247f7, 0x3fefa3ec32d3d1a2, 0x3fef9b66affed31b, - 0x3fef9301d0125b51, 0x3fef8abdc06c31cc, 0x3fef829aaea92de0, - 0x3fef7a98c8a58e51, 0x3fef72b83c7d517b, 0x3fef6af9388c8dea, - 0x3fef635beb6fcb75, 0x3fef5be084045cd4, 0x3fef54873168b9aa, - 0x3fef4d5022fcd91d, 0x3fef463b88628cd6, 0x3fef3f49917ddc96, - 0x3fef387a6e756238, 0x3fef31ce4fb2a63f, 0x3fef2b4565e27cdd, - 0x3fef24dfe1f56381, 0x3fef1e9df51fdee1, 0x3fef187fd0dad990, - 0x3fef1285a6e4030b, 0x3fef0cafa93e2f56, 0x3fef06fe0a31b715, - 0x3fef0170fc4cd831, 0x3feefc08b26416ff, 0x3feef6c55f929ff1, - 0x3feef1a7373aa9cb, 0x3feeecae6d05d866, 0x3feee7db34e59ff7, - 0x3feee32dc313a8e5, 0x3feedea64c123422, 0x3feeda4504ac801c, - 0x3feed60a21f72e2a, 0x3feed1f5d950a897, 0x3feece086061892d, - 0x3feeca41ed1d0057, 0x3feec6a2b5c13cd0, 0x3feec32af0d7d3de, - 0x3feebfdad5362a27, 0x3feebcb299fddd0d, 0x3feeb9b2769d2ca7, - 0x3feeb6daa2cf6642, 0x3feeb42b569d4f82, 0x3feeb1a4ca5d920f, - 0x3feeaf4736b527da, 0x3feead12d497c7fd, 0x3feeab07dd485429, - 0x3feea9268a5946b7, 0x3feea76f15ad2148, 0x3feea5e1b976dc09, - 0x3feea47eb03a5585, 0x3feea34634ccc320, 0x3feea23882552225, - 0x3feea155d44ca973, 0x3feea09e667f3bcd, 0x3feea012750bdabf, - 0x3fee9fb23c651a2f, 0x3fee9f7df9519484, 0x3fee9f75e8ec5f74, - 0x3fee9f9a48a58174, 0x3fee9feb564267c9, 0x3feea0694fde5d3f, - 0x3feea11473eb0187, 0x3feea1ed0130c132, 0x3feea2f336cf4e62, - 0x3feea427543e1a12, 0x3feea589994cce13, 0x3feea71a4623c7ad, - 0x3feea8d99b4492ed, 0x3feeaac7d98a6699, 0x3feeace5422aa0db, - 0x3feeaf3216b5448c, 0x3feeb1ae99157736, 0x3feeb45b0b91ffc6, - 0x3feeb737b0cdc5e5, 0x3feeba44cbc8520f, 0x3feebd829fde4e50, - 0x3feec0f170ca07ba, 0x3feec49182a3f090, 0x3feec86319e32323, - 0x3feecc667b5de565, 0x3feed09bec4a2d33, 0x3feed503b23e255d, - 0x3feed99e1330b358, 0x3feede6b5579fdbf, 0x3feee36bbfd3f37a, - 0x3feee89f995ad3ad, 0x3feeee07298db666, 0x3feef3a2b84f15fb, - 0x3feef9728de5593a, 0x3feeff76f2fb5e47, 0x3fef05b030a1064a, - 0x3fef0c1e904bc1d2, 0x3fef12c25bd71e09, 0x3fef199bdd85529c, - 0x3fef20ab5fffd07a, 0x3fef27f12e57d14b, 0x3fef2f6d9406e7b5, - 0x3fef3720dcef9069, 0x3fef3f0b555dc3fa, 0x3fef472d4a07897c, - 0x3fef4f87080d89f2, 0x3fef5818dcfba487, 0x3fef60e316c98398, - 0x3fef69e603db3285, 0x3fef7321f301b460, 0x3fef7c97337b9b5f, - 0x3fef864614f5a129, 0x3fef902ee78b3ff6, 0x3fef9a51fbc74c83, - 0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27, - 0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14, - 0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1, -# elif N == 256 - 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335, - 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc, - 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574, - 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836, - 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383, - 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85, - 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2, - 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e, - 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc, - 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e, - 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b, - 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f, - 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4, - 0x3fef582f95281c6b, 
0x3fef54873168b9aa, 0x3fef50e75eb44027, - 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6, - 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1, - 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f, - 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29, - 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1, - 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f, - 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56, - 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd, - 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff, - 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b, - 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866, - 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4, - 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422, - 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024, - 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897, - 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232, - 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0, - 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7, - 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d, - 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee, - 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82, - 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2, - 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd, - 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03, - 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148, - 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4, - 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320, - 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6, - 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd, - 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645, - 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484, - 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a, - 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9, - 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6, - 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132, - 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491, - 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13, - 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21, - 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699, - 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778, - 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736, - 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2, - 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f, - 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2, - 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090, - 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e, - 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33, - 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052, - 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf, - 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774, - 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666, - 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1, - 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47, - 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f, - 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09, - 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c, - 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 
0x3fef27f12e57d14b, - 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db, - 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa, - 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968, - 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487, - 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075, - 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460, - 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17, - 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6, - 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740, - 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1, - 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a, - 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540, - 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89, - 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1, - 0x3feff9d96b2a23d9, -# endif -}; diff --git a/math/aarch64/v_expf.c b/math/aarch64/v_expf.c deleted file mode 100644 index 34e8b60..0000000 --- a/math/aarch64/v_expf.c +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Single-precision vector e^x function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const struct data -{ - float32x4_t poly[5]; - float32x4_t shift, inv_ln2, ln2_hi, ln2_lo; - uint32x4_t exponent_bias; -#if !WANT_SIMD_EXCEPT - float32x4_t special_bound, scale_thresh; -#endif -} data = { - /* maxerr: 1.45358 +0.5 ulp. */ - .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), - V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, - .shift = V4 (0x1.8p23f), - .inv_ln2 = V4 (0x1.715476p+0f), - .ln2_hi = V4 (0x1.62e4p-1f), - .ln2_lo = V4 (0x1.7f7d1cp-20f), - .exponent_bias = V4 (0x3f800000), -#if !WANT_SIMD_EXCEPT - .special_bound = V4 (126.0f), - .scale_thresh = V4 (192.0f), -#endif -}; - -#define C(i) d->poly[i] - -#if WANT_SIMD_EXCEPT - -# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ -# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */ -# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */ - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) -{ - /* If fenv exceptions are to be triggered correctly, fall back to the scalar - routine to special lanes. */ - return v_call_f32 (expf, x, y, cmp); -} - -#else - -# define SpecialOffset v_u32 (0x82000000) -# define SpecialBias v_u32 (0x7f000000) - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, - float32x4_t scale, const struct data *d) -{ - /* 2^n may overflow, break it up into s1*s2. */ - uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); - float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); - float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); - uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); - float32x4_t r2 = vmulq_f32 (s1, s1); - float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); - /* Similar to r1 but avoids double rounding in the subnormal range. */ - float32x4_t r0 = vfmaq_f32 (scale, poly, scale); - float32x4_t r = vbslq_f32 (cmp1, r1, r0); - return vbslq_f32 (cmp2, r2, r); -} - -#endif - -float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x) -{ - const struct data *d = ptr_barrier (&data); - float32x4_t n, r, r2, scale, p, q, poly, z; - uint32x4_t cmp, e; - -#if WANT_SIMD_EXCEPT - /* asuint(x) - TinyBound >= BigBound - TinyBound. 
*/ - cmp = vcgeq_u32 ( - vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)), - TinyBound), - SpecialBound); - float32x4_t xm = x; - /* If any lanes are special, mask them with 1 and retain a copy of x to allow - special case handler to fix special lanes later. This is only necessary if - fenv exceptions are to be triggered correctly. */ - if (unlikely (v_any_u32 (cmp))) - x = vbslq_f32 (cmp, v_f32 (1), x); -#endif - - /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ - z = vfmaq_f32 (d->shift, x, d->inv_ln2); - n = vsubq_f32 (z, d->shift); - r = vfmsq_f32 (x, n, d->ln2_hi); - r = vfmsq_f32 (r, n, d->ln2_lo); - e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); - scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); - -#if !WANT_SIMD_EXCEPT - cmp = vcagtq_f32 (n, d->special_bound); -#endif - - r2 = vmulq_f32 (r, r); - p = vfmaq_f32 (C (1), C (0), r); - q = vfmaq_f32 (C (3), C (2), r); - q = vfmaq_f32 (q, p, r2); - p = vmulq_f32 (C (4), r); - poly = vfmaq_f32 (p, q, r2); - - if (unlikely (v_any_u32 (cmp))) -#if WANT_SIMD_EXCEPT - return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); -#else - return special_case (poly, n, e, cmp, scale, d); -#endif - - return vfmaq_f32 (scale, poly, scale); -} diff --git a/math/aarch64/v_log.c b/math/aarch64/v_log.c deleted file mode 100644 index 1d1c1fa..0000000 --- a/math/aarch64/v_log.c +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Double-precision vector log(x) function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const struct data -{ - uint64x2_t min_norm; - uint32x4_t special_bound; - float64x2_t poly[5]; - float64x2_t ln2; - uint64x2_t sign_exp_mask; -} data = { - /* Worst-case error: 1.17 + 0.5 ulp. - Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ - .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2), - V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3), - V2 (-0x1.554e550bd501ep-3) }, - .ln2 = V2 (0x1.62e42fefa39efp-1), - .min_norm = V2 (0x0010000000000000), - .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ - .sign_exp_mask = V2 (0xfff0000000000000) -}; - -#define A(i) d->poly[i] -#define N (1 << V_LOG_TABLE_BITS) -#define IndexMask (N - 1) -#define Off v_u64 (0x3fe6900900000000) - -struct entry -{ - float64x2_t invc; - float64x2_t logc; -}; - -static inline struct entry -lookup (uint64x2_t i) -{ - /* Since N is a power of 2, n % N = n & (N - 1). 
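The identity quoted in this comment is why IndexMask works: for a power-of-two table size, reduction modulo N is just a bit mask, and the index itself is taken from the mantissa bits directly below the exponent field. A small stand-alone illustration, where TABLE_BITS plays the role of V_LOG_TABLE_BITS and the assert merely demonstrates the identity:

#include <assert.h>
#include <stdint.h>

#define TABLE_BITS 7
#define TABLE_SIZE (1 << TABLE_BITS)

static inline uint64_t
log_table_index (uint64_t ix)
{
  /* The top TABLE_BITS mantissa bits pick one of TABLE_SIZE subintervals.  */
  return (ix >> (52 - TABLE_BITS)) & (TABLE_SIZE - 1);
}

static void
check_mask_identity (void)
{
  for (uint64_t n = 0; n < 4 * TABLE_SIZE; n++)
    assert (n % TABLE_SIZE == (n & (TABLE_SIZE - 1)));
}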
*/ - struct entry e; - uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; - uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; - float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc); - float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc); - e.invc = vuzp1q_f64 (e0, e1); - e.logc = vuzp2q_f64 (e0, e1); - return e; -} - -static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, - uint32x2_t cmp) -{ - return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp)); -} - -float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x) -{ - const struct data *d = ptr_barrier (&data); - float64x2_t z, r, r2, p, y, kd, hi; - uint64x2_t ix, iz, tmp; - uint32x2_t cmp; - int64x2_t k; - struct entry e; - - ix = vreinterpretq_u64_f64 (x); - cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm), - vget_low_u32 (d->special_bound)); - - /* x = 2^k z; where z is in range [Off,2*Off) and exact. - The range is split into N subintervals. - The ith subinterval contains z and c is near its center. */ - tmp = vsubq_u64 (ix, Off); - k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */ - iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); - z = vreinterpretq_f64_u64 (iz); - e = lookup (tmp); - - /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ - r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); - kd = vcvtq_f64_s64 (k); - - /* hi = r + log(c) + k*Ln2. */ - hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2); - /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ - r2 = vmulq_f64 (r, r); - y = vfmaq_f64 (A (2), A (3), r); - p = vfmaq_f64 (A (0), A (1), r); - y = vfmaq_f64 (y, A (4), r2); - y = vfmaq_f64 (p, y, r2); - - if (unlikely (v_any_u32h (cmp))) - return special_case (x, y, hi, r2, cmp); - return vfmaq_f64 (hi, y, r2); -} diff --git a/math/aarch64/v_log_data.c b/math/aarch64/v_log_data.c deleted file mode 100644 index 82351bb..0000000 --- a/math/aarch64/v_log_data.c +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Lookup table for double-precision log(x) vector function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" - -#define N (1 << V_LOG_TABLE_BITS) - -const struct v_log_data __v_log_data = { - /* Algorithm: - - x = 2^k z - log(x) = k ln2 + log(c) + poly(z/c - 1) - - where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1, - N=128) and log(c) and 1/c for the ith subinterval comes from lookup tables: - - table[i].invc = 1/c - table[i].logc = (double)log(c) - - where c is near the center of the subinterval and is chosen by trying several - floating point invc candidates around 1/center and selecting one for which - the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval - that contains 1 and the previous one got tweaked to avoid cancellation. 
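The decomposition described above can be checked against plain libm calls. The sketch below recomputes invc and logc from a chosen pivot c instead of reading the generated table, and uses log1p in place of the routine's polynomial; it is a reference for the mathematics only, not how the vector code evaluates it:

#include <math.h>

/* log(x) for x = 2^k * z, with c near the centre of z's subinterval.  */
static double
log_via_decomposition (double z, int k, double c)
{
  double invc = 1.0 / c;            /* plays the role of table[i].invc */
  double logc = log (c);            /* plays the role of table[i].logc */
  double r = z * invc - 1.0;        /* small residual, z/c - 1 */
  return k * 0x1.62e42fefa39efp-1   /* k * ln2 */
         + logc + log1p (r);
}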
*/ - .table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 }, - { 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 }, - { 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 }, - { 0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2 }, - { 0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2 }, - { 0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2 }, - { 0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2 }, - { 0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2 }, - { 0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2 }, - { 0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2 }, - { 0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2 }, - { 0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2 }, - { 0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2 }, - { 0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2 }, - { 0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2 }, - { 0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2 }, - { 0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2 }, - { 0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2 }, - { 0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2 }, - { 0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3 }, - { 0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3 }, - { 0x1.446f12b278001p+0, -0x1.e52e160484698p-3 }, - { 0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3 }, - { 0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3 }, - { 0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3 }, - { 0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3 }, - { 0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3 }, - { 0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3 }, - { 0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3 }, - { 0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3 }, - { 0x1.36987540fbf53p+0, -0x1.8be843d796044p-3 }, - { 0x1.352166b648f61p+0, -0x1.82395ecc477edp-3 }, - { 0x1.33adddb3eb575p+0, -0x1.7896240966422p-3 }, - { 0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3 }, - { 0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3 }, - { 0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3 }, - { 0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3 }, - { 0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3 }, - { 0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3 }, - { 0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3 }, - { 0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3 }, - { 0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3 }, - { 0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3 }, - { 0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3 }, - { 0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3 }, - { 0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4 }, - { 0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4 }, - { 0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4 }, - { 0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4 }, - { 0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4 }, - { 0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4 }, - { 0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4 }, - { 0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4 }, - { 0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4 }, - { 0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4 }, - { 0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4 }, - { 0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4 }, - { 0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4 }, - { 0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4 }, - { 0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4 }, - { 0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5 }, - { 0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5 }, - { 0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5 }, - { 0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5 }, - { 0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5 }, - { 0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5 }, - { 0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5 }, - { 0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5 }, - { 0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6 }, - { 
0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6 }, - { 0x1.05193497a7cc5p+0, -0x1.43183683400acp-6 }, - { 0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6 }, - { 0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7 }, - { 0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7 }, - { 0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9 }, - { 1.0, 0.0 }, - { 0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8 }, - { 0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7 }, - { 0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6 }, - { 0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6 }, - { 0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5 }, - { 0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5 }, - { 0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5 }, - { 0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5 }, - { 0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4 }, - { 0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4 }, - { 0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4 }, - { 0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4 }, - { 0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4 }, - { 0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4 }, - { 0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4 }, - { 0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4 }, - { 0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4 }, - { 0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3 }, - { 0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3 }, - { 0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3 }, - { 0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3 }, - { 0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3 }, - { 0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3 }, - { 0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3 }, - { 0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3 }, - { 0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3 }, - { 0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3 }, - { 0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3 }, - { 0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3 }, - { 0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3 }, - { 0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3 }, - { 0x1.9998e1480b618p-1, 0x1.c903161240163p-3 }, - { 0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3 }, - { 0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3 }, - { 0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3 }, - { 0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3 }, - { 0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2 }, - { 0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2 }, - { 0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2 }, - { 0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2 }, - { 0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2 }, - { 0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2 }, - { 0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2 }, - { 0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2 }, - { 0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2 }, - { 0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2 }, - { 0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2 }, - { 0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2 }, - { 0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2 }, - { 0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2 }, - { 0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2 }, - { 0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2 } } -}; diff --git a/math/aarch64/v_logf.c b/math/aarch64/v_logf.c deleted file mode 100644 index 66ebbbc..0000000 --- a/math/aarch64/v_logf.c +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Single-precision vector log function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const struct data -{ - uint32x4_t min_norm; - uint16x8_t special_bound; - float32x4_t poly[7]; - float32x4_t ln2, tiny_bound; - uint32x4_t off, mantissa_mask; -} data = { - /* 3.34 ulp error. 
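Error figures like the 3.34 ulp quoted here compare the float result against a higher-precision reference, measured in units of the spacing between adjacent floats at the reference value. A rough way to compute that figure for a single input is sketched below; the project's test harness is considerably more careful around exponent boundaries and rounding modes, so treat this only as an illustration of the unit:

#include <math.h>

static double
error_in_ulp (float got, double want)
{
  float w = (float) want;
  /* Spacing of representable floats just above the reference value.  */
  double one_ulp = (double) (nextafterf (w, INFINITY) - w);
  return ((double) got - want) / one_ulp;
}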
*/ - .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f), - V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f), - V4 (-0x1.ffffc8p-2f) }, - .ln2 = V4 (0x1.62e43p-1f), - .tiny_bound = V4 (0x1p-126), - .min_norm = V4 (0x00800000), - .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ - .off = V4 (0x3f2aaaab), /* 0.666667. */ - .mantissa_mask = V4 (0x007fffff) -}; - -#define P(i) d->poly[7 - i] - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p, - uint16x4_t cmp) -{ - /* Fall back to scalar code. */ - return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp)); -} - -float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x) -{ - const struct data *d = ptr_barrier (&data); - float32x4_t n, p, q, r, r2, y; - uint32x4_t u; - uint16x4_t cmp; - - u = vreinterpretq_u32_f32 (x); - cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm), - vget_low_u16 (d->special_bound)); - - /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u = vsubq_u32 (u, d->off); - n = vcvtq_f32_s32 ( - vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ - u = vandq_u32 (u, d->mantissa_mask); - u = vaddq_u32 (u, d->off); - r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); - - /* y = log(1+r) + n*ln2. */ - r2 = vmulq_f32 (r, r); - /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ - p = vfmaq_f32 (P (5), P (6), r); - q = vfmaq_f32 (P (3), P (4), r); - y = vfmaq_f32 (P (1), P (2), r); - p = vfmaq_f32 (p, P (7), r2); - q = vfmaq_f32 (q, p, r2); - y = vfmaq_f32 (y, q, r2); - p = vfmaq_f32 (r, d->ln2, n); - - if (unlikely (v_any_u16h (cmp))) - return special_case (x, y, r2, p, cmp); - return vfmaq_f32 (p, y, r2); -} diff --git a/math/aarch64/v_math.h b/math/aarch64/v_math.h deleted file mode 100644 index 1dc9916..0000000 --- a/math/aarch64/v_math.h +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Vector math abstractions. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef _V_MATH_H -#define _V_MATH_H - -#if !__aarch64__ -# error "Cannot build without AArch64" -#endif - -#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) - -#define V_NAME_F1(fun) _ZGVnN4v_##fun##f -#define V_NAME_D1(fun) _ZGVnN2v_##fun -#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f -#define V_NAME_D2(fun) _ZGVnN2vv_##fun - -#include -#include "../math_config.h" -#include - -/* Shorthand helpers for declaring constants. */ -# define V2(X) { X, X } -# define V4(X) { X, X, X, X } -# define V8(X) { X, X, X, X, X, X, X, X } - -static inline int -v_any_u16h (uint16x4_t x) -{ - return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0; -} - -static inline int -v_lanes32 (void) -{ - return 4; -} - -static inline float32x4_t -v_f32 (float x) -{ - return (float32x4_t) V4 (x); -} -static inline uint32x4_t -v_u32 (uint32_t x) -{ - return (uint32x4_t) V4 (x); -} -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u32 (uint32x4_t x) -{ - /* assume elements in x are either 0 or -1u. 
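Because comparison results are all-zeros or all-ones in each lane, "is any lane set" reduces to checking whether the OR (or sum) of the lanes is non-zero, which is what the pairwise add below does in a single instruction. A portable stand-in using a plain array instead of a NEON vector:

#include <stdint.h>

static inline int
any_lane_set (const uint32_t lanes[4])
{
  /* Each lane is 0 or 0xffffffff, so OR-ing them loses no information.  */
  return (lanes[0] | lanes[1] | lanes[2] | lanes[3]) != 0;
}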
*/ - return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; -} -static inline int -v_any_u32h (uint32x2_t x) -{ - return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0; -} -static inline float32x4_t -v_lookup_f32 (const float *tab, uint32x4_t idx) -{ - return (float32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; -} -static inline uint32x4_t -v_lookup_u32 (const uint32_t *tab, uint32x4_t idx) -{ - return (uint32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; -} -static inline float32x4_t -v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p) -{ - return (float32x4_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], - p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; -} -static inline float32x4_t -v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2, - float32x4_t y, uint32x4_t p) -{ - return (float32x4_t){p[0] ? f (x1[0], x2[0]) : y[0], - p[1] ? f (x1[1], x2[1]) : y[1], - p[2] ? f (x1[2], x2[2]) : y[2], - p[3] ? f (x1[3], x2[3]) : y[3]}; -} - -static inline int -v_lanes64 (void) -{ - return 2; -} -static inline float64x2_t -v_f64 (double x) -{ - return (float64x2_t) V2 (x); -} -static inline uint64x2_t -v_u64 (uint64_t x) -{ - return (uint64x2_t) V2 (x); -} -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u64 (uint64x2_t x) -{ - /* assume elements in x are either 0 or -1u. */ - return vpaddd_u64 (x) != 0; -} -static inline float64x2_t -v_lookup_f64 (const double *tab, uint64x2_t idx) -{ - return (float64x2_t){tab[idx[0]], tab[idx[1]]}; -} -static inline uint64x2_t -v_lookup_u64 (const uint64_t *tab, uint64x2_t idx) -{ - return (uint64x2_t){tab[idx[0]], tab[idx[1]]}; -} -static inline float64x2_t -v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p) -{ - double p1 = p[1]; - double x1 = x[1]; - if (likely (p[0])) - y[0] = f (x[0]); - if (likely (p1)) - y[1] = f (x1); - return y; -} - -#endif diff --git a/math/aarch64/v_powf.c b/math/aarch64/v_powf.c deleted file mode 100644 index 3a4163a..0000000 --- a/math/aarch64/v_powf.c +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Single-precision vector powf function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" - -#define Min v_u32 (0x00800000) -#define Max v_u32 (0x7f800000) -#define Thresh v_u32 (0x7f000000) /* Max - Min. */ -#define MantissaMask v_u32 (0x007fffff) - -#define A data.log2_poly -#define C data.exp2f_poly - -/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). 
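The error budget in this comment reflects how the routine is structured: log2(x) and the scaling by y are computed in double precision per lane and then fed to a small exp2 evaluation, so the relative errors of the two stages combine (scaled by the exponent range) on top of the final 0.5 ulp rounding. Stripped of the tables and polynomials, the underlying identity is just the one below; special cases (x <= 0, infinities, NaNs) are routed to the scalar powf instead:

#include <math.h>

/* Reference shape of the computation for finite x > 0.  */
static float
powf_reference (float x, float y)
{
  double l = log2 ((double) x);           /* per-lane double-precision log2 */
  return (float) exp2 ((double) y * l);   /* 2^(y*log2(x)) = x^y */
}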
*/ -#define Off v_u32 (0x3f35d000) - -#define V_POWF_LOG2_TABLE_BITS 5 -#define V_EXP2F_TABLE_BITS 5 -#define Log2IdxMask v_u32 ((1 << V_POWF_LOG2_TABLE_BITS) - 1) -#define Scale ((double) (1 << V_EXP2F_TABLE_BITS)) - -static const struct -{ - struct - { - double invc, logc; - } log2_tab[1 << V_POWF_LOG2_TABLE_BITS]; - double log2_poly[4]; - uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS]; - double exp2f_poly[3]; -} data = { - .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale}, - {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale}, - {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale}, - {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale}, - {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale}, - {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale}, - {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale}, - {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale}, - {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale}, - {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale}, - {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale}, - {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale}, - {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale}, - {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale}, - {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale}, - {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale}, - {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale}, - {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale}, - {0x1p+0, 0x0p+0 * Scale}, - {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale}, - {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale}, - {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale}, - {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale}, - {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale}, - {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale}, - {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale}, - {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale}, - {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale}, - {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale}, - {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale}, - {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale}, - {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},}, - .log2_poly = { /* rel err: 1.5 * 2^-30. */ - -0x1.6ff5daa3b3d7cp-2 * Scale, 0x1.ec81d03c01aebp-2 * Scale, - -0x1.71547bb43f101p-1 * Scale, 0x1.7154764a815cbp0 * Scale,}, - .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, - 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa, - 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715, - 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, - 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, - 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, - 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db, - 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, - 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, - 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, - 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,}, - .exp2f_poly = { /* rel err: 1.69 * 2^-34. 
*/ - 0x1.c6af84b912394p-5 / Scale / Scale / Scale, - 0x1.ebfce50fac4f3p-3 / Scale / Scale, - 0x1.62e42ff0c52d6p-1 / Scale}}; - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp) -{ - return v_call2_f32 (powf, x, y, ret, cmp); -} - -float32x4_t VPCS_ATTR V_NAME_F2 (pow) (float32x4_t x, float32x4_t y) -{ - uint32x4_t u = vreinterpretq_u32_f32 (x); - uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh); - uint32x4_t tmp = vsubq_u32 (u, Off); - uint32x4_t i = vandq_u32 (vshrq_n_u32 (tmp, (23 - V_POWF_LOG2_TABLE_BITS)), - Log2IdxMask); - uint32x4_t top = vbicq_u32 (tmp, MantissaMask); - uint32x4_t iz = vsubq_u32 (u, top); - int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top), - 23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */ - - float32x4_t ret; - for (int lane = 0; lane < 4; lane++) - { - /* Use double precision for each lane. */ - double invc = data.log2_tab[i[lane]].invc; - double logc = data.log2_tab[i[lane]].logc; - double z = (double) asfloat (iz[lane]); - - /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */ - double r = __builtin_fma (z, invc, -1.0); - double y0 = logc + (double) k[lane]; - - /* Polynomial to approximate log1p(r)/ln2. */ - double logx = A[0]; - logx = r * logx + A[1]; - logx = r * logx + A[2]; - logx = r * logx + A[3]; - logx = r * logx + y0; - double ylogx = y[lane] * logx; - cmp[lane] = (asuint64 (ylogx) >> 47 & 0xffff) - >= asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS)) >> 47 - ? 1 - : cmp[lane]; - - /* N*x = k + r with r in [-1/2, 1/2]. */ - double kd = round (ylogx); - uint64_t ki = lround (ylogx); - r = ylogx - kd; - - /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */ - uint64_t t = data.exp2f_tab[ki % (1 << V_EXP2F_TABLE_BITS)]; - t += ki << (52 - V_EXP2F_TABLE_BITS); - double s = asdouble (t); - double p = C[0]; - p = __builtin_fma (p, r, C[1]); - p = __builtin_fma (p, r, C[2]); - p = __builtin_fma (p, s * r, s); - - ret[lane] = p; - } - if (unlikely (v_any_u32 (cmp))) - return special_case (x, y, ret, cmp); - return ret; -} diff --git a/math/aarch64/v_sin.c b/math/aarch64/v_sin.c deleted file mode 100644 index 04129c3..0000000 --- a/math/aarch64/v_sin.c +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Double-precision vector sin function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const struct data -{ - float64x2_t poly[7]; - float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; -} data = { - .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), - V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), - V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), - V2 (-0x1.9e9540300a1p-41) }, - - .range_val = V2 (0x1p23), - .inv_pi = V2 (0x1.45f306dc9c883p-2), - .pi_1 = V2 (0x1.921fb54442d18p+1), - .pi_2 = V2 (0x1.1a62633145c06p-53), - .pi_3 = V2 (0x1.c1cd129024e09p-106), - .shift = V2 (0x1.8p52), -}; - -#if WANT_SIMD_EXCEPT -# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */ -# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */ -#endif - -#define C(i) d->poly[i] - -static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) -{ - y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); - return v_call_f64 (sin, x, y, cmp); -} - -/* Vector (AdvSIMD) sin approximation. 
- Maximum observed error in [-pi/2, pi/2], where argument is not reduced, - is 2.87 ULP: - _ZGVnN2v_sin (0x1.921d5c6a07142p+0) got 0x1.fffffffa7dc02p-1 - want 0x1.fffffffa7dc05p-1 - Maximum observed error in the entire non-special domain ([-2^23, 2^23]) - is 3.22 ULP: - _ZGVnN2v_sin (0x1.5702447b6f17bp+22) got 0x1.ffdcd125c84fbp-3 - want 0x1.ffdcd125c84f8p-3. */ -float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x) -{ - const struct data *d = ptr_barrier (&data); - float64x2_t n, r, r2, r3, r4, y, t1, t2, t3; - uint64x2_t odd, cmp; - -#if WANT_SIMD_EXCEPT - /* Detect |x| <= TinyBound or |x| >= RangeVal. If fenv exceptions are to be - triggered correctly, set any special lanes to 1 (which is neutral w.r.t. - fenv). These lanes will be fixed by special-case handler later. */ - uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x)); - cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh); - r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x); -#else - r = x; - cmp = vcageq_f64 (x, d->range_val); -#endif - - /* n = rint(|x|/pi). */ - n = vfmaq_f64 (d->shift, d->inv_pi, r); - odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); - n = vsubq_f64 (n, d->shift); - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ - r = vfmsq_f64 (r, d->pi_1, n); - r = vfmsq_f64 (r, d->pi_2, n); - r = vfmsq_f64 (r, d->pi_3, n); - - /* sin(r) poly approx. */ - r2 = vmulq_f64 (r, r); - r3 = vmulq_f64 (r2, r); - r4 = vmulq_f64 (r2, r2); - - t1 = vfmaq_f64 (C (4), C (5), r2); - t2 = vfmaq_f64 (C (2), C (3), r2); - t3 = vfmaq_f64 (C (0), C (1), r2); - - y = vfmaq_f64 (t1, C (6), r4); - y = vfmaq_f64 (t2, y, r4); - y = vfmaq_f64 (t3, y, r4); - y = vfmaq_f64 (r, y, r3); - - if (unlikely (v_any_u64 (cmp))) - return special_case (x, y, odd, cmp); - return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); -} diff --git a/math/aarch64/v_sinf.c b/math/aarch64/v_sinf.c deleted file mode 100644 index 3368798..0000000 --- a/math/aarch64/v_sinf.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Single-precision vector sin function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const struct data -{ - float32x4_t poly[4]; - float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; -} data = { - /* 1.886 ulp error. */ - .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), - V4 (0x1.5b2e76p-19f) }, - - .pi_1 = V4 (0x1.921fb6p+1f), - .pi_2 = V4 (-0x1.777a5cp-24f), - .pi_3 = V4 (-0x1.ee59dap-49f), - - .inv_pi = V4 (0x1.45f306p-2f), - .shift = V4 (0x1.8p+23f), - .range_val = V4 (0x1p20f) -}; - -#if WANT_SIMD_EXCEPT -# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */ -# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */ -#endif - -#define C(i) d->poly[i] - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) -{ - /* Fall back to scalar code. */ - y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); - return v_call_f32 (sinf, x, y, cmp); -} - -float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x) -{ - const struct data *d = ptr_barrier (&data); - float32x4_t n, r, r2, y; - uint32x4_t odd, cmp; - -#if WANT_SIMD_EXCEPT - uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x)); - cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh); - /* If fenv exceptions are to be triggered correctly, set any special lanes - to 1 (which is neutral w.r.t. fenv). 
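Both the double-precision routine above and the single-precision one below reduce the argument the same way: n = rint(x/pi) determines the quadrant sign, and pi is split into three parts so the subtractions stay accurate even for large |x|. A scalar sketch of that step, using the constants from the double-precision table above; the vector code uses fused multiply-subtracts, so this unfused version is slightly less accurate:

#include <math.h>

static double
reduce_mod_pi (double x, double *n_out)
{
  const double pi_1 = 0x1.921fb54442d18p+1;
  const double pi_2 = 0x1.1a62633145c06p-53;
  const double pi_3 = 0x1.c1cd129024e09p-106;
  double n = rint (x * 0x1.45f306dc9c883p-2);   /* rint(x/pi) */
  double r = x;
  r = r - n * pi_1;   /* high part of n*pi */
  r = r - n * pi_2;   /* correction terms */
  r = r - n * pi_3;
  *n_out = n;         /* odd n means the sign of sin flips */
  return r;           /* r lands roughly in [-pi/2, pi/2] */
}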
These lanes will be fixed by - special-case handler later. */ - r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x); -#else - r = x; - cmp = vcageq_f32 (x, d->range_val); -#endif - - /* n = rint(|x|/pi) */ - n = vfmaq_f32 (d->shift, d->inv_pi, r); - odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); - n = vsubq_f32 (n, d->shift); - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ - r = vfmsq_f32 (r, d->pi_1, n); - r = vfmsq_f32 (r, d->pi_2, n); - r = vfmsq_f32 (r, d->pi_3, n); - - /* y = sin(r) */ - r2 = vmulq_f32 (r, r); - y = vfmaq_f32 (C (2), C (3), r2); - y = vfmaq_f32 (C (1), y, r2); - y = vfmaq_f32 (C (0), y, r2); - y = vfmaq_f32 (r, vmulq_f32 (y, r2), r); - - if (unlikely (v_any_u32 (cmp))) - return special_case (x, y, odd, cmp); - return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); -} diff --git a/math/cosf.c b/math/cosf.c index 6293ce8..f29f194 100644 --- a/math/cosf.c +++ b/math/cosf.c @@ -1,8 +1,8 @@ /* * Single-precision cos function. * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2019, Arm Limited. + * SPDX-License-Identifier: MIT */ #include @@ -22,7 +22,7 @@ cosf (float y) int n; const sincos_t *p = &__sincosf_table[0]; - if (abstop12 (y) < abstop12 (pio4f)) + if (abstop12 (y) < abstop12 (pio4)) { double x2 = x * x; diff --git a/math/erf.c b/math/erf.c index 5f9f40d..12d7e51 100644 --- a/math/erf.c +++ b/math/erf.c @@ -2,7 +2,7 @@ * Double-precision erf(x) function. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/erf_data.c b/math/erf_data.c index 10cf1fa..807875b 100644 --- a/math/erf_data.c +++ b/math/erf_data.c @@ -2,7 +2,7 @@ * Shared data between erf and erfc. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/erff.c b/math/erff.c index 9fa476d..a58e825 100644 --- a/math/erff.c +++ b/math/erff.c @@ -2,7 +2,7 @@ * Single-precision erf(x) function. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/erff_data.c b/math/erff_data.c index f822788..fa6b1ef 100644 --- a/math/erff_data.c +++ b/math/erff_data.c @@ -2,7 +2,7 @@ * Data for approximation of erff. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/exp.c b/math/exp.c index 1de500c..7f5024c 100644 --- a/math/exp.c +++ b/math/exp.c @@ -2,7 +2,7 @@ * Double-precision e^x function. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/exp10.c b/math/exp10.c deleted file mode 100644 index 0fbec4c..0000000 --- a/math/exp10.c +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Double-precision 10^x function. - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -#define N (1 << EXP_TABLE_BITS) -#define IndexMask (N - 1) -#define OFlowBound 0x1.34413509f79ffp8 /* log10(DBL_MAX). */ -#define UFlowBound -0x1.5ep+8 /* -350. */ -#define SmallTop 0x3c6 /* top12(0x1p-57). 
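top12 here means the top 12 bits of the double's bit pattern, i.e. the sign and biased exponent; exp10 classifies the magnitude of x entirely from that value, as the abstop computation further down shows (it masks the sign off with 0x7ff). A one-line helper equivalent to that classification step:

#include <stdint.h>
#include <string.h>

static inline uint32_t
top12 (double x)
{
  uint64_t u;
  memcpy (&u, &x, sizeof u);
  return (uint32_t) (u >> 52);   /* sign bit + 11 exponent bits */
}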
*/ -#define BigTop 0x407 /* top12(0x1p8). */ -#define Thresh 0x41 /* BigTop - SmallTop. */ -#define Shift __exp_data.shift -#define C(i) __exp_data.exp10_poly[i] - -static double -special_case (uint64_t sbits, double_t tmp, uint64_t ki) -{ - double_t scale, y; - - if (ki - (1ull << 16) < 0x80000000) - { - /* The exponent of scale might have overflowed by 1. */ - sbits -= 1ull << 52; - scale = asdouble (sbits); - y = 2 * (scale + scale * tmp); - return check_oflow (eval_as_double (y)); - } - - /* n < 0, need special care in the subnormal range. */ - sbits += 1022ull << 52; - scale = asdouble (sbits); - y = scale + scale * tmp; - - if (y < 1.0) - { - /* Round y to the right precision before scaling it into the subnormal - range to avoid double rounding that can cause 0.5+E/2 ulp error where - E is the worst-case ulp error outside the subnormal range. So this - is only useful if the goal is better than 1 ulp worst-case error. */ - double_t lo = scale - y + scale * tmp; - double_t hi = 1.0 + y; - lo = 1.0 - hi + y + lo; - y = eval_as_double (hi + lo) - 1.0; - /* Avoid -0.0 with downward rounding. */ - if (WANT_ROUNDING && y == 0.0) - y = 0.0; - /* The underflow exception needs to be signaled explicitly. */ - force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); - } - y = 0x1p-1022 * y; - - return check_uflow (y); -} - -/* Double-precision 10^x approximation. Largest observed error is ~0.513 ULP. */ -double -exp10 (double x) -{ - uint64_t ix = asuint64 (x); - uint32_t abstop = (ix >> 52) & 0x7ff; - - if (unlikely (abstop - SmallTop >= Thresh)) - { - if (abstop - SmallTop >= 0x80000000) - /* Avoid spurious underflow for tiny x. - Note: 0 is common input. */ - return x + 1; - if (abstop == 0x7ff) - return ix == asuint64 (-INFINITY) ? 0.0 : x + 1.0; - if (x >= OFlowBound) - return __math_oflow (0); - if (x < UFlowBound) - return __math_uflow (0); - - /* Large x is special-cased below. */ - abstop = 0; - } - - /* Reduce x: z = x * N / log10(2), k = round(z). */ - double_t z = __exp_data.invlog10_2N * x; - double_t kd; - int64_t ki; -#if TOINT_INTRINSICS - kd = roundtoint (z); - ki = converttoint (z); -#else - kd = eval_as_double (z + Shift); - kd -= Shift; - ki = kd; -#endif - - /* r = x - k * log10(2), r in [-0.5, 0.5]. */ - double_t r = x; - r = __exp_data.neglog10_2hiN * kd + r; - r = __exp_data.neglog10_2loN * kd + r; - - /* exp10(x) = 2^(k/N) * 2^(r/N). - Approximate the two components separately. */ - - /* s = 2^(k/N), using lookup table. */ - uint64_t e = ki << (52 - EXP_TABLE_BITS); - uint64_t i = (ki & IndexMask) * 2; - uint64_t u = __exp_data.tab[i + 1]; - uint64_t sbits = u + e; - - double_t tail = asdouble (__exp_data.tab[i]); - - /* 2^(r/N) ~= 1 + r * Poly(r). */ - double_t r2 = r * r; - double_t p = C (0) + r * C (1); - double_t y = C (2) + r * C (3); - y = y + r2 * C (4); - y = p + r2 * y; - y = tail + y * r; - - if (unlikely (abstop == 0)) - return special_case (sbits, y, ki); - - /* Assemble components: - y = 2^(r/N) * 2^(k/N) - ~= (y + 1) * s. */ - double_t s = asdouble (sbits); - return eval_as_double (s * y + s); -} diff --git a/math/exp2.c b/math/exp2.c index a1eee44..35ab39f 100644 --- a/math/exp2.c +++ b/math/exp2.c @@ -2,7 +2,7 @@ * Double-precision 2^x function. * * Copyright (c) 2018-2019, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/exp2f.c b/math/exp2f.c index 776c3dd..94b3253 100644 --- a/math/exp2f.c +++ b/math/exp2f.c @@ -2,7 +2,7 @@ * Single-precision 2^x function. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/exp2f_data.c b/math/exp2f_data.c index f0cb7fc..3fb0ad1 100644 --- a/math/exp2f_data.c +++ b/math/exp2f_data.c @@ -2,7 +2,7 @@ * Shared data between expf, exp2f and powf. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/exp_data.c b/math/exp_data.c index 9df4e0b..cba7683 100644 --- a/math/exp_data.c +++ b/math/exp_data.c @@ -2,7 +2,7 @@ * Shared data between exp, exp2 and pow. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" @@ -12,7 +12,6 @@ const struct exp_data __exp_data = { // N/ln2 .invln2N = 0x1.71547652b82fep0 * N, -.invlog10_2N = 0x1.a934f0979a371p1 * N, // -ln2/N #if N == 64 .negln2hiN = -0x1.62e42fefa0000p-7, @@ -27,8 +26,6 @@ const struct exp_data __exp_data = { .negln2hiN = -0x1.62e42fef80000p-10, .negln2loN = -0x1.1cf79abc9e3b4p-45, #endif -.neglog10_2hiN = -0x1.3441350ap-2 / N, -.neglog10_2loN = 0x1.0c0219dc1da99p-39 / N, // Used for rounding when !TOINT_INTRINSICS #if EXP_USE_TOINT_NARROW .shift = 0x1800000000.8p0, @@ -150,24 +147,6 @@ const struct exp_data __exp_data = { 0x1.3b2ab786ee1dap-7, #endif }, -.exp10_poly = { -#if EXP10_POLY_WIDE -/* Range is wider if using shift-based reduction: coeffs generated - using Remez in [-log10(2)/128, log10(2)/128 ]. */ -0x1.26bb1bbb55515p1, -0x1.53524c73cd32bp1, -0x1.0470591e1a108p1, -0x1.2bd77b12fe9a8p0, -0x1.14289fef24b78p-1 -#else -/* Coeffs generated using Remez in [-log10(2)/256, log10(2)/256 ]. */ -0x1.26bb1bbb55516p1, -0x1.53524c73ce9fep1, -0x1.0470591ce4b26p1, -0x1.2bd76577fe684p0, -0x1.1446eeccd0efbp-1 -#endif -}, // 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N) // tab[2*k] = asuint64(T[k]) // tab[2*k+1] = asuint64(H[k]) - (k << 52)/N diff --git a/math/expf.c b/math/expf.c index 08a20d5..9b2f0c3 100644 --- a/math/expf.c +++ b/math/expf.c @@ -2,7 +2,7 @@ * Single-precision e^x function. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/include/mathlib.h b/math/include/mathlib.h index 64cbb9c..279d829 100644 --- a/math/include/mathlib.h +++ b/math/include/mathlib.h @@ -1,8 +1,8 @@ /* * Public API. * - * Copyright (c) 2015-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2015-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #ifndef _MATHLIB_H @@ -18,33 +18,74 @@ float cosf (float); void sincosf (float, float*, float*); double exp (double); -double exp10 (double); double exp2 (double); double log (double); double log2 (double); double pow (double, double); +/* Scalar functions using the vector algorithm with identical result. 
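A brief caller-side note on what this comment promises: the __s_* entry points run the same algorithm as the corresponding vector routines, one element at a time, so they can serve as a scalar reference when checking individual vector lanes. The helper below is purely hypothetical and not part of the header; it only illustrates that the generic scalar routine and the vector-algorithm scalar routine may legitimately differ in the last ulp:

#include <math.h>
#include "mathlib.h"

static int
results_differ (float x)
{
  /* expf is the generic scalar routine; __s_expf follows the vector
     algorithm, so the two may differ slightly while both stay in spec.  */
  return __s_expf (x) != expf (x);
}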
*/ +float __s_sinf (float); +float __s_cosf (float); +float __s_expf (float); +float __s_expf_1u (float); +float __s_exp2f (float); +float __s_exp2f_1u (float); +float __s_logf (float); +float __s_powf (float, float); +double __s_sin (double); +double __s_cos (double); +double __s_exp (double); +double __s_log (double); +double __s_pow (double, double); + #if __aarch64__ -# if __GNUC__ >= 5 +#if __GNUC__ >= 5 typedef __Float32x4_t __f32x4_t; typedef __Float64x2_t __f64x2_t; -# elif __clang_major__*100+__clang_minor__ >= 305 +#elif __clang_major__*100+__clang_minor__ >= 305 typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t; typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; -# else -# error Unsupported compiler -# endif +#else +#error Unsupported compiler +#endif + +/* Vector functions following the base PCS. */ +__f32x4_t __v_sinf (__f32x4_t); +__f32x4_t __v_cosf (__f32x4_t); +__f32x4_t __v_expf (__f32x4_t); +__f32x4_t __v_expf_1u (__f32x4_t); +__f32x4_t __v_exp2f (__f32x4_t); +__f32x4_t __v_exp2f_1u (__f32x4_t); +__f32x4_t __v_logf (__f32x4_t); +__f32x4_t __v_powf (__f32x4_t, __f32x4_t); +__f64x2_t __v_sin (__f64x2_t); +__f64x2_t __v_cos (__f64x2_t); +__f64x2_t __v_exp (__f64x2_t); +__f64x2_t __v_log (__f64x2_t); +__f64x2_t __v_pow (__f64x2_t, __f64x2_t); -# if __GNUC__ >= 9 || __clang_major__ >= 8 -# undef __vpcs -# define __vpcs __attribute__((__aarch64_vector_pcs__)) +#if __GNUC__ >= 9 || __clang_major__ >= 8 +#define __vpcs __attribute__((__aarch64_vector_pcs__)) + +/* Vector functions following the vector PCS. */ +__vpcs __f32x4_t __vn_sinf (__f32x4_t); +__vpcs __f32x4_t __vn_cosf (__f32x4_t); +__vpcs __f32x4_t __vn_expf (__f32x4_t); +__vpcs __f32x4_t __vn_expf_1u (__f32x4_t); +__vpcs __f32x4_t __vn_exp2f (__f32x4_t); +__vpcs __f32x4_t __vn_exp2f_1u (__f32x4_t); +__vpcs __f32x4_t __vn_logf (__f32x4_t); +__vpcs __f32x4_t __vn_powf (__f32x4_t, __f32x4_t); +__vpcs __f64x2_t __vn_sin (__f64x2_t); +__vpcs __f64x2_t __vn_cos (__f64x2_t); +__vpcs __f64x2_t __vn_exp (__f64x2_t); +__vpcs __f64x2_t __vn_log (__f64x2_t); +__vpcs __f64x2_t __vn_pow (__f64x2_t, __f64x2_t); /* Vector functions following the vector PCS using ABI names. */ __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t); -__vpcs __f32x4_t _ZGVnN4v_expf_1u (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t); -__vpcs __f32x4_t _ZGVnN4v_exp2f_1u (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t); @@ -53,7 +94,7 @@ __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t); __vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t); -# endif +#endif #endif #endif diff --git a/math/log.c b/math/log.c index 43dfc2a..d3b7bc6 100644 --- a/math/log.c +++ b/math/log.c @@ -2,7 +2,7 @@ * Double-precision log(x) function. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/log2.c b/math/log2.c index 3f9c21b..55102b7 100644 --- a/math/log2.c +++ b/math/log2.c @@ -2,7 +2,7 @@ * Double-precision log2(x) function. * * Copyright (c) 2018-2019, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/log2_data.c b/math/log2_data.c index 293bd7d..3fc9b47 100644 --- a/math/log2_data.c +++ b/math/log2_data.c @@ -2,7 +2,7 @@ * Data for log2. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/log2f.c b/math/log2f.c index 0a44fa2..acb629e 100644 --- a/math/log2f.c +++ b/math/log2f.c @@ -2,7 +2,7 @@ * Single-precision log2 function. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/log2f_data.c b/math/log2f_data.c index 4866ef7..f3546d7 100644 --- a/math/log2f_data.c +++ b/math/log2f_data.c @@ -2,7 +2,7 @@ * Data definition for log2f. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/log_data.c b/math/log_data.c index 3ecc1f4..96a098d 100644 --- a/math/log_data.c +++ b/math/log_data.c @@ -2,7 +2,7 @@ * Data for log. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/logf.c b/math/logf.c index 820f74c..cfbaee1 100644 --- a/math/logf.c +++ b/math/logf.c @@ -1,8 +1,8 @@ /* * Single-precision log function. * - * Copyright (c) 2017-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2017-2019, Arm Limited. + * SPDX-License-Identifier: MIT */ #include @@ -57,7 +57,7 @@ logf (float x) tmp = ix - OFF; i = (tmp >> (23 - LOGF_TABLE_BITS)) % N; k = (int32_t) tmp >> 23; /* arithmetic shift */ - iz = ix - (tmp & 0xff800000); + iz = ix - (tmp & 0x1ff << 23); invc = T[i].invc; logc = T[i].logc; z = (double_t) asfloat (iz); diff --git a/math/logf_data.c b/math/logf_data.c index 0424768..e8973ce 100644 --- a/math/logf_data.c +++ b/math/logf_data.c @@ -2,7 +2,7 @@ * Data definition for logf. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/math_config.h b/math/math_config.h index 394aaeb..e851043 100644 --- a/math/math_config.h +++ b/math/math_config.h @@ -1,8 +1,8 @@ /* * Configuration for math routines. * - * Copyright (c) 2017-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2017-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #ifndef _MATH_CONFIG_H @@ -92,17 +92,6 @@ # define unlikely(x) (x) #endif -/* Return ptr but hide its value from the compiler so accesses through it - cannot be optimized based on the contents. */ -#define ptr_barrier(ptr) \ - ({ \ - __typeof (ptr) __ptr = (ptr); \ - __asm("" : "+r"(__ptr)); \ - __ptr; \ - }) - -/* Symbol renames to avoid libc conflicts. */ - #if HAVE_FAST_ROUND /* When set, the roundtoint and converttoint functions are provided with the semantics documented below. */ @@ -392,22 +381,15 @@ extern const struct powf_log2_data #define EXP_USE_TOINT_NARROW 0 #define EXP2_POLY_ORDER 5 #define EXP2_POLY_WIDE 0 -/* Wider exp10 polynomial necessary for good precision in non-nearest rounding - and !TOINT_INTRINSICS. 
*/ -#define EXP10_POLY_WIDE 0 extern const struct exp_data { double invln2N; - double invlog10_2N; double shift; double negln2hiN; double negln2loN; - double neglog10_2hiN; - double neglog10_2loN; double poly[4]; /* Last four coefficients. */ double exp2_shift; double exp2_poly[EXP2_POLY_ORDER]; - double exp10_poly[5]; uint64_t tab[2*(1 << EXP_TABLE_BITS)]; } __exp_data HIDDEN; @@ -477,16 +459,4 @@ extern const struct erf_data double erfc_poly_F[ERFC_POLY_F_NCOEFFS]; } __erf_data HIDDEN; -#define V_EXP_TABLE_BITS 7 -extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN; - -#define V_LOG_TABLE_BITS 7 -extern const struct v_log_data -{ - struct - { - double invc, logc; - } table[1 << V_LOG_TABLE_BITS]; -} __v_log_data HIDDEN; - #endif diff --git a/math/math_err.c b/math/math_err.c index cfe0728..1bf9538 100644 --- a/math/math_err.c +++ b/math/math_err.c @@ -2,7 +2,7 @@ * Double-precision math error handling. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/math_errf.c b/math/math_errf.c index 4233918..d5350b8 100644 --- a/math/math_errf.c +++ b/math/math_errf.c @@ -2,7 +2,7 @@ * Single-precision math error handling. * * Copyright (c) 2017-2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/pow.c b/math/pow.c index af719fe..86842c6 100644 --- a/math/pow.c +++ b/math/pow.c @@ -2,7 +2,7 @@ * Double-precision x^y function. * * Copyright (c) 2018-2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/pow_log_data.c b/math/pow_log_data.c index 2a4c250..45569c5 100644 --- a/math/pow_log_data.c +++ b/math/pow_log_data.c @@ -2,7 +2,7 @@ * Data for the log part of pow. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/powf.c b/math/powf.c index 05c80bb..6ba45d3 100644 --- a/math/powf.c +++ b/math/powf.c @@ -2,7 +2,7 @@ * Single-precision pow function. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/powf_log2_data.c b/math/powf_log2_data.c index 243836a..97e0d98 100644 --- a/math/powf_log2_data.c +++ b/math/powf_log2_data.c @@ -2,7 +2,7 @@ * Data definition for powf. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/s_cos.c b/math/s_cos.c new file mode 100644 index 0000000..53a95b0 --- /dev/null +++ b/math/s_cos.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_cos.c" diff --git a/math/s_cosf.c b/math/s_cosf.c new file mode 100644 index 0000000..914c02e --- /dev/null +++ b/math/s_cosf.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_cosf.c" diff --git a/math/s_exp.c b/math/s_exp.c new file mode 100644 index 0000000..ac7246b --- /dev/null +++ b/math/s_exp.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_exp.c" diff --git a/math/s_exp2f.c b/math/s_exp2f.c new file mode 100644 index 0000000..df7dfd6 --- /dev/null +++ b/math/s_exp2f.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_exp2f.c" diff --git a/math/s_exp2f_1u.c b/math/s_exp2f_1u.c new file mode 100644 index 0000000..5e3852b --- /dev/null +++ b/math/s_exp2f_1u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_exp2f_1u.c" diff --git a/math/s_expf.c b/math/s_expf.c new file mode 100644 index 0000000..3492c46 --- /dev/null +++ b/math/s_expf.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_expf.c" diff --git a/math/s_expf_1u.c b/math/s_expf_1u.c new file mode 100644 index 0000000..eb7bbcb --- /dev/null +++ b/math/s_expf_1u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_expf_1u.c" diff --git a/math/s_log.c b/math/s_log.c new file mode 100644 index 0000000..23289cf --- /dev/null +++ b/math/s_log.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_log.c" diff --git a/math/s_logf.c b/math/s_logf.c new file mode 100644 index 0000000..9399350 --- /dev/null +++ b/math/s_logf.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_logf.c" diff --git a/math/s_pow.c b/math/s_pow.c new file mode 100644 index 0000000..2e34c9f --- /dev/null +++ b/math/s_pow.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_pow.c" diff --git a/math/s_powf.c b/math/s_powf.c new file mode 100644 index 0000000..6d91a4a --- /dev/null +++ b/math/s_powf.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_powf.c" diff --git a/math/s_sin.c b/math/s_sin.c new file mode 100644 index 0000000..06982c2 --- /dev/null +++ b/math/s_sin.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_sin.c" diff --git a/math/s_sinf.c b/math/s_sinf.c new file mode 100644 index 0000000..68ca908 --- /dev/null +++ b/math/s_sinf.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_sinf.c" diff --git a/math/sincosf.c b/math/sincosf.c index 446f21d..9746f1c 100644 --- a/math/sincosf.c +++ b/math/sincosf.c @@ -1,8 +1,8 @@ /* * Single-precision sin/cos function. * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2019, Arm Limited. + * SPDX-License-Identifier: MIT */ #include @@ -22,7 +22,7 @@ sincosf (float y, float *sinp, float *cosp) int n; const sincos_t *p = &__sincosf_table[0]; - if (abstop12 (y) < abstop12 (pio4f)) + if (abstop12 (y) < abstop12 (pio4)) { double x2 = x * x; diff --git a/math/sincosf.h b/math/sincosf.h index ec23ed7..1e80fc9 100644 --- a/math/sincosf.h +++ b/math/sincosf.h @@ -1,8 +1,8 @@ /* * Header for sinf, cosf and sincosf. * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018, Arm Limited. 
+ * SPDX-License-Identifier: MIT */ #include @@ -12,7 +12,7 @@ /* 2PI * 2^-64. */ static const double pi63 = 0x1.921FB54442D18p-62; /* PI / 4. */ -static const float pio4f = 0x1.921FB6p-1f; +static const double pio4 = 0x1.921FB54442D18p-1; /* The constants and polynomials for sine and cosine. */ typedef struct diff --git a/math/sincosf_data.c b/math/sincosf_data.c index 2252529..ab4ac47 100644 --- a/math/sincosf_data.c +++ b/math/sincosf_data.c @@ -2,7 +2,7 @@ * Data definition for sinf, cosf and sincosf. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/sinf.c b/math/sinf.c index 8dd8ae4..ddbc1da 100644 --- a/math/sinf.c +++ b/math/sinf.c @@ -1,8 +1,8 @@ /* * Single-precision sin function. * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2019, Arm Limited. + * SPDX-License-Identifier: MIT */ #include @@ -21,7 +21,7 @@ sinf (float y) int n; const sincos_t *p = &__sincosf_table[0]; - if (abstop12 (y) < abstop12 (pio4f)) + if (abstop12 (y) < abstop12 (pio4)) { s = x * x; diff --git a/math/test/mathbench.c b/math/test/mathbench.c index b2711e5..0c17826 100644 --- a/math/test/mathbench.c +++ b/math/test/mathbench.c @@ -1,8 +1,8 @@ /* * Microbenchmark for math functions. * - * Copyright (c) 2018-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #undef _GNU_SOURCE @@ -15,6 +15,11 @@ #include #include "mathlib.h" +#ifndef WANT_VMATH +/* Enable the build of vector math code. */ +# define WANT_VMATH 1 +#endif + /* Number of measurements, best result is reported. */ #define MEASURE 60 /* Array size. 
*/ @@ -29,9 +34,8 @@ static float Af[N]; static long measurecount = MEASURE; static long itercount = ITER; -#ifdef __vpcs -#include -typedef float64x2_t v_double; +#if __aarch64__ && WANT_VMATH +typedef __f64x2_t v_double; #define v_double_len() 2 @@ -47,7 +51,7 @@ v_double_dup (double x) return (v_double){x, x}; } -typedef float32x4_t v_float; +typedef __f32x4_t v_float; #define v_float_len() 4 @@ -72,91 +76,141 @@ typedef float v_float; #define v_float_len(x) 1 #define v_float_load(x) (x)[0] #define v_float_dup(x) (x) - #endif -#if WANT_SVE_MATH -#include -typedef svbool_t sv_bool; -typedef svfloat64_t sv_double; +static double +dummy (double x) +{ + return x; +} -#define sv_double_len() svcntd() +static float +dummyf (float x) +{ + return x; +} -static inline sv_double -sv_double_load (const double *p) +#if WANT_VMATH +#if __aarch64__ +static v_double +__v_dummy (v_double x) { - svbool_t pg = svptrue_b64(); - return svld1(pg, p); + return x; } -static inline sv_double -sv_double_dup (double x) +static v_float +__v_dummyf (v_float x) { - return svdup_n_f64(x); + return x; } -typedef svfloat32_t sv_float; +#ifdef __vpcs +__vpcs static v_double +__vn_dummy (v_double x) +{ + return x; +} -#define sv_float_len() svcntw() +__vpcs static v_float +__vn_dummyf (v_float x) +{ + return x; +} -static inline sv_float -sv_float_load (const float *p) +__vpcs static v_float +xy__vn_powf (v_float x) { - svbool_t pg = svptrue_b32(); - return svld1(pg, p); + return __vn_powf (x, x); } -static inline sv_float -sv_float_dup (float x) +__vpcs static v_float +xy_Z_powf (v_float x) { - return svdup_n_f32(x); + return _ZGVnN4vv_powf (x, x); +} + +__vpcs static v_double +xy__vn_pow (v_double x) +{ + return __vn_pow (x, x); +} + +__vpcs static v_double +xy_Z_pow (v_double x) +{ + return _ZGVnN2vv_pow (x, x); } -#else -/* dummy definitions to make things compile. 
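For orientation, the harness further down times every entry in two ways: a throughput loop that issues independent calls, and a latency loop that threads the previous result back into the next argument (multiplied by a zero the compiler cannot see through) so the calls must execute serially. A condensed sketch of the two scalar loops, with opaque_zero standing in for the harness's own zero variable:

static volatile double opaque_zero;   /* stand-in, assumed always 0 */

static void
bench_thruput (double f (double), const double *a, int n)
{
  /* Calls are independent, so the core can overlap them.  */
  for (int i = 0; i < n; i++)
    f (a[i]);
}

static void
bench_latency (double f (double), const double *a, int n)
{
  double z = opaque_zero;   /* always 0, but not a compile-time constant */
  double prev = z;
  /* prev * z is 0, yet each call still has to wait for the previous result.  */
  for (int i = 0; i < n; i++)
    prev = f (a[i] + prev * z);
}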
*/ -#define sv_double_len(x) 1 -#define sv_float_len(x) 1 #endif -static double -dummy (double x) +static v_float +xy__v_powf (v_float x) { - return x; + return __v_powf (x, x); } -static float -dummyf (float x) +static v_double +xy__v_pow (v_double x) { - return x; + return __v_pow (x, x); } -#ifdef __vpcs -__vpcs static v_double -__vn_dummy (v_double x) +#endif + +static float +xy__s_powf (float x) { - return x; + return __s_powf (x, x); } -__vpcs static v_float -__vn_dummyf (v_float x) +static double +xy__s_pow (double x) { - return x; + return __s_pow (x, x); } #endif -#if WANT_SVE_MATH -static sv_double -__sv_dummy (sv_double x, sv_bool pg) + +static double +xypow (double x) { - return x; + return pow (x, x); } -static sv_float -__sv_dummyf (sv_float x, sv_bool pg) +static float +xypowf (float x) { - return x; + return powf (x, x); } -#endif +static double +xpow (double x) +{ + return pow (x, 23.4); +} + +static float +xpowf (float x) +{ + return powf (x, 23.4f); +} + +static double +ypow (double x) +{ + return pow (2.34, x); +} + +static float +ypowf (float x) +{ + return powf (2.34f, x); +} -#include "test/mathbench_wrappers.h" +static float +sincosf_wrap (float x) +{ + float s, c; + sincosf (x, &s, &c); + return s + c; +} static const struct fun { @@ -169,40 +223,127 @@ static const struct fun { double (*d) (double); float (*f) (float); + v_double (*vd) (v_double); + v_float (*vf) (v_float); #ifdef __vpcs __vpcs v_double (*vnd) (v_double); __vpcs v_float (*vnf) (v_float); -#endif -#if WANT_SVE_MATH - sv_double (*svd) (sv_double, sv_bool); - sv_float (*svf) (sv_float, sv_bool); #endif } fun; } funtab[] = { #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}}, #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}}, +#define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}}, +#define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}}, #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}}, #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}}, -#define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}}, -#define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}}, D (dummy, 1.0, 2.0) +D (exp, -9.9, 9.9) +D (exp, 0.5, 1.0) +D (exp2, -9.9, 9.9) +D (log, 0.01, 11.1) +D (log, 0.999, 1.001) +D (log2, 0.01, 11.1) +D (log2, 0.999, 1.001) +{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, +D (xpow, 0.01, 11.1) +D (ypow, -9.9, 9.9) +D (erf, -6.0, 6.0) + F (dummyf, 1.0, 2.0) +F (expf, -9.9, 9.9) +F (exp2f, -9.9, 9.9) +F (logf, 0.01, 11.1) +F (log2f, 0.01, 11.1) +{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}}, +F (xpowf, 0.01, 11.1) +F (ypowf, -9.9, 9.9) +{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}}, +F (sinf, 0.1, 0.7) +F (sinf, 0.8, 3.1) +F (sinf, -3.1, 3.1) +F (sinf, 3.3, 33.3) +F (sinf, 100, 1000) +F (sinf, 1e6, 1e32) +F (cosf, 0.1, 0.7) +F (cosf, 0.8, 3.1) +F (cosf, -3.1, 3.1) +F (cosf, 3.3, 33.3) +F (cosf, 100, 1000) +F (cosf, 1e6, 1e32) +F (erff, -4.0, 4.0) +#if WANT_VMATH +D (__s_sin, -3.1, 3.1) +D (__s_cos, -3.1, 3.1) +D (__s_exp, -9.9, 9.9) +D (__s_log, 0.01, 11.1) +{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}}, +F (__s_expf, -9.9, 9.9) +F (__s_expf_1u, -9.9, 9.9) +F (__s_exp2f, -9.9, 9.9) +F (__s_exp2f_1u, -9.9, 9.9) +F (__s_logf, 0.01, 11.1) 
+{"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}}, +F (__s_sinf, -3.1, 3.1) +F (__s_cosf, -3.1, 3.1) +#if __aarch64__ +VD (__v_dummy, 1.0, 2.0) +VD (__v_sin, -3.1, 3.1) +VD (__v_cos, -3.1, 3.1) +VD (__v_exp, -9.9, 9.9) +VD (__v_log, 0.01, 11.1) +{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}}, +VF (__v_dummyf, 1.0, 2.0) +VF (__v_expf, -9.9, 9.9) +VF (__v_expf_1u, -9.9, 9.9) +VF (__v_exp2f, -9.9, 9.9) +VF (__v_exp2f_1u, -9.9, 9.9) +VF (__v_logf, 0.01, 11.1) +{"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}}, +VF (__v_sinf, -3.1, 3.1) +VF (__v_cosf, -3.1, 3.1) #ifdef __vpcs VND (__vn_dummy, 1.0, 2.0) +VND (__vn_exp, -9.9, 9.9) +VND (_ZGVnN2v_exp, -9.9, 9.9) +VND (__vn_log, 0.01, 11.1) +VND (_ZGVnN2v_log, 0.01, 11.1) +{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}}, +{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, +VND (__vn_sin, -3.1, 3.1) +VND (_ZGVnN2v_sin, -3.1, 3.1) +VND (__vn_cos, -3.1, 3.1) +VND (_ZGVnN2v_cos, -3.1, 3.1) VNF (__vn_dummyf, 1.0, 2.0) +VNF (__vn_expf, -9.9, 9.9) +VNF (_ZGVnN4v_expf, -9.9, 9.9) +VNF (__vn_expf_1u, -9.9, 9.9) +VNF (__vn_exp2f, -9.9, 9.9) +VNF (_ZGVnN4v_exp2f, -9.9, 9.9) +VNF (__vn_exp2f_1u, -9.9, 9.9) +VNF (__vn_logf, 0.01, 11.1) +VNF (_ZGVnN4v_logf, 0.01, 11.1) +{"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}}, +{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}}, +VNF (__vn_sinf, -3.1, 3.1) +VNF (_ZGVnN4v_sinf, -3.1, 3.1) +VNF (__vn_cosf, -3.1, 3.1) +VNF (_ZGVnN4v_cosf, -3.1, 3.1) +#endif #endif -#if WANT_SVE_MATH -SVD (__sv_dummy, 1.0, 2.0) -SVF (__sv_dummyf, 1.0, 2.0) #endif -#include "test/mathbench_funcs.h" {0}, #undef F #undef D +#undef VF +#undef VD #undef VNF #undef VND -#undef SVF -#undef SVD }; static void @@ -301,75 +442,69 @@ runf_latency (float f (float)) prev = f (Af[i] + prev * z); } -#ifdef __vpcs static void -run_vn_thruput (__vpcs v_double f (v_double)) +run_v_thruput (v_double f (v_double)) { for (int i = 0; i < N; i += v_double_len ()) f (v_double_load (A+i)); } static void -runf_vn_thruput (__vpcs v_float f (v_float)) +runf_v_thruput (v_float f (v_float)) { for (int i = 0; i < N; i += v_float_len ()) f (v_float_load (Af+i)); } static void -run_vn_latency (__vpcs v_double f (v_double)) +run_v_latency (v_double f (v_double)) { - volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 }; - uint64x2_t sel = vsel; - v_double prev = v_double_dup (0); + v_double z = v_double_dup (zero); + v_double prev = z; for (int i = 0; i < N; i += v_double_len ()) - prev = f (vbslq_f64 (sel, prev, v_double_load (A+i))); + prev = f (v_double_load (A+i) + prev * z); } static void -runf_vn_latency (__vpcs v_float f (v_float)) +runf_v_latency (v_float f (v_float)) { - volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 }; - uint32x4_t sel = vsel; - v_float prev = v_float_dup (0); + v_float z = v_float_dup (zero); + v_float prev = z; for (int i = 0; i < N; i += v_float_len ()) - prev = f (vbslq_f32 (sel, prev, v_float_load (Af+i))); + prev = f (v_float_load (Af+i) + prev * z); } -#endif -#if WANT_SVE_MATH +#ifdef __vpcs static void -run_sv_thruput (sv_double f (sv_double, sv_bool)) +run_vn_thruput (__vpcs v_double f (v_double)) { - for (int i = 0; i < N; i += sv_double_len ()) - f (sv_double_load (A+i), svptrue_b64 ()); + for (int i = 0; i < N; i += v_double_len ()) + f (v_double_load (A+i)); } static void -runf_sv_thruput (sv_float f (sv_float, sv_bool)) +runf_vn_thruput (__vpcs v_float f (v_float)) { - for (int i = 0; i < N; i += sv_float_len ()) - f (sv_float_load (Af+i), svptrue_b32 ()); + for (int i = 0; i < N; i 
+= v_float_len ()) + f (v_float_load (Af+i)); } static void -run_sv_latency (sv_double f (sv_double, sv_bool)) +run_vn_latency (__vpcs v_double f (v_double)) { - volatile sv_bool vsel = svptrue_b64 (); - sv_bool sel = vsel; - sv_double prev = sv_double_dup (0); - for (int i = 0; i < N; i += sv_double_len ()) - prev = f (svsel_f64 (sel, sv_double_load (A+i), prev), svptrue_b64 ()); + v_double z = v_double_dup (zero); + v_double prev = z; + for (int i = 0; i < N; i += v_double_len ()) + prev = f (v_double_load (A+i) + prev * z); } static void -runf_sv_latency (sv_float f (sv_float, sv_bool)) +runf_vn_latency (__vpcs v_float f (v_float)) { - volatile sv_bool vsel = svptrue_b32 (); - sv_bool sel = vsel; - sv_float prev = sv_float_dup (0); - for (int i = 0; i < N; i += sv_float_len ()) - prev = f (svsel_f32 (sel, sv_float_load (Af+i), prev), svptrue_b32 ()); + v_float z = v_float_dup (zero); + v_float prev = z; + for (int i = 0; i < N; i += v_float_len ()) + prev = f (v_float_load (Af+i) + prev * z); } #endif @@ -404,10 +539,10 @@ bench1 (const struct fun *f, int type, double lo, double hi) const char *s = type == 't' ? "rthruput" : "latency"; int vlen = 1; - if (f->vec == 'n') - vlen = f->prec == 'd' ? v_double_len() : v_float_len(); - else if (f->vec == 's') - vlen = f->prec == 'd' ? sv_double_len() : sv_float_len(); + if (f->vec && f->prec == 'd') + vlen = v_double_len(); + else if (f->vec && f->prec == 'f') + vlen = v_float_len(); if (f->prec == 'd' && type == 't' && f->vec == 0) TIMEIT (run_thruput, f->fun.d); @@ -417,6 +552,14 @@ bench1 (const struct fun *f, int type, double lo, double hi) TIMEIT (runf_thruput, f->fun.f); else if (f->prec == 'f' && type == 'l' && f->vec == 0) TIMEIT (runf_latency, f->fun.f); + else if (f->prec == 'd' && type == 't' && f->vec == 'v') + TIMEIT (run_v_thruput, f->fun.vd); + else if (f->prec == 'd' && type == 'l' && f->vec == 'v') + TIMEIT (run_v_latency, f->fun.vd); + else if (f->prec == 'f' && type == 't' && f->vec == 'v') + TIMEIT (runf_v_thruput, f->fun.vf); + else if (f->prec == 'f' && type == 'l' && f->vec == 'v') + TIMEIT (runf_v_latency, f->fun.vf); #ifdef __vpcs else if (f->prec == 'd' && type == 't' && f->vec == 'n') TIMEIT (run_vn_thruput, f->fun.vnd); @@ -427,32 +570,20 @@ bench1 (const struct fun *f, int type, double lo, double hi) else if (f->prec == 'f' && type == 'l' && f->vec == 'n') TIMEIT (runf_vn_latency, f->fun.vnf); #endif -#if WANT_SVE_MATH - else if (f->prec == 'd' && type == 't' && f->vec == 's') - TIMEIT (run_sv_thruput, f->fun.svd); - else if (f->prec == 'd' && type == 'l' && f->vec == 's') - TIMEIT (run_sv_latency, f->fun.svd); - else if (f->prec == 'f' && type == 't' && f->vec == 's') - TIMEIT (runf_sv_thruput, f->fun.svf); - else if (f->prec == 'f' && type == 'l' && f->vec == 's') - TIMEIT (runf_sv_latency, f->fun.svf); -#endif if (type == 't') { ns100 = (100 * dt + itercount * N / 2) / (itercount * N); - printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n", - f->name, s, + printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s, (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), - (unsigned long long) dt, lo, hi, vlen); + (unsigned long long) dt, lo, hi); } else if (type == 'l') { ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen); - printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n", - f->name, s, + printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s, (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), - (unsigned long long) dt, lo, hi, vlen); 
+ (unsigned long long) dt, lo, hi); } fflush (stdout); } diff --git a/math/test/mathbench_funcs.h b/math/test/mathbench_funcs.h deleted file mode 100644 index 84c4e68..0000000 --- a/math/test/mathbench_funcs.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Function entries for mathbench. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -/* clang-format off */ -D (exp, -9.9, 9.9) -D (exp, 0.5, 1.0) -D (exp10, -9.9, 9.9) -D (exp2, -9.9, 9.9) -D (log, 0.01, 11.1) -D (log, 0.999, 1.001) -D (log2, 0.01, 11.1) -D (log2, 0.999, 1.001) -{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, -D (xpow, 0.01, 11.1) -D (ypow, -9.9, 9.9) -D (erf, -6.0, 6.0) - -F (expf, -9.9, 9.9) -F (exp2f, -9.9, 9.9) -F (logf, 0.01, 11.1) -F (log2f, 0.01, 11.1) -{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}}, -F (xpowf, 0.01, 11.1) -F (ypowf, -9.9, 9.9) -{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}}, -F (sinf, 0.1, 0.7) -F (sinf, 0.8, 3.1) -F (sinf, -3.1, 3.1) -F (sinf, 3.3, 33.3) -F (sinf, 100, 1000) -F (sinf, 1e6, 1e32) -F (cosf, 0.1, 0.7) -F (cosf, 0.8, 3.1) -F (cosf, -3.1, 3.1) -F (cosf, 3.3, 33.3) -F (cosf, 100, 1000) -F (cosf, 1e6, 1e32) -F (erff, -4.0, 4.0) -#ifdef __vpcs -VND (_ZGVnN2v_exp, -9.9, 9.9) -VND (_ZGVnN2v_log, 0.01, 11.1) -{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, -VND (_ZGVnN2v_sin, -3.1, 3.1) -VND (_ZGVnN2v_cos, -3.1, 3.1) -VNF (_ZGVnN4v_expf, -9.9, 9.9) -VNF (_ZGVnN4v_expf_1u, -9.9, 9.9) -VNF (_ZGVnN4v_exp2f, -9.9, 9.9) -VNF (_ZGVnN4v_exp2f_1u, -9.9, 9.9) -VNF (_ZGVnN4v_logf, 0.01, 11.1) -{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}}, -VNF (_ZGVnN4v_sinf, -3.1, 3.1) -VNF (_ZGVnN4v_cosf, -3.1, 3.1) -#endif - /* clang-format on */ diff --git a/math/test/mathbench_wrappers.h b/math/test/mathbench_wrappers.h deleted file mode 100644 index 062b9db..0000000 --- a/math/test/mathbench_wrappers.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Function wrappers for mathbench. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifdef __vpcs - -__vpcs static v_float -xy_Z_powf (v_float x) -{ - return _ZGVnN4vv_powf (x, x); -} - -__vpcs static v_double -xy_Z_pow (v_double x) -{ - return _ZGVnN2vv_pow (x, x); -} - -#endif - -static double -xypow (double x) -{ - return pow (x, x); -} - -static float -xypowf (float x) -{ - return powf (x, x); -} - -static double -xpow (double x) -{ - return pow (x, 23.4); -} - -static float -xpowf (float x) -{ - return powf (x, 23.4f); -} - -static double -ypow (double x) -{ - return pow (2.34, x); -} - -static float -ypowf (float x) -{ - return powf (2.34f, x); -} - -static float -sincosf_wrap (float x) -{ - float s, c; - sincosf (x, &s, &c); - return s + c; -} diff --git a/math/test/mathtest.c b/math/test/mathtest.c index cedccfd..3108967 100644 --- a/math/test/mathtest.c +++ b/math/test/mathtest.c @@ -1,8 +1,8 @@ /* * mathtest.c - test rig for mathlib * - * Copyright (c) 1998-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 1998-2019, Arm Limited. 
+ * SPDX-License-Identifier: MIT */ #include @@ -196,11 +196,9 @@ int is_complex_rettype(int rettype) { #define TFUNCARM(arg,ret,name,tolerance) { t_func, arg, ret, (void*)& ARM_PREFIX(name), m_none, tolerance, #name } #define MFUNC(arg,ret,name,tolerance) { t_macro, arg, ret, NULL, m_##name, tolerance, #name } -#ifndef PL /* sincosf wrappers for easier testing. */ static float sincosf_sinf(float x) { float s,c; sincosf(x, &s, &c); return s; } static float sincosf_cosf(float x) { float s,c; sincosf(x, &s, &c); return c; } -#endif test_func tfuncs[] = { /* trigonometric */ @@ -220,10 +218,9 @@ test_func tfuncs[] = { TFUNCARM(at_s,rt_s, tanf, 4*ULPUNIT), TFUNCARM(at_s,rt_s, sinf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, cosf, 3*ULPUNIT/4), -#ifndef PL TFUNCARM(at_s,rt_s, sincosf_sinf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, sincosf_cosf, 3*ULPUNIT/4), -#endif + /* hyperbolic */ TFUNC(at_d, rt_d, atanh, 4*ULPUNIT), TFUNC(at_d, rt_d, asinh, 4*ULPUNIT), @@ -254,7 +251,6 @@ test_func tfuncs[] = { TFUNCARM(at_s,rt_s, expf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, exp2f, 3*ULPUNIT/4), TFUNC(at_s,rt_s, expm1f, ULPUNIT), - TFUNC(at_d,rt_d, exp10, ULPUNIT), /* power */ TFUNC(at_d2,rt_d, pow, 3*ULPUNIT/4), @@ -1022,7 +1018,6 @@ int runtest(testdetail t) { DO_DOP(d_arg1,op1r); DO_DOP(d_arg2,op2r); s_arg1.i = t.op1r[0]; s_arg2.i = t.op2r[0]; - s_res.i = 0; /* * Detect NaNs, infinities and denormals on input, and set a @@ -1157,25 +1152,22 @@ int runtest(testdetail t) { tresultr[0] = t.resultr[0]; tresultr[1] = t.resultr[1]; resultr[0] = d_res.i[dmsd]; resultr[1] = d_res.i[dlsd]; - resulti[0] = resulti[1] = 0; wres = 2; break; case rt_i: tresultr[0] = t.resultr[0]; resultr[0] = intres; - resulti[0] = 0; wres = 1; break; case rt_s: case rt_s2: tresultr[0] = t.resultr[0]; resultr[0] = s_res.i; - resulti[0] = 0; wres = 1; break; default: puts("unhandled rettype in runtest"); - abort (); + wres = 0; } if(t.resultc != rc_none) { int err = 0; diff --git a/math/test/rtest/dotest.c b/math/test/rtest/dotest.c index 5b3e9b4..6be79e1 100644 --- a/math/test/rtest/dotest.c +++ b/math/test/rtest/dotest.c @@ -2,7 +2,7 @@ * dotest.c - actually generate mathlib test cases * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/test/rtest/intern.h b/math/test/rtest/intern.h index 3ebd7dd..12a9c74 100644 --- a/math/test/rtest/intern.h +++ b/math/test/rtest/intern.h @@ -2,7 +2,7 @@ * intern.h * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #ifndef mathtest_intern_h diff --git a/math/test/rtest/main.c b/math/test/rtest/main.c index 3d533c9..0d8ead8 100644 --- a/math/test/rtest/main.c +++ b/math/test/rtest/main.c @@ -2,7 +2,7 @@ * main.c * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/test/rtest/random.c b/math/test/rtest/random.c index 1de3258..5612396 100644 --- a/math/test/rtest/random.c +++ b/math/test/rtest/random.c @@ -2,7 +2,7 @@ * random.c - random number generator for producing mathlib test cases * * Copyright (c) 1998-2019, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "types.h" diff --git a/math/test/rtest/random.h b/math/test/rtest/random.h index 0b477d7..b4b22df 100644 --- a/math/test/rtest/random.h +++ b/math/test/rtest/random.h @@ -2,7 +2,7 @@ * random.h - header for random.c * * Copyright (c) 2009-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "types.h" diff --git a/math/test/rtest/semi.c b/math/test/rtest/semi.c index 70a7844..c9f0daf 100644 --- a/math/test/rtest/semi.c +++ b/math/test/rtest/semi.c @@ -2,7 +2,7 @@ * semi.c: test implementations of mathlib seminumerical functions * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/test/rtest/semi.h b/math/test/rtest/semi.h index 7a1444e..17dc415 100644 --- a/math/test/rtest/semi.h +++ b/math/test/rtest/semi.h @@ -2,7 +2,7 @@ * semi.h: header for semi.c * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #ifndef test_semi_h diff --git a/math/test/rtest/types.h b/math/test/rtest/types.h index e15b4e0..53cd557 100644 --- a/math/test/rtest/types.h +++ b/math/test/rtest/types.h @@ -2,7 +2,7 @@ * types.h * * Copyright (c) 2005-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #ifndef mathtest_types_h diff --git a/math/test/rtest/wrappers.c b/math/test/rtest/wrappers.c index 4410171..de45ac5 100644 --- a/math/test/rtest/wrappers.c +++ b/math/test/rtest/wrappers.c @@ -2,7 +2,7 @@ * wrappers.c - wrappers to modify output of MPFR/MPC test functions * * Copyright (c) 2014-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/test/rtest/wrappers.h b/math/test/rtest/wrappers.h index 0a8a587..7b09c85 100644 --- a/math/test/rtest/wrappers.h +++ b/math/test/rtest/wrappers.h @@ -2,7 +2,7 @@ * wrappers.h - wrappers to modify output of MPFR/MPC test functions * * Copyright (c) 2014-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ typedef struct { diff --git a/math/test/runulp.sh b/math/test/runulp.sh index e2e03e3..0190d9a 100755 --- a/math/test/runulp.sh +++ b/math/test/runulp.sh @@ -2,8 +2,8 @@ # ULP error check script. # -# Copyright (c) 2019-2023, Arm Limited. -# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +# Copyright (c) 2019-2020, Arm Limited. 
+# SPDX-License-Identifier: MIT #set -x set -eu @@ -72,16 +72,6 @@ t pow 0x1.ffffffffffff0p-1 0x1.0000000000008p0 x 0x1p60 0x1p68 50000 t pow 0x1.ffffffffff000p-1 0x1p0 x 0x1p50 0x1p52 50000 t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000 -L=0.02 -t exp10 0 0x1p-47 5000 -t exp10 -0 -0x1p-47 5000 -t exp10 0x1p-47 1 50000 -t exp10 -0x1p-47 -1 50000 -t exp10 1 0x1.34413509f79ffp8 50000 -t exp10 -1 -0x1.434e6420f4374p8 50000 -t exp10 0x1.34413509f79ffp8 inf 5000 -t exp10 -0x1.434e6420f4374p8 -inf 5000 - L=1.0 Ldir=0.9 t erf 0 0xffff000000000000 10000 @@ -153,10 +143,15 @@ Ldir=0.5 done # vector functions - Ldir=0.5 r='n' -flags="${ULPFLAGS:--q}" +flags="${ULPFLAGS:--q} -f" +runs= +check __s_exp 1 && runs=1 +runv= +check __v_exp 1 && runv=1 +runvn= +check __vn_exp 1 && runvn=1 range_exp=' 0 0xffff000000000000 10000 @@ -182,10 +177,9 @@ range_pow=' ' range_sin=' - 0 0x1p23 500000 - -0 -0x1p23 500000 - 0x1p23 inf 10000 - -0x1p23 -inf 10000 + 0 0xffff000000000000 10000 + 0x1p-4 0x1p4 400000 + -0x1p-23 0x1p23 400000 ' range_cos="$range_sin" @@ -205,10 +199,9 @@ range_logf=' ' range_sinf=' - 0 0x1p20 500000 - -0 -0x1p20 500000 - 0x1p20 inf 10000 - -0x1p20 -inf 10000 + 0 0xffff0000 10000 + 0x1p-4 0x1p4 300000 +-0x1p-9 -0x1p9 300000 ' range_cosf="$range_sinf" @@ -236,8 +229,9 @@ L_sinf=1.4 L_cosf=1.4 L_powf=2.1 -while read G F D +while read G F R do + [ "$R" = 1 ] || continue case "$G" in \#*) continue ;; esac eval range="\${range_$G}" eval L="\${L_$G}" @@ -245,35 +239,74 @@ do do [ -n "$X" ] || continue case "$X" in \#*) continue ;; esac - disable_fenv="" - if [ -z "$WANT_SIMD_EXCEPT" ] || [ $WANT_SIMD_EXCEPT -eq 0 ]; then - # If library was built with SIMD exceptions - # disabled, disable fenv checking in ulp - # tool. Otherwise, fenv checking may still be - # disabled by adding -f to the end of the run - # line. 
- disable_fenv="-f" - fi - t $D $disable_fenv $F $X + t $F $X done << EOF $range - EOF done << EOF # group symbol run -exp _ZGVnN2v_exp -log _ZGVnN2v_log -pow _ZGVnN2vv_pow -f -sin _ZGVnN2v_sin -z -cos _ZGVnN2v_cos -expf _ZGVnN4v_expf -expf_1u _ZGVnN4v_expf_1u -f -exp2f _ZGVnN4v_exp2f -exp2f_1u _ZGVnN4v_exp2f_1u -f -logf _ZGVnN4v_logf -sinf _ZGVnN4v_sinf -z -cosf _ZGVnN4v_cosf -powf _ZGVnN4vv_powf -f +exp __s_exp $runs +exp __v_exp $runv +exp __vn_exp $runvn +exp _ZGVnN2v_exp $runvn + +log __s_log $runs +log __v_log $runv +log __vn_log $runvn +log _ZGVnN2v_log $runvn + +pow __s_pow $runs +pow __v_pow $runv +pow __vn_pow $runvn +pow _ZGVnN2vv_pow $runvn + +sin __s_sin $runs +sin __v_sin $runv +sin __vn_sin $runvn +sin _ZGVnN2v_sin $runvn + +cos __s_cos $runs +cos __v_cos $runv +cos __vn_cos $runvn +cos _ZGVnN2v_cos $runvn + +expf __s_expf $runs +expf __v_expf $runv +expf __vn_expf $runvn +expf _ZGVnN4v_expf $runvn + +expf_1u __s_expf_1u $runs +expf_1u __v_expf_1u $runv +expf_1u __vn_expf_1u $runvn + +exp2f __s_exp2f $runs +exp2f __v_exp2f $runv +exp2f __vn_exp2f $runvn +exp2f _ZGVnN4v_exp2f $runvn + +exp2f_1u __s_exp2f_1u $runs +exp2f_1u __v_exp2f_1u $runv +exp2f_1u __vn_exp2f_1u $runvn + +logf __s_logf $runs +logf __v_logf $runv +logf __vn_logf $runvn +logf _ZGVnN4v_logf $runvn + +sinf __s_sinf $runs +sinf __v_sinf $runv +sinf __vn_sinf $runvn +sinf _ZGVnN4v_sinf $runvn + +cosf __s_cosf $runs +cosf __v_cosf $runv +cosf __vn_cosf $runvn +cosf _ZGVnN4v_cosf $runvn + +powf __s_powf $runs +powf __v_powf $runv +powf __vn_powf $runvn +powf _ZGVnN4vv_powf $runvn EOF [ 0 -eq $FAIL ] || { diff --git a/math/test/testcases/directed/cosf.tst b/math/test/testcases/directed/cosf.tst index 7ea0d45..7916044 100644 --- a/math/test/testcases/directed/cosf.tst +++ b/math/test/testcases/directed/cosf.tst @@ -1,7 +1,7 @@ ; cosf.tst - Directed test cases for SP cosine ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=cosf op1=7fc00001 result=7fc00001 errno=0 func=cosf op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/erf.tst b/math/test/testcases/directed/erf.tst index 12384ce..7fa4d18 100644 --- a/math/test/testcases/directed/erf.tst +++ b/math/test/testcases/directed/erf.tst @@ -1,7 +1,7 @@ ; erf.tst - Directed test cases for erf ; ; Copyright (c) 2007-2020, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=erf op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=erf op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/erff.tst b/math/test/testcases/directed/erff.tst index 28f8fa3..d05b7b1 100644 --- a/math/test/testcases/directed/erff.tst +++ b/math/test/testcases/directed/erff.tst @@ -1,7 +1,7 @@ ; erff.tst ; ; Copyright (c) 2007-2020, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=erff op1=7fc00001 result=7fc00001 errno=0 func=erff op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/exp.tst b/math/test/testcases/directed/exp.tst index 0bb2ef4..85d556c 100644 --- a/math/test/testcases/directed/exp.tst +++ b/math/test/testcases/directed/exp.tst @@ -1,7 +1,7 @@ ; Directed test cases for exp ; ; Copyright (c) 2018-2019, Arm Limited. 
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=exp op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=exp op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/exp10.tst b/math/test/testcases/directed/exp10.tst deleted file mode 100644 index 2cf4273..0000000 --- a/math/test/testcases/directed/exp10.tst +++ /dev/null @@ -1,15 +0,0 @@ -; Directed test cases for exp10 -; -; Copyright (c) 2023, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - -func=exp10 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 -func=exp10 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 -func=exp10 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i -func=exp10 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i -func=exp10 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 -func=exp10 op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox -func=exp10 op1=fff00000.00000000 result=00000000.00000000 errno=0 -func=exp10 op1=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux -func=exp10 op1=00000000.00000000 result=3ff00000.00000000 errno=0 -func=exp10 op1=80000000.00000000 result=3ff00000.00000000 errno=0 diff --git a/math/test/testcases/directed/exp2.tst b/math/test/testcases/directed/exp2.tst index 7069f90..fa56c9f 100644 --- a/math/test/testcases/directed/exp2.tst +++ b/math/test/testcases/directed/exp2.tst @@ -1,7 +1,7 @@ ; Directed test cases for exp2 ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=exp2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=exp2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/exp2f.tst b/math/test/testcases/directed/exp2f.tst index 6ca2eea..38cfc3f 100644 --- a/math/test/testcases/directed/exp2f.tst +++ b/math/test/testcases/directed/exp2f.tst @@ -1,7 +1,7 @@ ; exp2f.tst - Directed test cases for exp2f ; ; Copyright (c) 2017-2019, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=exp2f op1=7fc00001 result=7fc00001 errno=0 func=exp2f op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/expf.tst b/math/test/testcases/directed/expf.tst index 89ae8fe..ff0f671 100644 --- a/math/test/testcases/directed/expf.tst +++ b/math/test/testcases/directed/expf.tst @@ -1,7 +1,7 @@ ; expf.tst - Directed test cases for expf ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=expf op1=7fc00001 result=7fc00001 errno=0 func=expf op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/log.tst b/math/test/testcases/directed/log.tst index 686ea83..a0aa398 100644 --- a/math/test/testcases/directed/log.tst +++ b/math/test/testcases/directed/log.tst @@ -1,7 +1,7 @@ ; Directed test cases for log ; ; Copyright (c) 2018-2019, Arm Limited. 
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=log op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=log op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/log2.tst b/math/test/testcases/directed/log2.tst index 361bdde..ff1286c 100644 --- a/math/test/testcases/directed/log2.tst +++ b/math/test/testcases/directed/log2.tst @@ -1,7 +1,7 @@ ; Directed test cases for log2 ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=log2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/log2f.tst b/math/test/testcases/directed/log2f.tst index 5fce051..5832c4f 100644 --- a/math/test/testcases/directed/log2f.tst +++ b/math/test/testcases/directed/log2f.tst @@ -1,7 +1,7 @@ ; log2f.tst - Directed test cases for log2f ; ; Copyright (c) 2017-2019, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=log2f op1=7fc00001 result=7fc00001 errno=0 func=log2f op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/logf.tst b/math/test/testcases/directed/logf.tst index a6d1b9d..6e68a36 100644 --- a/math/test/testcases/directed/logf.tst +++ b/math/test/testcases/directed/logf.tst @@ -1,7 +1,7 @@ ; logf.tst - Directed test cases for logf ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=logf op1=7fc00001 result=7fc00001 errno=0 func=logf op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/pow.tst b/math/test/testcases/directed/pow.tst index 879d128..1966581 100644 --- a/math/test/testcases/directed/pow.tst +++ b/math/test/testcases/directed/pow.tst @@ -1,7 +1,7 @@ ; Directed test cases for pow ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=pow op1=00000000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=00000000.00000000 op2=00000000.00000001 result=00000000.00000000 errno=0 diff --git a/math/test/testcases/directed/powf.tst b/math/test/testcases/directed/powf.tst index 46d5224..3fa8b11 100644 --- a/math/test/testcases/directed/powf.tst +++ b/math/test/testcases/directed/powf.tst @@ -1,7 +1,7 @@ ; powf.tst - Directed test cases for powf ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=powf op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i func=powf op1=7f800001 op2=ff800001 result=7fc00001 errno=0 status=i diff --git a/math/test/testcases/directed/sincosf.tst b/math/test/testcases/directed/sincosf.tst index cddb346..4b33d22 100644 --- a/math/test/testcases/directed/sincosf.tst +++ b/math/test/testcases/directed/sincosf.tst @@ -1,7 +1,7 @@ ; Directed test cases for SP sincos ; ; Copyright (c) 2007-2019, Arm Limited. 
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=sincosf_sinf op1=7fc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/sinf.tst b/math/test/testcases/directed/sinf.tst index 041b13d..ded80b1 100644 --- a/math/test/testcases/directed/sinf.tst +++ b/math/test/testcases/directed/sinf.tst @@ -1,7 +1,7 @@ ; sinf.tst - Directed test cases for SP sine ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=sinf op1=7fc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/random/double.tst b/math/test/testcases/random/double.tst index 8e885d6..c24ff80 100644 --- a/math/test/testcases/random/double.tst +++ b/math/test/testcases/random/double.tst @@ -1,7 +1,7 @@ !! double.tst - Random test case specification for DP functions !! !! Copyright (c) 1999-2019, Arm Limited. -!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +!! SPDX-License-Identifier: MIT test exp 10000 test exp2 10000 diff --git a/math/test/testcases/random/float.tst b/math/test/testcases/random/float.tst index ea4a5a0..d02a227 100644 --- a/math/test/testcases/random/float.tst +++ b/math/test/testcases/random/float.tst @@ -1,7 +1,7 @@ !! single.tst - Random test case specification for SP functions !! !! Copyright (c) 1999-2019, Arm Limited. -!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +!! SPDX-License-Identifier: MIT test sinf 10000 test cosf 10000 diff --git a/math/test/ulp.c b/math/test/ulp.c index 5ff2997..51479b8 100644 --- a/math/test/ulp.c +++ b/math/test/ulp.c @@ -1,11 +1,10 @@ /* * ULP error checking tool for math functions. * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ -#define _GNU_SOURCE #include #include #include @@ -24,6 +23,11 @@ # include #endif +#ifndef WANT_VMATH +/* Enable the build of vector math code. */ +# define WANT_VMATH 1 +#endif + static inline uint64_t asuint64 (double f) { @@ -208,61 +212,73 @@ struct conf unsigned long long n; double softlim; double errlim; - int ignore_zero_sign; }; +/* Wrappers for sincos. */ +static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);} +static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);} +static double sincos_sin(double x) {(void)cos(x); return sin(x);} +static double sincos_cos(double x) {(void)sin(x); return cos(x);} +#if USE_MPFR +static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); } +static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); } +#endif + /* A bit of a hack: call vector functions twice with the same input in lane 0 but a different value in other lanes: once with an in-range value and then with a special case value. */ static int secondcall; /* Wrappers for vector functions. */ -#ifdef __vpcs +#if __aarch64__ && WANT_VMATH typedef __f32x4_t v_float; typedef __f64x2_t v_double; -/* First element of fv and dv may be changed by -c argument. 
*/ -static float fv[2] = {1.0f, -INFINITY}; -static double dv[2] = {1.0, -INFINITY}; +static const float fv[2] = {1.0f, -INFINITY}; +static const double dv[2] = {1.0, -INFINITY}; static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; } static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; } -#if WANT_SVE_MATH -#include -typedef __SVFloat32_t sv_float; -typedef __SVFloat64_t sv_double; - -static inline sv_float svargf(float x) { - int n = svcntw(); - float base[n]; - for (int i=0; iname; f++) printf ("\t%s\n", f->name); @@ -719,7 +768,6 @@ main (int argc, char *argv[]) conf.fenv = 1; conf.softlim = 0; conf.errlim = INFINITY; - conf.ignore_zero_sign = 0; for (;;) { argc--; @@ -759,22 +807,11 @@ main (int argc, char *argv[]) { argc--; argv++; - if (argc < 1 || argv[0][1] != '\0') + if (argc < 1) usage (); conf.rc = argv[0][0]; } break; - case 'z': - conf.ignore_zero_sign = 1; - break; -#ifdef __vpcs - case 'c': - argc--; - argv++; - fv[0] = strtof(argv[0], 0); - dv[0] = strtod(argv[0], 0); - break; -#endif default: usage (); } @@ -800,19 +837,7 @@ main (int argc, char *argv[]) if (strcmp (argv[0], f->name) == 0) break; if (!f->name) - { -#ifndef __vpcs - /* Ignore vector math functions if vector math is not supported. */ - if (strncmp (argv[0], "_ZGVnN", 6) == 0) - exit (0); -#endif -#if !WANT_SVE_MATH - if (strncmp (argv[0], "_ZGVsMxv", 8) == 0) - exit (0); -#endif - printf ("math function %s not supported\n", argv[0]); - exit (1); - } + usage (); if (!f->singleprec && LDBL_MANT_DIG == DBL_MANT_DIG) conf.mpfr = 1; /* Use mpfr if long double has no extra precision. */ if (!USE_MPFR && conf.mpfr) diff --git a/math/test/ulp.h b/math/test/ulp.h index b0bc59a..a0c3016 100644 --- a/math/test/ulp.h +++ b/math/test/ulp.h @@ -1,8 +1,8 @@ /* * Generic functions for ULP error estimation. * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT */ /* For each different math function type, @@ -37,8 +37,7 @@ static int RT(ulpscale_mpfr) (mpfr_t x, int t) /* Difference between exact result and closest real number that gets rounded to got, i.e. error before rounding, for a correctly rounded result the difference is 0. */ -static double RT (ulperr) (RT (float) got, const struct RT (ret) * p, int r, - int ignore_zero_sign) +static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r) { RT(float) want = p->y; RT(float) d; @@ -46,18 +45,10 @@ static double RT (ulperr) (RT (float) got, const struct RT (ret) * p, int r, if (RT(asuint) (got) == RT(asuint) (want)) return 0.0; - if (isnan (got) && isnan (want)) - /* Ignore sign of NaN. */ - return RT (issignaling) (got) == RT (issignaling) (want) ? 0 : INFINITY; if (signbit (got) != signbit (want)) - { - /* Fall through to ULP calculation if ignoring sign of zero and at - exactly one of want and got is non-zero. */ - if (ignore_zero_sign && want == got) - return 0.0; - if (!ignore_zero_sign || (want != 0 && got != 0)) - return INFINITY; - } + /* May have false positives with NaN. */ + //return isnan(got) && isnan(want) ? 
0 : INFINITY; + return INFINITY; if (!isfinite (want) || !isfinite (got)) { if (isnan (got) != isnan (want)) @@ -123,12 +114,8 @@ static inline void T(call_fenv) (const struct fun *f, struct T(args) a, int r, static inline void T(call_nofenv) (const struct fun *f, struct T(args) a, int r, RT(float) * y, int *ex) { - if (r != FE_TONEAREST) - fesetround (r); *y = T(call) (f, a); *ex = 0; - if (r != FE_TONEAREST) - fesetround (FE_TONEAREST); } static inline int T(call_long_fenv) (const struct fun *f, struct T(args) a, @@ -168,12 +155,8 @@ static inline int T(call_long_nofenv) (const struct fun *f, struct T(args) a, int r, struct RT(ret) * p, RT(float) ygot, int exgot) { - if (r != FE_TONEAREST) - fesetround (r); RT(double) yl = T(call_long) (f, a); p->y = (RT(float)) yl; - if (r != FE_TONEAREST) - fesetround (FE_TONEAREST); if (RT(isok_nofenv) (ygot, p->y)) return 1; p->ulpexp = RT(ulpscale) (p->y); @@ -305,7 +288,7 @@ static int T(cmp) (const struct fun *f, struct gen *gen, if (!ok) { int print = 0; - double err = RT (ulperr) (ygot, &want, r, conf->ignore_zero_sign); + double err = RT(ulperr) (ygot, &want, r); double abserr = fabs (err); // TODO: count errors below accuracy limit. if (abserr > 0) diff --git a/math/test/ulp_funcs.h b/math/test/ulp_funcs.h deleted file mode 100644 index 84f7927..0000000 --- a/math/test/ulp_funcs.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Function entries for ulp. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -/* clang-format off */ - F1 (sin) - F1 (cos) - F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0) - F (sincosf_cosf, sincosf_cosf, sincos_cos, sincos_mpfr_cos, 1, 1, f1, 0) - F1 (exp) - F1 (exp2) - F1 (log) - F1 (log2) - F2 (pow) - F1 (erf) - D1 (exp) - D1 (exp10) - D1 (exp2) - D1 (log) - D1 (log2) - D2 (pow) - D1 (erf) -#ifdef __vpcs - F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1) - F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1) - F (_ZGVnN4v_expf_1u, Z_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) - F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1) - F (_ZGVnN4v_exp2f_1u, Z_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1) - F (_ZGVnN4v_exp2f, Z_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) - F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1) - F (_ZGVnN4vv_powf, Z_powf, pow, mpfr_pow, 2, 1, f2, 1) - F (_ZGVnN2v_sin, Z_sin, sinl, mpfr_sin, 1, 0, d1, 1) - F (_ZGVnN2v_cos, Z_cos, cosl, mpfr_cos, 1, 0, d1, 1) - F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1) - F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1) - F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1) -#endif -/* clang-format on */ diff --git a/math/test/ulp_wrappers.h b/math/test/ulp_wrappers.h deleted file mode 100644 index 60dc3d6..0000000 --- a/math/test/ulp_wrappers.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Function wrappers for ulp. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -/* clang-format off */ - -/* Wrappers for sincos. 
*/ -static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);} -static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);} -static double sincos_sin(double x) {(void)cos(x); return sin(x);} -static double sincos_cos(double x) {(void)sin(x); return cos(x);} -#if USE_MPFR -static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); } -static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); } -#endif - -/* Wrappers for vector functions. */ -#ifdef __vpcs -static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; } -static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; } -static float Z_expf_1u(float x) { return _ZGVnN4v_expf_1u(argf(x))[0]; } -static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; } -static float Z_exp2f_1u(float x) { return _ZGVnN4v_exp2f_1u(argf(x))[0]; } -static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; } -static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; } -static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; } -static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; } -static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; } -static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; } -static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; } -static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; } -#endif - -/* clang-format on */ diff --git a/math/tgamma128.c b/math/tgamma128.c deleted file mode 100644 index dda0da7..0000000 --- a/math/tgamma128.c +++ /dev/null @@ -1,351 +0,0 @@ -/* - * Implementation of the true gamma function (as opposed to lgamma) - * for 128-bit long double. - * - * Copyright (c) 2006,2009,2023 Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -/* - * This module implements the float128 gamma function under the name - * tgamma128. It's expected to be suitable for integration into system - * maths libraries under the standard name tgammal, if long double is - * 128-bit. Such a library will probably want to check the error - * handling and optimize the initial process of extracting the - * exponent, which is done here by simple and portable (but - * potentially slower) methods. - */ - -#include -#include -#include -#include - -#include "tgamma128.h" - -#define lenof(x) (sizeof(x)/sizeof(*(x))) - -/* - * Helper routine to evaluate a polynomial via Horner's rule - */ -static long double poly(const long double *coeffs, size_t n, long double x) -{ - long double result = coeffs[--n]; - - while (n > 0) - result = (result * x) + coeffs[--n]; - - return result; -} - -/* - * Compute sin(pi*x) / pi, for use in the reflection formula that - * relates gamma(-x) and gamma(x). - */ -static long double sin_pi_x_over_pi(long double x) -{ - int quo; - long double fracpart = remquol(x, 0.5L, &quo); - - long double sign = 1.0L; - if (quo & 2) - sign = -sign; - quo &= 1; - - if (quo == 0 && fabsl(fracpart) < 0x1.p-58L) { - /* For numbers this size, sin(pi*x) is so close to pi*x that - * sin(pi*x)/pi is indistinguishable from x in float128 */ - return sign * fracpart; - } - - if (quo == 0) { - return sign * sinl(pi*fracpart) / pi; - } else { - return sign * cosl(pi*fracpart) / pi; - } -} - -/* Return tgamma(x) on the assumption that x >= 8. 
*/ -static long double tgamma_large(long double x, - bool negative, long double negadjust) -{ - /* - * In this range we compute gamma(x) as x^(x-1/2) * e^-x * K, - * where K is a correction factor computed as a polynomial in 1/x. - * - * (Vaguely inspired by the form of the Lanczos approximation, but - * I tried the Lanczos approximation itself and it suffers badly - * from big cancellation leading to loss of significance.) - */ - long double t = 1/x; - long double p = poly(coeffs_large, lenof(coeffs_large), t); - - /* - * To avoid overflow in cases where x^(x-0.5) does overflow - * but gamma(x) does not, we split x^(x-0.5) in half and - * multiply back up _after_ multiplying the shrinking factor - * of exp(-(x-0.5)). - * - * Note that computing x-0.5 and (x-0.5)/2 is exact for the - * relevant range of x, so the only sources of error are pow - * and exp themselves, plus the multiplications. - */ - long double powhalf = powl(x, (x-0.5L)/2.0L); - long double expret = expl(-(x-0.5L)); - - if (!negative) { - return (expret * powhalf) * powhalf * p; - } else { - /* - * Apply the reflection formula as commented below, but - * carefully: negadjust has magnitude less than 1, so it can - * turn a case where gamma(+x) would overflow into a case - * where gamma(-x) doesn't underflow. Not only that, but the - * FP format has greater range in the tiny domain due to - * denormals. For both reasons, it's not good enough to - * compute the positive result and then adjust it. - */ - long double ret = 1 / ((expret * powhalf) * (x * negadjust) * p); - return ret / powhalf; - } -} - -/* Return tgamma(x) on the assumption that 0 <= x < 1/32. */ -static long double tgamma_tiny(long double x, - bool negative, long double negadjust) -{ - /* - * For x near zero, we use a polynomial approximation to - * g = 1/(x*gamma(x)), and then return 1/(g*x). - */ - long double g = poly(coeffs_tiny, lenof(coeffs_tiny), x); - if (!negative) - return 1.0L / (g*x); - else - return g / negadjust; -} - -/* Return tgamma(x) on the assumption that 0 <= x < 2^-113. */ -static long double tgamma_ultratiny(long double x, bool negative, - long double negadjust) -{ - /* On this interval, gamma can't even be distinguished from 1/x, - * so we skip the polynomial evaluation in tgamma_tiny, partly to - * save time and partly to avoid the tiny intermediate values - * setting the underflow exception flag. */ - if (!negative) - return 1.0L / x; - else - return 1.0L / negadjust; -} - -/* Return tgamma(x) on the assumption that 1 <= x <= 2. */ -static long double tgamma_central(long double x) -{ - /* - * In this central interval, our strategy is to finding the - * difference between x and the point where gamma has a minimum, - * and approximate based on that. - */ - - /* The difference between the input x and the minimum x. The first - * subtraction is expected to be exact, since x and min_hi have - * the same exponent (unless x=2, in which case it will still be - * exact). */ - long double t = (x - min_x_hi) - min_x_lo; - - /* - * Now use two different polynomials for the intervals [1,m] and - * [m,2]. - */ - long double p; - if (t < 0) - p = poly(coeffs_central_neg, lenof(coeffs_central_neg), -t); - else - p = poly(coeffs_central_pos, lenof(coeffs_central_pos), t); - - return (min_y_lo + p * (t*t)) + min_y_hi; -} - -long double tgamma128(long double x) -{ - /* - * Start by extracting the number's sign and exponent, and ruling - * out cases of non-normalized numbers. 
- * - * For an implementation integrated into a system libm, it would - * almost certainly be quicker to do this by direct bitwise access - * to the input float128 value, using whatever is the local idiom - * for knowing its endianness. - * - * Integration into a system libc may also need to worry about - * setting errno, if that's the locally preferred way to report - * math.h errors. - */ - int sign = signbit(x); - int exponent; - switch (fpclassify(x)) { - case FP_NAN: - return x+x; /* propagate QNaN, make SNaN throw an exception */ - case FP_ZERO: - return 1/x; /* divide by zero on purpose to indicate a pole */ - case FP_INFINITE: - if (sign) { - return x-x; /* gamma(-inf) has indeterminate sign, so provoke an - * IEEE invalid operation exception to indicate that */ - } - return x; /* but gamma(+inf) is just +inf with no error */ - case FP_SUBNORMAL: - exponent = -16384; - break; - default: - frexpl(x, &exponent); - exponent--; - break; - } - - bool negative = false; - long double negadjust = 0.0L; - - if (sign) { - /* - * Euler's reflection formula is - * - * gamma(1-x) gamma(x) = pi/sin(pi*x) - * - * pi - * => gamma(x) = -------------------- - * gamma(1-x) sin(pi*x) - * - * But computing 1-x is going to lose a lot of accuracy when x - * is very small, so instead we transform using the recurrence - * gamma(t+1)=t gamma(t). Setting t=-x, this gives us - * gamma(1-x) = -x gamma(-x), so we now have - * - * pi - * gamma(x) = ---------------------- - * -x gamma(-x) sin(pi*x) - * - * which relates gamma(x) to gamma(-x), which is much nicer, - * since x can be turned into -x without rounding. - */ - negadjust = sin_pi_x_over_pi(x); - negative = true; - x = -x; - - /* - * Now the ultimate answer we want is - * - * 1 / (gamma(x) * x * negadjust) - * - * where x is the positive value we've just turned it into. - * - * For some of the cases below, we'll compute gamma(x) - * normally and then compute this adjusted value afterwards. - * But for others, we can implement the reciprocal operation - * in this formula by _avoiding_ an inversion that the - * sub-case was going to do anyway. - */ - - if (negadjust == 0) { - /* - * Special case for negative integers. Applying the - * reflection formula would cause division by zero, but - * standards would prefer we treat this error case as an - * invalid operation and return NaN instead. (Possibly - * because otherwise you'd have to decide which sign of - * infinity to return, and unlike the x=0 case, there's no - * sign of zero available to disambiguate.) - */ - return negadjust / negadjust; - } - } - - /* - * Split the positive domain into various cases. For cases where - * we do the negative-number adjustment the usual way, we'll leave - * the answer in 'g' and drop out of the if statement. - */ - long double g; - - if (exponent >= 11) { - /* - * gamma of any positive value this large overflows, and gamma - * of any negative value underflows. 
- */ - if (!negative) { - long double huge = 0x1p+12288L; - return huge * huge; /* provoke an overflow */ - } else { - long double tiny = 0x1p-12288L; - return tiny * tiny * negadjust; /* underflow, of the right sign */ - } - } else if (exponent >= 3) { - /* Negative-number adjustment happens inside here */ - return tgamma_large(x, negative, negadjust); - } else if (exponent < -113) { - /* Negative-number adjustment happens inside here */ - return tgamma_ultratiny(x, negative, negadjust); - } else if (exponent < -5) { - /* Negative-number adjustment happens inside here */ - return tgamma_tiny(x, negative, negadjust); - } else if (exponent == 0) { - g = tgamma_central(x); - } else if (exponent < 0) { - /* - * For x in [1/32,1) we range-reduce upwards to the interval - * [1,2), using the inverse of the normal recurrence formula: - * gamma(x) = gamma(x+1)/x. - */ - g = tgamma_central(1+x) / x; - } else { - /* - * For x in [2,8) we range-reduce downwards to the interval - * [1,2) by repeated application of the recurrence formula. - * - * Actually multiplying (x-1) by (x-2) by (x-3) and so on - * would introduce multiple ULPs of rounding error. We can get - * better accuracy by writing x = (k+1/2) + t, where k is an - * integer and |t|<1/2, and expanding out the obvious factor - * (x-1)(x-2)...(x-k+1) as a polynomial in t. - */ - long double mult; - int i = x; - if (i == 2) { /* x in [2,3) */ - mult = (x-1); - } else { - long double t = x - (i + 0.5L); - switch (i) { - /* E.g. for x=3.5+t, we want - * (x-1)(x-2) = (2.5+t)(1.5+t) = 3.75 + 4t + t^2 */ - case 3: - mult = 3.75L+t*(4.0L+t); - break; - case 4: - mult = 13.125L+t*(17.75L+t*(7.5L+t)); - break; - case 5: - mult = 59.0625L+t*(93.0L+t*(51.50L+t*(12.0L+t))); - break; - case 6: - mult = 324.84375L+t*(570.5625L+t*(376.250L+t*( - 117.5L+t*(17.5L+t)))); - break; - case 7: - mult = 2111.484375L+t*(4033.5L+t*(3016.1875L+t*( - 1140.0L+t*(231.25L+t*(24.0L+t))))); - break; - } - } - - g = tgamma_central(x - (i-1)) * mult; - } - - if (!negative) { - /* Positive domain: return g unmodified */ - return g; - } else { - /* Negative domain: apply the reflection formula as commented above */ - return 1.0L / (g * x * negadjust); - } -} diff --git a/math/tgamma128.h b/math/tgamma128.h deleted file mode 100644 index ced10c3..0000000 --- a/math/tgamma128.h +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Polynomial coefficients and other constants for tgamma128.c. - * - * Copyright (c) 2006,2009,2023 Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -/* The largest positive value for which 128-bit tgamma does not overflow. 
*/ -static const long double max_x = 0x1.b6e3180cd66a5c4206f128ba77f4p+10L; - -/* Coefficients of the polynomial used in the tgamma_large() subroutine */ -static const long double coeffs_large[] = { - 0x1.8535745aa79569579b9eec0f3bbcp+0L, - 0x1.0378f83c6fb8f0e51269f2b4a973p-3L, - 0x1.59f6a05094f69686c3380f4e2783p-8L, - -0x1.0b291dee952a82764a4859b081a6p-8L, - -0x1.6dd301b2205bf936b5a3eaad0dbbp-12L, - 0x1.387a8b5f38dd77e7f139b1021e86p-10L, - 0x1.bca46637f65b13750c728cc29e40p-14L, - -0x1.d80401c00aef998c9e303151a51cp-11L, - -0x1.49cb6bb09f935a2053ccc2cf3711p-14L, - 0x1.4e950204437dcaf2be77f73a6f45p-10L, - 0x1.cb711a2d65f188bf60110934d6bep-14L, - -0x1.7d7ff4bc95dc7faefc5e767f70f1p-9L, - -0x1.0305ab9760cddb0d833e73766836p-12L, - 0x1.3ef6c84bf1cd5c3f65ac2693bb5bp-7L, - 0x1.bb4144740ad9290123fdcea684aap-11L, - -0x1.72ab4e88272a229bfafd192450f0p-5L, - 0x1.80c70ac6eb3b7a698983d25a62b8p-12L, - 0x1.e222791c6743ce3e3cae220fb236p-3L, - 0x1.1a2dca1c82a9326c52b465f7cb7ap-2L, - -0x1.9d204fa235a42cd901b123d2ad47p+1L, - 0x1.55b56d1158f77ddb1c95fc44ab02p+0L, - 0x1.37f900a11dbd892abd7dde533e2dp+5L, - -0x1.2da49f4188dd89cb958369ef2401p+7L, - 0x1.fdae5ec3ec6eb7dffc09edbe6c14p+7L, - -0x1.61433cebe649098c9611c4c7774ap+7L, -}; - -/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */ -static const long double coeffs_tiny[] = { - 0x1.0000000000000000000000000000p+0L, - 0x1.2788cfc6fb618f49a37c7f0201fep-1L, - -0x1.4fcf4026afa2dceb8490ade22796p-1L, - -0x1.5815e8fa27047c8f42b5d9217244p-5L, - 0x1.5512320b43fbe5dfa771333518f7p-3L, - -0x1.59af103c340927bffdd44f954bfcp-5L, - -0x1.3b4af28483e210479657e5543366p-7L, - 0x1.d919c527f6070bfce9b29c2ace9cp-8L, - -0x1.317112ce35337def3556a18aa178p-10L, - -0x1.c364fe77a6f27677b985b1fa2e1dp-13L, - 0x1.0c8a7a19a3fd40fe1f7e867efe7bp-13L, - -0x1.51cf9f090b5dc398ba86305e3634p-16L, - -0x1.4e80f64c04a339740de06ca9fa4ap-20L, - 0x1.241ddc2aef2ec20e58b08f2fda17p-20L, -}; - -/* The location within the interval [1,2] where gamma has a minimum. - * Specified as the sum of two 128-bit values, for extra precision. */ -static const long double min_x_hi = 0x1.762d86356be3f6e1a9c8865e0a4fp+0L; -static const long double min_x_lo = 0x1.ac54d7d218de21303a7c60f08840p-118L; - -/* The actual minimum value that gamma takes at that location. - * Again specified as the sum of two 128-bit values. 
*/ -static const long double min_y_hi = 0x1.c56dc82a74aee8d8851566d40f32p-1L; -static const long double min_y_lo = 0x1.8ed98685742c353ce55e5794686fp-114L; - -/* Coefficients of the polynomial used in the tgamma_central() subroutine - * for computing gamma on the interval [1,min_x] */ -static const long double coeffs_central_neg[] = { - 0x1.b6c53f7377b83839c8a292e43b69p-2L, - 0x1.0bae9f40c7d09ed76e732045850ap-3L, - 0x1.4981175e14d04c3530e51d01c5fep-3L, - 0x1.79f77aaf032c948af3a9edbd2061p-4L, - 0x1.1e97bd10821095a5b79fbfdfa1a3p-4L, - 0x1.8071ce0935e4dcf0b33b0fbec7c1p-5L, - 0x1.0b44c2f92982f887b55ec36dfdb0p-5L, - 0x1.6df1de1e178ef72ca7bd63d40870p-6L, - 0x1.f63f502bde27e81c0f5e13479b43p-7L, - 0x1.57fd67d901f40ea011353ad89a0ap-7L, - 0x1.d7151376eed187eb753e2273cafcp-8L, - 0x1.427162b5c6ff1d904c71ef53e37cp-8L, - 0x1.b954b8c3a56cf93e49ef6538928ap-9L, - 0x1.2dff2ec26a3ae5cd3aaccae7a09ep-9L, - 0x1.9d35250d9b9378d9b59df734537ap-10L, - 0x1.1b2c0c48b9855a28f6dbd6fdff3cp-10L, - 0x1.7e0db39bb99cdb52b028d9359380p-11L, - 0x1.2164b5e1d364a0b5eaf97c436aa7p-11L, - 0x1.27521cf5fd24dcdf43524e6add11p-13L, - 0x1.06461d62243bf9a826b42349672fp-10L, - -0x1.2b852abead28209b4e0c756dc46ep-9L, - 0x1.be673c11a72c826115ec6d286c14p-8L, - -0x1.fd9ce330c215c31fcd3cb53c42ebp-7L, - 0x1.fa362bd2dc68f41abef2d8600acdp-6L, - -0x1.a21585b2f52f8b23855de8e452edp-5L, - 0x1.1f234431ed032052fc92e64e0493p-4L, - -0x1.40d332476ca0199c60cdae3f9132p-4L, - 0x1.1d45dc665d86012eba2eea199cefp-4L, - -0x1.8491016cdd08dc9be7ade9b5fef3p-5L, - 0x1.7e7e2fbc6d49ad484300d6add324p-6L, - -0x1.e63fe3f874a37276a8d7d8b705ecp-8L, - 0x1.30a2a73944f8c84998314d69c23fp-10L, -}; - -/* Coefficients of the polynomial used in the tgamma_central() subroutine - * for computing gamma on the interval [min_x,2] */ -static const long double coeffs_central_pos[] = { - 0x1.b6c53f7377b83839c8a292e22aa2p-2L, - -0x1.0bae9f40c7d09ed76e72e1c955dep-3L, - 0x1.4981175e14d04c3530ee5e1ecebcp-3L, - -0x1.79f77aaf032c948ac983d77f3e07p-4L, - 0x1.1e97bd10821095ab7dc94936cc11p-4L, - -0x1.8071ce0935e4d7edef8cbf2a1cf1p-5L, - 0x1.0b44c2f929837fafef7b5d9e80f1p-5L, - -0x1.6df1de1e175fe2a51faa25cddbb4p-6L, - 0x1.f63f502be57d11aed2cfe90843ffp-7L, - -0x1.57fd67d852f230015b9f64770273p-7L, - 0x1.d715138adc07e5fce81077070357p-8L, - -0x1.4271618e9fda8992a667adb15f4fp-8L, - 0x1.b954d15d9eb772e80fdd760672d7p-9L, - -0x1.2dfe391241d3cb79c8c15182843dp-9L, - 0x1.9d44396fcd48451c3ba924cee814p-10L, - -0x1.1ac195fb99739e341589e39803e6p-10L, - 0x1.82e46127b68f002770826e25f146p-11L, - -0x1.089dacd90d9f41493119ac178359p-11L, - 0x1.6993c007b20394a057d21f3d37f8p-12L, - -0x1.ec43a709f4446560c099dec8e31bp-13L, - 0x1.4ba36322f4074e9add9450f003cap-13L, - -0x1.b3f83a977965ca1b7937bf5b34cap-14L, - 0x1.10af346abc09cb25a6d9fe810b6ep-14L, - -0x1.38d8ea1188f242f50203edc395bdp-15L, - 0x1.39add987a948ec56f62b721a4475p-16L, - -0x1.02a4e141f286c8a967e2df9bc9adp-17L, - 0x1.433b50af22425f546e87113062d7p-19L, - -0x1.0c7b73cb0013f00aafc103e8e382p-21L, - 0x1.b852de313ec38da2297f6deaa6b4p-25L, -}; - -/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine - */ -static const long double pi = 0x1.921fb54442d18469898cc51701b8p+1L; diff --git a/math/tools/cos.sollya b/math/tools/cos.sollya index 6690adf..bd72d6b 100644 --- a/math/tools/cos.sollya +++ b/math/tools/cos.sollya @@ -1,7 +1,7 @@ // polynomial for approximating cos(x) // // Copyright (c) 2019, Arm Limited. 
-// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT deg = 8; // polynomial degree a = -pi/4; // interval diff --git a/math/tools/exp.sollya b/math/tools/exp.sollya index 0668bdb..b7a462c 100644 --- a/math/tools/exp.sollya +++ b/math/tools/exp.sollya @@ -1,7 +1,7 @@ // polynomial for approximating e^x // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT deg = 5; // poly degree N = 128; // table entries diff --git a/math/tools/exp2.sollya b/math/tools/exp2.sollya index bd0a42d..e760769 100644 --- a/math/tools/exp2.sollya +++ b/math/tools/exp2.sollya @@ -1,7 +1,7 @@ // polynomial for approximating 2^x // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT // exp2f parameters deg = 3; // poly degree diff --git a/math/tools/log.sollya b/math/tools/log.sollya index 5288f55..6df4db4 100644 --- a/math/tools/log.sollya +++ b/math/tools/log.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT deg = 12; // poly degree // |log(1+x)| > 0x1p-4 outside the interval diff --git a/math/tools/log2.sollya b/math/tools/log2.sollya index 85811be..4a364c0 100644 --- a/math/tools/log2.sollya +++ b/math/tools/log2.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log2(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT deg = 11; // poly degree // |log2(1+x)| > 0x1p-4 outside the interval diff --git a/math/tools/log2_abs.sollya b/math/tools/log2_abs.sollya index d018ba0..82c4dac 100644 --- a/math/tools/log2_abs.sollya +++ b/math/tools/log2_abs.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log2(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT deg = 7; // poly degree // interval ~= 1/(2*N), where N is the table entries diff --git a/math/tools/log_abs.sollya b/math/tools/log_abs.sollya index 5f9bfe4..a2ac190 100644 --- a/math/tools/log_abs.sollya +++ b/math/tools/log_abs.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT deg = 6; // poly degree // interval ~= 1/(2*N), where N is the table entries diff --git a/math/tools/plot.py b/math/tools/plot.py index a0fa023..6c8b89f 100755 --- a/math/tools/plot.py +++ b/math/tools/plot.py @@ -3,7 +3,7 @@ # ULP error plot tool. # # Copyright (c) 2019, Arm Limited. -# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +# SPDX-License-Identifier: MIT import numpy as np import matplotlib.pyplot as plt diff --git a/math/tools/remez.jl b/math/tools/remez.jl index 1deab67..2ff436f 100755 --- a/math/tools/remez.jl +++ b/math/tools/remez.jl @@ -4,7 +4,7 @@ # remez.jl - implementation of the Remez algorithm for polynomial approximation # # Copyright (c) 2015-2019, Arm Limited. 
-# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +# SPDX-License-Identifier: MIT import Base.\ diff --git a/math/tools/sin.sollya b/math/tools/sin.sollya index a193000..a6e8511 100644 --- a/math/tools/sin.sollya +++ b/math/tools/sin.sollya @@ -1,7 +1,7 @@ // polynomial for approximating sin(x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT deg = 7; // polynomial degree a = -pi/4; // interval diff --git a/math/tools/tgamma128_gen.jl b/math/tools/tgamma128_gen.jl deleted file mode 100644 index da76e8b..0000000 --- a/math/tools/tgamma128_gen.jl +++ /dev/null @@ -1,212 +0,0 @@ -# -*- julia -*- -# -# Generate tgamma128.h, containing polynomials and constants used by -# tgamma128.c. -# -# Copyright (c) 2006,2009,2023 Arm Limited. -# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - -# This Julia program depends on the 'Remez' and 'SpecialFunctions' -# library packages. To install them, run this at the interactive Julia -# prompt: -# -# import Pkg; Pkg.add(["Remez", "SpecialFunctions"]) -# -# Tested on Julia 1.4.1 (Ubuntu 20.04) and 1.9.0 (22.04). - -import Printf -import Remez -import SpecialFunctions - -# Round a BigFloat to 128-bit long double and format it as a C99 hex -# float literal. -function quadhex(x) - sign = " " - if x < 0 - sign = "-" - x = -x - end - - exponent = BigInt(floor(log2(x))) - exponent = max(exponent, -16382) - @assert(exponent <= 16383) # else overflow - - x /= BigFloat(2)^exponent - @assert(1 <= x < 2) - x *= BigFloat(2)^112 - mantissa = BigInt(round(x)) - - mantstr = string(mantissa, base=16, pad=29) - return Printf.@sprintf("%s0x%s.%sp%+dL", sign, mantstr[1], mantstr[2:end], - exponent) -end - -# Round a BigFloat to 128-bit long double and return it still as a -# BigFloat. -function quadval(x, round=0) - sign = +1 - if x.sign < 0 - sign = -1 - x = -x - end - - exponent = BigInt(floor(log2(x))) - exponent = max(exponent, -16382) - @assert(exponent <= 16383) # else overflow - - x /= BigFloat(2)^exponent - @assert(1 <= x < 2) - x *= BigFloat(2)^112 - if round < 0 - mantissa = floor(x) - elseif round > 0 - mantissa = ceil(x) - else - mantissa = round(x) - end - - return sign * mantissa * BigFloat(2)^(exponent - 112) -end - -# Output an array of BigFloats as a C array declaration. -function dumparray(a, name) - println("static const long double ", name, "[] = {") - for x in N - println(" ", quadhex(x), ",") - end - println("};") -end - -print("/* - * Polynomial coefficients and other constants for tgamma128.c. - * - * Copyright (c) 2006,2009,2023 Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -") - -Base.MPFR.setprecision(512) - -e = exp(BigFloat(1)) - -print(" -/* The largest positive value for which 128-bit tgamma does not overflow. */ -") -lo = BigFloat("1000") -hi = BigFloat("2000") -while true - global lo - global hi - global max_x - - mid = (lo + hi) / 2 - if mid == lo || mid == hi - max_x = mid - break - end - if SpecialFunctions.logabsgamma(mid)[1] < 16384 * log(BigFloat(2)) - lo = mid - else - hi = mid - end -end -max_x = quadval(max_x, -1) -println("static const long double max_x = ", quadhex(max_x), ";") - -print(" -/* Coefficients of the polynomial used in the tgamma_large() subroutine */ -") -N, D, E, X = Remez.ratfn_minimax( - x -> x==0 ? 
sqrt(BigFloat(2)*pi/e) : - exp(SpecialFunctions.logabsgamma(1/x)[1] + - (1/x-0.5)*(1+log(x))), - (0, 1/BigFloat(8)), - 24, 0, - (x, y) -> 1/y -) -dumparray(N, "coeffs_large") - -print(" -/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */ -") -N, D, E, X = Remez.ratfn_minimax( - x -> x==0 ? 1 : 1/(x*SpecialFunctions.gamma(x)), - (0, 1/BigFloat(32)), - 13, 0, -) -dumparray(N, "coeffs_tiny") - -print(" -/* The location within the interval [1,2] where gamma has a minimum. - * Specified as the sum of two 128-bit values, for extra precision. */ -") -lo = BigFloat("1.4") -hi = BigFloat("1.5") -while true - global lo - global hi - global min_x - - mid = (lo + hi) / 2 - if mid == lo || mid == hi - min_x = mid - break - end - if SpecialFunctions.digamma(mid) < 0 - lo = mid - else - hi = mid - end -end -min_x_hi = quadval(min_x, -1) -println("static const long double min_x_hi = ", quadhex(min_x_hi), ";") -println("static const long double min_x_lo = ", quadhex(min_x - min_x_hi), ";") - -print(" -/* The actual minimum value that gamma takes at that location. - * Again specified as the sum of two 128-bit values. */ -") -min_y = SpecialFunctions.gamma(min_x) -min_y_hi = quadval(min_y, -1) -println("static const long double min_y_hi = ", quadhex(min_y_hi), ";") -println("static const long double min_y_lo = ", quadhex(min_y - min_y_hi), ";") - -function taylor_bodge(x) - # Taylor series generated by Wolfram Alpha for (gamma(min_x+x)-min_y)/x^2. - # Used in the Remez calls below for x values very near the origin, to avoid - # significance loss problems when trying to compute it directly via that - # formula (even in MPFR's extra precision). - return BigFloat("0.428486815855585429730209907810650582960483696962660010556335457558784421896667728014324097132413696263704801646004585959298743677879606168187061990204432200")+x*(-BigFloat("0.130704158939785761928008749242671025181542078105370084716141350308119418619652583986015464395882363802104154017741656168641240436089858504560718773026275797")+x*(BigFloat("0.160890753325112844190519489594363387594505844658437718135952967735294789599989664428071656484587979507034160383271974554122934842441540146372016567834062876")+x*(-BigFloat("0.092277030213334350126864106458600575084335085690780082222880945224248438672595248111704471182201673989215223667543694847795410779036800385804729955729659506")))) -end - -print(" -/* Coefficients of the polynomial used in the tgamma_central() subroutine - * for computing gamma on the interval [1,min_x] */ -") -N, D, E, X = Remez.ratfn_minimax( - x -> x < BigFloat(0x1p-64) ? taylor_bodge(-x) : - (SpecialFunctions.gamma(min_x - x) - min_y) / (x*x), - (0, min_x - 1), - 31, 0, - (x, y) -> x^2, -) -dumparray(N, "coeffs_central_neg") - -print(" -/* Coefficients of the polynomial used in the tgamma_central() subroutine - * for computing gamma on the interval [min_x,2] */ -") -N, D, E, X = Remez.ratfn_minimax( - x -> x < BigFloat(0x1p-64) ? taylor_bodge(x) : - (SpecialFunctions.gamma(min_x + x) - min_y) / (x*x), - (0, 2 - min_x), - 28, 0, - (x, y) -> x^2, -) -dumparray(N, "coeffs_central_pos") - -print(" -/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine - */ -") -println("static const long double pi = ", quadhex(BigFloat(pi)), ";") diff --git a/math/tools/v_exp.sollya b/math/tools/v_exp.sollya index 5fa7de7..c0abb63 100644 --- a/math/tools/v_exp.sollya +++ b/math/tools/v_exp.sollya @@ -1,7 +1,7 @@ // polynomial for approximating e^x // // Copyright (c) 2019, Arm Limited. 
-// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT deg = 4; // poly degree N = 128; // table entries diff --git a/math/tools/v_log.sollya b/math/tools/v_log.sollya index d982524..cc3d2c4 100644 --- a/math/tools/v_log.sollya +++ b/math/tools/v_log.sollya @@ -1,7 +1,7 @@ // polynomial used for __v_log(x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT deg = 6; // poly degree a = -0x1.fc1p-9; diff --git a/math/tools/v_sin.sollya b/math/tools/v_sin.sollya index 63b9d65..65cc995 100644 --- a/math/tools/v_sin.sollya +++ b/math/tools/v_sin.sollya @@ -1,7 +1,7 @@ // polynomial for approximating sin(x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT deg = 15; // polynomial degree a = -pi/2; // interval diff --git a/math/v_cos.c b/math/v_cos.c new file mode 100644 index 0000000..20ba6bd --- /dev/null +++ b/math/v_cos.c @@ -0,0 +1,87 @@ +/* + * Double-precision vector cos function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +static const double Poly[] = { +/* worst-case error is 3.5 ulp. + abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. */ +-0x1.9f4a9c8b21dc9p-41, + 0x1.60e88a10163f2p-33, +-0x1.ae6361b7254e7p-26, + 0x1.71de382e8d62bp-19, +-0x1.a01a019aeb4ffp-13, + 0x1.111111110b25ep-7, +-0x1.55555555554c3p-3, +}; + +#define C7 v_f64 (Poly[0]) +#define C6 v_f64 (Poly[1]) +#define C5 v_f64 (Poly[2]) +#define C4 v_f64 (Poly[3]) +#define C3 v_f64 (Poly[4]) +#define C2 v_f64 (Poly[5]) +#define C1 v_f64 (Poly[6]) + +#define InvPi v_f64 (0x1.45f306dc9c883p-2) +#define HalfPi v_f64 (0x1.921fb54442d18p+0) +#define Pi1 v_f64 (0x1.921fb54442d18p+1) +#define Pi2 v_f64 (0x1.1a62633145c06p-53) +#define Pi3 v_f64 (0x1.c1cd129024e09p-106) +#define Shift v_f64 (0x1.8p52) +#define RangeVal v_f64 (0x1p23) +#define AbsMask v_u64 (0x7fffffffffffffff) + +VPCS_ATTR +__attribute__ ((noinline)) static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +{ + return v_call_f64 (cos, x, y, cmp); +} + +VPCS_ATTR +v_f64_t +V_NAME(cos) (v_f64_t x) +{ + v_f64_t n, r, r2, y; + v_u64_t odd, cmp; + + r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask); + cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal)); + + /* n = rint((|x|+pi/2)/pi) - 0.5. */ + n = v_fma_f64 (InvPi, r + HalfPi, Shift); + odd = v_as_u64_f64 (n) << 63; + n -= Shift; + n -= v_f64 (0.5); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = v_fma_f64 (-Pi1, n, r); + r = v_fma_f64 (-Pi2, n, r); + r = v_fma_f64 (-Pi3, n, r); + + /* sin(r) poly approx. */ + r2 = r * r; + y = v_fma_f64 (C7, r2, C6); + y = v_fma_f64 (y, r2, C5); + y = v_fma_f64 (y, r2, C4); + y = v_fma_f64 (y, r2, C3); + y = v_fma_f64 (y, r2, C2); + y = v_fma_f64 (y, r2, C1); + y = v_fma_f64 (y * r2, r, r); + + /* sign. */ + y = v_as_f64_u64 (v_as_u64_f64 (y) ^ odd); + + if (unlikely (v_any_u64 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS +#endif diff --git a/math/v_cosf.c b/math/v_cosf.c new file mode 100644 index 0000000..150294b --- /dev/null +++ b/math/v_cosf.c @@ -0,0 +1,76 @@ +/* + * Single-precision vector cos function. + * + * Copyright (c) 2019, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +static const float Poly[] = { + /* 1.886 ulp error */ + 0x1.5b2e76p-19f, + -0x1.9f42eap-13f, + 0x1.110df4p-7f, + -0x1.555548p-3f, +}; +#define Pi1 v_f32 (0x1.921fb6p+1f) +#define Pi2 v_f32 (-0x1.777a5cp-24f) +#define Pi3 v_f32 (-0x1.ee59dap-49f) +#define A3 v_f32 (Poly[3]) +#define A5 v_f32 (Poly[2]) +#define A7 v_f32 (Poly[1]) +#define A9 v_f32 (Poly[0]) +#define RangeVal v_f32 (0x1p20f) +#define InvPi v_f32 (0x1.45f306p-2f) +#define Shift v_f32 (0x1.8p+23f) +#define AbsMask v_u32 (0x7fffffff) +#define HalfPi v_f32 (0x1.921fb6p0f) + +VPCS_ATTR +static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + /* Fall back to scalar code. */ + return v_call_f32 (cosf, x, y, cmp); +} + +VPCS_ATTR +v_f32_t +V_NAME(cosf) (v_f32_t x) +{ + v_f32_t n, r, r2, y; + v_u32_t odd, cmp; + + r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask); + cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal)); + + /* n = rint((|x|+pi/2)/pi) - 0.5 */ + n = v_fma_f32 (InvPi, r + HalfPi, Shift); + odd = v_as_u32_f32 (n) << 31; + n -= Shift; + n -= v_f32 (0.5f); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ + r = v_fma_f32 (-Pi1, n, r); + r = v_fma_f32 (-Pi2, n, r); + r = v_fma_f32 (-Pi3, n, r); + + /* y = sin(r) */ + r2 = r * r; + y = v_fma_f32 (A9, r2, A7); + y = v_fma_f32 (y, r2, A5); + y = v_fma_f32 (y, r2, A3); + y = v_fma_f32 (y * r2, r, r); + + /* sign fix */ + y = v_as_f32_u32 (v_as_u32_f32 (y) ^ odd); + + if (unlikely (v_any_u32 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS +#endif diff --git a/math/v_exp.c b/math/v_exp.c new file mode 100644 index 0000000..e459d53 --- /dev/null +++ b/math/v_exp.c @@ -0,0 +1,94 @@ +/* + * Double-precision vector e^x function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED +#include "v_exp.h" + +#if V_EXP_TABLE_BITS == 7 +/* maxerr: 1.88 +0.5 ulp + rel error: 1.4337*2^-53 + abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */ +#define C1 v_f64 (0x1.ffffffffffd43p-2) +#define C2 v_f64 (0x1.55555c75adbb2p-3) +#define C3 v_f64 (0x1.55555da646206p-5) +#define InvLn2 v_f64 (0x1.71547652b82fep7) /* N/ln2. */ +#define Ln2hi v_f64 (0x1.62e42fefa39efp-8) /* ln2/N. */ +#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-63) +#elif V_EXP_TABLE_BITS == 8 +/* maxerr: 0.54 +0.5 ulp + rel error: 1.4318*2^-58 + abs error: 1.4299*2^-58 in [ -ln2/512, ln2/512 ]. */ +#define C1 v_f64 (0x1.fffffffffffd4p-2) +#define C2 v_f64 (0x1.5555571d6b68cp-3) +#define C3 v_f64 (0x1.5555576a59599p-5) +#define InvLn2 v_f64 (0x1.71547652b82fep8) +#define Ln2hi v_f64 (0x1.62e42fefa39efp-9) +#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-64) +#endif + +#define N (1 << V_EXP_TABLE_BITS) +#define Tab __v_exp_data +#define IndexMask v_u64 (N - 1) +#define Shift v_f64 (0x1.8p+52) +#define Thres v_f64 (704.0) + +VPCS_ATTR +static v_f64_t +specialcase (v_f64_t s, v_f64_t y, v_f64_t n) +{ + v_f64_t absn = v_abs_f64 (n); + + /* 2^(n/N) may overflow, break it up into s1*s2. 
*/ + v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000); + v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b); + v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b); + v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N)); + v_f64_t r1 = s1 * s1; + v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1; + return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0))); +} + +VPCS_ATTR +v_f64_t +V_NAME(exp) (v_f64_t x) +{ + v_f64_t n, r, r2, s, y, z; + v_u64_t cmp, u, e, i; + + cmp = v_cond_u64 (v_abs_f64 (x) > Thres); + + /* n = round(x/(ln2/N)). */ + z = v_fma_f64 (x, InvLn2, Shift); + u = v_as_u64_f64 (z); + n = z - Shift; + + /* r = x - n*ln2/N. */ + r = x; + r = v_fma_f64 (-Ln2hi, n, r); + r = v_fma_f64 (-Ln2lo, n, r); + + e = u << (52 - V_EXP_TABLE_BITS); + i = u & IndexMask; + + /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ + r2 = r * r; + y = v_fma_f64 (C2, r, C1); + y = v_fma_f64 (C3, r2, y); + y = v_fma_f64 (y, r2, r); + + /* s = 2^(n/N). */ + u = v_lookup_u64 (Tab, i); + s = v_as_f64_u64 (u + e); + + if (unlikely (v_any_u64 (cmp))) + return specialcase (s, y, n); + return v_fma_f64 (y, s, s); +} +VPCS_ALIAS +#endif diff --git a/math/v_exp.h b/math/v_exp.h new file mode 100644 index 0000000..305da19 --- /dev/null +++ b/math/v_exp.h @@ -0,0 +1,14 @@ +/* + * Declarations for double-precision e^x vector function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "v_math.h" +#if WANT_VMATH + +#define V_EXP_TABLE_BITS 7 + +extern const u64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN; +#endif diff --git a/math/v_exp2f.c b/math/v_exp2f.c new file mode 100644 index 0000000..e3ea5af --- /dev/null +++ b/math/v_exp2f.c @@ -0,0 +1,78 @@ +/* + * Single-precision vector 2^x function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +static const float Poly[] = { + /* maxerr: 1.962 ulp. */ + 0x1.59977ap-10f, + 0x1.3ce9e4p-7f, + 0x1.c6bd32p-5f, + 0x1.ebf9bcp-3f, + 0x1.62e422p-1f, +}; +#define C0 v_f32 (Poly[0]) +#define C1 v_f32 (Poly[1]) +#define C2 v_f32 (Poly[2]) +#define C3 v_f32 (Poly[3]) +#define C4 v_f32 (Poly[4]) + +#define Shift v_f32 (0x1.8p23f) + +VPCS_ATTR +static v_f32_t +specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) +{ + /* 2^n may overflow, break it up into s1*s2. */ + v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); + v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); + v_f32_t s2 = v_as_f32_u32 (e - b); + v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); + v_u32_t r2 = v_as_u32_f32 (s1 * s1); + v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); + /* Similar to r1 but avoids double rounding in the subnormal range. */ + v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); + return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); +} + +VPCS_ATTR +v_f32_t +V_NAME(exp2f) (v_f32_t x) +{ + v_f32_t n, r, r2, scale, p, q, poly, absn; + v_u32_t cmp, e; + + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. 
*/ +#if 0 + v_f32_t z; + z = x + Shift; + n = z - Shift; + r = x - n; + e = v_as_u32_f32 (z) << 23; +#else + n = v_round_f32 (x); + r = x - n; + e = v_as_u32_s32 (v_round_s32 (x)) << 23; +#endif + scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); + absn = v_abs_f32 (n); + cmp = v_cond_u32 (absn > v_f32 (126.0f)); + r2 = r * r; + p = v_fma_f32 (C0, r, C1); + q = v_fma_f32 (C2, r, C3); + q = v_fma_f32 (p, r2, q); + p = C4 * r; + poly = v_fma_f32 (q, r2, p); + if (unlikely (v_any_u32 (cmp))) + return specialcase (poly, n, e, absn, cmp, scale); + return v_fma_f32 (poly, scale, scale); +} +VPCS_ALIAS +#endif diff --git a/math/aarch64/v_exp2f_1u.c b/math/v_exp2f_1u.c similarity index 43% rename from math/aarch64/v_exp2f_1u.c rename to math/v_exp2f_1u.c index ba6b02f..1caa14d 100644 --- a/math/aarch64/v_exp2f_1u.c +++ b/math/v_exp2f_1u.c @@ -1,12 +1,13 @@ /* * Single-precision vector 2^x function. * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT */ #include "mathlib.h" #include "v_math.h" +#if V_SUPPORTED static const float Poly[] = { /* maxerr: 0.878 ulp. */ @@ -24,49 +25,51 @@ static const float Poly[] = { #define Ln2hi v_f32 (0x1.62e4p-1f) #define Ln2lo v_f32 (0x1.7f7d1cp-20f) -static float32x4_t VPCS_ATTR NOINLINE -specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn) +VPCS_ATTR +static v_f32_t +specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn) { /* 2^n may overflow, break it up into s1*s2. */ - uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000); - float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b); - float32x4_t s2 = vreinterpretq_f32_u32 (e - b); - uint32x4_t cmp = absn > v_f32 (192.0f); - float32x4_t r1 = s1 * s1; - float32x4_t r0 = poly * s1 * s2; - return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) - | (~cmp & vreinterpretq_u32_f32 (r0))); + v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000); + v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); + v_f32_t s2 = v_as_f32_u32 (e - b); + v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f)); + v_f32_t r1 = s1 * s1; + v_f32_t r0 = poly * s1 * s2; + return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0))); } -float32x4_t VPCS_ATTR -_ZGVnN4v_exp2f_1u (float32x4_t x) +VPCS_ATTR +v_f32_t +V_NAME(exp2f_1u) (v_f32_t x) { - float32x4_t n, r, scale, poly, absn; - uint32x4_t cmp, e; + v_f32_t n, r, scale, poly, absn; + v_u32_t cmp, e; /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] x = n + r, with r in [-1/2, 1/2]. 
*/ #if 0 - float32x4_t z; + v_f32_t z; z = x + Shift; n = z - Shift; r = x - n; - e = vreinterpretq_u32_f32 (z) << 23; + e = v_as_u32_f32 (z) << 23; #else - n = vrndaq_f32 (x); + n = v_round_f32 (x); r = x - n; - e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23; + e = v_as_u32_s32 (v_round_s32 (x)) << 23; #endif - scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000)); - absn = vabsq_f32 (n); - cmp = absn > v_f32 (126.0f); - poly = vfmaq_f32 (C1, C0, r); - poly = vfmaq_f32 (C2, poly, r); - poly = vfmaq_f32 (C3, poly, r); - poly = vfmaq_f32 (C4, poly, r); - poly = vfmaq_f32 (C5, poly, r); - poly = vfmaq_f32 (v_f32 (1.0f), poly, r); + scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); + absn = v_abs_f32 (n); + cmp = v_cond_u32 (absn > v_f32 (126.0f)); + poly = v_fma_f32 (C0, r, C1); + poly = v_fma_f32 (poly, r, C2); + poly = v_fma_f32 (poly, r, C3); + poly = v_fma_f32 (poly, r, C4); + poly = v_fma_f32 (poly, r, C5); + poly = v_fma_f32 (poly, r, v_f32 (1.0f)); if (unlikely (v_any_u32 (cmp))) return specialcase (poly, n, e, absn); return scale * poly; } +#endif diff --git a/math/v_exp_data.c b/math/v_exp_data.c new file mode 100644 index 0000000..3653554 --- /dev/null +++ b/math/v_exp_data.c @@ -0,0 +1,403 @@ +/* + * Lookup table for double-precision e^x vector function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "v_exp.h" +#if WANT_VMATH + +#define N (1 << V_EXP_TABLE_BITS) + +/* 2^(j/N), j=0..N. */ +const u64_t __v_exp_data[] = { +#if N == 128 +0x3ff0000000000000, +0x3feff63da9fb3335, +0x3fefec9a3e778061, +0x3fefe315e86e7f85, +0x3fefd9b0d3158574, +0x3fefd06b29ddf6de, +0x3fefc74518759bc8, +0x3fefbe3ecac6f383, +0x3fefb5586cf9890f, +0x3fefac922b7247f7, +0x3fefa3ec32d3d1a2, +0x3fef9b66affed31b, +0x3fef9301d0125b51, +0x3fef8abdc06c31cc, +0x3fef829aaea92de0, +0x3fef7a98c8a58e51, +0x3fef72b83c7d517b, +0x3fef6af9388c8dea, +0x3fef635beb6fcb75, +0x3fef5be084045cd4, +0x3fef54873168b9aa, +0x3fef4d5022fcd91d, +0x3fef463b88628cd6, +0x3fef3f49917ddc96, +0x3fef387a6e756238, +0x3fef31ce4fb2a63f, +0x3fef2b4565e27cdd, +0x3fef24dfe1f56381, +0x3fef1e9df51fdee1, +0x3fef187fd0dad990, +0x3fef1285a6e4030b, +0x3fef0cafa93e2f56, +0x3fef06fe0a31b715, +0x3fef0170fc4cd831, +0x3feefc08b26416ff, +0x3feef6c55f929ff1, +0x3feef1a7373aa9cb, +0x3feeecae6d05d866, +0x3feee7db34e59ff7, +0x3feee32dc313a8e5, +0x3feedea64c123422, +0x3feeda4504ac801c, +0x3feed60a21f72e2a, +0x3feed1f5d950a897, +0x3feece086061892d, +0x3feeca41ed1d0057, +0x3feec6a2b5c13cd0, +0x3feec32af0d7d3de, +0x3feebfdad5362a27, +0x3feebcb299fddd0d, +0x3feeb9b2769d2ca7, +0x3feeb6daa2cf6642, +0x3feeb42b569d4f82, +0x3feeb1a4ca5d920f, +0x3feeaf4736b527da, +0x3feead12d497c7fd, +0x3feeab07dd485429, +0x3feea9268a5946b7, +0x3feea76f15ad2148, +0x3feea5e1b976dc09, +0x3feea47eb03a5585, +0x3feea34634ccc320, +0x3feea23882552225, +0x3feea155d44ca973, +0x3feea09e667f3bcd, +0x3feea012750bdabf, +0x3fee9fb23c651a2f, +0x3fee9f7df9519484, +0x3fee9f75e8ec5f74, +0x3fee9f9a48a58174, +0x3fee9feb564267c9, +0x3feea0694fde5d3f, +0x3feea11473eb0187, +0x3feea1ed0130c132, +0x3feea2f336cf4e62, +0x3feea427543e1a12, +0x3feea589994cce13, +0x3feea71a4623c7ad, +0x3feea8d99b4492ed, +0x3feeaac7d98a6699, +0x3feeace5422aa0db, +0x3feeaf3216b5448c, +0x3feeb1ae99157736, +0x3feeb45b0b91ffc6, +0x3feeb737b0cdc5e5, +0x3feeba44cbc8520f, +0x3feebd829fde4e50, +0x3feec0f170ca07ba, +0x3feec49182a3f090, +0x3feec86319e32323, +0x3feecc667b5de565, +0x3feed09bec4a2d33, +0x3feed503b23e255d, +0x3feed99e1330b358, +0x3feede6b5579fdbf, +0x3feee36bbfd3f37a, +0x3feee89f995ad3ad, 
+0x3feeee07298db666, +0x3feef3a2b84f15fb, +0x3feef9728de5593a, +0x3feeff76f2fb5e47, +0x3fef05b030a1064a, +0x3fef0c1e904bc1d2, +0x3fef12c25bd71e09, +0x3fef199bdd85529c, +0x3fef20ab5fffd07a, +0x3fef27f12e57d14b, +0x3fef2f6d9406e7b5, +0x3fef3720dcef9069, +0x3fef3f0b555dc3fa, +0x3fef472d4a07897c, +0x3fef4f87080d89f2, +0x3fef5818dcfba487, +0x3fef60e316c98398, +0x3fef69e603db3285, +0x3fef7321f301b460, +0x3fef7c97337b9b5f, +0x3fef864614f5a129, +0x3fef902ee78b3ff6, +0x3fef9a51fbc74c83, +0x3fefa4afa2a490da, +0x3fefaf482d8e67f1, +0x3fefba1bee615a27, +0x3fefc52b376bba97, +0x3fefd0765b6e4540, +0x3fefdbfdad9cbe14, +0x3fefe7c1819e90d8, +0x3feff3c22b8f71f1, +#elif N == 256 +0x3ff0000000000000, +0x3feffb1afa5abcbf, +0x3feff63da9fb3335, +0x3feff168143b0281, +0x3fefec9a3e778061, +0x3fefe7d42e11bbcc, +0x3fefe315e86e7f85, +0x3fefde5f72f654b1, +0x3fefd9b0d3158574, +0x3fefd50a0e3c1f89, +0x3fefd06b29ddf6de, +0x3fefcbd42b72a836, +0x3fefc74518759bc8, +0x3fefc2bdf66607e0, +0x3fefbe3ecac6f383, +0x3fefb9c79b1f3919, +0x3fefb5586cf9890f, +0x3fefb0f145e46c85, +0x3fefac922b7247f7, +0x3fefa83b23395dec, +0x3fefa3ec32d3d1a2, +0x3fef9fa55fdfa9c5, +0x3fef9b66affed31b, +0x3fef973028d7233e, +0x3fef9301d0125b51, +0x3fef8edbab5e2ab6, +0x3fef8abdc06c31cc, +0x3fef86a814f204ab, +0x3fef829aaea92de0, +0x3fef7e95934f312e, +0x3fef7a98c8a58e51, +0x3fef76a45471c3c2, +0x3fef72b83c7d517b, +0x3fef6ed48695bbc0, +0x3fef6af9388c8dea, +0x3fef672658375d2f, +0x3fef635beb6fcb75, +0x3fef5f99f8138a1c, +0x3fef5be084045cd4, +0x3fef582f95281c6b, +0x3fef54873168b9aa, +0x3fef50e75eb44027, +0x3fef4d5022fcd91d, +0x3fef49c18438ce4d, +0x3fef463b88628cd6, +0x3fef42be3578a819, +0x3fef3f49917ddc96, +0x3fef3bdda27912d1, +0x3fef387a6e756238, +0x3fef351ffb82140a, +0x3fef31ce4fb2a63f, +0x3fef2e85711ece75, +0x3fef2b4565e27cdd, +0x3fef280e341ddf29, +0x3fef24dfe1f56381, +0x3fef21ba7591bb70, +0x3fef1e9df51fdee1, +0x3fef1b8a66d10f13, +0x3fef187fd0dad990, +0x3fef157e39771b2f, +0x3fef1285a6e4030b, +0x3fef0f961f641589, +0x3fef0cafa93e2f56, +0x3fef09d24abd886b, +0x3fef06fe0a31b715, +0x3fef0432edeeb2fd, +0x3fef0170fc4cd831, +0x3feefeb83ba8ea32, +0x3feefc08b26416ff, +0x3feef96266e3fa2d, +0x3feef6c55f929ff1, +0x3feef431a2de883b, +0x3feef1a7373aa9cb, +0x3feeef26231e754a, +0x3feeecae6d05d866, +0x3feeea401b7140ef, +0x3feee7db34e59ff7, +0x3feee57fbfec6cf4, +0x3feee32dc313a8e5, +0x3feee0e544ede173, +0x3feedea64c123422, +0x3feedc70df1c5175, +0x3feeda4504ac801c, +0x3feed822c367a024, +0x3feed60a21f72e2a, +0x3feed3fb2709468a, +0x3feed1f5d950a897, +0x3feecffa3f84b9d4, +0x3feece086061892d, +0x3feecc2042a7d232, +0x3feeca41ed1d0057, +0x3feec86d668b3237, +0x3feec6a2b5c13cd0, +0x3feec4e1e192aed2, +0x3feec32af0d7d3de, +0x3feec17dea6db7d7, +0x3feebfdad5362a27, +0x3feebe41b817c114, +0x3feebcb299fddd0d, +0x3feebb2d81d8abff, +0x3feeb9b2769d2ca7, +0x3feeb8417f4531ee, +0x3feeb6daa2cf6642, +0x3feeb57de83f4eef, +0x3feeb42b569d4f82, +0x3feeb2e2f4f6ad27, +0x3feeb1a4ca5d920f, +0x3feeb070dde910d2, +0x3feeaf4736b527da, +0x3feeae27dbe2c4cf, +0x3feead12d497c7fd, +0x3feeac0827ff07cc, +0x3feeab07dd485429, +0x3feeaa11fba87a03, +0x3feea9268a5946b7, +0x3feea84590998b93, +0x3feea76f15ad2148, +0x3feea6a320dceb71, +0x3feea5e1b976dc09, +0x3feea52ae6cdf6f4, +0x3feea47eb03a5585, +0x3feea3dd1d1929fd, +0x3feea34634ccc320, +0x3feea2b9febc8fb7, +0x3feea23882552225, +0x3feea1c1c70833f6, +0x3feea155d44ca973, +0x3feea0f4b19e9538, +0x3feea09e667f3bcd, +0x3feea052fa75173e, +0x3feea012750bdabf, +0x3fee9fdcddd47645, +0x3fee9fb23c651a2f, +0x3fee9f9298593ae5, +0x3fee9f7df9519484, +0x3fee9f7466f42e87, +0x3fee9f75e8ec5f74, 
+0x3fee9f8286ead08a, +0x3fee9f9a48a58174, +0x3fee9fbd35d7cbfd, +0x3fee9feb564267c9, +0x3feea024b1ab6e09, +0x3feea0694fde5d3f, +0x3feea0b938ac1cf6, +0x3feea11473eb0187, +0x3feea17b0976cfdb, +0x3feea1ed0130c132, +0x3feea26a62ff86f0, +0x3feea2f336cf4e62, +0x3feea3878491c491, +0x3feea427543e1a12, +0x3feea4d2add106d9, +0x3feea589994cce13, +0x3feea64c1eb941f7, +0x3feea71a4623c7ad, +0x3feea7f4179f5b21, +0x3feea8d99b4492ed, +0x3feea9cad931a436, +0x3feeaac7d98a6699, +0x3feeabd0a478580f, +0x3feeace5422aa0db, +0x3feeae05bad61778, +0x3feeaf3216b5448c, +0x3feeb06a5e0866d9, +0x3feeb1ae99157736, +0x3feeb2fed0282c8a, +0x3feeb45b0b91ffc6, +0x3feeb5c353aa2fe2, +0x3feeb737b0cdc5e5, +0x3feeb8b82b5f98e5, +0x3feeba44cbc8520f, +0x3feebbdd9a7670b3, +0x3feebd829fde4e50, +0x3feebf33e47a22a2, +0x3feec0f170ca07ba, +0x3feec2bb4d53fe0d, +0x3feec49182a3f090, +0x3feec674194bb8d5, +0x3feec86319e32323, +0x3feeca5e8d07f29e, +0x3feecc667b5de565, +0x3feece7aed8eb8bb, +0x3feed09bec4a2d33, +0x3feed2c980460ad8, +0x3feed503b23e255d, +0x3feed74a8af46052, +0x3feed99e1330b358, +0x3feedbfe53c12e59, +0x3feede6b5579fdbf, +0x3feee0e521356eba, +0x3feee36bbfd3f37a, +0x3feee5ff3a3c2774, +0x3feee89f995ad3ad, +0x3feeeb4ce622f2ff, +0x3feeee07298db666, +0x3feef0ce6c9a8952, +0x3feef3a2b84f15fb, +0x3feef68415b749b1, +0x3feef9728de5593a, +0x3feefc6e29f1c52a, +0x3feeff76f2fb5e47, +0x3fef028cf22749e4, +0x3fef05b030a1064a, +0x3fef08e0b79a6f1f, +0x3fef0c1e904bc1d2, +0x3fef0f69c3f3a207, +0x3fef12c25bd71e09, +0x3fef16286141b33d, +0x3fef199bdd85529c, +0x3fef1d1cd9fa652c, +0x3fef20ab5fffd07a, +0x3fef244778fafb22, +0x3fef27f12e57d14b, +0x3fef2ba88988c933, +0x3fef2f6d9406e7b5, +0x3fef33405751c4db, +0x3fef3720dcef9069, +0x3fef3b0f2e6d1675, +0x3fef3f0b555dc3fa, +0x3fef43155b5bab74, +0x3fef472d4a07897c, +0x3fef4b532b08c968, +0x3fef4f87080d89f2, +0x3fef53c8eacaa1d6, +0x3fef5818dcfba487, +0x3fef5c76e862e6d3, +0x3fef60e316c98398, +0x3fef655d71ff6075, +0x3fef69e603db3285, +0x3fef6e7cd63a8315, +0x3fef7321f301b460, +0x3fef77d5641c0658, +0x3fef7c97337b9b5f, +0x3fef81676b197d17, +0x3fef864614f5a129, +0x3fef8b333b16ee12, +0x3fef902ee78b3ff6, +0x3fef953924676d76, +0x3fef9a51fbc74c83, +0x3fef9f7977cdb740, +0x3fefa4afa2a490da, +0x3fefa9f4867cca6e, +0x3fefaf482d8e67f1, +0x3fefb4aaa2188510, +0x3fefba1bee615a27, +0x3fefbf9c1cb6412a, +0x3fefc52b376bba97, +0x3fefcac948dd7274, +0x3fefd0765b6e4540, +0x3fefd632798844f8, +0x3fefdbfdad9cbe14, +0x3fefe1d802243c89, +0x3fefe7c1819e90d8, +0x3fefedba3692d514, +0x3feff3c22b8f71f1, +0x3feff9d96b2a23d9, +#endif +}; +#endif diff --git a/math/v_expf.c b/math/v_expf.c new file mode 100644 index 0000000..d403e00 --- /dev/null +++ b/math/v_expf.c @@ -0,0 +1,83 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +static const float Poly[] = { + /* maxerr: 1.45358 +0.5 ulp. */ + 0x1.0e4020p-7f, + 0x1.573e2ep-5f, + 0x1.555e66p-3f, + 0x1.fffdb6p-2f, + 0x1.ffffecp-1f, +}; +#define C0 v_f32 (Poly[0]) +#define C1 v_f32 (Poly[1]) +#define C2 v_f32 (Poly[2]) +#define C3 v_f32 (Poly[3]) +#define C4 v_f32 (Poly[4]) + +#define Shift v_f32 (0x1.8p23f) +#define InvLn2 v_f32 (0x1.715476p+0f) +#define Ln2hi v_f32 (0x1.62e4p-1f) +#define Ln2lo v_f32 (0x1.7f7d1cp-20f) + +VPCS_ATTR +static v_f32_t +specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) +{ + /* 2^n may overflow, break it up into s1*s2. 
*/ + v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); + v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); + v_f32_t s2 = v_as_f32_u32 (e - b); + v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); + v_u32_t r2 = v_as_u32_f32 (s1 * s1); + v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); + /* Similar to r1 but avoids double rounding in the subnormal range. */ + v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); + return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); +} + +VPCS_ATTR +v_f32_t +V_NAME(expf) (v_f32_t x) +{ + v_f32_t n, r, r2, scale, p, q, poly, absn, z; + v_u32_t cmp, e; + + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ +#if 1 + z = v_fma_f32 (x, InvLn2, Shift); + n = z - Shift; + r = v_fma_f32 (n, -Ln2hi, x); + r = v_fma_f32 (n, -Ln2lo, r); + e = v_as_u32_f32 (z) << 23; +#else + z = x * InvLn2; + n = v_round_f32 (z); + r = v_fma_f32 (n, -Ln2hi, x); + r = v_fma_f32 (n, -Ln2lo, r); + e = v_as_u32_s32 (v_round_s32 (z)) << 23; +#endif + scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); + absn = v_abs_f32 (n); + cmp = v_cond_u32 (absn > v_f32 (126.0f)); + r2 = r * r; + p = v_fma_f32 (C0, r, C1); + q = v_fma_f32 (C2, r, C3); + q = v_fma_f32 (p, r2, q); + p = C4 * r; + poly = v_fma_f32 (q, r2, p); + if (unlikely (v_any_u32 (cmp))) + return specialcase (poly, n, e, absn, cmp, scale); + return v_fma_f32 (poly, scale, scale); +} +VPCS_ALIAS +#endif diff --git a/math/aarch64/v_expf_1u.c b/math/v_expf_1u.c similarity index 39% rename from math/aarch64/v_expf_1u.c rename to math/v_expf_1u.c index 43d03fa..023bd24 100644 --- a/math/aarch64/v_expf_1u.c +++ b/math/v_expf_1u.c @@ -1,12 +1,13 @@ /* * Single-precision vector e^x function. * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT */ #include "mathlib.h" #include "v_math.h" +#if V_SUPPORTED static const float Poly[] = { /* maxerr: 0.36565 +0.5 ulp. */ @@ -27,51 +28,53 @@ static const float Poly[] = { #define Ln2hi v_f32 (0x1.62e4p-1f) #define Ln2lo v_f32 (0x1.7f7d1cp-20f) -static float32x4_t VPCS_ATTR NOINLINE -specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn) +VPCS_ATTR +static v_f32_t +specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn) { /* 2^n may overflow, break it up into s1*s2. */ - uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000); - float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b); - float32x4_t s2 = vreinterpretq_f32_u32 (e - b); - uint32x4_t cmp = absn > v_f32 (192.0f); - float32x4_t r1 = s1 * s1; - float32x4_t r0 = poly * s1 * s2; - return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) - | (~cmp & vreinterpretq_u32_f32 (r0))); + v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000); + v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); + v_f32_t s2 = v_as_f32_u32 (e - b); + v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f)); + v_f32_t r1 = s1 * s1; + v_f32_t r0 = poly * s1 * s2; + return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0))); } -float32x4_t VPCS_ATTR -_ZGVnN4v_expf_1u (float32x4_t x) +VPCS_ATTR +v_f32_t +V_NAME(expf_1u) (v_f32_t x) { - float32x4_t n, r, scale, poly, absn, z; - uint32x4_t cmp, e; + v_f32_t n, r, scale, poly, absn, z; + v_u32_t cmp, e; /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/ #if 1 - z = vfmaq_f32 (Shift, x, InvLn2); + z = v_fma_f32 (x, InvLn2, Shift); n = z - Shift; - r = vfmaq_f32 (x, n, -Ln2hi); - r = vfmaq_f32 (r, n, -Ln2lo); - e = vreinterpretq_u32_f32 (z) << 23; + r = v_fma_f32 (n, -Ln2hi, x); + r = v_fma_f32 (n, -Ln2lo, r); + e = v_as_u32_f32 (z) << 23; #else z = x * InvLn2; - n = vrndaq_f32 (z); - r = vfmaq_f32 (x, n, -Ln2hi); - r = vfmaq_f32 (r, n, -Ln2lo); - e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)) << 23; + n = v_round_f32 (z); + r = v_fma_f32 (n, -Ln2hi, x); + r = v_fma_f32 (n, -Ln2lo, r); + e = v_as_u32_s32 (v_round_s32 (z)) << 23; #endif - scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000)); - absn = vabsq_f32 (n); - cmp = absn > v_f32 (126.0f); - poly = vfmaq_f32 (C1, C0, r); - poly = vfmaq_f32 (C2, poly, r); - poly = vfmaq_f32 (C3, poly, r); - poly = vfmaq_f32 (C4, poly, r); - poly = vfmaq_f32 (v_f32 (1.0f), poly, r); - poly = vfmaq_f32 (v_f32 (1.0f), poly, r); + scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); + absn = v_abs_f32 (n); + cmp = v_cond_u32 (absn > v_f32 (126.0f)); + poly = v_fma_f32 (C0, r, C1); + poly = v_fma_f32 (poly, r, C2); + poly = v_fma_f32 (poly, r, C3); + poly = v_fma_f32 (poly, r, C4); + poly = v_fma_f32 (poly, r, v_f32 (1.0f)); + poly = v_fma_f32 (poly, r, v_f32 (1.0f)); if (unlikely (v_any_u32 (cmp))) return specialcase (poly, n, e, absn); return scale * poly; } +#endif diff --git a/math/v_log.c b/math/v_log.c new file mode 100644 index 0000000..d84c740 --- /dev/null +++ b/math/v_log.c @@ -0,0 +1,104 @@ +/* + * Double-precision vector log(x) function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#include "v_log.h" +#if V_SUPPORTED + +/* Worst-case error: 1.17 + 0.5 ulp. */ + +static const f64_t Poly[] = { + /* rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ + -0x1.ffffffffffff7p-2, + 0x1.55555555170d4p-2, + -0x1.0000000399c27p-2, + 0x1.999b2e90e94cap-3, + -0x1.554e550bd501ep-3, +}; + +#define A0 v_f64 (Poly[0]) +#define A1 v_f64 (Poly[1]) +#define A2 v_f64 (Poly[2]) +#define A3 v_f64 (Poly[3]) +#define A4 v_f64 (Poly[4]) +#define Ln2 v_f64 (0x1.62e42fefa39efp-1) +#define N (1 << V_LOG_TABLE_BITS) +#define OFF v_u64 (0x3fe6900900000000) + +struct entry +{ + v_f64_t invc; + v_f64_t logc; +}; + +static inline struct entry +lookup (v_u64_t i) +{ + struct entry e; +#ifdef SCALAR + e.invc = __v_log_data[i].invc; + e.logc = __v_log_data[i].logc; +#else + e.invc[0] = __v_log_data[i[0]].invc; + e.logc[0] = __v_log_data[i[0]].logc; + e.invc[1] = __v_log_data[i[1]].invc; + e.logc[1] = __v_log_data[i[1]].logc; +#endif + return e; +} + +VPCS_ATTR +__attribute__ ((noinline)) static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +{ + return v_call_f64 (log, x, y, cmp); +} + +VPCS_ATTR +v_f64_t +V_NAME(log) (v_f64_t x) +{ + v_f64_t z, r, r2, p, y, kd, hi; + v_u64_t ix, iz, tmp, top, i, cmp; + v_s64_t k; + struct entry e; + + ix = v_as_u64_f64 (x); + top = ix >> 48; + cmp = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010)); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (52 - V_LOG_TABLE_BITS)) % N; + k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift */ + iz = ix - (tmp & v_u64 (0xfffULL << 52)); + z = v_as_f64_u64 (iz); + e = lookup (i); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. 
*/ + r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); + kd = v_to_f64_s64 (k); + + /* hi = r + log(c) + k*Ln2. */ + hi = v_fma_f64 (kd, Ln2, e.logc + r); + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + r2 = r * r; + y = v_fma_f64 (A3, r, A2); + p = v_fma_f64 (A1, r, A0); + y = v_fma_f64 (A4, r2, y); + y = v_fma_f64 (y, r2, p); + y = v_fma_f64 (y, r2, hi); + + if (unlikely (v_any_u64 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS +#endif diff --git a/math/v_log.h b/math/v_log.h new file mode 100644 index 0000000..bcc2fa6 --- /dev/null +++ b/math/v_log.h @@ -0,0 +1,18 @@ +/* + * Declarations for double-precision log(x) vector function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "v_math.h" +#if WANT_VMATH + +#define V_LOG_TABLE_BITS 7 + +extern const struct v_log_data +{ + f64_t invc; + f64_t logc; +} __v_log_data[1 << V_LOG_TABLE_BITS] HIDDEN; +#endif diff --git a/math/v_log_data.c b/math/v_log_data.c new file mode 100644 index 0000000..97ee5b0 --- /dev/null +++ b/math/v_log_data.c @@ -0,0 +1,158 @@ +/* + * Lookup table for double-precision log(x) vector function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "v_log.h" +#if WANT_VMATH + +#define N (1 << V_LOG_TABLE_BITS) + +/* Algorithm: + + x = 2^k z + log(x) = k ln2 + log(c) + poly(z/c - 1) + +where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,N=128) +and log(c) and 1/c for the ith subinterval comes from a lookup table: + + tab[i].invc = 1/c + tab[i].logc = (double)log(c) + +where c is near the center of the subinterval and is chosen by trying several +floating point invc candidates around 1/center and selecting one for which +the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval +that contains 1 and the previous one got tweaked to avoid cancellation. 
*/ +const struct v_log_data __v_log_data[N] = { +{0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2}, +{0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2}, +{0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2}, +{0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2}, +{0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2}, +{0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2}, +{0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2}, +{0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2}, +{0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2}, +{0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2}, +{0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2}, +{0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2}, +{0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2}, +{0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2}, +{0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2}, +{0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2}, +{0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2}, +{0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2}, +{0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2}, +{0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3}, +{0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3}, +{0x1.446f12b278001p+0, -0x1.e52e160484698p-3}, +{0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3}, +{0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3}, +{0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3}, +{0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3}, +{0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3}, +{0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3}, +{0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3}, +{0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3}, +{0x1.36987540fbf53p+0, -0x1.8be843d796044p-3}, +{0x1.352166b648f61p+0, -0x1.82395ecc477edp-3}, +{0x1.33adddb3eb575p+0, -0x1.7896240966422p-3}, +{0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3}, +{0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3}, +{0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3}, +{0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3}, +{0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3}, +{0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3}, +{0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3}, +{0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3}, +{0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3}, +{0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3}, +{0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3}, +{0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3}, +{0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4}, +{0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4}, +{0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4}, +{0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4}, +{0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4}, +{0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4}, +{0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4}, +{0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4}, +{0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4}, +{0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4}, +{0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4}, +{0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4}, +{0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4}, +{0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4}, +{0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4}, +{0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5}, +{0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5}, +{0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5}, +{0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5}, +{0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5}, +{0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5}, +{0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5}, +{0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5}, +{0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6}, +{0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6}, +{0x1.05193497a7cc5p+0, -0x1.43183683400acp-6}, +{0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6}, +{0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7}, 
+{0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7}, +{0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9}, +{1.0, 0.0}, +{0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8}, +{0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7}, +{0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6}, +{0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6}, +{0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5}, +{0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5}, +{0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5}, +{0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5}, +{0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4}, +{0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4}, +{0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4}, +{0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4}, +{0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4}, +{0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4}, +{0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4}, +{0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4}, +{0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4}, +{0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3}, +{0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3}, +{0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3}, +{0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3}, +{0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3}, +{0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3}, +{0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3}, +{0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3}, +{0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3}, +{0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3}, +{0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3}, +{0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3}, +{0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3}, +{0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3}, +{0x1.9998e1480b618p-1, 0x1.c903161240163p-3}, +{0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3}, +{0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3}, +{0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3}, +{0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3}, +{0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2}, +{0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2}, +{0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2}, +{0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2}, +{0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2}, +{0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2}, +{0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2}, +{0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2}, +{0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2}, +{0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2}, +{0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2}, +{0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2}, +{0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2}, +{0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2}, +{0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2}, +{0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2}, +}; +#endif diff --git a/math/v_logf.c b/math/v_logf.c new file mode 100644 index 0000000..7373192 --- /dev/null +++ b/math/v_logf.c @@ -0,0 +1,73 @@ +/* + * Single-precision vector log function. + * + * Copyright (c) 2019, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +static const float Poly[] = { + /* 3.34 ulp error */ + -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f, + -0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f, +}; +#define P7 v_f32 (Poly[0]) +#define P6 v_f32 (Poly[1]) +#define P5 v_f32 (Poly[2]) +#define P4 v_f32 (Poly[3]) +#define P3 v_f32 (Poly[4]) +#define P2 v_f32 (Poly[5]) +#define P1 v_f32 (Poly[6]) + +#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218 */ +#define Min v_u32 (0x00800000) +#define Max v_u32 (0x7f800000) +#define Mask v_u32 (0x007fffff) +#define Off v_u32 (0x3f2aaaab) /* 0.666667 */ + +VPCS_ATTR +__attribute__ ((noinline)) static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + /* Fall back to scalar code. */ + return v_call_f32 (logf, x, y, cmp); +} + +VPCS_ATTR +v_f32_t +V_NAME(logf) (v_f32_t x) +{ + v_f32_t n, p, q, r, r2, y; + v_u32_t u, cmp; + + u = v_as_u32_f32 (x); + cmp = v_cond_u32 (u - Min >= Max - Min); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3 */ + u -= Off; + n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend */ + u &= Mask; + u += Off; + r = v_as_f32_u32 (u) - v_f32 (1.0f); + + /* y = log(1+r) + n*ln2. */ + r2 = r * r; + /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ + p = v_fma_f32 (P6, r, P5); + q = v_fma_f32 (P4, r, P3); + y = v_fma_f32 (P2, r, P1); + p = v_fma_f32 (P7, r2, p); + q = v_fma_f32 (p, r2, q); + y = v_fma_f32 (q, r2, y); + p = v_fma_f32 (Ln2, n, r); + y = v_fma_f32 (y, r2, p); + + if (unlikely (v_any_u32 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS +#endif diff --git a/math/v_math.h b/math/v_math.h new file mode 100644 index 0000000..f2cc467 --- /dev/null +++ b/math/v_math.h @@ -0,0 +1,641 @@ +/* + * Vector math abstractions. + * + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#ifndef _V_MATH_H +#define _V_MATH_H + +#ifndef WANT_VMATH +/* Enable the build of vector math code. */ +# define WANT_VMATH 1 +#endif +#if WANT_VMATH + +/* The goal of this header is to allow vector and scalar + build of the same algorithm, the provided intrinsic + wrappers are also vector length agnostic so they can + be implemented for SVE too (or other simd architectures) + and then the code should work on those targets too. */ + +#if SCALAR +#define V_NAME(x) __s_##x +#elif VPCS && __aarch64__ +#define V_NAME(x) __vn_##x +#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) +#else +#define V_NAME(x) __v_##x +#endif + +#ifndef VPCS_ATTR +#define VPCS_ATTR +#endif +#ifndef VPCS_ALIAS +#define VPCS_ALIAS +#endif + +#include +#include "math_config.h" + +typedef float f32_t; +typedef uint32_t u32_t; +typedef int32_t s32_t; +typedef double f64_t; +typedef uint64_t u64_t; +typedef int64_t s64_t; + +/* reinterpret as type1 from type2. 
*/ +static inline u32_t +as_u32_f32 (f32_t x) +{ + union { f32_t f; u32_t u; } r = {x}; + return r.u; +} +static inline f32_t +as_f32_u32 (u32_t x) +{ + union { u32_t u; f32_t f; } r = {x}; + return r.f; +} +static inline s32_t +as_s32_u32 (u32_t x) +{ + union { u32_t u; s32_t i; } r = {x}; + return r.i; +} +static inline u32_t +as_u32_s32 (s32_t x) +{ + union { s32_t i; u32_t u; } r = {x}; + return r.u; +} +static inline u64_t +as_u64_f64 (f64_t x) +{ + union { f64_t f; u64_t u; } r = {x}; + return r.u; +} +static inline f64_t +as_f64_u64 (u64_t x) +{ + union { u64_t u; f64_t f; } r = {x}; + return r.f; +} +static inline s64_t +as_s64_u64 (u64_t x) +{ + union { u64_t u; s64_t i; } r = {x}; + return r.i; +} +static inline u64_t +as_u64_s64 (s64_t x) +{ + union { s64_t i; u64_t u; } r = {x}; + return r.u; +} + +#if SCALAR +#define V_SUPPORTED 1 +typedef f32_t v_f32_t; +typedef u32_t v_u32_t; +typedef s32_t v_s32_t; +typedef f64_t v_f64_t; +typedef u64_t v_u64_t; +typedef s64_t v_s64_t; + +static inline int +v_lanes32 (void) +{ + return 1; +} + +static inline v_f32_t +v_f32 (f32_t x) +{ + return x; +} +static inline v_u32_t +v_u32 (u32_t x) +{ + return x; +} +static inline v_s32_t +v_s32 (s32_t x) +{ + return x; +} + +static inline f32_t +v_get_f32 (v_f32_t x, int i) +{ + return x; +} +static inline u32_t +v_get_u32 (v_u32_t x, int i) +{ + return x; +} +static inline s32_t +v_get_s32 (v_s32_t x, int i) +{ + return x; +} + +static inline void +v_set_f32 (v_f32_t *x, int i, f32_t v) +{ + *x = v; +} +static inline void +v_set_u32 (v_u32_t *x, int i, u32_t v) +{ + *x = v; +} +static inline void +v_set_s32 (v_s32_t *x, int i, s32_t v) +{ + *x = v; +} + +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u32 (v_u32_t x) +{ + return x != 0; +} +/* to wrap the result of relational operators. */ +static inline v_u32_t +v_cond_u32 (v_u32_t x) +{ + return x ? -1 : 0; +} +static inline v_f32_t +v_abs_f32 (v_f32_t x) +{ + return __builtin_fabsf (x); +} +static inline v_f32_t +v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) +{ + return __builtin_fmaf (x, y, z); +} +static inline v_f32_t +v_round_f32 (v_f32_t x) +{ + return __builtin_roundf (x); +} +static inline v_s32_t +v_round_s32 (v_f32_t x) +{ + return __builtin_lroundf (x); /* relies on -fno-math-errno. */ +} +/* convert to type1 from type2. */ +static inline v_f32_t +v_to_f32_s32 (v_s32_t x) +{ + return x; +} +static inline v_f32_t +v_to_f32_u32 (v_u32_t x) +{ + return x; +} +/* reinterpret as type1 from type2. 
*/ +static inline v_u32_t +v_as_u32_f32 (v_f32_t x) +{ + union { v_f32_t f; v_u32_t u; } r = {x}; + return r.u; +} +static inline v_f32_t +v_as_f32_u32 (v_u32_t x) +{ + union { v_u32_t u; v_f32_t f; } r = {x}; + return r.f; +} +static inline v_s32_t +v_as_s32_u32 (v_u32_t x) +{ + union { v_u32_t u; v_s32_t i; } r = {x}; + return r.i; +} +static inline v_u32_t +v_as_u32_s32 (v_s32_t x) +{ + union { v_s32_t i; v_u32_t u; } r = {x}; + return r.u; +} +static inline v_f32_t +v_lookup_f32 (const f32_t *tab, v_u32_t idx) +{ + return tab[idx]; +} +static inline v_u32_t +v_lookup_u32 (const u32_t *tab, v_u32_t idx) +{ + return tab[idx]; +} +static inline v_f32_t +v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) +{ + return f (x); +} +static inline v_f32_t +v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, + v_u32_t p) +{ + return f (x1, x2); +} + +static inline int +v_lanes64 (void) +{ + return 1; +} +static inline v_f64_t +v_f64 (f64_t x) +{ + return x; +} +static inline v_u64_t +v_u64 (u64_t x) +{ + return x; +} +static inline v_s64_t +v_s64 (s64_t x) +{ + return x; +} +static inline f64_t +v_get_f64 (v_f64_t x, int i) +{ + return x; +} +static inline void +v_set_f64 (v_f64_t *x, int i, f64_t v) +{ + *x = v; +} +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u64 (v_u64_t x) +{ + return x != 0; +} +/* to wrap the result of relational operators. */ +static inline v_u64_t +v_cond_u64 (v_u64_t x) +{ + return x ? -1 : 0; +} +static inline v_f64_t +v_abs_f64 (v_f64_t x) +{ + return __builtin_fabs (x); +} +static inline v_f64_t +v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) +{ + return __builtin_fma (x, y, z); +} +static inline v_f64_t +v_round_f64 (v_f64_t x) +{ + return __builtin_round (x); +} +static inline v_s64_t +v_round_s64 (v_f64_t x) +{ + return __builtin_lround (x); /* relies on -fno-math-errno. */ +} +/* convert to type1 from type2. */ +static inline v_f64_t +v_to_f64_s64 (v_s64_t x) +{ + return x; +} +static inline v_f64_t +v_to_f64_u64 (v_u64_t x) +{ + return x; +} +/* reinterpret as type1 from type2. 
*/ +static inline v_u64_t +v_as_u64_f64 (v_f64_t x) +{ + union { v_f64_t f; v_u64_t u; } r = {x}; + return r.u; +} +static inline v_f64_t +v_as_f64_u64 (v_u64_t x) +{ + union { v_u64_t u; v_f64_t f; } r = {x}; + return r.f; +} +static inline v_s64_t +v_as_s64_u64 (v_u64_t x) +{ + union { v_u64_t u; v_s64_t i; } r = {x}; + return r.i; +} +static inline v_u64_t +v_as_u64_s64 (v_s64_t x) +{ + union { v_s64_t i; v_u64_t u; } r = {x}; + return r.u; +} +static inline v_f64_t +v_lookup_f64 (const f64_t *tab, v_u64_t idx) +{ + return tab[idx]; +} +static inline v_u64_t +v_lookup_u64 (const u64_t *tab, v_u64_t idx) +{ + return tab[idx]; +} +static inline v_f64_t +v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) +{ + return f (x); +} + +#elif __aarch64__ +#define V_SUPPORTED 1 +#include +typedef float32x4_t v_f32_t; +typedef uint32x4_t v_u32_t; +typedef int32x4_t v_s32_t; +typedef float64x2_t v_f64_t; +typedef uint64x2_t v_u64_t; +typedef int64x2_t v_s64_t; + +static inline int +v_lanes32 (void) +{ + return 4; +} + +static inline v_f32_t +v_f32 (f32_t x) +{ + return (v_f32_t){x, x, x, x}; +} +static inline v_u32_t +v_u32 (u32_t x) +{ + return (v_u32_t){x, x, x, x}; +} +static inline v_s32_t +v_s32 (s32_t x) +{ + return (v_s32_t){x, x, x, x}; +} + +static inline f32_t +v_get_f32 (v_f32_t x, int i) +{ + return x[i]; +} +static inline u32_t +v_get_u32 (v_u32_t x, int i) +{ + return x[i]; +} +static inline s32_t +v_get_s32 (v_s32_t x, int i) +{ + return x[i]; +} + +static inline void +v_set_f32 (v_f32_t *x, int i, f32_t v) +{ + (*x)[i] = v; +} +static inline void +v_set_u32 (v_u32_t *x, int i, u32_t v) +{ + (*x)[i] = v; +} +static inline void +v_set_s32 (v_s32_t *x, int i, s32_t v) +{ + (*x)[i] = v; +} + +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u32 (v_u32_t x) +{ + /* assume elements in x are either 0 or -1u. */ + return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; +} +/* to wrap the result of relational operators. */ +static inline v_u32_t +v_cond_u32 (v_u32_t x) +{ + return x; +} +static inline v_f32_t +v_abs_f32 (v_f32_t x) +{ + return vabsq_f32 (x); +} +static inline v_f32_t +v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) +{ + return vfmaq_f32 (z, x, y); +} +static inline v_f32_t +v_round_f32 (v_f32_t x) +{ + return vrndaq_f32 (x); +} +static inline v_s32_t +v_round_s32 (v_f32_t x) +{ + return vcvtaq_s32_f32 (x); +} +/* convert to type1 from type2. */ +static inline v_f32_t +v_to_f32_s32 (v_s32_t x) +{ + return (v_f32_t){x[0], x[1], x[2], x[3]}; +} +static inline v_f32_t +v_to_f32_u32 (v_u32_t x) +{ + return (v_f32_t){x[0], x[1], x[2], x[3]}; +} +/* reinterpret as type1 from type2. */ +static inline v_u32_t +v_as_u32_f32 (v_f32_t x) +{ + union { v_f32_t f; v_u32_t u; } r = {x}; + return r.u; +} +static inline v_f32_t +v_as_f32_u32 (v_u32_t x) +{ + union { v_u32_t u; v_f32_t f; } r = {x}; + return r.f; +} +static inline v_s32_t +v_as_s32_u32 (v_u32_t x) +{ + union { v_u32_t u; v_s32_t i; } r = {x}; + return r.i; +} +static inline v_u32_t +v_as_u32_s32 (v_s32_t x) +{ + union { v_s32_t i; v_u32_t u; } r = {x}; + return r.u; +} +static inline v_f32_t +v_lookup_f32 (const f32_t *tab, v_u32_t idx) +{ + return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; +} +static inline v_u32_t +v_lookup_u32 (const u32_t *tab, v_u32_t idx) +{ + return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; +} +static inline v_f32_t +v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) +{ + return (v_f32_t){p[0] ? 
f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], + p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; +} +static inline v_f32_t +v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, + v_u32_t p) +{ + return ( + v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1], + p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]}; +} + +static inline int +v_lanes64 (void) +{ + return 2; +} +static inline v_f64_t +v_f64 (f64_t x) +{ + return (v_f64_t){x, x}; +} +static inline v_u64_t +v_u64 (u64_t x) +{ + return (v_u64_t){x, x}; +} +static inline v_s64_t +v_s64 (s64_t x) +{ + return (v_s64_t){x, x}; +} +static inline f64_t +v_get_f64 (v_f64_t x, int i) +{ + return x[i]; +} +static inline void +v_set_f64 (v_f64_t *x, int i, f64_t v) +{ + (*x)[i] = v; +} +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u64 (v_u64_t x) +{ + /* assume elements in x are either 0 or -1u. */ + return vpaddd_u64 (x) != 0; +} +/* to wrap the result of relational operators. */ +static inline v_u64_t +v_cond_u64 (v_u64_t x) +{ + return x; +} +static inline v_f64_t +v_abs_f64 (v_f64_t x) +{ + return vabsq_f64 (x); +} +static inline v_f64_t +v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) +{ + return vfmaq_f64 (z, x, y); +} +static inline v_f64_t +v_round_f64 (v_f64_t x) +{ + return vrndaq_f64 (x); +} +static inline v_s64_t +v_round_s64 (v_f64_t x) +{ + return vcvtaq_s64_f64 (x); +} +/* convert to type1 from type2. */ +static inline v_f64_t +v_to_f64_s64 (v_s64_t x) +{ + return (v_f64_t){x[0], x[1]}; +} +static inline v_f64_t +v_to_f64_u64 (v_u64_t x) +{ + return (v_f64_t){x[0], x[1]}; +} +/* reinterpret as type1 from type2. */ +static inline v_u64_t +v_as_u64_f64 (v_f64_t x) +{ + union { v_f64_t f; v_u64_t u; } r = {x}; + return r.u; +} +static inline v_f64_t +v_as_f64_u64 (v_u64_t x) +{ + union { v_u64_t u; v_f64_t f; } r = {x}; + return r.f; +} +static inline v_s64_t +v_as_s64_u64 (v_u64_t x) +{ + union { v_u64_t u; v_s64_t i; } r = {x}; + return r.i; +} +static inline v_u64_t +v_as_u64_s64 (v_s64_t x) +{ + union { v_s64_t i; v_u64_t u; } r = {x}; + return r.u; +} +static inline v_f64_t +v_lookup_f64 (const f64_t *tab, v_u64_t idx) +{ + return (v_f64_t){tab[idx[0]], tab[idx[1]]}; +} +static inline v_u64_t +v_lookup_u64 (const u64_t *tab, v_u64_t idx) +{ + return (v_u64_t){tab[idx[0]], tab[idx[1]]}; +} +static inline v_f64_t +v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) +{ + return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]}; +} +#endif + +#endif +#endif diff --git a/math/aarch64/v_pow.c b/math/v_pow.c similarity index 35% rename from math/aarch64/v_pow.c rename to math/v_pow.c index 734f166..a209d57 100644 --- a/math/aarch64/v_pow.c +++ b/math/v_pow.c @@ -1,22 +1,27 @@ /* * Double-precision vector pow function. * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT */ #include "mathlib.h" #include "v_math.h" +#if V_SUPPORTED -float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) +VPCS_ATTR +v_f64_t +V_NAME(pow) (v_f64_t x, v_f64_t y) { - float64x2_t z; + v_f64_t z; for (int lane = 0; lane < v_lanes64 (); lane++) { - double sx = x[lane]; - double sy = y[lane]; - double sz = pow (sx, sy); - z[lane] = sz; + f64_t sx = v_get_f64 (x, lane); + f64_t sy = v_get_f64 (y, lane); + f64_t sz = pow (sx, sy); + v_set_f64 (&z, lane, sz); } return z; } +VPCS_ALIAS +#endif diff --git a/math/v_powf.c b/math/v_powf.c new file mode 100644 index 0000000..fb80fa6 --- /dev/null +++ b/math/v_powf.c @@ -0,0 +1,235 @@ +/* + * Single-precision vector powf function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +#define Min v_u32 (0x00800000) +#define Max v_u32 (0x7f800000) +#define SBITS 5 +#define Tlog v__powf_log2_data.tab +#define Texp v__exp2f_data.tab +#define A v__powf_log2_data.poly +#define C v__exp2f_data.poly +#define LOGDEG 4 + +#if LOGDEG == 5 +/* 1.01 ulp */ +#define OFF v_u32 (0x3f330000) +#define TBITS 4 +#elif LOGDEG == 4 +/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2) */ +#define OFF v_u32 (0x3f35d000) +#define TBITS 5 +#endif + +#define V_EXP2F_TABLE_BITS SBITS +#define V_EXP2F_POLY_ORDER 3 +struct v_exp2f_data +{ + uint64_t tab[1 << V_EXP2F_TABLE_BITS]; + double poly[V_EXP2F_POLY_ORDER]; +}; + +#define V_POWF_LOG2_TABLE_BITS TBITS +#define V_POWF_LOG2_POLY_ORDER LOGDEG +#define SCALE ((double) (1 << SBITS)) +struct v_powf_log2_data +{ + struct + { + double invc, logc; + } tab[1 << V_POWF_LOG2_TABLE_BITS]; + double poly[V_POWF_LOG2_POLY_ORDER]; +}; + +static const struct v_powf_log2_data v__powf_log2_data = { +#if LOGDEG == 5 + .tab = { +{ 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 * SCALE }, +{ 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 * SCALE }, +{ 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 * SCALE }, +{ 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 * SCALE }, +{ 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 * SCALE }, +{ 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 * SCALE }, +{ 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 * SCALE }, +{ 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 * SCALE }, +{ 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 * SCALE }, +{ 0x1p+0, 0x0p+0 * SCALE }, +{ 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 * SCALE }, +{ 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 * SCALE }, +{ 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 * SCALE }, +{ 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 * SCALE }, +{ 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 * SCALE }, +{ 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 * SCALE }, + }, +/* rel err: 1.46 * 2^-32 */ + .poly = { +0x1.27616c9496e0bp-2 * SCALE, -0x1.71969a075c67ap-2 * SCALE, +0x1.ec70a6ca7baddp-2 * SCALE, -0x1.7154748bef6c8p-1 * SCALE, +0x1.71547652ab82bp0 * SCALE, + } +#elif LOGDEG == 4 + .tab = { +{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * SCALE}, +{0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * SCALE}, +{0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * SCALE}, +{0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * SCALE}, +{0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * SCALE}, +{0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * SCALE}, +{0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * SCALE}, +{0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * SCALE}, +{0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * SCALE}, +{0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * SCALE}, 
+{0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * SCALE}, +{0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * SCALE}, +{0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * SCALE}, +{0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * SCALE}, +{0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * SCALE}, +{0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * SCALE}, +{0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * SCALE}, +{0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * SCALE}, +{0x1p+0, 0x0p+0 * SCALE}, +{0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * SCALE}, +{0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * SCALE}, +{0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * SCALE}, +{0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * SCALE}, +{0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * SCALE}, +{0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * SCALE}, +{0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * SCALE}, +{0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * SCALE}, +{0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * SCALE}, +{0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * SCALE}, +{0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * SCALE}, +{0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * SCALE}, +{0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * SCALE}, + }, +/* rel err: 1.5 * 2^-30 */ + .poly = { + -0x1.6ff5daa3b3d7cp-2 * SCALE, + 0x1.ec81d03c01aebp-2 * SCALE, + -0x1.71547bb43f101p-1 * SCALE, + 0x1.7154764a815cbp0 * SCALE, + } +#endif +}; + +static const struct v_exp2f_data v__exp2f_data = { + .tab = { +0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51, +0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1, +0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, +0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585, +0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13, +0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, +0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069, +0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540, + }, +/* rel err: 1.69 * 2^-34 */ + .poly = { +0x1.c6af84b912394p-5/SCALE/SCALE/SCALE, 0x1.ebfce50fac4f3p-3/SCALE/SCALE, 0x1.62e42ff0c52d6p-1/SCALE + }, +}; + +VPCS_ATTR +__attribute__ ((noinline)) static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_f32_t ret, v_u32_t cmp) +{ + return v_call2_f32 (powf, x, y, ret, cmp); +} + +VPCS_ATTR +v_f32_t +V_NAME(powf) (v_f32_t x, v_f32_t y) +{ + v_u32_t u, tmp, cmp, i, top, iz; + v_s32_t k; + v_f32_t ret; + + u = v_as_u32_f32 (x); + cmp = v_cond_u32 (u - Min >= Max - Min); + tmp = u - OFF; + i = (tmp >> (23 - TBITS)) % (1 << TBITS); + top = tmp & 0xff800000; + iz = u - top; + k = v_as_s32_u32 (top) >> (23 - SBITS); /* arithmetic shift */ + + for (int lane = 0; lane < v_lanes32 (); lane++) + { + uint32_t si, siz; + int32_t sk; + float sy; + + /* Use double precision for each lane. */ + double invc, logc, z, r, p, y0, logx, ylogx, kd, s; + uint64_t ki, t; + + si = v_get_u32 (i, lane); + siz = v_get_u32 (iz, lane); + sk = v_get_s32 (k, lane); + sy = v_get_f32 (y, lane); + + invc = Tlog[si].invc; + logc = Tlog[si].logc; + z = (double) as_f32_u32 (siz); + + /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */ + r = __builtin_fma (z, invc, -1.0); + y0 = logc + (double) sk; + + /* Polynomial to approximate log1p(r)/ln2. 
*/ +#if LOGDEG == 5 + logx = A[0]; + logx = r * logx + A[1]; + logx = r * logx + A[2]; + logx = r * logx + A[3]; + logx = r * logx + A[4]; + logx = r * logx + y0; +#elif LOGDEG == 4 + logx = A[0]; + logx = r * logx + A[1]; + logx = r * logx + A[2]; + logx = r * logx + A[3]; + logx = r * logx + y0; +#endif + ylogx = sy * logx; + v_set_u32 (&cmp, lane, + (as_u64_f64 (ylogx) >> 47 & 0xffff) + >= as_u64_f64 (126.0 * (1 << SBITS)) >> 47 + ? 1 + : v_get_u32 (cmp, lane)); + + /* N*x = k + r with r in [-1/2, 1/2] */ +#if TOINT_INTRINSICS + kd = roundtoint (ylogx); /* k */ + ki = converttoint (ylogx); +#else +# define SHIFT 0x1.8p52 + kd = eval_as_double (ylogx + SHIFT); + ki = asuint64 (kd); + kd -= SHIFT; +#endif + r = ylogx - kd; + + /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ + t = Texp[ki % (1 << SBITS)]; + t += ki << (52 - SBITS); + s = as_f64_u64 (t); + p = C[0]; + p = __builtin_fma (p, r, C[1]); + p = __builtin_fma (p, r, C[2]); + p = __builtin_fma (p, s * r, s); + + v_set_f32 (&ret, lane, p); + } + if (unlikely (v_any_u32 (cmp))) + return specialcase (x, y, ret, cmp); + return ret; +} +VPCS_ALIAS +#endif diff --git a/math/v_sin.c b/math/v_sin.c new file mode 100644 index 0000000..2b9ed05 --- /dev/null +++ b/math/v_sin.c @@ -0,0 +1,86 @@ +/* + * Double-precision vector sin function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +static const double Poly[] = { +/* worst-case error is 3.5 ulp. + abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. */ +-0x1.9f4a9c8b21dc9p-41, + 0x1.60e88a10163f2p-33, +-0x1.ae6361b7254e7p-26, + 0x1.71de382e8d62bp-19, +-0x1.a01a019aeb4ffp-13, + 0x1.111111110b25ep-7, +-0x1.55555555554c3p-3, +}; + +#define C7 v_f64 (Poly[0]) +#define C6 v_f64 (Poly[1]) +#define C5 v_f64 (Poly[2]) +#define C4 v_f64 (Poly[3]) +#define C3 v_f64 (Poly[4]) +#define C2 v_f64 (Poly[5]) +#define C1 v_f64 (Poly[6]) + +#define InvPi v_f64 (0x1.45f306dc9c883p-2) +#define Pi1 v_f64 (0x1.921fb54442d18p+1) +#define Pi2 v_f64 (0x1.1a62633145c06p-53) +#define Pi3 v_f64 (0x1.c1cd129024e09p-106) +#define Shift v_f64 (0x1.8p52) +#define RangeVal v_f64 (0x1p23) +#define AbsMask v_u64 (0x7fffffffffffffff) + +VPCS_ATTR +__attribute__ ((noinline)) static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +{ + return v_call_f64 (sin, x, y, cmp); +} + +VPCS_ATTR +v_f64_t +V_NAME(sin) (v_f64_t x) +{ + v_f64_t n, r, r2, y; + v_u64_t sign, odd, cmp; + + r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask); + sign = v_as_u64_f64 (x) & ~AbsMask; + cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal)); + + /* n = rint(|x|/pi). */ + n = v_fma_f64 (InvPi, r, Shift); + odd = v_as_u64_f64 (n) << 63; + n -= Shift; + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = v_fma_f64 (-Pi1, n, r); + r = v_fma_f64 (-Pi2, n, r); + r = v_fma_f64 (-Pi3, n, r); + + /* sin(r) poly approx. */ + r2 = r * r; + y = v_fma_f64 (C7, r2, C6); + y = v_fma_f64 (y, r2, C5); + y = v_fma_f64 (y, r2, C4); + y = v_fma_f64 (y, r2, C3); + y = v_fma_f64 (y, r2, C2); + y = v_fma_f64 (y, r2, C1); + y = v_fma_f64 (y * r2, r, r); + + /* sign. */ + y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign ^ odd); + + if (unlikely (v_any_u64 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS +#endif diff --git a/math/v_sinf.c b/math/v_sinf.c new file mode 100644 index 0000000..e66bfce --- /dev/null +++ b/math/v_sinf.c @@ -0,0 +1,75 @@ +/* + * Single-precision vector sin function. 
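The per-lane body of __v_powf above is a table-driven version of the usual decomposition powf(x, y) = exp2(y * log2(x)), carried out in double precision so the float result can still meet the quoted 2.6 ulp bound. A reduced model of the fast path, leaning on libm for the two sub-steps that the real code replaces with the Tlog/Texp tables and polynomials (illustrative only, and only valid for the inputs the fast path accepts, i.e. positive normal x without overflow in the result):

#include <math.h>

/* Model of the __v_powf fast path: the real code computes log2(x) from the
   Tlog table plus a short polynomial, and exp2(ylogx) from the Texp table
   plus another polynomial; inputs that fail the range checks go to the
   scalar powf fallback instead.  */
static float
powf_model (float x, float y)
{
  double logx = log2 ((double) x);    /* Tlog lookup + polynomial in the code */
  double ylogx = (double) y * logx;   /* magnitude checked against 126 * 2^SBITS */
  return (float) exp2 (ylogx);        /* Texp lookup + polynomial in the code */
}

The SCALE factor baked into the tables keeps ylogx in units of 1/2^SBITS, so rounding it to an integer directly yields the Texp index plus the exponent increment; the Texp entries appear to be asuint64(2^(i/32)) with i << 47 subtracted, which is why the code can simply add ki << (52 - SBITS) and reinterpret the sum as the scale factor s.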
+ * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +static const float Poly[] = { + /* 1.886 ulp error */ + 0x1.5b2e76p-19f, + -0x1.9f42eap-13f, + 0x1.110df4p-7f, + -0x1.555548p-3f, +}; +#define Pi1 v_f32 (0x1.921fb6p+1f) +#define Pi2 v_f32 (-0x1.777a5cp-24f) +#define Pi3 v_f32 (-0x1.ee59dap-49f) +#define A3 v_f32 (Poly[3]) +#define A5 v_f32 (Poly[2]) +#define A7 v_f32 (Poly[1]) +#define A9 v_f32 (Poly[0]) +#define RangeVal v_f32 (0x1p20f) +#define InvPi v_f32 (0x1.45f306p-2f) +#define Shift v_f32 (0x1.8p+23f) +#define AbsMask v_u32 (0x7fffffff) + +VPCS_ATTR +static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + /* Fall back to scalar code. */ + return v_call_f32 (sinf, x, y, cmp); +} + +VPCS_ATTR +v_f32_t +V_NAME(sinf) (v_f32_t x) +{ + v_f32_t n, r, r2, y; + v_u32_t sign, odd, cmp; + + r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask); + sign = v_as_u32_f32 (x) & ~AbsMask; + cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal)); + + /* n = rint(|x|/pi) */ + n = v_fma_f32 (InvPi, r, Shift); + odd = v_as_u32_f32 (n) << 31; + n -= Shift; + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ + r = v_fma_f32 (-Pi1, n, r); + r = v_fma_f32 (-Pi2, n, r); + r = v_fma_f32 (-Pi3, n, r); + + /* y = sin(r) */ + r2 = r * r; + y = v_fma_f32 (A9, r2, A7); + y = v_fma_f32 (y, r2, A5); + y = v_fma_f32 (y, r2, A3); + y = v_fma_f32 (y * r2, r, r); + + /* sign fix */ + y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign ^ odd); + + if (unlikely (v_any_u32 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS +#endif diff --git a/math/vn_cos.c b/math/vn_cos.c new file mode 100644 index 0000000..b57a549 --- /dev/null +++ b/math/vn_cos.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_cos. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_cos, _ZGVnN2v_cos) +#include "v_cos.c" +#endif diff --git a/math/vn_cosf.c b/math/vn_cosf.c new file mode 100644 index 0000000..6321d46 --- /dev/null +++ b/math/vn_cosf.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_cosf. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_cosf, _ZGVnN4v_cosf) +#include "v_cosf.c" +#endif diff --git a/math/vn_exp.c b/math/vn_exp.c new file mode 100644 index 0000000..06e269d --- /dev/null +++ b/math/vn_exp.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_exp. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_exp, _ZGVnN2v_exp) +#include "v_exp.c" +#endif diff --git a/math/vn_exp2f.c b/math/vn_exp2f.c new file mode 100644 index 0000000..db9707e --- /dev/null +++ b/math/vn_exp2f.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_exp2f. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_exp2f, _ZGVnN4v_exp2f) +#include "v_exp2f.c" +#endif diff --git a/math/vn_exp2f_1u.c b/math/vn_exp2f_1u.c new file mode 100644 index 0000000..17bd0ab --- /dev/null +++ b/math/vn_exp2f_1u.c @@ -0,0 +1,11 @@ +/* + * AdvSIMD vector PCS variant of __v_exp2f_1u. + * + * Copyright (c) 2019, Arm Limited. 
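Both __v_sin above and __v_sinf here use the same shape of argument reduction: fold |x| around the nearest multiple of pi, evaluate an odd polynomial on the remainder, and recover the sign from the sign bit of x and the parity of the multiple. A scalar sketch of that reduction (not the project's scalar sin; the constants are the InvPi and Pi1/Pi2/Pi3 values from __v_sin, and libm's sin stands in for the polynomial):

#include <math.h>

static double
sin_model (double x)
{
  if (!(fabs (x) < 0x1p52))                      /* like RangeVal: punt on huge/NaN */
    return sin (x);

  double r = fabs (x);
  double n = round (r * 0x1.45f306dc9c883p-2);   /* n = nearest multiple of pi */

  /* Subtract n*pi in three pieces so the remainder stays accurate even
     when the leading digits cancel.  */
  r = fma (-0x1.921fb54442d18p+1, n, r);         /* r -= n * Pi1 */
  r = fma (-0x1.1a62633145c06p-53, n, r);        /* r -= n * Pi2 */
  r = fma (-0x1.c1cd129024e09p-106, n, r);       /* r -= n * Pi3 */

  double core = sin (r);                         /* odd polynomial in the real code */
  double result = ((long long) n & 1) ? -core : core;  /* parity flips the sign */
  return x < 0 ? -result : result;               /* restore the sign of x */
}

The vector code gets the parity for free from the Shift trick: after adding 0x1.8p52 the integer part of n sits in the low mantissa bits, so shifting the raw representation left by 63 isolates the lowest bit of n as a ready-made sign mask to XOR into the result.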
+ * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#include "v_exp2f_1u.c" +#endif diff --git a/math/vn_expf.c b/math/vn_expf.c new file mode 100644 index 0000000..0652907 --- /dev/null +++ b/math/vn_expf.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_expf. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf) +#include "v_expf.c" +#endif diff --git a/math/vn_expf_1u.c b/math/vn_expf_1u.c new file mode 100644 index 0000000..3be7768 --- /dev/null +++ b/math/vn_expf_1u.c @@ -0,0 +1,11 @@ +/* + * AdvSIMD vector PCS variant of __v_expf_1u. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#include "v_expf_1u.c" +#endif diff --git a/math/vn_log.c b/math/vn_log.c new file mode 100644 index 0000000..b58fe8f --- /dev/null +++ b/math/vn_log.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_log, _ZGVnN2v_log) +#include "v_log.c" +#endif diff --git a/math/vn_logf.c b/math/vn_logf.c new file mode 100644 index 0000000..cc5b8ae --- /dev/null +++ b/math/vn_logf.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_logf. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_logf, _ZGVnN4v_logf) +#include "v_logf.c" +#endif diff --git a/math/vn_pow.c b/math/vn_pow.c new file mode 100644 index 0000000..2609501 --- /dev/null +++ b/math/vn_pow.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_pow. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_pow, _ZGVnN2vv_pow) +#include "v_pow.c" +#endif diff --git a/math/vn_powf.c b/math/vn_powf.c new file mode 100644 index 0000000..095d07e --- /dev/null +++ b/math/vn_powf.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_powf. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_powf, _ZGVnN4vv_powf) +#include "v_powf.c" +#endif diff --git a/math/vn_sin.c b/math/vn_sin.c new file mode 100644 index 0000000..905c796 --- /dev/null +++ b/math/vn_sin.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_sin. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_sin, _ZGVnN2v_sin) +#include "v_sin.c" +#endif diff --git a/math/vn_sinf.c b/math/vn_sinf.c new file mode 100644 index 0000000..1214e1a --- /dev/null +++ b/math/vn_sinf.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_sinf. + * + * Copyright (c) 2019, Arm Limited. 
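Each vn_*.c wrapper above rebuilds the corresponding __v_* routine with the vector PCS and publishes it under the AArch64 vector-function-ABI name via strong_alias; in the _ZGVnN4v_sinf form, 'n' selects AdvSIMD, 'N' means unmasked, '4' is the lane count and 'v' marks one vector argument, so the symbol is what a vectorizing compiler looks for when it turns a call to sinf into a vector call. A sketch of the conventional GNU C alias idiom (the macro body shown is the usual form and an assumption here, not a quote of the repo's headers; my_vn_sinf is an illustrative stand-in):

#include <arm_neon.h>

/* Usual GNU C strong-alias idiom: make 'aliasname' a second symbol for
   'name' with the same type.  */
#define strong_alias(name, aliasname) \
  extern __typeof (name) aliasname __attribute__ ((alias (#name)));

/* Stand-in for __vn_sinf: a 4-lane float routine built with the vector PCS.  */
__attribute__ ((aarch64_vector_pcs)) float32x4_t
my_vn_sinf (float32x4_t x)
{
  return x;   /* placeholder body; the real kernel lives in v_sinf.c */
}

/* Publish it under the vector-ABI name, as vn_sinf.c does for __vn_sinf.  */
strong_alias (my_vn_sinf, _ZGVnN4v_sinf)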
+ * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_sinf, _ZGVnN4v_sinf) +#include "v_sinf.c" +#endif diff --git a/networking/Dir.mk b/networking/Dir.mk index 2589e0a..b496103 100644 --- a/networking/Dir.mk +++ b/networking/Dir.mk @@ -1,7 +1,7 @@ # Makefile fragment - requires GNU make # # Copyright (c) 2019-2020, Arm Limited. -# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +# SPDX-License-Identifier: MIT S := $(srcdir)/networking B := build/networking diff --git a/networking/aarch64/chksum_simd.c b/networking/aarch64/chksum_simd.c index 90c00eb..6d5be58 100644 --- a/networking/aarch64/chksum_simd.c +++ b/networking/aarch64/chksum_simd.c @@ -2,7 +2,7 @@ * AArch64-specific checksum implementation using NEON * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "networking.h" diff --git a/networking/arm/chksum_simd.c b/networking/arm/chksum_simd.c index ae08fe5..7f69adf 100644 --- a/networking/arm/chksum_simd.c +++ b/networking/arm/chksum_simd.c @@ -2,7 +2,7 @@ * Armv7-A specific checksum implementation using NEON * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "networking.h" diff --git a/networking/chksum.c b/networking/chksum.c index 329482f..95ce5ba 100644 --- a/networking/chksum.c +++ b/networking/chksum.c @@ -3,7 +3,7 @@ * This sum is often used as a simple checksum in networking. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "networking.h" diff --git a/networking/chksum_common.h b/networking/chksum_common.h index 16f0f6c..958c8cc 100644 --- a/networking/chksum_common.h +++ b/networking/chksum_common.h @@ -2,7 +2,7 @@ * Common code for checksum implementations * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #ifndef CHKSUM_COMMON_H diff --git a/networking/include/networking.h b/networking/include/networking.h index 297dd4b..a88feff 100644 --- a/networking/include/networking.h +++ b/networking/include/networking.h @@ -2,7 +2,7 @@ * Public API. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ unsigned short __chksum (const void *, unsigned int); diff --git a/networking/test/chksum.c b/networking/test/chksum.c index 239b5b8..41b9812 100644 --- a/networking/test/chksum.c +++ b/networking/test/chksum.c @@ -2,7 +2,7 @@ * Ones' complement checksum test & benchmark * * Copyright (c) 2016-2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #define _GNU_SOURCE diff --git a/string/Dir.mk b/string/Dir.mk index 40ff5ac..cf3453f 100644 --- a/string/Dir.mk +++ b/string/Dir.mk @@ -1,7 +1,7 @@ # Makefile fragment - requires GNU make # # Copyright (c) 2019-2021, Arm Limited. -# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +# SPDX-License-Identifier: MIT S := $(srcdir)/string B := build/string diff --git a/string/README.contributors b/string/README.contributors deleted file mode 100644 index 0b4a51b..0000000 --- a/string/README.contributors +++ /dev/null @@ -1,30 +0,0 @@ -STYLE REQUIREMENTS -================== - -1. 
Most code in this sub-directory is expected to be upstreamed into glibc so - the GNU Coding Standard and glibc specific conventions should be followed - to ease upstreaming. - -2. ABI and symbols: the code should be written so it is suitable for inclusion - into a libc with minimal changes. This e.g. means that internal symbols - should be hidden and in the implementation reserved namespace according to - ISO C and POSIX rules. If possible the built shared libraries and static - library archives should be usable to override libc symbols at link time (or - at runtime via LD_PRELOAD). This requires the symbols to follow the glibc ABI - (other than symbol versioning), this cannot be done reliably for static - linking so this is a best effort requirement. - -3. API: include headers should be suitable for benchmarking and testing code - and should not conflict with libc headers. - - -CONTRIBUTION GUIDELINES FOR string SUB-DIRECTORY -================================================ -1. Code: - - The assumptions of the code must be clearly documented. - - - Assembly style should be consistent across different implementations. - - -2. Performance: - - Benchmarking is needed on several microarchitectures. diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S index 207e229..84339f7 100644 --- a/string/aarch64/__mtag_tag_region.S +++ b/string/aarch64/__mtag_tag_region.S @@ -1,8 +1,8 @@ /* * __mtag_tag_region - tag memory * - * Copyright (c) 2021-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2021, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -15,7 +15,7 @@ * The memory region may remain untagged if tagging is not enabled. */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_MEMORY_TAGGING diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S index 44b8e01..f58364c 100644 --- a/string/aarch64/__mtag_tag_zero_region.S +++ b/string/aarch64/__mtag_tag_zero_region.S @@ -1,8 +1,8 @@ /* * __mtag_tag_zero_region - tag memory and fill it with zero bytes * - * Copyright (c) 2021-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2021, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -15,7 +15,7 @@ * The memory region may remain untagged if tagging is not enabled. */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_MEMORY_TAGGING diff --git a/string/aarch64/asmdefs.h b/string/aarch64/asmdefs.h deleted file mode 100644 index 131b95e..0000000 --- a/string/aarch64/asmdefs.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Macros for asm code. AArch64 version. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef _ASMDEFS_H -#define _ASMDEFS_H - -/* Branch Target Identitication support. */ -#define BTI_C hint 34 -#define BTI_J hint 36 -/* Return address signing support (pac-ret). */ -#define PACIASP hint 25; .cfi_window_save -#define AUTIASP hint 29; .cfi_window_save - -/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ -#define FEATURE_1_AND 0xc0000000 -#define FEATURE_1_BTI 1 -#define FEATURE_1_PAC 2 - -/* Add a NT_GNU_PROPERTY_TYPE_0 note. 
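The property-note machinery in the asmdefs.h deleted below exists because hand-written assembly does not get the BTI/PAC GNU property note that the compiler emits for C objects; without it, a BTI-enabled link treats the whole output as unprotected. For C code the same configuration is visible through the ACLE feature-test macros, so a translation unit can report what -mbranch-protection= gave it; a small sketch (the macro names are the ACLE ones and are an assumption here, nothing this patch defines):

#include <stdio.h>

int
main (void)
{
#ifdef __ARM_FEATURE_BTI_DEFAULT
  puts ("built with BTI landing pads");
#endif
#ifdef __ARM_FEATURE_PAC_DEFAULT
  printf ("built with return-address signing (config %d)\n",
          __ARM_FEATURE_PAC_DEFAULT);
#endif
  return 0;
}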
*/ -#ifdef __ILP32__ -#define GNU_PROPERTY(type, value) \ - .section .note.gnu.property, "a"; \ - .p2align 2; \ - .word 4; \ - .word 12; \ - .word 5; \ - .asciz "GNU"; \ - .word type; \ - .word 4; \ - .word value; \ - .text -#else -#define GNU_PROPERTY(type, value) \ - .section .note.gnu.property, "a"; \ - .p2align 3; \ - .word 4; \ - .word 16; \ - .word 5; \ - .asciz "GNU"; \ - .word type; \ - .word 4; \ - .word value; \ - .word 0; \ - .text -#endif - -/* If set then the GNU Property Note section will be added to - mark objects to support BTI and PAC-RET. */ -#ifndef WANT_GNU_PROPERTY -#define WANT_GNU_PROPERTY 1 -#endif - -#if WANT_GNU_PROPERTY -/* Add property note with supported features to all asm files. */ -GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) -#endif - -#define ENTRY_ALIGN(name, alignment) \ - .global name; \ - .type name,%function; \ - .align alignment; \ - name: \ - .cfi_startproc; \ - BTI_C; - -#define ENTRY(name) ENTRY_ALIGN(name, 6) - -#define ENTRY_ALIAS(name) \ - .global name; \ - .type name,%function; \ - name: - -#define END(name) \ - .cfi_endproc; \ - .size name, .-name; - -#define L(l) .L ## l - -#ifdef __ILP32__ - /* Sanitize padding bits of pointer arguments as per aapcs64 */ -#define PTR_ARG(n) mov w##n, w##n -#else -#define PTR_ARG(n) -#endif - -#ifdef __ILP32__ - /* Sanitize padding bits of size arguments as per aapcs64 */ -#define SIZE_ARG(n) mov w##n, w##n -#else -#define SIZE_ARG(n) -#endif - -/* Compiler supports SVE instructions */ -#ifndef HAVE_SVE -# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5) -# define HAVE_SVE 1 -# else -# define HAVE_SVE 0 -# endif -#endif - -#endif diff --git a/string/aarch64/check-arch.S b/string/aarch64/check-arch.S index 131b7fa..5a54242 100644 --- a/string/aarch64/check-arch.S +++ b/string/aarch64/check-arch.S @@ -1,8 +1,8 @@ /* * check ARCH setting. * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #if !__aarch64__ @@ -10,4 +10,4 @@ #endif /* Include for GNU property notes. */ -#include "asmdefs.h" +#include "../asmdefs.h" diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S index 948c3cb..c2e967d 100644 --- a/string/aarch64/memchr-mte.S +++ b/string/aarch64/memchr-mte.S @@ -1,8 +1,8 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "asmdefs.h" +#include "../asmdefs.h" #define srcin x0 #define chrin w1 @@ -23,21 +23,25 @@ #define synd x5 #define shift x6 #define tmp x7 +#define wtmp w7 #define vrepchr v0 #define qdata q1 #define vdata v1 #define vhas_chr v2 -#define vend v3 -#define dend d3 +#define vrepmask v3 +#define vend v4 +#define dend d4 /* Core algorithm: - For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits - per byte. We take 4 bits of every comparison byte with shift right and narrow - by 4 instruction. Since the bits in the nibble mask reflect the order in - which things occur in the original string, counting leading zeros identifies - exactly which byte matched. */ + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. 
Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ ENTRY (__memchr_aarch64_mte) PTR_ARG (0) @@ -46,53 +50,55 @@ ENTRY (__memchr_aarch64_mte) cbz cntin, L(nomatch) ld1 {vdata.16b}, [src] dup vrepchr.16b, chrin + mov wtmp, 0xf00f + dup vrepmask.8h, wtmp cmeq vhas_chr.16b, vdata.16b, vrepchr.16b lsl shift, srcin, 2 - shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(start_loop) rbit synd, synd clz synd, synd - cmp cntin, synd, lsr 2 add result, srcin, synd, lsr 2 + cmp cntin, synd, lsr 2 csel result, result, xzr, hi ret - .p2align 3 L(start_loop): sub tmp, src, srcin - add tmp, tmp, 17 + add tmp, tmp, 16 subs cntrem, cntin, tmp - b.lo L(nomatch) + b.ls L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - tbz cntrem, 4, L(loop32_2) - sub src, src, 16 + add tmp, cntrem, 15 + tbnz tmp, 4, L(loop32_2) + .p2align 4 L(loop32): - ldr qdata, [src, 32]! + ldr qdata, [src, 16]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src, 16] - cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + ldr qdata, [src, 16]! subs cntrem, cntrem, 32 - b.lo L(end_2) + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + b.ls L(end) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) -L(end_2): - add src, src, 16 L(end): - shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ - sub cntrem, src, srcin + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend - sub cntrem, cntin, cntrem + add tmp, srcin, cntin + sub cntrem, tmp, src #ifndef __AARCH64EB__ rbit synd, synd #endif diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S index b851cf3..c22e659 100644 --- a/string/aarch64/memchr-sve.S +++ b/string/aarch64/memchr-sve.S @@ -1,11 +1,11 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2018-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S index fe6cfe2..353f0d1 100644 --- a/string/aarch64/memchr.S +++ b/string/aarch64/memchr.S @@ -1,8 +1,8 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2014-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2014-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "asmdefs.h" +#include "../asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S index d52ce45..78c5eca 100644 --- a/string/aarch64/memcmp-sve.S +++ b/string/aarch64/memcmp-sve.S @@ -1,11 +1,11 @@ /* * memcmp - compare memory * - * Copyright (c) 2018-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2021, Arm Limited. 
+ * SPDX-License-Identifier: MIT */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S index 35135e7..3b10266 100644 --- a/string/aarch64/memcmp.S +++ b/string/aarch64/memcmp.S @@ -1,84 +1,103 @@ /* memcmp - compare memory * - * Copyright (c) 2013-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2013-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: * - * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. + * ARMv8-a, AArch64, unaligned accesses. */ -#include "asmdefs.h" +#include "../asmdefs.h" -#define src1 x0 -#define src2 x1 -#define limit x2 -#define result w0 - -#define data1 x3 -#define data1w w3 -#define data2 x4 -#define data2w w4 -#define data3 x5 -#define data3w w5 -#define data4 x6 -#define data4w w6 -#define tmp x6 -#define src1end x7 -#define src2end x8 +/* Parameters and result. */ +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result w0 +/* Internal variables. */ +#define data1 x3 +#define data1w w3 +#define data1h x4 +#define data2 x5 +#define data2w w5 +#define data2h x6 +#define tmp1 x7 +#define tmp2 x8 ENTRY (__memcmp_aarch64) PTR_ARG (0) PTR_ARG (1) SIZE_ARG (2) + subs limit, limit, 8 + b.lo L(less8) - cmp limit, 16 - b.lo L(less16) - ldp data1, data3, [src1] - ldp data2, data4, [src2] - ccmp data1, data2, 0, ne - ccmp data3, data4, 0, eq - b.ne L(return2) - - add src1end, src1, limit - add src2end, src2, limit - cmp limit, 32 - b.ls L(last_bytes) - cmp limit, 160 - b.hs L(loop_align) - sub limit, limit, 32 + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + b.ne L(return) - .p2align 4 -L(loop32): - ldp data1, data3, [src1, 16] - ldp data2, data4, [src2, 16] + subs limit, limit, 8 + b.gt L(more16) + + ldr data1, [src1, limit] + ldr data2, [src2, limit] + b L(return) + +L(more16): + ldr data1, [src1], 8 + ldr data2, [src2], 8 cmp data1, data2 - ccmp data3, data4, 0, eq - b.ne L(return2) - cmp limit, 16 + bne L(return) + + /* Jump directly to comparing the last 16 bytes for 32 byte (or less) + strings. */ + subs limit, limit, 16 b.ls L(last_bytes) - ldp data1, data3, [src1, 32] - ldp data2, data4, [src2, 32] + /* We overlap loads between 0-32 bytes at either side of SRC1 when we + try to align, so limit it only to strings larger than 128 bytes. */ + cmp limit, 96 + b.ls L(loop16) + + /* Align src1 and adjust src2 with bytes not yet done. */ + and tmp1, src1, 15 + add limit, limit, tmp1 + sub src1, src1, tmp1 + sub src2, src2, tmp1 + + /* Loop performing 16 bytes per iteration using aligned src1. + Limit is pre-decremented by 16 and must be larger than zero. + Exit if <= 16 bytes left to do or if the data is not equal. */ + .p2align 4 +L(loop16): + ldp data1, data1h, [src1], 16 + ldp data2, data2h, [src2], 16 + subs limit, limit, 16 + ccmp data1, data2, 0, hi + ccmp data1h, data2h, 0, eq + b.eq L(loop16) + + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h cmp data1, data2 - ccmp data3, data4, 0, eq - b.ne L(return2) - add src1, src1, 32 - add src2, src2, 32 -L(last64): - subs limit, limit, 32 - b.hi L(loop32) + bne L(return) /* Compare last 1-16 bytes using unaligned access. 
*/ L(last_bytes): - ldp data1, data3, [src1end, -16] - ldp data2, data4, [src2end, -16] -L(return2): + add src1, src1, limit + add src2, src2, limit + ldp data1, data1h, [src1] + ldp data2, data2h, [src2] + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h cmp data1, data2 - csel data1, data1, data3, ne - csel data2, data2, data4, ne /* Compare data bytes and set return value to 0, -1 or 1. */ L(return): @@ -86,105 +105,33 @@ L(return): rev data1, data1 rev data2, data2 #endif - cmp data1, data2 + cmp data1, data2 +L(ret_eq): cset result, ne cneg result, result, lo ret .p2align 4 -L(less16): - add src1end, src1, limit - add src2end, src2, limit - tbz limit, 3, L(less8) - ldr data1, [src1] - ldr data2, [src2] - ldr data3, [src1end, -8] - ldr data4, [src2end, -8] - b L(return2) - - .p2align 4 + /* Compare up to 8 bytes. Limit is [-8..-1]. */ L(less8): - tbz limit, 2, L(less4) - ldr data1w, [src1] - ldr data2w, [src2] - ldr data3w, [src1end, -4] - ldr data4w, [src2end, -4] - b L(return2) - -L(less4): - tbz limit, 1, L(less2) - ldrh data1w, [src1] - ldrh data2w, [src2] + adds limit, limit, 4 + b.lo L(less4) + ldr data1w, [src1], 4 + ldr data2w, [src2], 4 cmp data1w, data2w b.ne L(return) -L(less2): - mov result, 0 - tbz limit, 0, L(return_zero) - ldrb data1w, [src1end, -1] - ldrb data2w, [src2end, -1] + sub limit, limit, 4 +L(less4): + adds limit, limit, 4 + beq L(ret_eq) +L(byte_loop): + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + subs limit, limit, 1 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ + b.eq L(byte_loop) sub result, data1w, data2w -L(return_zero): - ret - -L(loop_align): - ldp data1, data3, [src1, 16] - ldp data2, data4, [src2, 16] - cmp data1, data2 - ccmp data3, data4, 0, eq - b.ne L(return2) - - /* Align src2 and adjust src1, src2 and limit. */ - and tmp, src2, 15 - sub tmp, tmp, 16 - sub src2, src2, tmp - add limit, limit, tmp - sub src1, src1, tmp - sub limit, limit, 64 + 16 - - .p2align 4 -L(loop64): - ldr q0, [src1, 16] - ldr q1, [src2, 16] - subs limit, limit, 64 - ldr q2, [src1, 32] - ldr q3, [src2, 32] - eor v0.16b, v0.16b, v1.16b - eor v1.16b, v2.16b, v3.16b - ldr q2, [src1, 48] - ldr q3, [src2, 48] - umaxp v0.16b, v0.16b, v1.16b - ldr q4, [src1, 64]! - ldr q5, [src2, 64]! - eor v1.16b, v2.16b, v3.16b - eor v2.16b, v4.16b, v5.16b - umaxp v1.16b, v1.16b, v2.16b - umaxp v0.16b, v0.16b, v1.16b - umaxp v0.16b, v0.16b, v0.16b - fmov tmp, d0 - ccmp tmp, 0, 0, hi - b.eq L(loop64) - - /* If equal, process last 1-64 bytes using scalar loop. */ - add limit, limit, 64 + 16 - cbz tmp, L(last64) - - /* Determine the 8-byte aligned offset of the first difference. */ -#ifdef __AARCH64EB__ - rev16 tmp, tmp -#endif - rev tmp, tmp - clz tmp, tmp - bic tmp, tmp, 7 - sub tmp, tmp, 48 - ldr data1, [src1, tmp] - ldr data2, [src2, tmp] -#ifndef __AARCH64EB__ - rev data1, data1 - rev data2, data2 -#endif - mov result, 1 - cmp data1, data2 - cneg result, result, lo ret END (__memcmp_aarch64) + diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S index e6527d0..f97f2c3 100644 --- a/string/aarch64/memcpy-advsimd.S +++ b/string/aarch64/memcpy-advsimd.S @@ -1,8 +1,8 @@ /* * memcpy - copy memory area * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2020, Arm Limited. 
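Once the restored memcmp above has found a pair of 8-byte words that differ (or has loaded the final bytes), the rev/cmp/cset/cneg sequence turns them into the usual negative/zero/positive result: byte-swapping makes the first differing byte the most significant one, so an unsigned compare of the swapped words has the right sign. The same computation in C, assuming a little-endian host and the GCC/Clang bswap builtin, to match the #ifndef __AARCH64EB__ path (helper name illustrative):

#include <stdint.h>

/* Final-compare step of the word-at-a-time memcmp: data1/data2 are two
   corresponding 8-byte loads.  */
static int
cmp_words_le (uint64_t data1, uint64_t data2)
{
  data1 = __builtin_bswap64 (data1);   /* first byte becomes most significant */
  data2 = __builtin_bswap64 (data2);
  if (data1 == data2)
    return 0;
  return data1 < data2 ? -1 : 1;       /* cset ne / cneg lo in the assembly */
}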
+ * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * */ -#include "asmdefs.h" +#include "../asmdefs.h" #define dstin x0 #define src x1 diff --git a/string/aarch64/memcpy-mops.S b/string/aarch64/memcpy-mops.S deleted file mode 100644 index b45c314..0000000 --- a/string/aarch64/memcpy-mops.S +++ /dev/null @@ -1,21 +0,0 @@ -/* - * memcpy using MOPS extension. - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "asmdefs.h" - -ENTRY (__memcpy_aarch64_mops) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - - mov x3, x0 - .inst 0x19010443 /* cpyfp [x3]!, [x1]!, x2! */ - .inst 0x19410443 /* cpyfm [x3]!, [x1]!, x2! */ - .inst 0x19810443 /* cpyfe [x3]!, [x1]!, x2! */ - ret - -END (__memcpy_aarch64_mops) diff --git a/string/aarch64/memcpy-sve.S b/string/aarch64/memcpy-sve.S deleted file mode 100644 index e8a946d..0000000 --- a/string/aarch64/memcpy-sve.S +++ /dev/null @@ -1,177 +0,0 @@ -/* - * memcpy - copy memory area - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -/* Assumptions: - * - * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses. - * - */ - -#include "asmdefs.h" - -#ifdef HAVE_SVE - -.arch armv8-a+sve - -#define dstin x0 -#define src x1 -#define count x2 -#define dst x3 -#define srcend x4 -#define dstend x5 -#define tmp1 x6 -#define vlen x6 - -#define A_q q0 -#define B_q q1 -#define C_q q2 -#define D_q q3 -#define E_q q4 -#define F_q q5 -#define G_q q6 -#define H_q q7 - -/* This implementation handles overlaps and supports both memcpy and memmove - from a single entry point. It uses unaligned accesses and branchless - sequences to keep the code small, simple and improve performance. - SVE vectors are used to speedup small copies. - - Copies are split into 3 main cases: small copies of up to 32 bytes, medium - copies of up to 128 bytes, and large copies. The overhead of the overlap - check is negligible since it is only required for large copies. - - Large copies use a software pipelined loop processing 64 bytes per iteration. - The source pointer is 16-byte aligned to minimize unaligned accesses. - The loop tail is handled by always copying 64 bytes from the end. -*/ - -ENTRY_ALIAS (__memmove_aarch64_sve) -ENTRY (__memcpy_aarch64_sve) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - - cmp count, 128 - b.hi L(copy_long) - cntb vlen - cmp count, vlen, lsl 1 - b.hi L(copy32_128) - - whilelo p0.b, xzr, count - whilelo p1.b, vlen, count - ld1b z0.b, p0/z, [src, 0, mul vl] - ld1b z1.b, p1/z, [src, 1, mul vl] - st1b z0.b, p0, [dstin, 0, mul vl] - st1b z1.b, p1, [dstin, 1, mul vl] - ret - - /* Medium copies: 33..128 bytes. */ -L(copy32_128): - add srcend, src, count - add dstend, dstin, count - ldp A_q, B_q, [src] - ldp C_q, D_q, [srcend, -32] - cmp count, 64 - b.hi L(copy128) - stp A_q, B_q, [dstin] - stp C_q, D_q, [dstend, -32] - ret - - /* Copy 65..128 bytes. */ -L(copy128): - ldp E_q, F_q, [src, 32] - cmp count, 96 - b.ls L(copy96) - ldp G_q, H_q, [srcend, -64] - stp G_q, H_q, [dstend, -64] -L(copy96): - stp A_q, B_q, [dstin] - stp E_q, F_q, [dstin, 32] - stp C_q, D_q, [dstend, -32] - ret - - /* Copy more than 128 bytes. */ -L(copy_long): - add srcend, src, count - add dstend, dstin, count - - /* Use backwards copy if there is an overlap. */ - sub tmp1, dstin, src - cmp tmp1, count - b.lo L(copy_long_backwards) - - /* Copy 16 bytes and then align src to 16-byte alignment. 
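The single unsigned compare just above ("sub tmp1, dstin, src; cmp tmp1, count; b.lo ...") is how this file, which serves memcpy and memmove from one entry point, decides between a forward and a backward copy: if the destination lies inside the source range, a forward copy would overwrite bytes it has not read yet. In C the same test is one subtraction, relying on unsigned wrap-around to cover the dst < src case (names are illustrative):

#include <stddef.h>
#include <stdint.h>

/* Non-zero when dst is inside [src, src + count), i.e. when only a
   backward (high-to-low) copy is safe.  */
static int
must_copy_backwards (const void *dst, const void *src, size_t count)
{
  return (uintptr_t) dst - (uintptr_t) src < count;
}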
*/ - ldr D_q, [src] - and tmp1, src, 15 - bic src, src, 15 - sub dst, dstin, tmp1 - add count, count, tmp1 /* Count is now 16 too large. */ - ldp A_q, B_q, [src, 16] - str D_q, [dstin] - ldp C_q, D_q, [src, 48] - subs count, count, 128 + 16 /* Test and readjust count. */ - b.ls L(copy64_from_end) -L(loop64): - stp A_q, B_q, [dst, 16] - ldp A_q, B_q, [src, 80] - stp C_q, D_q, [dst, 48] - ldp C_q, D_q, [src, 112] - add src, src, 64 - add dst, dst, 64 - subs count, count, 64 - b.hi L(loop64) - - /* Write the last iteration and copy 64 bytes from the end. */ -L(copy64_from_end): - ldp E_q, F_q, [srcend, -64] - stp A_q, B_q, [dst, 16] - ldp A_q, B_q, [srcend, -32] - stp C_q, D_q, [dst, 48] - stp E_q, F_q, [dstend, -64] - stp A_q, B_q, [dstend, -32] - ret - - /* Large backwards copy for overlapping copies. - Copy 16 bytes and then align srcend to 16-byte alignment. */ -L(copy_long_backwards): - cbz tmp1, L(return) - ldr D_q, [srcend, -16] - and tmp1, srcend, 15 - bic srcend, srcend, 15 - sub count, count, tmp1 - ldp A_q, B_q, [srcend, -32] - str D_q, [dstend, -16] - ldp C_q, D_q, [srcend, -64] - sub dstend, dstend, tmp1 - subs count, count, 128 - b.ls L(copy64_from_start) - -L(loop64_backwards): - str B_q, [dstend, -16] - str A_q, [dstend, -32] - ldp A_q, B_q, [srcend, -96] - str D_q, [dstend, -48] - str C_q, [dstend, -64]! - ldp C_q, D_q, [srcend, -128] - sub srcend, srcend, 64 - subs count, count, 64 - b.hi L(loop64_backwards) - - /* Write the last iteration and copy 64 bytes from the start. */ -L(copy64_from_start): - ldp E_q, F_q, [src, 32] - stp A_q, B_q, [dstend, -32] - ldp A_q, B_q, [src] - stp C_q, D_q, [dstend, -64] - stp E_q, F_q, [dstin, 32] - stp A_q, B_q, [dstin] -L(return): - ret - -END (__memcpy_aarch64_sve) - -#endif diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S index 2b1a592..8a967cd 100644 --- a/string/aarch64/memcpy.S +++ b/string/aarch64/memcpy.S @@ -1,8 +1,8 @@ /* * memcpy - copy memory area * - * Copyright (c) 2012-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2012-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * */ -#include "asmdefs.h" +#include "../asmdefs.h" #define dstin x0 #define src x1 diff --git a/string/aarch64/memmove-mops.S b/string/aarch64/memmove-mops.S deleted file mode 100644 index 6c73017..0000000 --- a/string/aarch64/memmove-mops.S +++ /dev/null @@ -1,21 +0,0 @@ -/* - * memmove using MOPS extension. - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "asmdefs.h" - -ENTRY (__memmove_aarch64_mops) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - - mov x3, x0 - .inst 0x1d010443 /* cpyp [x3]!, [x1]!, x2! */ - .inst 0x1d410443 /* cpym [x3]!, [x1]!, x2! */ - .inst 0x1d810443 /* cpye [x3]!, [x1]!, x2! */ - ret - -END (__memmove_aarch64_mops) diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S index 6418bdf..7b4be84 100644 --- a/string/aarch64/memrchr.S +++ b/string/aarch64/memrchr.S @@ -1,8 +1,8 @@ /* * memrchr - find last character in a memory zone. * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. 
*/ -#include "asmdefs.h" +#include "../asmdefs.h" #define srcin x0 #define chrin w1 @@ -23,6 +23,7 @@ #define synd x5 #define shift x6 #define tmp x7 +#define wtmp w7 #define end x8 #define endm1 x9 @@ -30,16 +31,19 @@ #define qdata q1 #define vdata v1 #define vhas_chr v2 -#define vend v3 -#define dend d3 +#define vrepmask v3 +#define vend v4 +#define dend d4 /* Core algorithm: - For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits - per byte. We take 4 bits of every comparison byte with shift right and narrow - by 4 instruction. Since the bits in the nibble mask reflect the order in - which things occur in the original string, counting leading zeros identifies - exactly which byte matched. */ + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ ENTRY (__memrchr_aarch64) PTR_ARG (0) @@ -49,9 +53,12 @@ ENTRY (__memrchr_aarch64) cbz cntin, L(nomatch) ld1 {vdata.16b}, [src] dup vrepchr.16b, chrin + mov wtmp, 0xf00f + dup vrepmask.8h, wtmp cmeq vhas_chr.16b, vdata.16b, vrepchr.16b neg shift, end, lsl 2 - shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend lsl synd, synd, shift cbz synd, L(start_loop) @@ -62,36 +69,34 @@ ENTRY (__memrchr_aarch64) csel result, result, xzr, hi ret - nop L(start_loop): - subs cntrem, src, srcin + sub tmp, end, src + subs cntrem, cntin, tmp b.ls L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - sub cntrem, cntrem, 1 - tbz cntrem, 4, L(loop32_2) - add src, src, 16 + add tmp, cntrem, 15 + tbnz tmp, 4, L(loop32_2) - .p2align 5 + .p2align 4 L(loop32): - ldr qdata, [src, -32]! + ldr qdata, [src, -16]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src, -16] + ldr qdata, [src, -16]! subs cntrem, cntrem, 32 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - b.lo L(end_2) + b.ls L(end) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) -L(end_2): - sub src, src, 16 L(end): - shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend add tmp, src, 15 diff --git a/string/aarch64/memset-mops.S b/string/aarch64/memset-mops.S deleted file mode 100644 index ec79149..0000000 --- a/string/aarch64/memset-mops.S +++ /dev/null @@ -1,20 +0,0 @@ -/* - * memset using MOPS extension. - * - * Copyright (c) 2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "asmdefs.h" - -ENTRY (__memset_aarch64_mops) - PTR_ARG (0) - SIZE_ARG (2) - - mov x3, x0 - .inst 0x19c10443 /* setp [x3]!, x2!, x1 */ - .inst 0x19c14443 /* setm [x3]!, x2!, x1 */ - .inst 0x19c18443 /* sete [x3]!, x2!, x1 */ - ret - -END (__memset_aarch64_mops) diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S index 553b0fc..9fcd975 100644 --- a/string/aarch64/memset.S +++ b/string/aarch64/memset.S @@ -1,8 +1,8 @@ /* * memset - fill memory with a constant byte * - * Copyright (c) 2012-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2012-2021, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * */ -#include "asmdefs.h" +#include "../asmdefs.h" #define dstin x0 #define val x1 diff --git a/string/aarch64/stpcpy-mte.S b/string/aarch64/stpcpy-mte.S new file mode 100644 index 0000000..f1c7119 --- /dev/null +++ b/string/aarch64/stpcpy-mte.S @@ -0,0 +1,10 @@ +/* + * stpcpy - copy a string returning pointer to end. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#define BUILD_STPCPY 1 + +#include "strcpy-mte.S" diff --git a/string/aarch64/stpcpy-sve.S b/string/aarch64/stpcpy-sve.S index 5d3f14b..82dd971 100644 --- a/string/aarch64/stpcpy-sve.S +++ b/string/aarch64/stpcpy-sve.S @@ -2,7 +2,7 @@ * stpcpy - copy a string returning pointer to end. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #define BUILD_STPCPY 1 diff --git a/string/aarch64/stpcpy.S b/string/aarch64/stpcpy.S index 155c68d..4f62aa4 100644 --- a/string/aarch64/stpcpy.S +++ b/string/aarch64/stpcpy.S @@ -2,7 +2,7 @@ * stpcpy - copy a string returning pointer to end. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #define BUILD_STPCPY 1 diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S index 6ec08f7..dcb0e46 100644 --- a/string/aarch64/strchr-mte.S +++ b/string/aarch64/strchr-mte.S @@ -1,8 +1,8 @@ /* * strchr - find a character in a string * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "asmdefs.h" +#include "../asmdefs.h" #define srcin x0 #define chrin w1 @@ -19,7 +19,8 @@ #define src x2 #define tmp1 x1 -#define tmp2 x3 +#define wtmp2 w3 +#define tmp3 x3 #define vrepchr v0 #define vdata v1 @@ -27,30 +28,39 @@ #define vhas_nul v2 #define vhas_chr v3 #define vrepmask v4 -#define vend v5 -#define dend d5 +#define vrepmask2 v5 +#define vend v6 +#define dend d6 /* Core algorithm. For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. Bits 0-1 are set if the relevant byte matched the requested - character, bits 2-3 are set if the byte is NUL or matched. Count trailing - zeroes gives the position of the matching byte if it is a multiple of 4. - If it is not a multiple of 4, there was no match. */ + per byte. For even bytes, bits 0-1 are set if the relevant byte matched the + requested character, bits 2-3 are set if the byte is NUL (or matched), and + bits 4-7 are not used and must be zero if none of bits 0-3 are set). Odd + bytes set bits 4-7 so that adjacent bytes can be merged. 
Since the bits + in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ ENTRY (__strchr_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin ld1 {vdata.16b}, [src] - movi vrepmask.16b, 0x33 + mov wtmp2, 0x3003 + dup vrepmask.8h, wtmp2 cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + mov wtmp2, 0xf00f + dup vrepmask2.8h, wtmp2 + bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - lsl tmp2, srcin, 2 - shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ + and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b + lsl tmp3, srcin, 2 + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + fmov tmp1, dend - lsr tmp1, tmp1, tmp2 + lsr tmp1, tmp1, tmp3 cbz tmp1, L(loop) rbit tmp1, tmp1 @@ -64,34 +74,28 @@ ENTRY (__strchr_aarch64_mte) .p2align 4 L(loop): - ldr qdata, [src, 16] - cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b - umaxp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov tmp1, dend - cbnz tmp1, L(end) - ldr qdata, [src, 32]! + ldr qdata, [src, 16]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov tmp1, dend cbz tmp1, L(loop) - sub src, src, 16 -L(end): #ifdef __AARCH64EB__ bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b - shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ + and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ fmov tmp1, dend #else bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ + and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ fmov tmp1, dend rbit tmp1, tmp1 #endif - add src, src, 16 clz tmp1, tmp1 - /* Tmp1 is a multiple of 4 if the target character was found. */ + /* Tmp1 is an even multiple of 2 if the target character was + found first. Otherwise we've found the end of string. */ tst tmp1, 2 add result, src, tmp1, lsr 2 csel result, result, xzr, eq diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S index ff07516..13ba9f4 100644 --- a/string/aarch64/strchr-sve.S +++ b/string/aarch64/strchr-sve.S @@ -1,11 +1,11 @@ /* * strchr/strchrnul - find a character in a string * - * Copyright (c) 2018-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S index 37193bd..1063cbf 100644 --- a/string/aarch64/strchr.S +++ b/string/aarch64/strchr.S @@ -1,8 +1,8 @@ /* * strchr - find a character in a string * - * Copyright (c) 2014-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2014-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "asmdefs.h" +#include "../asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S index 543ee88..1b0d0a6 100644 --- a/string/aarch64/strchrnul-mte.S +++ b/string/aarch64/strchrnul-mte.S @@ -1,8 +1,8 @@ /* * strchrnul - find a character or nul in a string * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "asmdefs.h" +#include "../asmdefs.h" #define srcin x0 #define chrin w1 @@ -20,32 +20,38 @@ #define src x2 #define tmp1 x1 #define tmp2 x3 +#define tmp2w w3 #define vrepchr v0 #define vdata v1 #define qdata q1 #define vhas_nul v2 #define vhas_chr v3 -#define vend v4 -#define dend d4 +#define vrepmask v4 +#define vend v5 +#define dend d5 -/* - Core algorithm: - For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits - per byte. We take 4 bits of every comparison byte with shift right and narrow - by 4 instruction. Since the bits in the nibble mask reflect the order in - which things occur in the original string, counting leading zeros identifies - exactly which byte matched. */ +/* Core algorithm: + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ ENTRY (__strchrnul_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin ld1 {vdata.16b}, [src] + mov tmp2w, 0xf00f + dup vrepmask.8h, tmp2w cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b lsl tmp2, srcin, 2 - shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov tmp1, dend lsr tmp1, tmp1, tmp2 /* Mask padding bits. */ cbz tmp1, L(loop) @@ -57,22 +63,15 @@ ENTRY (__strchrnul_aarch64_mte) .p2align 4 L(loop): - ldr qdata, [src, 16] - cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b - umaxp vend.16b, vhas_chr.16b, vhas_chr.16b - fmov tmp1, dend - cbnz tmp1, L(end) - ldr qdata, [src, 32]! + ldr qdata, [src, 16]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b fmov tmp1, dend cbz tmp1, L(loop) - sub src, src, 16 -L(end): - shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ - add src, src, 16 + + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov tmp1, dend #ifndef __AARCH64EB__ rbit tmp1, tmp1 diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/strchrnul-sve.S index 0005f91..428ff1a 100644 --- a/string/aarch64/strchrnul-sve.S +++ b/string/aarch64/strchrnul-sve.S @@ -2,7 +2,7 @@ * strchrnul - find a character or nul in a string * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #define BUILD_STRCHRNUL diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S index 666e8d0..a4230d9 100644 --- a/string/aarch64/strchrnul.S +++ b/string/aarch64/strchrnul.S @@ -1,8 +1,8 @@ /* * strchrnul - find a character or nul in a string * - * Copyright (c) 2014-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2014-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "asmdefs.h" +#include "../asmdefs.h" /* Arguments and results. 
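The cmeq followed by cmhs in the strchrnul (and strchr) kernels above is a compact way to flag "matches the target character or is NUL" in one extra instruction: after cmeq, matching bytes are 0xff and the rest 0x00, and an unsigned greater-or-equal against the original data stays 0xff for matches while also firing wherever the data byte is zero. The same pair as intrinsics, assuming AArch64 NEON (function and variable names are illustrative):

#include <arm_neon.h>
#include <stdint.h>

/* 0xff in every byte position where data == c or data == 0.  */
static uint8x16_t
match_or_nul (uint8x16_t data, uint8_t c)
{
  uint8x16_t hit = vceqq_u8 (data, vdupq_n_u8 (c));   /* cmeq: 0xff where data == c */
  return vcgeq_u8 (hit, data);                        /* cmhs: also 0xff where data == 0 */
}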
*/ #define srcin x0 diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S new file mode 100644 index 0000000..12d1a6b --- /dev/null +++ b/string/aarch64/strcmp-mte.S @@ -0,0 +1,189 @@ +/* + * strcmp - compare two strings + * + * Copyright (c) 2012-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + + +/* Assumptions: + * + * ARMv8-a, AArch64. + * MTE compatible. + */ + +#include "../asmdefs.h" + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +#define src1 x0 +#define src2 x1 +#define result x0 + +#define data1 x2 +#define data1w w2 +#define data2 x3 +#define data2w w3 +#define has_nul x4 +#define diff x5 +#define off1 x5 +#define syndrome x6 +#define tmp x6 +#define data3 x7 +#define zeroones x8 +#define shift x9 +#define off2 x10 + +/* On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. */ +#ifdef __AARCH64EB__ +# define LS_FW lsl +#else +# define LS_FW lsr +#endif + +/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. + Since carry propagation makes 0x1 bytes before a NUL byte appear + NUL too in big-endian, byte-reverse the data before the NUL check. */ + + +ENTRY (__strcmp_aarch64_mte) + PTR_ARG (0) + PTR_ARG (1) + sub off2, src2, src1 + mov zeroones, REP8_01 + and tmp, src1, 7 + tst off2, 7 + b.ne L(misaligned8) + cbnz tmp, L(mutual_align) + + .p2align 4 + +L(loop_aligned): + ldr data2, [src1, off2] + ldr data1, [src1], 8 +L(start_realigned): +#ifdef __AARCH64EB__ + rev tmp, data1 + sub has_nul, tmp, zeroones + orr tmp, tmp, REP8_7f +#else + sub has_nul, data1, zeroones + orr tmp, data1, REP8_7f +#endif + bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ + ccmp data1, data2, 0, eq + b.eq L(loop_aligned) +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + eor diff, data1, data2 + orr syndrome, diff, has_nul +L(end): +#ifndef __AARCH64EB__ + rev syndrome, syndrome + rev data1, data1 + rev data2, data2 +#endif + clz shift, syndrome + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + lsl data1, data1, shift + lsl data2, data2, shift + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, 56 + sub result, data1, data2, lsr 56 + ret + + .p2align 4 + +L(mutual_align): + /* Sources are mutually aligned, but are not currently at an + alignment boundary. Round down the addresses and then mask off + the bytes that precede the start point. */ + bic src1, src1, 7 + ldr data2, [src1, off2] + ldr data1, [src1], 8 + neg shift, src2, lsl 3 /* Bits to alignment -64. */ + mov tmp, -1 + LS_FW tmp, tmp, shift + orr data1, data1, tmp + orr data2, data2, tmp + b L(start_realigned) + +L(misaligned8): + /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always + checking to make sure that we don't access beyond the end of SRC2. */ + cbz tmp, L(src1_aligned) +L(do_misaligned): + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + cmp data1w, 0 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. 
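The (X - 1) & ~(X | 0x7f) NUL test quoted in the header comment above can be written out in C as follows (illustrative sketch, not part of the patch):

#include <stdint.h>

/* Non-zero iff some byte of x is zero.  Subtracting 0x01 from a zero
   byte borrows and leaves 0xff; masking with ~(x | 0x7f..7f) keeps
   bit 7 only for bytes whose own top bit was clear, so non-zero bytes
   cannot set a bit by themselves.  Borrows can spill into bytes above
   a real NUL, which is why the big-endian paths byte-reverse the data
   before locating the first NUL. */
static int has_nul_byte (uint64_t x)
{
  const uint64_t rep01 = 0x0101010101010101ULL;
  const uint64_t rep7f = 0x7f7f7f7f7f7f7f7fULL;
  return ((x - rep01) & ~(x | rep7f)) != 0;
}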
*/ + b.ne L(done) + tst src1, 7 + b.ne L(do_misaligned) + +L(src1_aligned): + neg shift, src2, lsl 3 + bic src2, src2, 7 + ldr data3, [src2], 8 +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + lsr tmp, zeroones, shift + orr data3, data3, tmp + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + bics has_nul, has_nul, tmp + b.ne L(tail) + + sub off1, src2, src1 + + .p2align 4 + +L(loop_unaligned): + ldr data3, [src1, off1] + ldr data2, [src1, off2] +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + ldr data1, [src1], 8 + bics has_nul, has_nul, tmp + ccmp data1, data2, 0, eq + b.eq L(loop_unaligned) + + lsl tmp, has_nul, shift +#ifdef __AARCH64EB__ + rev tmp, tmp +#endif + eor diff, data1, data2 + orr syndrome, diff, tmp + cbnz syndrome, L(end) +L(tail): + ldr data1, [src1] + neg shift, shift + lsr data2, data3, shift + lsr has_nul, has_nul, shift +#ifdef __AARCH64EB__ + rev data2, data2 + rev has_nul, has_nul +#endif + eor diff, data1, data2 + orr syndrome, diff, has_nul + b L(end) + +L(done): + sub result, data1, data2 + ret + +END (__strcmp_aarch64_mte) + diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S index eaf909a..e6d2da5 100644 --- a/string/aarch64/strcmp-sve.S +++ b/string/aarch64/strcmp-sve.S @@ -1,11 +1,11 @@ /* * __strcmp_aarch64_sve - compare two strings * - * Copyright (c) 2018-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S index 137a9aa..7714ebf 100644 --- a/string/aarch64/strcmp.S +++ b/string/aarch64/strcmp.S @@ -1,184 +1,168 @@ /* * strcmp - compare two strings * - * Copyright (c) 2012-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2012-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ - /* Assumptions: * - * ARMv8-a, AArch64. - * MTE compatible. + * ARMv8-a, AArch64 */ -#include "asmdefs.h" +#include "../asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 +/* Parameters and result. */ #define src1 x0 #define src2 x1 #define result x0 +/* Internal variables. */ #define data1 x2 #define data1w w2 #define data2 x3 #define data2w w3 #define has_nul x4 #define diff x5 -#define off1 x5 #define syndrome x6 -#define tmp x6 -#define data3 x7 -#define zeroones x8 -#define shift x9 -#define off2 x10 - -/* On big-endian early bytes are at MSB and on little-endian LSB. - LS_FW means shifting towards early bytes. */ -#ifdef __AARCH64EB__ -# define LS_FW lsl -#else -# define LS_FW lsr -#endif - -/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. - Since carry propagation makes 0x1 bytes before a NUL byte appear - NUL too in big-endian, byte-reverse the data before the NUL check. */ - +#define tmp1 x7 +#define tmp2 x8 +#define tmp3 x9 +#define zeroones x10 +#define pos x11 + /* Start of performance-critical section -- one 64B cache line. 
*/ ENTRY (__strcmp_aarch64) PTR_ARG (0) PTR_ARG (1) - sub off2, src2, src1 - mov zeroones, REP8_01 - and tmp, src1, 7 - tst off2, 7 + eor tmp1, src1, src2 + mov zeroones, #REP8_01 + tst tmp1, #7 b.ne L(misaligned8) - cbnz tmp, L(mutual_align) - - .p2align 4 - + ands tmp1, src1, #7 + b.ne L(mutual_align) + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. */ L(loop_aligned): - ldr data2, [src1, off2] - ldr data1, [src1], 8 + ldr data1, [src1], #8 + ldr data2, [src2], #8 L(start_realigned): -#ifdef __AARCH64EB__ - rev tmp, data1 - sub has_nul, tmp, zeroones - orr tmp, tmp, REP8_7f -#else - sub has_nul, data1, zeroones - orr tmp, data1, REP8_7f -#endif - bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ - ccmp data1, data2, 0, eq - b.eq L(loop_aligned) -#ifdef __AARCH64EB__ - rev has_nul, has_nul -#endif - eor diff, data1, data2 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ orr syndrome, diff, has_nul + cbz syndrome, L(loop_aligned) + /* End of performance-critical section -- one 64B cache line. */ + L(end): -#ifndef __AARCH64EB__ +#ifndef __AARCH64EB__ rev syndrome, syndrome rev data1, data1 + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + clz pos, syndrome rev data2, data2 -#endif - clz shift, syndrome - /* The most-significant-non-zero bit of the syndrome marks either the - first bit that is different, or the top bit of the first zero byte. + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#else + /* For big-endian we cannot use the trick with the syndrome value + as carry-propagation can corrupt the upper bits if the trailing + bytes in the string contain 0x01. */ + /* However, if there is no NUL byte in the dword, we can generate + the result directly. We can't just subtract the bytes as the + MSB might be significant. */ + cbnz has_nul, 1f + cmp data1, data2 + cset result, ne + cneg result, result, lo + ret +1: + /* Re-compute the NUL-byte detection, using a byte-reversed value. */ + rev tmp3, data1 + sub tmp1, tmp3, zeroones + orr tmp2, tmp3, #REP8_7f + bic has_nul, tmp1, tmp2 + rev has_nul, has_nul + orr syndrome, diff, has_nul + clz pos, syndrome + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. Shifting left now will bring the critical information into the top bits. */ - lsl data1, data1, shift - lsl data2, data2, shift + lsl data1, data1, pos + lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. */ - lsr data1, data1, 56 - sub result, data1, data2, lsr 56 + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 ret - - .p2align 4 +#endif L(mutual_align): /* Sources are mutually aligned, but are not currently at an alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. 
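For the little-endian L(end) path above, the conversion from syndrome to return value can be modelled in C roughly as follows (illustrative only, not part of the patch; __builtin_clzll assumes GCC/Clang, and all three arguments are taken after the rev instructions have byte-reversed them):

#include <stdint.h>

/* data1/data2 hold the first differing dwords and syndrome marks the
   first difference or the top bit of the first NUL byte.  Shifting the
   decisive byte to the top and comparing it zero-extended gives the
   usual strcmp sign convention. */
static int result_from_syndrome (uint64_t data1, uint64_t data2,
                                 uint64_t syndrome)
{
  int shift = __builtin_clzll (syndrome);      /* syndrome != 0 here */
  uint64_t b1 = (data1 << shift) >> 56;
  uint64_t b2 = (data2 << shift) >> 56;
  return (int) b1 - (int) b2;
}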
*/ - bic src1, src1, 7 - ldr data2, [src1, off2] - ldr data1, [src1], 8 - neg shift, src2, lsl 3 /* Bits to alignment -64. */ - mov tmp, -1 - LS_FW tmp, tmp, shift - orr data1, data1, tmp - orr data2, data2, tmp + the bytes that preceed the start point. */ + bic src1, src1, #7 + bic src2, src2, #7 + lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ + ldr data1, [src1], #8 + neg tmp1, tmp1 /* Bits to alignment -64. */ + ldr data2, [src2], #8 + mov tmp2, #~0 +#ifdef __AARCH64EB__ + /* Big-endian. Early bytes are at MSB. */ + lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ +#else + /* Little-endian. Early bytes are at LSB. */ + lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ +#endif + orr data1, data1, tmp2 + orr data2, data2, tmp2 b L(start_realigned) L(misaligned8): /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always - checking to make sure that we don't access beyond the end of SRC2. */ - cbz tmp, L(src1_aligned) + checking to make sure that we don't access beyond page boundary in + SRC2. */ + tst src1, #7 + b.eq L(loop_misaligned) L(do_misaligned): - ldrb data1w, [src1], 1 - ldrb data2w, [src2], 1 - cmp data1w, 0 - ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + cmp data1w, #1 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ b.ne L(done) - tst src1, 7 + tst src1, #7 b.ne L(do_misaligned) -L(src1_aligned): - neg shift, src2, lsl 3 - bic src2, src2, 7 - ldr data3, [src2], 8 -#ifdef __AARCH64EB__ - rev data3, data3 -#endif - lsr tmp, zeroones, shift - orr data3, data3, tmp - sub has_nul, data3, zeroones - orr tmp, data3, REP8_7f - bics has_nul, has_nul, tmp - b.ne L(tail) - - sub off1, src2, src1 - - .p2align 4 - -L(loop_unaligned): - ldr data3, [src1, off1] - ldr data2, [src1, off2] -#ifdef __AARCH64EB__ - rev data3, data3 -#endif - sub has_nul, data3, zeroones - orr tmp, data3, REP8_7f - ldr data1, [src1], 8 - bics has_nul, has_nul, tmp - ccmp data1, data2, 0, eq - b.eq L(loop_unaligned) - - lsl tmp, has_nul, shift -#ifdef __AARCH64EB__ - rev tmp, tmp -#endif - eor diff, data1, data2 - orr syndrome, diff, tmp - cbnz syndrome, L(end) -L(tail): - ldr data1, [src1] - neg shift, shift - lsr data2, data3, shift - lsr has_nul, has_nul, shift -#ifdef __AARCH64EB__ - rev data2, data2 - rev has_nul, has_nul -#endif - eor diff, data1, data2 +L(loop_misaligned): + /* Test if we are within the last dword of the end of a 4K page. If + yes then jump back to the misaligned loop to copy a byte at a time. */ + and tmp1, src2, #0xff8 + eor tmp1, tmp1, #0xff8 + cbz tmp1, L(do_misaligned) + ldr data1, [src1], #8 + ldr data2, [src2], #8 + + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ orr syndrome, diff, has_nul + cbz syndrome, L(loop_misaligned) b L(end) L(done): diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S new file mode 100644 index 0000000..88c222d --- /dev/null +++ b/string/aarch64/strcpy-mte.S @@ -0,0 +1,161 @@ +/* + * strcpy/stpcpy - copy a string returning pointer to start/end. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. 
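The misaligned loop above guards every unaligned SRC2 load with the and/eor/cbz sequence on 0xff8; in C terms the test is simply (illustrative only, not part of the patch):

#include <stdint.h>

/* True when addr falls in the last 8 bytes of a 4 KiB page, in which
   case the next unaligned 8-byte load could touch the following page
   and the code drops back to byte-by-byte comparison. */
static int in_last_dword_of_page (uintptr_t addr)
{
  return (addr & 0xff8) == 0xff8;
}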
+ */ + +#include "../asmdefs.h" + +#define dstin x0 +#define srcin x1 +#define result x0 + +#define src x2 +#define dst x3 +#define len x4 +#define synd x4 +#define tmp x5 +#define wtmp w5 +#define shift x5 +#define data1 x6 +#define dataw1 w6 +#define data2 x7 +#define dataw2 w7 + +#define dataq q0 +#define vdata v0 +#define vhas_nul v1 +#define vrepmask v2 +#define vend v3 +#define dend d3 +#define dataq2 q1 + +#ifdef BUILD_STPCPY +# define STRCPY __stpcpy_aarch64_mte +# define IFSTPCPY(X,...) X,__VA_ARGS__ +#else +# define STRCPY __strcpy_aarch64_mte +# define IFSTPCPY(X,...) +#endif + +/* Core algorithm: + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ + +ENTRY (STRCPY) + PTR_ARG (0) + PTR_ARG (1) + bic src, srcin, 15 + mov wtmp, 0xf00f + ld1 {vdata.16b}, [src] + dup vrepmask.8h, wtmp + cmeq vhas_nul.16b, vdata.16b, 0 + lsl shift, srcin, 2 + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + lsr synd, synd, shift + cbnz synd, L(tail) + + ldr dataq, [src, 16]! + cmeq vhas_nul.16b, vdata.16b, 0 + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(start_loop) + +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + sub tmp, src, srcin + clz len, synd + add len, tmp, len, lsr 2 + tbz len, 4, L(less16) + sub tmp, len, 15 + ldr dataq, [srcin] + ldr dataq2, [srcin, tmp] + str dataq, [dstin] + str dataq2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret + + .p2align 4,,8 +L(tail): + rbit synd, synd + clz len, synd + lsr len, len, 2 + + .p2align 4 +L(less16): + tbz len, 3, L(less8) + sub tmp, len, 7 + ldr data1, [srcin] + ldr data2, [srcin, tmp] + str data1, [dstin] + str data2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret + + .p2align 4 +L(less8): + subs tmp, len, 3 + b.lo L(less4) + ldr dataw1, [srcin] + ldr dataw2, [srcin, tmp] + str dataw1, [dstin] + str dataw2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret + +L(less4): + cbz len, L(zerobyte) + ldrh dataw1, [srcin] + strh dataw1, [dstin] +L(zerobyte): + strb wzr, [dstin, len] + IFSTPCPY (add result, dstin, len) + ret + + .p2align 4 +L(start_loop): + sub len, src, srcin + ldr dataq2, [srcin] + add dst, dstin, len + str dataq2, [dstin] + + .p2align 5 +L(loop): + str dataq, [dst], 16 + ldr dataq, [src, 16]! + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(loop) + + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + fmov synd, dend +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + clz len, synd + lsr len, len, 2 + sub tmp, len, 15 + ldr dataq, [src, tmp] + str dataq, [dst, tmp] + IFSTPCPY (add result, dst, len) + ret + +END (STRCPY) diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S index 00e72dc..f515462 100644 --- a/string/aarch64/strcpy-sve.S +++ b/string/aarch64/strcpy-sve.S @@ -1,11 +1,11 @@ /* * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Copyright (c) 2018-2022, Arm Limited. 
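The L(less16)/L(less8)/L(less4) paths above finish short copies with a pair of overlapping loads and stores once the string length is known. A plain-C sketch of that pattern for the 9-16 byte case (illustrative only, not part of the patch; memcpy stands in for the unaligned LDR/STR pairs):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Copy n bytes, 9 <= n <= 16, with two possibly overlapping 8-byte
   moves instead of a byte loop; smaller copies use the same trick with
   4-, 2- and 1-byte accesses. */
static void copy_9_to_16 (char *dst, const char *src, size_t n)
{
  uint64_t lo, hi;
  memcpy (&lo, src, 8);
  memcpy (&hi, src + n - 8, 8);
  memcpy (dst, &lo, 8);
  memcpy (dst + n - 8, &hi, 8);
}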
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S index 97ae37e..6e9ed42 100644 --- a/string/aarch64/strcpy.S +++ b/string/aarch64/strcpy.S @@ -1,156 +1,311 @@ /* * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2013-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: * - * ARMv8-a, AArch64, Advanced SIMD. - * MTE compatible. + * ARMv8-a, AArch64, unaligned accesses, min page size 4k. */ -#include "asmdefs.h" +#include "../asmdefs.h" +/* To build as stpcpy, define BUILD_STPCPY before compiling this file. + + To test the page crossing code path more thoroughly, compile with + -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower + entry path. This option is not intended for production use. */ + +/* Arguments and results. */ #define dstin x0 #define srcin x1 -#define result x0 +/* Locals and temporaries. */ #define src x2 #define dst x3 -#define len x4 -#define synd x4 -#define tmp x5 -#define shift x5 -#define data1 x6 -#define dataw1 w6 -#define data2 x7 -#define dataw2 w7 - -#define dataq q0 -#define vdata v0 -#define vhas_nul v1 -#define vend v2 -#define dend d2 -#define dataq2 q1 +#define data1 x4 +#define data1w w4 +#define data2 x5 +#define data2w w5 +#define has_nul1 x6 +#define has_nul2 x7 +#define tmp1 x8 +#define tmp2 x9 +#define tmp3 x10 +#define tmp4 x11 +#define zeroones x12 +#define data1a x13 +#define data2a x14 +#define pos x15 +#define len x16 +#define to_align x17 #ifdef BUILD_STPCPY -# define STRCPY __stpcpy_aarch64 -# define IFSTPCPY(X,...) X,__VA_ARGS__ +#define STRCPY __stpcpy_aarch64 #else -# define STRCPY __strcpy_aarch64 -# define IFSTPCPY(X,...) +#define STRCPY __strcpy_aarch64 #endif -/* - Core algorithm: - For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits - per byte. We take 4 bits of every comparison byte with shift right and narrow - by 4 instruction. Since the bits in the nibble mask reflect the order in - which things occur in the original string, counting leading zeros identifies - exactly which byte matched. */ + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. */ + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + + /* AArch64 systems have a minimum page size of 4k. We can do a quick + page size check for crossing this boundary on entry and if we + do not, then we can short-circuit much of the entry code. We + expect early page-crossing strings to be rare (probability of + 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite + predictable, even with random strings. + + We don't bother checking for larger page sizes, the cost of setting + up the correct page size is just not worth the extra gain from + a small reduction in the cases taking the slow path. Note that + we only care about whether the first fetch, which may be + misaligned, crosses a page boundary - after that we move to aligned + fetches for the remainder of the string. 
*/ + +#ifdef STRCPY_TEST_PAGE_CROSS + /* Make everything that isn't Qword aligned look like a page cross. */ +#define MIN_PAGE_P2 4 +#else +#define MIN_PAGE_P2 12 +#endif + +#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) ENTRY (STRCPY) PTR_ARG (0) PTR_ARG (1) - bic src, srcin, 15 - ld1 {vdata.16b}, [src] - cmeq vhas_nul.16b, vdata.16b, 0 - lsl shift, srcin, 2 - shrn vend.8b, vhas_nul.8h, 4 - fmov synd, dend - lsr synd, synd, shift - cbnz synd, L(tail) - - ldr dataq, [src, 16]! - cmeq vhas_nul.16b, vdata.16b, 0 - shrn vend.8b, vhas_nul.8h, 4 - fmov synd, dend - cbz synd, L(start_loop) - -#ifndef __AARCH64EB__ - rbit synd, synd + /* For moderately short strings, the fastest way to do the copy is to + calculate the length of the string in the same way as strlen, then + essentially do a memcpy of the result. This avoids the need for + multiple byte copies and further means that by the time we + reach the bulk copy loop we know we can always use DWord + accesses. We expect __strcpy_aarch64 to rarely be called repeatedly + with the same source string, so branch prediction is likely to + always be difficult - we mitigate against this by preferring + conditional select operations over branches whenever this is + feasible. */ + and tmp2, srcin, #(MIN_PAGE_SIZE - 1) + mov zeroones, #REP8_01 + and to_align, srcin, #15 + cmp tmp2, #(MIN_PAGE_SIZE - 16) + neg tmp1, to_align + /* The first fetch will straddle a (possible) page boundary iff + srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte + aligned string will never fail the page align check, so will + always take the fast path. */ + b.gt L(page_cross) + +L(page_cross_ok): + ldp data1, data2, [srcin] +#ifdef __AARCH64EB__ + /* Because we expect the end to be found within 16 characters + (profiling shows this is the most common case), it's worth + swapping the bytes now to save having to recalculate the + termination syndrome later. We preserve data1 and data2 + so that we can re-use the values later on. */ + rev tmp2, data1 + sub tmp1, tmp2, zeroones + orr tmp2, tmp2, #REP8_7f + bics has_nul1, tmp1, tmp2 + b.ne L(fp_le8) + rev tmp4, data2 + sub tmp3, tmp4, zeroones + orr tmp4, tmp4, #REP8_7f +#else + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + bics has_nul1, tmp1, tmp2 + b.ne L(fp_le8) + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f #endif - sub tmp, src, srcin - clz len, synd - add len, tmp, len, lsr 2 - tbz len, 4, L(less16) - sub tmp, len, 15 - ldr dataq, [srcin] - ldr dataq2, [srcin, tmp] - str dataq, [dstin] - str dataq2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) - ret + bics has_nul2, tmp3, tmp4 + b.eq L(bulk_entry) -L(tail): - rbit synd, synd - clz len, synd - lsr len, len, 2 -L(less16): - tbz len, 3, L(less8) - sub tmp, len, 7 - ldr data1, [srcin] - ldr data2, [srcin, tmp] + /* The string is short (<=16 bytes). We don't know exactly how + short though, yet. Work out the exact length so that we can + quickly select the optimal copy strategy. */ +L(fp_gt8): + rev has_nul2, has_nul2 + clz pos, has_nul2 + mov tmp2, #56 + add dst, dstin, pos, lsr #3 /* Bits to bytes. 
*/ + sub pos, tmp2, pos +#ifdef __AARCH64EB__ + lsr data2, data2, pos +#else + lsl data2, data2, pos +#endif + str data2, [dst, #1] str data1, [dstin] - str data2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) +#ifdef BUILD_STPCPY + add dstin, dst, #8 +#endif ret - .p2align 4 -L(less8): - subs tmp, len, 3 - b.lo L(less4) - ldr dataw1, [srcin] - ldr dataw2, [srcin, tmp] - str dataw1, [dstin] - str dataw2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) +L(fp_le8): + rev has_nul1, has_nul1 + clz pos, has_nul1 + add dst, dstin, pos, lsr #3 /* Bits to bytes. */ + subs tmp2, pos, #24 /* Pos in bits. */ + b.lt L(fp_lt4) +#ifdef __AARCH64EB__ + mov tmp2, #56 + sub pos, tmp2, pos + lsr data2, data1, pos + lsr data1, data1, #32 +#else + lsr data2, data1, tmp2 +#endif + /* 4->7 bytes to copy. */ + str data2w, [dst, #-3] + str data1w, [dstin] +#ifdef BUILD_STPCPY + mov dstin, dst +#endif ret - -L(less4): - cbz len, L(zerobyte) - ldrh dataw1, [srcin] - strh dataw1, [dstin] -L(zerobyte): - strb wzr, [dstin, len] - IFSTPCPY (add result, dstin, len) +L(fp_lt4): + cbz pos, L(fp_lt2) + /* 2->3 bytes to copy. */ +#ifdef __AARCH64EB__ + lsr data1, data1, #48 +#endif + strh data1w, [dstin] + /* Fall-through, one byte (max) to go. */ +L(fp_lt2): + /* Null-terminated string. Last character must be zero! */ + strb wzr, [dst] +#ifdef BUILD_STPCPY + mov dstin, dst +#endif ret - .p2align 4 -L(start_loop): - sub tmp, srcin, dstin - ldr dataq2, [srcin] - sub dst, src, tmp - str dataq2, [dstin] -L(loop): - str dataq, [dst], 32 - ldr dataq, [src, 16] - cmeq vhas_nul.16b, vdata.16b, 0 - umaxp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - cbnz synd, L(loopend) - str dataq, [dst, -16] - ldr dataq, [src, 32]! - cmeq vhas_nul.16b, vdata.16b, 0 - umaxp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - cbz synd, L(loop) - add dst, dst, 16 -L(loopend): - shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ - fmov synd, dend - sub dst, dst, 31 -#ifndef __AARCH64EB__ - rbit synd, synd + .p2align 6 + /* Aligning here ensures that the entry code and main loop all lies + within one 64-byte cache line. */ +L(bulk_entry): + sub to_align, to_align, #16 + stp data1, data2, [dstin] + sub src, srcin, to_align + sub dst, dstin, to_align + b L(entry_no_page_cross) + + /* The inner loop deals with two Dwords at a time. This has a + slightly higher start-up cost, but we should win quite quickly, + especially on cores with a high number of issue slots per + cycle, as we get much better parallelism out of the operations. */ +L(main_loop): + stp data1, data2, [dst], #16 +L(entry_no_page_cross): + ldp data1, data2, [src], #16 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f + bic has_nul1, tmp1, tmp2 + bics has_nul2, tmp3, tmp4 + ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ + b.eq L(main_loop) + + /* Since we know we are copying at least 16 bytes, the fastest way + to deal with the tail is to determine the location of the + trailing NUL, then (re)copy the 16 bytes leading up to that. */ + cmp has_nul1, #0 +#ifdef __AARCH64EB__ + /* For big-endian, carry propagation (if the final byte in the + string is 0x01) means we cannot use has_nul directly. The + easiest way to get the correct byte is to byte-swap the data + and calculate the syndrome a second time. 
*/ + csel data1, data1, data2, ne + rev data1, data1 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + bic has_nul1, tmp1, tmp2 +#else + csel has_nul1, has_nul1, has_nul2, ne +#endif + rev has_nul1, has_nul1 + clz pos, has_nul1 + add tmp1, pos, #72 + add pos, pos, #8 + csel pos, pos, tmp1, ne + add src, src, pos, lsr #3 + add dst, dst, pos, lsr #3 + ldp data1, data2, [src, #-32] + stp data1, data2, [dst, #-16] +#ifdef BUILD_STPCPY + sub dstin, dst, #1 #endif - clz len, synd - lsr len, len, 2 - add dst, dst, len - ldr dataq, [dst, tmp] - str dataq, [dst] - IFSTPCPY (add result, dst, 15) ret +L(page_cross): + bic src, srcin, #15 + /* Start by loading two words at [srcin & ~15], then forcing the + bytes that precede srcin to 0xff. This means they never look + like termination bytes. */ + ldp data1, data2, [src] + lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ + tst to_align, #7 + csetm tmp2, ne +#ifdef __AARCH64EB__ + lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ +#else + lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ +#endif + orr data1, data1, tmp2 + orr data2a, data2, tmp2 + cmp to_align, #8 + csinv data1, data1, xzr, lt + csel data2, data2, data2a, lt + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f + bic has_nul1, tmp1, tmp2 + bics has_nul2, tmp3, tmp4 + ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ + b.eq L(page_cross_ok) + /* We now need to make data1 and data2 look like they've been + loaded directly from srcin. Do a rotate on the 128-bit value. */ + lsl tmp1, to_align, #3 /* Bytes->bits. */ + neg tmp2, to_align, lsl #3 +#ifdef __AARCH64EB__ + lsl data1a, data1, tmp1 + lsr tmp4, data2, tmp2 + lsl data2, data2, tmp1 + orr tmp4, tmp4, data1a + cmp to_align, #8 + csel data1, tmp4, data2, lt + rev tmp2, data1 + rev tmp4, data2 + sub tmp1, tmp2, zeroones + orr tmp2, tmp2, #REP8_7f + sub tmp3, tmp4, zeroones + orr tmp4, tmp4, #REP8_7f +#else + lsr data1a, data1, tmp1 + lsl tmp4, data2, tmp2 + lsr data2, data2, tmp1 + orr tmp4, tmp4, data1a + cmp to_align, #8 + csel data1, tmp4, data2, lt + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f +#endif + bic has_nul1, tmp1, tmp2 + cbnz has_nul1, L(fp_le8) + bic has_nul2, tmp3, tmp4 + b L(fp_gt8) + END (STRCPY) + diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S index 7723579..7cf41d5 100644 --- a/string/aarch64/strlen-mte.S +++ b/string/aarch64/strlen-mte.S @@ -1,8 +1,8 @@ /* * strlen - calculate the length of a string. * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "asmdefs.h" +#include "../asmdefs.h" #define srcin x0 #define result x0 @@ -19,26 +19,35 @@ #define src x1 #define synd x2 #define tmp x3 +#define wtmp w3 #define shift x4 #define data q0 #define vdata v0 #define vhas_nul v1 -#define vend v2 -#define dend d2 +#define vrepmask v2 +#define vend v3 +#define dend d3 /* Core algorithm: - Process the string in 16-byte aligned chunks. Compute a 64-bit mask with - four bits per byte using the shrn instruction. A count trailing zeros then - identifies the first zero byte. */ + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. 
Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ ENTRY (__strlen_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 + mov wtmp, 0xf00f ld1 {vdata.16b}, [src] + dup vrepmask.8h, wtmp cmeq vhas_nul.16b, vdata.16b, 0 lsl shift, srcin, 2 - shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(loop) @@ -50,25 +59,19 @@ ENTRY (__strlen_aarch64_mte) .p2align 5 L(loop): - ldr data, [src, 16] - cmeq vhas_nul.16b, vdata.16b, 0 - umaxp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - cbnz synd, L(loop_end) - ldr data, [src, 32]! + ldr data, [src, 16]! cmeq vhas_nul.16b, vdata.16b, 0 umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop) - sub src, src, 16 -L(loop_end): - shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ + + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ sub result, src, srcin fmov synd, dend #ifndef __AARCH64EB__ rbit synd, synd #endif - add result, result, 16 clz tmp, synd add result, result, tmp, lsr 2 ret diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S index 12ebbdb..2392493 100644 --- a/string/aarch64/strlen-sve.S +++ b/string/aarch64/strlen-sve.S @@ -1,11 +1,11 @@ /* * __strlen_aarch64_sve - compute the length of a string * - * Copyright (c) 2018-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S index 6f6f08f..a1b164a 100644 --- a/string/aarch64/strlen.S +++ b/string/aarch64/strlen.S @@ -1,8 +1,8 @@ /* * strlen - calculate the length of a string. * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * Not MTE compatible. */ -#include "asmdefs.h" +#include "../asmdefs.h" #define srcin x0 #define len x0 @@ -36,7 +36,6 @@ #define tmp x2 #define tmpw w2 #define synd x3 -#define syndw w3 #define shift x4 /* For the first 32 bytes, NUL detection works on the principle that @@ -111,6 +110,7 @@ ENTRY (__strlen_aarch64) add len, len, tmp1, lsr 3 ret + .p2align 3 /* Look for a NUL byte at offset 16..31 in the string. */ L(bytes16_31): ldp data1, data2, [srcin, 16] @@ -138,7 +138,6 @@ L(bytes16_31): add len, len, tmp1, lsr 3 ret - nop L(loop_entry): bic src, srcin, 31 @@ -154,12 +153,18 @@ L(loop): /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */ cmeq maskv.16b, datav1.16b, 0 sub len, src, srcin - cbnz syndw, 1f + tst synd, 0xffffffff + b.ne 1f cmeq maskv.16b, datav2.16b, 0 add len, len, 16 1: /* Generate a bitmask and compute correct byte offset. 
*/ - shrn maskv.8b, maskv.8h, 4 +#ifdef __AARCH64EB__ + bic maskv.8h, 0xf0 +#else + bic maskv.8h, 0x0f, lsl 8 +#endif + umaxp maskv.16b, maskv.16b, maskv.16b fmov synd, maskd #ifndef __AARCH64EB__ rbit synd, synd @@ -168,6 +173,8 @@ L(loop): add len, len, tmp, lsr 2 ret + .p2align 4 + L(page_cross): bic src, srcin, 31 mov tmpw, 0x0c03 diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S new file mode 100644 index 0000000..c9d6fc8 --- /dev/null +++ b/string/aarch64/strncmp-mte.S @@ -0,0 +1,307 @@ +/* + * strncmp - compare two strings + * + * Copyright (c) 2013-2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64 + */ + +#include "../asmdefs.h" + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +/* Parameters and result. */ +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result x0 + +/* Internal variables. */ +#define data1 x3 +#define data1w w3 +#define data2 x4 +#define data2w w4 +#define has_nul x5 +#define diff x6 +#define syndrome x7 +#define tmp1 x8 +#define tmp2 x9 +#define tmp3 x10 +#define zeroones x11 +#define pos x12 +#define mask x13 +#define endloop x14 +#define count mask +#define offset pos +#define neg_offset x15 + +/* Define endian dependent shift operations. + On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. + LS_BK means shifting towards later bytes. + */ +#ifdef __AARCH64EB__ +#define LS_FW lsl +#define LS_BK lsr +#else +#define LS_FW lsr +#define LS_BK lsl +#endif + +ENTRY (__strncmp_aarch64_mte) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + cbz limit, L(ret0) + eor tmp1, src1, src2 + mov zeroones, #REP8_01 + tst tmp1, #7 + and count, src1, #7 + b.ne L(misaligned8) + cbnz count, L(mutual_align) + + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. */ + .p2align 4 +L(loop_aligned): + ldr data1, [src1], #8 + ldr data2, [src2], #8 +L(start_realigned): + subs limit, limit, #8 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, hi /* Last Dword or differences. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp endloop, #0, #0, eq + b.eq L(loop_aligned) + /* End of main loop */ + +L(full_check): +#ifndef __AARCH64EB__ + orr syndrome, diff, has_nul + add limit, limit, 8 /* Rewind limit to before last subs. */ +L(syndrome_check): + /* Limit was reached. Check if the NUL byte or the difference + is before the limit. */ + rev syndrome, syndrome + rev data1, data1 + clz pos, syndrome + rev data2, data2 + lsl data1, data1, pos + cmp limit, pos, lsr #3 + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + csel result, result, xzr, hi + ret +#else + /* Not reached the limit, must have found the end or a diff. */ + tbz limit, #63, L(not_limit) + add tmp1, limit, 8 + cbz limit, L(not_limit) + + lsl limit, tmp1, #3 /* Bits -> bytes. */ + mov mask, #~0 + lsr mask, mask, limit + bic data1, data1, mask + bic data2, data2, mask + + /* Make sure that the NUL byte is marked in the syndrome. 
*/ + orr has_nul, has_nul, mask + +L(not_limit): + /* For big-endian we cannot use the trick with the syndrome value + as carry-propagation can corrupt the upper bits if the trailing + bytes in the string contain 0x01. */ + /* However, if there is no NUL byte in the dword, we can generate + the result directly. We can't just subtract the bytes as the + MSB might be significant. */ + cbnz has_nul, 1f + cmp data1, data2 + cset result, ne + cneg result, result, lo + ret +1: + /* Re-compute the NUL-byte detection, using a byte-reversed value. */ + rev tmp3, data1 + sub tmp1, tmp3, zeroones + orr tmp2, tmp3, #REP8_7f + bic has_nul, tmp1, tmp2 + rev has_nul, has_nul + orr syndrome, diff, has_nul + clz pos, syndrome + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ +L(end_quick): + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#endif + +L(mutual_align): + /* Sources are mutually aligned, but are not currently at an + alignment boundary. Round down the addresses and then mask off + the bytes that precede the start point. + We also need to adjust the limit calculations, but without + overflowing if the limit is near ULONG_MAX. */ + bic src1, src1, #7 + bic src2, src2, #7 + ldr data1, [src1], #8 + neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ + ldr data2, [src2], #8 + mov tmp2, #~0 + LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ + /* Adjust the limit and ensure it doesn't overflow. */ + adds limit, limit, count + csinv limit, limit, xzr, lo + orr data1, data1, tmp2 + orr data2, data2, tmp2 + b L(start_realigned) + + .p2align 4 + /* Don't bother with dwords for up to 16 bytes. */ +L(misaligned8): + cmp limit, #16 + b.hs L(try_misaligned_words) + +L(byte_loop): + /* Perhaps we can do better than this. */ + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + subs limit, limit, #1 + ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.eq L(byte_loop) +L(done): + sub result, data1, data2 + ret + /* Align the SRC1 to a dword by doing a bytewise compare and then do + the dword loop. */ +L(try_misaligned_words): + cbz count, L(src1_aligned) + + neg count, count + and count, count, #7 + sub limit, limit, count + +L(page_end_loop): + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + cmp data1w, #1 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.ne L(done) + subs count, count, #1 + b.hi L(page_end_loop) + + /* The following diagram explains the comparison of misaligned strings. + The bytes are shown in natural order. For little-endian, it is + reversed in the registers. The "x" bytes are before the string. + The "|" separates data that is loaded at one time. + src1 | a a a a a a a a | b b b c c c c c | . . . + src2 | x x x x x a a a a a a a a b b b | c c c c c . . . + + After shifting in each step, the data looks like this: + STEP_A STEP_B STEP_C + data1 a a a a a a a a b b b c c c c c b b b c c c c c + data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c + + The bytes with "0" are eliminated from the syndrome via mask. + + Align SRC2 down to 16 bytes. This way we can read 16 bytes at a + time from SRC2. The comparison happens in 3 steps. 
After each step + the loop can exit, or read from SRC1 or SRC2. */ +L(src1_aligned): + /* Calculate offset from 8 byte alignment to string start in bits. No + need to mask offset since shifts are ignoring upper bits. */ + lsl offset, src2, #3 + bic src2, src2, #0xf + mov mask, -1 + neg neg_offset, offset + ldr data1, [src1], #8 + ldp tmp1, tmp2, [src2], #16 + LS_BK mask, mask, neg_offset + and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ + /* Skip the first compare if data in tmp1 is irrelevant. */ + tbnz offset, 6, L(misaligned_mid_loop) + +L(loop_misaligned): + /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ + LS_FW data2, tmp1, offset + LS_BK tmp1, tmp2, neg_offset + subs limit, limit, #8 + orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ + sub has_nul, data1, zeroones + eor diff, data1, data2 /* Non-zero if differences found. */ + orr tmp3, data1, #REP8_7f + csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ + orr tmp3, endloop, has_nul + cbnz tmp3, L(full_check) + + ldr data1, [src1], #8 +L(misaligned_mid_loop): + /* STEP_B: Compare first part of data1 to second part of tmp2. */ + LS_FW data2, tmp2, offset +#ifdef __AARCH64EB__ + /* For big-endian we do a byte reverse to avoid carry-propagation + problem described above. This way we can reuse the has_nul in the + next step and also use syndrome value trick at the end. */ + rev tmp3, data1 + #define data1_fixed tmp3 +#else + #define data1_fixed data1 +#endif + sub has_nul, data1_fixed, zeroones + orr tmp3, data1_fixed, #REP8_7f + eor diff, data2, data1 /* Non-zero if differences found. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + cmp limit, neg_offset, lsr #3 + orr syndrome, diff, has_nul + bic syndrome, syndrome, mask /* Ignore later bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + /* STEP_C: Compare second part of data1 to first part of tmp1. */ + ldp tmp1, tmp2, [src2], #16 + cmp limit, #8 + LS_BK data2, tmp1, neg_offset + eor diff, data2, data1 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + and syndrome, syndrome, mask /* Ignore earlier bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + ldr data1, [src1], #8 + sub limit, limit, #8 + b L(loop_misaligned) + +#ifdef __AARCH64EB__ +L(syndrome_check): + clz pos, syndrome + cmp pos, limit, lsl #3 + b.lo L(end_quick) +#endif + +L(ret0): + mov result, #0 + ret +END(__strncmp_aarch64_mte) + diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S index 6a9e9f7..234190e 100644 --- a/string/aarch64/strncmp-sve.S +++ b/string/aarch64/strncmp-sve.S @@ -1,11 +1,11 @@ /* * strncmp - compare two strings with limit * - * Copyright (c) 2018-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S index 128a10c..738b653 100644 --- a/string/aarch64/strncmp.S +++ b/string/aarch64/strncmp.S @@ -1,20 +1,20 @@ /* * strncmp - compare two strings * - * Copyright (c) 2013-2022, Arm Limited. 
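A scalar sketch of the STEP_A recombination from the diagram above: two aligned dwords loaded from SRC2 are shifted towards and away from the early bytes and merged to reconstruct the unaligned dword that lines up with the SRC1 data (illustrative C only, not part of the patch; little-endian shown, with offset_bits = (src2 & 7) * 8, which is non-zero on this path):

#include <stdint.h>

/* LS_FW is lsr and LS_BK is lsl on little-endian; lo_word and hi_word
   are two consecutive aligned dwords read from SRC2. */
static uint64_t merge_unaligned_dword (uint64_t lo_word, uint64_t hi_word,
                                       unsigned offset_bits)
{
  return (lo_word >> offset_bits) | (hi_word << (64 - offset_bits));
}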
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2013-2021, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: * - * ARMv8-a, AArch64. - * MTE compatible. + * ARMv8-a, AArch64 */ -#include "asmdefs.h" +#include "../asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 /* Parameters and result. */ #define src1 x0 @@ -35,24 +35,10 @@ #define tmp3 x10 #define zeroones x11 #define pos x12 -#define mask x13 -#define endloop x14 +#define limit_wd x13 +#define mask x14 +#define endloop x15 #define count mask -#define offset pos -#define neg_offset x15 - -/* Define endian dependent shift operations. - On big-endian early bytes are at MSB and on little-endian LSB. - LS_FW means shifting towards early bytes. - LS_BK means shifting towards later bytes. - */ -#ifdef __AARCH64EB__ -#define LS_FW lsl -#define LS_BK lsr -#else -#define LS_FW lsr -#define LS_BK lsl -#endif ENTRY (__strncmp_aarch64) PTR_ARG (0) @@ -65,6 +51,9 @@ ENTRY (__strncmp_aarch64) and count, src1, #7 b.ne L(misaligned8) cbnz count, L(mutual_align) + /* Calculate the number of full and partial words -1. */ + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ + lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and @@ -74,52 +63,56 @@ L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 L(start_realigned): - subs limit, limit, #8 + subs limit_wd, limit_wd, #1 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, hi /* Last Dword or differences. */ + csinv endloop, diff, xzr, pl /* Last Dword or differences. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq b.eq L(loop_aligned) /* End of main loop */ -L(full_check): -#ifndef __AARCH64EB__ + /* Not reached the limit, must have found the end or a diff. */ + tbz limit_wd, #63, L(not_limit) + + /* Limit % 8 == 0 => all bytes significant. */ + ands limit, limit, #7 + b.eq L(not_limit) + + lsl limit, limit, #3 /* Bits -> bytes. */ + mov mask, #~0 +#ifdef __AARCH64EB__ + lsr mask, mask, limit +#else + lsl mask, mask, limit +#endif + bic data1, data1, mask + bic data2, data2, mask + + /* Make sure that the NUL byte is marked in the syndrome. */ + orr has_nul, has_nul, mask + +L(not_limit): orr syndrome, diff, has_nul - add limit, limit, 8 /* Rewind limit to before last subs. */ -L(syndrome_check): - /* Limit was reached. Check if the NUL byte or the difference - is before the limit. */ + +#ifndef __AARCH64EB__ rev syndrome, syndrome rev data1, data1 + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ clz pos, syndrome rev data2, data2 lsl data1, data1, pos - cmp limit, pos, lsr #3 lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. */ lsr data1, data1, #56 sub result, data1, data2, lsr #56 - csel result, result, xzr, hi ret #else - /* Not reached the limit, must have found the end or a diff. */ - tbz limit, #63, L(not_limit) - add tmp1, limit, 8 - cbz limit, L(not_limit) - - lsl limit, tmp1, #3 /* Bits -> bytes. 
*/ - mov mask, #~0 - lsr mask, mask, limit - bic data1, data1, mask - bic data2, data2, mask - - /* Make sure that the NUL byte is marked in the syndrome. */ - orr has_nul, has_nul, mask - -L(not_limit): /* For big-endian we cannot use the trick with the syndrome value as carry-propagation can corrupt the upper bits if the trailing bytes in the string contain 0x01. */ @@ -140,11 +133,10 @@ L(not_limit): rev has_nul, has_nul orr syndrome, diff, has_nul clz pos, syndrome - /* The most-significant-non-zero bit of the syndrome marks either the - first bit that is different, or the top bit of the first zero byte. + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. Shifting left now will bring the critical information into the top bits. */ -L(end_quick): lsl data1, data1, pos lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then @@ -166,12 +158,22 @@ L(mutual_align): neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ ldr data2, [src2], #8 mov tmp2, #~0 - LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ - /* Adjust the limit and ensure it doesn't overflow. */ - adds limit, limit, count - csinv limit, limit, xzr, lo + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ +#ifdef __AARCH64EB__ + /* Big-endian. Early bytes are at MSB. */ + lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */ +#else + /* Little-endian. Early bytes are at LSB. */ + lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */ +#endif + and tmp3, limit_wd, #7 + lsr limit_wd, limit_wd, #3 + /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ + add limit, limit, count + add tmp3, tmp3, count orr data1, data1, tmp2 orr data2, data2, tmp2 + add limit_wd, limit_wd, tmp3, lsr #3 b L(start_realigned) .p2align 4 @@ -194,11 +196,13 @@ L(done): /* Align the SRC1 to a dword by doing a bytewise compare and then do the dword loop. */ L(try_misaligned_words): - cbz count, L(src1_aligned) + lsr limit_wd, limit, #3 + cbz count, L(do_misaligned) neg count, count and count, count, #7 sub limit, limit, count + lsr limit_wd, limit, #3 L(page_end_loop): ldrb data1w, [src1], #1 @@ -209,100 +213,48 @@ L(page_end_loop): subs count, count, #1 b.hi L(page_end_loop) - /* The following diagram explains the comparison of misaligned strings. - The bytes are shown in natural order. For little-endian, it is - reversed in the registers. The "x" bytes are before the string. - The "|" separates data that is loaded at one time. - src1 | a a a a a a a a | b b b c c c c c | . . . - src2 | x x x x x a a a a a a a a b b b | c c c c c . . . - - After shifting in each step, the data looks like this: - STEP_A STEP_B STEP_C - data1 a a a a a a a a b b b c c c c c b b b c c c c c - data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c - - The bytes with "0" are eliminated from the syndrome via mask. - - Align SRC2 down to 16 bytes. This way we can read 16 bytes at a - time from SRC2. The comparison happens in 3 steps. After each step - the loop can exit, or read from SRC1 or SRC2. */ -L(src1_aligned): - /* Calculate offset from 8 byte alignment to string start in bits. No - need to mask offset since shifts are ignoring upper bits. */ - lsl offset, src2, #3 - bic src2, src2, #0xf - mov mask, -1 - neg neg_offset, offset - ldr data1, [src1], #8 - ldp tmp1, tmp2, [src2], #16 - LS_BK mask, mask, neg_offset - and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ - /* Skip the first compare if data in tmp1 is irrelevant. 
*/ - tbnz offset, 6, L(misaligned_mid_loop) - +L(do_misaligned): + /* Prepare ourselves for the next page crossing. Unlike the aligned + loop, we fetch 1 less dword because we risk crossing bounds on + SRC2. */ + mov count, #8 + subs limit_wd, limit_wd, #1 + b.lo L(done_loop) L(loop_misaligned): - /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ - LS_FW data2, tmp1, offset - LS_BK tmp1, tmp2, neg_offset - subs limit, limit, #8 - orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ - sub has_nul, data1, zeroones - eor diff, data1, data2 /* Non-zero if differences found. */ - orr tmp3, data1, #REP8_7f - csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ - bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ - orr tmp3, endloop, has_nul - cbnz tmp3, L(full_check) - - ldr data1, [src1], #8 -L(misaligned_mid_loop): - /* STEP_B: Compare first part of data1 to second part of tmp2. */ - LS_FW data2, tmp2, offset -#ifdef __AARCH64EB__ - /* For big-endian we do a byte reverse to avoid carry-propagation - problem described above. This way we can reuse the has_nul in the - next step and also use syndrome value trick at the end. */ - rev tmp3, data1 - #define data1_fixed tmp3 -#else - #define data1_fixed data1 -#endif - sub has_nul, data1_fixed, zeroones - orr tmp3, data1_fixed, #REP8_7f - eor diff, data2, data1 /* Non-zero if differences found. */ - bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ -#ifdef __AARCH64EB__ - rev has_nul, has_nul -#endif - cmp limit, neg_offset, lsr #3 - orr syndrome, diff, has_nul - bic syndrome, syndrome, mask /* Ignore later bytes. */ - csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ - cbnz tmp3, L(syndrome_check) - - /* STEP_C: Compare second part of data1 to first part of tmp1. */ - ldp tmp1, tmp2, [src2], #16 - cmp limit, #8 - LS_BK data2, tmp1, neg_offset - eor diff, data2, data1 /* Non-zero if differences found. */ - orr syndrome, diff, has_nul - and syndrome, syndrome, mask /* Ignore earlier bytes. */ - csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ - cbnz tmp3, L(syndrome_check) + and tmp2, src2, #0xff8 + eor tmp2, tmp2, #0xff8 + cbz tmp2, L(page_end_loop) ldr data1, [src1], #8 - sub limit, limit, #8 - b L(loop_misaligned) + ldr data2, [src2], #8 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp diff, #0, #0, eq + b.ne L(not_limit) + subs limit_wd, limit_wd, #1 + b.pl L(loop_misaligned) -#ifdef __AARCH64EB__ -L(syndrome_check): - clz pos, syndrome - cmp pos, limit, lsl #3 - b.lo L(end_quick) -#endif +L(done_loop): + /* We found a difference or a NULL before the limit was reached. */ + and limit, limit, #7 + cbz limit, L(not_limit) + /* Read the last word. */ + sub src1, src1, 8 + sub src2, src2, 8 + ldr data1, [src1, limit] + ldr data2, [src2, limit] + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp diff, #0, #0, eq + b.ne L(not_limit) L(ret0): mov result, #0 ret -END(__strncmp_aarch64) + +END ( __strncmp_aarch64) diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S index 6c43dc4..5b9ebf7 100644 --- a/string/aarch64/strnlen-sve.S +++ b/string/aarch64/strnlen-sve.S @@ -1,11 +1,11 @@ /* * strnlen - calculate the length of a string with limit. 
* - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2021, Arm Limited. + * SPDX-License-Identifier: MIT */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S index f2090a7..48d2495 100644 --- a/string/aarch64/strnlen.S +++ b/string/aarch64/strnlen.S @@ -1,8 +1,8 @@ /* * strnlen - calculate the length of a string with limit. * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "asmdefs.h" +#include "../asmdefs.h" #define srcin x0 #define cntin x1 @@ -20,30 +20,39 @@ #define src x2 #define synd x3 #define shift x4 +#define wtmp w4 #define tmp x4 #define cntrem x5 #define qdata q0 #define vdata v0 #define vhas_chr v1 -#define vend v2 -#define dend d2 +#define vrepmask v2 +#define vend v3 +#define dend d3 /* Core algorithm: - Process the string in 16-byte aligned chunks. Compute a 64-bit mask with - four bits per byte using the shrn instruction. A count trailing zeros then - identifies the first zero byte. */ + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ ENTRY (__strnlen_aarch64) PTR_ARG (0) SIZE_ARG (1) bic src, srcin, 15 + mov wtmp, 0xf00f cbz cntin, L(nomatch) - ld1 {vdata.16b}, [src] + ld1 {vdata.16b}, [src], 16 + dup vrepmask.8h, wtmp cmeq vhas_chr.16b, vdata.16b, 0 lsl shift, srcin, 2 - shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(start_loop) @@ -55,40 +64,37 @@ L(finish): csel result, cntin, result, ls ret -L(nomatch): - mov result, cntin - ret - L(start_loop): sub tmp, src, srcin - add tmp, tmp, 17 subs cntrem, cntin, tmp - b.lo L(nomatch) + b.ls L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - tbz cntrem, 4, L(loop32_2) - sub src, src, 16 + add tmp, cntrem, 15 + tbnz tmp, 4, L(loop32_2) + .p2align 5 L(loop32): - ldr qdata, [src, 32]! 
+ ldr qdata, [src], 16 cmeq vhas_chr.16b, vdata.16b, 0 umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src, 16] + ldr qdata, [src], 16 subs cntrem, cntrem, 32 cmeq vhas_chr.16b, vdata.16b, 0 - b.lo L(end_2) + b.ls L(end) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) -L(end_2): - add src, src, 16 + L(end): - shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + sub src, src, 16 + mov synd, vend.d[0] sub result, src, srcin - fmov synd, dend #ifndef __AARCH64EB__ rbit synd, synd #endif @@ -98,5 +104,9 @@ L(end): csel result, cntin, result, ls ret +L(nomatch): + mov result, cntin + ret + END (__strnlen_aarch64) diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S index bb61ab9..1e4fb1a 100644 --- a/string/aarch64/strrchr-mte.S +++ b/string/aarch64/strrchr-mte.S @@ -1,8 +1,8 @@ /* * strrchr - find last position of a character in a string. * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "asmdefs.h" +#include "../asmdefs.h" #define srcin x0 #define chrin w1 @@ -19,6 +19,7 @@ #define src x2 #define tmp x3 +#define wtmp w3 #define synd x3 #define shift x4 #define src_match x4 @@ -30,6 +31,7 @@ #define vhas_nul v2 #define vhas_chr v3 #define vrepmask v4 +#define vrepmask2 v5 #define vend v5 #define dend d5 @@ -45,67 +47,55 @@ ENTRY (__strrchr_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin - movi vrepmask.16b, 0x33 - ld1 {vdata.16b}, [src] + mov wtmp, 0x3003 + dup vrepmask.8h, wtmp + tst srcin, 15 + beq L(loop1) + + ld1 {vdata.16b}, [src], 16 cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + mov wtmp, 0xf00f + dup vrepmask2.8h, wtmp bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - shrn vend.8b, vhas_nul.8h, 4 + and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b lsl shift, srcin, 2 fmov synd, dend lsr synd, synd, shift lsl synd, synd, shift ands nul_match, synd, 0xcccccccccccccccc bne L(tail) - cbnz synd, L(loop2_start) + cbnz synd, L(loop2) - .p2align 4 + .p2align 5 L(loop1): - ldr q1, [src, 16] - cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b - umaxp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - cbnz synd, L(loop1_end) - ldr q1, [src, 32]! 
+ ld1 {vdata.16b}, [src], 16 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop1) - sub src, src, 16 -L(loop1_end): - add src, src, 16 + cmeq vhas_nul.16b, vdata.16b, 0 -#ifdef __AARCH64EB__ - bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b - shrn vend.8b, vhas_nul.8h, 4 - fmov synd, dend - rbit synd, synd -#else bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - shrn vend.8b, vhas_nul.8h, 4 + bic vhas_nul.8h, 0x0f, lsl 8 + addp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend -#endif ands nul_match, synd, 0xcccccccccccccccc - beq L(loop2_start) + beq L(loop2) + L(tail): sub nul_match, nul_match, 1 and chr_match, synd, 0x3333333333333333 ands chr_match, chr_match, nul_match - add result, src, 15 + sub result, src, 1 clz tmp, chr_match sub result, result, tmp, lsr 2 csel result, result, xzr, ne ret .p2align 4 - nop - nop -L(loop2_start): - add src, src, 16 - bic vrepmask.8h, 0xf0 - L(loop2): cmp synd, 0 csel src_match, src, src_match, ne diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S index 825a738..d36d69a 100644 --- a/string/aarch64/strrchr-sve.S +++ b/string/aarch64/strrchr-sve.S @@ -1,11 +1,11 @@ /* * strrchr - find the last of a character in a string * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2021, Arm Limited. + * SPDX-License-Identifier: MIT */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S index bf9cb29..56185ff 100644 --- a/string/aarch64/strrchr.S +++ b/string/aarch64/strrchr.S @@ -1,8 +1,8 @@ /* * strrchr - find last position of a character in a string. * - * Copyright (c) 2014-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2014-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "asmdefs.h" +#include "../asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c index e070be5..d5d4ea7 100644 --- a/string/bench/memcpy.c +++ b/string/bench/memcpy.c @@ -1,8 +1,8 @@ /* * memcpy benchmark. * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT */ #define _GNU_SOURCE @@ -13,15 +13,14 @@ #include "stringlib.h" #include "benchlib.h" -#define ITERS 5000 +#define ITERS 5000 #define ITERS2 20000000 -#define ITERS3 200000 -#define NUM_TESTS 16384 -#define MIN_SIZE 32768 -#define MAX_SIZE (1024 * 1024) +#define ITERS3 500000 +#define MAX_COPIES 8192 +#define SIZE (256*1024) -static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64))); -static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64))); +static uint8_t a[SIZE + 4096] __attribute__((__aligned__(64))); +static uint8_t b[SIZE + 4096] __attribute__((__aligned__(64))); #define F(x) {#x, x}, @@ -31,21 +30,15 @@ static const struct fun void *(*fun)(void *, const void *, size_t); } funtab[] = { + F(memcpy) #if __aarch64__ F(__memcpy_aarch64) # if __ARM_NEON F(__memcpy_aarch64_simd) # endif -# if __ARM_FEATURE_SVE - F(__memcpy_aarch64_sve) -# endif -# if WANT_MOPS - F(__memcpy_aarch64_mops) -# endif #elif __arm__ F(__memcpy_arm) #endif - F(memcpy) #undef F {0, 0} }; @@ -116,7 +109,7 @@ typedef struct uint64_t len : 16; } copy_t; -static copy_t test_arr[NUM_TESTS]; +static copy_t copy[MAX_COPIES]; typedef char *(*proto_t) (char *, const char *, size_t); @@ -147,14 +140,14 @@ init_copies (size_t max_size) size_t total = 0; /* Create a random set of copies with the given size and alignment distributions. */ - for (int i = 0; i < NUM_TESTS; i++) + for (int i = 0; i < MAX_COPIES; i++) { - test_arr[i].dst = (rand32 (0) & (max_size - 1)); - test_arr[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK]; - test_arr[i].src = (rand32 (0) & (max_size - 1)); - test_arr[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK]; - test_arr[i].len = size_arr[rand32 (0) & SIZE_MASK]; - total += test_arr[i].len; + copy[i].dst = (rand32 (0) & (max_size - 1)); + copy[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK]; + copy[i].src = (rand32 (0) & (max_size - 1)); + copy[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK]; + copy[i].len = size_arr[rand32 (0) & SIZE_MASK]; + total += copy[i].len; } return total; @@ -167,27 +160,25 @@ int main (void) memset (a, 1, sizeof (a)); memset (b, 2, sizeof (b)); - printf("Random memcpy (bytes/ns):\n"); + printf("Random memcpy:\n"); for (int f = 0; funtab[f].name != 0; f++) { size_t total = 0; uint64_t tsum = 0; - printf ("%22s ", funtab[f].name); + printf ("%22s (B/ns) ", funtab[f].name); rand32 (0x12345678); - for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) + for (int size = 16384; size <= SIZE; size *= 2) { size_t copy_size = init_copies (size) * ITERS; - for (int c = 0; c < NUM_TESTS; c++) - funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src, - test_arr[c].len); + for (int c = 0; c < MAX_COPIES; c++) + funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len); uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS; i++) - for (int c = 0; c < NUM_TESTS; c++) - funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src, - test_arr[c].len); + for (int c = 0; c < MAX_COPIES; c++) + funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len); t = clock_get_ns () - t; total += copy_size; tsum += t; @@ -196,147 +187,74 @@ int main (void) printf( "avg %.2f\n", (double)total / tsum); } - size_t total = 0; - uint64_t tsum = 0; - printf ("%22s ", "memcpy_call"); - rand32 (0x12345678); - - for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) - { - size_t copy_size = init_copies (size) * ITERS; - - for (int c = 0; c < NUM_TESTS; c++) - memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); - - uint64_t t 
= clock_get_ns (); - for (int i = 0; i < ITERS; i++) - for (int c = 0; c < NUM_TESTS; c++) - memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); - t = clock_get_ns () - t; - total += copy_size; - tsum += t; - printf ("%dK: %.2f ", size / 1024, (double)copy_size / t); - } - printf( "avg %.2f\n", (double)total / tsum); - - - printf ("\nAligned medium memcpy (bytes/ns):\n"); + printf ("\nMedium memcpy:\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s ", funtab[f].name); + printf ("%22s (B/ns) ", funtab[f].name); - for (int size = 8; size <= 512; size *= 2) + for (int size = 16; size <= 512; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS2; i++) funtab[f].fun (b, a, size); t = clock_get_ns () - t; - printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t); } printf ("\n"); } - printf ("%22s ", "memcpy_call"); - for (int size = 8; size <= 512; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS2; i++) - memcpy (b, a, size); - t = clock_get_ns () - t; - printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); - } - printf ("\n"); - - - printf ("\nUnaligned medium memcpy (bytes/ns):\n"); + printf ("\nLarge memcpy:\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s ", funtab[f].name); + printf ("%22s (B/ns) ", funtab[f].name); - for (int size = 8; size <= 512; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS2; i++) - funtab[f].fun (b + 3, a + 1, size); - t = clock_get_ns () - t; - printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); - } - printf ("\n"); - } - - printf ("%22s ", "memcpy_call"); - for (int size = 8; size <= 512; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS2; i++) - memcpy (b + 3, a + 1, size); - t = clock_get_ns () - t; - printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); - } - printf ("\n"); - - - printf ("\nLarge memcpy (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) - { - printf ("%22s ", funtab[f].name); - - for (int size = 1024; size <= 65536; size *= 2) + for (int size = 1024; size <= 32768; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (b, a, size); t = clock_get_ns () - t; - printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); } printf ("\n"); } - printf ("%22s ", "memcpy_call"); - for (int size = 1024; size <= 65536; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS3; i++) - memcpy (b, a, size); - t = clock_get_ns () - t; - printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); - } - printf ("\n"); - - - printf ("\nUnaligned forwards memmove (bytes/ns):\n"); + printf ("\nUnaligned forwards memmove:\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s ", funtab[f].name); + printf ("%22s (B/ns) ", funtab[f].name); - for (int size = 1024; size <= 65536; size *= 2) + for (int size = 1024; size <= 32768; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (a, a + 256 + (i & 31), size); t = clock_get_ns () - t; - printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 
'B' : 'K', (double)size * ITERS3 / t); } printf ("\n"); } - printf ("\nUnaligned backwards memmove (bytes/ns):\n"); + printf ("\nUnaligned backwards memmove:\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s ", funtab[f].name); + printf ("%22s (B/ns) ", funtab[f].name); - for (int size = 1024; size <= 65536; size *= 2) + for (int size = 1024; size <= 32768; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (a + 256 + (i & 31), a, size); t = clock_get_ns () - t; - printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); } printf ("\n"); } - printf ("\n"); return 0; } diff --git a/string/bench/memset.c b/string/bench/memset.c deleted file mode 100644 index 990e23b..0000000 --- a/string/bench/memset.c +++ /dev/null @@ -1,243 +0,0 @@ -/* - * memset benchmark. - * - * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include "stringlib.h" -#include "benchlib.h" - -#define ITERS 5000 -#define ITERS2 20000000 -#define ITERS3 1000000 -#define NUM_TESTS 16384 -#define MIN_SIZE 32768 -#define MAX_SIZE (1024 * 1024) - -static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(64))); - -#define F(x) {#x, x}, - -static const struct fun -{ - const char *name; - void *(*fun)(void *, int, size_t); -} funtab[] = -{ -#if __aarch64__ - F(__memset_aarch64) -#elif __arm__ - F(__memset_arm) -#endif - F(memset) -#undef F - {0, 0} -}; - -typedef struct { uint32_t offset : 20, len : 12; } memset_test_t; -static memset_test_t test_arr[NUM_TESTS]; - -typedef struct { uint16_t size; uint16_t freq; } freq_data_t; -typedef struct { uint8_t align; uint16_t freq; } align_data_t; - -#define SIZE_NUM 65536 -#define SIZE_MASK (SIZE_NUM-1) -static uint8_t len_arr[SIZE_NUM]; - -/* Frequency data for memset sizes up to 4096 based on SPEC2017. 
*/ -static freq_data_t memset_len_freq[] = -{ -{40,28817}, {32,15336}, { 16,3823}, {296,3545}, { 24,3454}, { 8,1412}, -{292,1202}, { 48, 927}, { 12, 613}, { 11, 539}, {284, 493}, {108, 414}, -{ 88, 380}, { 20, 295}, {312, 271}, { 72, 233}, { 2, 200}, { 4, 192}, -{ 15, 180}, { 14, 174}, { 13, 160}, { 56, 151}, { 36, 144}, { 64, 140}, -{4095,133}, { 10, 130}, { 9, 124}, { 3, 124}, { 28, 120}, { 0, 118}, -{288, 110}, {1152, 96}, {104, 90}, { 1, 86}, {832, 76}, {248, 74}, -{1024, 69}, {120, 64}, {512, 63}, {384, 60}, { 6, 59}, { 80, 54}, -{ 17, 50}, { 7, 49}, {520, 47}, {2048, 39}, {256, 37}, {864, 33}, -{1440, 28}, { 22, 27}, {2056, 24}, {260, 23}, { 68, 23}, { 5, 22}, -{ 18, 21}, {200, 18}, {2120, 18}, { 60, 17}, { 52, 16}, {336, 15}, -{ 44, 13}, {192, 13}, {160, 12}, {2064, 12}, {128, 12}, { 76, 11}, -{164, 11}, {152, 10}, {136, 9}, {488, 7}, { 96, 6}, {560, 6}, -{1016, 6}, {112, 5}, {232, 5}, {168, 5}, {952, 5}, {184, 5}, -{144, 4}, {252, 4}, { 84, 3}, {960, 3}, {3808, 3}, {244, 3}, -{280, 3}, {224, 3}, {156, 3}, {1088, 3}, {440, 3}, {216, 2}, -{304, 2}, { 23, 2}, { 25, 2}, { 26, 2}, {264, 2}, {328, 2}, -{1096, 2}, {240, 2}, {1104, 2}, {704, 2}, {1664, 2}, {360, 2}, -{808, 1}, {544, 1}, {236, 1}, {720, 1}, {368, 1}, {424, 1}, -{640, 1}, {1112, 1}, {552, 1}, {272, 1}, {776, 1}, {376, 1}, -{ 92, 1}, {536, 1}, {824, 1}, {496, 1}, {760, 1}, {792, 1}, -{504, 1}, {344, 1}, {1816, 1}, {880, 1}, {176, 1}, {320, 1}, -{352, 1}, {2008, 1}, {208, 1}, {408, 1}, {228, 1}, {2072, 1}, -{568, 1}, {220, 1}, {616, 1}, {600, 1}, {392, 1}, {696, 1}, -{2144, 1}, {1280, 1}, {2136, 1}, {632, 1}, {584, 1}, {456, 1}, -{472, 1}, {3440, 1}, {2088, 1}, {680, 1}, {2928, 1}, {212, 1}, -{648, 1}, {1752, 1}, {664, 1}, {3512, 1}, {1032, 1}, {528, 1}, -{4072, 1}, {204, 1}, {2880, 1}, {3392, 1}, {712, 1}, { 59, 1}, -{736, 1}, {592, 1}, {2520, 1}, {744, 1}, {196, 1}, {172, 1}, -{728, 1}, {2040, 1}, {1192, 1}, {3600, 1}, {0, 0} -}; - -#define ALIGN_NUM 1024 -#define ALIGN_MASK (ALIGN_NUM-1) -static uint8_t align_arr[ALIGN_NUM]; - -/* Alignment data for memset based on SPEC2017. */ -static align_data_t memset_align_freq[] = -{ - {16, 338}, {8, 307}, {32, 148}, {64, 131}, {4, 72}, {1, 23}, {2, 5}, {0, 0} -}; - -static void -init_memset_distribution (void) -{ - int i, j, freq, size, n; - - for (n = i = 0; (freq = memset_len_freq[i].freq) != 0; i++) - for (j = 0, size = memset_len_freq[i].size; j < freq; j++) - len_arr[n++] = size; - assert (n == SIZE_NUM); - - for (n = i = 0; (freq = memset_align_freq[i].freq) != 0; i++) - for (j = 0, size = memset_align_freq[i].align; j < freq; j++) - align_arr[n++] = size - 1; - assert (n == ALIGN_NUM); -} - -static size_t -init_memset (size_t max_size) -{ - size_t total = 0; - /* Create a random set of memsets with the given size and alignment - distributions. 
*/ - for (int i = 0; i < NUM_TESTS; i++) - { - test_arr[i].offset = (rand32 (0) & (max_size - 1)); - test_arr[i].offset &= ~align_arr[rand32 (0) & ALIGN_MASK]; - test_arr[i].len = len_arr[rand32 (0) & SIZE_MASK]; - total += test_arr[i].len; - } - - return total; -} - - -int main (void) -{ - init_memset_distribution (); - - memset (a, 1, sizeof (a)); - - printf("Random memset (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) - { - size_t total_size = 0; - uint64_t tsum = 0; - printf ("%22s ", funtab[f].name); - rand32 (0x12345678); - - for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) - { - size_t memset_size = init_memset (size) * ITERS; - - for (int c = 0; c < NUM_TESTS; c++) - funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len); - - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS; i++) - for (int c = 0; c < NUM_TESTS; c++) - funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len); - t = clock_get_ns () - t; - total_size += memset_size; - tsum += t; - printf ("%dK: %.2f ", size / 1024, (double)memset_size / t); - } - printf( "avg %.2f\n", (double)total_size / tsum); - } - - size_t total_size = 0; - uint64_t tsum = 0; - printf ("%22s ", "memset_call"); - rand32 (0x12345678); - - for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) - { - size_t memset_size = init_memset (size) * ITERS; - - for (int c = 0; c < NUM_TESTS; c++) - memset (a + test_arr[c].offset, 0, test_arr[c].len); - - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS; i++) - for (int c = 0; c < NUM_TESTS; c++) - memset (a + test_arr[c].offset, 0, test_arr[c].len); - t = clock_get_ns () - t; - total_size += memset_size; - tsum += t; - printf ("%dK: %.2f ", size / 1024, (double)memset_size / t); - } - printf( "avg %.2f\n", (double)total_size / tsum); - - - printf ("\nMedium memset (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) - { - printf ("%22s ", funtab[f].name); - - for (int size = 8; size <= 512; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS2; i++) - funtab[f].fun (a, 0, size); - t = clock_get_ns () - t; - printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); - } - printf ("\n"); - } - - printf ("%22s ", "memset_call"); - for (int size = 8; size <= 512; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS2; i++) - memset (a, 0, size); - t = clock_get_ns () - t; - printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); - } - - - printf ("\nLarge memset (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) - { - printf ("%22s ", funtab[f].name); - - for (int size = 1024; size <= 65536; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS3; i++) - funtab[f].fun (a, 0, size); - t = clock_get_ns () - t; - printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); - } - printf ("\n"); - } - - printf ("%22s ", "memset_call"); - for (int size = 1024; size <= 65536; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS3; i++) - memset (a, 0, size); - t = clock_get_ns () - t; - printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); - } - printf ("\n\n"); - - return 0; -} diff --git a/string/bench/strlen.c b/string/bench/strlen.c index f05d0d5..cc0f04b 100644 --- a/string/bench/strlen.c +++ b/string/bench/strlen.c @@ -1,8 +1,8 @@ /* * strlen benchmark. * - * Copyright (c) 2020-2021, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT */ #define _GNU_SOURCE @@ -13,10 +13,10 @@ #include "stringlib.h" #include "benchlib.h" -#define ITERS 5000 +#define ITERS 2000 #define ITERS2 20000000 #define ITERS3 2000000 -#define NUM_TESTS 16384 +#define NUM_STRLEN 16384 #define MAX_ALIGN 32 #define MAX_STRLEN 256 @@ -49,7 +49,7 @@ static const struct fun }; #undef F -static uint16_t strlen_tests[NUM_TESTS]; +static uint16_t strlen_tests[NUM_STRLEN]; typedef struct { uint16_t size; uint16_t freq; } freq_data_t; typedef struct { uint8_t align; uint16_t freq; } align_data_t; @@ -117,7 +117,7 @@ init_strlen_tests (void) /* Create a random set of strlen input strings using the string length and alignment distributions. */ - for (int n = 0; n < NUM_TESTS; n++) + for (int n = 0; n < NUM_STRLEN; n++) { int align = strlen_align_arr[rand32 (0) & ALIGN_MASK]; int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK]; @@ -141,14 +141,14 @@ int main (void) size_t res = 0, strlen_size = 0, mask = maskv; printf ("%22s ", funtab[f].name); - for (int c = 0; c < NUM_TESTS; c++) + for (int c = 0; c < NUM_STRLEN; c++) strlen_size += funtab[f].fun (a + strlen_tests[c]); strlen_size *= ITERS; /* Measure latency of strlen result with (res & mask). */ uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS; i++) - for (int c = 0; c < NUM_TESTS; c++) + for (int c = 0; c < NUM_STRLEN; c++) res = funtab[f].fun (a + strlen_tests[c] + (res & mask)); t = clock_get_ns () - t; printf ("%.2f\n", (double)strlen_size / t); diff --git a/string/include/benchlib.h b/string/include/benchlib.h index f1bbea3..0f2ce2e 100644 --- a/string/include/benchlib.h +++ b/string/include/benchlib.h @@ -2,7 +2,7 @@ * Benchmark support functions. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/string/include/stringlib.h b/string/include/stringlib.h index 650c52c..378c3cd 100644 --- a/string/include/stringlib.h +++ b/string/include/stringlib.h @@ -1,8 +1,8 @@ /* * Public API. * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2021, Arm Limited. 
+ * SPDX-License-Identifier: MIT */ #include @@ -29,17 +29,19 @@ size_t __strlen_aarch64 (const char *); size_t __strnlen_aarch64 (const char *, size_t); int __strncmp_aarch64 (const char *, const char *, size_t); void * __memchr_aarch64_mte (const void *, int, size_t); +char *__strcpy_aarch64_mte (char *__restrict, const char *__restrict); +char *__stpcpy_aarch64_mte (char *__restrict, const char *__restrict); char *__strchr_aarch64_mte (const char *, int); char * __strchrnul_aarch64_mte (const char *, int ); size_t __strlen_aarch64_mte (const char *); char *__strrchr_aarch64_mte (const char *, int); +int __strcmp_aarch64_mte (const char *, const char *); +int __strncmp_aarch64_mte (const char *, const char *, size_t); #if __ARM_NEON void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t); void *__memmove_aarch64_simd (void *, const void *, size_t); #endif # if __ARM_FEATURE_SVE -void *__memcpy_aarch64_sve (void *__restrict, const void *__restrict, size_t); -void *__memmove_aarch64_sve (void *__restrict, const void *__restrict, size_t); void *__memchr_aarch64_sve (const void *, int, size_t); int __memcmp_aarch64_sve (const void *, const void *, size_t); char *__strchr_aarch64_sve (const char *, int); @@ -52,11 +54,6 @@ size_t __strlen_aarch64_sve (const char *); size_t __strnlen_aarch64_sve (const char *, size_t); int __strncmp_aarch64_sve (const char *, const char *, size_t); # endif -# if WANT_MOPS -void *__memcpy_aarch64_mops (void *__restrict, const void *__restrict, size_t); -void *__memmove_aarch64_mops (void *__restrict, const void *__restrict, size_t); -void *__memset_aarch64_mops (void *, int, size_t); -# endif # if __ARM_FEATURE_MEMORY_TAGGING void *__mtag_tag_region (void *, size_t); void *__mtag_tag_zero_region (void *, size_t); diff --git a/string/test/__mtag_tag_region.c b/string/test/__mtag_tag_region.c index c45fa66..d8c02d9 100644 --- a/string/test/__mtag_tag_region.c +++ b/string/test/__mtag_tag_region.c @@ -2,7 +2,7 @@ * __mtag_tag_region test. * * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST diff --git a/string/test/__mtag_tag_zero_region.c b/string/test/__mtag_tag_zero_region.c index a4a7861..221c223 100644 --- a/string/test/__mtag_tag_zero_region.c +++ b/string/test/__mtag_tag_zero_region.c @@ -2,7 +2,7 @@ * __mtag_tag_zero_region test. * * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST diff --git a/string/test/memchr.c b/string/test/memchr.c index c6a9448..0ff77f5 100644 --- a/string/test/memchr.c +++ b/string/test/memchr.c @@ -2,7 +2,7 @@ * memchr test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/string/test/memcmp.c b/string/test/memcmp.c index f9236b8..7a7cf9c 100644 --- a/string/test/memcmp.c +++ b/string/test/memcmp.c @@ -2,7 +2,7 @@ * memcmp test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/string/test/memcpy.c b/string/test/memcpy.c index 0c2c75a..ce0ceee 100644 --- a/string/test/memcpy.c +++ b/string/test/memcpy.c @@ -1,8 +1,8 @@ /* * memcpy test. * - * Copyright (c) 2019-2022, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #include @@ -28,12 +28,6 @@ static const struct fun # if __ARM_NEON F(__memcpy_aarch64_simd, 1) # endif -# if __ARM_FEATURE_SVE - F(__memcpy_aarch64_sve, 1) -# endif -# if WANT_MOPS - F(__memcpy_aarch64_mops, 1) -# endif #elif __arm__ F(__memcpy_arm, 0) #endif diff --git a/string/test/memmove.c b/string/test/memmove.c index a5149d7..689b68c 100644 --- a/string/test/memmove.c +++ b/string/test/memmove.c @@ -1,8 +1,8 @@ /* * memmove test. * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #include @@ -28,12 +28,6 @@ static const struct fun # if __ARM_NEON F(__memmove_aarch64_simd, 1) # endif -# if __ARM_FEATURE_SVE - F(__memmove_aarch64_sve, 1) -# endif -# if WANT_MOPS - F(__memmove_aarch64_mops, 1) -# endif #endif {0, 0, 0} // clang-format on diff --git a/string/test/memrchr.c b/string/test/memrchr.c index 4171a56..adf96f0 100644 --- a/string/test/memrchr.c +++ b/string/test/memrchr.c @@ -2,7 +2,7 @@ * memchr test. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #ifndef _GNU_SOURCE diff --git a/string/test/memset.c b/string/test/memset.c index 3489e29..f172144 100644 --- a/string/test/memset.c +++ b/string/test/memset.c @@ -2,7 +2,7 @@ * memset test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include @@ -25,9 +25,6 @@ static const struct fun F(memset, 0) #if __aarch64__ F(__memset_aarch64, 1) -# if WANT_MOPS - F(__memset_aarch64_mops, 1) -# endif #elif __arm__ F(__memset_arm, 0) #endif diff --git a/string/test/mte.h b/string/test/mte.h index 40b0ecf..e67cbd9 100644 --- a/string/test/mte.h +++ b/string/test/mte.h @@ -2,7 +2,7 @@ * Memory tagging testing code. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #ifndef __TEST_MTE_H diff --git a/string/test/stpcpy.c b/string/test/stpcpy.c index 0300892..1827e68 100644 --- a/string/test/stpcpy.c +++ b/string/test/stpcpy.c @@ -1,8 +1,8 @@ /* * stpcpy test. * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #ifndef _GNU_SOURCE @@ -28,7 +28,8 @@ static const struct fun // clang-format off F(stpcpy, 0) #if __aarch64__ - F(__stpcpy_aarch64, 1) + F(__stpcpy_aarch64, 0) + F(__stpcpy_aarch64_mte, 1) # if __ARM_FEATURE_SVE F(__stpcpy_aarch64_sve, 1) # endif diff --git a/string/test/strchr.c b/string/test/strchr.c index 66180ac..f3ae982 100644 --- a/string/test/strchr.c +++ b/string/test/strchr.c @@ -2,7 +2,7 @@ * strchr test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/string/test/strchrnul.c b/string/test/strchrnul.c index aad0bf5..6c30ab2 100644 --- a/string/test/strchrnul.c +++ b/string/test/strchrnul.c @@ -2,7 +2,7 @@ * strchrnul test. * * Copyright (c) 2019-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #ifndef _GNU_SOURCE diff --git a/string/test/strcmp.c b/string/test/strcmp.c index 4aa95f4..d57b54e 100644 --- a/string/test/strcmp.c +++ b/string/test/strcmp.c @@ -1,8 +1,8 @@ /* * strcmp test. * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #include @@ -24,7 +24,8 @@ static const struct fun // clang-format off F(strcmp, 0) #if __aarch64__ - F(__strcmp_aarch64, 1) + F(__strcmp_aarch64, 0) + F(__strcmp_aarch64_mte, 1) # if __ARM_FEATURE_SVE F(__strcmp_aarch64_sve, 1) # endif diff --git a/string/test/strcpy.c b/string/test/strcpy.c index af297f9..e84cace 100644 --- a/string/test/strcpy.c +++ b/string/test/strcpy.c @@ -1,8 +1,8 @@ /* * strcpy test. * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #include @@ -24,7 +24,8 @@ static const struct fun // clang-format off F(strcpy, 0) #if __aarch64__ - F(__strcpy_aarch64, 1) + F(__strcpy_aarch64, 0) + F(__strcpy_aarch64_mte, 1) # if __ARM_FEATURE_SVE F(__strcpy_aarch64_sve, 1) # endif diff --git a/string/test/stringtest.h b/string/test/stringtest.h index 6bb7e1f..fe855fc 100644 --- a/string/test/stringtest.h +++ b/string/test/stringtest.h @@ -2,7 +2,7 @@ * Common string test code. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/string/test/strlen.c b/string/test/strlen.c index 47ef3dc..6278380 100644 --- a/string/test/strlen.c +++ b/string/test/strlen.c @@ -1,14 +1,15 @@ /* * strlen test. * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #include #include #include #include +#include #include #include "mte.h" #include "stringlib.h" diff --git a/string/test/strncmp.c b/string/test/strncmp.c index 4bbab6f..018a8a4 100644 --- a/string/test/strncmp.c +++ b/string/test/strncmp.c @@ -1,8 +1,8 @@ /* * strncmp test. * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #include @@ -24,7 +24,8 @@ static const struct fun // clang-format off F(strncmp, 0) #if __aarch64__ - F(__strncmp_aarch64, 1) + F(__strncmp_aarch64, 0) + F(__strncmp_aarch64_mte, 1) # if __ARM_FEATURE_SVE F(__strncmp_aarch64_sve, 1) # endif diff --git a/string/test/strnlen.c b/string/test/strnlen.c index a800fd1..0dea00e 100644 --- a/string/test/strnlen.c +++ b/string/test/strnlen.c @@ -2,7 +2,7 @@ * strnlen test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #ifndef _GNU_SOURCE diff --git a/string/test/strrchr.c b/string/test/strrchr.c index 580ca49..fedbdc5 100644 --- a/string/test/strrchr.c +++ b/string/test/strrchr.c @@ -2,7 +2,7 @@ * strrchr test. * * Copyright (c) 2019-2021, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/string/x86_64/check-arch.S b/string/x86_64/check-arch.S index 5afcf7b..26ade0a 100644 --- a/string/x86_64/check-arch.S +++ b/string/x86_64/check-arch.S @@ -2,7 +2,7 @@ * check ARCH setting. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #if !__x86_64__ -- Gitee
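
Note on the word-at-a-time NUL test restored in the strncmp/strnlen hunks above: the sequence `sub tmp1, data1, zeroones` / `orr tmp2, data1, #REP8_7f` / `bics has_nul, tmp1, tmp2` is the classic "does this 64-bit word contain a zero byte" bit trick. The stand-alone C sketch below models that trick only; it is an illustration, not code from the patch, and the helper name has_nul_byte is invented for the example.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define REP8_01 0x0101010101010101ULL   /* the "zeroones" constant in the asm */
#define REP8_7F 0x7f7f7f7f7f7f7f7fULL   /* the REP8_7f constant in the asm    */
#define REP8_80 0x8080808080808080ULL

/* Non-zero iff the 64-bit word contains at least one 0x00 byte.
   Mirrors the asm: tmp1 = x - REP8_01; tmp2 = x | REP8_7F;
   has_nul = tmp1 & ~tmp2, i.e. (x - 0x01..01) & ~x & 0x80..80.  */
static inline uint64_t
has_nul_byte (uint64_t x)
{
  return (x - REP8_01) & ~x & REP8_80;
}

int
main (void)
{
  uint64_t w1, w2;
  memcpy (&w1, "abcdefgh", 8);     /* no NUL byte in the word   */
  memcpy (&w2, "abc\0efgh", 8);    /* NUL in the fourth byte    */
  printf ("%d %d\n", has_nul_byte (w1) != 0, has_nul_byte (w2) != 0);
  return 0;
}

Compiled on its own, this prints "0 1": the predicate is exact (no false positives), which is why the reverted code can branch on it directly.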
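The strnlen.S hunk also restores a comment describing a 64-bit syndrome with four bits per byte, where counting trailing zeros locates the first matching byte of a 16-byte chunk. Below is a minimal scalar C model of that idea, assuming little-endian byte order (the patch's big-endian paths apply rbit first); the real code builds the syndrome with NEON cmeq plus narrowing, not a byte loop, and the function names here are illustrative only.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Build a 64-bit syndrome for a 16-byte chunk: each byte equal to `c`
   (or equal to NUL, when nul_too is set) contributes an all-ones nibble
   at 4 * byte_index.  16 bytes * 4 bits = 64 bits, so the whole chunk
   fits in one general-purpose register.  */
static uint64_t
make_syndrome (const unsigned char *chunk, unsigned char c, int nul_too)
{
  uint64_t synd = 0;
  for (int i = 0; i < 16; i++)
    if (chunk[i] == c || (nul_too && chunk[i] == 0))
      synd |= 0xfULL << (4 * i);
  return synd;
}

/* Index of the first matching byte, or -1 if none: count trailing zeros
   and divide by four, exactly as the restored comment describes.  */
static int
first_match (uint64_t synd)
{
  return synd ? __builtin_ctzll (synd) / 4 : -1;
}

int
main (void)
{
  unsigned char buf[16];
  memcpy (buf, "hello, syndrome!", 16);
  printf ("first 'o' at byte %d\n", first_match (make_syndrome (buf, 'o', 0)));
  return 0;
}

This prints "first 'o' at byte 4". Using a full nibble per byte is what lets the reverted code merge adjacent bytes with addp and still recover an exact byte index from a single clz/ctz, which is the property the comment in the hunk is pointing at.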