diff --git a/LICENSE b/LICENSE index 2543b82ed92d0bdc5f3fdfa5047144db3c7e9014..20a4b7717cf5e46e2def2ecd47756baf3061d2bd 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,11 @@ +MIT OR Apache-2.0 WITH LLVM-exception +===================================== + + MIT License +----------- -Copyright (c) 1999-2019, Arm Limited. +Copyright (c) 1999-2022, Arm Limited. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -19,3 +24,226 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +Apache-2.0 WITH LLVM-exception +------------------------------ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +--- LLVM Exceptions to the Apache 2.0 License ---- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into an Object form of such source code, you +may redistribute such embedded portions in such Object form without complying +with the conditions of Sections 4(a), 4(b) and 4(d) of the License. + +In addition, if you combine or link compiled forms of this Software with +software that is licensed under the GPLv2 ("Combined Software") and if a +court of competent jurisdiction determines that the patent provision (Section +3), the indemnity provision (Section 9) or other Section of the License +conflicts with the conditions of the GPLv2, you may retroactively and +prospectively choose to deem waived or otherwise exclude such Section(s) of +the License, but only in their entirety and only with respect to the Combined +Software. diff --git a/Makefile b/Makefile index 169f89e2c9d6be3f53a91780447652ee7917b28e..c487896728c2cd3c877dad0f52256ddd1e5ebbe8 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Makefile - requires GNU make # -# Copyright (c) 2018-2020, Arm Limited. -# SPDX-License-Identifier: MIT +# Copyright (c) 2018-2022, Arm Limited. +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception srcdir = . prefix = /usr @@ -11,6 +11,7 @@ includedir = $(prefix)/include # Configure these in config.mk, do not make changes in this file. SUBS = math string networking +PLSUBS = math HOST_CC = cc HOST_CFLAGS = -std=c99 -O2 HOST_LDFLAGS = @@ -20,6 +21,7 @@ CPPFLAGS = CFLAGS = -std=c99 -O2 CFLAGS_SHARED = -fPIC CFLAGS_ALL = -Ibuild/include $(CPPFLAGS) $(CFLAGS) +CFLAGS_PL = -Ibuild/pl/include $(CPPFLAGS) $(CFLAGS) -DPL LDFLAGS = LDLIBS = AR = $(CROSS_COMPILE)ar @@ -51,6 +53,7 @@ $(DIRS): mkdir -p $@ $(filter %.os,$(ALL_FILES)): CFLAGS_ALL += $(CFLAGS_SHARED) +$(filter %.os,$(ALL_FILES)): CFLAGS_PL += $(CFLAGS_SHARED) build/%.o: $(srcdir)/%.S $(CC) $(CFLAGS_ALL) -c -o $@ $< diff --git a/OAT.xml b/OAT.xml index 71acb93c33930961bd73a8c0eed2ddee84da6bd7..ab48a784237e62c8f8595b1b124e3251991afade 100644 --- a/OAT.xml +++ b/OAT.xml @@ -19,7 +19,7 @@ policylist: 1. policy: If the OAT-Default.xml policies do not meet your requirements, please add policies here. 2. 
policyitem: The fields type, name, path, desc is required, and the fields rule, group, filefilter is optional,the default value is: - + 3. policyitem type: "compatibility" is used to check license compatibility in the specified path; "license" is used to check source license header in the specified path; @@ -49,10 +49,43 @@ All configurations in this file will be merged to OAT-Default.xml, if you have a + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + + + diff --git a/README b/README index 9e1a34fdc65d9acd27964255a42211af5ef06efa..a2143a28488abe9cbdb629698a3f22d353489b9a 100644 --- a/README +++ b/README @@ -2,14 +2,17 @@ Arm Optimized Routines ---------------------- This repository contains implementations of library functions -provided by Arm under MIT License (See LICENSE). Contributions -to this project are accepted, but Contributors have to sign an -Assignment Agreement, please follow the instructions in +provided by Arm. The outbound license is available under a dual +license, at the user’s election, as reflected in the LICENSE file. +Contributions to this project are accepted, but Contributors have +to sign an Assignment Agreement, please follow the instructions in contributor-agreement.pdf. This is needed so upstreaming code -to projects that require copyright assignment is possible. +to projects that require copyright assignment is possible. Further +contribution requirements are documented in README.contributors of +the appropriate subdirectory. Regular quarterly releases are tagged as vYY.MM, the latest -release is v21.02. +release is v23.01. Source code layout: @@ -24,6 +27,7 @@ networking/test/ - networking test and benchmark related sources. string/ - string routines subproject sources. string/include/ - string library public headers. string/test/ - string test and benchmark related sources. +pl/... - separately maintained performance library code. The steps to build the target libraries and run the tests: diff --git a/config.mk.dist b/config.mk.dist index 177e1ac4f53a3e14772a7560f7f79eba86ffe5e7..c4a6dba4b463f669c8a27bac66029c508ed2c875 100644 --- a/config.mk.dist +++ b/config.mk.dist @@ -1,11 +1,14 @@ # Example config.mk # -# Copyright (c) 2018-2020, Arm Limited. -# SPDX-License-Identifier: MIT +# Copyright (c) 2018-2022, Arm Limited. +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception # Subprojects to build SUBS = math string networking +# Subsubprojects to build if subproject pl is built +PLSUBS = math + # Target architecture: aarch64, arm or x86_64 ARCH = aarch64 @@ -56,8 +59,22 @@ math-cflags += -ffp-contract=fast -fno-math-errno # Use with clang. #math-cflags += -ffp-contract=fast -# Disable vector math code -#math-cflags += -DWANT_VMATH=0 +# Disable/enable SVE vector math code and tests +WANT_SVE_MATH = 0 +ifeq ($(WANT_SVE_MATH), 1) + math-cflags += -march=armv8.2-a+sve +endif +math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH) + +# If defined to 1, set errno in math functions according to ISO C. Many math +# libraries do not set errno, so this is 0 by default. It may need to be +# set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0. +WANT_ERRNO = 0 +math-cflags += -DWANT_ERRNO=$(WANT_ERRNO) + +# If set to 1, set fenv in vector math routines. 
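+# (For example, a hypothetical config.mk wanting full ISO C error handling
+# would set WANT_ERRNO = 1 above, so the scalar routines set errno, and
+# WANT_SIMD_EXCEPT = 1 below, so the vector routines raise fenv exceptions;
+# both default to 0, trading that conformance for speed.)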
+WANT_SIMD_EXCEPT = 0
+math-cflags += -DWANT_SIMD_EXCEPT=$(WANT_SIMD_EXCEPT)
 
 # Disable fenv checks
 #math-ulpflags = -q -f
diff --git a/math/Dir.mk b/math/Dir.mk
index 3b841ab71955cc69efff77a1e1fee21938422371..d6385d2bf5173daa6ea0b68d358749c5e7c45154 100644
--- a/math/Dir.mk
+++ b/math/Dir.mk
@@ -1,12 +1,14 @@
 # Makefile fragment - requires GNU make
 #
-# Copyright (c) 2019, Arm Limited.
-# SPDX-License-Identifier: MIT
+# Copyright (c) 2019-2022, Arm Limited.
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 S := $(srcdir)/math
 B := build/math
 
 math-lib-srcs := $(wildcard $(S)/*.[cS])
+math-lib-srcs += $(wildcard $(S)/$(ARCH)/*.[cS])
+
 math-test-srcs := \
 	$(S)/test/mathtest.c \
 	$(S)/test/mathbench.c \
@@ -15,6 +17,7 @@ math-test-srcs := \
 math-test-host-srcs := $(wildcard $(S)/test/rtest/*.[cS])
 
 math-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
+math-test-includes := $(patsubst $(S)/%,build/include/%,$(wildcard $(S)/test/*.h))
 
 math-libs := \
 build/lib/libmathlib.so \
@@ -42,10 +45,11 @@ math-files := \
 $(math-tools) \
 $(math-host-tools) \
 $(math-includes) \
+$(math-test-includes) \
 
-all-math: $(math-libs) $(math-tools) $(math-includes)
+all-math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes)
 
-$(math-objs): $(math-includes)
+$(math-objs): $(math-includes) $(math-test-includes)
 $(math-objs): CFLAGS_ALL += $(math-cflags)
 $(B)/test/mathtest.o: CFLAGS_ALL += -fmath-errno
 $(math-host-objs): CC = $(HOST_CC)
@@ -83,6 +87,9 @@ build/bin/ulp: $(B)/test/ulp.o build/lib/libmathlib.a
 build/include/%.h: $(S)/include/%.h
 	cp $< $@
 
+build/include/test/%.h: $(S)/test/%.h
+	cp $< $@
+
 build/bin/%.sh: $(S)/test/%.sh
 	cp $< $@
 
@@ -96,7 +103,7 @@ check-math-rtest: $(math-host-tools) $(math-tools)
 	cat $(math-rtests) | build/bin/rtest | $(EMULATOR) build/bin/mathtest $(math-testflags)
 
 check-math-ulp: $(math-tools)
-	ULPFLAGS="$(math-ulpflags)" build/bin/runulp.sh $(EMULATOR)
+	ULPFLAGS="$(math-ulpflags)" WANT_SIMD_EXCEPT="$(WANT_SIMD_EXCEPT)" build/bin/runulp.sh $(EMULATOR)
 
 check-math: check-math-test check-math-rtest check-math-ulp
diff --git a/math/README.contributors b/math/README.contributors
new file mode 100644
index 0000000000000000000000000000000000000000..33e7ba376e419301eaf8e51fc7abe4ad10a31350
--- /dev/null
+++ b/math/README.contributors
@@ -0,0 +1,78 @@
+STYLE REQUIREMENTS
+==================
+
+1. Most code in this sub-directory is expected to be upstreamed into glibc, so
+   the GNU Coding Standard and glibc-specific conventions should be followed
+   to ease upstreaming.
+
+2. ABI and symbols: the code should be written so it is suitable for inclusion
+   into a libc with minimal changes. This means, for example, that internal
+   symbols should be hidden and placed in the implementation-reserved
+   namespace according to ISO C and POSIX rules. If possible, the built shared
+   libraries and static library archives should be usable to override libc
+   symbols at link time (or at runtime via LD_PRELOAD). This requires the
+   symbols to follow the glibc ABI (other than symbol versioning); since this
+   cannot be done reliably for static linking, it is a best-effort
+   requirement.
+
+3. API: include headers should be suitable for benchmarking and testing code
+   and should not conflict with libc headers.
+
+
+CONTRIBUTION GUIDELINES FOR math SUB-DIRECTORY
+==============================================
+
+1. Math functions have quality and performance requirements.
+
+2. Quality:
+   - Worst-case ULP error should be small across the entire input domain. For
+     most common double-precision scalar functions the target is < 0.66 ULP
+     error, and < 1 ULP for single precision; even a performance-optimized
+     function variant should not have > 5 ULP error if the goal is to be a
+     drop-in replacement for a standard math function. This should be tested
+     statistically (or on all inputs, if that is possible in a reasonable
+     amount of time). The ulp tool is for this, and runulp.sh should be
+     updated for new functions.
+
+   - All standard rounding modes need to be supported, but in non-default
+     rounding modes the quality requirement can be relaxed. (Non-nearest
+     rounded computation can be slow and inaccurate but has to be correct for
+     conformance reasons.)
+
+   - Special cases and error handling need to follow ISO C Annex F
+     requirements, POSIX requirements, IEEE 754-2008 requirements and glibc
+     requirements:
+     https://www.gnu.org/software/libc/manual/html_mono/libc.html#Errors-in-Math-Functions
+     This should be tested by direct tests (the glibc test system may be used
+     for it).
+
+   - Error handling code should be decoupled from the approximation code as
+     much as possible. (There are helper functions; these take care of errno
+     as well as exception raising.)
+
+   - Vector math code does not need to work in non-nearest rounding modes, and
+     error handling side effects (fenv exceptions and errno) need not happen,
+     but the result should be correct (within quality requirements, which are
+     lower for vector code than for scalar code).
+
+   - Error bounds of the approximation should be clearly documented.
+
+   - The code should build and pass tests on arm, aarch64 and x86_64 GNU Linux
+     systems. (Routines and features can be disabled on specific targets, but
+     the build must complete.) On aarch64, both little- and big-endian targets
+     are supported, as well as valid combinations of architecture extensions.
+     The configurations that should be tested depend on the contribution.
+
+3. Performance:
+   - Common math code should be benchmarked on modern aarch64
+     microarchitectures over typical inputs.
+
+   - Performance improvements should be documented (relative numbers can be
+     published; it is enough to use the mathbench microbenchmark tool, which
+     should be updated for new functions).
+
+   - Attention should be paid to the compilation flags: for aarch64, fma
+     contraction should be enabled and math errno turned off so that some
+     builtins can be inlined.
+
+   - The code should be reasonably performant on x86_64 too; e.g. some
+     rounding instructions and fma may not be available on x86_64, and such
+     builtins then turn into slow libc calls. Such a slowdown is not
+     acceptable, so a faster fallback should be present: glibc and bionic use
+     the same code on all targets. (This does not apply to vector math code.)
diff --git a/math/aarch64/v_cos.c b/math/aarch64/v_cos.c
new file mode 100644
index 0000000000000000000000000000000000000000..9a73575bce896a9cc54930bb5cd7586b316aa5c0
--- /dev/null
+++ b/math/aarch64/v_cos.c
@@ -0,0 +1,87 @@
+/*
+ * Double-precision vector cos function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+  float64x2_t poly[7];
+  float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3;
+} data = {
+  /* Worst-case error is 3.3 ulp in [-pi/2, pi/2].
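+     To make the sign logic below concrete: with m = rint((|x| + pi/2)/pi)
+     and r = |x| - (m - 0.5)*pi in [-pi/2, pi/2], we have
+     |x| = m*pi - pi/2 + r, hence cos(x) = cos(|x|) = (-1)^m sin(r). The low
+     bit of m, extracted below as `odd`, applies the (-1)^m factor by XORing
+     the sign bit of the sin(r) approximation.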
*/ + .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), + V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), + V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), + V2 (-0x1.9e9540300a1p-41) }, + .inv_pi = V2 (0x1.45f306dc9c883p-2), + .half_pi = V2 (0x1.921fb54442d18p+0), + .pi_1 = V2 (0x1.921fb54442d18p+1), + .pi_2 = V2 (0x1.1a62633145c06p-53), + .pi_3 = V2 (0x1.c1cd129024e09p-106), + .shift = V2 (0x1.8p52), + .range_val = V2 (0x1p23) +}; + +#define C(i) d->poly[i] + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) +{ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); + return v_call_f64 (cos, x, y, cmp); +} + +float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t n, r, r2, r3, r4, t1, t2, t3, y; + uint64x2_t odd, cmp; + +#if WANT_SIMD_EXCEPT + r = vabsq_f64 (x); + cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r), + vreinterpretq_u64_f64 (d->range_val)); + if (unlikely (v_any_u64 (cmp))) + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + special-case handler later. */ + r = vbslq_f64 (cmp, v_f64 (1.0), r); +#else + cmp = vcageq_f64 (x, d->range_val); + r = x; +#endif + + /* n = rint((|x|+pi/2)/pi) - 0.5. */ + n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi)); + odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); + n = vsubq_f64 (n, d->shift); + n = vsubq_f64 (n, v_f64 (0.5)); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f64 (r, d->pi_1, n); + r = vfmsq_f64 (r, d->pi_2, n); + r = vfmsq_f64 (r, d->pi_3, n); + + /* sin(r) poly approx. */ + r2 = vmulq_f64 (r, r); + r3 = vmulq_f64 (r2, r); + r4 = vmulq_f64 (r2, r2); + + t1 = vfmaq_f64 (C (4), C (5), r2); + t2 = vfmaq_f64 (C (2), C (3), r2); + t3 = vfmaq_f64 (C (0), C (1), r2); + + y = vfmaq_f64 (t1, C (6), r4); + y = vfmaq_f64 (t2, y, r4); + y = vfmaq_f64 (t3, y, r4); + y = vfmaq_f64 (r, y, r3); + + if (unlikely (v_any_u64 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); +} diff --git a/math/aarch64/v_cosf.c b/math/aarch64/v_cosf.c new file mode 100644 index 0000000000000000000000000000000000000000..b9890b2998ad3c260a6849d980cf3f69b4453ec4 --- /dev/null +++ b/math/aarch64/v_cosf.c @@ -0,0 +1,82 @@ +/* + * Single-precision vector cos function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[4]; + float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3; +} data = { + /* 1.886 ulp error. */ + .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), + V4 (0x1.5b2e76p-19f) }, + + .pi_1 = V4 (0x1.921fb6p+1f), + .pi_2 = V4 (-0x1.777a5cp-24f), + .pi_3 = V4 (-0x1.ee59dap-49f), + + .inv_pi = V4 (0x1.45f306p-2f), + .shift = V4 (0x1.8p+23f), + .half_pi = V4 (0x1.921fb6p0f), + .range_val = V4 (0x1p20f) +}; + +#define C(i) d->poly[i] + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) +{ + /* Fall back to scalar code. 
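+     Here v_call_f32 applies the scalar cosf to the lanes selected by cmp and
+     keeps the vector result y in the remaining lanes, so scalar error
+     handling (fenv, errno) runs only for the special lanes.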
*/
+  y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+  return v_call_f32 (cosf, x, y, cmp);
+}
+
+float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  float32x4_t n, r, r2, r3, y;
+  uint32x4_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+  r = vabsq_f32 (x);
+  cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r),
+		   vreinterpretq_u32_f32 (d->range_val));
+  if (unlikely (v_any_u32 (cmp)))
+    /* If fenv exceptions are to be triggered correctly, set any special lanes
+       to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by the
+       special-case handler later.  */
+    r = vbslq_f32 (cmp, v_f32 (1.0f), r);
+#else
+  cmp = vcageq_f32 (x, d->range_val);
+  r = x;
+#endif
+
+  /* n = rint((|x|+pi/2)/pi) - 0.5.  */
+  n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi));
+  odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
+  n = vsubq_f32 (n, d->shift);
+  n = vsubq_f32 (n, v_f32 (0.5f));
+
+  /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2).  */
+  r = vfmsq_f32 (r, d->pi_1, n);
+  r = vfmsq_f32 (r, d->pi_2, n);
+  r = vfmsq_f32 (r, d->pi_3, n);
+
+  /* y = sin(r).  */
+  r2 = vmulq_f32 (r, r);
+  r3 = vmulq_f32 (r2, r);
+  y = vfmaq_f32 (C (2), C (3), r2);
+  y = vfmaq_f32 (C (1), y, r2);
+  y = vfmaq_f32 (C (0), y, r2);
+  y = vfmaq_f32 (r, y, r3);
+
+  if (unlikely (v_any_u32 (cmp)))
+    return special_case (x, y, odd, cmp);
+  return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+}
diff --git a/math/aarch64/v_exp.c b/math/aarch64/v_exp.c
new file mode 100644
index 0000000000000000000000000000000000000000..bc5609faf4fc3597a5ec3a1080a12e843417bcc7
--- /dev/null
+++ b/math/aarch64/v_exp.c
@@ -0,0 +1,125 @@
+/*
+ * Double-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define IndexMask (N - 1)
+
+const static volatile struct
+{
+  float64x2_t poly[3];
+  float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
+#if !WANT_SIMD_EXCEPT
+  float64x2_t special_bound, scale_thresh;
+#endif
+} data = {
+  /* maxerr: 1.88 +0.5 ulp
+     rel error: 1.4337*2^-53
+     abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ].  */
+  .poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3),
+	    V2 (0x1.55555da646206p-5) },
+#if !WANT_SIMD_EXCEPT
+  .scale_thresh = V2 (163840.0), /* 1280.0 * N.  */
+  .special_bound = V2 (704.0),
+#endif
+  .inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2.  */
+  .ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N.  */
+  .ln2_lo = V2 (0x1.abc9e3b39803f3p-63),
+  .shift = V2 (0x1.8p+52)
+};
+
+#define C(i) data.poly[i]
+#define Tab __v_exp_data
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511).  */
+# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9).  */
+# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound.  */
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
+{
+  /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+     routine for special lanes.  */
+  return v_call_f64 (exp, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513.  */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0).  */
+# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769.  */
+# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254.
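+   To see why the split in special_case below is exact: with b = 0,
+   asuint (s1) + asuint (s2) = asuint (s) + (SpecialBias1 - SpecialBias2)
+   = asuint (s) + asuint (1.0), and adding asuint (1.0) to the biased
+   exponent is multiplication by 1.0, so s1 * s2 == s. For n < 0,
+   b = SpecialOffset moves a factor of 2^1536 from s1 into s2 so that each
+   factor stays representable.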
*/ + +static inline float64x2_t VPCS_ATTR +special_case (float64x2_t s, float64x2_t y, float64x2_t n) +{ + /* 2^(n/N) may overflow, break it up into s1*s2. */ + uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset); + float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b)); + float64x2_t s2 = vreinterpretq_f64_u64 ( + vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b)); + uint64x2_t cmp = vcagtq_f64 (n, data.scale_thresh); + float64x2_t r1 = vmulq_f64 (s1, s1); + float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1); + return vbslq_f64 (cmp, r1, r0); +} + +#endif + +float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x) +{ + float64x2_t n, r, r2, s, y, z; + uint64x2_t cmp, u, e; + +#if WANT_SIMD_EXCEPT + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special_case to fix special lanes later. This is only necessary if fenv + exceptions are to be triggered correctly. */ + float64x2_t xm = x; + uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x)); + cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound); + if (unlikely (v_any_u64 (cmp))) + x = vbslq_f64 (cmp, v_f64 (1), x); +#else + cmp = vcagtq_f64 (x, data.special_bound); +#endif + + /* n = round(x/(ln2/N)). */ + z = vfmaq_f64 (data.shift, x, data.inv_ln2); + u = vreinterpretq_u64_f64 (z); + n = vsubq_f64 (z, data.shift); + + /* r = x - n*ln2/N. */ + r = x; + r = vfmsq_f64 (r, data.ln2_hi, n); + r = vfmsq_f64 (r, data.ln2_lo, n); + + e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS); + + /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4. */ + r2 = vmulq_f64 (r, r); + y = vfmaq_f64 (C (0), C (1), r); + y = vfmaq_f64 (y, C (2), r2); + y = vfmaq_f64 (r, y, r2); + + /* s = 2^(n/N). */ + u = (uint64x2_t){ Tab[u[0] & IndexMask], Tab[u[1] & IndexMask] }; + s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); + + if (unlikely (v_any_u64 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f64 (s, y, s), cmp); +#else + return special_case (s, y, n); +#endif + + return vfmaq_f64 (s, y, s); +} diff --git a/math/aarch64/v_exp2f.c b/math/aarch64/v_exp2f.c new file mode 100644 index 0000000000000000000000000000000000000000..e402205e98e6bea310877d6d8b9b5f014e16c47a --- /dev/null +++ b/math/aarch64/v_exp2f.c @@ -0,0 +1,113 @@ +/* + * Single-precision vector 2^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[5]; + uint32x4_t exponent_bias; +#if !WANT_SIMD_EXCEPT + float32x4_t special_bound, scale_thresh; +#endif +} data = { + /* maxerr: 1.962 ulp. */ + .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f), + V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) }, + .exponent_bias = V4 (0x3f800000), +#if !WANT_SIMD_EXCEPT + .special_bound = V4 (126.0f), + .scale_thresh = V4 (192.0f), +#endif +}; + +#define C(i) d->poly[i] + +#if WANT_SIMD_EXCEPT + +# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ +# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */ +# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */ + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine for special lanes. 
*/ + return v_call_f32 (exp2f, x, y, cmp); +} + +#else + +# define SpecialOffset v_u32 (0x82000000) +# define SpecialBias v_u32 (0x7f000000) + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); + float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); + /* Similar to r1 but avoids double rounding in the subnormal range. */ + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); + float32x4_t r = vbslq_f32 (cmp1, r1, r0); + return vbslq_f32 (cmp2, r2, r); +} + +#endif + +float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, scale, p, q, poly; + uint32x4_t cmp, e; + +#if WANT_SIMD_EXCEPT + /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */ + uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); + cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); + float32x4_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special_case to fix special lanes later. This is only necessary if fenv + exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = vbslq_f32 (cmp, v_f32 (1), x); +#endif + + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. */ + n = vrndaq_f32 (x); + r = vsubq_f32 (x, n); + e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); + scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + +#if !WANT_SIMD_EXCEPT + cmp = vcagtq_f32 (n, d->special_bound); +#endif + + r2 = vmulq_f32 (r, r); + p = vfmaq_f32 (C (1), C (0), r); + q = vfmaq_f32 (C (3), C (2), r); + q = vfmaq_f32 (q, p, r2); + p = vmulq_f32 (C (4), r); + poly = vfmaq_f32 (p, q, r2); + + if (unlikely (v_any_u32 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); +#else + return special_case (poly, n, e, cmp, scale, d); +#endif + + return vfmaq_f32 (scale, poly, scale); +} diff --git a/math/v_exp2f_1u.c b/math/aarch64/v_exp2f_1u.c similarity index 43% rename from math/v_exp2f_1u.c rename to math/aarch64/v_exp2f_1u.c index 1caa14d9bffffbb2d0cc47ac6470b12701732f67..ba6b02fbb4bcbd9c215d8326dd74f2e4bbadc18b 100644 --- a/math/v_exp2f_1u.c +++ b/math/aarch64/v_exp2f_1u.c @@ -1,13 +1,12 @@ /* * Single-precision vector 2^x function. * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" -#if V_SUPPORTED static const float Poly[] = { /* maxerr: 0.878 ulp. */ @@ -25,51 +24,49 @@ static const float Poly[] = { #define Ln2hi v_f32 (0x1.62e4p-1f) #define Ln2lo v_f32 (0x1.7f7d1cp-20f) -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn) +static float32x4_t VPCS_ATTR NOINLINE +specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn) { /* 2^n may overflow, break it up into s1*s2. 
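     A worked example (hypothetical input, constants from the code below):
     for n = -130, e = (-130) << 23 = 0xbf000000 (mod 2^32) and
     b = 0x83000000, so s1 = asfloat (0x7f000000 + b) = asfloat (0x02000000)
     = 0x1p-123 and s2 = asfloat (e - b) = asfloat (0x3c000000) = 0x1p-7;
     then s1 * s2 = 0x1p-130 = 2^n, reachable even though 2^-130 itself is
     subnormal.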
*/ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f)); - v_f32_t r1 = s1 * s1; - v_f32_t r0 = poly * s1 * s2; - return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0))); + uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000); + float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b); + float32x4_t s2 = vreinterpretq_f32_u32 (e - b); + uint32x4_t cmp = absn > v_f32 (192.0f); + float32x4_t r1 = s1 * s1; + float32x4_t r0 = poly * s1 * s2; + return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) + | (~cmp & vreinterpretq_u32_f32 (r0))); } -VPCS_ATTR -v_f32_t -V_NAME(exp2f_1u) (v_f32_t x) +float32x4_t VPCS_ATTR +_ZGVnN4v_exp2f_1u (float32x4_t x) { - v_f32_t n, r, scale, poly, absn; - v_u32_t cmp, e; + float32x4_t n, r, scale, poly, absn; + uint32x4_t cmp, e; /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] x = n + r, with r in [-1/2, 1/2]. */ #if 0 - v_f32_t z; + float32x4_t z; z = x + Shift; n = z - Shift; r = x - n; - e = v_as_u32_f32 (z) << 23; + e = vreinterpretq_u32_f32 (z) << 23; #else - n = v_round_f32 (x); + n = vrndaq_f32 (x); r = x - n; - e = v_as_u32_s32 (v_round_s32 (x)) << 23; + e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23; #endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); - poly = v_fma_f32 (C0, r, C1); - poly = v_fma_f32 (poly, r, C2); - poly = v_fma_f32 (poly, r, C3); - poly = v_fma_f32 (poly, r, C4); - poly = v_fma_f32 (poly, r, C5); - poly = v_fma_f32 (poly, r, v_f32 (1.0f)); + scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000)); + absn = vabsq_f32 (n); + cmp = absn > v_f32 (126.0f); + poly = vfmaq_f32 (C1, C0, r); + poly = vfmaq_f32 (C2, poly, r); + poly = vfmaq_f32 (C3, poly, r); + poly = vfmaq_f32 (C4, poly, r); + poly = vfmaq_f32 (C5, poly, r); + poly = vfmaq_f32 (v_f32 (1.0f), poly, r); if (unlikely (v_any_u32 (cmp))) return specialcase (poly, n, e, absn); return scale * poly; } -#endif diff --git a/math/aarch64/v_exp_data.c b/math/aarch64/v_exp_data.c new file mode 100644 index 0000000000000000000000000000000000000000..45f0848cac5b5bcf00b768c7f107e0400a8fab7a --- /dev/null +++ b/math/aarch64/v_exp_data.c @@ -0,0 +1,146 @@ +/* + * Lookup table for double-precision e^x vector function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +# define N (1 << V_EXP_TABLE_BITS) + +/* 2^(j/N), j=0..N. 
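+   Encoding note: each entry below is asuint64 (2^(j/N)) minus
+   (j << (52 - V_EXP_TABLE_BITS)), so a caller can rebuild s = 2^(n/N) with a
+   single add of e = u << (52 - V_EXP_TABLE_BITS), which restores j and folds
+   the integer part of n into the exponent at once. E.g. for N == 128,
+   entry 64 is asuint64 (0x1.6a09e667f3bcdp+0) - (64 << 45)
+   = 0x3feea09e667f3bcd.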
*/ +const uint64_t __v_exp_data[] = { +# if N == 128 + 0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061, + 0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de, + 0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f, + 0x3fefac922b7247f7, 0x3fefa3ec32d3d1a2, 0x3fef9b66affed31b, + 0x3fef9301d0125b51, 0x3fef8abdc06c31cc, 0x3fef829aaea92de0, + 0x3fef7a98c8a58e51, 0x3fef72b83c7d517b, 0x3fef6af9388c8dea, + 0x3fef635beb6fcb75, 0x3fef5be084045cd4, 0x3fef54873168b9aa, + 0x3fef4d5022fcd91d, 0x3fef463b88628cd6, 0x3fef3f49917ddc96, + 0x3fef387a6e756238, 0x3fef31ce4fb2a63f, 0x3fef2b4565e27cdd, + 0x3fef24dfe1f56381, 0x3fef1e9df51fdee1, 0x3fef187fd0dad990, + 0x3fef1285a6e4030b, 0x3fef0cafa93e2f56, 0x3fef06fe0a31b715, + 0x3fef0170fc4cd831, 0x3feefc08b26416ff, 0x3feef6c55f929ff1, + 0x3feef1a7373aa9cb, 0x3feeecae6d05d866, 0x3feee7db34e59ff7, + 0x3feee32dc313a8e5, 0x3feedea64c123422, 0x3feeda4504ac801c, + 0x3feed60a21f72e2a, 0x3feed1f5d950a897, 0x3feece086061892d, + 0x3feeca41ed1d0057, 0x3feec6a2b5c13cd0, 0x3feec32af0d7d3de, + 0x3feebfdad5362a27, 0x3feebcb299fddd0d, 0x3feeb9b2769d2ca7, + 0x3feeb6daa2cf6642, 0x3feeb42b569d4f82, 0x3feeb1a4ca5d920f, + 0x3feeaf4736b527da, 0x3feead12d497c7fd, 0x3feeab07dd485429, + 0x3feea9268a5946b7, 0x3feea76f15ad2148, 0x3feea5e1b976dc09, + 0x3feea47eb03a5585, 0x3feea34634ccc320, 0x3feea23882552225, + 0x3feea155d44ca973, 0x3feea09e667f3bcd, 0x3feea012750bdabf, + 0x3fee9fb23c651a2f, 0x3fee9f7df9519484, 0x3fee9f75e8ec5f74, + 0x3fee9f9a48a58174, 0x3fee9feb564267c9, 0x3feea0694fde5d3f, + 0x3feea11473eb0187, 0x3feea1ed0130c132, 0x3feea2f336cf4e62, + 0x3feea427543e1a12, 0x3feea589994cce13, 0x3feea71a4623c7ad, + 0x3feea8d99b4492ed, 0x3feeaac7d98a6699, 0x3feeace5422aa0db, + 0x3feeaf3216b5448c, 0x3feeb1ae99157736, 0x3feeb45b0b91ffc6, + 0x3feeb737b0cdc5e5, 0x3feeba44cbc8520f, 0x3feebd829fde4e50, + 0x3feec0f170ca07ba, 0x3feec49182a3f090, 0x3feec86319e32323, + 0x3feecc667b5de565, 0x3feed09bec4a2d33, 0x3feed503b23e255d, + 0x3feed99e1330b358, 0x3feede6b5579fdbf, 0x3feee36bbfd3f37a, + 0x3feee89f995ad3ad, 0x3feeee07298db666, 0x3feef3a2b84f15fb, + 0x3feef9728de5593a, 0x3feeff76f2fb5e47, 0x3fef05b030a1064a, + 0x3fef0c1e904bc1d2, 0x3fef12c25bd71e09, 0x3fef199bdd85529c, + 0x3fef20ab5fffd07a, 0x3fef27f12e57d14b, 0x3fef2f6d9406e7b5, + 0x3fef3720dcef9069, 0x3fef3f0b555dc3fa, 0x3fef472d4a07897c, + 0x3fef4f87080d89f2, 0x3fef5818dcfba487, 0x3fef60e316c98398, + 0x3fef69e603db3285, 0x3fef7321f301b460, 0x3fef7c97337b9b5f, + 0x3fef864614f5a129, 0x3fef902ee78b3ff6, 0x3fef9a51fbc74c83, + 0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27, + 0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14, + 0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1, +# elif N == 256 + 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335, + 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc, + 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574, + 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836, + 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383, + 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85, + 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2, + 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e, + 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc, + 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e, + 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b, + 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f, + 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4, + 0x3fef582f95281c6b, 
0x3fef54873168b9aa, 0x3fef50e75eb44027, + 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6, + 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1, + 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f, + 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29, + 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1, + 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f, + 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56, + 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd, + 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff, + 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b, + 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866, + 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4, + 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422, + 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024, + 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897, + 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232, + 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0, + 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7, + 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d, + 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee, + 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82, + 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2, + 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd, + 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03, + 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148, + 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4, + 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320, + 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6, + 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd, + 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645, + 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484, + 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a, + 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9, + 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6, + 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132, + 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491, + 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13, + 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21, + 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699, + 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778, + 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736, + 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2, + 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f, + 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2, + 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090, + 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e, + 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33, + 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052, + 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf, + 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774, + 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666, + 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1, + 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47, + 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f, + 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09, + 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c, + 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 
0x3fef27f12e57d14b, + 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db, + 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa, + 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968, + 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487, + 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075, + 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460, + 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17, + 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6, + 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740, + 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1, + 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a, + 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540, + 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89, + 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1, + 0x3feff9d96b2a23d9, +# endif +}; diff --git a/math/aarch64/v_expf.c b/math/aarch64/v_expf.c new file mode 100644 index 0000000000000000000000000000000000000000..34e8b6081bcd947effb06be781b8fba6bd95bbba --- /dev/null +++ b/math/aarch64/v_expf.c @@ -0,0 +1,122 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[5]; + float32x4_t shift, inv_ln2, ln2_hi, ln2_lo; + uint32x4_t exponent_bias; +#if !WANT_SIMD_EXCEPT + float32x4_t special_bound, scale_thresh; +#endif +} data = { + /* maxerr: 1.45358 +0.5 ulp. */ + .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), + V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, + .shift = V4 (0x1.8p23f), + .inv_ln2 = V4 (0x1.715476p+0f), + .ln2_hi = V4 (0x1.62e4p-1f), + .ln2_lo = V4 (0x1.7f7d1cp-20f), + .exponent_bias = V4 (0x3f800000), +#if !WANT_SIMD_EXCEPT + .special_bound = V4 (126.0f), + .scale_thresh = V4 (192.0f), +#endif +}; + +#define C(i) d->poly[i] + +#if WANT_SIMD_EXCEPT + +# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ +# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */ +# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */ + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine to special lanes. */ + return v_call_f32 (expf, x, y, cmp); +} + +#else + +# define SpecialOffset v_u32 (0x82000000) +# define SpecialBias v_u32 (0x7f000000) + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); + float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); + /* Similar to r1 but avoids double rounding in the subnormal range. 
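+     (With scale subnormal, r0 = fma (scale, poly, scale)
+     = scale * (1 + poly) rounds once, directly into the subnormal result;
+     the s1 * s2 path would round the intermediate product first and then
+     again on the final multiply.)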
*/ + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); + float32x4_t r = vbslq_f32 (cmp1, r1, r0); + return vbslq_f32 (cmp2, r2, r); +} + +#endif + +float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, scale, p, q, poly, z; + uint32x4_t cmp, e; + +#if WANT_SIMD_EXCEPT + /* asuint(x) - TinyBound >= BigBound - TinyBound. */ + cmp = vcgeq_u32 ( + vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)), + TinyBound), + SpecialBound); + float32x4_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special case handler to fix special lanes later. This is only necessary if + fenv exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = vbslq_f32 (cmp, v_f32 (1), x); +#endif + + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + z = vfmaq_f32 (d->shift, x, d->inv_ln2); + n = vsubq_f32 (z, d->shift); + r = vfmsq_f32 (x, n, d->ln2_hi); + r = vfmsq_f32 (r, n, d->ln2_lo); + e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); + scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + +#if !WANT_SIMD_EXCEPT + cmp = vcagtq_f32 (n, d->special_bound); +#endif + + r2 = vmulq_f32 (r, r); + p = vfmaq_f32 (C (1), C (0), r); + q = vfmaq_f32 (C (3), C (2), r); + q = vfmaq_f32 (q, p, r2); + p = vmulq_f32 (C (4), r); + poly = vfmaq_f32 (p, q, r2); + + if (unlikely (v_any_u32 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); +#else + return special_case (poly, n, e, cmp, scale, d); +#endif + + return vfmaq_f32 (scale, poly, scale); +} diff --git a/math/v_expf_1u.c b/math/aarch64/v_expf_1u.c similarity index 39% rename from math/v_expf_1u.c rename to math/aarch64/v_expf_1u.c index 023bd248c9ac9c89e88a9979d0d1a24197550f79..43d03fa34efab42e2ac666dd6c784c02b8fdf6ed 100644 --- a/math/v_expf_1u.c +++ b/math/aarch64/v_expf_1u.c @@ -1,13 +1,12 @@ /* * Single-precision vector e^x function. * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" -#if V_SUPPORTED static const float Poly[] = { /* maxerr: 0.36565 +0.5 ulp. */ @@ -28,53 +27,51 @@ static const float Poly[] = { #define Ln2hi v_f32 (0x1.62e4p-1f) #define Ln2lo v_f32 (0x1.7f7d1cp-20f) -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn) +static float32x4_t VPCS_ATTR NOINLINE +specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn) { /* 2^n may overflow, break it up into s1*s2. 
*/ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f)); - v_f32_t r1 = s1 * s1; - v_f32_t r0 = poly * s1 * s2; - return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0))); + uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000); + float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b); + float32x4_t s2 = vreinterpretq_f32_u32 (e - b); + uint32x4_t cmp = absn > v_f32 (192.0f); + float32x4_t r1 = s1 * s1; + float32x4_t r0 = poly * s1 * s2; + return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) + | (~cmp & vreinterpretq_u32_f32 (r0))); } -VPCS_ATTR -v_f32_t -V_NAME(expf_1u) (v_f32_t x) +float32x4_t VPCS_ATTR +_ZGVnN4v_expf_1u (float32x4_t x) { - v_f32_t n, r, scale, poly, absn, z; - v_u32_t cmp, e; + float32x4_t n, r, scale, poly, absn, z; + uint32x4_t cmp, e; /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ #if 1 - z = v_fma_f32 (x, InvLn2, Shift); + z = vfmaq_f32 (Shift, x, InvLn2); n = z - Shift; - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_f32 (z) << 23; + r = vfmaq_f32 (x, n, -Ln2hi); + r = vfmaq_f32 (r, n, -Ln2lo); + e = vreinterpretq_u32_f32 (z) << 23; #else z = x * InvLn2; - n = v_round_f32 (z); - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_s32 (v_round_s32 (z)) << 23; + n = vrndaq_f32 (z); + r = vfmaq_f32 (x, n, -Ln2hi); + r = vfmaq_f32 (r, n, -Ln2lo); + e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)) << 23; #endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); - poly = v_fma_f32 (C0, r, C1); - poly = v_fma_f32 (poly, r, C2); - poly = v_fma_f32 (poly, r, C3); - poly = v_fma_f32 (poly, r, C4); - poly = v_fma_f32 (poly, r, v_f32 (1.0f)); - poly = v_fma_f32 (poly, r, v_f32 (1.0f)); + scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000)); + absn = vabsq_f32 (n); + cmp = absn > v_f32 (126.0f); + poly = vfmaq_f32 (C1, C0, r); + poly = vfmaq_f32 (C2, poly, r); + poly = vfmaq_f32 (C3, poly, r); + poly = vfmaq_f32 (C4, poly, r); + poly = vfmaq_f32 (v_f32 (1.0f), poly, r); + poly = vfmaq_f32 (v_f32 (1.0f), poly, r); if (unlikely (v_any_u32 (cmp))) return specialcase (poly, n, e, absn); return scale * poly; } -#endif diff --git a/math/aarch64/v_log.c b/math/aarch64/v_log.c new file mode 100644 index 0000000000000000000000000000000000000000..1d1c1fa62c0423da2c6c402113da471af2df7540 --- /dev/null +++ b/math/aarch64/v_log.c @@ -0,0 +1,100 @@ +/* + * Double-precision vector log(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + uint64x2_t min_norm; + uint32x4_t special_bound; + float64x2_t poly[5]; + float64x2_t ln2; + uint64x2_t sign_exp_mask; +} data = { + /* Worst-case error: 1.17 + 0.5 ulp. + Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ + .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2), + V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3), + V2 (-0x1.554e550bd501ep-3) }, + .ln2 = V2 (0x1.62e42fefa39efp-1), + .min_norm = V2 (0x0010000000000000), + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. 
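+     Subtracting min_norm from asuint64 (x) wraps zero, subnormal and
+     negative inputs above this bound, so a single unsigned compare flags
+     tiny, huge, negative and non-finite lanes at once; vsubhn narrows the
+     64-bit difference to its top 32 bits so the compare can be done on
+     32-bit lanes.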
*/
+  .sign_exp_mask = V2 (0xfff0000000000000)
+};
+
+#define A(i) d->poly[i]
+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+#define Off v_u64 (0x3fe6900900000000)
+
+struct entry
+{
+  float64x2_t invc;
+  float64x2_t logc;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+  /* Since N is a power of 2, n % N = n & (N - 1).  */
+  struct entry e;
+  uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+  uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+  float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
+  float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+  e.invc = vuzp1q_f64 (e0, e1);
+  e.logc = vuzp2q_f64 (e0, e1);
+  return e;
+}
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2,
+	      uint32x2_t cmp)
+{
+  return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp));
+}
+
+float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  float64x2_t z, r, r2, p, y, kd, hi;
+  uint64x2_t ix, iz, tmp;
+  uint32x2_t cmp;
+  int64x2_t k;
+  struct entry e;
+
+  ix = vreinterpretq_u64_f64 (x);
+  cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
+		  vget_low_u32 (d->special_bound));
+
+  /* x = 2^k z, where z is in the range [Off, 2*Off) and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  tmp = vsubq_u64 (ix, Off);
+  k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift.  */
+  iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
+  z = vreinterpretq_f64_u64 (iz);
+  e = lookup (tmp);
+
+  /* log(x) = log1p(z/c-1) + log(c) + k*Ln2.  */
+  r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+  kd = vcvtq_f64_s64 (k);
+
+  /* hi = r + log(c) + k*Ln2.  */
+  hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
+  /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi.  */
+  r2 = vmulq_f64 (r, r);
+  y = vfmaq_f64 (A (2), A (3), r);
+  p = vfmaq_f64 (A (0), A (1), r);
+  y = vfmaq_f64 (y, A (4), r2);
+  y = vfmaq_f64 (p, y, r2);
+
+  if (unlikely (v_any_u32h (cmp)))
+    return special_case (x, y, hi, r2, cmp);
+  return vfmaq_f64 (hi, y, r2);
+}
diff --git a/math/aarch64/v_log_data.c b/math/aarch64/v_log_data.c
new file mode 100644
index 0000000000000000000000000000000000000000..82351bb14766f2fbf6095cbf2e214e99b45f217d
--- /dev/null
+++ b/math/aarch64/v_log_data.c
@@ -0,0 +1,156 @@
+/*
+ * Lookup table for double-precision log(x) vector function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+
+#define N (1 << V_LOG_TABLE_BITS)
+
+const struct v_log_data __v_log_data = {
+  /* Algorithm:
+
+	x = 2^k z
+	log(x) = k ln2 + log(c) + poly(z/c - 1)
+
+     where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,
+     N=128); log(c) and 1/c for the ith subinterval come from lookup tables:
+
+	table[i].invc = 1/c
+	table[i].logc = (double)log(c)
+
+     where c is near the center of the subinterval and is chosen by trying
+     several floating-point invc candidates around 1/center and selecting one
+     for which the error in (double)log(c) is minimized (< 0x1p-74), except
+     that the subinterval containing 1 and the one before it are tweaked to
+     avoid cancellation.
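+     Concretely: writing z = c (1 + r) with r = z * invc - 1,
+     log(x) = k ln2 + log(c) + log1p(r); with N = 128 subintervals and c near
+     each center, |r| stays below roughly 0x1p-8 (compare the rel-error
+     interval quoted in v_log.c), which is what lets a short log1p polynomial
+     in r reach the quoted worst-case error.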
*/ + .table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 }, + { 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 }, + { 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 }, + { 0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2 }, + { 0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2 }, + { 0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2 }, + { 0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2 }, + { 0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2 }, + { 0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2 }, + { 0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2 }, + { 0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2 }, + { 0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2 }, + { 0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2 }, + { 0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2 }, + { 0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2 }, + { 0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2 }, + { 0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2 }, + { 0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2 }, + { 0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2 }, + { 0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3 }, + { 0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3 }, + { 0x1.446f12b278001p+0, -0x1.e52e160484698p-3 }, + { 0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3 }, + { 0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3 }, + { 0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3 }, + { 0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3 }, + { 0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3 }, + { 0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3 }, + { 0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3 }, + { 0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3 }, + { 0x1.36987540fbf53p+0, -0x1.8be843d796044p-3 }, + { 0x1.352166b648f61p+0, -0x1.82395ecc477edp-3 }, + { 0x1.33adddb3eb575p+0, -0x1.7896240966422p-3 }, + { 0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3 }, + { 0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3 }, + { 0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3 }, + { 0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3 }, + { 0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3 }, + { 0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3 }, + { 0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3 }, + { 0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3 }, + { 0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3 }, + { 0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3 }, + { 0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3 }, + { 0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3 }, + { 0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4 }, + { 0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4 }, + { 0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4 }, + { 0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4 }, + { 0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4 }, + { 0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4 }, + { 0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4 }, + { 0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4 }, + { 0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4 }, + { 0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4 }, + { 0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4 }, + { 0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4 }, + { 0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4 }, + { 0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4 }, + { 0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4 }, + { 0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5 }, + { 0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5 }, + { 0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5 }, + { 0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5 }, + { 0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5 }, + { 0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5 }, + { 0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5 }, + { 0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5 }, + { 0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6 }, + { 
0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6 }, + { 0x1.05193497a7cc5p+0, -0x1.43183683400acp-6 }, + { 0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6 }, + { 0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7 }, + { 0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7 }, + { 0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9 }, + { 1.0, 0.0 }, + { 0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8 }, + { 0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7 }, + { 0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6 }, + { 0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6 }, + { 0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5 }, + { 0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5 }, + { 0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5 }, + { 0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5 }, + { 0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4 }, + { 0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4 }, + { 0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4 }, + { 0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4 }, + { 0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4 }, + { 0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4 }, + { 0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4 }, + { 0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4 }, + { 0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4 }, + { 0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3 }, + { 0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3 }, + { 0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3 }, + { 0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3 }, + { 0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3 }, + { 0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3 }, + { 0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3 }, + { 0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3 }, + { 0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3 }, + { 0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3 }, + { 0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3 }, + { 0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3 }, + { 0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3 }, + { 0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3 }, + { 0x1.9998e1480b618p-1, 0x1.c903161240163p-3 }, + { 0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3 }, + { 0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3 }, + { 0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3 }, + { 0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3 }, + { 0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2 }, + { 0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2 }, + { 0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2 }, + { 0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2 }, + { 0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2 }, + { 0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2 }, + { 0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2 }, + { 0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2 }, + { 0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2 }, + { 0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2 }, + { 0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2 }, + { 0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2 }, + { 0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2 }, + { 0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2 }, + { 0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2 }, + { 0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2 } } +}; diff --git a/math/aarch64/v_logf.c b/math/aarch64/v_logf.c new file mode 100644 index 0000000000000000000000000000000000000000..66ebbbcd2b5a840b8a194cb18139ee585f67208a --- /dev/null +++ b/math/aarch64/v_logf.c @@ -0,0 +1,74 @@ +/* + * Single-precision vector log function. + * + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+  uint32x4_t min_norm;
+  uint16x8_t special_bound;
+  float32x4_t poly[7];
+  float32x4_t ln2, tiny_bound;
+  uint32x4_t off, mantissa_mask;
+} data = {
+  /* 3.34 ulp error. */
+  .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
+            V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f),
+            V4 (-0x1.ffffc8p-2f) },
+  .ln2 = V4 (0x1.62e43p-1f),
+  .tiny_bound = V4 (0x1p-126),
+  .min_norm = V4 (0x00800000),
+  .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
+  .off = V4 (0x3f2aaaab), /* 0.666667. */
+  .mantissa_mask = V4 (0x007fffff)
+};
+
+#define P(i) d->poly[7 - i]
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p,
+              uint16x4_t cmp)
+{
+  /* Fall back to scalar code. */
+  return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
+}
+
+float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  float32x4_t n, p, q, r, r2, y;
+  uint32x4_t u;
+  uint16x4_t cmp;
+
+  u = vreinterpretq_u32_f32 (x);
+  cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm),
+                  vget_low_u16 (d->special_bound));
+
+  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
+  u = vsubq_u32 (u, d->off);
+  n = vcvtq_f32_s32 (
+      vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
+  u = vandq_u32 (u, d->mantissa_mask);
+  u = vaddq_u32 (u, d->off);
+  r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+
+  /* y = log(1+r) + n*ln2. */
+  r2 = vmulq_f32 (r, r);
+  /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
+  p = vfmaq_f32 (P (5), P (6), r);
+  q = vfmaq_f32 (P (3), P (4), r);
+  y = vfmaq_f32 (P (1), P (2), r);
+  p = vfmaq_f32 (p, P (7), r2);
+  q = vfmaq_f32 (q, p, r2);
+  y = vfmaq_f32 (y, q, r2);
+  p = vfmaq_f32 (r, d->ln2, n);
+
+  if (unlikely (v_any_u16h (cmp)))
+    return special_case (x, y, r2, p, cmp);
+  return vfmaq_f32 (p, y, r2);
+}
diff --git a/math/aarch64/v_math.h b/math/aarch64/v_math.h
new file mode 100644
index 0000000000000000000000000000000000000000..1dc9916c6fb076fd0c3d5074f5d156d2d952b4f2
--- /dev/null
+++ b/math/aarch64/v_math.h
@@ -0,0 +1,135 @@
+/*
+ * Vector math abstractions.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _V_MATH_H
+#define _V_MATH_H
+
+#if !__aarch64__
+# error "Cannot build without AArch64"
+#endif
+
+#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
+
+#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
+#define V_NAME_D1(fun) _ZGVnN2v_##fun
+#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
+#define V_NAME_D2(fun) _ZGVnN2vv_##fun
+
+#include <stdint.h>
+#include "../math_config.h"
+#include <arm_neon.h>
+
+/* Shorthand helpers for declaring constants. */
+# define V2(X) { X, X }
+# define V4(X) { X, X, X, X }
+# define V8(X) { X, X, X, X, X, X, X, X }
+
+static inline int
+v_any_u16h (uint16x4_t x)
+{
+  return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0;
+}
+
+static inline int
+v_lanes32 (void)
+{
+  return 4;
+}
+
+static inline float32x4_t
+v_f32 (float x)
+{
+  return (float32x4_t) V4 (x);
+}
+static inline uint32x4_t
+v_u32 (uint32_t x)
+{
+  return (uint32x4_t) V4 (x);
+}
+/* true if any elements of a v_cond result is non-zero. */
+static inline int
+v_any_u32 (uint32x4_t x)
+{
+  /* assume elements in x are either 0 or -1u.
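+     vpaddd_u64 below sums the two 64-bit halves into a scalar, so the
+     result is non-zero exactly when at least one lane is set.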
*/ + return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; +} +static inline int +v_any_u32h (uint32x2_t x) +{ + return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0; +} +static inline float32x4_t +v_lookup_f32 (const float *tab, uint32x4_t idx) +{ + return (float32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; +} +static inline uint32x4_t +v_lookup_u32 (const uint32_t *tab, uint32x4_t idx) +{ + return (uint32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; +} +static inline float32x4_t +v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p) +{ + return (float32x4_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], + p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; +} +static inline float32x4_t +v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2, + float32x4_t y, uint32x4_t p) +{ + return (float32x4_t){p[0] ? f (x1[0], x2[0]) : y[0], + p[1] ? f (x1[1], x2[1]) : y[1], + p[2] ? f (x1[2], x2[2]) : y[2], + p[3] ? f (x1[3], x2[3]) : y[3]}; +} + +static inline int +v_lanes64 (void) +{ + return 2; +} +static inline float64x2_t +v_f64 (double x) +{ + return (float64x2_t) V2 (x); +} +static inline uint64x2_t +v_u64 (uint64_t x) +{ + return (uint64x2_t) V2 (x); +} +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u64 (uint64x2_t x) +{ + /* assume elements in x are either 0 or -1u. */ + return vpaddd_u64 (x) != 0; +} +static inline float64x2_t +v_lookup_f64 (const double *tab, uint64x2_t idx) +{ + return (float64x2_t){tab[idx[0]], tab[idx[1]]}; +} +static inline uint64x2_t +v_lookup_u64 (const uint64_t *tab, uint64x2_t idx) +{ + return (uint64x2_t){tab[idx[0]], tab[idx[1]]}; +} +static inline float64x2_t +v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p) +{ + double p1 = p[1]; + double x1 = x[1]; + if (likely (p[0])) + y[0] = f (x[0]); + if (likely (p1)) + y[1] = f (x1); + return y; +} + +#endif diff --git a/math/v_pow.c b/math/aarch64/v_pow.c similarity index 35% rename from math/v_pow.c rename to math/aarch64/v_pow.c index a209d57f41cee70ac78bc4f418c385f481636025..734f1663a283d4ce068efc2526d0dd989ba5433b 100644 --- a/math/v_pow.c +++ b/math/aarch64/v_pow.c @@ -1,27 +1,22 @@ /* * Double-precision vector pow function. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" -#if V_SUPPORTED -VPCS_ATTR -v_f64_t -V_NAME(pow) (v_f64_t x, v_f64_t y) +float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) { - v_f64_t z; + float64x2_t z; for (int lane = 0; lane < v_lanes64 (); lane++) { - f64_t sx = v_get_f64 (x, lane); - f64_t sy = v_get_f64 (y, lane); - f64_t sz = pow (sx, sy); - v_set_f64 (&z, lane, sz); + double sx = x[lane]; + double sy = y[lane]; + double sz = pow (sx, sy); + z[lane] = sz; } return z; } -VPCS_ALIAS -#endif diff --git a/math/aarch64/v_powf.c b/math/aarch64/v_powf.c new file mode 100644 index 0000000000000000000000000000000000000000..3a4163ab05582b387e87245bd4de77e9b93f9ac1 --- /dev/null +++ b/math/aarch64/v_powf.c @@ -0,0 +1,148 @@ +/* + * Single-precision vector powf function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +#define Min v_u32 (0x00800000) +#define Max v_u32 (0x7f800000) +#define Thresh v_u32 (0x7f000000) /* Max - Min. 
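+   A single unsigned compare of (asuint (x) - Min) against this
+   threshold flags zero, subnormal, negative, inf and nan lanes at once.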
*/ +#define MantissaMask v_u32 (0x007fffff) + +#define A data.log2_poly +#define C data.exp2f_poly + +/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). */ +#define Off v_u32 (0x3f35d000) + +#define V_POWF_LOG2_TABLE_BITS 5 +#define V_EXP2F_TABLE_BITS 5 +#define Log2IdxMask v_u32 ((1 << V_POWF_LOG2_TABLE_BITS) - 1) +#define Scale ((double) (1 << V_EXP2F_TABLE_BITS)) + +static const struct +{ + struct + { + double invc, logc; + } log2_tab[1 << V_POWF_LOG2_TABLE_BITS]; + double log2_poly[4]; + uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS]; + double exp2f_poly[3]; +} data = { + .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale}, + {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale}, + {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale}, + {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale}, + {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale}, + {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale}, + {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale}, + {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale}, + {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale}, + {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale}, + {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale}, + {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale}, + {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale}, + {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale}, + {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale}, + {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale}, + {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale}, + {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale}, + {0x1p+0, 0x0p+0 * Scale}, + {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale}, + {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale}, + {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale}, + {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale}, + {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale}, + {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale}, + {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale}, + {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale}, + {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale}, + {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale}, + {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale}, + {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale}, + {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},}, + .log2_poly = { /* rel err: 1.5 * 2^-30. */ + -0x1.6ff5daa3b3d7cp-2 * Scale, 0x1.ec81d03c01aebp-2 * Scale, + -0x1.71547bb43f101p-1 * Scale, 0x1.7154764a815cbp0 * Scale,}, + .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, + 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa, + 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715, + 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, + 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, + 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, + 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db, + 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, + 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, + 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, + 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,}, + .exp2f_poly = { /* rel err: 1.69 * 2^-34. 
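+   Coefficients are stored pre-divided by Scale^3, Scale^2 and Scale so
+   the polynomial can be evaluated directly on the scaled remainder
+   r = ylogx - round (ylogx) without rescaling it first.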
*/ + 0x1.c6af84b912394p-5 / Scale / Scale / Scale, + 0x1.ebfce50fac4f3p-3 / Scale / Scale, + 0x1.62e42ff0c52d6p-1 / Scale}}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp) +{ + return v_call2_f32 (powf, x, y, ret, cmp); +} + +float32x4_t VPCS_ATTR V_NAME_F2 (pow) (float32x4_t x, float32x4_t y) +{ + uint32x4_t u = vreinterpretq_u32_f32 (x); + uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh); + uint32x4_t tmp = vsubq_u32 (u, Off); + uint32x4_t i = vandq_u32 (vshrq_n_u32 (tmp, (23 - V_POWF_LOG2_TABLE_BITS)), + Log2IdxMask); + uint32x4_t top = vbicq_u32 (tmp, MantissaMask); + uint32x4_t iz = vsubq_u32 (u, top); + int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top), + 23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */ + + float32x4_t ret; + for (int lane = 0; lane < 4; lane++) + { + /* Use double precision for each lane. */ + double invc = data.log2_tab[i[lane]].invc; + double logc = data.log2_tab[i[lane]].logc; + double z = (double) asfloat (iz[lane]); + + /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */ + double r = __builtin_fma (z, invc, -1.0); + double y0 = logc + (double) k[lane]; + + /* Polynomial to approximate log1p(r)/ln2. */ + double logx = A[0]; + logx = r * logx + A[1]; + logx = r * logx + A[2]; + logx = r * logx + A[3]; + logx = r * logx + y0; + double ylogx = y[lane] * logx; + cmp[lane] = (asuint64 (ylogx) >> 47 & 0xffff) + >= asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS)) >> 47 + ? 1 + : cmp[lane]; + + /* N*x = k + r with r in [-1/2, 1/2]. */ + double kd = round (ylogx); + uint64_t ki = lround (ylogx); + r = ylogx - kd; + + /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */ + uint64_t t = data.exp2f_tab[ki % (1 << V_EXP2F_TABLE_BITS)]; + t += ki << (52 - V_EXP2F_TABLE_BITS); + double s = asdouble (t); + double p = C[0]; + p = __builtin_fma (p, r, C[1]); + p = __builtin_fma (p, r, C[2]); + p = __builtin_fma (p, s * r, s); + + ret[lane] = p; + } + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, ret, cmp); + return ret; +} diff --git a/math/aarch64/v_sin.c b/math/aarch64/v_sin.c new file mode 100644 index 0000000000000000000000000000000000000000..04129c31133d62dcecedf832b4e410b5217a51a2 --- /dev/null +++ b/math/aarch64/v_sin.c @@ -0,0 +1,97 @@ +/* + * Double-precision vector sin function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float64x2_t poly[7]; + float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; +} data = { + .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), + V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), + V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), + V2 (-0x1.9e9540300a1p-41) }, + + .range_val = V2 (0x1p23), + .inv_pi = V2 (0x1.45f306dc9c883p-2), + .pi_1 = V2 (0x1.921fb54442d18p+1), + .pi_2 = V2 (0x1.1a62633145c06p-53), + .pi_3 = V2 (0x1.c1cd129024e09p-106), + .shift = V2 (0x1.8p52), +}; + +#if WANT_SIMD_EXCEPT +# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */ +# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */ +#endif + +#define C(i) d->poly[i] + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) +{ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); + return v_call_f64 (sin, x, y, cmp); +} + +/* Vector (AdvSIMD) sin approximation. 
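+   The argument is reduced to r = |x| - n*pi with n = rint(|x|/pi); the
+   polynomial then evaluates sin(r), and the sign is flipped via the
+   parity of n.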
+ Maximum observed error in [-pi/2, pi/2], where argument is not reduced, + is 2.87 ULP: + _ZGVnN2v_sin (0x1.921d5c6a07142p+0) got 0x1.fffffffa7dc02p-1 + want 0x1.fffffffa7dc05p-1 + Maximum observed error in the entire non-special domain ([-2^23, 2^23]) + is 3.22 ULP: + _ZGVnN2v_sin (0x1.5702447b6f17bp+22) got 0x1.ffdcd125c84fbp-3 + want 0x1.ffdcd125c84f8p-3. */ +float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t n, r, r2, r3, r4, y, t1, t2, t3; + uint64x2_t odd, cmp; + +#if WANT_SIMD_EXCEPT + /* Detect |x| <= TinyBound or |x| >= RangeVal. If fenv exceptions are to be + triggered correctly, set any special lanes to 1 (which is neutral w.r.t. + fenv). These lanes will be fixed by special-case handler later. */ + uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x)); + cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh); + r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x); +#else + r = x; + cmp = vcageq_f64 (x, d->range_val); +#endif + + /* n = rint(|x|/pi). */ + n = vfmaq_f64 (d->shift, d->inv_pi, r); + odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); + n = vsubq_f64 (n, d->shift); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f64 (r, d->pi_1, n); + r = vfmsq_f64 (r, d->pi_2, n); + r = vfmsq_f64 (r, d->pi_3, n); + + /* sin(r) poly approx. */ + r2 = vmulq_f64 (r, r); + r3 = vmulq_f64 (r2, r); + r4 = vmulq_f64 (r2, r2); + + t1 = vfmaq_f64 (C (4), C (5), r2); + t2 = vfmaq_f64 (C (2), C (3), r2); + t3 = vfmaq_f64 (C (0), C (1), r2); + + y = vfmaq_f64 (t1, C (6), r4); + y = vfmaq_f64 (t2, y, r4); + y = vfmaq_f64 (t3, y, r4); + y = vfmaq_f64 (r, y, r3); + + if (unlikely (v_any_u64 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); +} diff --git a/math/aarch64/v_sinf.c b/math/aarch64/v_sinf.c new file mode 100644 index 0000000000000000000000000000000000000000..336879844459f70accf8f2532407db6fc6810e69 --- /dev/null +++ b/math/aarch64/v_sinf.c @@ -0,0 +1,82 @@ +/* + * Single-precision vector sin function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[4]; + float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; +} data = { + /* 1.886 ulp error. */ + .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), + V4 (0x1.5b2e76p-19f) }, + + .pi_1 = V4 (0x1.921fb6p+1f), + .pi_2 = V4 (-0x1.777a5cp-24f), + .pi_3 = V4 (-0x1.ee59dap-49f), + + .inv_pi = V4 (0x1.45f306p-2f), + .shift = V4 (0x1.8p+23f), + .range_val = V4 (0x1p20f) +}; + +#if WANT_SIMD_EXCEPT +# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */ +# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */ +#endif + +#define C(i) d->poly[i] + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) +{ + /* Fall back to scalar code. 
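+     The parity bit of n is folded back into the sign of y before any
+     special lane is recomputed with scalar sinf.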
*/ + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); + return v_call_f32 (sinf, x, y, cmp); +} + +float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, y; + uint32x4_t odd, cmp; + +#if WANT_SIMD_EXCEPT + uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x)); + cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh); + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + special-case handler later. */ + r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x); +#else + r = x; + cmp = vcageq_f32 (x, d->range_val); +#endif + + /* n = rint(|x|/pi) */ + n = vfmaq_f32 (d->shift, d->inv_pi, r); + odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); + n = vsubq_f32 (n, d->shift); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ + r = vfmsq_f32 (r, d->pi_1, n); + r = vfmsq_f32 (r, d->pi_2, n); + r = vfmsq_f32 (r, d->pi_3, n); + + /* y = sin(r) */ + r2 = vmulq_f32 (r, r); + y = vfmaq_f32 (C (2), C (3), r2); + y = vfmaq_f32 (C (1), y, r2); + y = vfmaq_f32 (C (0), y, r2); + y = vfmaq_f32 (r, vmulq_f32 (y, r2), r); + + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); +} diff --git a/math/cosf.c b/math/cosf.c index f29f19474e230327f439da21eb0661e53bfaa1fe..6293ce8f1b7d6bc0d0a515bb07339b1e364a0c27 100644 --- a/math/cosf.c +++ b/math/cosf.c @@ -1,8 +1,8 @@ /* * Single-precision cos function. * - * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -22,7 +22,7 @@ cosf (float y) int n; const sincos_t *p = &__sincosf_table[0]; - if (abstop12 (y) < abstop12 (pio4)) + if (abstop12 (y) < abstop12 (pio4f)) { double x2 = x * x; diff --git a/math/erf.c b/math/erf.c index 12d7e5160df702ab10ff1ae5da5604c927e54372..5f9f40dda26434e314e4d141d84868b2d3b9c1f6 100644 --- a/math/erf.c +++ b/math/erf.c @@ -2,7 +2,7 @@ * Double-precision erf(x) function. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/erf_data.c b/math/erf_data.c index 807875bdd7f5db86ad3557c9c36c7afd93c07ca0..10cf1fae93e078c2636409318f91078931c443bf 100644 --- a/math/erf_data.c +++ b/math/erf_data.c @@ -2,7 +2,7 @@ * Shared data between erf and erfc. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/erff.c b/math/erff.c index a58e82565dc34745500197c469d7f2ea9ec1f71b..9fa476dbbab2d72299486163eaeb7f5676a7b040 100644 --- a/math/erff.c +++ b/math/erff.c @@ -2,7 +2,7 @@ * Single-precision erf(x) function. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/erff_data.c b/math/erff_data.c index fa6b1ef4dedbfe7bafe493aa7c0dc007174fe704..f822788d0dd8068b17dc84ac3204349e21b4f34d 100644 --- a/math/erff_data.c +++ b/math/erff_data.c @@ -2,7 +2,7 @@ * Data for approximation of erff. * * Copyright (c) 2019-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/exp.c b/math/exp.c index 7f5024cd8792144fe2681f1a60e297d405b9ea06..1de500c31f3ed08468b4e712fd3f7ea28e8a137e 100644 --- a/math/exp.c +++ b/math/exp.c @@ -2,7 +2,7 @@ * Double-precision e^x function. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/exp10.c b/math/exp10.c new file mode 100644 index 0000000000000000000000000000000000000000..0fbec4c694ca831797d96968fc881a87aaf93644 --- /dev/null +++ b/math/exp10.c @@ -0,0 +1,129 @@ +/* + * Double-precision 10^x function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << EXP_TABLE_BITS) +#define IndexMask (N - 1) +#define OFlowBound 0x1.34413509f79ffp8 /* log10(DBL_MAX). */ +#define UFlowBound -0x1.5ep+8 /* -350. */ +#define SmallTop 0x3c6 /* top12(0x1p-57). */ +#define BigTop 0x407 /* top12(0x1p8). */ +#define Thresh 0x41 /* BigTop - SmallTop. */ +#define Shift __exp_data.shift +#define C(i) __exp_data.exp10_poly[i] + +static double +special_case (uint64_t sbits, double_t tmp, uint64_t ki) +{ + double_t scale, y; + + if (ki - (1ull << 16) < 0x80000000) + { + /* The exponent of scale might have overflowed by 1. */ + sbits -= 1ull << 52; + scale = asdouble (sbits); + y = 2 * (scale + scale * tmp); + return check_oflow (eval_as_double (y)); + } + + /* n < 0, need special care in the subnormal range. */ + sbits += 1022ull << 52; + scale = asdouble (sbits); + y = scale + scale * tmp; + + if (y < 1.0) + { + /* Round y to the right precision before scaling it into the subnormal + range to avoid double rounding that can cause 0.5+E/2 ulp error where + E is the worst-case ulp error outside the subnormal range. So this + is only useful if the goal is better than 1 ulp worst-case error. */ + double_t lo = scale - y + scale * tmp; + double_t hi = 1.0 + y; + lo = 1.0 - hi + y + lo; + y = eval_as_double (hi + lo) - 1.0; + /* Avoid -0.0 with downward rounding. */ + if (WANT_ROUNDING && y == 0.0) + y = 0.0; + /* The underflow exception needs to be signaled explicitly. */ + force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); + } + y = 0x1p-1022 * y; + + return check_uflow (y); +} + +/* Double-precision 10^x approximation. Largest observed error is ~0.513 ULP. */ +double +exp10 (double x) +{ + uint64_t ix = asuint64 (x); + uint32_t abstop = (ix >> 52) & 0x7ff; + + if (unlikely (abstop - SmallTop >= Thresh)) + { + if (abstop - SmallTop >= 0x80000000) + /* Avoid spurious underflow for tiny x. + Note: 0 is common input. */ + return x + 1; + if (abstop == 0x7ff) + return ix == asuint64 (-INFINITY) ? 0.0 : x + 1.0; + if (x >= OFlowBound) + return __math_oflow (0); + if (x < UFlowBound) + return __math_uflow (0); + + /* Large x is special-cased below. */ + abstop = 0; + } + + /* Reduce x: z = x * N / log10(2), k = round(z). */ + double_t z = __exp_data.invlog10_2N * x; + double_t kd; + int64_t ki; +#if TOINT_INTRINSICS + kd = roundtoint (z); + ki = converttoint (z); +#else + kd = eval_as_double (z + Shift); + kd -= Shift; + ki = kd; +#endif + + /* r = x - k * log10(2), r in [-0.5, 0.5]. */ + double_t r = x; + r = __exp_data.neglog10_2hiN * kd + r; + r = __exp_data.neglog10_2loN * kd + r; + + /* exp10(x) = 2^(k/N) * 2^(r/N). + Approximate the two components separately. 
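+
+     This follows from 10^x = 2^(x*log2(10)): the reduction above picks
+     k = round(x*N*log2(10)), so the table supplies 2^(k/N) and the
+     polynomial only has to cover the small remainder.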
*/ + + /* s = 2^(k/N), using lookup table. */ + uint64_t e = ki << (52 - EXP_TABLE_BITS); + uint64_t i = (ki & IndexMask) * 2; + uint64_t u = __exp_data.tab[i + 1]; + uint64_t sbits = u + e; + + double_t tail = asdouble (__exp_data.tab[i]); + + /* 2^(r/N) ~= 1 + r * Poly(r). */ + double_t r2 = r * r; + double_t p = C (0) + r * C (1); + double_t y = C (2) + r * C (3); + y = y + r2 * C (4); + y = p + r2 * y; + y = tail + y * r; + + if (unlikely (abstop == 0)) + return special_case (sbits, y, ki); + + /* Assemble components: + y = 2^(r/N) * 2^(k/N) + ~= (y + 1) * s. */ + double_t s = asdouble (sbits); + return eval_as_double (s * y + s); +} diff --git a/math/exp2.c b/math/exp2.c index 35ab39f22ed5fcb0442c2fb84eea80ff95540fe2..a1eee44f1f4828b7fb9f133227e8b0e808f83788 100644 --- a/math/exp2.c +++ b/math/exp2.c @@ -2,7 +2,7 @@ * Double-precision 2^x function. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/exp2f.c b/math/exp2f.c index 94b32538aa0de9c7e47ea3df9a5c60b7851bbeed..776c3ddf76636a75b24de080ac9fde62eed642d8 100644 --- a/math/exp2f.c +++ b/math/exp2f.c @@ -2,7 +2,7 @@ * Single-precision 2^x function. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/exp2f_data.c b/math/exp2f_data.c index 3fb0ad11b15a4e387b91778ea2dc31faa1903bfa..f0cb7fccacd158e0a771e3c3cb7ea4847896c149 100644 --- a/math/exp2f_data.c +++ b/math/exp2f_data.c @@ -2,7 +2,7 @@ * Shared data between expf, exp2f and powf. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/exp_data.c b/math/exp_data.c index cba76832566f04cc100bd153da745a6a57d30faf..9df4e0b1a2bb9ccbb2c21deb23787323bcfce88d 100644 --- a/math/exp_data.c +++ b/math/exp_data.c @@ -2,7 +2,7 @@ * Shared data between exp, exp2 and pow. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" @@ -12,6 +12,7 @@ const struct exp_data __exp_data = { // N/ln2 .invln2N = 0x1.71547652b82fep0 * N, +.invlog10_2N = 0x1.a934f0979a371p1 * N, // -ln2/N #if N == 64 .negln2hiN = -0x1.62e42fefa0000p-7, @@ -26,6 +27,8 @@ const struct exp_data __exp_data = { .negln2hiN = -0x1.62e42fef80000p-10, .negln2loN = -0x1.1cf79abc9e3b4p-45, #endif +.neglog10_2hiN = -0x1.3441350ap-2 / N, +.neglog10_2loN = 0x1.0c0219dc1da99p-39 / N, // Used for rounding when !TOINT_INTRINSICS #if EXP_USE_TOINT_NARROW .shift = 0x1800000000.8p0, @@ -147,6 +150,24 @@ const struct exp_data __exp_data = { 0x1.3b2ab786ee1dap-7, #endif }, +.exp10_poly = { +#if EXP10_POLY_WIDE +/* Range is wider if using shift-based reduction: coeffs generated + using Remez in [-log10(2)/128, log10(2)/128 ]. */ +0x1.26bb1bbb55515p1, +0x1.53524c73cd32bp1, +0x1.0470591e1a108p1, +0x1.2bd77b12fe9a8p0, +0x1.14289fef24b78p-1 +#else +/* Coeffs generated using Remez in [-log10(2)/256, log10(2)/256 ]. 
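+   (Half of one table interval: with nearest rounding the reduced
+   argument stays within log10(2)/(2N) of zero.)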
*/ +0x1.26bb1bbb55516p1, +0x1.53524c73ce9fep1, +0x1.0470591ce4b26p1, +0x1.2bd76577fe684p0, +0x1.1446eeccd0efbp-1 +#endif +}, // 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N) // tab[2*k] = asuint64(T[k]) // tab[2*k+1] = asuint64(H[k]) - (k << 52)/N diff --git a/math/expf.c b/math/expf.c index 9b2f0c3d8c56c98d8e9d37d45143b713cb92e570..08a20d59e49145ab8ae0099c1bda89ab6cad0752 100644 --- a/math/expf.c +++ b/math/expf.c @@ -2,7 +2,7 @@ * Single-precision e^x function. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/include/mathlib.h b/math/include/mathlib.h index 279d829d8ea15acae38ae51ada3fa74f3920f7f5..64cbb9c1f8506eca4fc7bf0ccf9c2991b4663b06 100644 --- a/math/include/mathlib.h +++ b/math/include/mathlib.h @@ -1,8 +1,8 @@ /* * Public API. * - * Copyright (c) 2015-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2015-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _MATHLIB_H @@ -18,74 +18,33 @@ float cosf (float); void sincosf (float, float*, float*); double exp (double); +double exp10 (double); double exp2 (double); double log (double); double log2 (double); double pow (double, double); -/* Scalar functions using the vector algorithm with identical result. */ -float __s_sinf (float); -float __s_cosf (float); -float __s_expf (float); -float __s_expf_1u (float); -float __s_exp2f (float); -float __s_exp2f_1u (float); -float __s_logf (float); -float __s_powf (float, float); -double __s_sin (double); -double __s_cos (double); -double __s_exp (double); -double __s_log (double); -double __s_pow (double, double); - #if __aarch64__ -#if __GNUC__ >= 5 +# if __GNUC__ >= 5 typedef __Float32x4_t __f32x4_t; typedef __Float64x2_t __f64x2_t; -#elif __clang_major__*100+__clang_minor__ >= 305 +# elif __clang_major__*100+__clang_minor__ >= 305 typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t; typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; -#else -#error Unsupported compiler -#endif - -/* Vector functions following the base PCS. */ -__f32x4_t __v_sinf (__f32x4_t); -__f32x4_t __v_cosf (__f32x4_t); -__f32x4_t __v_expf (__f32x4_t); -__f32x4_t __v_expf_1u (__f32x4_t); -__f32x4_t __v_exp2f (__f32x4_t); -__f32x4_t __v_exp2f_1u (__f32x4_t); -__f32x4_t __v_logf (__f32x4_t); -__f32x4_t __v_powf (__f32x4_t, __f32x4_t); -__f64x2_t __v_sin (__f64x2_t); -__f64x2_t __v_cos (__f64x2_t); -__f64x2_t __v_exp (__f64x2_t); -__f64x2_t __v_log (__f64x2_t); -__f64x2_t __v_pow (__f64x2_t, __f64x2_t); +# else +# error Unsupported compiler +# endif -#if __GNUC__ >= 9 || __clang_major__ >= 8 -#define __vpcs __attribute__((__aarch64_vector_pcs__)) - -/* Vector functions following the vector PCS. 
*/ -__vpcs __f32x4_t __vn_sinf (__f32x4_t); -__vpcs __f32x4_t __vn_cosf (__f32x4_t); -__vpcs __f32x4_t __vn_expf (__f32x4_t); -__vpcs __f32x4_t __vn_expf_1u (__f32x4_t); -__vpcs __f32x4_t __vn_exp2f (__f32x4_t); -__vpcs __f32x4_t __vn_exp2f_1u (__f32x4_t); -__vpcs __f32x4_t __vn_logf (__f32x4_t); -__vpcs __f32x4_t __vn_powf (__f32x4_t, __f32x4_t); -__vpcs __f64x2_t __vn_sin (__f64x2_t); -__vpcs __f64x2_t __vn_cos (__f64x2_t); -__vpcs __f64x2_t __vn_exp (__f64x2_t); -__vpcs __f64x2_t __vn_log (__f64x2_t); -__vpcs __f64x2_t __vn_pow (__f64x2_t, __f64x2_t); +# if __GNUC__ >= 9 || __clang_major__ >= 8 +# undef __vpcs +# define __vpcs __attribute__((__aarch64_vector_pcs__)) /* Vector functions following the vector PCS using ABI names. */ __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_expf_1u (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_exp2f_1u (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t); @@ -94,7 +53,7 @@ __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t); __vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t); -#endif +# endif #endif #endif diff --git a/math/log.c b/math/log.c index d3b7bc60747c2ace661ed1885669b1ab763e4dd2..43dfc2a744f060f8ebe9a4b25fb8da0367070d5e 100644 --- a/math/log.c +++ b/math/log.c @@ -2,7 +2,7 @@ * Double-precision log(x) function. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/log2.c b/math/log2.c index 55102b7729696324f1f2afb4cf4cd89fbd06c034..3f9c21b0396263dd8274b252ffb4b1669e03ef18 100644 --- a/math/log2.c +++ b/math/log2.c @@ -2,7 +2,7 @@ * Double-precision log2(x) function. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/log2_data.c b/math/log2_data.c index 3fc9b47c1f03868c950cac77bcc28e552fbf411a..293bd7df4118b08a69b4d0ff3bcc65917c628a73 100644 --- a/math/log2_data.c +++ b/math/log2_data.c @@ -2,7 +2,7 @@ * Data for log2. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/log2f.c b/math/log2f.c index acb629e6846cf3b94f665bca351d93098fb543a3..0a44fa2024f60639c34a1ce06a7b5d4eb77b09c6 100644 --- a/math/log2f.c +++ b/math/log2f.c @@ -2,7 +2,7 @@ * Single-precision log2 function. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/log2f_data.c b/math/log2f_data.c index f3546d730abab682f5b6e81adeb2064ef9357ba4..4866ef7f8171e67f36f16e41abfc858173326ab6 100644 --- a/math/log2f_data.c +++ b/math/log2f_data.c @@ -2,7 +2,7 @@ * Data definition for log2f. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/log_data.c b/math/log_data.c index 96a098d42c160e9d8713e565e35bf8901183528d..3ecc1f40a8228d5e13438a51ac0b615392e268fb 100644 --- a/math/log_data.c +++ b/math/log_data.c @@ -2,7 +2,7 @@ * Data for log. * * Copyright (c) 2018, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/logf.c b/math/logf.c index cfbaee12df108750f6de0ca9f8dd30be7a17ff2b..820f74c3e66a7078f78d39a326aade89251894ee 100644 --- a/math/logf.c +++ b/math/logf.c @@ -1,8 +1,8 @@ /* * Single-precision log function. * - * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2017-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -57,7 +57,7 @@ logf (float x) tmp = ix - OFF; i = (tmp >> (23 - LOGF_TABLE_BITS)) % N; k = (int32_t) tmp >> 23; /* arithmetic shift */ - iz = ix - (tmp & 0x1ff << 23); + iz = ix - (tmp & 0xff800000); invc = T[i].invc; logc = T[i].logc; z = (double_t) asfloat (iz); diff --git a/math/logf_data.c b/math/logf_data.c index e8973ce4fedcbffc2d587bf73fd2afa3917331ca..04247684755fdf65d4a834920f30dbb4fe72d89b 100644 --- a/math/logf_data.c +++ b/math/logf_data.c @@ -2,7 +2,7 @@ * Data definition for logf. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/math_config.h b/math/math_config.h index e85104337048abdfb1f51302fe7b3d33ead2b06a..394aaebc48ac8a94e4ab15b23326a2b3de4e337d 100644 --- a/math/math_config.h +++ b/math/math_config.h @@ -1,8 +1,8 @@ /* * Configuration for math routines. * - * Copyright (c) 2017-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2017-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _MATH_CONFIG_H @@ -92,6 +92,17 @@ # define unlikely(x) (x) #endif +/* Return ptr but hide its value from the compiler so accesses through it + cannot be optimized based on the contents. */ +#define ptr_barrier(ptr) \ + ({ \ + __typeof (ptr) __ptr = (ptr); \ + __asm("" : "+r"(__ptr)); \ + __ptr; \ + }) + +/* Symbol renames to avoid libc conflicts. */ + #if HAVE_FAST_ROUND /* When set, the roundtoint and converttoint functions are provided with the semantics documented below. */ @@ -381,15 +392,22 @@ extern const struct powf_log2_data #define EXP_USE_TOINT_NARROW 0 #define EXP2_POLY_ORDER 5 #define EXP2_POLY_WIDE 0 +/* Wider exp10 polynomial necessary for good precision in non-nearest rounding + and !TOINT_INTRINSICS. */ +#define EXP10_POLY_WIDE 0 extern const struct exp_data { double invln2N; + double invlog10_2N; double shift; double negln2hiN; double negln2loN; + double neglog10_2hiN; + double neglog10_2loN; double poly[4]; /* Last four coefficients. */ double exp2_shift; double exp2_poly[EXP2_POLY_ORDER]; + double exp10_poly[5]; uint64_t tab[2*(1 << EXP_TABLE_BITS)]; } __exp_data HIDDEN; @@ -459,4 +477,16 @@ extern const struct erf_data double erfc_poly_F[ERFC_POLY_F_NCOEFFS]; } __erf_data HIDDEN; +#define V_EXP_TABLE_BITS 7 +extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN; + +#define V_LOG_TABLE_BITS 7 +extern const struct v_log_data +{ + struct + { + double invc, logc; + } table[1 << V_LOG_TABLE_BITS]; +} __v_log_data HIDDEN; + #endif diff --git a/math/math_err.c b/math/math_err.c index 1bf9538a1ab1d43ee26a20b8a57d2c129685fcd7..cfe072809cf43c2dcd700798469af446e576affa 100644 --- a/math/math_err.c +++ b/math/math_err.c @@ -2,7 +2,7 @@ * Double-precision math error handling. * * Copyright (c) 2018, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/math_errf.c b/math/math_errf.c index d5350b819ab1aa4c37f61616e2d54b77027520fd..4233918b1eaeef1e597d82e3242ce302d34e572c 100644 --- a/math/math_errf.c +++ b/math/math_errf.c @@ -2,7 +2,7 @@ * Single-precision math error handling. * * Copyright (c) 2017-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/pow.c b/math/pow.c index 86842c6abacd962b4df3f536229c977b9d167775..af719fe5ab105861f410eaaa0350692bfaa49346 100644 --- a/math/pow.c +++ b/math/pow.c @@ -2,7 +2,7 @@ * Double-precision x^y function. * * Copyright (c) 2018-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/pow_log_data.c b/math/pow_log_data.c index 45569c5cc0645171b2e88db7dacc186540f8614b..2a4c250d85c3b7715e84c513ed1e6c9daa628ce2 100644 --- a/math/pow_log_data.c +++ b/math/pow_log_data.c @@ -2,7 +2,7 @@ * Data for the log part of pow. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/powf.c b/math/powf.c index 6ba45d3852a50b1ae3decb93e291a6d285692e5e..05c80bb2eb670e032ec6ce4bc504ef0577d45ea8 100644 --- a/math/powf.c +++ b/math/powf.c @@ -2,7 +2,7 @@ * Single-precision pow function. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/powf_log2_data.c b/math/powf_log2_data.c index 97e0d98cdbab6ffa9358a9670acd5c1255c02799..243836a549fdb7d8daf14488b2403eea074f4594 100644 --- a/math/powf_log2_data.c +++ b/math/powf_log2_data.c @@ -2,7 +2,7 @@ * Data definition for powf. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/s_cos.c b/math/s_cos.c deleted file mode 100644 index 53a95b0adfde452cdfd9adb3fd315f314d080118..0000000000000000000000000000000000000000 --- a/math/s_cos.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_cos.c" diff --git a/math/s_cosf.c b/math/s_cosf.c deleted file mode 100644 index 914c02eba6516e924785351f166161a608520c30..0000000000000000000000000000000000000000 --- a/math/s_cosf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_cosf.c" diff --git a/math/s_exp.c b/math/s_exp.c deleted file mode 100644 index ac7246b2c100d474250533eae917b79e179ae13c..0000000000000000000000000000000000000000 --- a/math/s_exp.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_exp.c" diff --git a/math/s_exp2f.c b/math/s_exp2f.c deleted file mode 100644 index df7dfd680ff40d4a15c8fdff9dec438a9978ddd1..0000000000000000000000000000000000000000 --- a/math/s_exp2f.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_exp2f.c" diff --git a/math/s_exp2f_1u.c b/math/s_exp2f_1u.c deleted file mode 100644 index 5e3852b41d83710fe91f1dffcd97662a5bfe6d01..0000000000000000000000000000000000000000 --- a/math/s_exp2f_1u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_exp2f_1u.c" diff --git a/math/s_expf.c b/math/s_expf.c deleted file mode 100644 index 3492c460733d7a128deb55b4bb6db4eaa7092db4..0000000000000000000000000000000000000000 --- a/math/s_expf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_expf.c" diff --git a/math/s_expf_1u.c b/math/s_expf_1u.c deleted file mode 100644 index eb7bbcba5566a177dad04d70407cbeb4c99b3aee..0000000000000000000000000000000000000000 --- a/math/s_expf_1u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_expf_1u.c" diff --git a/math/s_log.c b/math/s_log.c deleted file mode 100644 index 23289cf948ecd9503653a7719bfcec1daf238289..0000000000000000000000000000000000000000 --- a/math/s_log.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_log.c" diff --git a/math/s_logf.c b/math/s_logf.c deleted file mode 100644 index 9399350fc1ee501f7e855ef1bf4ad53fd1c2d374..0000000000000000000000000000000000000000 --- a/math/s_logf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_logf.c" diff --git a/math/s_pow.c b/math/s_pow.c deleted file mode 100644 index 2e34c9f896d6d920937d12befad1fc9e4e0a1596..0000000000000000000000000000000000000000 --- a/math/s_pow.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_pow.c" diff --git a/math/s_powf.c b/math/s_powf.c deleted file mode 100644 index 6d91a4a72b3733ba435d3605ec4d5a880f33ce90..0000000000000000000000000000000000000000 --- a/math/s_powf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_powf.c" diff --git a/math/s_sin.c b/math/s_sin.c deleted file mode 100644 index 06982c2018c675c1b8eac362d7c17b07553da760..0000000000000000000000000000000000000000 --- a/math/s_sin.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_sin.c" diff --git a/math/s_sinf.c b/math/s_sinf.c deleted file mode 100644 index 68ca90853736f260b4b7f345928c1b7ee893f24c..0000000000000000000000000000000000000000 --- a/math/s_sinf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_sinf.c" diff --git a/math/sincosf.c b/math/sincosf.c index 9746f1c22e6c2b30a2003e649fcfd40ebd8bcc7c..446f21d60faf3a5b3203ac6abf4df89a77907ed6 100644 --- a/math/sincosf.c +++ b/math/sincosf.c @@ -1,8 +1,8 @@ /* * Single-precision sin/cos function. * - * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2021, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -22,7 +22,7 @@ sincosf (float y, float *sinp, float *cosp) int n; const sincos_t *p = &__sincosf_table[0]; - if (abstop12 (y) < abstop12 (pio4)) + if (abstop12 (y) < abstop12 (pio4f)) { double x2 = x * x; diff --git a/math/sincosf.h b/math/sincosf.h index 1e80fc9ba8e19cab265fc98ec325c9b3f17a998d..ec23ed7aeb2615e97ca26860c12452d548179ba4 100644 --- a/math/sincosf.h +++ b/math/sincosf.h @@ -1,8 +1,8 @@ /* * Header for sinf, cosf and sincosf. * - * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -12,7 +12,7 @@ /* 2PI * 2^-64. */ static const double pi63 = 0x1.921FB54442D18p-62; /* PI / 4. */ -static const double pio4 = 0x1.921FB54442D18p-1; +static const float pio4f = 0x1.921FB6p-1f; /* The constants and polynomials for sine and cosine. */ typedef struct diff --git a/math/sincosf_data.c b/math/sincosf_data.c index ab4ac4710feff2468cf9e55b04d4ad22dbc75233..22525290ab087a0f27e60f0c36731a5227da4baa 100644 --- a/math/sincosf_data.c +++ b/math/sincosf_data.c @@ -2,7 +2,7 @@ * Data definition for sinf, cosf and sincosf. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/sinf.c b/math/sinf.c index ddbc1daf74a9df1d90dad824f3c30d0460aafcc2..8dd8ae458794c51cd24a4e7623d16ccf49b0bcd9 100644 --- a/math/sinf.c +++ b/math/sinf.c @@ -1,8 +1,8 @@ /* * Single-precision sin function. * - * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -21,7 +21,7 @@ sinf (float y) int n; const sincos_t *p = &__sincosf_table[0]; - if (abstop12 (y) < abstop12 (pio4)) + if (abstop12 (y) < abstop12 (pio4f)) { s = x * x; diff --git a/math/test/mathbench.c b/math/test/mathbench.c index 0c17826e52961b3abd86b1e53ab3ec4a74d7ed8e..b2711e5a763ab4c4b13dfe23ce6abebd8d18d4da 100644 --- a/math/test/mathbench.c +++ b/math/test/mathbench.c @@ -1,8 +1,8 @@ /* * Microbenchmark for math functions. * - * Copyright (c) 2018-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #undef _GNU_SOURCE @@ -15,11 +15,6 @@ #include #include "mathlib.h" -#ifndef WANT_VMATH -/* Enable the build of vector math code. */ -# define WANT_VMATH 1 -#endif - /* Number of measurements, best result is reported. */ #define MEASURE 60 /* Array size. 
*/
@@ -34,8 +29,9 @@ static float Af[N];
 static long measurecount = MEASURE;
 static long itercount = ITER;
-#if __aarch64__ && WANT_VMATH
-typedef __f64x2_t v_double;
+#ifdef __vpcs
+#include <arm_neon.h>
+typedef float64x2_t v_double;
 #define v_double_len() 2
@@ -51,7 +47,7 @@ v_double_dup (double x)
   return (v_double){x, x};
 }
-typedef __f32x4_t v_float;
+typedef float32x4_t v_float;
 #define v_float_len() 4
@@ -76,141 +72,91 @@ typedef float v_float;
 #define v_float_len(x) 1
 #define v_float_load(x) (x)[0]
 #define v_float_dup(x) (x)
-#endif
-
-static double
-dummy (double x)
-{
-  return x;
-}
-
-static float
-dummyf (float x)
-{
-  return x;
-}
-
-#if WANT_VMATH
-#if __aarch64__
-static v_double
-__v_dummy (v_double x)
-{
-  return x;
-}
-static v_float
-__v_dummyf (v_float x)
-{
-  return x;
-}
-
-#ifdef __vpcs
-__vpcs static v_double
-__vn_dummy (v_double x)
-{
-  return x;
-}
+#endif
-__vpcs static v_float
-__vn_dummyf (v_float x)
-{
-  return x;
-}
+#if WANT_SVE_MATH
+#include <arm_sve.h>
+typedef svbool_t sv_bool;
+typedef svfloat64_t sv_double;
-__vpcs static v_float
-xy__vn_powf (v_float x)
-{
-  return __vn_powf (x, x);
-}
+#define sv_double_len() svcntd()
-__vpcs static v_float
-xy_Z_powf (v_float x)
+static inline sv_double
+sv_double_load (const double *p)
 {
-  return _ZGVnN4vv_powf (x, x);
+  svbool_t pg = svptrue_b64();
+  return svld1(pg, p);
 }
-__vpcs static v_double
-xy__vn_pow (v_double x)
+static inline sv_double
+sv_double_dup (double x)
 {
-  return __vn_pow (x, x);
+  return svdup_n_f64(x);
 }
-__vpcs static v_double
-xy_Z_pow (v_double x)
-{
-  return _ZGVnN2vv_pow (x, x);
-}
-#endif
+typedef svfloat32_t sv_float;
-static v_float
-xy__v_powf (v_float x)
-{
-  return __v_powf (x, x);
-}
+#define sv_float_len() svcntw()
-static v_double
-xy__v_pow (v_double x)
+static inline sv_float
+sv_float_load (const float *p)
 {
-  return __v_pow (x, x);
+  svbool_t pg = svptrue_b32();
+  return svld1(pg, p);
 }
-#endif
-static float
-xy__s_powf (float x)
+static inline sv_float
+sv_float_dup (float x)
 {
-  return __s_powf (x, x);
-}
-
-static double
-xy__s_pow (double x)
-{
-  return __s_pow (x, x);
+  return svdup_n_f32(x);
 }
+#else
+/* dummy definitions to make things compile.
*/ +#define sv_double_len(x) 1 +#define sv_float_len(x) 1 #endif static double -xypow (double x) +dummy (double x) { - return pow (x, x); + return x; } static float -xypowf (float x) +dummyf (float x) { - return powf (x, x); + return x; } - -static double -xpow (double x) +#ifdef __vpcs +__vpcs static v_double +__vn_dummy (v_double x) { - return pow (x, 23.4); + return x; } -static float -xpowf (float x) +__vpcs static v_float +__vn_dummyf (v_float x) { - return powf (x, 23.4f); + return x; } - -static double -ypow (double x) +#endif +#if WANT_SVE_MATH +static sv_double +__sv_dummy (sv_double x, sv_bool pg) { - return pow (2.34, x); + return x; } -static float -ypowf (float x) +static sv_float +__sv_dummyf (sv_float x, sv_bool pg) { - return powf (2.34f, x); + return x; } -static float -sincosf_wrap (float x) -{ - float s, c; - sincosf (x, &s, &c); - return s + c; -} +#endif + +#include "test/mathbench_wrappers.h" static const struct fun { @@ -223,127 +169,40 @@ static const struct fun { double (*d) (double); float (*f) (float); - v_double (*vd) (v_double); - v_float (*vf) (v_float); #ifdef __vpcs __vpcs v_double (*vnd) (v_double); __vpcs v_float (*vnf) (v_float); +#endif +#if WANT_SVE_MATH + sv_double (*svd) (sv_double, sv_bool); + sv_float (*svf) (sv_float, sv_bool); #endif } fun; } funtab[] = { #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}}, #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}}, -#define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}}, -#define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}}, #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}}, #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}}, +#define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}}, +#define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}}, D (dummy, 1.0, 2.0) -D (exp, -9.9, 9.9) -D (exp, 0.5, 1.0) -D (exp2, -9.9, 9.9) -D (log, 0.01, 11.1) -D (log, 0.999, 1.001) -D (log2, 0.01, 11.1) -D (log2, 0.999, 1.001) -{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, -D (xpow, 0.01, 11.1) -D (ypow, -9.9, 9.9) -D (erf, -6.0, 6.0) - F (dummyf, 1.0, 2.0) -F (expf, -9.9, 9.9) -F (exp2f, -9.9, 9.9) -F (logf, 0.01, 11.1) -F (log2f, 0.01, 11.1) -{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}}, -F (xpowf, 0.01, 11.1) -F (ypowf, -9.9, 9.9) -{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}}, -F (sinf, 0.1, 0.7) -F (sinf, 0.8, 3.1) -F (sinf, -3.1, 3.1) -F (sinf, 3.3, 33.3) -F (sinf, 100, 1000) -F (sinf, 1e6, 1e32) -F (cosf, 0.1, 0.7) -F (cosf, 0.8, 3.1) -F (cosf, -3.1, 3.1) -F (cosf, 3.3, 33.3) -F (cosf, 100, 1000) -F (cosf, 1e6, 1e32) -F (erff, -4.0, 4.0) -#if WANT_VMATH -D (__s_sin, -3.1, 3.1) -D (__s_cos, -3.1, 3.1) -D (__s_exp, -9.9, 9.9) -D (__s_log, 0.01, 11.1) -{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}}, -F (__s_expf, -9.9, 9.9) -F (__s_expf_1u, -9.9, 9.9) -F (__s_exp2f, -9.9, 9.9) -F (__s_exp2f_1u, -9.9, 9.9) -F (__s_logf, 0.01, 11.1) -{"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}}, -F (__s_sinf, -3.1, 3.1) -F (__s_cosf, -3.1, 3.1) -#if __aarch64__ -VD (__v_dummy, 1.0, 2.0) -VD (__v_sin, -3.1, 3.1) -VD (__v_cos, -3.1, 3.1) -VD (__v_exp, -9.9, 9.9) -VD (__v_log, 0.01, 11.1) -{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}}, -VF (__v_dummyf, 1.0, 2.0) -VF 
(__v_expf, -9.9, 9.9) -VF (__v_expf_1u, -9.9, 9.9) -VF (__v_exp2f, -9.9, 9.9) -VF (__v_exp2f_1u, -9.9, 9.9) -VF (__v_logf, 0.01, 11.1) -{"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}}, -VF (__v_sinf, -3.1, 3.1) -VF (__v_cosf, -3.1, 3.1) #ifdef __vpcs VND (__vn_dummy, 1.0, 2.0) -VND (__vn_exp, -9.9, 9.9) -VND (_ZGVnN2v_exp, -9.9, 9.9) -VND (__vn_log, 0.01, 11.1) -VND (_ZGVnN2v_log, 0.01, 11.1) -{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}}, -{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, -VND (__vn_sin, -3.1, 3.1) -VND (_ZGVnN2v_sin, -3.1, 3.1) -VND (__vn_cos, -3.1, 3.1) -VND (_ZGVnN2v_cos, -3.1, 3.1) VNF (__vn_dummyf, 1.0, 2.0) -VNF (__vn_expf, -9.9, 9.9) -VNF (_ZGVnN4v_expf, -9.9, 9.9) -VNF (__vn_expf_1u, -9.9, 9.9) -VNF (__vn_exp2f, -9.9, 9.9) -VNF (_ZGVnN4v_exp2f, -9.9, 9.9) -VNF (__vn_exp2f_1u, -9.9, 9.9) -VNF (__vn_logf, 0.01, 11.1) -VNF (_ZGVnN4v_logf, 0.01, 11.1) -{"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}}, -{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}}, -VNF (__vn_sinf, -3.1, 3.1) -VNF (_ZGVnN4v_sinf, -3.1, 3.1) -VNF (__vn_cosf, -3.1, 3.1) -VNF (_ZGVnN4v_cosf, -3.1, 3.1) -#endif #endif +#if WANT_SVE_MATH +SVD (__sv_dummy, 1.0, 2.0) +SVF (__sv_dummyf, 1.0, 2.0) #endif +#include "test/mathbench_funcs.h" {0}, #undef F #undef D -#undef VF -#undef VD #undef VNF #undef VND +#undef SVF +#undef SVD }; static void @@ -442,69 +301,75 @@ runf_latency (float f (float)) prev = f (Af[i] + prev * z); } +#ifdef __vpcs static void -run_v_thruput (v_double f (v_double)) +run_vn_thruput (__vpcs v_double f (v_double)) { for (int i = 0; i < N; i += v_double_len ()) f (v_double_load (A+i)); } static void -runf_v_thruput (v_float f (v_float)) +runf_vn_thruput (__vpcs v_float f (v_float)) { for (int i = 0; i < N; i += v_float_len ()) f (v_float_load (Af+i)); } static void -run_v_latency (v_double f (v_double)) +run_vn_latency (__vpcs v_double f (v_double)) { - v_double z = v_double_dup (zero); - v_double prev = z; + volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 }; + uint64x2_t sel = vsel; + v_double prev = v_double_dup (0); for (int i = 0; i < N; i += v_double_len ()) - prev = f (v_double_load (A+i) + prev * z); + prev = f (vbslq_f64 (sel, prev, v_double_load (A+i))); } static void -runf_v_latency (v_float f (v_float)) +runf_vn_latency (__vpcs v_float f (v_float)) { - v_float z = v_float_dup (zero); - v_float prev = z; + volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 }; + uint32x4_t sel = vsel; + v_float prev = v_float_dup (0); for (int i = 0; i < N; i += v_float_len ()) - prev = f (v_float_load (Af+i) + prev * z); + prev = f (vbslq_f32 (sel, prev, v_float_load (Af+i))); } +#endif -#ifdef __vpcs +#if WANT_SVE_MATH static void -run_vn_thruput (__vpcs v_double f (v_double)) +run_sv_thruput (sv_double f (sv_double, sv_bool)) { - for (int i = 0; i < N; i += v_double_len ()) - f (v_double_load (A+i)); + for (int i = 0; i < N; i += sv_double_len ()) + f (sv_double_load (A+i), svptrue_b64 ()); } static void -runf_vn_thruput (__vpcs v_float f (v_float)) +runf_sv_thruput (sv_float f (sv_float, sv_bool)) { - for (int i = 0; i < N; i += v_float_len ()) - f (v_float_load (Af+i)); + for (int i = 0; i < N; i += sv_float_len ()) + f (sv_float_load (Af+i), svptrue_b32 ()); } static void -run_vn_latency (__vpcs v_double f (v_double)) +run_sv_latency (sv_double f (sv_double, sv_bool)) { - v_double z = v_double_dup (zero); - v_double prev = z; - for (int i = 0; i < N; i += v_double_len ()) - prev = f (v_double_load (A+i) + prev * z); + volatile sv_bool vsel 
= svptrue_b64 (); + sv_bool sel = vsel; + sv_double prev = sv_double_dup (0); + for (int i = 0; i < N; i += sv_double_len ()) + prev = f (svsel_f64 (sel, sv_double_load (A+i), prev), svptrue_b64 ()); } static void -runf_vn_latency (__vpcs v_float f (v_float)) +runf_sv_latency (sv_float f (sv_float, sv_bool)) { - v_float z = v_float_dup (zero); - v_float prev = z; - for (int i = 0; i < N; i += v_float_len ()) - prev = f (v_float_load (Af+i) + prev * z); + volatile sv_bool vsel = svptrue_b32 (); + sv_bool sel = vsel; + sv_float prev = sv_float_dup (0); + for (int i = 0; i < N; i += sv_float_len ()) + prev = f (svsel_f32 (sel, sv_float_load (Af+i), prev), svptrue_b32 ()); } #endif @@ -539,10 +404,10 @@ bench1 (const struct fun *f, int type, double lo, double hi) const char *s = type == 't' ? "rthruput" : "latency"; int vlen = 1; - if (f->vec && f->prec == 'd') - vlen = v_double_len(); - else if (f->vec && f->prec == 'f') - vlen = v_float_len(); + if (f->vec == 'n') + vlen = f->prec == 'd' ? v_double_len() : v_float_len(); + else if (f->vec == 's') + vlen = f->prec == 'd' ? sv_double_len() : sv_float_len(); if (f->prec == 'd' && type == 't' && f->vec == 0) TIMEIT (run_thruput, f->fun.d); @@ -552,14 +417,6 @@ bench1 (const struct fun *f, int type, double lo, double hi) TIMEIT (runf_thruput, f->fun.f); else if (f->prec == 'f' && type == 'l' && f->vec == 0) TIMEIT (runf_latency, f->fun.f); - else if (f->prec == 'd' && type == 't' && f->vec == 'v') - TIMEIT (run_v_thruput, f->fun.vd); - else if (f->prec == 'd' && type == 'l' && f->vec == 'v') - TIMEIT (run_v_latency, f->fun.vd); - else if (f->prec == 'f' && type == 't' && f->vec == 'v') - TIMEIT (runf_v_thruput, f->fun.vf); - else if (f->prec == 'f' && type == 'l' && f->vec == 'v') - TIMEIT (runf_v_latency, f->fun.vf); #ifdef __vpcs else if (f->prec == 'd' && type == 't' && f->vec == 'n') TIMEIT (run_vn_thruput, f->fun.vnd); @@ -570,20 +427,32 @@ bench1 (const struct fun *f, int type, double lo, double hi) else if (f->prec == 'f' && type == 'l' && f->vec == 'n') TIMEIT (runf_vn_latency, f->fun.vnf); #endif +#if WANT_SVE_MATH + else if (f->prec == 'd' && type == 't' && f->vec == 's') + TIMEIT (run_sv_thruput, f->fun.svd); + else if (f->prec == 'd' && type == 'l' && f->vec == 's') + TIMEIT (run_sv_latency, f->fun.svd); + else if (f->prec == 'f' && type == 't' && f->vec == 's') + TIMEIT (runf_sv_thruput, f->fun.svf); + else if (f->prec == 'f' && type == 'l' && f->vec == 's') + TIMEIT (runf_sv_latency, f->fun.svf); +#endif if (type == 't') { ns100 = (100 * dt + itercount * N / 2) / (itercount * N); - printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s, + printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n", + f->name, s, (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), - (unsigned long long) dt, lo, hi); + (unsigned long long) dt, lo, hi, vlen); } else if (type == 'l') { ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen); - printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s, + printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n", + f->name, s, (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), - (unsigned long long) dt, lo, hi); + (unsigned long long) dt, lo, hi, vlen); } fflush (stdout); } diff --git a/math/test/mathbench_funcs.h b/math/test/mathbench_funcs.h new file mode 100644 index 0000000000000000000000000000000000000000..84c4e68650acbb1ded2e43dee5410b7c3e7224c4 --- /dev/null +++ b/math/test/mathbench_funcs.h @@ -0,0 +1,62 @@ +/* + * Function entries for 
mathbench. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +/* clang-format off */ +D (exp, -9.9, 9.9) +D (exp, 0.5, 1.0) +D (exp10, -9.9, 9.9) +D (exp2, -9.9, 9.9) +D (log, 0.01, 11.1) +D (log, 0.999, 1.001) +D (log2, 0.01, 11.1) +D (log2, 0.999, 1.001) +{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, +D (xpow, 0.01, 11.1) +D (ypow, -9.9, 9.9) +D (erf, -6.0, 6.0) + +F (expf, -9.9, 9.9) +F (exp2f, -9.9, 9.9) +F (logf, 0.01, 11.1) +F (log2f, 0.01, 11.1) +{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}}, +F (xpowf, 0.01, 11.1) +F (ypowf, -9.9, 9.9) +{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}}, +F (sinf, 0.1, 0.7) +F (sinf, 0.8, 3.1) +F (sinf, -3.1, 3.1) +F (sinf, 3.3, 33.3) +F (sinf, 100, 1000) +F (sinf, 1e6, 1e32) +F (cosf, 0.1, 0.7) +F (cosf, 0.8, 3.1) +F (cosf, -3.1, 3.1) +F (cosf, 3.3, 33.3) +F (cosf, 100, 1000) +F (cosf, 1e6, 1e32) +F (erff, -4.0, 4.0) +#ifdef __vpcs +VND (_ZGVnN2v_exp, -9.9, 9.9) +VND (_ZGVnN2v_log, 0.01, 11.1) +{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, +VND (_ZGVnN2v_sin, -3.1, 3.1) +VND (_ZGVnN2v_cos, -3.1, 3.1) +VNF (_ZGVnN4v_expf, -9.9, 9.9) +VNF (_ZGVnN4v_expf_1u, -9.9, 9.9) +VNF (_ZGVnN4v_exp2f, -9.9, 9.9) +VNF (_ZGVnN4v_exp2f_1u, -9.9, 9.9) +VNF (_ZGVnN4v_logf, 0.01, 11.1) +{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}}, +VNF (_ZGVnN4v_sinf, -3.1, 3.1) +VNF (_ZGVnN4v_cosf, -3.1, 3.1) +#endif + /* clang-format on */ diff --git a/math/test/mathbench_wrappers.h b/math/test/mathbench_wrappers.h new file mode 100644 index 0000000000000000000000000000000000000000..062b9db56de51a741a698e13a547184461b2ca2b --- /dev/null +++ b/math/test/mathbench_wrappers.h @@ -0,0 +1,66 @@ +/* + * Function wrappers for mathbench. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifdef __vpcs + +__vpcs static v_float +xy_Z_powf (v_float x) +{ + return _ZGVnN4vv_powf (x, x); +} + +__vpcs static v_double +xy_Z_pow (v_double x) +{ + return _ZGVnN2vv_pow (x, x); +} + +#endif + +static double +xypow (double x) +{ + return pow (x, x); +} + +static float +xypowf (float x) +{ + return powf (x, x); +} + +static double +xpow (double x) +{ + return pow (x, 23.4); +} + +static float +xpowf (float x) +{ + return powf (x, 23.4f); +} + +static double +ypow (double x) +{ + return pow (2.34, x); +} + +static float +ypowf (float x) +{ + return powf (2.34f, x); +} + +static float +sincosf_wrap (float x) +{ + float s, c; + sincosf (x, &s, &c); + return s + c; +} diff --git a/math/test/mathtest.c b/math/test/mathtest.c index 310896738e478481a9f91ff878957a1f86accc2e..cedccfd39455930bf51ffdbb638b1be9935b4d80 100644 --- a/math/test/mathtest.c +++ b/math/test/mathtest.c @@ -1,8 +1,8 @@ /* * mathtest.c - test rig for mathlib * - * Copyright (c) 1998-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 1998-2022, Arm Limited. 
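Aside on the rewritten latency runners above (run_vn_latency, run_sv_latency): each call's result is now chained into the next argument through a bit-select whose all-zero mask is loaded from a volatile, so the compiler can neither prove the mask constant nor delete the dependency on prev — and, unlike the old `input + prev * zero` chain, a select keeps the inputs bit-exact even when prev is Inf or NaN (Inf * 0 would have injected a NaN input). A minimal scalar sketch of the same idea, assuming only a unary double function and an input array (not part of the patch):

#include <stdint.h>

/* Serialise calls on a data dependency without perturbing the inputs.  */
static double
latency_sketch (double f (double), const double *a, int n)
{
  volatile uint64_t vsel = 0;   /* opaque to the optimizer */
  uint64_t sel = vsel;
  double prev = 0;
  for (int i = 0; i < n; i++)
    /* sel is always 0, so a[i] is passed through unchanged, but the
       compiler must assume prev can be selected and keep the chain.  */
    prev = f (sel ? prev : a[i]);
  return prev;
}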
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -196,9 +196,11 @@ int is_complex_rettype(int rettype) { #define TFUNCARM(arg,ret,name,tolerance) { t_func, arg, ret, (void*)& ARM_PREFIX(name), m_none, tolerance, #name } #define MFUNC(arg,ret,name,tolerance) { t_macro, arg, ret, NULL, m_##name, tolerance, #name } +#ifndef PL /* sincosf wrappers for easier testing. */ static float sincosf_sinf(float x) { float s,c; sincosf(x, &s, &c); return s; } static float sincosf_cosf(float x) { float s,c; sincosf(x, &s, &c); return c; } +#endif test_func tfuncs[] = { /* trigonometric */ @@ -218,9 +220,10 @@ test_func tfuncs[] = { TFUNCARM(at_s,rt_s, tanf, 4*ULPUNIT), TFUNCARM(at_s,rt_s, sinf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, cosf, 3*ULPUNIT/4), +#ifndef PL TFUNCARM(at_s,rt_s, sincosf_sinf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, sincosf_cosf, 3*ULPUNIT/4), - +#endif /* hyperbolic */ TFUNC(at_d, rt_d, atanh, 4*ULPUNIT), TFUNC(at_d, rt_d, asinh, 4*ULPUNIT), @@ -251,6 +254,7 @@ test_func tfuncs[] = { TFUNCARM(at_s,rt_s, expf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, exp2f, 3*ULPUNIT/4), TFUNC(at_s,rt_s, expm1f, ULPUNIT), + TFUNC(at_d,rt_d, exp10, ULPUNIT), /* power */ TFUNC(at_d2,rt_d, pow, 3*ULPUNIT/4), @@ -1018,6 +1022,7 @@ int runtest(testdetail t) { DO_DOP(d_arg1,op1r); DO_DOP(d_arg2,op2r); s_arg1.i = t.op1r[0]; s_arg2.i = t.op2r[0]; + s_res.i = 0; /* * Detect NaNs, infinities and denormals on input, and set a @@ -1152,22 +1157,25 @@ int runtest(testdetail t) { tresultr[0] = t.resultr[0]; tresultr[1] = t.resultr[1]; resultr[0] = d_res.i[dmsd]; resultr[1] = d_res.i[dlsd]; + resulti[0] = resulti[1] = 0; wres = 2; break; case rt_i: tresultr[0] = t.resultr[0]; resultr[0] = intres; + resulti[0] = 0; wres = 1; break; case rt_s: case rt_s2: tresultr[0] = t.resultr[0]; resultr[0] = s_res.i; + resulti[0] = 0; wres = 1; break; default: puts("unhandled rettype in runtest"); - wres = 0; + abort (); } if(t.resultc != rc_none) { int err = 0; diff --git a/math/test/rtest/dotest.c b/math/test/rtest/dotest.c index 6be79e1df0d1acef5a5c3861f1ab73058e10b836..5b3e9b4f18e467c536d989292b91706af1ff4f67 100644 --- a/math/test/rtest/dotest.c +++ b/math/test/rtest/dotest.c @@ -2,7 +2,7 @@ * dotest.c - actually generate mathlib test cases * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/test/rtest/intern.h b/math/test/rtest/intern.h index 12a9c749e18e1127eb27922ad30d13ac3cbd4d1c..3ebd7ddaf85d7b37c163ea8994250ac20408b396 100644 --- a/math/test/rtest/intern.h +++ b/math/test/rtest/intern.h @@ -2,7 +2,7 @@ * intern.h * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef mathtest_intern_h diff --git a/math/test/rtest/main.c b/math/test/rtest/main.c index 0d8ead891320a5c5afb3d72d7b7fbd82c5f6e540..3d533c946f79be126fc0b427a9578effed68179a 100644 --- a/math/test/rtest/main.c +++ b/math/test/rtest/main.c @@ -2,7 +2,7 @@ * main.c * * Copyright (c) 1999-2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/test/rtest/random.c b/math/test/rtest/random.c index 56123966b8c48f8acbeb1501d1e56d5b1d5e3e2c..1de32580b733d347d9a62cfe2521a8aca3622b87 100644 --- a/math/test/rtest/random.c +++ b/math/test/rtest/random.c @@ -2,7 +2,7 @@ * random.c - random number generator for producing mathlib test cases * * Copyright (c) 1998-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "types.h" diff --git a/math/test/rtest/random.h b/math/test/rtest/random.h index b4b22df82a3d768bdb8227f6731b0bce5d6ce843..0b477d72b2346b2adbcd7cc7fec20dd9ade42395 100644 --- a/math/test/rtest/random.h +++ b/math/test/rtest/random.h @@ -2,7 +2,7 @@ * random.h - header for random.c * * Copyright (c) 2009-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "types.h" diff --git a/math/test/rtest/semi.c b/math/test/rtest/semi.c index c9f0daf76508194f5443bfe9fccbbfb89fe8964d..70a7844a48d613d1726b529e1347ccaec6439c1a 100644 --- a/math/test/rtest/semi.c +++ b/math/test/rtest/semi.c @@ -2,7 +2,7 @@ * semi.c: test implementations of mathlib seminumerical functions * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/test/rtest/semi.h b/math/test/rtest/semi.h index 17dc4158fb51e87e465c76ca5c9192cfb0dee71b..7a1444e55d288c93f665769a4a6fea37d42d7d33 100644 --- a/math/test/rtest/semi.h +++ b/math/test/rtest/semi.h @@ -2,7 +2,7 @@ * semi.h: header for semi.c * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef test_semi_h diff --git a/math/test/rtest/types.h b/math/test/rtest/types.h index 53cd557fa4cf448d6d4f49dbd85cf8c514905d47..e15b4e06a0d4aac3a0595edf69bfe830f56d624c 100644 --- a/math/test/rtest/types.h +++ b/math/test/rtest/types.h @@ -2,7 +2,7 @@ * types.h * * Copyright (c) 2005-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef mathtest_types_h diff --git a/math/test/rtest/wrappers.c b/math/test/rtest/wrappers.c index de45ac5768d0f750c1ad48c15902a02df9a8336d..441017192ab48b8332415c666cbd6d29c87c1e08 100644 --- a/math/test/rtest/wrappers.c +++ b/math/test/rtest/wrappers.c @@ -2,7 +2,7 @@ * wrappers.c - wrappers to modify output of MPFR/MPC test functions * * Copyright (c) 2014-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/test/rtest/wrappers.h b/math/test/rtest/wrappers.h index 7b09c85a59f114af56f6ec7dd9e0e7c00bd43721..0a8a58777d8aed7ae1c2e1f489b822919aa53967 100644 --- a/math/test/rtest/wrappers.h +++ b/math/test/rtest/wrappers.h @@ -2,7 +2,7 @@ * wrappers.h - wrappers to modify output of MPFR/MPC test functions * * Copyright (c) 2014-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ typedef struct { diff --git a/math/test/runulp.sh b/math/test/runulp.sh index 0190d9ab27fb104de780d9101507a85ee9ff7a2e..e2e03e3ae76196e8f94f2cdadd3147b4f4fafdac 100755 --- a/math/test/runulp.sh +++ b/math/test/runulp.sh @@ -2,8 +2,8 @@ # ULP error check script. # -# Copyright (c) 2019-2020, Arm Limited. 
-# SPDX-License-Identifier: MIT +# Copyright (c) 2019-2023, Arm Limited. +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception #set -x set -eu @@ -72,6 +72,16 @@ t pow 0x1.ffffffffffff0p-1 0x1.0000000000008p0 x 0x1p60 0x1p68 50000 t pow 0x1.ffffffffff000p-1 0x1p0 x 0x1p50 0x1p52 50000 t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000 +L=0.02 +t exp10 0 0x1p-47 5000 +t exp10 -0 -0x1p-47 5000 +t exp10 0x1p-47 1 50000 +t exp10 -0x1p-47 -1 50000 +t exp10 1 0x1.34413509f79ffp8 50000 +t exp10 -1 -0x1.434e6420f4374p8 50000 +t exp10 0x1.34413509f79ffp8 inf 5000 +t exp10 -0x1.434e6420f4374p8 -inf 5000 + L=1.0 Ldir=0.9 t erf 0 0xffff000000000000 10000 @@ -143,15 +153,10 @@ Ldir=0.5 done # vector functions + Ldir=0.5 r='n' -flags="${ULPFLAGS:--q} -f" -runs= -check __s_exp 1 && runs=1 -runv= -check __v_exp 1 && runv=1 -runvn= -check __vn_exp 1 && runvn=1 +flags="${ULPFLAGS:--q}" range_exp=' 0 0xffff000000000000 10000 @@ -177,9 +182,10 @@ range_pow=' ' range_sin=' - 0 0xffff000000000000 10000 - 0x1p-4 0x1p4 400000 - -0x1p-23 0x1p23 400000 + 0 0x1p23 500000 + -0 -0x1p23 500000 + 0x1p23 inf 10000 + -0x1p23 -inf 10000 ' range_cos="$range_sin" @@ -199,9 +205,10 @@ range_logf=' ' range_sinf=' - 0 0xffff0000 10000 - 0x1p-4 0x1p4 300000 --0x1p-9 -0x1p9 300000 + 0 0x1p20 500000 + -0 -0x1p20 500000 + 0x1p20 inf 10000 + -0x1p20 -inf 10000 ' range_cosf="$range_sinf" @@ -229,9 +236,8 @@ L_sinf=1.4 L_cosf=1.4 L_powf=2.1 -while read G F R +while read G F D do - [ "$R" = 1 ] || continue case "$G" in \#*) continue ;; esac eval range="\${range_$G}" eval L="\${L_$G}" @@ -239,74 +245,35 @@ do do [ -n "$X" ] || continue case "$X" in \#*) continue ;; esac - t $F $X + disable_fenv="" + if [ -z "$WANT_SIMD_EXCEPT" ] || [ $WANT_SIMD_EXCEPT -eq 0 ]; then + # If library was built with SIMD exceptions + # disabled, disable fenv checking in ulp + # tool. Otherwise, fenv checking may still be + # disabled by adding -f to the end of the run + # line. 
+ disable_fenv="-f" + fi + t $D $disable_fenv $F $X done << EOF $range + EOF done << EOF # group symbol run -exp __s_exp $runs -exp __v_exp $runv -exp __vn_exp $runvn -exp _ZGVnN2v_exp $runvn - -log __s_log $runs -log __v_log $runv -log __vn_log $runvn -log _ZGVnN2v_log $runvn - -pow __s_pow $runs -pow __v_pow $runv -pow __vn_pow $runvn -pow _ZGVnN2vv_pow $runvn - -sin __s_sin $runs -sin __v_sin $runv -sin __vn_sin $runvn -sin _ZGVnN2v_sin $runvn - -cos __s_cos $runs -cos __v_cos $runv -cos __vn_cos $runvn -cos _ZGVnN2v_cos $runvn - -expf __s_expf $runs -expf __v_expf $runv -expf __vn_expf $runvn -expf _ZGVnN4v_expf $runvn - -expf_1u __s_expf_1u $runs -expf_1u __v_expf_1u $runv -expf_1u __vn_expf_1u $runvn - -exp2f __s_exp2f $runs -exp2f __v_exp2f $runv -exp2f __vn_exp2f $runvn -exp2f _ZGVnN4v_exp2f $runvn - -exp2f_1u __s_exp2f_1u $runs -exp2f_1u __v_exp2f_1u $runv -exp2f_1u __vn_exp2f_1u $runvn - -logf __s_logf $runs -logf __v_logf $runv -logf __vn_logf $runvn -logf _ZGVnN4v_logf $runvn - -sinf __s_sinf $runs -sinf __v_sinf $runv -sinf __vn_sinf $runvn -sinf _ZGVnN4v_sinf $runvn - -cosf __s_cosf $runs -cosf __v_cosf $runv -cosf __vn_cosf $runvn -cosf _ZGVnN4v_cosf $runvn - -powf __s_powf $runs -powf __v_powf $runv -powf __vn_powf $runvn -powf _ZGVnN4vv_powf $runvn +exp _ZGVnN2v_exp +log _ZGVnN2v_log +pow _ZGVnN2vv_pow -f +sin _ZGVnN2v_sin -z +cos _ZGVnN2v_cos +expf _ZGVnN4v_expf +expf_1u _ZGVnN4v_expf_1u -f +exp2f _ZGVnN4v_exp2f +exp2f_1u _ZGVnN4v_exp2f_1u -f +logf _ZGVnN4v_logf +sinf _ZGVnN4v_sinf -z +cosf _ZGVnN4v_cosf +powf _ZGVnN4vv_powf -f EOF [ 0 -eq $FAIL ] || { diff --git a/math/test/testcases/directed/cosf.tst b/math/test/testcases/directed/cosf.tst index 79160443f0990058f70bc0d03be6be545f3fd6f7..7ea0d45795a3647c73ea1de9f6cbf956a1ef7bb9 100644 --- a/math/test/testcases/directed/cosf.tst +++ b/math/test/testcases/directed/cosf.tst @@ -1,7 +1,7 @@ ; cosf.tst - Directed test cases for SP cosine ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=cosf op1=7fc00001 result=7fc00001 errno=0 func=cosf op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/erf.tst b/math/test/testcases/directed/erf.tst index 7fa4d1868c0eb1a27920eda1485ee7be4dbe0f01..12384cef0dd98e24ac842c45ed89012d9c1b0ef6 100644 --- a/math/test/testcases/directed/erf.tst +++ b/math/test/testcases/directed/erf.tst @@ -1,7 +1,7 @@ ; erf.tst - Directed test cases for erf ; ; Copyright (c) 2007-2020, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=erf op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=erf op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/erff.tst b/math/test/testcases/directed/erff.tst index d05b7b1119c46c21ce7d22d2d3f2cbebff6eae44..28f8fa37f5aa7db743d399ab2a6af3070b009fe8 100644 --- a/math/test/testcases/directed/erff.tst +++ b/math/test/testcases/directed/erff.tst @@ -1,7 +1,7 @@ ; erff.tst ; ; Copyright (c) 2007-2020, Arm Limited. 
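Reading the new table together with the loop above: a row such as `sin _ZGVnN2v_sin -z` ends up invoking the ulp tool roughly as

ulp -q -f -z _ZGVnN2v_sin 0 0x1p23 500000

(an illustrative expansion — the exact command comes from the `t` helper defined earlier in the script, outside this hunk). The third column simply carries per-function extra flags: -z engages the new ignore-zero-sign comparison added to ulp.c further below, and -f disables fenv checking for the entries marked with it.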
-; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=erff op1=7fc00001 result=7fc00001 errno=0 func=erff op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/exp.tst b/math/test/testcases/directed/exp.tst index 85d556cd1e00f75c3273e67d420adce2ea7849df..0bb2ef4579cc1c5313494ead5e0dc80f4a02153e 100644 --- a/math/test/testcases/directed/exp.tst +++ b/math/test/testcases/directed/exp.tst @@ -1,7 +1,7 @@ ; Directed test cases for exp ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=exp op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=exp op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/exp10.tst b/math/test/testcases/directed/exp10.tst new file mode 100644 index 0000000000000000000000000000000000000000..2cf4273bd1d718ef0332e78553e4c2c3c1cb5c2d --- /dev/null +++ b/math/test/testcases/directed/exp10.tst @@ -0,0 +1,15 @@ +; Directed test cases for exp10 +; +; Copyright (c) 2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=exp10 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=exp10 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=exp10 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=exp10 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=exp10 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=exp10 op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox +func=exp10 op1=fff00000.00000000 result=00000000.00000000 errno=0 +func=exp10 op1=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux +func=exp10 op1=00000000.00000000 result=3ff00000.00000000 errno=0 +func=exp10 op1=80000000.00000000 result=3ff00000.00000000 errno=0 diff --git a/math/test/testcases/directed/exp2.tst b/math/test/testcases/directed/exp2.tst index fa56c9f8be4b91598121f7f376e68968d806001d..7069f9010c8ccf6e5407aadf2a581b12ff736fe4 100644 --- a/math/test/testcases/directed/exp2.tst +++ b/math/test/testcases/directed/exp2.tst @@ -1,7 +1,7 @@ ; Directed test cases for exp2 ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=exp2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=exp2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/exp2f.tst b/math/test/testcases/directed/exp2f.tst index 38cfc3f78ac61dae04c0d0372110d3351e669848..6ca2eeab4e121e165703644bee54b5d855225886 100644 --- a/math/test/testcases/directed/exp2f.tst +++ b/math/test/testcases/directed/exp2f.tst @@ -1,7 +1,7 @@ ; exp2f.tst - Directed test cases for exp2f ; ; Copyright (c) 2017-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=exp2f op1=7fc00001 result=7fc00001 errno=0 func=exp2f op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/expf.tst b/math/test/testcases/directed/expf.tst index ff0f671c2656a94b17f96d8f9a683a6b8436f674..89ae8fe78e6c17cd5295230c4185a8d11b98849d 100644 --- a/math/test/testcases/directed/expf.tst +++ b/math/test/testcases/directed/expf.tst @@ -1,7 +1,7 @@ ; expf.tst - Directed test cases for expf ; ; Copyright (c) 2007-2019, Arm Limited. 
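The bounds in the new exp10 tests are not arbitrary: exp10 overflows just above log10(DBL_MAX) and flushes to zero just below log10 of the smallest subnormal, which is what the runulp.sh limits 0x1.34413509f79ffp8 and -0x1.434e6420f4374p8 encode, and why exp10.tst expects ERANGE with status ox/ux at +/-DBL_MAX. A quick way to reproduce the two thresholds (sketch, not part of the patch):

#include <float.h>
#include <math.h>
#include <stdio.h>

int
main (void)
{
  /* Largest x before exp10(x) overflows, and the underflow bound.  */
  printf ("%a\n", log10 (DBL_MAX));    /* ~  0x1.34413509f79fep+8 */
  printf ("%a\n", log10 (0x1p-1074)); /* ~ -0x1.434e6420f4374p+8 */
  return 0;
}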
-; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=expf op1=7fc00001 result=7fc00001 errno=0 func=expf op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/log.tst b/math/test/testcases/directed/log.tst index a0aa398cbf734396be64c61612f463524d283d15..686ea835645b9c7af857f5052efbb29c71493a20 100644 --- a/math/test/testcases/directed/log.tst +++ b/math/test/testcases/directed/log.tst @@ -1,7 +1,7 @@ ; Directed test cases for log ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=log op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/log2.tst b/math/test/testcases/directed/log2.tst index ff1286cbd53e8ebfba5db81b9d244a598eb9a6ac..361bddec374bb16da87dd6fa5dc10d0a2f1ff366 100644 --- a/math/test/testcases/directed/log2.tst +++ b/math/test/testcases/directed/log2.tst @@ -1,7 +1,7 @@ ; Directed test cases for log2 ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=log2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/log2f.tst b/math/test/testcases/directed/log2f.tst index 5832c4f08f1ecb6acf9fdbbb06f0ce75bac82f6d..5fce051cddba75e19eff4fd577a456a452554159 100644 --- a/math/test/testcases/directed/log2f.tst +++ b/math/test/testcases/directed/log2f.tst @@ -1,7 +1,7 @@ ; log2f.tst - Directed test cases for log2f ; ; Copyright (c) 2017-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log2f op1=7fc00001 result=7fc00001 errno=0 func=log2f op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/logf.tst b/math/test/testcases/directed/logf.tst index 6e68a36e0f6a29f8d0646f450ed3abf0aafca260..a6d1b9d5c51fa1b9b7e8c5eef465f2e4f5cfb6f5 100644 --- a/math/test/testcases/directed/logf.tst +++ b/math/test/testcases/directed/logf.tst @@ -1,7 +1,7 @@ ; logf.tst - Directed test cases for logf ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=logf op1=7fc00001 result=7fc00001 errno=0 func=logf op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/pow.tst b/math/test/testcases/directed/pow.tst index 19665817153d03ef84dc85fe3be375bd63d2dad5..879d12864afe5d2c3e98e1c07095d7f58fe68b3b 100644 --- a/math/test/testcases/directed/pow.tst +++ b/math/test/testcases/directed/pow.tst @@ -1,7 +1,7 @@ ; Directed test cases for pow ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=pow op1=00000000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=00000000.00000000 op2=00000000.00000001 result=00000000.00000000 errno=0 diff --git a/math/test/testcases/directed/powf.tst b/math/test/testcases/directed/powf.tst index 3fa8b110f8bcb97196dca92030271ceb376644a8..46d5224008710127eb93863f5b19196cdc89693d 100644 --- a/math/test/testcases/directed/powf.tst +++ b/math/test/testcases/directed/powf.tst @@ -1,7 +1,7 @@ ; powf.tst - Directed test cases for powf ; ; Copyright (c) 2007-2019, Arm Limited. 
-; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=powf op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i func=powf op1=7f800001 op2=ff800001 result=7fc00001 errno=0 status=i diff --git a/math/test/testcases/directed/sincosf.tst b/math/test/testcases/directed/sincosf.tst index 4b33d2291c660c034fed47522966599203bb8b6c..cddb346558ea3c16ad3ca47ff7c2e3789aa31816 100644 --- a/math/test/testcases/directed/sincosf.tst +++ b/math/test/testcases/directed/sincosf.tst @@ -1,7 +1,7 @@ ; Directed test cases for SP sincos ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=sincosf_sinf op1=7fc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/sinf.tst b/math/test/testcases/directed/sinf.tst index ded80b1598c6a3904ed8eb6baab351f493592bcc..041b13d5d6cbc5e56fb570126159a73b70d0d1ab 100644 --- a/math/test/testcases/directed/sinf.tst +++ b/math/test/testcases/directed/sinf.tst @@ -1,7 +1,7 @@ ; sinf.tst - Directed test cases for SP sine ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=sinf op1=7fc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/random/double.tst b/math/test/testcases/random/double.tst index c24ff80d5d95eccc799de5bd3dd0876b19ae8fb9..8e885d61722a0b5e871e12af0a0cc0f4558f6d5b 100644 --- a/math/test/testcases/random/double.tst +++ b/math/test/testcases/random/double.tst @@ -1,7 +1,7 @@ !! double.tst - Random test case specification for DP functions !! !! Copyright (c) 1999-2019, Arm Limited. -!! SPDX-License-Identifier: MIT +!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception test exp 10000 test exp2 10000 diff --git a/math/test/testcases/random/float.tst b/math/test/testcases/random/float.tst index d02a22750abe07b9b64b63a0caf9afcb3c83d50c..ea4a5a01521484b53a8413a7edfe673c110b2bae 100644 --- a/math/test/testcases/random/float.tst +++ b/math/test/testcases/random/float.tst @@ -1,7 +1,7 @@ !! single.tst - Random test case specification for SP functions !! !! Copyright (c) 1999-2019, Arm Limited. -!! SPDX-License-Identifier: MIT +!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception test sinf 10000 test cosf 10000 diff --git a/math/test/ulp.c b/math/test/ulp.c index 51479b87a0fde860e1584536fd13b8471cfca9a2..5ff29972e50ee01026e8f15af0e7c73008909bda 100644 --- a/math/test/ulp.c +++ b/math/test/ulp.c @@ -1,10 +1,11 @@ /* * ULP error checking tool for math functions. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#define _GNU_SOURCE #include #include #include @@ -23,11 +24,6 @@ # include #endif -#ifndef WANT_VMATH -/* Enable the build of vector math code. */ -# define WANT_VMATH 1 -#endif - static inline uint64_t asuint64 (double f) { @@ -212,73 +208,61 @@ struct conf unsigned long long n; double softlim; double errlim; + int ignore_zero_sign; }; -/* Wrappers for sincos. 
*/ -static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);} -static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);} -static double sincos_sin(double x) {(void)cos(x); return sin(x);} -static double sincos_cos(double x) {(void)sin(x); return cos(x);} -#if USE_MPFR -static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); } -static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); } -#endif - /* A bit of a hack: call vector functions twice with the same input in lane 0 but a different value in other lanes: once with an in-range value and then with a special case value. */ static int secondcall; /* Wrappers for vector functions. */ -#if __aarch64__ && WANT_VMATH +#ifdef __vpcs typedef __f32x4_t v_float; typedef __f64x2_t v_double; -static const float fv[2] = {1.0f, -INFINITY}; -static const double dv[2] = {1.0, -INFINITY}; +/* First element of fv and dv may be changed by -c argument. */ +static float fv[2] = {1.0f, -INFINITY}; +static double dv[2] = {1.0, -INFINITY}; static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; } static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; } - -static float v_sinf(float x) { return __v_sinf(argf(x))[0]; } -static float v_cosf(float x) { return __v_cosf(argf(x))[0]; } -static float v_expf_1u(float x) { return __v_expf_1u(argf(x))[0]; } -static float v_expf(float x) { return __v_expf(argf(x))[0]; } -static float v_exp2f_1u(float x) { return __v_exp2f_1u(argf(x))[0]; } -static float v_exp2f(float x) { return __v_exp2f(argf(x))[0]; } -static float v_logf(float x) { return __v_logf(argf(x))[0]; } -static float v_powf(float x, float y) { return __v_powf(argf(x),argf(y))[0]; } -static double v_sin(double x) { return __v_sin(argd(x))[0]; } -static double v_cos(double x) { return __v_cos(argd(x))[0]; } -static double v_exp(double x) { return __v_exp(argd(x))[0]; } -static double v_log(double x) { return __v_log(argd(x))[0]; } -static double v_pow(double x, double y) { return __v_pow(argd(x),argd(y))[0]; } -#ifdef __vpcs -static float vn_sinf(float x) { return __vn_sinf(argf(x))[0]; } -static float vn_cosf(float x) { return __vn_cosf(argf(x))[0]; } -static float vn_expf_1u(float x) { return __vn_expf_1u(argf(x))[0]; } -static float vn_expf(float x) { return __vn_expf(argf(x))[0]; } -static float vn_exp2f_1u(float x) { return __vn_exp2f_1u(argf(x))[0]; } -static float vn_exp2f(float x) { return __vn_exp2f(argf(x))[0]; } -static float vn_logf(float x) { return __vn_logf(argf(x))[0]; } -static float vn_powf(float x, float y) { return __vn_powf(argf(x),argf(y))[0]; } -static double vn_sin(double x) { return __vn_sin(argd(x))[0]; } -static double vn_cos(double x) { return __vn_cos(argd(x))[0]; } -static double vn_exp(double x) { return __vn_exp(argd(x))[0]; } -static double vn_log(double x) { return __vn_log(argd(x))[0]; } -static double vn_pow(double x, double y) { return __vn_pow(argd(x),argd(y))[0]; } -static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; } -static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; } -static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; } -static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; } -static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; } -static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; } -static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; } 
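The Z_* wrappers above, together with the `secondcall` flag, implement the double-call scheme described in the comment: lane 0 always carries the value under test, while the spare lanes take fv[secondcall]/dv[secondcall] — an in-range value on the first call, -INFINITY on the second — so that a special case in an unused lane cannot silently corrupt lane 0 (and -c now lets the in-range value be changed). A sketch of how the two calls fit together, assuming the surrounding ulp.c definitions of secondcall and argf; the helper name Z_sinf_bothlanes is made up for illustration:

static float
Z_sinf_bothlanes (float x)
{
  secondcall = 0;
  float first = _ZGVnN4v_sinf (argf (x))[0];   /* lanes: x, x, x, fv[0] */
  secondcall = 1;
  float second = _ZGVnN4v_sinf (argf (x))[0];  /* lanes: x, x, x, -INFINITY */
  if (first == second || (first != first && second != second))
    return first;
  return 0.0f / 0.0f;  /* NaN: lane 0 depended on the other lanes */
}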
-static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; } -static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; } -static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; } -static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; } +#if WANT_SVE_MATH +#include +typedef __SVFloat32_t sv_float; +typedef __SVFloat64_t sv_double; + +static inline sv_float svargf(float x) { + int n = svcntw(); + float base[n]; + for (int i=0; iname; f++) printf ("\t%s\n", f->name); @@ -768,6 +719,7 @@ main (int argc, char *argv[]) conf.fenv = 1; conf.softlim = 0; conf.errlim = INFINITY; + conf.ignore_zero_sign = 0; for (;;) { argc--; @@ -807,11 +759,22 @@ main (int argc, char *argv[]) { argc--; argv++; - if (argc < 1) + if (argc < 1 || argv[0][1] != '\0') usage (); conf.rc = argv[0][0]; } break; + case 'z': + conf.ignore_zero_sign = 1; + break; +#ifdef __vpcs + case 'c': + argc--; + argv++; + fv[0] = strtof(argv[0], 0); + dv[0] = strtod(argv[0], 0); + break; +#endif default: usage (); } @@ -837,7 +800,19 @@ main (int argc, char *argv[]) if (strcmp (argv[0], f->name) == 0) break; if (!f->name) - usage (); + { +#ifndef __vpcs + /* Ignore vector math functions if vector math is not supported. */ + if (strncmp (argv[0], "_ZGVnN", 6) == 0) + exit (0); +#endif +#if !WANT_SVE_MATH + if (strncmp (argv[0], "_ZGVsMxv", 8) == 0) + exit (0); +#endif + printf ("math function %s not supported\n", argv[0]); + exit (1); + } if (!f->singleprec && LDBL_MANT_DIG == DBL_MANT_DIG) conf.mpfr = 1; /* Use mpfr if long double has no extra precision. */ if (!USE_MPFR && conf.mpfr) diff --git a/math/test/ulp.h b/math/test/ulp.h index a0c301664321067789322ba64932130fffa37000..b0bc59aeef8ddbd712d731e3e8d6635254fa7e88 100644 --- a/math/test/ulp.h +++ b/math/test/ulp.h @@ -1,8 +1,8 @@ /* * Generic functions for ULP error estimation. * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* For each different math function type, @@ -37,7 +37,8 @@ static int RT(ulpscale_mpfr) (mpfr_t x, int t) /* Difference between exact result and closest real number that gets rounded to got, i.e. error before rounding, for a correctly rounded result the difference is 0. */ -static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r) +static double RT (ulperr) (RT (float) got, const struct RT (ret) * p, int r, + int ignore_zero_sign) { RT(float) want = p->y; RT(float) d; @@ -45,10 +46,18 @@ static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r) if (RT(asuint) (got) == RT(asuint) (want)) return 0.0; + if (isnan (got) && isnan (want)) + /* Ignore sign of NaN. */ + return RT (issignaling) (got) == RT (issignaling) (want) ? 0 : INFINITY; if (signbit (got) != signbit (want)) - /* May have false positives with NaN. */ - //return isnan(got) && isnan(want) ? 0 : INFINITY; - return INFINITY; + { + /* Fall through to ULP calculation if ignoring sign of zero and at + exactly one of want and got is non-zero. 
*/ + if (ignore_zero_sign && want == got) + return 0.0; + if (!ignore_zero_sign || (want != 0 && got != 0)) + return INFINITY; + } if (!isfinite (want) || !isfinite (got)) { if (isnan (got) != isnan (want)) @@ -114,8 +123,12 @@ static inline void T(call_fenv) (const struct fun *f, struct T(args) a, int r, static inline void T(call_nofenv) (const struct fun *f, struct T(args) a, int r, RT(float) * y, int *ex) { + if (r != FE_TONEAREST) + fesetround (r); *y = T(call) (f, a); *ex = 0; + if (r != FE_TONEAREST) + fesetround (FE_TONEAREST); } static inline int T(call_long_fenv) (const struct fun *f, struct T(args) a, @@ -155,8 +168,12 @@ static inline int T(call_long_nofenv) (const struct fun *f, struct T(args) a, int r, struct RT(ret) * p, RT(float) ygot, int exgot) { + if (r != FE_TONEAREST) + fesetround (r); RT(double) yl = T(call_long) (f, a); p->y = (RT(float)) yl; + if (r != FE_TONEAREST) + fesetround (FE_TONEAREST); if (RT(isok_nofenv) (ygot, p->y)) return 1; p->ulpexp = RT(ulpscale) (p->y); @@ -288,7 +305,7 @@ static int T(cmp) (const struct fun *f, struct gen *gen, if (!ok) { int print = 0; - double err = RT(ulperr) (ygot, &want, r); + double err = RT (ulperr) (ygot, &want, r, conf->ignore_zero_sign); double abserr = fabs (err); // TODO: count errors below accuracy limit. if (abserr > 0) diff --git a/math/test/ulp_funcs.h b/math/test/ulp_funcs.h new file mode 100644 index 0000000000000000000000000000000000000000..84f7927d393548617c480517b6709b875b0de70b --- /dev/null +++ b/math/test/ulp_funcs.h @@ -0,0 +1,40 @@ +/* + * Function entries for ulp. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +/* clang-format off */ + F1 (sin) + F1 (cos) + F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0) + F (sincosf_cosf, sincosf_cosf, sincos_cos, sincos_mpfr_cos, 1, 1, f1, 0) + F1 (exp) + F1 (exp2) + F1 (log) + F1 (log2) + F2 (pow) + F1 (erf) + D1 (exp) + D1 (exp10) + D1 (exp2) + D1 (log) + D1 (log2) + D2 (pow) + D1 (erf) +#ifdef __vpcs + F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1) + F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1) + F (_ZGVnN4v_expf_1u, Z_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) + F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1) + F (_ZGVnN4v_exp2f_1u, Z_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1) + F (_ZGVnN4v_exp2f, Z_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) + F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1) + F (_ZGVnN4vv_powf, Z_powf, pow, mpfr_pow, 2, 1, f2, 1) + F (_ZGVnN2v_sin, Z_sin, sinl, mpfr_sin, 1, 0, d1, 1) + F (_ZGVnN2v_cos, Z_cos, cosl, mpfr_cos, 1, 0, d1, 1) + F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1) + F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1) + F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1) +#endif +/* clang-format on */ diff --git a/math/test/ulp_wrappers.h b/math/test/ulp_wrappers.h new file mode 100644 index 0000000000000000000000000000000000000000..60dc3d6dd652875043118f36f38e64ebedf5ab4a --- /dev/null +++ b/math/test/ulp_wrappers.h @@ -0,0 +1,37 @@ +/* + * Function wrappers for ulp. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* clang-format off */ + +/* Wrappers for sincos. 
*/ +static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);} +static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);} +static double sincos_sin(double x) {(void)cos(x); return sin(x);} +static double sincos_cos(double x) {(void)sin(x); return cos(x);} +#if USE_MPFR +static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); } +static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); } +#endif + +/* Wrappers for vector functions. */ +#ifdef __vpcs +static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; } +static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; } +static float Z_expf_1u(float x) { return _ZGVnN4v_expf_1u(argf(x))[0]; } +static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; } +static float Z_exp2f_1u(float x) { return _ZGVnN4v_exp2f_1u(argf(x))[0]; } +static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; } +static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; } +static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; } +static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; } +static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; } +static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; } +static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; } +static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; } +#endif + +/* clang-format on */ diff --git a/math/tgamma128.c b/math/tgamma128.c new file mode 100644 index 0000000000000000000000000000000000000000..dda0da7e8adb4a7fa3b78826316f8fd8a4fae12a --- /dev/null +++ b/math/tgamma128.c @@ -0,0 +1,351 @@ +/* + * Implementation of the true gamma function (as opposed to lgamma) + * for 128-bit long double. + * + * Copyright (c) 2006,2009,2023 Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* + * This module implements the float128 gamma function under the name + * tgamma128. It's expected to be suitable for integration into system + * maths libraries under the standard name tgammal, if long double is + * 128-bit. Such a library will probably want to check the error + * handling and optimize the initial process of extracting the + * exponent, which is done here by simple and portable (but + * potentially slower) methods. + */ + +#include +#include +#include +#include + +#include "tgamma128.h" + +#define lenof(x) (sizeof(x)/sizeof(*(x))) + +/* + * Helper routine to evaluate a polynomial via Horner's rule + */ +static long double poly(const long double *coeffs, size_t n, long double x) +{ + long double result = coeffs[--n]; + + while (n > 0) + result = (result * x) + coeffs[--n]; + + return result; +} + +/* + * Compute sin(pi*x) / pi, for use in the reflection formula that + * relates gamma(-x) and gamma(x). + */ +static long double sin_pi_x_over_pi(long double x) +{ + int quo; + long double fracpart = remquol(x, 0.5L, &quo); + + long double sign = 1.0L; + if (quo & 2) + sign = -sign; + quo &= 1; + + if (quo == 0 && fabsl(fracpart) < 0x1.p-58L) { + /* For numbers this size, sin(pi*x) is so close to pi*x that + * sin(pi*x)/pi is indistinguishable from x in float128 */ + return sign * fracpart; + } + + if (quo == 0) { + return sign * sinl(pi*fracpart) / pi; + } else { + return sign * cosl(pi*fracpart) / pi; + } +} + +/* Return tgamma(x) on the assumption that x >= 8. 
*/ +static long double tgamma_large(long double x, + bool negative, long double negadjust) +{ + /* + * In this range we compute gamma(x) as x^(x-1/2) * e^-x * K, + * where K is a correction factor computed as a polynomial in 1/x. + * + * (Vaguely inspired by the form of the Lanczos approximation, but + * I tried the Lanczos approximation itself and it suffers badly + * from big cancellation leading to loss of significance.) + */ + long double t = 1/x; + long double p = poly(coeffs_large, lenof(coeffs_large), t); + + /* + * To avoid overflow in cases where x^(x-0.5) does overflow + * but gamma(x) does not, we split x^(x-0.5) in half and + * multiply back up _after_ multiplying the shrinking factor + * of exp(-(x-0.5)). + * + * Note that computing x-0.5 and (x-0.5)/2 is exact for the + * relevant range of x, so the only sources of error are pow + * and exp themselves, plus the multiplications. + */ + long double powhalf = powl(x, (x-0.5L)/2.0L); + long double expret = expl(-(x-0.5L)); + + if (!negative) { + return (expret * powhalf) * powhalf * p; + } else { + /* + * Apply the reflection formula as commented below, but + * carefully: negadjust has magnitude less than 1, so it can + * turn a case where gamma(+x) would overflow into a case + * where gamma(-x) doesn't underflow. Not only that, but the + * FP format has greater range in the tiny domain due to + * denormals. For both reasons, it's not good enough to + * compute the positive result and then adjust it. + */ + long double ret = 1 / ((expret * powhalf) * (x * negadjust) * p); + return ret / powhalf; + } +} + +/* Return tgamma(x) on the assumption that 0 <= x < 1/32. */ +static long double tgamma_tiny(long double x, + bool negative, long double negadjust) +{ + /* + * For x near zero, we use a polynomial approximation to + * g = 1/(x*gamma(x)), and then return 1/(g*x). + */ + long double g = poly(coeffs_tiny, lenof(coeffs_tiny), x); + if (!negative) + return 1.0L / (g*x); + else + return g / negadjust; +} + +/* Return tgamma(x) on the assumption that 0 <= x < 2^-113. */ +static long double tgamma_ultratiny(long double x, bool negative, + long double negadjust) +{ + /* On this interval, gamma can't even be distinguished from 1/x, + * so we skip the polynomial evaluation in tgamma_tiny, partly to + * save time and partly to avoid the tiny intermediate values + * setting the underflow exception flag. */ + if (!negative) + return 1.0L / x; + else + return 1.0L / negadjust; +} + +/* Return tgamma(x) on the assumption that 1 <= x <= 2. */ +static long double tgamma_central(long double x) +{ + /* + * In this central interval, our strategy is to finding the + * difference between x and the point where gamma has a minimum, + * and approximate based on that. + */ + + /* The difference between the input x and the minimum x. The first + * subtraction is expected to be exact, since x and min_hi have + * the same exponent (unless x=2, in which case it will still be + * exact). */ + long double t = (x - min_x_hi) - min_x_lo; + + /* + * Now use two different polynomials for the intervals [1,m] and + * [m,2]. + */ + long double p; + if (t < 0) + p = poly(coeffs_central_neg, lenof(coeffs_central_neg), -t); + else + p = poly(coeffs_central_pos, lenof(coeffs_central_pos), t); + + return (min_y_lo + p * (t*t)) + min_y_hi; +} + +long double tgamma128(long double x) +{ + /* + * Start by extracting the number's sign and exponent, and ruling + * out cases of non-normalized numbers. 
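For reference, the approximation tgamma_large computes above is effectively

  gamma(x) ~= x^(x-1/2) * exp(-(x-1/2)) * P(1/x)

and Stirling's series predicts the constant term of the correction factor: P(0) should equal sqrt(2*pi/e) ~= 1.5203469011. Indeed coeffs_large[0] = 0x1.8535745aa795...p+0 ~= 1.5203469 — a consistency check worked out here, not a statement made by the patch itself.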
+ * + * For an implementation integrated into a system libm, it would + * almost certainly be quicker to do this by direct bitwise access + * to the input float128 value, using whatever is the local idiom + * for knowing its endianness. + * + * Integration into a system libc may also need to worry about + * setting errno, if that's the locally preferred way to report + * math.h errors. + */ + int sign = signbit(x); + int exponent; + switch (fpclassify(x)) { + case FP_NAN: + return x+x; /* propagate QNaN, make SNaN throw an exception */ + case FP_ZERO: + return 1/x; /* divide by zero on purpose to indicate a pole */ + case FP_INFINITE: + if (sign) { + return x-x; /* gamma(-inf) has indeterminate sign, so provoke an + * IEEE invalid operation exception to indicate that */ + } + return x; /* but gamma(+inf) is just +inf with no error */ + case FP_SUBNORMAL: + exponent = -16384; + break; + default: + frexpl(x, &exponent); + exponent--; + break; + } + + bool negative = false; + long double negadjust = 0.0L; + + if (sign) { + /* + * Euler's reflection formula is + * + * gamma(1-x) gamma(x) = pi/sin(pi*x) + * + * pi + * => gamma(x) = -------------------- + * gamma(1-x) sin(pi*x) + * + * But computing 1-x is going to lose a lot of accuracy when x + * is very small, so instead we transform using the recurrence + * gamma(t+1)=t gamma(t). Setting t=-x, this gives us + * gamma(1-x) = -x gamma(-x), so we now have + * + * pi + * gamma(x) = ---------------------- + * -x gamma(-x) sin(pi*x) + * + * which relates gamma(x) to gamma(-x), which is much nicer, + * since x can be turned into -x without rounding. + */ + negadjust = sin_pi_x_over_pi(x); + negative = true; + x = -x; + + /* + * Now the ultimate answer we want is + * + * 1 / (gamma(x) * x * negadjust) + * + * where x is the positive value we've just turned it into. + * + * For some of the cases below, we'll compute gamma(x) + * normally and then compute this adjusted value afterwards. + * But for others, we can implement the reciprocal operation + * in this formula by _avoiding_ an inversion that the + * sub-case was going to do anyway. + */ + + if (negadjust == 0) { + /* + * Special case for negative integers. Applying the + * reflection formula would cause division by zero, but + * standards would prefer we treat this error case as an + * invalid operation and return NaN instead. (Possibly + * because otherwise you'd have to decide which sign of + * infinity to return, and unlike the x=0 case, there's no + * sign of zero available to disambiguate.) + */ + return negadjust / negadjust; + } + } + + /* + * Split the positive domain into various cases. For cases where + * we do the negative-number adjustment the usual way, we'll leave + * the answer in 'g' and drop out of the if statement. + */ + long double g; + + if (exponent >= 11) { + /* + * gamma of any positive value this large overflows, and gamma + * of any negative value underflows. 
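A hand-worked check of the reflection path above (not part of the patch): for input x = -0.5, negadjust = sin(-pi/2)/pi = -1/pi, and x is then flipped to +0.5. The [1/32,1) branch further below computes g = tgamma_central(1.5)/0.5 = gamma(0.5) = sqrt(pi), so the final reflection step returns

  1 / (g * x * negadjust) = 1 / (sqrt(pi) * 0.5 * (-1/pi)) = -2*sqrt(pi) ~= -3.5449

which is exactly tgamma(-0.5).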
+ */ + if (!negative) { + long double huge = 0x1p+12288L; + return huge * huge; /* provoke an overflow */ + } else { + long double tiny = 0x1p-12288L; + return tiny * tiny * negadjust; /* underflow, of the right sign */ + } + } else if (exponent >= 3) { + /* Negative-number adjustment happens inside here */ + return tgamma_large(x, negative, negadjust); + } else if (exponent < -113) { + /* Negative-number adjustment happens inside here */ + return tgamma_ultratiny(x, negative, negadjust); + } else if (exponent < -5) { + /* Negative-number adjustment happens inside here */ + return tgamma_tiny(x, negative, negadjust); + } else if (exponent == 0) { + g = tgamma_central(x); + } else if (exponent < 0) { + /* + * For x in [1/32,1) we range-reduce upwards to the interval + * [1,2), using the inverse of the normal recurrence formula: + * gamma(x) = gamma(x+1)/x. + */ + g = tgamma_central(1+x) / x; + } else { + /* + * For x in [2,8) we range-reduce downwards to the interval + * [1,2) by repeated application of the recurrence formula. + * + * Actually multiplying (x-1) by (x-2) by (x-3) and so on + * would introduce multiple ULPs of rounding error. We can get + * better accuracy by writing x = (k+1/2) + t, where k is an + * integer and |t|<1/2, and expanding out the obvious factor + * (x-1)(x-2)...(x-k+1) as a polynomial in t. + */ + long double mult; + int i = x; + if (i == 2) { /* x in [2,3) */ + mult = (x-1); + } else { + long double t = x - (i + 0.5L); + switch (i) { + /* E.g. for x=3.5+t, we want + * (x-1)(x-2) = (2.5+t)(1.5+t) = 3.75 + 4t + t^2 */ + case 3: + mult = 3.75L+t*(4.0L+t); + break; + case 4: + mult = 13.125L+t*(17.75L+t*(7.5L+t)); + break; + case 5: + mult = 59.0625L+t*(93.0L+t*(51.50L+t*(12.0L+t))); + break; + case 6: + mult = 324.84375L+t*(570.5625L+t*(376.250L+t*( + 117.5L+t*(17.5L+t)))); + break; + case 7: + mult = 2111.484375L+t*(4033.5L+t*(3016.1875L+t*( + 1140.0L+t*(231.25L+t*(24.0L+t))))); + break; + } + } + + g = tgamma_central(x - (i-1)) * mult; + } + + if (!negative) { + /* Positive domain: return g unmodified */ + return g; + } else { + /* Negative domain: apply the reflection formula as commented above */ + return 1.0L / (g * x * negadjust); + } +} diff --git a/math/tgamma128.h b/math/tgamma128.h new file mode 100644 index 0000000000000000000000000000000000000000..ced10c3cc34ca26bcb3d6d8b31899ef9c3f35b15 --- /dev/null +++ b/math/tgamma128.h @@ -0,0 +1,141 @@ +/* + * Polynomial coefficients and other constants for tgamma128.c. + * + * Copyright (c) 2006,2009,2023 Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* The largest positive value for which 128-bit tgamma does not overflow. 
*/ +static const long double max_x = 0x1.b6e3180cd66a5c4206f128ba77f4p+10L; + +/* Coefficients of the polynomial used in the tgamma_large() subroutine */ +static const long double coeffs_large[] = { + 0x1.8535745aa79569579b9eec0f3bbcp+0L, + 0x1.0378f83c6fb8f0e51269f2b4a973p-3L, + 0x1.59f6a05094f69686c3380f4e2783p-8L, + -0x1.0b291dee952a82764a4859b081a6p-8L, + -0x1.6dd301b2205bf936b5a3eaad0dbbp-12L, + 0x1.387a8b5f38dd77e7f139b1021e86p-10L, + 0x1.bca46637f65b13750c728cc29e40p-14L, + -0x1.d80401c00aef998c9e303151a51cp-11L, + -0x1.49cb6bb09f935a2053ccc2cf3711p-14L, + 0x1.4e950204437dcaf2be77f73a6f45p-10L, + 0x1.cb711a2d65f188bf60110934d6bep-14L, + -0x1.7d7ff4bc95dc7faefc5e767f70f1p-9L, + -0x1.0305ab9760cddb0d833e73766836p-12L, + 0x1.3ef6c84bf1cd5c3f65ac2693bb5bp-7L, + 0x1.bb4144740ad9290123fdcea684aap-11L, + -0x1.72ab4e88272a229bfafd192450f0p-5L, + 0x1.80c70ac6eb3b7a698983d25a62b8p-12L, + 0x1.e222791c6743ce3e3cae220fb236p-3L, + 0x1.1a2dca1c82a9326c52b465f7cb7ap-2L, + -0x1.9d204fa235a42cd901b123d2ad47p+1L, + 0x1.55b56d1158f77ddb1c95fc44ab02p+0L, + 0x1.37f900a11dbd892abd7dde533e2dp+5L, + -0x1.2da49f4188dd89cb958369ef2401p+7L, + 0x1.fdae5ec3ec6eb7dffc09edbe6c14p+7L, + -0x1.61433cebe649098c9611c4c7774ap+7L, +}; + +/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */ +static const long double coeffs_tiny[] = { + 0x1.0000000000000000000000000000p+0L, + 0x1.2788cfc6fb618f49a37c7f0201fep-1L, + -0x1.4fcf4026afa2dceb8490ade22796p-1L, + -0x1.5815e8fa27047c8f42b5d9217244p-5L, + 0x1.5512320b43fbe5dfa771333518f7p-3L, + -0x1.59af103c340927bffdd44f954bfcp-5L, + -0x1.3b4af28483e210479657e5543366p-7L, + 0x1.d919c527f6070bfce9b29c2ace9cp-8L, + -0x1.317112ce35337def3556a18aa178p-10L, + -0x1.c364fe77a6f27677b985b1fa2e1dp-13L, + 0x1.0c8a7a19a3fd40fe1f7e867efe7bp-13L, + -0x1.51cf9f090b5dc398ba86305e3634p-16L, + -0x1.4e80f64c04a339740de06ca9fa4ap-20L, + 0x1.241ddc2aef2ec20e58b08f2fda17p-20L, +}; + +/* The location within the interval [1,2] where gamma has a minimum. + * Specified as the sum of two 128-bit values, for extra precision. */ +static const long double min_x_hi = 0x1.762d86356be3f6e1a9c8865e0a4fp+0L; +static const long double min_x_lo = 0x1.ac54d7d218de21303a7c60f08840p-118L; + +/* The actual minimum value that gamma takes at that location. + * Again specified as the sum of two 128-bit values. 
*/ +static const long double min_y_hi = 0x1.c56dc82a74aee8d8851566d40f32p-1L; +static const long double min_y_lo = 0x1.8ed98685742c353ce55e5794686fp-114L; + +/* Coefficients of the polynomial used in the tgamma_central() subroutine + * for computing gamma on the interval [1,min_x] */ +static const long double coeffs_central_neg[] = { + 0x1.b6c53f7377b83839c8a292e43b69p-2L, + 0x1.0bae9f40c7d09ed76e732045850ap-3L, + 0x1.4981175e14d04c3530e51d01c5fep-3L, + 0x1.79f77aaf032c948af3a9edbd2061p-4L, + 0x1.1e97bd10821095a5b79fbfdfa1a3p-4L, + 0x1.8071ce0935e4dcf0b33b0fbec7c1p-5L, + 0x1.0b44c2f92982f887b55ec36dfdb0p-5L, + 0x1.6df1de1e178ef72ca7bd63d40870p-6L, + 0x1.f63f502bde27e81c0f5e13479b43p-7L, + 0x1.57fd67d901f40ea011353ad89a0ap-7L, + 0x1.d7151376eed187eb753e2273cafcp-8L, + 0x1.427162b5c6ff1d904c71ef53e37cp-8L, + 0x1.b954b8c3a56cf93e49ef6538928ap-9L, + 0x1.2dff2ec26a3ae5cd3aaccae7a09ep-9L, + 0x1.9d35250d9b9378d9b59df734537ap-10L, + 0x1.1b2c0c48b9855a28f6dbd6fdff3cp-10L, + 0x1.7e0db39bb99cdb52b028d9359380p-11L, + 0x1.2164b5e1d364a0b5eaf97c436aa7p-11L, + 0x1.27521cf5fd24dcdf43524e6add11p-13L, + 0x1.06461d62243bf9a826b42349672fp-10L, + -0x1.2b852abead28209b4e0c756dc46ep-9L, + 0x1.be673c11a72c826115ec6d286c14p-8L, + -0x1.fd9ce330c215c31fcd3cb53c42ebp-7L, + 0x1.fa362bd2dc68f41abef2d8600acdp-6L, + -0x1.a21585b2f52f8b23855de8e452edp-5L, + 0x1.1f234431ed032052fc92e64e0493p-4L, + -0x1.40d332476ca0199c60cdae3f9132p-4L, + 0x1.1d45dc665d86012eba2eea199cefp-4L, + -0x1.8491016cdd08dc9be7ade9b5fef3p-5L, + 0x1.7e7e2fbc6d49ad484300d6add324p-6L, + -0x1.e63fe3f874a37276a8d7d8b705ecp-8L, + 0x1.30a2a73944f8c84998314d69c23fp-10L, +}; + +/* Coefficients of the polynomial used in the tgamma_central() subroutine + * for computing gamma on the interval [min_x,2] */ +static const long double coeffs_central_pos[] = { + 0x1.b6c53f7377b83839c8a292e22aa2p-2L, + -0x1.0bae9f40c7d09ed76e72e1c955dep-3L, + 0x1.4981175e14d04c3530ee5e1ecebcp-3L, + -0x1.79f77aaf032c948ac983d77f3e07p-4L, + 0x1.1e97bd10821095ab7dc94936cc11p-4L, + -0x1.8071ce0935e4d7edef8cbf2a1cf1p-5L, + 0x1.0b44c2f929837fafef7b5d9e80f1p-5L, + -0x1.6df1de1e175fe2a51faa25cddbb4p-6L, + 0x1.f63f502be57d11aed2cfe90843ffp-7L, + -0x1.57fd67d852f230015b9f64770273p-7L, + 0x1.d715138adc07e5fce81077070357p-8L, + -0x1.4271618e9fda8992a667adb15f4fp-8L, + 0x1.b954d15d9eb772e80fdd760672d7p-9L, + -0x1.2dfe391241d3cb79c8c15182843dp-9L, + 0x1.9d44396fcd48451c3ba924cee814p-10L, + -0x1.1ac195fb99739e341589e39803e6p-10L, + 0x1.82e46127b68f002770826e25f146p-11L, + -0x1.089dacd90d9f41493119ac178359p-11L, + 0x1.6993c007b20394a057d21f3d37f8p-12L, + -0x1.ec43a709f4446560c099dec8e31bp-13L, + 0x1.4ba36322f4074e9add9450f003cap-13L, + -0x1.b3f83a977965ca1b7937bf5b34cap-14L, + 0x1.10af346abc09cb25a6d9fe810b6ep-14L, + -0x1.38d8ea1188f242f50203edc395bdp-15L, + 0x1.39add987a948ec56f62b721a4475p-16L, + -0x1.02a4e141f286c8a967e2df9bc9adp-17L, + 0x1.433b50af22425f546e87113062d7p-19L, + -0x1.0c7b73cb0013f00aafc103e8e382p-21L, + 0x1.b852de313ec38da2297f6deaa6b4p-25L, +}; + +/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine + */ +static const long double pi = 0x1.921fb54442d18469898cc51701b8p+1L; diff --git a/math/tools/cos.sollya b/math/tools/cos.sollya index bd72d6b7482089d27bce02848b85b074e4b737b3..6690adfcbb9b8e57cfb5e11ca73fa52594a8443c 100644 --- a/math/tools/cos.sollya +++ b/math/tools/cos.sollya @@ -1,7 +1,7 @@ // polynomial for approximating cos(x) // // Copyright (c) 2019, Arm Limited. 
-// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 8; // polynomial degree a = -pi/4; // interval diff --git a/math/tools/exp.sollya b/math/tools/exp.sollya index b7a462cda5a4f8efb571c3ce3c296d42bb7d7e98..0668bdb5b3d30a088e09b38f099824e91368a237 100644 --- a/math/tools/exp.sollya +++ b/math/tools/exp.sollya @@ -1,7 +1,7 @@ // polynomial for approximating e^x // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 5; // poly degree N = 128; // table entries diff --git a/math/tools/exp2.sollya b/math/tools/exp2.sollya index e760769601d40009575d6b121e969e7c09749acb..bd0a42d6bbcbc0c66157c423d19a2a26970eecd5 100644 --- a/math/tools/exp2.sollya +++ b/math/tools/exp2.sollya @@ -1,7 +1,7 @@ // polynomial for approximating 2^x // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception // exp2f parameters deg = 3; // poly degree diff --git a/math/tools/log.sollya b/math/tools/log.sollya index 6df4db44b6f30133e38fa46a0824ea1356313fb1..5288f557292570e5f54ef2e80083407e51c82c41 100644 --- a/math/tools/log.sollya +++ b/math/tools/log.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 12; // poly degree // |log(1+x)| > 0x1p-4 outside the interval diff --git a/math/tools/log2.sollya b/math/tools/log2.sollya index 4a364c0f111ff6acebfb0782b472b1500218187e..85811be5d90c9bb5acdee32f5dcfe6d3a2989514 100644 --- a/math/tools/log2.sollya +++ b/math/tools/log2.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log2(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 11; // poly degree // |log2(1+x)| > 0x1p-4 outside the interval diff --git a/math/tools/log2_abs.sollya b/math/tools/log2_abs.sollya index 82c4dac26fa128d98f0905b12166efee28f0f180..d018ba0145d24d0d095b4393ff9263bcff89cdb0 100644 --- a/math/tools/log2_abs.sollya +++ b/math/tools/log2_abs.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log2(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 7; // poly degree // interval ~= 1/(2*N), where N is the table entries diff --git a/math/tools/log_abs.sollya b/math/tools/log_abs.sollya index a2ac190fc49702e362decc43aafa5240d15730f5..5f9bfe41a6830f5a4ae4028bba223c0186d9c9ee 100644 --- a/math/tools/log_abs.sollya +++ b/math/tools/log_abs.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 6; // poly degree // interval ~= 1/(2*N), where N is the table entries diff --git a/math/tools/plot.py b/math/tools/plot.py index 6c8b89ff284b5a6e220d940fa06d0d56549a21c6..a0fa023225606e0b02afadede19c028d64d85d15 100755 --- a/math/tools/plot.py +++ b/math/tools/plot.py @@ -3,7 +3,7 @@ # ULP error plot tool. # # Copyright (c) 2019, Arm Limited. 
-# SPDX-License-Identifier: MIT
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/math/tools/remez.jl b/math/tools/remez.jl
index 2ff436f5287ff2d426413f6817a966ac82990439..1deab67d0660a946fac4e38d6394bae0aaeb7c98 100755
--- a/math/tools/remez.jl
+++ b/math/tools/remez.jl
@@ -4,7 +4,7 @@
 # remez.jl - implementation of the Remez algorithm for polynomial approximation
 #
 # Copyright (c) 2015-2019, Arm Limited.
-# SPDX-License-Identifier: MIT
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 import Base.\
 
diff --git a/math/tools/sin.sollya b/math/tools/sin.sollya
index a6e851145c119e9a425e6af308d01b4022be44f5..a19300019867873928cb384f28d7ede5a46155dc 100644
--- a/math/tools/sin.sollya
+++ b/math/tools/sin.sollya
@@ -1,7 +1,7 @@
 // polynomial for approximating sin(x)
 //
 // Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 deg = 7; // polynomial degree
 a = -pi/4; // interval
diff --git a/math/tools/tgamma128_gen.jl b/math/tools/tgamma128_gen.jl
new file mode 100644
index 0000000000000000000000000000000000000000..da76e8b9b84ba8f5e0290db2e38e551b26c7c332
--- /dev/null
+++ b/math/tools/tgamma128_gen.jl
@@ -0,0 +1,212 @@
+# -*- julia -*-
+#
+# Generate tgamma128.h, containing polynomials and constants used by
+# tgamma128.c.
+#
+# Copyright (c) 2006,2009,2023 Arm Limited.
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+# This Julia program depends on the 'Remez' and 'SpecialFunctions'
+# library packages. To install them, run this at the interactive Julia
+# prompt:
+#
+#     import Pkg; Pkg.add(["Remez", "SpecialFunctions"])
+#
+# Tested on Julia 1.4.1 (Ubuntu 20.04) and 1.9.0 (22.04).
+
+import Printf
+import Remez
+import SpecialFunctions
+
+# Round a BigFloat to 128-bit long double and format it as a C99 hex
+# float literal.
+function quadhex(x)
+    sign = " "
+    if x < 0
+        sign = "-"
+        x = -x
+    end
+
+    exponent = BigInt(floor(log2(x)))
+    exponent = max(exponent, -16382)
+    @assert(exponent <= 16383) # else overflow
+
+    x /= BigFloat(2)^exponent
+    @assert(1 <= x < 2)
+    x *= BigFloat(2)^112
+    mantissa = BigInt(round(x))
+
+    mantstr = string(mantissa, base=16, pad=29)
+    return Printf.@sprintf("%s0x%s.%sp%+dL", sign, mantstr[1], mantstr[2:end],
+                           exponent)
+end
+
+# Round a BigFloat to 128-bit long double and return it still as a
+# BigFloat. rnd < 0 rounds the mantissa down, rnd > 0 rounds it up,
+# and rnd == 0 rounds to nearest.
+function quadval(x, rnd=0)
+    sign = +1
+    if x.sign < 0
+        sign = -1
+        x = -x
+    end
+
+    exponent = BigInt(floor(log2(x)))
+    exponent = max(exponent, -16382)
+    @assert(exponent <= 16383) # else overflow
+
+    x /= BigFloat(2)^exponent
+    @assert(1 <= x < 2)
+    x *= BigFloat(2)^112
+    if rnd < 0
+        mantissa = floor(x)
+    elseif rnd > 0
+        mantissa = ceil(x)
+    else
+        mantissa = round(x)
+    end
+
+    return sign * mantissa * BigFloat(2)^(exponent - 112)
+end
+
+# Output an array of BigFloats as a C array declaration.
+function dumparray(a, name)
+    println("static const long double ", name, "[] = {")
+    for x in a
+        println("    ", quadhex(x), ",")
+    end
+    println("};")
+end
+
+print("/*
+ * Polynomial coefficients and other constants for tgamma128.c.
+ *
+ * Copyright (c) 2006,2009,2023 Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+")
+
+Base.MPFR.setprecision(512)
+
+e = exp(BigFloat(1))
+
+print("
+/* The largest positive value for which 128-bit tgamma does not overflow.
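+ * (That is, the largest x with lgamma(x) < 16384*log(2), so that gamma(x)
+ * stays below 2^16384; found by bisection in tools/tgamma128_gen.jl.)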
*/ +") +lo = BigFloat("1000") +hi = BigFloat("2000") +while true + global lo + global hi + global max_x + + mid = (lo + hi) / 2 + if mid == lo || mid == hi + max_x = mid + break + end + if SpecialFunctions.logabsgamma(mid)[1] < 16384 * log(BigFloat(2)) + lo = mid + else + hi = mid + end +end +max_x = quadval(max_x, -1) +println("static const long double max_x = ", quadhex(max_x), ";") + +print(" +/* Coefficients of the polynomial used in the tgamma_large() subroutine */ +") +N, D, E, X = Remez.ratfn_minimax( + x -> x==0 ? sqrt(BigFloat(2)*pi/e) : + exp(SpecialFunctions.logabsgamma(1/x)[1] + + (1/x-0.5)*(1+log(x))), + (0, 1/BigFloat(8)), + 24, 0, + (x, y) -> 1/y +) +dumparray(N, "coeffs_large") + +print(" +/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */ +") +N, D, E, X = Remez.ratfn_minimax( + x -> x==0 ? 1 : 1/(x*SpecialFunctions.gamma(x)), + (0, 1/BigFloat(32)), + 13, 0, +) +dumparray(N, "coeffs_tiny") + +print(" +/* The location within the interval [1,2] where gamma has a minimum. + * Specified as the sum of two 128-bit values, for extra precision. */ +") +lo = BigFloat("1.4") +hi = BigFloat("1.5") +while true + global lo + global hi + global min_x + + mid = (lo + hi) / 2 + if mid == lo || mid == hi + min_x = mid + break + end + if SpecialFunctions.digamma(mid) < 0 + lo = mid + else + hi = mid + end +end +min_x_hi = quadval(min_x, -1) +println("static const long double min_x_hi = ", quadhex(min_x_hi), ";") +println("static const long double min_x_lo = ", quadhex(min_x - min_x_hi), ";") + +print(" +/* The actual minimum value that gamma takes at that location. + * Again specified as the sum of two 128-bit values. */ +") +min_y = SpecialFunctions.gamma(min_x) +min_y_hi = quadval(min_y, -1) +println("static const long double min_y_hi = ", quadhex(min_y_hi), ";") +println("static const long double min_y_lo = ", quadhex(min_y - min_y_hi), ";") + +function taylor_bodge(x) + # Taylor series generated by Wolfram Alpha for (gamma(min_x+x)-min_y)/x^2. + # Used in the Remez calls below for x values very near the origin, to avoid + # significance loss problems when trying to compute it directly via that + # formula (even in MPFR's extra precision). + return BigFloat("0.428486815855585429730209907810650582960483696962660010556335457558784421896667728014324097132413696263704801646004585959298743677879606168187061990204432200")+x*(-BigFloat("0.130704158939785761928008749242671025181542078105370084716141350308119418619652583986015464395882363802104154017741656168641240436089858504560718773026275797")+x*(BigFloat("0.160890753325112844190519489594363387594505844658437718135952967735294789599989664428071656484587979507034160383271974554122934842441540146372016567834062876")+x*(-BigFloat("0.092277030213334350126864106458600575084335085690780082222880945224248438672595248111704471182201673989215223667543694847795410779036800385804729955729659506")))) +end + +print(" +/* Coefficients of the polynomial used in the tgamma_central() subroutine + * for computing gamma on the interval [1,min_x] */ +") +N, D, E, X = Remez.ratfn_minimax( + x -> x < BigFloat(0x1p-64) ? taylor_bodge(-x) : + (SpecialFunctions.gamma(min_x - x) - min_y) / (x*x), + (0, min_x - 1), + 31, 0, + (x, y) -> x^2, +) +dumparray(N, "coeffs_central_neg") + +print(" +/* Coefficients of the polynomial used in the tgamma_central() subroutine + * for computing gamma on the interval [min_x,2] */ +") +N, D, E, X = Remez.ratfn_minimax( + x -> x < BigFloat(0x1p-64) ? 
taylor_bodge(x) : + (SpecialFunctions.gamma(min_x + x) - min_y) / (x*x), + (0, 2 - min_x), + 28, 0, + (x, y) -> x^2, +) +dumparray(N, "coeffs_central_pos") + +print(" +/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine + */ +") +println("static const long double pi = ", quadhex(BigFloat(pi)), ";") diff --git a/math/tools/v_exp.sollya b/math/tools/v_exp.sollya index c0abb63fb642a58ca023eb242a010b1a418e15fe..5fa7de7435a9863d3b9511cdff140977165a8333 100644 --- a/math/tools/v_exp.sollya +++ b/math/tools/v_exp.sollya @@ -1,7 +1,7 @@ // polynomial for approximating e^x // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 4; // poly degree N = 128; // table entries diff --git a/math/tools/v_log.sollya b/math/tools/v_log.sollya index cc3d2c4ae72a1b860313625a9771e2fc1e19e93b..d982524eb920f0e581fd2b9221d493364a26bd37 100644 --- a/math/tools/v_log.sollya +++ b/math/tools/v_log.sollya @@ -1,7 +1,7 @@ // polynomial used for __v_log(x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 6; // poly degree a = -0x1.fc1p-9; diff --git a/math/tools/v_sin.sollya b/math/tools/v_sin.sollya index 65cc9957c624a6fd09a32762d7f4d7296e0b8319..63b9d65a1ac35a14b98a8dcab6a00637d35db4fb 100644 --- a/math/tools/v_sin.sollya +++ b/math/tools/v_sin.sollya @@ -1,7 +1,7 @@ // polynomial for approximating sin(x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 15; // polynomial degree a = -pi/2; // interval diff --git a/math/v_cos.c b/math/v_cos.c deleted file mode 100644 index 20ba6bd0d0d9a4a5e98f56f0e374f22a88df2f7a..0000000000000000000000000000000000000000 --- a/math/v_cos.c +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Double-precision vector cos function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const double Poly[] = { -/* worst-case error is 3.5 ulp. - abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. */ --0x1.9f4a9c8b21dc9p-41, - 0x1.60e88a10163f2p-33, --0x1.ae6361b7254e7p-26, - 0x1.71de382e8d62bp-19, --0x1.a01a019aeb4ffp-13, - 0x1.111111110b25ep-7, --0x1.55555555554c3p-3, -}; - -#define C7 v_f64 (Poly[0]) -#define C6 v_f64 (Poly[1]) -#define C5 v_f64 (Poly[2]) -#define C4 v_f64 (Poly[3]) -#define C3 v_f64 (Poly[4]) -#define C2 v_f64 (Poly[5]) -#define C1 v_f64 (Poly[6]) - -#define InvPi v_f64 (0x1.45f306dc9c883p-2) -#define HalfPi v_f64 (0x1.921fb54442d18p+0) -#define Pi1 v_f64 (0x1.921fb54442d18p+1) -#define Pi2 v_f64 (0x1.1a62633145c06p-53) -#define Pi3 v_f64 (0x1.c1cd129024e09p-106) -#define Shift v_f64 (0x1.8p52) -#define RangeVal v_f64 (0x1p23) -#define AbsMask v_u64 (0x7fffffffffffffff) - -VPCS_ATTR -__attribute__ ((noinline)) static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - return v_call_f64 (cos, x, y, cmp); -} - -VPCS_ATTR -v_f64_t -V_NAME(cos) (v_f64_t x) -{ - v_f64_t n, r, r2, y; - v_u64_t odd, cmp; - - r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask); - cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal)); - - /* n = rint((|x|+pi/2)/pi) - 0.5. */ - n = v_fma_f64 (InvPi, r + HalfPi, Shift); - odd = v_as_u64_f64 (n) << 63; - n -= Shift; - n -= v_f64 (0.5); - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). 
*/ - r = v_fma_f64 (-Pi1, n, r); - r = v_fma_f64 (-Pi2, n, r); - r = v_fma_f64 (-Pi3, n, r); - - /* sin(r) poly approx. */ - r2 = r * r; - y = v_fma_f64 (C7, r2, C6); - y = v_fma_f64 (y, r2, C5); - y = v_fma_f64 (y, r2, C4); - y = v_fma_f64 (y, r2, C3); - y = v_fma_f64 (y, r2, C2); - y = v_fma_f64 (y, r2, C1); - y = v_fma_f64 (y * r2, r, r); - - /* sign. */ - y = v_as_f64_u64 (v_as_u64_f64 (y) ^ odd); - - if (unlikely (v_any_u64 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/math/v_cosf.c b/math/v_cosf.c deleted file mode 100644 index 150294b8845e735c06423bdbc9e78fe6bb567b06..0000000000000000000000000000000000000000 --- a/math/v_cosf.c +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Single-precision vector cos function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* 1.886 ulp error */ - 0x1.5b2e76p-19f, - -0x1.9f42eap-13f, - 0x1.110df4p-7f, - -0x1.555548p-3f, -}; -#define Pi1 v_f32 (0x1.921fb6p+1f) -#define Pi2 v_f32 (-0x1.777a5cp-24f) -#define Pi3 v_f32 (-0x1.ee59dap-49f) -#define A3 v_f32 (Poly[3]) -#define A5 v_f32 (Poly[2]) -#define A7 v_f32 (Poly[1]) -#define A9 v_f32 (Poly[0]) -#define RangeVal v_f32 (0x1p20f) -#define InvPi v_f32 (0x1.45f306p-2f) -#define Shift v_f32 (0x1.8p+23f) -#define AbsMask v_u32 (0x7fffffff) -#define HalfPi v_f32 (0x1.921fb6p0f) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) -{ - /* Fall back to scalar code. */ - return v_call_f32 (cosf, x, y, cmp); -} - -VPCS_ATTR -v_f32_t -V_NAME(cosf) (v_f32_t x) -{ - v_f32_t n, r, r2, y; - v_u32_t odd, cmp; - - r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask); - cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal)); - - /* n = rint((|x|+pi/2)/pi) - 0.5 */ - n = v_fma_f32 (InvPi, r + HalfPi, Shift); - odd = v_as_u32_f32 (n) << 31; - n -= Shift; - n -= v_f32 (0.5f); - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ - r = v_fma_f32 (-Pi1, n, r); - r = v_fma_f32 (-Pi2, n, r); - r = v_fma_f32 (-Pi3, n, r); - - /* y = sin(r) */ - r2 = r * r; - y = v_fma_f32 (A9, r2, A7); - y = v_fma_f32 (y, r2, A5); - y = v_fma_f32 (y, r2, A3); - y = v_fma_f32 (y * r2, r, r); - - /* sign fix */ - y = v_as_f32_u32 (v_as_u32_f32 (y) ^ odd); - - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/math/v_exp.c b/math/v_exp.c deleted file mode 100644 index e459d53fddd2509f6f8ddb69328615e8cc80b2e8..0000000000000000000000000000000000000000 --- a/math/v_exp.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Double-precision vector e^x function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED -#include "v_exp.h" - -#if V_EXP_TABLE_BITS == 7 -/* maxerr: 1.88 +0.5 ulp - rel error: 1.4337*2^-53 - abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */ -#define C1 v_f64 (0x1.ffffffffffd43p-2) -#define C2 v_f64 (0x1.55555c75adbb2p-3) -#define C3 v_f64 (0x1.55555da646206p-5) -#define InvLn2 v_f64 (0x1.71547652b82fep7) /* N/ln2. */ -#define Ln2hi v_f64 (0x1.62e42fefa39efp-8) /* ln2/N. */ -#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-63) -#elif V_EXP_TABLE_BITS == 8 -/* maxerr: 0.54 +0.5 ulp - rel error: 1.4318*2^-58 - abs error: 1.4299*2^-58 in [ -ln2/512, ln2/512 ]. 
*/ -#define C1 v_f64 (0x1.fffffffffffd4p-2) -#define C2 v_f64 (0x1.5555571d6b68cp-3) -#define C3 v_f64 (0x1.5555576a59599p-5) -#define InvLn2 v_f64 (0x1.71547652b82fep8) -#define Ln2hi v_f64 (0x1.62e42fefa39efp-9) -#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-64) -#endif - -#define N (1 << V_EXP_TABLE_BITS) -#define Tab __v_exp_data -#define IndexMask v_u64 (N - 1) -#define Shift v_f64 (0x1.8p+52) -#define Thres v_f64 (704.0) - -VPCS_ATTR -static v_f64_t -specialcase (v_f64_t s, v_f64_t y, v_f64_t n) -{ - v_f64_t absn = v_abs_f64 (n); - - /* 2^(n/N) may overflow, break it up into s1*s2. */ - v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000); - v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b); - v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b); - v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N)); - v_f64_t r1 = s1 * s1; - v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1; - return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0))); -} - -VPCS_ATTR -v_f64_t -V_NAME(exp) (v_f64_t x) -{ - v_f64_t n, r, r2, s, y, z; - v_u64_t cmp, u, e, i; - - cmp = v_cond_u64 (v_abs_f64 (x) > Thres); - - /* n = round(x/(ln2/N)). */ - z = v_fma_f64 (x, InvLn2, Shift); - u = v_as_u64_f64 (z); - n = z - Shift; - - /* r = x - n*ln2/N. */ - r = x; - r = v_fma_f64 (-Ln2hi, n, r); - r = v_fma_f64 (-Ln2lo, n, r); - - e = u << (52 - V_EXP_TABLE_BITS); - i = u & IndexMask; - - /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ - r2 = r * r; - y = v_fma_f64 (C2, r, C1); - y = v_fma_f64 (C3, r2, y); - y = v_fma_f64 (y, r2, r); - - /* s = 2^(n/N). */ - u = v_lookup_u64 (Tab, i); - s = v_as_f64_u64 (u + e); - - if (unlikely (v_any_u64 (cmp))) - return specialcase (s, y, n); - return v_fma_f64 (y, s, s); -} -VPCS_ALIAS -#endif diff --git a/math/v_exp.h b/math/v_exp.h deleted file mode 100644 index 305da19c0a53924f18007df499b0af2747b1cfa2..0000000000000000000000000000000000000000 --- a/math/v_exp.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Declarations for double-precision e^x vector function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "v_math.h" -#if WANT_VMATH - -#define V_EXP_TABLE_BITS 7 - -extern const u64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN; -#endif diff --git a/math/v_exp2f.c b/math/v_exp2f.c deleted file mode 100644 index e3ea5af3414dc848da0a12659444ff2626f6cfcc..0000000000000000000000000000000000000000 --- a/math/v_exp2f.c +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Single-precision vector 2^x function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* maxerr: 1.962 ulp. */ - 0x1.59977ap-10f, - 0x1.3ce9e4p-7f, - 0x1.c6bd32p-5f, - 0x1.ebf9bcp-3f, - 0x1.62e422p-1f, -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) - -#define Shift v_f32 (0x1.8p23f) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) -{ - /* 2^n may overflow, break it up into s1*s2. 
*/ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); - v_u32_t r2 = v_as_u32_f32 (s1 * s1); - v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); - /* Similar to r1 but avoids double rounding in the subnormal range. */ - v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); - return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); -} - -VPCS_ATTR -v_f32_t -V_NAME(exp2f) (v_f32_t x) -{ - v_f32_t n, r, r2, scale, p, q, poly, absn; - v_u32_t cmp, e; - - /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = n + r, with r in [-1/2, 1/2]. */ -#if 0 - v_f32_t z; - z = x + Shift; - n = z - Shift; - r = x - n; - e = v_as_u32_f32 (z) << 23; -#else - n = v_round_f32 (x); - r = x - n; - e = v_as_u32_s32 (v_round_s32 (x)) << 23; -#endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); - r2 = r * r; - p = v_fma_f32 (C0, r, C1); - q = v_fma_f32 (C2, r, C3); - q = v_fma_f32 (p, r2, q); - p = C4 * r; - poly = v_fma_f32 (q, r2, p); - if (unlikely (v_any_u32 (cmp))) - return specialcase (poly, n, e, absn, cmp, scale); - return v_fma_f32 (poly, scale, scale); -} -VPCS_ALIAS -#endif diff --git a/math/v_exp_data.c b/math/v_exp_data.c deleted file mode 100644 index 365355497e95026692d683d656b8b286e3594446..0000000000000000000000000000000000000000 --- a/math/v_exp_data.c +++ /dev/null @@ -1,403 +0,0 @@ -/* - * Lookup table for double-precision e^x vector function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "v_exp.h" -#if WANT_VMATH - -#define N (1 << V_EXP_TABLE_BITS) - -/* 2^(j/N), j=0..N. 
*/ -const u64_t __v_exp_data[] = { -#if N == 128 -0x3ff0000000000000, -0x3feff63da9fb3335, -0x3fefec9a3e778061, -0x3fefe315e86e7f85, -0x3fefd9b0d3158574, -0x3fefd06b29ddf6de, -0x3fefc74518759bc8, -0x3fefbe3ecac6f383, -0x3fefb5586cf9890f, -0x3fefac922b7247f7, -0x3fefa3ec32d3d1a2, -0x3fef9b66affed31b, -0x3fef9301d0125b51, -0x3fef8abdc06c31cc, -0x3fef829aaea92de0, -0x3fef7a98c8a58e51, -0x3fef72b83c7d517b, -0x3fef6af9388c8dea, -0x3fef635beb6fcb75, -0x3fef5be084045cd4, -0x3fef54873168b9aa, -0x3fef4d5022fcd91d, -0x3fef463b88628cd6, -0x3fef3f49917ddc96, -0x3fef387a6e756238, -0x3fef31ce4fb2a63f, -0x3fef2b4565e27cdd, -0x3fef24dfe1f56381, -0x3fef1e9df51fdee1, -0x3fef187fd0dad990, -0x3fef1285a6e4030b, -0x3fef0cafa93e2f56, -0x3fef06fe0a31b715, -0x3fef0170fc4cd831, -0x3feefc08b26416ff, -0x3feef6c55f929ff1, -0x3feef1a7373aa9cb, -0x3feeecae6d05d866, -0x3feee7db34e59ff7, -0x3feee32dc313a8e5, -0x3feedea64c123422, -0x3feeda4504ac801c, -0x3feed60a21f72e2a, -0x3feed1f5d950a897, -0x3feece086061892d, -0x3feeca41ed1d0057, -0x3feec6a2b5c13cd0, -0x3feec32af0d7d3de, -0x3feebfdad5362a27, -0x3feebcb299fddd0d, -0x3feeb9b2769d2ca7, -0x3feeb6daa2cf6642, -0x3feeb42b569d4f82, -0x3feeb1a4ca5d920f, -0x3feeaf4736b527da, -0x3feead12d497c7fd, -0x3feeab07dd485429, -0x3feea9268a5946b7, -0x3feea76f15ad2148, -0x3feea5e1b976dc09, -0x3feea47eb03a5585, -0x3feea34634ccc320, -0x3feea23882552225, -0x3feea155d44ca973, -0x3feea09e667f3bcd, -0x3feea012750bdabf, -0x3fee9fb23c651a2f, -0x3fee9f7df9519484, -0x3fee9f75e8ec5f74, -0x3fee9f9a48a58174, -0x3fee9feb564267c9, -0x3feea0694fde5d3f, -0x3feea11473eb0187, -0x3feea1ed0130c132, -0x3feea2f336cf4e62, -0x3feea427543e1a12, -0x3feea589994cce13, -0x3feea71a4623c7ad, -0x3feea8d99b4492ed, -0x3feeaac7d98a6699, -0x3feeace5422aa0db, -0x3feeaf3216b5448c, -0x3feeb1ae99157736, -0x3feeb45b0b91ffc6, -0x3feeb737b0cdc5e5, -0x3feeba44cbc8520f, -0x3feebd829fde4e50, -0x3feec0f170ca07ba, -0x3feec49182a3f090, -0x3feec86319e32323, -0x3feecc667b5de565, -0x3feed09bec4a2d33, -0x3feed503b23e255d, -0x3feed99e1330b358, -0x3feede6b5579fdbf, -0x3feee36bbfd3f37a, -0x3feee89f995ad3ad, -0x3feeee07298db666, -0x3feef3a2b84f15fb, -0x3feef9728de5593a, -0x3feeff76f2fb5e47, -0x3fef05b030a1064a, -0x3fef0c1e904bc1d2, -0x3fef12c25bd71e09, -0x3fef199bdd85529c, -0x3fef20ab5fffd07a, -0x3fef27f12e57d14b, -0x3fef2f6d9406e7b5, -0x3fef3720dcef9069, -0x3fef3f0b555dc3fa, -0x3fef472d4a07897c, -0x3fef4f87080d89f2, -0x3fef5818dcfba487, -0x3fef60e316c98398, -0x3fef69e603db3285, -0x3fef7321f301b460, -0x3fef7c97337b9b5f, -0x3fef864614f5a129, -0x3fef902ee78b3ff6, -0x3fef9a51fbc74c83, -0x3fefa4afa2a490da, -0x3fefaf482d8e67f1, -0x3fefba1bee615a27, -0x3fefc52b376bba97, -0x3fefd0765b6e4540, -0x3fefdbfdad9cbe14, -0x3fefe7c1819e90d8, -0x3feff3c22b8f71f1, -#elif N == 256 -0x3ff0000000000000, -0x3feffb1afa5abcbf, -0x3feff63da9fb3335, -0x3feff168143b0281, -0x3fefec9a3e778061, -0x3fefe7d42e11bbcc, -0x3fefe315e86e7f85, -0x3fefde5f72f654b1, -0x3fefd9b0d3158574, -0x3fefd50a0e3c1f89, -0x3fefd06b29ddf6de, -0x3fefcbd42b72a836, -0x3fefc74518759bc8, -0x3fefc2bdf66607e0, -0x3fefbe3ecac6f383, -0x3fefb9c79b1f3919, -0x3fefb5586cf9890f, -0x3fefb0f145e46c85, -0x3fefac922b7247f7, -0x3fefa83b23395dec, -0x3fefa3ec32d3d1a2, -0x3fef9fa55fdfa9c5, -0x3fef9b66affed31b, -0x3fef973028d7233e, -0x3fef9301d0125b51, -0x3fef8edbab5e2ab6, -0x3fef8abdc06c31cc, -0x3fef86a814f204ab, -0x3fef829aaea92de0, -0x3fef7e95934f312e, -0x3fef7a98c8a58e51, -0x3fef76a45471c3c2, -0x3fef72b83c7d517b, -0x3fef6ed48695bbc0, -0x3fef6af9388c8dea, -0x3fef672658375d2f, -0x3fef635beb6fcb75, -0x3fef5f99f8138a1c, 
-0x3fef5be084045cd4, -0x3fef582f95281c6b, -0x3fef54873168b9aa, -0x3fef50e75eb44027, -0x3fef4d5022fcd91d, -0x3fef49c18438ce4d, -0x3fef463b88628cd6, -0x3fef42be3578a819, -0x3fef3f49917ddc96, -0x3fef3bdda27912d1, -0x3fef387a6e756238, -0x3fef351ffb82140a, -0x3fef31ce4fb2a63f, -0x3fef2e85711ece75, -0x3fef2b4565e27cdd, -0x3fef280e341ddf29, -0x3fef24dfe1f56381, -0x3fef21ba7591bb70, -0x3fef1e9df51fdee1, -0x3fef1b8a66d10f13, -0x3fef187fd0dad990, -0x3fef157e39771b2f, -0x3fef1285a6e4030b, -0x3fef0f961f641589, -0x3fef0cafa93e2f56, -0x3fef09d24abd886b, -0x3fef06fe0a31b715, -0x3fef0432edeeb2fd, -0x3fef0170fc4cd831, -0x3feefeb83ba8ea32, -0x3feefc08b26416ff, -0x3feef96266e3fa2d, -0x3feef6c55f929ff1, -0x3feef431a2de883b, -0x3feef1a7373aa9cb, -0x3feeef26231e754a, -0x3feeecae6d05d866, -0x3feeea401b7140ef, -0x3feee7db34e59ff7, -0x3feee57fbfec6cf4, -0x3feee32dc313a8e5, -0x3feee0e544ede173, -0x3feedea64c123422, -0x3feedc70df1c5175, -0x3feeda4504ac801c, -0x3feed822c367a024, -0x3feed60a21f72e2a, -0x3feed3fb2709468a, -0x3feed1f5d950a897, -0x3feecffa3f84b9d4, -0x3feece086061892d, -0x3feecc2042a7d232, -0x3feeca41ed1d0057, -0x3feec86d668b3237, -0x3feec6a2b5c13cd0, -0x3feec4e1e192aed2, -0x3feec32af0d7d3de, -0x3feec17dea6db7d7, -0x3feebfdad5362a27, -0x3feebe41b817c114, -0x3feebcb299fddd0d, -0x3feebb2d81d8abff, -0x3feeb9b2769d2ca7, -0x3feeb8417f4531ee, -0x3feeb6daa2cf6642, -0x3feeb57de83f4eef, -0x3feeb42b569d4f82, -0x3feeb2e2f4f6ad27, -0x3feeb1a4ca5d920f, -0x3feeb070dde910d2, -0x3feeaf4736b527da, -0x3feeae27dbe2c4cf, -0x3feead12d497c7fd, -0x3feeac0827ff07cc, -0x3feeab07dd485429, -0x3feeaa11fba87a03, -0x3feea9268a5946b7, -0x3feea84590998b93, -0x3feea76f15ad2148, -0x3feea6a320dceb71, -0x3feea5e1b976dc09, -0x3feea52ae6cdf6f4, -0x3feea47eb03a5585, -0x3feea3dd1d1929fd, -0x3feea34634ccc320, -0x3feea2b9febc8fb7, -0x3feea23882552225, -0x3feea1c1c70833f6, -0x3feea155d44ca973, -0x3feea0f4b19e9538, -0x3feea09e667f3bcd, -0x3feea052fa75173e, -0x3feea012750bdabf, -0x3fee9fdcddd47645, -0x3fee9fb23c651a2f, -0x3fee9f9298593ae5, -0x3fee9f7df9519484, -0x3fee9f7466f42e87, -0x3fee9f75e8ec5f74, -0x3fee9f8286ead08a, -0x3fee9f9a48a58174, -0x3fee9fbd35d7cbfd, -0x3fee9feb564267c9, -0x3feea024b1ab6e09, -0x3feea0694fde5d3f, -0x3feea0b938ac1cf6, -0x3feea11473eb0187, -0x3feea17b0976cfdb, -0x3feea1ed0130c132, -0x3feea26a62ff86f0, -0x3feea2f336cf4e62, -0x3feea3878491c491, -0x3feea427543e1a12, -0x3feea4d2add106d9, -0x3feea589994cce13, -0x3feea64c1eb941f7, -0x3feea71a4623c7ad, -0x3feea7f4179f5b21, -0x3feea8d99b4492ed, -0x3feea9cad931a436, -0x3feeaac7d98a6699, -0x3feeabd0a478580f, -0x3feeace5422aa0db, -0x3feeae05bad61778, -0x3feeaf3216b5448c, -0x3feeb06a5e0866d9, -0x3feeb1ae99157736, -0x3feeb2fed0282c8a, -0x3feeb45b0b91ffc6, -0x3feeb5c353aa2fe2, -0x3feeb737b0cdc5e5, -0x3feeb8b82b5f98e5, -0x3feeba44cbc8520f, -0x3feebbdd9a7670b3, -0x3feebd829fde4e50, -0x3feebf33e47a22a2, -0x3feec0f170ca07ba, -0x3feec2bb4d53fe0d, -0x3feec49182a3f090, -0x3feec674194bb8d5, -0x3feec86319e32323, -0x3feeca5e8d07f29e, -0x3feecc667b5de565, -0x3feece7aed8eb8bb, -0x3feed09bec4a2d33, -0x3feed2c980460ad8, -0x3feed503b23e255d, -0x3feed74a8af46052, -0x3feed99e1330b358, -0x3feedbfe53c12e59, -0x3feede6b5579fdbf, -0x3feee0e521356eba, -0x3feee36bbfd3f37a, -0x3feee5ff3a3c2774, -0x3feee89f995ad3ad, -0x3feeeb4ce622f2ff, -0x3feeee07298db666, -0x3feef0ce6c9a8952, -0x3feef3a2b84f15fb, -0x3feef68415b749b1, -0x3feef9728de5593a, -0x3feefc6e29f1c52a, -0x3feeff76f2fb5e47, -0x3fef028cf22749e4, -0x3fef05b030a1064a, -0x3fef08e0b79a6f1f, -0x3fef0c1e904bc1d2, -0x3fef0f69c3f3a207, -0x3fef12c25bd71e09, 
-0x3fef16286141b33d, -0x3fef199bdd85529c, -0x3fef1d1cd9fa652c, -0x3fef20ab5fffd07a, -0x3fef244778fafb22, -0x3fef27f12e57d14b, -0x3fef2ba88988c933, -0x3fef2f6d9406e7b5, -0x3fef33405751c4db, -0x3fef3720dcef9069, -0x3fef3b0f2e6d1675, -0x3fef3f0b555dc3fa, -0x3fef43155b5bab74, -0x3fef472d4a07897c, -0x3fef4b532b08c968, -0x3fef4f87080d89f2, -0x3fef53c8eacaa1d6, -0x3fef5818dcfba487, -0x3fef5c76e862e6d3, -0x3fef60e316c98398, -0x3fef655d71ff6075, -0x3fef69e603db3285, -0x3fef6e7cd63a8315, -0x3fef7321f301b460, -0x3fef77d5641c0658, -0x3fef7c97337b9b5f, -0x3fef81676b197d17, -0x3fef864614f5a129, -0x3fef8b333b16ee12, -0x3fef902ee78b3ff6, -0x3fef953924676d76, -0x3fef9a51fbc74c83, -0x3fef9f7977cdb740, -0x3fefa4afa2a490da, -0x3fefa9f4867cca6e, -0x3fefaf482d8e67f1, -0x3fefb4aaa2188510, -0x3fefba1bee615a27, -0x3fefbf9c1cb6412a, -0x3fefc52b376bba97, -0x3fefcac948dd7274, -0x3fefd0765b6e4540, -0x3fefd632798844f8, -0x3fefdbfdad9cbe14, -0x3fefe1d802243c89, -0x3fefe7c1819e90d8, -0x3fefedba3692d514, -0x3feff3c22b8f71f1, -0x3feff9d96b2a23d9, -#endif -}; -#endif diff --git a/math/v_expf.c b/math/v_expf.c deleted file mode 100644 index d403e00534f068d81edefca1f48b6800cf7ab363..0000000000000000000000000000000000000000 --- a/math/v_expf.c +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Single-precision vector e^x function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* maxerr: 1.45358 +0.5 ulp. */ - 0x1.0e4020p-7f, - 0x1.573e2ep-5f, - 0x1.555e66p-3f, - 0x1.fffdb6p-2f, - 0x1.ffffecp-1f, -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) - -#define Shift v_f32 (0x1.8p23f) -#define InvLn2 v_f32 (0x1.715476p+0f) -#define Ln2hi v_f32 (0x1.62e4p-1f) -#define Ln2lo v_f32 (0x1.7f7d1cp-20f) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) -{ - /* 2^n may overflow, break it up into s1*s2. */ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); - v_u32_t r2 = v_as_u32_f32 (s1 * s1); - v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); - /* Similar to r1 but avoids double rounding in the subnormal range. */ - v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); - return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); -} - -VPCS_ATTR -v_f32_t -V_NAME(expf) (v_f32_t x) -{ - v_f32_t n, r, r2, scale, p, q, poly, absn, z; - v_u32_t cmp, e; - - /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/ -#if 1 - z = v_fma_f32 (x, InvLn2, Shift); - n = z - Shift; - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_f32 (z) << 23; -#else - z = x * InvLn2; - n = v_round_f32 (z); - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_s32 (v_round_s32 (z)) << 23; -#endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); - r2 = r * r; - p = v_fma_f32 (C0, r, C1); - q = v_fma_f32 (C2, r, C3); - q = v_fma_f32 (p, r2, q); - p = C4 * r; - poly = v_fma_f32 (q, r2, p); - if (unlikely (v_any_u32 (cmp))) - return specialcase (poly, n, e, absn, cmp, scale); - return v_fma_f32 (poly, scale, scale); -} -VPCS_ALIAS -#endif diff --git a/math/v_log.c b/math/v_log.c deleted file mode 100644 index d84c740d2b6b519a5572b41a2e4e91aba27b0477..0000000000000000000000000000000000000000 --- a/math/v_log.c +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Double-precision vector log(x) function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#include "v_log.h" -#if V_SUPPORTED - -/* Worst-case error: 1.17 + 0.5 ulp. */ - -static const f64_t Poly[] = { - /* rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ - -0x1.ffffffffffff7p-2, - 0x1.55555555170d4p-2, - -0x1.0000000399c27p-2, - 0x1.999b2e90e94cap-3, - -0x1.554e550bd501ep-3, -}; - -#define A0 v_f64 (Poly[0]) -#define A1 v_f64 (Poly[1]) -#define A2 v_f64 (Poly[2]) -#define A3 v_f64 (Poly[3]) -#define A4 v_f64 (Poly[4]) -#define Ln2 v_f64 (0x1.62e42fefa39efp-1) -#define N (1 << V_LOG_TABLE_BITS) -#define OFF v_u64 (0x3fe6900900000000) - -struct entry -{ - v_f64_t invc; - v_f64_t logc; -}; - -static inline struct entry -lookup (v_u64_t i) -{ - struct entry e; -#ifdef SCALAR - e.invc = __v_log_data[i].invc; - e.logc = __v_log_data[i].logc; -#else - e.invc[0] = __v_log_data[i[0]].invc; - e.logc[0] = __v_log_data[i[0]].logc; - e.invc[1] = __v_log_data[i[1]].invc; - e.logc[1] = __v_log_data[i[1]].logc; -#endif - return e; -} - -VPCS_ATTR -__attribute__ ((noinline)) static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - return v_call_f64 (log, x, y, cmp); -} - -VPCS_ATTR -v_f64_t -V_NAME(log) (v_f64_t x) -{ - v_f64_t z, r, r2, p, y, kd, hi; - v_u64_t ix, iz, tmp, top, i, cmp; - v_s64_t k; - struct entry e; - - ix = v_as_u64_f64 (x); - top = ix >> 48; - cmp = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010)); - - /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. - The range is split into N subintervals. - The ith subinterval contains z and c is near its center. */ - tmp = ix - OFF; - i = (tmp >> (52 - V_LOG_TABLE_BITS)) % N; - k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift */ - iz = ix - (tmp & v_u64 (0xfffULL << 52)); - z = v_as_f64_u64 (iz); - e = lookup (i); - - /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ - r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); - kd = v_to_f64_s64 (k); - - /* hi = r + log(c) + k*Ln2. */ - hi = v_fma_f64 (kd, Ln2, e.logc + r); - /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
*/ - r2 = r * r; - y = v_fma_f64 (A3, r, A2); - p = v_fma_f64 (A1, r, A0); - y = v_fma_f64 (A4, r2, y); - y = v_fma_f64 (y, r2, p); - y = v_fma_f64 (y, r2, hi); - - if (unlikely (v_any_u64 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/math/v_log.h b/math/v_log.h deleted file mode 100644 index bcc2fa6fa9305a936ae6b6e25997a27c2c4ab4e5..0000000000000000000000000000000000000000 --- a/math/v_log.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Declarations for double-precision log(x) vector function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "v_math.h" -#if WANT_VMATH - -#define V_LOG_TABLE_BITS 7 - -extern const struct v_log_data -{ - f64_t invc; - f64_t logc; -} __v_log_data[1 << V_LOG_TABLE_BITS] HIDDEN; -#endif diff --git a/math/v_log_data.c b/math/v_log_data.c deleted file mode 100644 index 97ee5b09c6a9c2b6b100f444fa16e7dd801e5c5b..0000000000000000000000000000000000000000 --- a/math/v_log_data.c +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Lookup table for double-precision log(x) vector function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "v_log.h" -#if WANT_VMATH - -#define N (1 << V_LOG_TABLE_BITS) - -/* Algorithm: - - x = 2^k z - log(x) = k ln2 + log(c) + poly(z/c - 1) - -where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,N=128) -and log(c) and 1/c for the ith subinterval comes from a lookup table: - - tab[i].invc = 1/c - tab[i].logc = (double)log(c) - -where c is near the center of the subinterval and is chosen by trying several -floating point invc candidates around 1/center and selecting one for which -the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval -that contains 1 and the previous one got tweaked to avoid cancellation. 
*/ -const struct v_log_data __v_log_data[N] = { -{0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2}, -{0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2}, -{0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2}, -{0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2}, -{0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2}, -{0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2}, -{0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2}, -{0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2}, -{0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2}, -{0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2}, -{0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2}, -{0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2}, -{0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2}, -{0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2}, -{0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2}, -{0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2}, -{0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2}, -{0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2}, -{0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2}, -{0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3}, -{0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3}, -{0x1.446f12b278001p+0, -0x1.e52e160484698p-3}, -{0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3}, -{0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3}, -{0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3}, -{0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3}, -{0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3}, -{0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3}, -{0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3}, -{0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3}, -{0x1.36987540fbf53p+0, -0x1.8be843d796044p-3}, -{0x1.352166b648f61p+0, -0x1.82395ecc477edp-3}, -{0x1.33adddb3eb575p+0, -0x1.7896240966422p-3}, -{0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3}, -{0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3}, -{0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3}, -{0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3}, -{0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3}, -{0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3}, -{0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3}, -{0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3}, -{0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3}, -{0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3}, -{0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3}, -{0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3}, -{0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4}, -{0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4}, -{0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4}, -{0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4}, -{0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4}, -{0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4}, -{0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4}, -{0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4}, -{0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4}, -{0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4}, -{0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4}, -{0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4}, -{0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4}, -{0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4}, -{0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4}, -{0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5}, -{0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5}, -{0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5}, -{0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5}, -{0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5}, -{0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5}, -{0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5}, -{0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5}, -{0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6}, -{0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6}, -{0x1.05193497a7cc5p+0, -0x1.43183683400acp-6}, -{0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6}, -{0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7}, 
-{0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7}, -{0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9}, -{1.0, 0.0}, -{0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8}, -{0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7}, -{0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6}, -{0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6}, -{0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5}, -{0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5}, -{0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5}, -{0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5}, -{0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4}, -{0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4}, -{0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4}, -{0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4}, -{0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4}, -{0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4}, -{0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4}, -{0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4}, -{0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4}, -{0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3}, -{0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3}, -{0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3}, -{0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3}, -{0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3}, -{0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3}, -{0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3}, -{0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3}, -{0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3}, -{0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3}, -{0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3}, -{0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3}, -{0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3}, -{0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3}, -{0x1.9998e1480b618p-1, 0x1.c903161240163p-3}, -{0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3}, -{0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3}, -{0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3}, -{0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3}, -{0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2}, -{0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2}, -{0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2}, -{0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2}, -{0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2}, -{0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2}, -{0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2}, -{0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2}, -{0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2}, -{0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2}, -{0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2}, -{0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2}, -{0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2}, -{0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2}, -{0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2}, -{0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2}, -}; -#endif diff --git a/math/v_logf.c b/math/v_logf.c deleted file mode 100644 index 7373192f03fae52c113eabcb69067019e6e2a70c..0000000000000000000000000000000000000000 --- a/math/v_logf.c +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Single-precision vector log function. - * - * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* 3.34 ulp error */ - -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f, - -0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f, -}; -#define P7 v_f32 (Poly[0]) -#define P6 v_f32 (Poly[1]) -#define P5 v_f32 (Poly[2]) -#define P4 v_f32 (Poly[3]) -#define P3 v_f32 (Poly[4]) -#define P2 v_f32 (Poly[5]) -#define P1 v_f32 (Poly[6]) - -#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218 */ -#define Min v_u32 (0x00800000) -#define Max v_u32 (0x7f800000) -#define Mask v_u32 (0x007fffff) -#define Off v_u32 (0x3f2aaaab) /* 0.666667 */ - -VPCS_ATTR -__attribute__ ((noinline)) static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) -{ - /* Fall back to scalar code. */ - return v_call_f32 (logf, x, y, cmp); -} - -VPCS_ATTR -v_f32_t -V_NAME(logf) (v_f32_t x) -{ - v_f32_t n, p, q, r, r2, y; - v_u32_t u, cmp; - - u = v_as_u32_f32 (x); - cmp = v_cond_u32 (u - Min >= Max - Min); - - /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3 */ - u -= Off; - n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend */ - u &= Mask; - u += Off; - r = v_as_f32_u32 (u) - v_f32 (1.0f); - - /* y = log(1+r) + n*ln2. */ - r2 = r * r; - /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ - p = v_fma_f32 (P6, r, P5); - q = v_fma_f32 (P4, r, P3); - y = v_fma_f32 (P2, r, P1); - p = v_fma_f32 (P7, r2, p); - q = v_fma_f32 (p, r2, q); - y = v_fma_f32 (q, r2, y); - p = v_fma_f32 (Ln2, n, r); - y = v_fma_f32 (y, r2, p); - - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/math/v_math.h b/math/v_math.h deleted file mode 100644 index f2cc4670bb9b8524c0318952b3e0a417a73746b1..0000000000000000000000000000000000000000 --- a/math/v_math.h +++ /dev/null @@ -1,641 +0,0 @@ -/* - * Vector math abstractions. - * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#ifndef _V_MATH_H -#define _V_MATH_H - -#ifndef WANT_VMATH -/* Enable the build of vector math code. */ -# define WANT_VMATH 1 -#endif -#if WANT_VMATH - -/* The goal of this header is to allow vector and scalar - build of the same algorithm, the provided intrinsic - wrappers are also vector length agnostic so they can - be implemented for SVE too (or other simd architectures) - and then the code should work on those targets too. */ - -#if SCALAR -#define V_NAME(x) __s_##x -#elif VPCS && __aarch64__ -#define V_NAME(x) __vn_##x -#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) -#else -#define V_NAME(x) __v_##x -#endif - -#ifndef VPCS_ATTR -#define VPCS_ATTR -#endif -#ifndef VPCS_ALIAS -#define VPCS_ALIAS -#endif - -#include -#include "math_config.h" - -typedef float f32_t; -typedef uint32_t u32_t; -typedef int32_t s32_t; -typedef double f64_t; -typedef uint64_t u64_t; -typedef int64_t s64_t; - -/* reinterpret as type1 from type2. 
*/ -static inline u32_t -as_u32_f32 (f32_t x) -{ - union { f32_t f; u32_t u; } r = {x}; - return r.u; -} -static inline f32_t -as_f32_u32 (u32_t x) -{ - union { u32_t u; f32_t f; } r = {x}; - return r.f; -} -static inline s32_t -as_s32_u32 (u32_t x) -{ - union { u32_t u; s32_t i; } r = {x}; - return r.i; -} -static inline u32_t -as_u32_s32 (s32_t x) -{ - union { s32_t i; u32_t u; } r = {x}; - return r.u; -} -static inline u64_t -as_u64_f64 (f64_t x) -{ - union { f64_t f; u64_t u; } r = {x}; - return r.u; -} -static inline f64_t -as_f64_u64 (u64_t x) -{ - union { u64_t u; f64_t f; } r = {x}; - return r.f; -} -static inline s64_t -as_s64_u64 (u64_t x) -{ - union { u64_t u; s64_t i; } r = {x}; - return r.i; -} -static inline u64_t -as_u64_s64 (s64_t x) -{ - union { s64_t i; u64_t u; } r = {x}; - return r.u; -} - -#if SCALAR -#define V_SUPPORTED 1 -typedef f32_t v_f32_t; -typedef u32_t v_u32_t; -typedef s32_t v_s32_t; -typedef f64_t v_f64_t; -typedef u64_t v_u64_t; -typedef s64_t v_s64_t; - -static inline int -v_lanes32 (void) -{ - return 1; -} - -static inline v_f32_t -v_f32 (f32_t x) -{ - return x; -} -static inline v_u32_t -v_u32 (u32_t x) -{ - return x; -} -static inline v_s32_t -v_s32 (s32_t x) -{ - return x; -} - -static inline f32_t -v_get_f32 (v_f32_t x, int i) -{ - return x; -} -static inline u32_t -v_get_u32 (v_u32_t x, int i) -{ - return x; -} -static inline s32_t -v_get_s32 (v_s32_t x, int i) -{ - return x; -} - -static inline void -v_set_f32 (v_f32_t *x, int i, f32_t v) -{ - *x = v; -} -static inline void -v_set_u32 (v_u32_t *x, int i, u32_t v) -{ - *x = v; -} -static inline void -v_set_s32 (v_s32_t *x, int i, s32_t v) -{ - *x = v; -} - -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u32 (v_u32_t x) -{ - return x != 0; -} -/* to wrap the result of relational operators. */ -static inline v_u32_t -v_cond_u32 (v_u32_t x) -{ - return x ? -1 : 0; -} -static inline v_f32_t -v_abs_f32 (v_f32_t x) -{ - return __builtin_fabsf (x); -} -static inline v_f32_t -v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) -{ - return __builtin_fmaf (x, y, z); -} -static inline v_f32_t -v_round_f32 (v_f32_t x) -{ - return __builtin_roundf (x); -} -static inline v_s32_t -v_round_s32 (v_f32_t x) -{ - return __builtin_lroundf (x); /* relies on -fno-math-errno. */ -} -/* convert to type1 from type2. */ -static inline v_f32_t -v_to_f32_s32 (v_s32_t x) -{ - return x; -} -static inline v_f32_t -v_to_f32_u32 (v_u32_t x) -{ - return x; -} -/* reinterpret as type1 from type2. 
*/ -static inline v_u32_t -v_as_u32_f32 (v_f32_t x) -{ - union { v_f32_t f; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_as_f32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_f32_t f; } r = {x}; - return r.f; -} -static inline v_s32_t -v_as_s32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_s32_t i; } r = {x}; - return r.i; -} -static inline v_u32_t -v_as_u32_s32 (v_s32_t x) -{ - union { v_s32_t i; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_lookup_f32 (const f32_t *tab, v_u32_t idx) -{ - return tab[idx]; -} -static inline v_u32_t -v_lookup_u32 (const u32_t *tab, v_u32_t idx) -{ - return tab[idx]; -} -static inline v_f32_t -v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) -{ - return f (x); -} -static inline v_f32_t -v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, - v_u32_t p) -{ - return f (x1, x2); -} - -static inline int -v_lanes64 (void) -{ - return 1; -} -static inline v_f64_t -v_f64 (f64_t x) -{ - return x; -} -static inline v_u64_t -v_u64 (u64_t x) -{ - return x; -} -static inline v_s64_t -v_s64 (s64_t x) -{ - return x; -} -static inline f64_t -v_get_f64 (v_f64_t x, int i) -{ - return x; -} -static inline void -v_set_f64 (v_f64_t *x, int i, f64_t v) -{ - *x = v; -} -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u64 (v_u64_t x) -{ - return x != 0; -} -/* to wrap the result of relational operators. */ -static inline v_u64_t -v_cond_u64 (v_u64_t x) -{ - return x ? -1 : 0; -} -static inline v_f64_t -v_abs_f64 (v_f64_t x) -{ - return __builtin_fabs (x); -} -static inline v_f64_t -v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) -{ - return __builtin_fma (x, y, z); -} -static inline v_f64_t -v_round_f64 (v_f64_t x) -{ - return __builtin_round (x); -} -static inline v_s64_t -v_round_s64 (v_f64_t x) -{ - return __builtin_lround (x); /* relies on -fno-math-errno. */ -} -/* convert to type1 from type2. */ -static inline v_f64_t -v_to_f64_s64 (v_s64_t x) -{ - return x; -} -static inline v_f64_t -v_to_f64_u64 (v_u64_t x) -{ - return x; -} -/* reinterpret as type1 from type2. 
*/ -static inline v_u64_t -v_as_u64_f64 (v_f64_t x) -{ - union { v_f64_t f; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_as_f64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_f64_t f; } r = {x}; - return r.f; -} -static inline v_s64_t -v_as_s64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_s64_t i; } r = {x}; - return r.i; -} -static inline v_u64_t -v_as_u64_s64 (v_s64_t x) -{ - union { v_s64_t i; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_lookup_f64 (const f64_t *tab, v_u64_t idx) -{ - return tab[idx]; -} -static inline v_u64_t -v_lookup_u64 (const u64_t *tab, v_u64_t idx) -{ - return tab[idx]; -} -static inline v_f64_t -v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) -{ - return f (x); -} - -#elif __aarch64__ -#define V_SUPPORTED 1 -#include -typedef float32x4_t v_f32_t; -typedef uint32x4_t v_u32_t; -typedef int32x4_t v_s32_t; -typedef float64x2_t v_f64_t; -typedef uint64x2_t v_u64_t; -typedef int64x2_t v_s64_t; - -static inline int -v_lanes32 (void) -{ - return 4; -} - -static inline v_f32_t -v_f32 (f32_t x) -{ - return (v_f32_t){x, x, x, x}; -} -static inline v_u32_t -v_u32 (u32_t x) -{ - return (v_u32_t){x, x, x, x}; -} -static inline v_s32_t -v_s32 (s32_t x) -{ - return (v_s32_t){x, x, x, x}; -} - -static inline f32_t -v_get_f32 (v_f32_t x, int i) -{ - return x[i]; -} -static inline u32_t -v_get_u32 (v_u32_t x, int i) -{ - return x[i]; -} -static inline s32_t -v_get_s32 (v_s32_t x, int i) -{ - return x[i]; -} - -static inline void -v_set_f32 (v_f32_t *x, int i, f32_t v) -{ - (*x)[i] = v; -} -static inline void -v_set_u32 (v_u32_t *x, int i, u32_t v) -{ - (*x)[i] = v; -} -static inline void -v_set_s32 (v_s32_t *x, int i, s32_t v) -{ - (*x)[i] = v; -} - -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u32 (v_u32_t x) -{ - /* assume elements in x are either 0 or -1u. */ - return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; -} -/* to wrap the result of relational operators. */ -static inline v_u32_t -v_cond_u32 (v_u32_t x) -{ - return x; -} -static inline v_f32_t -v_abs_f32 (v_f32_t x) -{ - return vabsq_f32 (x); -} -static inline v_f32_t -v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) -{ - return vfmaq_f32 (z, x, y); -} -static inline v_f32_t -v_round_f32 (v_f32_t x) -{ - return vrndaq_f32 (x); -} -static inline v_s32_t -v_round_s32 (v_f32_t x) -{ - return vcvtaq_s32_f32 (x); -} -/* convert to type1 from type2. */ -static inline v_f32_t -v_to_f32_s32 (v_s32_t x) -{ - return (v_f32_t){x[0], x[1], x[2], x[3]}; -} -static inline v_f32_t -v_to_f32_u32 (v_u32_t x) -{ - return (v_f32_t){x[0], x[1], x[2], x[3]}; -} -/* reinterpret as type1 from type2. */ -static inline v_u32_t -v_as_u32_f32 (v_f32_t x) -{ - union { v_f32_t f; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_as_f32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_f32_t f; } r = {x}; - return r.f; -} -static inline v_s32_t -v_as_s32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_s32_t i; } r = {x}; - return r.i; -} -static inline v_u32_t -v_as_u32_s32 (v_s32_t x) -{ - union { v_s32_t i; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_lookup_f32 (const f32_t *tab, v_u32_t idx) -{ - return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; -} -static inline v_u32_t -v_lookup_u32 (const u32_t *tab, v_u32_t idx) -{ - return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; -} -static inline v_f32_t -v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) -{ - return (v_f32_t){p[0] ? 
f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], - p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; -} -static inline v_f32_t -v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, - v_u32_t p) -{ - return ( - v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1], - p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]}; -} - -static inline int -v_lanes64 (void) -{ - return 2; -} -static inline v_f64_t -v_f64 (f64_t x) -{ - return (v_f64_t){x, x}; -} -static inline v_u64_t -v_u64 (u64_t x) -{ - return (v_u64_t){x, x}; -} -static inline v_s64_t -v_s64 (s64_t x) -{ - return (v_s64_t){x, x}; -} -static inline f64_t -v_get_f64 (v_f64_t x, int i) -{ - return x[i]; -} -static inline void -v_set_f64 (v_f64_t *x, int i, f64_t v) -{ - (*x)[i] = v; -} -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u64 (v_u64_t x) -{ - /* assume elements in x are either 0 or -1u. */ - return vpaddd_u64 (x) != 0; -} -/* to wrap the result of relational operators. */ -static inline v_u64_t -v_cond_u64 (v_u64_t x) -{ - return x; -} -static inline v_f64_t -v_abs_f64 (v_f64_t x) -{ - return vabsq_f64 (x); -} -static inline v_f64_t -v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) -{ - return vfmaq_f64 (z, x, y); -} -static inline v_f64_t -v_round_f64 (v_f64_t x) -{ - return vrndaq_f64 (x); -} -static inline v_s64_t -v_round_s64 (v_f64_t x) -{ - return vcvtaq_s64_f64 (x); -} -/* convert to type1 from type2. */ -static inline v_f64_t -v_to_f64_s64 (v_s64_t x) -{ - return (v_f64_t){x[0], x[1]}; -} -static inline v_f64_t -v_to_f64_u64 (v_u64_t x) -{ - return (v_f64_t){x[0], x[1]}; -} -/* reinterpret as type1 from type2. */ -static inline v_u64_t -v_as_u64_f64 (v_f64_t x) -{ - union { v_f64_t f; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_as_f64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_f64_t f; } r = {x}; - return r.f; -} -static inline v_s64_t -v_as_s64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_s64_t i; } r = {x}; - return r.i; -} -static inline v_u64_t -v_as_u64_s64 (v_s64_t x) -{ - union { v_s64_t i; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_lookup_f64 (const f64_t *tab, v_u64_t idx) -{ - return (v_f64_t){tab[idx[0]], tab[idx[1]]}; -} -static inline v_u64_t -v_lookup_u64 (const u64_t *tab, v_u64_t idx) -{ - return (v_u64_t){tab[idx[0]], tab[idx[1]]}; -} -static inline v_f64_t -v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) -{ - return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]}; -} -#endif - -#endif -#endif diff --git a/math/v_powf.c b/math/v_powf.c deleted file mode 100644 index fb80fa6f184688ee7396a12121604b12d9b1db1a..0000000000000000000000000000000000000000 --- a/math/v_powf.c +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Single-precision vector powf function. - * - * Copyright (c) 2019, Arm Limited. 
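The scalar fallback layer deleted above implements every vector operation on a single lane and does all bit-level reinterpretation through unions. A minimal self-contained sketch of that idiom in plain C (the names here are illustrative, mirroring the deleted v_as_* helpers):

/* Union type punning: writing one member and reading another
   reinterprets the bits unchanged, which is well defined in C,
   unlike pointer casts that can violate strict aliasing. */
#include <stdint.h>

static inline uint32_t
as_u32_f32 (float x)
{
  union { float f; uint32_t u; } r = { x };
  return r.u;
}

static inline float
as_f32_u32 (uint32_t x)
{
  union { uint32_t u; float f; } r = { .u = x };
  return r.f;
}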
- * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -#define Min v_u32 (0x00800000) -#define Max v_u32 (0x7f800000) -#define SBITS 5 -#define Tlog v__powf_log2_data.tab -#define Texp v__exp2f_data.tab -#define A v__powf_log2_data.poly -#define C v__exp2f_data.poly -#define LOGDEG 4 - -#if LOGDEG == 5 -/* 1.01 ulp */ -#define OFF v_u32 (0x3f330000) -#define TBITS 4 -#elif LOGDEG == 4 -/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2) */ -#define OFF v_u32 (0x3f35d000) -#define TBITS 5 -#endif - -#define V_EXP2F_TABLE_BITS SBITS -#define V_EXP2F_POLY_ORDER 3 -struct v_exp2f_data -{ - uint64_t tab[1 << V_EXP2F_TABLE_BITS]; - double poly[V_EXP2F_POLY_ORDER]; -}; - -#define V_POWF_LOG2_TABLE_BITS TBITS -#define V_POWF_LOG2_POLY_ORDER LOGDEG -#define SCALE ((double) (1 << SBITS)) -struct v_powf_log2_data -{ - struct - { - double invc, logc; - } tab[1 << V_POWF_LOG2_TABLE_BITS]; - double poly[V_POWF_LOG2_POLY_ORDER]; -}; - -static const struct v_powf_log2_data v__powf_log2_data = { -#if LOGDEG == 5 - .tab = { -{ 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 * SCALE }, -{ 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 * SCALE }, -{ 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 * SCALE }, -{ 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 * SCALE }, -{ 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 * SCALE }, -{ 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 * SCALE }, -{ 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 * SCALE }, -{ 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 * SCALE }, -{ 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 * SCALE }, -{ 0x1p+0, 0x0p+0 * SCALE }, -{ 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 * SCALE }, -{ 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 * SCALE }, -{ 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 * SCALE }, -{ 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 * SCALE }, -{ 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 * SCALE }, -{ 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 * SCALE }, - }, -/* rel err: 1.46 * 2^-32 */ - .poly = { -0x1.27616c9496e0bp-2 * SCALE, -0x1.71969a075c67ap-2 * SCALE, -0x1.ec70a6ca7baddp-2 * SCALE, -0x1.7154748bef6c8p-1 * SCALE, -0x1.71547652ab82bp0 * SCALE, - } -#elif LOGDEG == 4 - .tab = { -{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * SCALE}, -{0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * SCALE}, -{0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * SCALE}, -{0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * SCALE}, -{0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * SCALE}, -{0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * SCALE}, -{0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * SCALE}, -{0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * SCALE}, -{0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * SCALE}, -{0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * SCALE}, -{0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * SCALE}, -{0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * SCALE}, -{0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * SCALE}, -{0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * SCALE}, -{0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * SCALE}, -{0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * SCALE}, -{0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * SCALE}, -{0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * SCALE}, -{0x1p+0, 0x0p+0 * SCALE}, -{0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * SCALE}, -{0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * SCALE}, -{0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * SCALE}, -{0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * SCALE}, -{0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * SCALE}, -{0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 
* SCALE}, -{0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * SCALE}, -{0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * SCALE}, -{0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * SCALE}, -{0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * SCALE}, -{0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * SCALE}, -{0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * SCALE}, -{0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * SCALE}, - }, -/* rel err: 1.5 * 2^-30 */ - .poly = { - -0x1.6ff5daa3b3d7cp-2 * SCALE, - 0x1.ec81d03c01aebp-2 * SCALE, - -0x1.71547bb43f101p-1 * SCALE, - 0x1.7154764a815cbp0 * SCALE, - } -#endif -}; - -static const struct v_exp2f_data v__exp2f_data = { - .tab = { -0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51, -0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1, -0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, -0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585, -0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13, -0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, -0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069, -0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540, - }, -/* rel err: 1.69 * 2^-34 */ - .poly = { -0x1.c6af84b912394p-5/SCALE/SCALE/SCALE, 0x1.ebfce50fac4f3p-3/SCALE/SCALE, 0x1.62e42ff0c52d6p-1/SCALE - }, -}; - -VPCS_ATTR -__attribute__ ((noinline)) static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_f32_t ret, v_u32_t cmp) -{ - return v_call2_f32 (powf, x, y, ret, cmp); -} - -VPCS_ATTR -v_f32_t -V_NAME(powf) (v_f32_t x, v_f32_t y) -{ - v_u32_t u, tmp, cmp, i, top, iz; - v_s32_t k; - v_f32_t ret; - - u = v_as_u32_f32 (x); - cmp = v_cond_u32 (u - Min >= Max - Min); - tmp = u - OFF; - i = (tmp >> (23 - TBITS)) % (1 << TBITS); - top = tmp & 0xff800000; - iz = u - top; - k = v_as_s32_u32 (top) >> (23 - SBITS); /* arithmetic shift */ - - for (int lane = 0; lane < v_lanes32 (); lane++) - { - uint32_t si, siz; - int32_t sk; - float sy; - - /* Use double precision for each lane. */ - double invc, logc, z, r, p, y0, logx, ylogx, kd, s; - uint64_t ki, t; - - si = v_get_u32 (i, lane); - siz = v_get_u32 (iz, lane); - sk = v_get_s32 (k, lane); - sy = v_get_f32 (y, lane); - - invc = Tlog[si].invc; - logc = Tlog[si].logc; - z = (double) as_f32_u32 (siz); - - /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */ - r = __builtin_fma (z, invc, -1.0); - y0 = logc + (double) sk; - - /* Polynomial to approximate log1p(r)/ln2. */ -#if LOGDEG == 5 - logx = A[0]; - logx = r * logx + A[1]; - logx = r * logx + A[2]; - logx = r * logx + A[3]; - logx = r * logx + A[4]; - logx = r * logx + y0; -#elif LOGDEG == 4 - logx = A[0]; - logx = r * logx + A[1]; - logx = r * logx + A[2]; - logx = r * logx + A[3]; - logx = r * logx + y0; -#endif - ylogx = sy * logx; - v_set_u32 (&cmp, lane, - (as_u64_f64 (ylogx) >> 47 & 0xffff) - >= as_u64_f64 (126.0 * (1 << SBITS)) >> 47 - ? 
1 - : v_get_u32 (cmp, lane)); - - /* N*x = k + r with r in [-1/2, 1/2] */ -#if TOINT_INTRINSICS - kd = roundtoint (ylogx); /* k */ - ki = converttoint (ylogx); -#else -# define SHIFT 0x1.8p52 - kd = eval_as_double (ylogx + SHIFT); - ki = asuint64 (kd); - kd -= SHIFT; -#endif - r = ylogx - kd; - - /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ - t = Texp[ki % (1 << SBITS)]; - t += ki << (52 - SBITS); - s = as_f64_u64 (t); - p = C[0]; - p = __builtin_fma (p, r, C[1]); - p = __builtin_fma (p, r, C[2]); - p = __builtin_fma (p, s * r, s); - - v_set_f32 (&ret, lane, p); - } - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, ret, cmp); - return ret; -} -VPCS_ALIAS -#endif diff --git a/math/v_sin.c b/math/v_sin.c deleted file mode 100644 index 2b9ed059189ca0402c8ec93f915fa6d3ed11be88..0000000000000000000000000000000000000000 --- a/math/v_sin.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Double-precision vector sin function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const double Poly[] = { -/* worst-case error is 3.5 ulp. - abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. */ --0x1.9f4a9c8b21dc9p-41, - 0x1.60e88a10163f2p-33, --0x1.ae6361b7254e7p-26, - 0x1.71de382e8d62bp-19, --0x1.a01a019aeb4ffp-13, - 0x1.111111110b25ep-7, --0x1.55555555554c3p-3, -}; - -#define C7 v_f64 (Poly[0]) -#define C6 v_f64 (Poly[1]) -#define C5 v_f64 (Poly[2]) -#define C4 v_f64 (Poly[3]) -#define C3 v_f64 (Poly[4]) -#define C2 v_f64 (Poly[5]) -#define C1 v_f64 (Poly[6]) - -#define InvPi v_f64 (0x1.45f306dc9c883p-2) -#define Pi1 v_f64 (0x1.921fb54442d18p+1) -#define Pi2 v_f64 (0x1.1a62633145c06p-53) -#define Pi3 v_f64 (0x1.c1cd129024e09p-106) -#define Shift v_f64 (0x1.8p52) -#define RangeVal v_f64 (0x1p23) -#define AbsMask v_u64 (0x7fffffffffffffff) - -VPCS_ATTR -__attribute__ ((noinline)) static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - return v_call_f64 (sin, x, y, cmp); -} - -VPCS_ATTR -v_f64_t -V_NAME(sin) (v_f64_t x) -{ - v_f64_t n, r, r2, y; - v_u64_t sign, odd, cmp; - - r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask); - sign = v_as_u64_f64 (x) & ~AbsMask; - cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal)); - - /* n = rint(|x|/pi). */ - n = v_fma_f64 (InvPi, r, Shift); - odd = v_as_u64_f64 (n) << 63; - n -= Shift; - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ - r = v_fma_f64 (-Pi1, n, r); - r = v_fma_f64 (-Pi2, n, r); - r = v_fma_f64 (-Pi3, n, r); - - /* sin(r) poly approx. */ - r2 = r * r; - y = v_fma_f64 (C7, r2, C6); - y = v_fma_f64 (y, r2, C5); - y = v_fma_f64 (y, r2, C4); - y = v_fma_f64 (y, r2, C3); - y = v_fma_f64 (y, r2, C2); - y = v_fma_f64 (y, r2, C1); - y = v_fma_f64 (y * r2, r, r); - - /* sign. */ - y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign ^ odd); - - if (unlikely (v_any_u64 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/math/v_sinf.c b/math/v_sinf.c deleted file mode 100644 index e66bfce6d8aa4888cfe610d2c7250a144366091b..0000000000000000000000000000000000000000 --- a/math/v_sinf.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Single-precision vector sin function. - * - * Copyright (c) 2019, Arm Limited. 
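The v_powf kernel deleted above evaluates powf(x, y) as 2^(y*log2(x)), with log2 and exp2 each computed from a small table plus a low-degree polynomial in double precision per lane. A scalar sketch of the identity it builds on, using libm rather than the deleted tables (valid for finite x > 0 away from the special cases the real code funnels to scalar powf):

#include <math.h>

static float
powf_sketch (float x, float y)
{
  int k;
  double z = frexp ((double) x, &k);   /* x = z * 2^k, z in [0.5, 1) */
  double log2x = log2 (z) + k;         /* table + poly in the kernel */
  return (float) exp2 ((double) y * log2x);
}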
- * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* 1.886 ulp error */ - 0x1.5b2e76p-19f, - -0x1.9f42eap-13f, - 0x1.110df4p-7f, - -0x1.555548p-3f, -}; -#define Pi1 v_f32 (0x1.921fb6p+1f) -#define Pi2 v_f32 (-0x1.777a5cp-24f) -#define Pi3 v_f32 (-0x1.ee59dap-49f) -#define A3 v_f32 (Poly[3]) -#define A5 v_f32 (Poly[2]) -#define A7 v_f32 (Poly[1]) -#define A9 v_f32 (Poly[0]) -#define RangeVal v_f32 (0x1p20f) -#define InvPi v_f32 (0x1.45f306p-2f) -#define Shift v_f32 (0x1.8p+23f) -#define AbsMask v_u32 (0x7fffffff) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) -{ - /* Fall back to scalar code. */ - return v_call_f32 (sinf, x, y, cmp); -} - -VPCS_ATTR -v_f32_t -V_NAME(sinf) (v_f32_t x) -{ - v_f32_t n, r, r2, y; - v_u32_t sign, odd, cmp; - - r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask); - sign = v_as_u32_f32 (x) & ~AbsMask; - cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal)); - - /* n = rint(|x|/pi) */ - n = v_fma_f32 (InvPi, r, Shift); - odd = v_as_u32_f32 (n) << 31; - n -= Shift; - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ - r = v_fma_f32 (-Pi1, n, r); - r = v_fma_f32 (-Pi2, n, r); - r = v_fma_f32 (-Pi3, n, r); - - /* y = sin(r) */ - r2 = r * r; - y = v_fma_f32 (A9, r2, A7); - y = v_fma_f32 (y, r2, A5); - y = v_fma_f32 (y, r2, A3); - y = v_fma_f32 (y * r2, r, r); - - /* sign fix */ - y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign ^ odd); - - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/math/vn_cos.c b/math/vn_cos.c deleted file mode 100644 index b57a549eba68b3c9dba8a4f06a68fb80c73352c1..0000000000000000000000000000000000000000 --- a/math/vn_cos.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_cos. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_cos, _ZGVnN2v_cos) -#include "v_cos.c" -#endif diff --git a/math/vn_cosf.c b/math/vn_cosf.c deleted file mode 100644 index 6321d4620fa700ece0d12e0ccd2445fbd4a299ec..0000000000000000000000000000000000000000 --- a/math/vn_cosf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_cosf. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_cosf, _ZGVnN4v_cosf) -#include "v_cosf.c" -#endif diff --git a/math/vn_exp.c b/math/vn_exp.c deleted file mode 100644 index 06e269d41766bbc7040fdd92cde5782142db0d57..0000000000000000000000000000000000000000 --- a/math/vn_exp.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_exp. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_exp, _ZGVnN2v_exp) -#include "v_exp.c" -#endif diff --git a/math/vn_exp2f.c b/math/vn_exp2f.c deleted file mode 100644 index db9707e86f16f94ce8d05149a58efd6fa518de14..0000000000000000000000000000000000000000 --- a/math/vn_exp2f.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_exp2f. - * - * Copyright (c) 2019, Arm Limited. 
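The v_sin and v_sinf kernels deleted above share the same range-reduction scheme: round |x| to the nearest multiple of pi, then subtract that multiple using a three-term split of pi so the result stays accurate near zero. A scalar sketch with the constants copied from the deleted double-precision source (valid below the RangeVal bound, 0x1p23, beyond which the real code falls back to scalar sin):

#include <math.h>

static double
sin_sketch (double x)
{
  const double InvPi = 0x1.45f306dc9c883p-2;
  const double Pi1 = 0x1.921fb54442d18p+1;   /* pi, high part */
  const double Pi2 = 0x1.1a62633145c06p-53;  /* pi, mid part  */
  const double Pi3 = 0x1.c1cd129024e09p-106; /* pi, low part  */

  double r = fabs (x);
  double n = rint (r * InvPi);   /* nearest multiple of pi */
  r = fma (-Pi1, n, r);          /* r = |x| - n*pi, subtracted in   */
  r = fma (-Pi2, n, r);          /* three steps so cancellation     */
  r = fma (-Pi3, n, r);          /* error stays tiny near zero      */

  double y = sin (r);            /* odd polynomial in the real code */
  if ((long long) n & 1)         /* odd multiple of pi flips sign   */
    y = -y;
  return x < 0 ? -y : y;
}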
- * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_exp2f, _ZGVnN4v_exp2f) -#include "v_exp2f.c" -#endif diff --git a/math/vn_exp2f_1u.c b/math/vn_exp2f_1u.c deleted file mode 100644 index 17bd0abd7a60450f157462def7fb66b450044a75..0000000000000000000000000000000000000000 --- a/math/vn_exp2f_1u.c +++ /dev/null @@ -1,11 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_exp2f_1u. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#include "v_exp2f_1u.c" -#endif diff --git a/math/vn_expf.c b/math/vn_expf.c deleted file mode 100644 index 0652907225d94898aa9034b86bb2b361e0ea3586..0000000000000000000000000000000000000000 --- a/math/vn_expf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_expf. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf) -#include "v_expf.c" -#endif diff --git a/math/vn_expf_1u.c b/math/vn_expf_1u.c deleted file mode 100644 index 3be7768148225aa7756bd5f19a2dd026ab2d35f5..0000000000000000000000000000000000000000 --- a/math/vn_expf_1u.c +++ /dev/null @@ -1,11 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_expf_1u. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#include "v_expf_1u.c" -#endif diff --git a/math/vn_log.c b/math/vn_log.c deleted file mode 100644 index b58fe8ff820a7bb49aafb18d0d287c45d35f6aff..0000000000000000000000000000000000000000 --- a/math/vn_log.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_log. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_log, _ZGVnN2v_log) -#include "v_log.c" -#endif diff --git a/math/vn_logf.c b/math/vn_logf.c deleted file mode 100644 index cc5b8ae3ed55fec377883dd1dfabb4c678e3c48e..0000000000000000000000000000000000000000 --- a/math/vn_logf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_logf. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_logf, _ZGVnN4v_logf) -#include "v_logf.c" -#endif diff --git a/math/vn_pow.c b/math/vn_pow.c deleted file mode 100644 index 260950113b04016a2b8425b6a6333be1830248c1..0000000000000000000000000000000000000000 --- a/math/vn_pow.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_pow. - * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_pow, _ZGVnN2vv_pow) -#include "v_pow.c" -#endif diff --git a/math/vn_powf.c b/math/vn_powf.c deleted file mode 100644 index 095d07e337ad27d26699a4159be158a756e2d79a..0000000000000000000000000000000000000000 --- a/math/vn_powf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_powf. - * - * Copyright (c) 2019, Arm Limited. 
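The vn_*.c wrappers deleted above exist only to export each kernel under its AArch64 vector function ABI name: _ZGVnN2v_* for 2-lane double, _ZGVnN4v_* for 4-lane float, with "vv" marking two vector arguments. A hedged sketch of how a compiler can consume such symbols; the pragma and the -fopenmp-simd style of enabling it are toolchain assumptions, not part of this repository:

/* Declare that a 2-lane AdvSIMD variant of sin exists; a conforming
   compiler may then emit calls to _ZGVnN2v_sin from this loop. */
#pragma omp declare simd simdlen(2) notinbranch
double sin (double);

void
apply_sin (double *a, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = sin (a[i]);
}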
- * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_powf, _ZGVnN4vv_powf) -#include "v_powf.c" -#endif diff --git a/math/vn_sin.c b/math/vn_sin.c deleted file mode 100644 index 905c7962335029212e84676883f9e275b06c56a4..0000000000000000000000000000000000000000 --- a/math/vn_sin.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_sin. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_sin, _ZGVnN2v_sin) -#include "v_sin.c" -#endif diff --git a/math/vn_sinf.c b/math/vn_sinf.c deleted file mode 100644 index 1214e1a556385b12e1e90bf74ed3e5828f8182d5..0000000000000000000000000000000000000000 --- a/math/vn_sinf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_sinf. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_sinf, _ZGVnN4v_sinf) -#include "v_sinf.c" -#endif diff --git a/networking/Dir.mk b/networking/Dir.mk index b49610341171f43700b2af195fe7b4c7f2402af7..2589e0a1f91c47b76a50bf78e1c7aa01d3ec495f 100644 --- a/networking/Dir.mk +++ b/networking/Dir.mk @@ -1,7 +1,7 @@ # Makefile fragment - requires GNU make # # Copyright (c) 2019-2020, Arm Limited. -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception S := $(srcdir)/networking B := build/networking diff --git a/networking/aarch64/chksum_simd.c b/networking/aarch64/chksum_simd.c index 6d5be58b1f32d1d49482129a62d7c40e715f9d4f..90c00eb7cabe5a0f3e28b6e8f94c17e9f5750334 100644 --- a/networking/aarch64/chksum_simd.c +++ b/networking/aarch64/chksum_simd.c @@ -2,7 +2,7 @@ * AArch64-specific checksum implementation using NEON * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "networking.h" diff --git a/networking/arm/chksum_simd.c b/networking/arm/chksum_simd.c index 7f69adfc963c375221bf1d661f2b6f37e5fc56c9..ae08fe5dd0566632cfffdcf245c4d3915884cbd3 100644 --- a/networking/arm/chksum_simd.c +++ b/networking/arm/chksum_simd.c @@ -2,7 +2,7 @@ * Armv7-A specific checksum implementation using NEON * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "networking.h" diff --git a/networking/chksum.c b/networking/chksum.c index 95ce5baa94e43e9008e2b0750713cf0efb77e7ed..329482ffdcee963b4deed851ce56af0f0748b6b8 100644 --- a/networking/chksum.c +++ b/networking/chksum.c @@ -3,7 +3,7 @@ * This sum is often used as a simple checksum in networking. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "networking.h" diff --git a/networking/chksum_common.h b/networking/chksum_common.h index 958c8cc0742e7fb2b58e2bda236f836f69715ee9..16f0f6c11df7015ed0a87e0032685a69c74c154f 100644 --- a/networking/chksum_common.h +++ b/networking/chksum_common.h @@ -2,7 +2,7 @@ * Common code for checksum implementations * * Copyright (c) 2020, Arm Limited. 
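The networking files relicensed above compute a ones' complement sum over 16-bit words, the primitive behind the Internet checksum. A minimal portable sketch of that sum (RFC 1071 style; this is the generic word-at-a-time form, not the repository's SIMD-optimized implementation, and it assumes an even byte count):

#include <stddef.h>
#include <stdint.h>

static uint16_t
ones_complement_sum (const uint16_t *words, size_t n)
{
  uint32_t sum = 0;
  for (size_t i = 0; i < n; i++)
    sum += words[i];
  while (sum >> 16)                      /* fold end-around carries */
    sum = (sum & 0xffff) + (sum >> 16);
  return (uint16_t) sum;
}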
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef CHKSUM_COMMON_H diff --git a/networking/include/networking.h b/networking/include/networking.h index a88feff883394ef5c4d7bb840813d5af7f584e90..297dd4bfab0234ceabf663f5e39552b1e08f63ac 100644 --- a/networking/include/networking.h +++ b/networking/include/networking.h @@ -2,7 +2,7 @@ * Public API. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ unsigned short __chksum (const void *, unsigned int); diff --git a/networking/test/chksum.c b/networking/test/chksum.c index 41b98120f2758b54b8d13122caffb00224cc3139..239b5b88777be2a4870b4fd65fc29ddadc5ba11a 100644 --- a/networking/test/chksum.c +++ b/networking/test/chksum.c @@ -2,7 +2,7 @@ * Ones' complement checksum test & benchmark * * Copyright (c) 2016-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define _GNU_SOURCE diff --git a/string/Dir.mk b/string/Dir.mk index cf3453f7580d381464b4ebb5eacfe1306a427822..40ff5acc093e9d042afdcb6748aa540da6970816 100644 --- a/string/Dir.mk +++ b/string/Dir.mk @@ -1,7 +1,7 @@ # Makefile fragment - requires GNU make # # Copyright (c) 2019-2021, Arm Limited. -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception S := $(srcdir)/string B := build/string diff --git a/string/README.contributors b/string/README.contributors new file mode 100644 index 0000000000000000000000000000000000000000..0b4a51b563669a48e24d35135eb4ef50293ef2af --- /dev/null +++ b/string/README.contributors @@ -0,0 +1,30 @@ +STYLE REQUIREMENTS +================== + +1. Most code in this sub-directory is expected to be upstreamed into glibc so + the GNU Coding Standard and glibc specific conventions should be followed + to ease upstreaming. + +2. ABI and symbols: the code should be written so it is suitable for inclusion + into a libc with minimal changes. This e.g. means that internal symbols + should be hidden and in the implementation reserved namespace according to + ISO C and POSIX rules. If possible the built shared libraries and static + library archives should be usable to override libc symbols at link time (or + at runtime via LD_PRELOAD). This requires the symbols to follow the glibc ABI + (other than symbol versioning), this cannot be done reliably for static + linking so this is a best effort requirement. + +3. API: include headers should be suitable for benchmarking and testing code + and should not conflict with libc headers. + + +CONTRIBUTION GUIDELINES FOR string SUB-DIRECTORY +================================================ +1. Code: + - The assumptions of the code must be clearly documented. + + - Assembly style should be consistent across different implementations. + + +2. Performance: + - Benchmarking is needed on several microarchitectures. diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S index 84339f73cf23770b991c15e62eaba4b186a3201e..207e22950c6d3c4e42c20460cf4a3b1d7fde9eec 100644 --- a/string/aarch64/__mtag_tag_region.S +++ b/string/aarch64/__mtag_tag_region.S @@ -1,8 +1,8 @@ /* * __mtag_tag_region - tag memory * - * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2021-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -15,7 +15,7 @@ * The memory region may remain untagged if tagging is not enabled. */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_MEMORY_TAGGING diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S index f58364ca6fcb8c11b548b4288efdd21c716d5866..44b8e0114f4265d1ba02acb0a8622ca27a9a6973 100644 --- a/string/aarch64/__mtag_tag_zero_region.S +++ b/string/aarch64/__mtag_tag_zero_region.S @@ -1,8 +1,8 @@ /* * __mtag_tag_zero_region - tag memory and fill it with zero bytes * - * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -15,7 +15,7 @@ * The memory region may remain untagged if tagging is not enabled. */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_MEMORY_TAGGING diff --git a/string/aarch64/asmdefs.h b/string/aarch64/asmdefs.h new file mode 100644 index 0000000000000000000000000000000000000000..131b95e1fea98f789a678ce075846425cf0a24e6 --- /dev/null +++ b/string/aarch64/asmdefs.h @@ -0,0 +1,106 @@ +/* + * Macros for asm code. AArch64 version. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _ASMDEFS_H +#define _ASMDEFS_H + +/* Branch Target Identification support. */ +#define BTI_C hint 34 +#define BTI_J hint 36 +/* Return address signing support (pac-ret). */ +#define PACIASP hint 25; .cfi_window_save +#define AUTIASP hint 29; .cfi_window_save + +/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ +#define FEATURE_1_AND 0xc0000000 +#define FEATURE_1_BTI 1 +#define FEATURE_1_PAC 2 + +/* Add a NT_GNU_PROPERTY_TYPE_0 note. */ +#ifdef __ILP32__ +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a"; \ + .p2align 2; \ + .word 4; \ + .word 12; \ + .word 5; \ + .asciz "GNU"; \ + .word type; \ + .word 4; \ + .word value; \ + .text +#else +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a"; \ + .p2align 3; \ + .word 4; \ + .word 16; \ + .word 5; \ + .asciz "GNU"; \ + .word type; \ + .word 4; \ + .word value; \ + .word 0; \ + .text +#endif + +/* If set then the GNU Property Note section will be added to + mark objects to support BTI and PAC-RET. */ +#ifndef WANT_GNU_PROPERTY +#define WANT_GNU_PROPERTY 1 +#endif + +#if WANT_GNU_PROPERTY +/* Add property note with supported features to all asm files.
*/ +GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) +#endif + +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type name,%function; \ + .align alignment; \ + name: \ + .cfi_startproc; \ + BTI_C; + +#define ENTRY(name) ENTRY_ALIGN(name, 6) + +#define ENTRY_ALIAS(name) \ + .global name; \ + .type name,%function; \ + name: + +#define END(name) \ + .cfi_endproc; \ + .size name, .-name; + +#define L(l) .L ## l + +#ifdef __ILP32__ + /* Sanitize padding bits of pointer arguments as per aapcs64 */ +#define PTR_ARG(n) mov w##n, w##n +#else +#define PTR_ARG(n) +#endif + +#ifdef __ILP32__ + /* Sanitize padding bits of size arguments as per aapcs64 */ +#define SIZE_ARG(n) mov w##n, w##n +#else +#define SIZE_ARG(n) +#endif + +/* Compiler supports SVE instructions */ +#ifndef HAVE_SVE +# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5) +# define HAVE_SVE 1 +# else +# define HAVE_SVE 0 +# endif +#endif + +#endif diff --git a/string/aarch64/check-arch.S b/string/aarch64/check-arch.S index 5a54242d7de62303fe852f099f7025f32eac9f63..131b7fa36ec2dda1154e4435f53c5f45d6af1baf 100644 --- a/string/aarch64/check-arch.S +++ b/string/aarch64/check-arch.S @@ -1,8 +1,8 @@ /* * check ARCH setting. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if !__aarch64__ @@ -10,4 +10,4 @@ #endif /* Include for GNU property notes. */ -#include "../asmdefs.h" +#include "asmdefs.h" diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S index c2e967d1004e06e372725f5cc8ddb95aeb629aa2..948c3cbc7dd43a773d035c9fcf364d994fe3b5a8 100644 --- a/string/aarch64/memchr-mte.S +++ b/string/aarch64/memchr-mte.S @@ -1,8 +1,8 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -23,25 +23,21 @@ #define synd x5 #define shift x6 #define tmp x7 -#define wtmp w7 #define vrepchr v0 #define qdata q1 #define vdata v1 #define vhas_chr v2 -#define vrepmask v3 -#define vend v4 -#define dend d4 +#define vend v3 +#define dend d3 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. 
*/ ENTRY (__memchr_aarch64_mte) PTR_ARG (0) @@ -50,55 +46,53 @@ ENTRY (__memchr_aarch64_mte) cbz cntin, L(nomatch) ld1 {vdata.16b}, [src] dup vrepchr.16b, chrin - mov wtmp, 0xf00f - dup vrepmask.8h, wtmp cmeq vhas_chr.16b, vdata.16b, vrepchr.16b lsl shift, srcin, 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(start_loop) rbit synd, synd clz synd, synd - add result, srcin, synd, lsr 2 cmp cntin, synd, lsr 2 + add result, srcin, synd, lsr 2 csel result, result, xzr, hi ret + .p2align 3 L(start_loop): sub tmp, src, srcin - add tmp, tmp, 16 + add tmp, tmp, 17 subs cntrem, cntin, tmp - b.ls L(nomatch) + b.lo L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - add tmp, cntrem, 15 - tbnz tmp, 4, L(loop32_2) - + tbz cntrem, 4, L(loop32_2) + sub src, src, 16 .p2align 4 L(loop32): - ldr qdata, [src, 16]! + ldr qdata, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src, 16]! - subs cntrem, cntrem, 32 + ldr qdata, [src, 16] cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - b.ls L(end) + subs cntrem, cntrem, 32 + b.lo L(end_2) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) +L(end_2): + add src, src, 16 L(end): - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + sub cntrem, src, srcin fmov synd, dend - add tmp, srcin, cntin - sub cntrem, tmp, src + sub cntrem, cntin, cntrem #ifndef __AARCH64EB__ rbit synd, synd #endif diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S index c22e6596f19bdde2e6ced26a3ca11e99c0c5b7f5..b851cf31f2383e874c96b24ba82006d82e52f060 100644 --- a/string/aarch64/memchr-sve.S +++ b/string/aarch64/memchr-sve.S @@ -1,11 +1,11 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S index 353f0d1eac53098f8b8e921d12af1404ec2cf96c..fe6cfe2bc0e28d56100536ec25186f0543b03897 100644 --- a/string/aarch64/memchr.S +++ b/string/aarch64/memchr.S @@ -1,8 +1,8 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S index 78c5ecaa4cdcba0b826d62369d40f18afa8313d9..d52ce4555344e5b2fcca1ebcbc8b99651c0097fb 100644 --- a/string/aarch64/memcmp-sve.S +++ b/string/aarch64/memcmp-sve.S @@ -1,11 +1,11 @@ /* * memcmp - compare memory * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. 
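The nibble-mask scheme described in the memchr-mte comment above replaces the old vrepmask/and/addp reduction with a single shift-right-and-narrow. In NEON intrinsics, one 16-byte step looks roughly like this (a sketch of the technique, not the deleted code path):

#include <arm_neon.h>
#include <stdint.h>

static inline uint64_t
match_mask (const uint8_t *p, uint8_t c)
{
  uint8x16_t data = vld1q_u8 (p);
  uint8x16_t eq = vceqq_u8 (data, vdupq_n_u8 (c)); /* 0xff where p[i]==c */
  /* shrn: view as eight u16 lanes, shift right by 4 and narrow to eight
     bytes, so every input byte contributes exactly 4 bits of the mask. */
  uint8x8_t nibbles = vshrn_n_u16 (vreinterpretq_u16_u8 (eq), 4);
  return vget_lane_u64 (vreinterpret_u64_u8 (nibbles), 0);
}

/* On little-endian, the first match sits at byte __builtin_ctzll (mask) >> 2,
   matching the rbit+clz sequence in the assembly. */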
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S index 3b1026642eee805ca31d7f88b13eac082ce4b726..35135e72cc8e5324ade0a2443dc17fa1098142d6 100644 --- a/string/aarch64/memcmp.S +++ b/string/aarch64/memcmp.S @@ -1,103 +1,84 @@ /* memcmp - compare memory * - * Copyright (c) 2013-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2013-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses. + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. */ -#include "../asmdefs.h" +#include "asmdefs.h" -/* Parameters and result. */ -#define src1 x0 -#define src2 x1 -#define limit x2 -#define result w0 +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result w0 + +#define data1 x3 +#define data1w w3 +#define data2 x4 +#define data2w w4 +#define data3 x5 +#define data3w w5 +#define data4 x6 +#define data4w w6 +#define tmp x6 +#define src1end x7 +#define src2end x8 -/* Internal variables. */ -#define data1 x3 -#define data1w w3 -#define data1h x4 -#define data2 x5 -#define data2w w5 -#define data2h x6 -#define tmp1 x7 -#define tmp2 x8 ENTRY (__memcmp_aarch64) PTR_ARG (0) PTR_ARG (1) SIZE_ARG (2) - subs limit, limit, 8 - b.lo L(less8) - - ldr data1, [src1], 8 - ldr data2, [src2], 8 - cmp data1, data2 - b.ne L(return) - - subs limit, limit, 8 - b.gt L(more16) - ldr data1, [src1, limit] - ldr data2, [src2, limit] - b L(return) - -L(more16): - ldr data1, [src1], 8 - ldr data2, [src2], 8 - cmp data1, data2 - bne L(return) - - /* Jump directly to comparing the last 16 bytes for 32 byte (or less) - strings. */ - subs limit, limit, 16 + cmp limit, 16 + b.lo L(less16) + ldp data1, data3, [src1] + ldp data2, data4, [src2] + ccmp data1, data2, 0, ne + ccmp data3, data4, 0, eq + b.ne L(return2) + + add src1end, src1, limit + add src2end, src2, limit + cmp limit, 32 b.ls L(last_bytes) + cmp limit, 160 + b.hs L(loop_align) + sub limit, limit, 32 - /* We overlap loads between 0-32 bytes at either side of SRC1 when we - try to align, so limit it only to strings larger than 128 bytes. */ - cmp limit, 96 - b.ls L(loop16) - - /* Align src1 and adjust src2 with bytes not yet done. */ - and tmp1, src1, 15 - add limit, limit, tmp1 - sub src1, src1, tmp1 - sub src2, src2, tmp1 - - /* Loop performing 16 bytes per iteration using aligned src1. - Limit is pre-decremented by 16 and must be larger than zero. - Exit if <= 16 bytes left to do or if the data is not equal. */ .p2align 4 -L(loop16): - ldp data1, data1h, [src1], 16 - ldp data2, data2h, [src2], 16 - subs limit, limit, 16 - ccmp data1, data2, 0, hi - ccmp data1h, data2h, 0, eq - b.eq L(loop16) - +L(loop32): + ldp data1, data3, [src1, 16] + ldp data2, data4, [src2, 16] cmp data1, data2 - bne L(return) - mov data1, data1h - mov data2, data2h + ccmp data3, data4, 0, eq + b.ne L(return2) + cmp limit, 16 + b.ls L(last_bytes) + + ldp data1, data3, [src1, 32] + ldp data2, data4, [src2, 32] cmp data1, data2 - bne L(return) + ccmp data3, data4, 0, eq + b.ne L(return2) + add src1, src1, 32 + add src2, src2, 32 +L(last64): + subs limit, limit, 32 + b.hi L(loop32) /* Compare last 1-16 bytes using unaligned access. 
*/ L(last_bytes): - add src1, src1, limit - add src2, src2, limit - ldp data1, data1h, [src1] - ldp data2, data2h, [src2] - cmp data1, data2 - bne L(return) - mov data1, data1h - mov data2, data2h + ldp data1, data3, [src1end, -16] + ldp data2, data4, [src2end, -16] +L(return2): cmp data1, data2 + csel data1, data1, data3, ne + csel data2, data2, data4, ne /* Compare data bytes and set return value to 0, -1 or 1. */ L(return): @@ -105,33 +86,105 @@ L(return): rev data1, data1 rev data2, data2 #endif - cmp data1, data2 -L(ret_eq): + cmp data1, data2 cset result, ne cneg result, result, lo ret .p2align 4 - /* Compare up to 8 bytes. Limit is [-8..-1]. */ +L(less16): + add src1end, src1, limit + add src2end, src2, limit + tbz limit, 3, L(less8) + ldr data1, [src1] + ldr data2, [src2] + ldr data3, [src1end, -8] + ldr data4, [src2end, -8] + b L(return2) + + .p2align 4 L(less8): - adds limit, limit, 4 - b.lo L(less4) - ldr data1w, [src1], 4 - ldr data2w, [src2], 4 + tbz limit, 2, L(less4) + ldr data1w, [src1] + ldr data2w, [src2] + ldr data3w, [src1end, -4] + ldr data4w, [src2end, -4] + b L(return2) + +L(less4): + tbz limit, 1, L(less2) + ldrh data1w, [src1] + ldrh data2w, [src2] cmp data1w, data2w b.ne L(return) - sub limit, limit, 4 -L(less4): - adds limit, limit, 4 - beq L(ret_eq) -L(byte_loop): - ldrb data1w, [src1], 1 - ldrb data2w, [src2], 1 - subs limit, limit, 1 - ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ - b.eq L(byte_loop) +L(less2): + mov result, 0 + tbz limit, 0, L(return_zero) + ldrb data1w, [src1end, -1] + ldrb data2w, [src2end, -1] sub result, data1w, data2w +L(return_zero): ret -END (__memcmp_aarch64) +L(loop_align): + ldp data1, data3, [src1, 16] + ldp data2, data4, [src2, 16] + cmp data1, data2 + ccmp data3, data4, 0, eq + b.ne L(return2) + + /* Align src2 and adjust src1, src2 and limit. */ + and tmp, src2, 15 + sub tmp, tmp, 16 + sub src2, src2, tmp + add limit, limit, tmp + sub src1, src1, tmp + sub limit, limit, 64 + 16 + + .p2align 4 +L(loop64): + ldr q0, [src1, 16] + ldr q1, [src2, 16] + subs limit, limit, 64 + ldr q2, [src1, 32] + ldr q3, [src2, 32] + eor v0.16b, v0.16b, v1.16b + eor v1.16b, v2.16b, v3.16b + ldr q2, [src1, 48] + ldr q3, [src2, 48] + umaxp v0.16b, v0.16b, v1.16b + ldr q4, [src1, 64]! + ldr q5, [src2, 64]! + eor v1.16b, v2.16b, v3.16b + eor v2.16b, v4.16b, v5.16b + umaxp v1.16b, v1.16b, v2.16b + umaxp v0.16b, v0.16b, v1.16b + umaxp v0.16b, v0.16b, v0.16b + fmov tmp, d0 + ccmp tmp, 0, 0, hi + b.eq L(loop64) + + /* If equal, process last 1-64 bytes using scalar loop. */ + add limit, limit, 64 + 16 + cbz tmp, L(last64) + + /* Determine the 8-byte aligned offset of the first difference. */ +#ifdef __AARCH64EB__ + rev16 tmp, tmp +#endif + rev tmp, tmp + clz tmp, tmp + bic tmp, tmp, 7 + sub tmp, tmp, 48 + ldr data1, [src1, tmp] + ldr data2, [src2, tmp] +#ifndef __AARCH64EB__ + rev data1, data1 + rev data2, data2 +#endif + mov result, 1 + cmp data1, data2 + cneg result, result, lo + ret +END (__memcmp_aarch64) diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S index f97f2c3047b96e489ff97395173f2069469144e0..e6527d0dac2c48c3b313f25a5d61314df87871e0 100644 --- a/string/aarch64/memcpy-advsimd.S +++ b/string/aarch64/memcpy-advsimd.S @@ -1,8 +1,8 @@ /* * memcpy - copy memory area * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. 
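The rewritten memcmp above resolves a mismatch by byte-reversing both words so the first differing byte becomes the most significant, then doing a single unsigned compare. A scalar sketch of that return-value computation for the little-endian case (the asm produces -1/0/1 via cset/cneg):

#include <stdint.h>

static int
cmp_words (uint64_t data1, uint64_t data2)
{
  data1 = __builtin_bswap64 (data1);   /* the 'rev' instructions */
  data2 = __builtin_bswap64 (data2);
  if (data1 == data2)
    return 0;
  return data1 < data2 ? -1 : 1;
}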
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define src x1 diff --git a/string/aarch64/memcpy-mops.S b/string/aarch64/memcpy-mops.S new file mode 100644 index 0000000000000000000000000000000000000000..b45c31418717cd1e5cc7f29dd42aceab31d784c8 --- /dev/null +++ b/string/aarch64/memcpy-mops.S @@ -0,0 +1,21 @@ +/* + * memcpy using MOPS extension. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "asmdefs.h" + +ENTRY (__memcpy_aarch64_mops) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + + mov x3, x0 + .inst 0x19010443 /* cpyfp [x3]!, [x1]!, x2! */ + .inst 0x19410443 /* cpyfm [x3]!, [x1]!, x2! */ + .inst 0x19810443 /* cpyfe [x3]!, [x1]!, x2! */ + ret + +END (__memcpy_aarch64_mops) diff --git a/string/aarch64/memcpy-sve.S b/string/aarch64/memcpy-sve.S new file mode 100644 index 0000000000000000000000000000000000000000..e8a946d7db37f44fa8b819be5cf81fe0ee5f719d --- /dev/null +++ b/string/aarch64/memcpy-sve.S @@ -0,0 +1,177 @@ +/* + * memcpy - copy memory area + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses. + * + */ + +#include "asmdefs.h" + +#ifdef HAVE_SVE + +.arch armv8-a+sve + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define tmp1 x6 +#define vlen x6 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 +#define H_q q7 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + SVE vectors are used to speedup small copies. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The source pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. +*/ + +ENTRY_ALIAS (__memmove_aarch64_sve) +ENTRY (__memcpy_aarch64_sve) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + + cmp count, 128 + b.hi L(copy_long) + cntb vlen + cmp count, vlen, lsl 1 + b.hi L(copy32_128) + + whilelo p0.b, xzr, count + whilelo p1.b, vlen, count + ld1b z0.b, p0/z, [src, 0, mul vl] + ld1b z1.b, p1/z, [src, 1, mul vl] + st1b z0.b, p0, [dstin, 0, mul vl] + st1b z1.b, p1, [dstin, 1, mul vl] + ret + + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + add srcend, src, count + add dstend, dstin, count + ldp A_q, B_q, [src] + ldp C_q, D_q, [srcend, -32] + cmp count, 64 + b.hi L(copy128) + stp A_q, B_q, [dstin] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_q, F_q, [src, 32] + cmp count, 96 + b.ls L(copy96) + ldp G_q, H_q, [srcend, -64] + stp G_q, H_q, [dstend, -64] +L(copy96): + stp A_q, B_q, [dstin] + stp E_q, F_q, [dstin, 32] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy more than 128 bytes. */ +L(copy_long): + add srcend, src, count + add dstend, dstin, count + + /* Use backwards copy if there is an overlap. 
*/ + sub tmp1, dstin, src + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align src to 16-byte alignment. */ + ldr D_q, [src] + and tmp1, src, 15 + bic src, src, 15 + sub dst, dstin, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_q, B_q, [src, 16] + str D_q, [dstin] + ldp C_q, D_q, [src, 48] + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end) +L(loop64): + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [src, 80] + stp C_q, D_q, [dst, 48] + ldp C_q, D_q, [src, 112] + add src, src, 64 + add dst, dst, 64 + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_q, F_q, [srcend, -64] + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [srcend, -32] + stp C_q, D_q, [dst, 48] + stp E_q, F_q, [dstend, -64] + stp A_q, B_q, [dstend, -32] + ret + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align srcend to 16-byte alignment. */ +L(copy_long_backwards): + cbz tmp1, L(return) + ldr D_q, [srcend, -16] + and tmp1, srcend, 15 + bic srcend, srcend, 15 + sub count, count, tmp1 + ldp A_q, B_q, [srcend, -32] + str D_q, [dstend, -16] + ldp C_q, D_q, [srcend, -64] + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + str B_q, [dstend, -16] + str A_q, [dstend, -32] + ldp A_q, B_q, [srcend, -96] + str D_q, [dstend, -48] + str C_q, [dstend, -64]! + ldp C_q, D_q, [srcend, -128] + sub srcend, srcend, 64 + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. */ +L(copy64_from_start): + ldp E_q, F_q, [src, 32] + stp A_q, B_q, [dstend, -32] + ldp A_q, B_q, [src] + stp C_q, D_q, [dstend, -64] + stp E_q, F_q, [dstin, 32] + stp A_q, B_q, [dstin] +L(return): + ret + +END (__memcpy_aarch64_sve) + +#endif diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S index 8a967cdf4d2b5c014ce0737c19e4884297cd18b7..2b1a592feb39b5c831a908bda3f42bf3f9fc44ab 100644 --- a/string/aarch64/memcpy.S +++ b/string/aarch64/memcpy.S @@ -1,8 +1,8 @@ /* * memcpy - copy memory area * - * Copyright (c) 2012-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define src x1 diff --git a/string/aarch64/memmove-mops.S b/string/aarch64/memmove-mops.S new file mode 100644 index 0000000000000000000000000000000000000000..6c73017bb16f00ded1eaaaa5bf61fe9e68de5e9c --- /dev/null +++ b/string/aarch64/memmove-mops.S @@ -0,0 +1,21 @@ +/* + * memmove using MOPS extension. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "asmdefs.h" + +ENTRY (__memmove_aarch64_mops) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + + mov x3, x0 + .inst 0x1d010443 /* cpyp [x3]!, [x1]!, x2! */ + .inst 0x1d410443 /* cpym [x3]!, [x1]!, x2! */ + .inst 0x1d810443 /* cpye [x3]!, [x1]!, x2! */ + ret + +END (__memmove_aarch64_mops) diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S index 7b4be847cecbf93820be6ca931cf6b4569bf382f..6418bdf56f414880540632cd8c8257ed3d95d6d2 100644 --- a/string/aarch64/memrchr.S +++ b/string/aarch64/memrchr.S @@ -1,8 +1,8 @@ /* * memrchr - find last character in a memory zone. * - * Copyright (c) 2020, Arm Limited. 
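The new memcpy-sve above uses predicated loads and stores so that copies of up to two vectors need no scalar tail loop. The whilelo/ld1b/st1b sequence maps to ACLE intrinsics roughly as follows (a sketch assuming <arm_sve.h> and SVE codegen are available):

#include <arm_sve.h>
#include <stddef.h>
#include <stdint.h>

static void
sve_small_copy (uint8_t *dst, const uint8_t *src, size_t count)
{
  uint64_t vl = svcntb ();   /* bytes per SVE vector, unknown at build */
  /* Predicates cover lanes [0, count) and [vl, count); inactive lanes
     load zero and store nothing, so any count <= 2*vl is handled. */
  svbool_t p0 = svwhilelt_b8_u64 (0, count);
  svbool_t p1 = svwhilelt_b8_u64 (vl, count);
  svst1_u8 (p0, dst, svld1_u8 (p0, src));
  svst1_u8 (p1, dst + vl, svld1_u8 (p1, src + vl));
}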
- * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -23,7 +23,6 @@ #define synd x5 #define shift x6 #define tmp x7 -#define wtmp w7 #define end x8 #define endm1 x9 @@ -31,19 +30,16 @@ #define qdata q1 #define vdata v1 #define vhas_chr v2 -#define vrepmask v3 -#define vend v4 -#define dend d4 +#define vend v3 +#define dend d3 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. */ ENTRY (__memrchr_aarch64) PTR_ARG (0) @@ -53,12 +49,9 @@ ENTRY (__memrchr_aarch64) cbz cntin, L(nomatch) ld1 {vdata.16b}, [src] dup vrepchr.16b, chrin - mov wtmp, 0xf00f - dup vrepmask.8h, wtmp cmeq vhas_chr.16b, vdata.16b, vrepchr.16b neg shift, end, lsl 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsl synd, synd, shift cbz synd, L(start_loop) @@ -69,34 +62,36 @@ ENTRY (__memrchr_aarch64) csel result, result, xzr, hi ret + nop L(start_loop): - sub tmp, end, src - subs cntrem, cntin, tmp + subs cntrem, src, srcin b.ls L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - add tmp, cntrem, 15 - tbnz tmp, 4, L(loop32_2) + sub cntrem, cntrem, 1 + tbz cntrem, 4, L(loop32_2) + add src, src, 16 - .p2align 4 + .p2align 5 L(loop32): - ldr qdata, [src, -16]! + ldr qdata, [src, -32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src, -16]! + ldr qdata, [src, -16] subs cntrem, cntrem, 32 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - b.ls L(end) + b.lo L(end_2) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) +L(end_2): + sub src, src, 16 L(end): - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend add tmp, src, 15 diff --git a/string/aarch64/memset-mops.S b/string/aarch64/memset-mops.S new file mode 100644 index 0000000000000000000000000000000000000000..ec791493bae9c019b92374f0920edbec00b10507 --- /dev/null +++ b/string/aarch64/memset-mops.S @@ -0,0 +1,20 @@ +/* + * memset using MOPS extension. + * + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "asmdefs.h" + +ENTRY (__memset_aarch64_mops) + PTR_ARG (0) + SIZE_ARG (2) + + mov x3, x0 + .inst 0x19c10443 /* setp [x3]!, x2!, x1 */ + .inst 0x19c14443 /* setm [x3]!, x2!, x1 */ + .inst 0x19c18443 /* sete [x3]!, x2!, x1 */ + ret + +END (__memset_aarch64_mops) diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S index 9fcd97579913b025028f6728098ebd570992cb7d..553b0fcaefea5e5ae60c4ef583b80dc81f165ae6 100644 --- a/string/aarch64/memset.S +++ b/string/aarch64/memset.S @@ -1,8 +1,8 @@ /* * memset - fill memory with a constant byte * - * Copyright (c) 2012-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define val x1 diff --git a/string/aarch64/stpcpy-mte.S b/string/aarch64/stpcpy-mte.S deleted file mode 100644 index f1c7119065152def69dabaa5edfd92ada06685f1..0000000000000000000000000000000000000000 --- a/string/aarch64/stpcpy-mte.S +++ /dev/null @@ -1,10 +0,0 @@ -/* - * stpcpy - copy a string returning pointer to end. - * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#define BUILD_STPCPY 1 - -#include "strcpy-mte.S" diff --git a/string/aarch64/stpcpy-sve.S b/string/aarch64/stpcpy-sve.S index 82dd9717b0a0af44d7a14bee1ff9de16df7a6535..5d3f14b86026882d092f567f598ed46fdbe9447f 100644 --- a/string/aarch64/stpcpy-sve.S +++ b/string/aarch64/stpcpy-sve.S @@ -2,7 +2,7 @@ * stpcpy - copy a string returning pointer to end. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define BUILD_STPCPY 1 diff --git a/string/aarch64/stpcpy.S b/string/aarch64/stpcpy.S index 4f62aa46238987bbbd634b3fb794433d7bd74965..155c68d75a7b23a7c4f1be9cf864a7c1f1287ccd 100644 --- a/string/aarch64/stpcpy.S +++ b/string/aarch64/stpcpy.S @@ -2,7 +2,7 @@ * stpcpy - copy a string returning pointer to end. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define BUILD_STPCPY 1 diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S index dcb0e46258709760e7ef1c7d81e47a86457a2846..6ec08f7acc766b652cee0c340541f74ac01cebd7 100644 --- a/string/aarch64/strchr-mte.S +++ b/string/aarch64/strchr-mte.S @@ -1,8 +1,8 @@ /* * strchr - find a character in a string * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -19,8 +19,7 @@ #define src x2 #define tmp1 x1 -#define wtmp2 w3 -#define tmp3 x3 +#define tmp2 x3 #define vrepchr v0 #define vdata v1 @@ -28,39 +27,30 @@ #define vhas_nul v2 #define vhas_chr v3 #define vrepmask v4 -#define vrepmask2 v5 -#define vend v6 -#define dend d6 +#define vend v5 +#define dend d5 /* Core algorithm. For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-1 are set if the relevant byte matched the - requested character, bits 2-3 are set if the byte is NUL (or matched), and - bits 4-7 are not used and must be zero if none of bits 0-3 are set). 
Odd - bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits - in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + per byte. Bits 0-1 are set if the relevant byte matched the requested + character, bits 2-3 are set if the byte is NUL or matched. Count trailing + zeroes gives the position of the matching byte if it is a multiple of 4. + If it is not a multiple of 4, there was no match. */ ENTRY (__strchr_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin ld1 {vdata.16b}, [src] - mov wtmp2, 0x3003 - dup vrepmask.8h, wtmp2 + movi vrepmask.16b, 0x33 cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - mov wtmp2, 0xf00f - dup vrepmask2.8h, wtmp2 - bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - lsl tmp3, srcin, 2 - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ - + lsl tmp2, srcin, 2 + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov tmp1, dend - lsr tmp1, tmp1, tmp3 + lsr tmp1, tmp1, tmp2 cbz tmp1, L(loop) rbit tmp1, tmp1 @@ -74,28 +64,34 @@ ENTRY (__strchr_aarch64_mte) .p2align 4 L(loop): - ldr qdata, [src, 16]! + ldr qdata, [src, 16] + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov tmp1, dend + cbnz tmp1, L(end) + ldr qdata, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov tmp1, dend cbz tmp1, L(loop) + sub src, src, 16 +L(end): #ifdef __AARCH64EB__ bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov tmp1, dend #else bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov tmp1, dend rbit tmp1, tmp1 #endif + add src, src, 16 clz tmp1, tmp1 - /* Tmp1 is an even multiple of 2 if the target character was - found first. Otherwise we've found the end of string. */ + /* Tmp1 is a multiple of 4 if the target character was found. */ tst tmp1, 2 add result, src, tmp1, lsr 2 csel result, result, xzr, eq diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S index 13ba9f44f9c5a3dd716252b0459955cfe12c3b18..ff075167bfefb7dcf66869626c28c7d58163ab7f 100644 --- a/string/aarch64/strchr-sve.S +++ b/string/aarch64/strchr-sve.S @@ -1,11 +1,11 @@ /* * strchr/strchrnul - find a character in a string * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S index 1063cbfd77aa817ed1502e0b2c39643fb102c16b..37193bd947a73dbf7167e3b10d5ddb8e2510dd31 100644 --- a/string/aarch64/strchr.S +++ b/string/aarch64/strchr.S @@ -1,8 +1,8 @@ /* * strchr - find a character in a string * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. 
 */
-#include "../asmdefs.h"
+#include "asmdefs.h"

 /* Arguments and results.  */
 #define srcin	x0
diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S
index 1b0d0a63094c6567c3ee3654b416635f28a8acfd..543ee88bb285852eb6a7cf22cf25480f6403c98d 100644
--- a/string/aarch64/strchrnul-mte.S
+++ b/string/aarch64/strchrnul-mte.S
@@ -1,8 +1,8 @@
 /*
  * strchrnul - find a character or nul in a string
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */

 /* Assumptions:
@@ -11,7 +11,7 @@
  * MTE compatible.
  */

-#include "../asmdefs.h"
+#include "asmdefs.h"

 #define srcin	x0
 #define chrin	w1
@@ -20,38 +20,32 @@
 #define src	x2
 #define tmp1	x1
 #define tmp2	x3
-#define tmp2w	w3

 #define vrepchr	v0
 #define vdata	v1
 #define qdata	q1
 #define vhas_nul	v2
 #define vhas_chr	v3
-#define vrepmask	v4
-#define vend	v5
-#define dend	d5
+#define vend	v4
+#define dend	d4

-/* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four
+   bits per byte. We take four bits from each comparison byte using a
+   shift-right-and-narrow by 4 (shrn) instruction. Since the bits in the
+   nibble mask reflect the order in which things occur in the original
+   string, counting leading zeros (after a bit reverse on little-endian)
+   identifies exactly which byte matched.  */

 ENTRY (__strchrnul_aarch64_mte)
	PTR_ARG (0)
	bic	src, srcin, 15
	dup	vrepchr.16b, chrin
	ld1	{vdata.16b}, [src]
-	mov	tmp2w, 0xf00f
-	dup	vrepmask.8h, tmp2w
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
	lsl	tmp2, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4	/* 128->64 */
	fmov	tmp1, dend
	lsr	tmp1, tmp1, tmp2	/* Mask padding bits.  */
	cbz	tmp1, L(loop)
@@ -63,15 +57,22 @@ ENTRY (__strchrnul_aarch64_mte)
	.p2align 4
 L(loop):
-	ldr	qdata, [src, 16]!
+	ldr	qdata, [src, 16]
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b
+	fmov	tmp1, dend
+	cbnz	tmp1, L(end)
+	ldr	qdata, [src, 32]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b
	fmov	tmp1, dend
	cbz	tmp1, L(loop)
-
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
+	sub	src, src, 16
+L(end):
+	shrn	vend.8b, vhas_chr.8h, 4	/* 128->64 */
+	add	src, src, 16
	fmov	tmp1, dend
 #ifndef __AARCH64EB__
	rbit	tmp1, tmp1
 #endif
diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/strchrnul-sve.S
index 428ff1a3d008325778eccc4e9fe1ec99bfc70bb5..0005f9177514082544bc0f5f5a245ef5632430a7 100644
--- a/string/aarch64/strchrnul-sve.S
+++ b/string/aarch64/strchrnul-sve.S
@@ -2,7 +2,7 @@
  * strchrnul - find a character or nul in a string
  *
  * Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define BUILD_STRCHRNUL diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S index a4230d919b478d3001d412a7b3574f7ec94d2fb1..666e8d0304c16d4f9ebb8fa443670a40673b934b 100644 --- a/string/aarch64/strchrnul.S +++ b/string/aarch64/strchrnul.S @@ -1,8 +1,8 @@ /* * strchrnul - find a character or nul in a string * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S deleted file mode 100644 index 12d1a6b51dd3442ca89ba7994569ce9e54b0e351..0000000000000000000000000000000000000000 --- a/string/aarch64/strcmp-mte.S +++ /dev/null @@ -1,189 +0,0 @@ -/* - * strcmp - compare two strings - * - * Copyright (c) 2012-2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ - - -/* Assumptions: - * - * ARMv8-a, AArch64. - * MTE compatible. - */ - -#include "../asmdefs.h" - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f - -#define src1 x0 -#define src2 x1 -#define result x0 - -#define data1 x2 -#define data1w w2 -#define data2 x3 -#define data2w w3 -#define has_nul x4 -#define diff x5 -#define off1 x5 -#define syndrome x6 -#define tmp x6 -#define data3 x7 -#define zeroones x8 -#define shift x9 -#define off2 x10 - -/* On big-endian early bytes are at MSB and on little-endian LSB. - LS_FW means shifting towards early bytes. */ -#ifdef __AARCH64EB__ -# define LS_FW lsl -#else -# define LS_FW lsr -#endif - -/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. - Since carry propagation makes 0x1 bytes before a NUL byte appear - NUL too in big-endian, byte-reverse the data before the NUL check. */ - - -ENTRY (__strcmp_aarch64_mte) - PTR_ARG (0) - PTR_ARG (1) - sub off2, src2, src1 - mov zeroones, REP8_01 - and tmp, src1, 7 - tst off2, 7 - b.ne L(misaligned8) - cbnz tmp, L(mutual_align) - - .p2align 4 - -L(loop_aligned): - ldr data2, [src1, off2] - ldr data1, [src1], 8 -L(start_realigned): -#ifdef __AARCH64EB__ - rev tmp, data1 - sub has_nul, tmp, zeroones - orr tmp, tmp, REP8_7f -#else - sub has_nul, data1, zeroones - orr tmp, data1, REP8_7f -#endif - bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ - ccmp data1, data2, 0, eq - b.eq L(loop_aligned) -#ifdef __AARCH64EB__ - rev has_nul, has_nul -#endif - eor diff, data1, data2 - orr syndrome, diff, has_nul -L(end): -#ifndef __AARCH64EB__ - rev syndrome, syndrome - rev data1, data1 - rev data2, data2 -#endif - clz shift, syndrome - /* The most-significant-non-zero bit of the syndrome marks either the - first bit that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ - lsl data1, data1, shift - lsl data2, data2, shift - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, 56 - sub result, data1, data2, lsr 56 - ret - - .p2align 4 - -L(mutual_align): - /* Sources are mutually aligned, but are not currently at an - alignment boundary. 
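The fix-up described next (round the addresses down, then mask off the leading bytes) can be sketched in C; little-endian shown, with off = src & 7 known to be nonzero on this path:

    // Force the bytes loaded from before the string start to 0xff so they
    // can never look like a NUL or like a difference (LE byte order).
    uint64_t junk = ~0ULL >> (64 - 8 * off);   // low 'off' bytes all-ones
    data1 |= junk;
    data2 |= junk;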
Round down the addresses and then mask off - the bytes that precede the start point. */ - bic src1, src1, 7 - ldr data2, [src1, off2] - ldr data1, [src1], 8 - neg shift, src2, lsl 3 /* Bits to alignment -64. */ - mov tmp, -1 - LS_FW tmp, tmp, shift - orr data1, data1, tmp - orr data2, data2, tmp - b L(start_realigned) - -L(misaligned8): - /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always - checking to make sure that we don't access beyond the end of SRC2. */ - cbz tmp, L(src1_aligned) -L(do_misaligned): - ldrb data1w, [src1], 1 - ldrb data2w, [src2], 1 - cmp data1w, 0 - ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ - b.ne L(done) - tst src1, 7 - b.ne L(do_misaligned) - -L(src1_aligned): - neg shift, src2, lsl 3 - bic src2, src2, 7 - ldr data3, [src2], 8 -#ifdef __AARCH64EB__ - rev data3, data3 -#endif - lsr tmp, zeroones, shift - orr data3, data3, tmp - sub has_nul, data3, zeroones - orr tmp, data3, REP8_7f - bics has_nul, has_nul, tmp - b.ne L(tail) - - sub off1, src2, src1 - - .p2align 4 - -L(loop_unaligned): - ldr data3, [src1, off1] - ldr data2, [src1, off2] -#ifdef __AARCH64EB__ - rev data3, data3 -#endif - sub has_nul, data3, zeroones - orr tmp, data3, REP8_7f - ldr data1, [src1], 8 - bics has_nul, has_nul, tmp - ccmp data1, data2, 0, eq - b.eq L(loop_unaligned) - - lsl tmp, has_nul, shift -#ifdef __AARCH64EB__ - rev tmp, tmp -#endif - eor diff, data1, data2 - orr syndrome, diff, tmp - cbnz syndrome, L(end) -L(tail): - ldr data1, [src1] - neg shift, shift - lsr data2, data3, shift - lsr has_nul, has_nul, shift -#ifdef __AARCH64EB__ - rev data2, data2 - rev has_nul, has_nul -#endif - eor diff, data1, data2 - orr syndrome, diff, has_nul - b L(end) - -L(done): - sub result, data1, data2 - ret - -END (__strcmp_aarch64_mte) - diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S index e6d2da5411cac58a14b62d4767022a0c22b87ecc..eaf909a378f1f52dcf180e1f10a82dba071d94c9 100644 --- a/string/aarch64/strcmp-sve.S +++ b/string/aarch64/strcmp-sve.S @@ -1,11 +1,11 @@ /* * __strcmp_aarch64_sve - compare two strings * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S index 7714ebf5577d84a279f911914f5f7f28d41f3e8c..137a9aa06681a3c6d00062c88cddf8b9a227c220 100644 --- a/string/aarch64/strcmp.S +++ b/string/aarch64/strcmp.S @@ -1,168 +1,184 @@ /* * strcmp - compare two strings * - * Copyright (c) 2012-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ + /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64. + * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 -/* Parameters and result. */ #define src1 x0 #define src2 x1 #define result x0 -/* Internal variables. */ #define data1 x2 #define data1w w2 #define data2 x3 #define data2w w3 #define has_nul x4 #define diff x5 +#define off1 x5 #define syndrome x6 -#define tmp1 x7 -#define tmp2 x8 -#define tmp3 x9 -#define zeroones x10 -#define pos x11 +#define tmp x6 +#define data3 x7 +#define zeroones x8 +#define shift x9 +#define off2 x10 + +/* On big-endian early bytes are at MSB and on little-endian LSB. 
+ LS_FW means shifting towards early bytes. */ +#ifdef __AARCH64EB__ +# define LS_FW lsl +#else +# define LS_FW lsr +#endif + +/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. + Since carry propagation makes 0x1 bytes before a NUL byte appear + NUL too in big-endian, byte-reverse the data before the NUL check. */ + - /* Start of performance-critical section -- one 64B cache line. */ ENTRY (__strcmp_aarch64) PTR_ARG (0) PTR_ARG (1) - eor tmp1, src1, src2 - mov zeroones, #REP8_01 - tst tmp1, #7 + sub off2, src2, src1 + mov zeroones, REP8_01 + and tmp, src1, 7 + tst off2, 7 b.ne L(misaligned8) - ands tmp1, src1, #7 - b.ne L(mutual_align) - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ + cbnz tmp, L(mutual_align) + + .p2align 4 + L(loop_aligned): - ldr data1, [src1], #8 - ldr data2, [src2], #8 + ldr data2, [src1, off2] + ldr data1, [src1], 8 L(start_realigned): - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ +#ifdef __AARCH64EB__ + rev tmp, data1 + sub has_nul, tmp, zeroones + orr tmp, tmp, REP8_7f +#else + sub has_nul, data1, zeroones + orr tmp, data1, REP8_7f +#endif + bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ + ccmp data1, data2, 0, eq + b.eq L(loop_aligned) +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + eor diff, data1, data2 orr syndrome, diff, has_nul - cbz syndrome, L(loop_aligned) - /* End of performance-critical section -- one 64B cache line. */ - L(end): -#ifndef __AARCH64EB__ +#ifndef __AARCH64EB__ rev syndrome, syndrome rev data1, data1 - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ - clz pos, syndrome rev data2, data2 - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret -#else - /* For big-endian we cannot use the trick with the syndrome value - as carry-propagation can corrupt the upper bits if the trailing - bytes in the string contain 0x01. */ - /* However, if there is no NUL byte in the dword, we can generate - the result directly. We can't just subtract the bytes as the - MSB might be significant. */ - cbnz has_nul, 1f - cmp data1, data2 - cset result, ne - cneg result, result, lo - ret -1: - /* Re-compute the NUL-byte detection, using a byte-reversed value. */ - rev tmp3, data1 - sub tmp1, tmp3, zeroones - orr tmp2, tmp3, #REP8_7f - bic has_nul, tmp1, tmp2 - rev has_nul, has_nul - orr syndrome, diff, has_nul - clz pos, syndrome - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. +#endif + clz shift, syndrome + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. Shifting left now will bring the critical information into the top bits. 
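In C, this little-endian end-game looks roughly as follows; a sketch only, and it assumes the caller reaches it with a nonzero syndrome:

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    // Final <0 / 0 / >0 answer once a NUL or difference is known to exist.
    static int strcmp_tail(uint64_t data1, uint64_t data2)
    {
        uint64_t has_nul = (data1 - REP8_01) & ~(data1 | REP8_7f);
        uint64_t synd = __builtin_bswap64((data1 ^ data2) | has_nul);
        int shift = __builtin_clzll(synd);          // synd != 0 assumed
        uint64_t a = __builtin_bswap64(data1) << shift;
        uint64_t b = __builtin_bswap64(data2) << shift;
        return (int)(a >> 56) - (int)(b >> 56);     // unsigned char compare
    }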
*/ - lsl data1, data1, pos - lsl data2, data2, pos + lsl data1, data1, shift + lsl data2, data2, shift /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 + lsr data1, data1, 56 + sub result, data1, data2, lsr 56 ret -#endif + + .p2align 4 L(mutual_align): /* Sources are mutually aligned, but are not currently at an alignment boundary. Round down the addresses and then mask off - the bytes that preceed the start point. */ - bic src1, src1, #7 - bic src2, src2, #7 - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - ldr data1, [src1], #8 - neg tmp1, tmp1 /* Bits to alignment -64. */ - ldr data2, [src2], #8 - mov tmp2, #~0 -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#endif - orr data1, data1, tmp2 - orr data2, data2, tmp2 + the bytes that precede the start point. */ + bic src1, src1, 7 + ldr data2, [src1, off2] + ldr data1, [src1], 8 + neg shift, src2, lsl 3 /* Bits to alignment -64. */ + mov tmp, -1 + LS_FW tmp, tmp, shift + orr data1, data1, tmp + orr data2, data2, tmp b L(start_realigned) L(misaligned8): /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always - checking to make sure that we don't access beyond page boundary in - SRC2. */ - tst src1, #7 - b.eq L(loop_misaligned) + checking to make sure that we don't access beyond the end of SRC2. */ + cbz tmp, L(src1_aligned) L(do_misaligned): - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - cmp data1w, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + cmp data1w, 0 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ b.ne L(done) - tst src1, #7 + tst src1, 7 b.ne L(do_misaligned) -L(loop_misaligned): - /* Test if we are within the last dword of the end of a 4K page. If - yes then jump back to the misaligned loop to copy a byte at a time. */ - and tmp1, src2, #0xff8 - eor tmp1, tmp1, #0xff8 - cbz tmp1, L(do_misaligned) - ldr data1, [src1], #8 - ldr data2, [src2], #8 - - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. 
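For reference, the byte-at-a-time prologue above (L(do_misaligned)) is this loop in C; the cmp/ccmp pair fuses the two exit tests so each byte costs a single conditional branch (sketch):

    // Compare byte-wise until src1 reaches 8-byte alignment or we finish.
    do {
        unsigned char c1 = *s1++, c2 = *s2++;
        if (c1 == 0 || c1 != c2)
            return c1 - c2;                         // L(done)
    } while ((uintptr_t)s1 & 7);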
*/ +L(src1_aligned): + neg shift, src2, lsl 3 + bic src2, src2, 7 + ldr data3, [src2], 8 +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + lsr tmp, zeroones, shift + orr data3, data3, tmp + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + bics has_nul, has_nul, tmp + b.ne L(tail) + + sub off1, src2, src1 + + .p2align 4 + +L(loop_unaligned): + ldr data3, [src1, off1] + ldr data2, [src1, off2] +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + ldr data1, [src1], 8 + bics has_nul, has_nul, tmp + ccmp data1, data2, 0, eq + b.eq L(loop_unaligned) + + lsl tmp, has_nul, shift +#ifdef __AARCH64EB__ + rev tmp, tmp +#endif + eor diff, data1, data2 + orr syndrome, diff, tmp + cbnz syndrome, L(end) +L(tail): + ldr data1, [src1] + neg shift, shift + lsr data2, data3, shift + lsr has_nul, has_nul, shift +#ifdef __AARCH64EB__ + rev data2, data2 + rev has_nul, has_nul +#endif + eor diff, data1, data2 orr syndrome, diff, has_nul - cbz syndrome, L(loop_misaligned) b L(end) L(done): diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S deleted file mode 100644 index 88c222d61e53ad6841b10ef2b874852df203d800..0000000000000000000000000000000000000000 --- a/string/aarch64/strcpy-mte.S +++ /dev/null @@ -1,161 +0,0 @@ -/* - * strcpy/stpcpy - copy a string returning pointer to start/end. - * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -/* Assumptions: - * - * ARMv8-a, AArch64, Advanced SIMD. - * MTE compatible. - */ - -#include "../asmdefs.h" - -#define dstin x0 -#define srcin x1 -#define result x0 - -#define src x2 -#define dst x3 -#define len x4 -#define synd x4 -#define tmp x5 -#define wtmp w5 -#define shift x5 -#define data1 x6 -#define dataw1 w6 -#define data2 x7 -#define dataw2 w7 - -#define dataq q0 -#define vdata v0 -#define vhas_nul v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 -#define dataq2 q1 - -#ifdef BUILD_STPCPY -# define STRCPY __stpcpy_aarch64_mte -# define IFSTPCPY(X,...) X,__VA_ARGS__ -#else -# define STRCPY __strcpy_aarch64_mte -# define IFSTPCPY(X,...) -#endif - -/* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ - -ENTRY (STRCPY) - PTR_ARG (0) - PTR_ARG (1) - bic src, srcin, 15 - mov wtmp, 0xf00f - ld1 {vdata.16b}, [src] - dup vrepmask.8h, wtmp - cmeq vhas_nul.16b, vdata.16b, 0 - lsl shift, srcin, 2 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - lsr synd, synd, shift - cbnz synd, L(tail) - - ldr dataq, [src, 16]! 
- cmeq vhas_nul.16b, vdata.16b, 0 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - cbz synd, L(start_loop) - -#ifndef __AARCH64EB__ - rbit synd, synd -#endif - sub tmp, src, srcin - clz len, synd - add len, tmp, len, lsr 2 - tbz len, 4, L(less16) - sub tmp, len, 15 - ldr dataq, [srcin] - ldr dataq2, [srcin, tmp] - str dataq, [dstin] - str dataq2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) - ret - - .p2align 4,,8 -L(tail): - rbit synd, synd - clz len, synd - lsr len, len, 2 - - .p2align 4 -L(less16): - tbz len, 3, L(less8) - sub tmp, len, 7 - ldr data1, [srcin] - ldr data2, [srcin, tmp] - str data1, [dstin] - str data2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) - ret - - .p2align 4 -L(less8): - subs tmp, len, 3 - b.lo L(less4) - ldr dataw1, [srcin] - ldr dataw2, [srcin, tmp] - str dataw1, [dstin] - str dataw2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) - ret - -L(less4): - cbz len, L(zerobyte) - ldrh dataw1, [srcin] - strh dataw1, [dstin] -L(zerobyte): - strb wzr, [dstin, len] - IFSTPCPY (add result, dstin, len) - ret - - .p2align 4 -L(start_loop): - sub len, src, srcin - ldr dataq2, [srcin] - add dst, dstin, len - str dataq2, [dstin] - - .p2align 5 -L(loop): - str dataq, [dst], 16 - ldr dataq, [src, 16]! - cmeq vhas_nul.16b, vdata.16b, 0 - umaxp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - cbz synd, L(loop) - - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ - fmov synd, dend -#ifndef __AARCH64EB__ - rbit synd, synd -#endif - clz len, synd - lsr len, len, 2 - sub tmp, len, 15 - ldr dataq, [src, tmp] - str dataq, [dst, tmp] - IFSTPCPY (add result, dst, len) - ret - -END (STRCPY) diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S index f515462e09ae768dbc921ba2928150dd5a98c6e7..00e72dce4451b3ead0e83c2c90832088ab79fb50 100644 --- a/string/aarch64/strcpy-sve.S +++ b/string/aarch64/strcpy-sve.S @@ -1,11 +1,11 @@ /* * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S index 6e9ed424b693919e95f7fbe8569fc9024633715a..97ae37ea422973e3eeea510bf63c3f314ff574d3 100644 --- a/string/aarch64/strcpy.S +++ b/string/aarch64/strcpy.S @@ -1,311 +1,156 @@ /* * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Copyright (c) 2013-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses, min page size 4k. + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" -/* To build as stpcpy, define BUILD_STPCPY before compiling this file. - - To test the page crossing code path more thoroughly, compile with - -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower - entry path. This option is not intended for production use. */ - -/* Arguments and results. */ #define dstin x0 #define srcin x1 +#define result x0 -/* Locals and temporaries. 
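The deleted L(less16)/L(less8) paths above show a trick that the new strcpy below keeps: once the length is known, short strings are copied with two overlapping fixed-size accesses rather than a byte loop. A C sketch for the case where len, the index of the NUL, is 8..15:

    #include <string.h>

    // Two overlapping 8-byte copies cover bytes 0..len, NUL included,
    // whenever 8 <= len <= 15 (len+1 bytes copied in total).
    memcpy(dst, src, 8);
    memcpy(dst + len - 7, src + len - 7, 8);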
*/ #define src x2 #define dst x3 -#define data1 x4 -#define data1w w4 -#define data2 x5 -#define data2w w5 -#define has_nul1 x6 -#define has_nul2 x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define tmp4 x11 -#define zeroones x12 -#define data1a x13 -#define data2a x14 -#define pos x15 -#define len x16 -#define to_align x17 +#define len x4 +#define synd x4 +#define tmp x5 +#define shift x5 +#define data1 x6 +#define dataw1 w6 +#define data2 x7 +#define dataw2 w7 + +#define dataq q0 +#define vdata v0 +#define vhas_nul v1 +#define vend v2 +#define dend d2 +#define dataq2 q1 #ifdef BUILD_STPCPY -#define STRCPY __stpcpy_aarch64 -#else -#define STRCPY __strcpy_aarch64 -#endif - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 - - /* AArch64 systems have a minimum page size of 4k. We can do a quick - page size check for crossing this boundary on entry and if we - do not, then we can short-circuit much of the entry code. We - expect early page-crossing strings to be rare (probability of - 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite - predictable, even with random strings. - - We don't bother checking for larger page sizes, the cost of setting - up the correct page size is just not worth the extra gain from - a small reduction in the cases taking the slow path. Note that - we only care about whether the first fetch, which may be - misaligned, crosses a page boundary - after that we move to aligned - fetches for the remainder of the string. */ - -#ifdef STRCPY_TEST_PAGE_CROSS - /* Make everything that isn't Qword aligned look like a page cross. */ -#define MIN_PAGE_P2 4 +# define STRCPY __stpcpy_aarch64 +# define IFSTPCPY(X,...) X,__VA_ARGS__ #else -#define MIN_PAGE_P2 12 +# define STRCPY __strcpy_aarch64 +# define IFSTPCPY(X,...) #endif -#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) +/* + Core algorithm: + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. */ ENTRY (STRCPY) PTR_ARG (0) PTR_ARG (1) - /* For moderately short strings, the fastest way to do the copy is to - calculate the length of the string in the same way as strlen, then - essentially do a memcpy of the result. This avoids the need for - multiple byte copies and further means that by the time we - reach the bulk copy loop we know we can always use DWord - accesses. We expect __strcpy_aarch64 to rarely be called repeatedly - with the same source string, so branch prediction is likely to - always be difficult - we mitigate against this by preferring - conditional select operations over branches whenever this is - feasible. */ - and tmp2, srcin, #(MIN_PAGE_SIZE - 1) - mov zeroones, #REP8_01 - and to_align, srcin, #15 - cmp tmp2, #(MIN_PAGE_SIZE - 16) - neg tmp1, to_align - /* The first fetch will straddle a (possible) page boundary iff - srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte - aligned string will never fail the page align check, so will - always take the fast path. 
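Concretely, the page-cross test mentioned here is one AND and one compare; a sketch under the stated 4 KB minimum page size:

    // True iff a 16-byte load from 's' would straddle a 4 KB page boundary.
    int page_cross = ((uintptr_t)s & 4095) > 4096 - 16;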
*/ - b.gt L(page_cross) - -L(page_cross_ok): - ldp data1, data2, [srcin] -#ifdef __AARCH64EB__ - /* Because we expect the end to be found within 16 characters - (profiling shows this is the most common case), it's worth - swapping the bytes now to save having to recalculate the - termination syndrome later. We preserve data1 and data2 - so that we can re-use the values later on. */ - rev tmp2, data1 - sub tmp1, tmp2, zeroones - orr tmp2, tmp2, #REP8_7f - bics has_nul1, tmp1, tmp2 - b.ne L(fp_le8) - rev tmp4, data2 - sub tmp3, tmp4, zeroones - orr tmp4, tmp4, #REP8_7f -#else - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bics has_nul1, tmp1, tmp2 - b.ne L(fp_le8) - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f + bic src, srcin, 15 + ld1 {vdata.16b}, [src] + cmeq vhas_nul.16b, vdata.16b, 0 + lsl shift, srcin, 2 + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + lsr synd, synd, shift + cbnz synd, L(tail) + + ldr dataq, [src, 16]! + cmeq vhas_nul.16b, vdata.16b, 0 + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + cbz synd, L(start_loop) + +#ifndef __AARCH64EB__ + rbit synd, synd #endif - bics has_nul2, tmp3, tmp4 - b.eq L(bulk_entry) + sub tmp, src, srcin + clz len, synd + add len, tmp, len, lsr 2 + tbz len, 4, L(less16) + sub tmp, len, 15 + ldr dataq, [srcin] + ldr dataq2, [srcin, tmp] + str dataq, [dstin] + str dataq2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret - /* The string is short (<=16 bytes). We don't know exactly how - short though, yet. Work out the exact length so that we can - quickly select the optimal copy strategy. */ -L(fp_gt8): - rev has_nul2, has_nul2 - clz pos, has_nul2 - mov tmp2, #56 - add dst, dstin, pos, lsr #3 /* Bits to bytes. */ - sub pos, tmp2, pos -#ifdef __AARCH64EB__ - lsr data2, data2, pos -#else - lsl data2, data2, pos -#endif - str data2, [dst, #1] +L(tail): + rbit synd, synd + clz len, synd + lsr len, len, 2 +L(less16): + tbz len, 3, L(less8) + sub tmp, len, 7 + ldr data1, [srcin] + ldr data2, [srcin, tmp] str data1, [dstin] -#ifdef BUILD_STPCPY - add dstin, dst, #8 -#endif + str data2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) ret -L(fp_le8): - rev has_nul1, has_nul1 - clz pos, has_nul1 - add dst, dstin, pos, lsr #3 /* Bits to bytes. */ - subs tmp2, pos, #24 /* Pos in bits. */ - b.lt L(fp_lt4) -#ifdef __AARCH64EB__ - mov tmp2, #56 - sub pos, tmp2, pos - lsr data2, data1, pos - lsr data1, data1, #32 -#else - lsr data2, data1, tmp2 -#endif - /* 4->7 bytes to copy. */ - str data2w, [dst, #-3] - str data1w, [dstin] -#ifdef BUILD_STPCPY - mov dstin, dst -#endif + .p2align 4 +L(less8): + subs tmp, len, 3 + b.lo L(less4) + ldr dataw1, [srcin] + ldr dataw2, [srcin, tmp] + str dataw1, [dstin] + str dataw2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) ret -L(fp_lt4): - cbz pos, L(fp_lt2) - /* 2->3 bytes to copy. */ -#ifdef __AARCH64EB__ - lsr data1, data1, #48 -#endif - strh data1w, [dstin] - /* Fall-through, one byte (max) to go. */ -L(fp_lt2): - /* Null-terminated string. Last character must be zero! */ - strb wzr, [dst] -#ifdef BUILD_STPCPY - mov dstin, dst -#endif - ret - - .p2align 6 - /* Aligning here ensures that the entry code and main loop all lies - within one 64-byte cache line. */ -L(bulk_entry): - sub to_align, to_align, #16 - stp data1, data2, [dstin] - sub src, srcin, to_align - sub dst, dstin, to_align - b L(entry_no_page_cross) - - /* The inner loop deals with two Dwords at a time. 
This has a - slightly higher start-up cost, but we should win quite quickly, - especially on cores with a high number of issue slots per - cycle, as we get much better parallelism out of the operations. */ -L(main_loop): - stp data1, data2, [dst], #16 -L(entry_no_page_cross): - ldp data1, data2, [src], #16 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bics has_nul2, tmp3, tmp4 - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq L(main_loop) - /* Since we know we are copying at least 16 bytes, the fastest way - to deal with the tail is to determine the location of the - trailing NUL, then (re)copy the 16 bytes leading up to that. */ - cmp has_nul1, #0 -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul directly. The - easiest way to get the correct byte is to byte-swap the data - and calculate the syndrome a second time. */ - csel data1, data1, data2, ne - rev data1, data1 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bic has_nul1, tmp1, tmp2 -#else - csel has_nul1, has_nul1, has_nul2, ne -#endif - rev has_nul1, has_nul1 - clz pos, has_nul1 - add tmp1, pos, #72 - add pos, pos, #8 - csel pos, pos, tmp1, ne - add src, src, pos, lsr #3 - add dst, dst, pos, lsr #3 - ldp data1, data2, [src, #-32] - stp data1, data2, [dst, #-16] -#ifdef BUILD_STPCPY - sub dstin, dst, #1 -#endif +L(less4): + cbz len, L(zerobyte) + ldrh dataw1, [srcin] + strh dataw1, [dstin] +L(zerobyte): + strb wzr, [dstin, len] + IFSTPCPY (add result, dstin, len) ret -L(page_cross): - bic src, srcin, #15 - /* Start by loading two words at [srcin & ~15], then forcing the - bytes that precede srcin to 0xff. This means they never look - like termination bytes. */ - ldp data1, data2, [src] - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - tst to_align, #7 - csetm tmp2, ne -#ifdef __AARCH64EB__ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ + .p2align 4 +L(start_loop): + sub tmp, srcin, dstin + ldr dataq2, [srcin] + sub dst, src, tmp + str dataq2, [dstin] +L(loop): + str dataq, [dst], 32 + ldr dataq, [src, 16] + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loopend) + str dataq, [dst, -16] + ldr dataq, [src, 32]! + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(loop) + add dst, dst, 16 +L(loopend): + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ + fmov synd, dend + sub dst, dst, 31 +#ifndef __AARCH64EB__ + rbit synd, synd #endif - orr data1, data1, tmp2 - orr data2a, data2, tmp2 - cmp to_align, #8 - csinv data1, data1, xzr, lt - csel data2, data2, data2a, lt - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bics has_nul2, tmp3, tmp4 - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq L(page_cross_ok) - /* We now need to make data1 and data2 look like they've been - loaded directly from srcin. Do a rotate on the 128-bit value. */ - lsl tmp1, to_align, #3 /* Bytes->bits. 
*/ - neg tmp2, to_align, lsl #3 -#ifdef __AARCH64EB__ - lsl data1a, data1, tmp1 - lsr tmp4, data2, tmp2 - lsl data2, data2, tmp1 - orr tmp4, tmp4, data1a - cmp to_align, #8 - csel data1, tmp4, data2, lt - rev tmp2, data1 - rev tmp4, data2 - sub tmp1, tmp2, zeroones - orr tmp2, tmp2, #REP8_7f - sub tmp3, tmp4, zeroones - orr tmp4, tmp4, #REP8_7f -#else - lsr data1a, data1, tmp1 - lsl tmp4, data2, tmp2 - lsr data2, data2, tmp1 - orr tmp4, tmp4, data1a - cmp to_align, #8 - csel data1, tmp4, data2, lt - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f -#endif - bic has_nul1, tmp1, tmp2 - cbnz has_nul1, L(fp_le8) - bic has_nul2, tmp3, tmp4 - b L(fp_gt8) + clz len, synd + lsr len, len, 2 + add dst, dst, len + ldr dataq, [dst, tmp] + str dataq, [dst] + IFSTPCPY (add result, dst, 15) + ret END (STRCPY) - diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S index 7cf41d5c1eac995332ae42bbaf962116eb32457d..77235797f7c54fe5af374120f76362148b11ce0f 100644 --- a/string/aarch64/strlen-mte.S +++ b/string/aarch64/strlen-mte.S @@ -1,8 +1,8 @@ /* * strlen - calculate the length of a string. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define result x0 @@ -19,35 +19,26 @@ #define src x1 #define synd x2 #define tmp x3 -#define wtmp w3 #define shift x4 #define data q0 #define vdata v0 #define vhas_nul v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 +#define vend v2 +#define dend d2 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + Process the string in 16-byte aligned chunks. Compute a 64-bit mask with + four bits per byte using the shrn instruction. A count trailing zeros then + identifies the first zero byte. */ ENTRY (__strlen_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 - mov wtmp, 0xf00f ld1 {vdata.16b}, [src] - dup vrepmask.8h, wtmp cmeq vhas_nul.16b, vdata.16b, 0 lsl shift, srcin, 2 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(loop) @@ -59,19 +50,25 @@ ENTRY (__strlen_aarch64_mte) .p2align 5 L(loop): - ldr data, [src, 16]! + ldr data, [src, 16] + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loop_end) + ldr data, [src, 32]! 
cmeq vhas_nul.16b, vdata.16b, 0 umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop) - - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + sub src, src, 16 +L(loop_end): + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ sub result, src, srcin fmov synd, dend #ifndef __AARCH64EB__ rbit synd, synd #endif + add result, result, 16 clz tmp, synd add result, result, tmp, lsr 2 ret diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S index 2392493f1a3c4c79b67f790bfa064766253e55e7..12ebbdba5c93ae99a195dd8abef828c2b7804982 100644 --- a/string/aarch64/strlen-sve.S +++ b/string/aarch64/strlen-sve.S @@ -1,11 +1,11 @@ /* * __strlen_aarch64_sve - compute the length of a string * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S index a1b164a49238243419c89a365dd6757f9e9be7cd..6f6f08f636b248abc9c9b2e847545588efca281b 100644 --- a/string/aarch64/strlen.S +++ b/string/aarch64/strlen.S @@ -1,8 +1,8 @@ /* * strlen - calculate the length of a string. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Not MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define len x0 @@ -36,6 +36,7 @@ #define tmp x2 #define tmpw w2 #define synd x3 +#define syndw w3 #define shift x4 /* For the first 32 bytes, NUL detection works on the principle that @@ -110,7 +111,6 @@ ENTRY (__strlen_aarch64) add len, len, tmp1, lsr 3 ret - .p2align 3 /* Look for a NUL byte at offset 16..31 in the string. */ L(bytes16_31): ldp data1, data2, [srcin, 16] @@ -138,6 +138,7 @@ L(bytes16_31): add len, len, tmp1, lsr 3 ret + nop L(loop_entry): bic src, srcin, 31 @@ -153,18 +154,12 @@ L(loop): /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */ cmeq maskv.16b, datav1.16b, 0 sub len, src, srcin - tst synd, 0xffffffff - b.ne 1f + cbnz syndw, 1f cmeq maskv.16b, datav2.16b, 0 add len, len, 16 1: /* Generate a bitmask and compute correct byte offset. */ -#ifdef __AARCH64EB__ - bic maskv.8h, 0xf0 -#else - bic maskv.8h, 0x0f, lsl 8 -#endif - umaxp maskv.16b, maskv.16b, maskv.16b + shrn maskv.8b, maskv.8h, 4 fmov synd, maskd #ifndef __AARCH64EB__ rbit synd, synd @@ -173,8 +168,6 @@ L(loop): add len, len, tmp, lsr 2 ret - .p2align 4 - L(page_cross): bic src, srcin, 31 mov tmpw, 0x0c03 diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S deleted file mode 100644 index c9d6fc8a158beca38419a6ccf82cd8573394f7b6..0000000000000000000000000000000000000000 --- a/string/aarch64/strncmp-mte.S +++ /dev/null @@ -1,307 +0,0 @@ -/* - * strncmp - compare two strings - * - * Copyright (c) 2013-2021, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -/* Assumptions: - * - * ARMv8-a, AArch64 - */ - -#include "../asmdefs.h" - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f - -/* Parameters and result. */ -#define src1 x0 -#define src2 x1 -#define limit x2 -#define result x0 - -/* Internal variables. 
*/ -#define data1 x3 -#define data1w w3 -#define data2 x4 -#define data2w w4 -#define has_nul x5 -#define diff x6 -#define syndrome x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define zeroones x11 -#define pos x12 -#define mask x13 -#define endloop x14 -#define count mask -#define offset pos -#define neg_offset x15 - -/* Define endian dependent shift operations. - On big-endian early bytes are at MSB and on little-endian LSB. - LS_FW means shifting towards early bytes. - LS_BK means shifting towards later bytes. - */ -#ifdef __AARCH64EB__ -#define LS_FW lsl -#define LS_BK lsr -#else -#define LS_FW lsr -#define LS_BK lsl -#endif - -ENTRY (__strncmp_aarch64_mte) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - cbz limit, L(ret0) - eor tmp1, src1, src2 - mov zeroones, #REP8_01 - tst tmp1, #7 - and count, src1, #7 - b.ne L(misaligned8) - cbnz count, L(mutual_align) - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - .p2align 4 -L(loop_aligned): - ldr data1, [src1], #8 - ldr data2, [src2], #8 -L(start_realigned): - subs limit, limit, #8 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, hi /* Last Dword or differences. */ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - ccmp endloop, #0, #0, eq - b.eq L(loop_aligned) - /* End of main loop */ - -L(full_check): -#ifndef __AARCH64EB__ - orr syndrome, diff, has_nul - add limit, limit, 8 /* Rewind limit to before last subs. */ -L(syndrome_check): - /* Limit was reached. Check if the NUL byte or the difference - is before the limit. */ - rev syndrome, syndrome - rev data1, data1 - clz pos, syndrome - rev data2, data2 - lsl data1, data1, pos - cmp limit, pos, lsr #3 - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - csel result, result, xzr, hi - ret -#else - /* Not reached the limit, must have found the end or a diff. */ - tbz limit, #63, L(not_limit) - add tmp1, limit, 8 - cbz limit, L(not_limit) - - lsl limit, tmp1, #3 /* Bits -> bytes. */ - mov mask, #~0 - lsr mask, mask, limit - bic data1, data1, mask - bic data2, data2, mask - - /* Make sure that the NUL byte is marked in the syndrome. */ - orr has_nul, has_nul, mask - -L(not_limit): - /* For big-endian we cannot use the trick with the syndrome value - as carry-propagation can corrupt the upper bits if the trailing - bytes in the string contain 0x01. */ - /* However, if there is no NUL byte in the dword, we can generate - the result directly. We can't just subtract the bytes as the - MSB might be significant. */ - cbnz has_nul, 1f - cmp data1, data2 - cset result, ne - cneg result, result, lo - ret -1: - /* Re-compute the NUL-byte detection, using a byte-reversed value. */ - rev tmp3, data1 - sub tmp1, tmp3, zeroones - orr tmp2, tmp3, #REP8_7f - bic has_nul, tmp1, tmp2 - rev has_nul, has_nul - orr syndrome, diff, has_nul - clz pos, syndrome - /* The most-significant-non-zero bit of the syndrome marks either the - first bit that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. 
*/ -L(end_quick): - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret -#endif - -L(mutual_align): - /* Sources are mutually aligned, but are not currently at an - alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. - We also need to adjust the limit calculations, but without - overflowing if the limit is near ULONG_MAX. */ - bic src1, src1, #7 - bic src2, src2, #7 - ldr data1, [src1], #8 - neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ - ldr data2, [src2], #8 - mov tmp2, #~0 - LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ - /* Adjust the limit and ensure it doesn't overflow. */ - adds limit, limit, count - csinv limit, limit, xzr, lo - orr data1, data1, tmp2 - orr data2, data2, tmp2 - b L(start_realigned) - - .p2align 4 - /* Don't bother with dwords for up to 16 bytes. */ -L(misaligned8): - cmp limit, #16 - b.hs L(try_misaligned_words) - -L(byte_loop): - /* Perhaps we can do better than this. */ - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs limit, limit, #1 - ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq L(byte_loop) -L(done): - sub result, data1, data2 - ret - /* Align the SRC1 to a dword by doing a bytewise compare and then do - the dword loop. */ -L(try_misaligned_words): - cbz count, L(src1_aligned) - - neg count, count - and count, count, #7 - sub limit, limit, count - -L(page_end_loop): - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - cmp data1w, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.ne L(done) - subs count, count, #1 - b.hi L(page_end_loop) - - /* The following diagram explains the comparison of misaligned strings. - The bytes are shown in natural order. For little-endian, it is - reversed in the registers. The "x" bytes are before the string. - The "|" separates data that is loaded at one time. - src1 | a a a a a a a a | b b b c c c c c | . . . - src2 | x x x x x a a a a a a a a b b b | c c c c c . . . - - After shifting in each step, the data looks like this: - STEP_A STEP_B STEP_C - data1 a a a a a a a a b b b c c c c c b b b c c c c c - data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c - - The bytes with "0" are eliminated from the syndrome via mask. - - Align SRC2 down to 16 bytes. This way we can read 16 bytes at a - time from SRC2. The comparison happens in 3 steps. After each step - the loop can exit, or read from SRC1 or SRC2. */ -L(src1_aligned): - /* Calculate offset from 8 byte alignment to string start in bits. No - need to mask offset since shifts are ignoring upper bits. */ - lsl offset, src2, #3 - bic src2, src2, #0xf - mov mask, -1 - neg neg_offset, offset - ldr data1, [src1], #8 - ldp tmp1, tmp2, [src2], #16 - LS_BK mask, mask, neg_offset - and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ - /* Skip the first compare if data in tmp1 is irrelevant. */ - tbnz offset, 6, L(misaligned_mid_loop) - -L(loop_misaligned): - /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ - LS_FW data2, tmp1, offset - LS_BK tmp1, tmp2, neg_offset - subs limit, limit, #8 - orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ - sub has_nul, data1, zeroones - eor diff, data1, data2 /* Non-zero if differences found. 
*/ - orr tmp3, data1, #REP8_7f - csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ - bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ - orr tmp3, endloop, has_nul - cbnz tmp3, L(full_check) - - ldr data1, [src1], #8 -L(misaligned_mid_loop): - /* STEP_B: Compare first part of data1 to second part of tmp2. */ - LS_FW data2, tmp2, offset -#ifdef __AARCH64EB__ - /* For big-endian we do a byte reverse to avoid carry-propagation - problem described above. This way we can reuse the has_nul in the - next step and also use syndrome value trick at the end. */ - rev tmp3, data1 - #define data1_fixed tmp3 -#else - #define data1_fixed data1 -#endif - sub has_nul, data1_fixed, zeroones - orr tmp3, data1_fixed, #REP8_7f - eor diff, data2, data1 /* Non-zero if differences found. */ - bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ -#ifdef __AARCH64EB__ - rev has_nul, has_nul -#endif - cmp limit, neg_offset, lsr #3 - orr syndrome, diff, has_nul - bic syndrome, syndrome, mask /* Ignore later bytes. */ - csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ - cbnz tmp3, L(syndrome_check) - - /* STEP_C: Compare second part of data1 to first part of tmp1. */ - ldp tmp1, tmp2, [src2], #16 - cmp limit, #8 - LS_BK data2, tmp1, neg_offset - eor diff, data2, data1 /* Non-zero if differences found. */ - orr syndrome, diff, has_nul - and syndrome, syndrome, mask /* Ignore earlier bytes. */ - csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ - cbnz tmp3, L(syndrome_check) - - ldr data1, [src1], #8 - sub limit, limit, #8 - b L(loop_misaligned) - -#ifdef __AARCH64EB__ -L(syndrome_check): - clz pos, syndrome - cmp pos, limit, lsl #3 - b.lo L(end_quick) -#endif - -L(ret0): - mov result, #0 - ret -END(__strncmp_aarch64_mte) - diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S index 234190e245b0ba30f6257fad70b9fcbc4ce767cd..6a9e9f7b6437fdab851d5a4a4651b3f4922bf06b 100644 --- a/string/aarch64/strncmp-sve.S +++ b/string/aarch64/strncmp-sve.S @@ -1,11 +1,11 @@ /* * strncmp - compare two strings with limit * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S index 738b6539cab647129d801a21bb7b88876b37c070..128a10c52bb175436312c6326030c4d34cc4190f 100644 --- a/string/aarch64/strncmp.S +++ b/string/aarch64/strncmp.S @@ -1,20 +1,20 @@ /* * strncmp - compare two strings * - * Copyright (c) 2013-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2013-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64. + * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 /* Parameters and result. */ #define src1 x0 @@ -35,10 +35,24 @@ #define tmp3 x10 #define zeroones x11 #define pos x12 -#define limit_wd x13 -#define mask x14 -#define endloop x15 +#define mask x13 +#define endloop x14 #define count mask +#define offset pos +#define neg_offset x15 + +/* Define endian dependent shift operations. + On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. + LS_BK means shifting towards later bytes. 
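These two macros are what let the misaligned loop further down rebuild an unaligned 8-byte view of SRC2 from two aligned loads. A little-endian sketch, where off is the bit offset of the string start inside the 16-byte window and 0 < off < 64:

    // STEP_A of L(loop_misaligned): on little-endian LS_FW is '>>' and
    // LS_BK is '<<', so 8 unaligned bytes come from two aligned words.
    uint64_t data2 = (tmp1 >> off) | (tmp2 << (64 - off));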
+ */ +#ifdef __AARCH64EB__ +#define LS_FW lsl +#define LS_BK lsr +#else +#define LS_FW lsr +#define LS_BK lsl +#endif ENTRY (__strncmp_aarch64) PTR_ARG (0) @@ -51,9 +65,6 @@ ENTRY (__strncmp_aarch64) and count, src1, #7 b.ne L(misaligned8) cbnz count, L(mutual_align) - /* Calculate the number of full and partial words -1. */ - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ - lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and @@ -63,56 +74,52 @@ L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 L(start_realigned): - subs limit_wd, limit_wd, #1 + subs limit, limit, #8 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, pl /* Last Dword or differences. */ + csinv endloop, diff, xzr, hi /* Last Dword or differences. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq b.eq L(loop_aligned) /* End of main loop */ - /* Not reached the limit, must have found the end or a diff. */ - tbz limit_wd, #63, L(not_limit) - - /* Limit % 8 == 0 => all bytes significant. */ - ands limit, limit, #7 - b.eq L(not_limit) - - lsl limit, limit, #3 /* Bits -> bytes. */ - mov mask, #~0 -#ifdef __AARCH64EB__ - lsr mask, mask, limit -#else - lsl mask, mask, limit -#endif - bic data1, data1, mask - bic data2, data2, mask - - /* Make sure that the NUL byte is marked in the syndrome. */ - orr has_nul, has_nul, mask - -L(not_limit): +L(full_check): +#ifndef __AARCH64EB__ orr syndrome, diff, has_nul - -#ifndef __AARCH64EB__ + add limit, limit, 8 /* Rewind limit to before last subs. */ +L(syndrome_check): + /* Limit was reached. Check if the NUL byte or the difference + is before the limit. */ rev syndrome, syndrome rev data1, data1 - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ clz pos, syndrome rev data2, data2 lsl data1, data1, pos + cmp limit, pos, lsr #3 lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. */ lsr data1, data1, #56 sub result, data1, data2, lsr #56 + csel result, result, xzr, hi ret #else + /* Not reached the limit, must have found the end or a diff. */ + tbz limit, #63, L(not_limit) + add tmp1, limit, 8 + cbz limit, L(not_limit) + + lsl limit, tmp1, #3 /* Bits -> bytes. */ + mov mask, #~0 + lsr mask, mask, limit + bic data1, data1, mask + bic data2, data2, mask + + /* Make sure that the NUL byte is marked in the syndrome. */ + orr has_nul, has_nul, mask + +L(not_limit): /* For big-endian we cannot use the trick with the syndrome value as carry-propagation can corrupt the upper bits if the trailing bytes in the string contain 0x01. */ @@ -133,10 +140,11 @@ L(not_limit): rev has_nul, has_nul orr syndrome, diff, has_nul clz pos, syndrome - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. Shifting left now will bring the critical information into the top bits. 
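The limit gating in L(syndrome_check) above reduces to one comparison in C (little-endian sketch; pos is the leading-zero count of the byte-reversed syndrome, so a bit index):

    // Report the mismatch or NUL only if it lies before the limit.
    return (limit > (uint64_t)(pos >> 3)) ? result : 0;   // csel ..., hi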
*/ +L(end_quick): lsl data1, data1, pos lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then @@ -158,22 +166,12 @@ L(mutual_align): neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ ldr data2, [src2], #8 mov tmp2, #~0 - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */ -#endif - and tmp3, limit_wd, #7 - lsr limit_wd, limit_wd, #3 - /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ - add limit, limit, count - add tmp3, tmp3, count + LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ + /* Adjust the limit and ensure it doesn't overflow. */ + adds limit, limit, count + csinv limit, limit, xzr, lo orr data1, data1, tmp2 orr data2, data2, tmp2 - add limit_wd, limit_wd, tmp3, lsr #3 b L(start_realigned) .p2align 4 @@ -196,13 +194,11 @@ L(done): /* Align the SRC1 to a dword by doing a bytewise compare and then do the dword loop. */ L(try_misaligned_words): - lsr limit_wd, limit, #3 - cbz count, L(do_misaligned) + cbz count, L(src1_aligned) neg count, count and count, count, #7 sub limit, limit, count - lsr limit_wd, limit, #3 L(page_end_loop): ldrb data1w, [src1], #1 @@ -213,48 +209,100 @@ L(page_end_loop): subs count, count, #1 b.hi L(page_end_loop) -L(do_misaligned): - /* Prepare ourselves for the next page crossing. Unlike the aligned - loop, we fetch 1 less dword because we risk crossing bounds on - SRC2. */ - mov count, #8 - subs limit_wd, limit_wd, #1 - b.lo L(done_loop) -L(loop_misaligned): - and tmp2, src2, #0xff8 - eor tmp2, tmp2, #0xff8 - cbz tmp2, L(page_end_loop) + /* The following diagram explains the comparison of misaligned strings. + The bytes are shown in natural order. For little-endian, it is + reversed in the registers. The "x" bytes are before the string. + The "|" separates data that is loaded at one time. + src1 | a a a a a a a a | b b b c c c c c | . . . + src2 | x x x x x a a a a a a a a b b b | c c c c c . . . + + After shifting in each step, the data looks like this: + STEP_A STEP_B STEP_C + data1 a a a a a a a a b b b c c c c c b b b c c c c c + data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c + The bytes with "0" are eliminated from the syndrome via mask. + + Align SRC2 down to 16 bytes. This way we can read 16 bytes at a + time from SRC2. The comparison happens in 3 steps. After each step + the loop can exit, or read from SRC1 or SRC2. */ +L(src1_aligned): + /* Calculate offset from 8 byte alignment to string start in bits. No + need to mask offset since shifts are ignoring upper bits. */ + lsl offset, src2, #3 + bic src2, src2, #0xf + mov mask, -1 + neg neg_offset, offset ldr data1, [src1], #8 - ldr data2, [src2], #8 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - ccmp diff, #0, #0, eq - b.ne L(not_limit) - subs limit_wd, limit_wd, #1 - b.pl L(loop_misaligned) + ldp tmp1, tmp2, [src2], #16 + LS_BK mask, mask, neg_offset + and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ + /* Skip the first compare if data in tmp1 is irrelevant. */ + tbnz offset, 6, L(misaligned_mid_loop) -L(done_loop): - /* We found a difference or a NULL before the limit was reached. 
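Note how the new mutual-align entry widens the limit with an adds/csinv pair; in C this is simply a saturating add (sketch, GCC/Clang builtins assumed):

    // Add the alignment slack without wrapping past SIZE_MAX
    // (adds sets C on unsigned overflow; csinv then supplies ~0).
    if (__builtin_add_overflow(limit, count, &limit))
        limit = SIZE_MAX;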
*/ - and limit, limit, #7 - cbz limit, L(not_limit) - /* Read the last word. */ - sub src1, src1, 8 - sub src2, src2, 8 - ldr data1, [src1, limit] - ldr data2, [src2, limit] - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f +L(loop_misaligned): + /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ + LS_FW data2, tmp1, offset + LS_BK tmp1, tmp2, neg_offset + subs limit, limit, #8 + orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ + sub has_nul, data1, zeroones eor diff, data1, data2 /* Non-zero if differences found. */ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - ccmp diff, #0, #0, eq - b.ne L(not_limit) + orr tmp3, data1, #REP8_7f + csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ + orr tmp3, endloop, has_nul + cbnz tmp3, L(full_check) + + ldr data1, [src1], #8 +L(misaligned_mid_loop): + /* STEP_B: Compare first part of data1 to second part of tmp2. */ + LS_FW data2, tmp2, offset +#ifdef __AARCH64EB__ + /* For big-endian we do a byte reverse to avoid carry-propagation + problem described above. This way we can reuse the has_nul in the + next step and also use syndrome value trick at the end. */ + rev tmp3, data1 + #define data1_fixed tmp3 +#else + #define data1_fixed data1 +#endif + sub has_nul, data1_fixed, zeroones + orr tmp3, data1_fixed, #REP8_7f + eor diff, data2, data1 /* Non-zero if differences found. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + cmp limit, neg_offset, lsr #3 + orr syndrome, diff, has_nul + bic syndrome, syndrome, mask /* Ignore later bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + /* STEP_C: Compare second part of data1 to first part of tmp1. */ + ldp tmp1, tmp2, [src2], #16 + cmp limit, #8 + LS_BK data2, tmp1, neg_offset + eor diff, data2, data1 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + and syndrome, syndrome, mask /* Ignore earlier bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + ldr data1, [src1], #8 + sub limit, limit, #8 + b L(loop_misaligned) + +#ifdef __AARCH64EB__ +L(syndrome_check): + clz pos, syndrome + cmp pos, limit, lsl #3 + b.lo L(end_quick) +#endif L(ret0): mov result, #0 ret - -END ( __strncmp_aarch64) +END(__strncmp_aarch64) diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S index 5b9ebf7763bc2491011641702eac4dbc32f45482..6c43dc427da7a9279ed400a6186bbd162cc10148 100644 --- a/string/aarch64/strnlen-sve.S +++ b/string/aarch64/strnlen-sve.S @@ -1,11 +1,11 @@ /* * strnlen - calculate the length of a string with limit. * - * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S index 48d2495d2082be8318c88148eb21d00ee6f0b421..f2090a7485a5646dec85bfb0b4fce421471adb13 100644 --- a/string/aarch64/strnlen.S +++ b/string/aarch64/strnlen.S @@ -1,8 +1,8 @@ /* * strnlen - calculate the length of a string with limit. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define cntin x1 @@ -20,39 +20,30 @@ #define src x2 #define synd x3 #define shift x4 -#define wtmp w4 #define tmp x4 #define cntrem x5 #define qdata q0 #define vdata v0 #define vhas_chr v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 +#define vend v2 +#define dend d2 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + Process the string in 16-byte aligned chunks. Compute a 64-bit mask with + four bits per byte using the shrn instruction. A count trailing zeros then + identifies the first zero byte. */ ENTRY (__strnlen_aarch64) PTR_ARG (0) SIZE_ARG (1) bic src, srcin, 15 - mov wtmp, 0xf00f cbz cntin, L(nomatch) - ld1 {vdata.16b}, [src], 16 - dup vrepmask.8h, wtmp + ld1 {vdata.16b}, [src] cmeq vhas_chr.16b, vdata.16b, 0 lsl shift, srcin, 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(start_loop) @@ -64,37 +55,40 @@ L(finish): csel result, cntin, result, ls ret +L(nomatch): + mov result, cntin + ret + L(start_loop): sub tmp, src, srcin + add tmp, tmp, 17 subs cntrem, cntin, tmp - b.ls L(nomatch) + b.lo L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - add tmp, cntrem, 15 - tbnz tmp, 4, L(loop32_2) - + tbz cntrem, 4, L(loop32_2) + sub src, src, 16 .p2align 5 L(loop32): - ldr qdata, [src], 16 + ldr qdata, [src, 32]! cmeq vhas_chr.16b, vdata.16b, 0 umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src], 16 + ldr qdata, [src, 16] subs cntrem, cntrem, 32 cmeq vhas_chr.16b, vdata.16b, 0 - b.ls L(end) + b.lo L(end_2) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) - +L(end_2): + add src, src, 16 L(end): - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ - sub src, src, 16 - mov synd, vend.d[0] + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ sub result, src, srcin + fmov synd, dend #ifndef __AARCH64EB__ rbit synd, synd #endif @@ -104,9 +98,5 @@ L(end): csel result, cntin, result, ls ret -L(nomatch): - mov result, cntin - ret - END (__strnlen_aarch64) diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S index 1e4fb1a68f7e8bc21a65f5925194f5d188d01e7c..bb61ab9ad4e7c5d5966daa950d7ef2c2dec4726d 100644 --- a/string/aarch64/strrchr-mte.S +++ b/string/aarch64/strrchr-mte.S @@ -1,8 +1,8 @@ /* * strrchr - find last position of a character in a string. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. 
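[The rewritten strnlen above replaces the bitmask/addp reduction with a single shrn, exactly as its new "Core algorithm" comment states: the 16 comparison bytes narrow into a 64-bit syndrome with four bits per byte, and counting trailing zeros locates the first NUL. A scalar C model of that syndrome, for illustration only (first_nul_index is a made-up name; __builtin_ctzll assumes a GNU-compatible compiler, which the rest of this codebase already does):

#include <stdint.h>

static int
first_nul_index (const unsigned char chunk[16])
{
  uint64_t synd = 0;
  /* Each zero byte contributes a 0xf nibble, mirroring what the
     NEON cmeq + shrn pair produces in one step.  */
  for (int i = 0; i < 16; i++)
    if (chunk[i] == 0)
      synd |= 0xfULL << (4 * i);
  if (synd == 0)
    return -1;                          /* no NUL in this chunk */
  return __builtin_ctzll (synd) / 4;    /* 4 syndrome bits per byte */
}
]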
*/ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -19,7 +19,6 @@ #define src x2 #define tmp x3 -#define wtmp w3 #define synd x3 #define shift x4 #define src_match x4 @@ -31,7 +30,6 @@ #define vhas_nul v2 #define vhas_chr v3 #define vrepmask v4 -#define vrepmask2 v5 #define vend v5 #define dend d5 @@ -47,55 +45,67 @@ ENTRY (__strrchr_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin - mov wtmp, 0x3003 - dup vrepmask.8h, wtmp - tst srcin, 15 - beq L(loop1) - - ld1 {vdata.16b}, [src], 16 + movi vrepmask.16b, 0x33 + ld1 {vdata.16b}, [src] cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - mov wtmp, 0xf00f - dup vrepmask2.8h, wtmp bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 lsl shift, srcin, 2 fmov synd, dend lsr synd, synd, shift lsl synd, synd, shift ands nul_match, synd, 0xcccccccccccccccc bne L(tail) - cbnz synd, L(loop2) + cbnz synd, L(loop2_start) - .p2align 5 + .p2align 4 L(loop1): - ld1 {vdata.16b}, [src], 16 + ldr q1, [src, 16] + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loop1_end) + ldr q1, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop1) - + sub src, src, 16 +L(loop1_end): + add src, src, 16 cmeq vhas_nul.16b, vdata.16b, 0 +#ifdef __AARCH64EB__ + bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + rbit synd, synd +#else bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - bic vhas_nul.8h, 0x0f, lsl 8 - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 fmov synd, dend +#endif ands nul_match, synd, 0xcccccccccccccccc - beq L(loop2) - + beq L(loop2_start) L(tail): sub nul_match, nul_match, 1 and chr_match, synd, 0x3333333333333333 ands chr_match, chr_match, nul_match - sub result, src, 1 + add result, src, 15 clz tmp, chr_match sub result, result, tmp, lsr 2 csel result, result, xzr, ne ret .p2align 4 + nop + nop +L(loop2_start): + add src, src, 16 + bic vrepmask.8h, 0xf0 + L(loop2): cmp synd, 0 csel src_match, src, src_match, ne diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S index d36d69af37fd71a23f656ae0c5bc87f719bd3073..825a7384cfc11831455e5544408e9d5faf8ce57f 100644 --- a/string/aarch64/strrchr-sve.S +++ b/string/aarch64/strrchr-sve.S @@ -1,11 +1,11 @@ /* * strrchr - find the last of a character in a string * - * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S index 56185ff534e3915d3ada2c025b2943489b9b2d7b..bf9cb297b6cb3f4bc539594edb7e4a6cddc96f20 100644 --- a/string/aarch64/strrchr.S +++ b/string/aarch64/strrchr.S @@ -1,8 +1,8 @@ /* * strrchr - find last position of a character in a string. * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. 
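[In the strrchr-mte hunks above, the 0x33 repmask interleaves two events into one syndrome: within each 4-bit nibble the low bit pair flags a character match and the high pair flags a NUL (hence the 0x3333... and 0xcccc... masks). That turns "last match before the terminator" into a pure bit trick, modeled here in C for illustration (last_match_before_nul is a made-up name):

#include <stdint.h>

static int
last_match_before_nul (uint64_t synd)
{
  uint64_t nul_match = synd & 0xccccccccccccccccULL;  /* NUL bits  */
  uint64_t chr_match = synd & 0x3333333333333333ULL;  /* char bits */
  /* Subtracting 1 from the lowest NUL bit sets every bit below it,
     i.e. a mask covering all bytes before the terminator.  */
  if (nul_match)
    chr_match &= nul_match - 1;
  if (chr_match == 0)
    return -1;
  /* The highest surviving bit, divided by 4, is the byte index of
     the last match within the 16-byte chunk.  */
  return (63 - __builtin_clzll (chr_match)) / 4;
}
]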
*/ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c index d5d4ea7e0309a0a9e00dca54048cbb8dc7bb4c00..e070be586b528dc57d40f709e93ad2e10c34f053 100644 --- a/string/bench/memcpy.c +++ b/string/bench/memcpy.c @@ -1,8 +1,8 @@ /* * memcpy benchmark. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define _GNU_SOURCE @@ -13,14 +13,15 @@ #include "stringlib.h" #include "benchlib.h" -#define ITERS 5000 +#define ITERS 5000 #define ITERS2 20000000 -#define ITERS3 500000 -#define MAX_COPIES 8192 -#define SIZE (256*1024) +#define ITERS3 200000 +#define NUM_TESTS 16384 +#define MIN_SIZE 32768 +#define MAX_SIZE (1024 * 1024) -static uint8_t a[SIZE + 4096] __attribute__((__aligned__(64))); -static uint8_t b[SIZE + 4096] __attribute__((__aligned__(64))); +static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64))); +static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64))); #define F(x) {#x, x}, @@ -30,15 +31,21 @@ static const struct fun void *(*fun)(void *, const void *, size_t); } funtab[] = { - F(memcpy) #if __aarch64__ F(__memcpy_aarch64) # if __ARM_NEON F(__memcpy_aarch64_simd) # endif +# if __ARM_FEATURE_SVE + F(__memcpy_aarch64_sve) +# endif +# if WANT_MOPS + F(__memcpy_aarch64_mops) +# endif #elif __arm__ F(__memcpy_arm) #endif + F(memcpy) #undef F {0, 0} }; @@ -109,7 +116,7 @@ typedef struct uint64_t len : 16; } copy_t; -static copy_t copy[MAX_COPIES]; +static copy_t test_arr[NUM_TESTS]; typedef char *(*proto_t) (char *, const char *, size_t); @@ -140,14 +147,14 @@ init_copies (size_t max_size) size_t total = 0; /* Create a random set of copies with the given size and alignment distributions. 
*/ - for (int i = 0; i < MAX_COPIES; i++) + for (int i = 0; i < NUM_TESTS; i++) { - copy[i].dst = (rand32 (0) & (max_size - 1)); - copy[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK]; - copy[i].src = (rand32 (0) & (max_size - 1)); - copy[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK]; - copy[i].len = size_arr[rand32 (0) & SIZE_MASK]; - total += copy[i].len; + test_arr[i].dst = (rand32 (0) & (max_size - 1)); + test_arr[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK]; + test_arr[i].src = (rand32 (0) & (max_size - 1)); + test_arr[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK]; + test_arr[i].len = size_arr[rand32 (0) & SIZE_MASK]; + total += test_arr[i].len; } return total; @@ -160,25 +167,27 @@ int main (void) memset (a, 1, sizeof (a)); memset (b, 2, sizeof (b)); - printf("Random memcpy:\n"); + printf("Random memcpy (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { size_t total = 0; uint64_t tsum = 0; - printf ("%22s (B/ns) ", funtab[f].name); + printf ("%22s ", funtab[f].name); rand32 (0x12345678); - for (int size = 16384; size <= SIZE; size *= 2) + for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) { size_t copy_size = init_copies (size) * ITERS; - for (int c = 0; c < MAX_COPIES; c++) - funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len); + for (int c = 0; c < NUM_TESTS; c++) + funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src, + test_arr[c].len); uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS; i++) - for (int c = 0; c < MAX_COPIES; c++) - funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len); + for (int c = 0; c < NUM_TESTS; c++) + funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src, + test_arr[c].len); t = clock_get_ns () - t; total += copy_size; tsum += t; @@ -187,74 +196,147 @@ int main (void) printf( "avg %.2f\n", (double)total / tsum); } - printf ("\nMedium memcpy:\n"); + size_t total = 0; + uint64_t tsum = 0; + printf ("%22s ", "memcpy_call"); + rand32 (0x12345678); + + for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) + { + size_t copy_size = init_copies (size) * ITERS; + + for (int c = 0; c < NUM_TESTS; c++) + memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < NUM_TESTS; c++) + memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); + t = clock_get_ns () - t; + total += copy_size; + tsum += t; + printf ("%dK: %.2f ", size / 1024, (double)copy_size / t); + } + printf( "avg %.2f\n", (double)total / tsum); + + + printf ("\nAligned medium memcpy (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s (B/ns) ", funtab[f].name); + printf ("%22s ", funtab[f].name); - for (int size = 16; size <= 512; size *= 2) + for (int size = 8; size <= 512; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS2; i++) funtab[f].fun (b, a, size); t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 
'B' : 'K', (double)size * ITERS2 / t); + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); } printf ("\n"); } - printf ("\nLarge memcpy:\n"); + printf ("%22s ", "memcpy_call"); + for (int size = 8; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + memcpy (b, a, size); + t = clock_get_ns () - t; + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + } + printf ("\n"); + + + printf ("\nUnaligned medium memcpy (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s (B/ns) ", funtab[f].name); + printf ("%22s ", funtab[f].name); - for (int size = 1024; size <= 32768; size *= 2) + for (int size = 8; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + funtab[f].fun (b + 3, a + 1, size); + t = clock_get_ns () - t; + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + } + printf ("\n"); + } + + printf ("%22s ", "memcpy_call"); + for (int size = 8; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + memcpy (b + 3, a + 1, size); + t = clock_get_ns () - t; + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + } + printf ("\n"); + + + printf ("\nLarge memcpy (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s ", funtab[f].name); + + for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (b, a, size); t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); } - printf ("\nUnaligned forwards memmove:\n"); + printf ("%22s ", "memcpy_call"); + for (int size = 1024; size <= 65536; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + memcpy (b, a, size); + t = clock_get_ns () - t; + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); + } + printf ("\n"); + + + printf ("\nUnaligned forwards memmove (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s (B/ns) ", funtab[f].name); + printf ("%22s ", funtab[f].name); - for (int size = 1024; size <= 32768; size *= 2) + for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (a, a + 256 + (i & 31), size); t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); } - printf ("\nUnaligned backwards memmove:\n"); + printf ("\nUnaligned backwards memmove (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s (B/ns) ", funtab[f].name); + printf ("%22s ", funtab[f].name); - for (int size = 1024; size <= 32768; size *= 2) + for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (a + 256 + (i & 31), a, size); t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 
'B' : 'K', (double)size * ITERS3 / t); + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); } + printf ("\n"); return 0; } diff --git a/string/bench/memset.c b/string/bench/memset.c new file mode 100644 index 0000000000000000000000000000000000000000..990e23ba9a368bb28960d4211b1d3e3f4d96dee4 --- /dev/null +++ b/string/bench/memset.c @@ -0,0 +1,243 @@ +/* + * memset benchmark. + * + * Copyright (c) 2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include "stringlib.h" +#include "benchlib.h" + +#define ITERS 5000 +#define ITERS2 20000000 +#define ITERS3 1000000 +#define NUM_TESTS 16384 +#define MIN_SIZE 32768 +#define MAX_SIZE (1024 * 1024) + +static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(64))); + +#define F(x) {#x, x}, + +static const struct fun +{ + const char *name; + void *(*fun)(void *, int, size_t); +} funtab[] = +{ +#if __aarch64__ + F(__memset_aarch64) +#elif __arm__ + F(__memset_arm) +#endif + F(memset) +#undef F + {0, 0} +}; + +typedef struct { uint32_t offset : 20, len : 12; } memset_test_t; +static memset_test_t test_arr[NUM_TESTS]; + +typedef struct { uint16_t size; uint16_t freq; } freq_data_t; +typedef struct { uint8_t align; uint16_t freq; } align_data_t; + +#define SIZE_NUM 65536 +#define SIZE_MASK (SIZE_NUM-1) +static uint8_t len_arr[SIZE_NUM]; + +/* Frequency data for memset sizes up to 4096 based on SPEC2017. */ +static freq_data_t memset_len_freq[] = +{ +{40,28817}, {32,15336}, { 16,3823}, {296,3545}, { 24,3454}, { 8,1412}, +{292,1202}, { 48, 927}, { 12, 613}, { 11, 539}, {284, 493}, {108, 414}, +{ 88, 380}, { 20, 295}, {312, 271}, { 72, 233}, { 2, 200}, { 4, 192}, +{ 15, 180}, { 14, 174}, { 13, 160}, { 56, 151}, { 36, 144}, { 64, 140}, +{4095,133}, { 10, 130}, { 9, 124}, { 3, 124}, { 28, 120}, { 0, 118}, +{288, 110}, {1152, 96}, {104, 90}, { 1, 86}, {832, 76}, {248, 74}, +{1024, 69}, {120, 64}, {512, 63}, {384, 60}, { 6, 59}, { 80, 54}, +{ 17, 50}, { 7, 49}, {520, 47}, {2048, 39}, {256, 37}, {864, 33}, +{1440, 28}, { 22, 27}, {2056, 24}, {260, 23}, { 68, 23}, { 5, 22}, +{ 18, 21}, {200, 18}, {2120, 18}, { 60, 17}, { 52, 16}, {336, 15}, +{ 44, 13}, {192, 13}, {160, 12}, {2064, 12}, {128, 12}, { 76, 11}, +{164, 11}, {152, 10}, {136, 9}, {488, 7}, { 96, 6}, {560, 6}, +{1016, 6}, {112, 5}, {232, 5}, {168, 5}, {952, 5}, {184, 5}, +{144, 4}, {252, 4}, { 84, 3}, {960, 3}, {3808, 3}, {244, 3}, +{280, 3}, {224, 3}, {156, 3}, {1088, 3}, {440, 3}, {216, 2}, +{304, 2}, { 23, 2}, { 25, 2}, { 26, 2}, {264, 2}, {328, 2}, +{1096, 2}, {240, 2}, {1104, 2}, {704, 2}, {1664, 2}, {360, 2}, +{808, 1}, {544, 1}, {236, 1}, {720, 1}, {368, 1}, {424, 1}, +{640, 1}, {1112, 1}, {552, 1}, {272, 1}, {776, 1}, {376, 1}, +{ 92, 1}, {536, 1}, {824, 1}, {496, 1}, {760, 1}, {792, 1}, +{504, 1}, {344, 1}, {1816, 1}, {880, 1}, {176, 1}, {320, 1}, +{352, 1}, {2008, 1}, {208, 1}, {408, 1}, {228, 1}, {2072, 1}, +{568, 1}, {220, 1}, {616, 1}, {600, 1}, {392, 1}, {696, 1}, +{2144, 1}, {1280, 1}, {2136, 1}, {632, 1}, {584, 1}, {456, 1}, +{472, 1}, {3440, 1}, {2088, 1}, {680, 1}, {2928, 1}, {212, 1}, +{648, 1}, {1752, 1}, {664, 1}, {3512, 1}, {1032, 1}, {528, 1}, +{4072, 1}, {204, 1}, {2880, 1}, {3392, 1}, {712, 1}, { 59, 1}, +{736, 1}, {592, 1}, {2520, 1}, {744, 1}, {196, 1}, {172, 1}, +{728, 1}, {2040, 1}, {1192, 1}, {3600, 1}, {0, 0} +}; + +#define ALIGN_NUM 1024 +#define ALIGN_MASK (ALIGN_NUM-1) +static uint8_t align_arr[ALIGN_NUM]; + +/* Alignment data for 
memset based on SPEC2017. */ +static align_data_t memset_align_freq[] = +{ + {16, 338}, {8, 307}, {32, 148}, {64, 131}, {4, 72}, {1, 23}, {2, 5}, {0, 0} +}; + +static void +init_memset_distribution (void) +{ + int i, j, freq, size, n; + + for (n = i = 0; (freq = memset_len_freq[i].freq) != 0; i++) + for (j = 0, size = memset_len_freq[i].size; j < freq; j++) + len_arr[n++] = size; + assert (n == SIZE_NUM); + + for (n = i = 0; (freq = memset_align_freq[i].freq) != 0; i++) + for (j = 0, size = memset_align_freq[i].align; j < freq; j++) + align_arr[n++] = size - 1; + assert (n == ALIGN_NUM); +} + +static size_t +init_memset (size_t max_size) +{ + size_t total = 0; + /* Create a random set of memsets with the given size and alignment + distributions. */ + for (int i = 0; i < NUM_TESTS; i++) + { + test_arr[i].offset = (rand32 (0) & (max_size - 1)); + test_arr[i].offset &= ~align_arr[rand32 (0) & ALIGN_MASK]; + test_arr[i].len = len_arr[rand32 (0) & SIZE_MASK]; + total += test_arr[i].len; + } + + return total; +} + + +int main (void) +{ + init_memset_distribution (); + + memset (a, 1, sizeof (a)); + + printf("Random memset (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + size_t total_size = 0; + uint64_t tsum = 0; + printf ("%22s ", funtab[f].name); + rand32 (0x12345678); + + for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) + { + size_t memset_size = init_memset (size) * ITERS; + + for (int c = 0; c < NUM_TESTS; c++) + funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len); + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < NUM_TESTS; c++) + funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len); + t = clock_get_ns () - t; + total_size += memset_size; + tsum += t; + printf ("%dK: %.2f ", size / 1024, (double)memset_size / t); + } + printf( "avg %.2f\n", (double)total_size / tsum); + } + + size_t total_size = 0; + uint64_t tsum = 0; + printf ("%22s ", "memset_call"); + rand32 (0x12345678); + + for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) + { + size_t memset_size = init_memset (size) * ITERS; + + for (int c = 0; c < NUM_TESTS; c++) + memset (a + test_arr[c].offset, 0, test_arr[c].len); + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < NUM_TESTS; c++) + memset (a + test_arr[c].offset, 0, test_arr[c].len); + t = clock_get_ns () - t; + total_size += memset_size; + tsum += t; + printf ("%dK: %.2f ", size / 1024, (double)memset_size / t); + } + printf( "avg %.2f\n", (double)total_size / tsum); + + + printf ("\nMedium memset (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s ", funtab[f].name); + + for (int size = 8; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + funtab[f].fun (a, 0, size); + t = clock_get_ns () - t; + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + } + printf ("\n"); + } + + printf ("%22s ", "memset_call"); + for (int size = 8; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + memset (a, 0, size); + t = clock_get_ns () - t; + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + } + + + printf ("\nLarge memset (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s ", funtab[f].name); + + for (int size = 1024; size <= 65536; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + funtab[f].fun (a, 0, size); + t = clock_get_ns () - t; + printf ("%dK: %.2f ", size / 1024, 
(double)size * ITERS3 / t); + } + printf ("\n"); + } + + printf ("%22s ", "memset_call"); + for (int size = 1024; size <= 65536; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + memset (a, 0, size); + t = clock_get_ns () - t; + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); + } + printf ("\n\n"); + + return 0; +} diff --git a/string/bench/strlen.c b/string/bench/strlen.c index cc0f04bee5471a4c623e047f773bde10f0e8aac7..f05d0d5b89e6f1c689d38ea45d2feefb99bf5f82 100644 --- a/string/bench/strlen.c +++ b/string/bench/strlen.c @@ -1,8 +1,8 @@ /* * strlen benchmark. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define _GNU_SOURCE @@ -13,10 +13,10 @@ #include "stringlib.h" #include "benchlib.h" -#define ITERS 2000 +#define ITERS 5000 #define ITERS2 20000000 #define ITERS3 2000000 -#define NUM_STRLEN 16384 +#define NUM_TESTS 16384 #define MAX_ALIGN 32 #define MAX_STRLEN 256 @@ -49,7 +49,7 @@ static const struct fun }; #undef F -static uint16_t strlen_tests[NUM_STRLEN]; +static uint16_t strlen_tests[NUM_TESTS]; typedef struct { uint16_t size; uint16_t freq; } freq_data_t; typedef struct { uint8_t align; uint16_t freq; } align_data_t; @@ -117,7 +117,7 @@ init_strlen_tests (void) /* Create a random set of strlen input strings using the string length and alignment distributions. */ - for (int n = 0; n < NUM_STRLEN; n++) + for (int n = 0; n < NUM_TESTS; n++) { int align = strlen_align_arr[rand32 (0) & ALIGN_MASK]; int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK]; @@ -141,14 +141,14 @@ int main (void) size_t res = 0, strlen_size = 0, mask = maskv; printf ("%22s ", funtab[f].name); - for (int c = 0; c < NUM_STRLEN; c++) + for (int c = 0; c < NUM_TESTS; c++) strlen_size += funtab[f].fun (a + strlen_tests[c]); strlen_size *= ITERS; /* Measure latency of strlen result with (res & mask). */ uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS; i++) - for (int c = 0; c < NUM_STRLEN; c++) + for (int c = 0; c < NUM_TESTS; c++) res = funtab[f].fun (a + strlen_tests[c] + (res & mask)); t = clock_get_ns () - t; printf ("%.2f\n", (double)strlen_size / t); diff --git a/string/include/benchlib.h b/string/include/benchlib.h index 0f2ce2eb6bce2685432d4207f987f3896c4b8363..f1bbea388cd217981dbf6513a1c0a1fadbc894bc 100644 --- a/string/include/benchlib.h +++ b/string/include/benchlib.h @@ -2,7 +2,7 @@ * Benchmark support functions. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/include/stringlib.h b/string/include/stringlib.h index 378c3cd2d64590c05aa1cb80f6ba2559be017d2d..650c52cbda786613bbd5daf64a827903b54bb3ba 100644 --- a/string/include/stringlib.h +++ b/string/include/stringlib.h @@ -1,8 +1,8 @@ /* * Public API. * - * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. 
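[The SPEC2017 frequency tables in the new memset benchmark above, and the equivalent tables in the memcpy and strlen benchmarks, all feed the same sampling scheme: each (value, frequency) pair is expanded into that many slots of a power-of-two lookup table, after which a masked random number draws a value with the intended probability in O(1). A toy-sized sketch with made-up table contents follows; the patch itself uses 65536 size slots and 1024 alignment slots, and rand32 is the repo's benchlib PRNG.

#include <assert.h>
#include <stdint.h>

#define TABLE_NUM  16
#define TABLE_MASK (TABLE_NUM - 1)

typedef struct { uint16_t size; uint16_t freq; } freq_data_t;

static freq_data_t example_freq[] = { {32, 8}, {16, 4}, {8, 4}, {0, 0} };
static uint8_t lookup[TABLE_NUM];

static void
init_distribution (void)
{
  int n = 0;
  for (int i = 0; example_freq[i].freq != 0; i++)
    for (int j = 0; j < example_freq[i].freq; j++)
      lookup[n++] = example_freq[i].size;
  assert (n == TABLE_NUM);  /* frequencies must sum to the table size */
}

After init_distribution (), lookup[rand32 (0) & TABLE_MASK] yields 32 half the time and 16 or 8 a quarter of the time each.]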
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -29,19 +29,17 @@ size_t __strlen_aarch64 (const char *); size_t __strnlen_aarch64 (const char *, size_t); int __strncmp_aarch64 (const char *, const char *, size_t); void * __memchr_aarch64_mte (const void *, int, size_t); -char *__strcpy_aarch64_mte (char *__restrict, const char *__restrict); -char *__stpcpy_aarch64_mte (char *__restrict, const char *__restrict); char *__strchr_aarch64_mte (const char *, int); char * __strchrnul_aarch64_mte (const char *, int ); size_t __strlen_aarch64_mte (const char *); char *__strrchr_aarch64_mte (const char *, int); -int __strcmp_aarch64_mte (const char *, const char *); -int __strncmp_aarch64_mte (const char *, const char *, size_t); #if __ARM_NEON void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t); void *__memmove_aarch64_simd (void *, const void *, size_t); #endif # if __ARM_FEATURE_SVE +void *__memcpy_aarch64_sve (void *__restrict, const void *__restrict, size_t); +void *__memmove_aarch64_sve (void *__restrict, const void *__restrict, size_t); void *__memchr_aarch64_sve (const void *, int, size_t); int __memcmp_aarch64_sve (const void *, const void *, size_t); char *__strchr_aarch64_sve (const char *, int); @@ -54,6 +52,11 @@ size_t __strlen_aarch64_sve (const char *); size_t __strnlen_aarch64_sve (const char *, size_t); int __strncmp_aarch64_sve (const char *, const char *, size_t); # endif +# if WANT_MOPS +void *__memcpy_aarch64_mops (void *__restrict, const void *__restrict, size_t); +void *__memmove_aarch64_mops (void *__restrict, const void *__restrict, size_t); +void *__memset_aarch64_mops (void *, int, size_t); +# endif # if __ARM_FEATURE_MEMORY_TAGGING void *__mtag_tag_region (void *, size_t); void *__mtag_tag_zero_region (void *, size_t); diff --git a/string/test/__mtag_tag_region.c b/string/test/__mtag_tag_region.c index d8c02d92d626a6e754b756cdcb17945e6a6a14ad..c45fa6662a77bbdab77fe6998ffb3830952016fa 100644 --- a/string/test/__mtag_tag_region.c +++ b/string/test/__mtag_tag_region.c @@ -2,7 +2,7 @@ * __mtag_tag_region test. * * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST diff --git a/string/test/__mtag_tag_zero_region.c b/string/test/__mtag_tag_zero_region.c index 221c223a2f3105ab02c7b21b9560a81bddf4355d..a4a7861620d1f4db8eedc438cae77aa8145040d7 100644 --- a/string/test/__mtag_tag_zero_region.c +++ b/string/test/__mtag_tag_zero_region.c @@ -2,7 +2,7 @@ * __mtag_tag_zero_region test. * * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST diff --git a/string/test/memchr.c b/string/test/memchr.c index 0ff77f5710bf2d413b5e1f9a4c5243e0fe945c2c..c6a94481c0adbaeaf27b81c0d18643a25236f623 100644 --- a/string/test/memchr.c +++ b/string/test/memchr.c @@ -2,7 +2,7 @@ * memchr test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/test/memcmp.c b/string/test/memcmp.c index 7a7cf9cff35af2c22248dfd21609b7e83af68976..f9236b83a60d446315cbc5ddb27f03458d50b538 100644 --- a/string/test/memcmp.c +++ b/string/test/memcmp.c @@ -2,7 +2,7 @@ * memcmp test. * * Copyright (c) 2019-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/test/memcpy.c b/string/test/memcpy.c index ce0ceeef5ee844e5feadaf2cb18020436e1e9b12..0c2c75a29e2d45c13a6d900a4a8e21984266b2d8 100644 --- a/string/test/memcpy.c +++ b/string/test/memcpy.c @@ -1,8 +1,8 @@ /* * memcpy test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -28,6 +28,12 @@ static const struct fun # if __ARM_NEON F(__memcpy_aarch64_simd, 1) # endif +# if __ARM_FEATURE_SVE + F(__memcpy_aarch64_sve, 1) +# endif +# if WANT_MOPS + F(__memcpy_aarch64_mops, 1) +# endif #elif __arm__ F(__memcpy_arm, 0) #endif diff --git a/string/test/memmove.c b/string/test/memmove.c index 689b68c98af264c8d5e485e7134a0f216fce555c..a5149d74465dad744ec85bee844f053b8727739c 100644 --- a/string/test/memmove.c +++ b/string/test/memmove.c @@ -1,8 +1,8 @@ /* * memmove test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -28,6 +28,12 @@ static const struct fun # if __ARM_NEON F(__memmove_aarch64_simd, 1) # endif +# if __ARM_FEATURE_SVE + F(__memmove_aarch64_sve, 1) +# endif +# if WANT_MOPS + F(__memmove_aarch64_mops, 1) +# endif #endif {0, 0, 0} // clang-format on diff --git a/string/test/memrchr.c b/string/test/memrchr.c index adf96f049cc938ee48cf51c1a1fea94ac73af60a..4171a56daefd6596cc453d075292960db6225d0f 100644 --- a/string/test/memrchr.c +++ b/string/test/memrchr.c @@ -2,7 +2,7 @@ * memchr test. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE diff --git a/string/test/memset.c b/string/test/memset.c index f1721442dbaf83f682859526632655c7ad65cd75..3489e2986a71c18e40d1a08d069664b0149de415 100644 --- a/string/test/memset.c +++ b/string/test/memset.c @@ -2,7 +2,7 @@ * memset test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -25,6 +25,9 @@ static const struct fun F(memset, 0) #if __aarch64__ F(__memset_aarch64, 1) +# if WANT_MOPS + F(__memset_aarch64_mops, 1) +# endif #elif __arm__ F(__memset_arm, 0) #endif diff --git a/string/test/mte.h b/string/test/mte.h index e67cbd9d2d400ac1b6bbb4ce815073f483fdb20b..40b0ecf6c194df67a51a14bbe6d3a262dc441590 100644 --- a/string/test/mte.h +++ b/string/test/mte.h @@ -2,7 +2,7 @@ * Memory tagging testing code. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef __TEST_MTE_H diff --git a/string/test/stpcpy.c b/string/test/stpcpy.c index 1827e68c9a30e75b75e467968c75cca7e4f54dc8..0300892a1f3ccaf0dc35ea0b3e85bca861ce99cc 100644 --- a/string/test/stpcpy.c +++ b/string/test/stpcpy.c @@ -1,8 +1,8 @@ /* * stpcpy test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. 
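[Most of the test hunks in this patch edit the same F(x) function-table idiom: a stringizing macro builds a NULL-terminated array of (name, function) pairs, and preprocessor guards decide which implementations enter the table. A minimal standalone model of the pattern, with only the always-available memcpy as an entry (the real tables add __memcpy_aarch64 and friends under the guards shown in the hunks above):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define F(x) { #x, x },

static const struct fun
{
  const char *name;
  void *(*fun) (void *, const void *, size_t);
} funtab[] =
{
  F(memcpy)   /* baseline entry; variants are added under #if guards */
#undef F
  { 0, 0 }
};

int
main (void)
{
  char dst[8], src[8] = "abcdefg";
  for (int f = 0; funtab[f].name != 0; f++)
    printf ("%s -> %s\n", funtab[f].name,
            (char *) funtab[f].fun (dst, src, sizeof src));
  return 0;
}
]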
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE @@ -28,8 +28,7 @@ static const struct fun // clang-format off F(stpcpy, 0) #if __aarch64__ - F(__stpcpy_aarch64, 0) - F(__stpcpy_aarch64_mte, 1) + F(__stpcpy_aarch64, 1) # if __ARM_FEATURE_SVE F(__stpcpy_aarch64_sve, 1) # endif diff --git a/string/test/strchr.c b/string/test/strchr.c index f3ae982ef0adf0850986741f84e9f63d131d9cfe..66180acfb57c6b824bcd39b8e23bada7ab3904a7 100644 --- a/string/test/strchr.c +++ b/string/test/strchr.c @@ -2,7 +2,7 @@ * strchr test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/test/strchrnul.c b/string/test/strchrnul.c index 6c30ab2123f16aac57b59896740e859018fb3bf0..aad0bf59da664e02495e82ab014b19fb81b3576b 100644 --- a/string/test/strchrnul.c +++ b/string/test/strchrnul.c @@ -2,7 +2,7 @@ * strchrnul test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE diff --git a/string/test/strcmp.c b/string/test/strcmp.c index d57b54ed50a8a5e8b742805444510ec98a62851d..4aa95f4f2f1dd6e00fc97082abf8994f5fce2643 100644 --- a/string/test/strcmp.c +++ b/string/test/strcmp.c @@ -1,8 +1,8 @@ /* * strcmp test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -24,8 +24,7 @@ static const struct fun // clang-format off F(strcmp, 0) #if __aarch64__ - F(__strcmp_aarch64, 0) - F(__strcmp_aarch64_mte, 1) + F(__strcmp_aarch64, 1) # if __ARM_FEATURE_SVE F(__strcmp_aarch64_sve, 1) # endif diff --git a/string/test/strcpy.c b/string/test/strcpy.c index e84cace9c8c610e6f03892be2eb8fc3c92d537ea..af297f90396a95d6b88cbf0357aa1860d862f62c 100644 --- a/string/test/strcpy.c +++ b/string/test/strcpy.c @@ -1,8 +1,8 @@ /* * strcpy test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -24,8 +24,7 @@ static const struct fun // clang-format off F(strcpy, 0) #if __aarch64__ - F(__strcpy_aarch64, 0) - F(__strcpy_aarch64_mte, 1) + F(__strcpy_aarch64, 1) # if __ARM_FEATURE_SVE F(__strcpy_aarch64_sve, 1) # endif diff --git a/string/test/stringtest.h b/string/test/stringtest.h index fe855fc217369099ab10af634f392517edf89f66..6bb7e1fdfeca2d291cfba0d254d564ed3c51d57b 100644 --- a/string/test/stringtest.h +++ b/string/test/stringtest.h @@ -2,7 +2,7 @@ * Common string test code. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/test/strlen.c b/string/test/strlen.c index 6278380f26df71b5742944cca66d4a7568957ea6..47ef3dcf0ef0c94adf16d07d77c341038a125389 100644 --- a/string/test/strlen.c +++ b/string/test/strlen.c @@ -1,15 +1,14 @@ /* * strlen test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include -#include #include #include "mte.h" #include "stringlib.h" diff --git a/string/test/strncmp.c b/string/test/strncmp.c index 018a8a431ab8ca55110b814e0e089fde6f199772..4bbab6f934509708d760b7cf99d8fbf8c57b21e7 100644 --- a/string/test/strncmp.c +++ b/string/test/strncmp.c @@ -1,8 +1,8 @@ /* * strncmp test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -24,8 +24,7 @@ static const struct fun // clang-format off F(strncmp, 0) #if __aarch64__ - F(__strncmp_aarch64, 0) - F(__strncmp_aarch64_mte, 1) + F(__strncmp_aarch64, 1) # if __ARM_FEATURE_SVE F(__strncmp_aarch64_sve, 1) # endif diff --git a/string/test/strnlen.c b/string/test/strnlen.c index 0dea00eaf8e3dc41bc465aa201a312e3a85bf230..a800fd1993cdc21a9023fb5eabfb50781a2b9d70 100644 --- a/string/test/strnlen.c +++ b/string/test/strnlen.c @@ -2,7 +2,7 @@ * strnlen test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE diff --git a/string/test/strrchr.c b/string/test/strrchr.c index fedbdc52fcc1151ffbbd168ef3bd1cb42c700ff0..580ca497f8a46b1ae92d1e3288b29d2d13178ccf 100644 --- a/string/test/strrchr.c +++ b/string/test/strrchr.c @@ -2,7 +2,7 @@ * strrchr test. * * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/x86_64/check-arch.S b/string/x86_64/check-arch.S index 26ade0a0c7db635acdbb3bd9592fee3ce9ec540d..5afcf7b7ee548aa275f105f72714d390da4d076a 100644 --- a/string/x86_64/check-arch.S +++ b/string/x86_64/check-arch.S @@ -2,7 +2,7 @@ * check ARCH setting. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if !__x86_64__
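[One measurement detail worth calling out from the strlen benchmark changes earlier in this patch: the timing loop folds the previous result, ANDed with a zero mask, into the next input pointer. The added term is always zero, but because the mask is a run-time value, neither the compiler nor the CPU can treat the calls as independent, so the loop measures per-call latency rather than throughput. A self-contained sketch of the idiom; strlen_latency and now_ns are made-up names, and the repo itself uses clock_get_ns and distribution-driven offsets.

#include <stdint.h>
#include <string.h>

static uint64_t
strlen_latency (const char *buf, const uint16_t *offs, int n,
                size_t mask,            /* pass 0 at run time */
                uint64_t (*now_ns) (void))
{
  size_t res = 0;
  uint64_t t = now_ns ();
  for (int c = 0; c < n; c++)
    /* (res & mask) is always 0, but it chains each call to the
       previous result, serializing the loop.  */
    res = strlen (buf + offs[c] + (res & mask));
  t = now_ns () - t;
  return t + (res & mask);  /* keep res live so the loop isn't elided */
}
]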