diff --git a/LICENSE b/LICENSE index 2543b82ed92d0bdc5f3fdfa5047144db3c7e9014..20a4b7717cf5e46e2def2ecd47756baf3061d2bd 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,11 @@ +MIT OR Apache-2.0 WITH LLVM-exception +===================================== + + MIT License +----------- -Copyright (c) 1999-2019, Arm Limited. +Copyright (c) 1999-2022, Arm Limited. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -19,3 +24,226 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +Apache-2.0 WITH LLVM-exception +------------------------------ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +--- LLVM Exceptions to the Apache 2.0 License ---- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into an Object form of such source code, you +may redistribute such embedded portions in such Object form without complying +with the conditions of Sections 4(a), 4(b) and 4(d) of the License. + +In addition, if you combine or link compiled forms of this Software with +software that is licensed under the GPLv2 ("Combined Software") and if a +court of competent jurisdiction determines that the patent provision (Section +3), the indemnity provision (Section 9) or other Section of the License +conflicts with the conditions of the GPLv2, you may retroactively and +prospectively choose to deem waived or otherwise exclude such Section(s) of +the License, but only in their entirety and only with respect to the Combined +Software. diff --git a/Makefile b/Makefile index 169f89e2c9d6be3f53a91780447652ee7917b28e..c487896728c2cd3c877dad0f52256ddd1e5ebbe8 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Makefile - requires GNU make # -# Copyright (c) 2018-2020, Arm Limited. -# SPDX-License-Identifier: MIT +# Copyright (c) 2018-2022, Arm Limited. +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception srcdir = . prefix = /usr @@ -11,6 +11,7 @@ includedir = $(prefix)/include # Configure these in config.mk, do not make changes in this file. SUBS = math string networking +PLSUBS = math HOST_CC = cc HOST_CFLAGS = -std=c99 -O2 HOST_LDFLAGS = @@ -20,6 +21,7 @@ CPPFLAGS = CFLAGS = -std=c99 -O2 CFLAGS_SHARED = -fPIC CFLAGS_ALL = -Ibuild/include $(CPPFLAGS) $(CFLAGS) +CFLAGS_PL = -Ibuild/pl/include $(CPPFLAGS) $(CFLAGS) -DPL LDFLAGS = LDLIBS = AR = $(CROSS_COMPILE)ar @@ -51,6 +53,7 @@ $(DIRS): mkdir -p $@ $(filter %.os,$(ALL_FILES)): CFLAGS_ALL += $(CFLAGS_SHARED) +$(filter %.os,$(ALL_FILES)): CFLAGS_PL += $(CFLAGS_SHARED) build/%.o: $(srcdir)/%.S $(CC) $(CFLAGS_ALL) -c -o $@ $< diff --git a/OAT.xml b/OAT.xml index 71acb93c33930961bd73a8c0eed2ddee84da6bd7..ab48a784237e62c8f8595b1b124e3251991afade 100644 --- a/OAT.xml +++ b/OAT.xml @@ -19,7 +19,7 @@ policylist: 1. policy: If the OAT-Default.xml policies do not meet your requirements, please add policies here. 2. 
policyitem: The fields type, name, path, desc is required, and the fields rule, group, filefilter is optional,the default value is: - + 3. policyitem type: "compatibility" is used to check license compatibility in the specified path; "license" is used to check source license header in the specified path; @@ -49,10 +49,43 @@ All configurations in this file will be merged to OAT-Default.xml, if you have a + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + + + diff --git a/README b/README index 9e1a34fdc65d9acd27964255a42211af5ef06efa..a2143a28488abe9cbdb629698a3f22d353489b9a 100644 --- a/README +++ b/README @@ -2,14 +2,17 @@ Arm Optimized Routines ---------------------- This repository contains implementations of library functions -provided by Arm under MIT License (See LICENSE). Contributions -to this project are accepted, but Contributors have to sign an -Assignment Agreement, please follow the instructions in +provided by Arm. The outbound license is available under a dual +license, at the user’s election, as reflected in the LICENSE file. +Contributions to this project are accepted, but Contributors have +to sign an Assignment Agreement, please follow the instructions in contributor-agreement.pdf. This is needed so upstreaming code -to projects that require copyright assignment is possible. +to projects that require copyright assignment is possible. Further +contribution requirements are documented in README.contributors of +the appropriate subdirectory. Regular quarterly releases are tagged as vYY.MM, the latest -release is v21.02. +release is v23.01. Source code layout: @@ -24,6 +27,7 @@ networking/test/ - networking test and benchmark related sources. string/ - string routines subproject sources. string/include/ - string library public headers. string/test/ - string test and benchmark related sources. +pl/... - separately maintained performance library code. The steps to build the target libraries and run the tests: diff --git a/config.mk.dist b/config.mk.dist index 177e1ac4f53a3e14772a7560f7f79eba86ffe5e7..c4a6dba4b463f669c8a27bac66029c508ed2c875 100644 --- a/config.mk.dist +++ b/config.mk.dist @@ -1,11 +1,14 @@ # Example config.mk # -# Copyright (c) 2018-2020, Arm Limited. -# SPDX-License-Identifier: MIT +# Copyright (c) 2018-2022, Arm Limited. +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception # Subprojects to build SUBS = math string networking +# Subsubprojects to build if subproject pl is built +PLSUBS = math + # Target architecture: aarch64, arm or x86_64 ARCH = aarch64 @@ -56,8 +59,22 @@ math-cflags += -ffp-contract=fast -fno-math-errno # Use with clang. #math-cflags += -ffp-contract=fast -# Disable vector math code -#math-cflags += -DWANT_VMATH=0 +# Disable/enable SVE vector math code and tests +WANT_SVE_MATH = 0 +ifeq ($(WANT_SVE_MATH), 1) + math-cflags += -march=armv8.2-a+sve +endif +math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH) + +# If defined to 1, set errno in math functions according to ISO C. Many math +# libraries do not set errno, so this is 0 by default. It may need to be +# set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0. +WANT_ERRNO = 0 +math-cflags += -DWANT_ERRNO=$(WANT_ERRNO) + +# If set to 1, set fenv in vector math routines. 
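+# (For example, a hypothetical config.mk wanting full ISO C error handling
+# would set WANT_ERRNO = 1 above, so the scalar routines set errno, and
+# WANT_SIMD_EXCEPT = 1 below, so the vector routines raise fenv exceptions;
+# both default to 0, trading that conformance for speed.)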
+WANT_SIMD_EXCEPT = 0
+math-cflags += -DWANT_SIMD_EXCEPT=$(WANT_SIMD_EXCEPT)
 
 # Disable fenv checks
 #math-ulpflags = -q -f
diff --git a/math/Dir.mk b/math/Dir.mk
index 3b841ab71955cc69efff77a1e1fee21938422371..d6385d2bf5173daa6ea0b68d358749c5e7c45154 100644
--- a/math/Dir.mk
+++ b/math/Dir.mk
@@ -1,12 +1,14 @@
 # Makefile fragment - requires GNU make
 #
-# Copyright (c) 2019, Arm Limited.
-# SPDX-License-Identifier: MIT
+# Copyright (c) 2019-2022, Arm Limited.
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 S := $(srcdir)/math
 B := build/math
 
 math-lib-srcs := $(wildcard $(S)/*.[cS])
+math-lib-srcs += $(wildcard $(S)/$(ARCH)/*.[cS])
+
 math-test-srcs := \
 	$(S)/test/mathtest.c \
 	$(S)/test/mathbench.c \
@@ -15,6 +17,7 @@ math-test-srcs := \
 math-test-host-srcs := $(wildcard $(S)/test/rtest/*.[cS])
 
 math-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
+math-test-includes := $(patsubst $(S)/%,build/include/%,$(wildcard $(S)/test/*.h))
 
 math-libs := \
 build/lib/libmathlib.so \
@@ -42,10 +45,11 @@ math-files := \
 $(math-tools) \
 $(math-host-tools) \
 $(math-includes) \
+$(math-test-includes) \
 
-all-math: $(math-libs) $(math-tools) $(math-includes)
+all-math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes)
 
-$(math-objs): $(math-includes)
+$(math-objs): $(math-includes) $(math-test-includes)
 $(math-objs): CFLAGS_ALL += $(math-cflags)
 $(B)/test/mathtest.o: CFLAGS_ALL += -fmath-errno
 $(math-host-objs): CC = $(HOST_CC)
@@ -83,6 +87,9 @@ build/bin/ulp: $(B)/test/ulp.o build/lib/libmathlib.a
 build/include/%.h: $(S)/include/%.h
 	cp $< $@
 
+build/include/test/%.h: $(S)/test/%.h
+	cp $< $@
+
 build/bin/%.sh: $(S)/test/%.sh
 	cp $< $@
 
@@ -96,7 +103,7 @@ check-math-rtest: $(math-host-tools) $(math-tools)
 	cat $(math-rtests) | build/bin/rtest | $(EMULATOR) build/bin/mathtest $(math-testflags)
 
 check-math-ulp: $(math-tools)
-	ULPFLAGS="$(math-ulpflags)" build/bin/runulp.sh $(EMULATOR)
+	ULPFLAGS="$(math-ulpflags)" WANT_SIMD_EXCEPT="$(WANT_SIMD_EXCEPT)" build/bin/runulp.sh $(EMULATOR)
 
 check-math: check-math-test check-math-rtest check-math-ulp
diff --git a/math/README.contributors b/math/README.contributors
new file mode 100644
index 0000000000000000000000000000000000000000..33e7ba376e419301eaf8e51fc7abe4ad10a31350
--- /dev/null
+++ b/math/README.contributors
@@ -0,0 +1,78 @@
+STYLE REQUIREMENTS
+==================
+
+1. Most code in this sub-directory is expected to be upstreamed into glibc, so
+   the GNU Coding Standard and glibc-specific conventions should be followed
+   to ease upstreaming.
+
+2. ABI and symbols: the code should be written so it is suitable for inclusion
+   into a libc with minimal changes. This means, for example, that internal
+   symbols should be hidden and placed in the implementation-reserved
+   namespace according to ISO C and POSIX rules. If possible, the built shared
+   libraries and static library archives should be usable to override libc
+   symbols at link time (or at runtime via LD_PRELOAD). This requires the
+   symbols to follow the glibc ABI (other than symbol versioning); since this
+   cannot be done reliably for static linking, it is a best-effort
+   requirement.
+
+3. API: include headers should be suitable for benchmarking and testing code
+   and should not conflict with libc headers.
+
+
+CONTRIBUTION GUIDELINES FOR math SUB-DIRECTORY
+==============================================
+
+1. Math functions have quality and performance requirements.
+
+2. Quality:
+   - Worst-case ULP error should be small across the entire input domain. For
+     most common double-precision scalar functions the target is < 0.66 ULP
+     error, and < 1 ULP for single precision; even a performance-optimized
+     function variant should not have > 5 ULP error if the goal is to be a
+     drop-in replacement for a standard math function. This should be tested
+     statistically (or on all inputs, if that is possible in a reasonable
+     amount of time). The ulp tool is for this, and runulp.sh should be
+     updated for new functions.
+
+   - All standard rounding modes need to be supported, but in non-default
+     rounding modes the quality requirement can be relaxed. (Non-nearest
+     rounded computation can be slow and inaccurate but has to be correct for
+     conformance reasons.)
+
+   - Special cases and error handling need to follow ISO C Annex F
+     requirements, POSIX requirements, IEEE 754-2008 requirements and glibc
+     requirements:
+     https://www.gnu.org/software/libc/manual/html_mono/libc.html#Errors-in-Math-Functions
+     This should be tested by direct tests (the glibc test system may be used
+     for it).
+
+   - Error handling code should be decoupled from the approximation code as
+     much as possible. (There are helper functions; these take care of errno
+     as well as exception raising.)
+
+   - Vector math code does not need to work in non-nearest rounding modes, and
+     error handling side effects (fenv exceptions and errno) need not happen,
+     but the result should be correct (within quality requirements, which are
+     lower for vector code than for scalar code).
+
+   - Error bounds of the approximation should be clearly documented.
+
+   - The code should build and pass tests on arm, aarch64 and x86_64 GNU Linux
+     systems. (Routines and features can be disabled on specific targets, but
+     the build must complete.) On aarch64, both little- and big-endian targets
+     are supported, as well as valid combinations of architecture extensions.
+     The configurations that should be tested depend on the contribution.
+
+3. Performance:
+   - Common math code should be benchmarked on modern aarch64
+     microarchitectures over typical inputs.
+
+   - Performance improvements should be documented (relative numbers can be
+     published; it is enough to use the mathbench microbenchmark tool, which
+     should be updated for new functions).
+
+   - Attention should be paid to the compilation flags: for aarch64, fma
+     contraction should be enabled and math errno turned off so that some
+     builtins can be inlined.
+
+   - The code should be reasonably performant on x86_64 too; e.g. some
+     rounding instructions and fma may not be available on x86_64, and such
+     builtins then turn into slow libc calls. Such a slowdown is not
+     acceptable, so a faster fallback should be present: glibc and bionic use
+     the same code on all targets. (This does not apply to vector math code.)
diff --git a/math/aarch64/v_cos.c b/math/aarch64/v_cos.c
new file mode 100644
index 0000000000000000000000000000000000000000..9a73575bce896a9cc54930bb5cd7586b316aa5c0
--- /dev/null
+++ b/math/aarch64/v_cos.c
@@ -0,0 +1,87 @@
+/*
+ * Double-precision vector cos function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+  float64x2_t poly[7];
+  float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3;
+} data = {
+  /* Worst-case error is 3.3 ulp in [-pi/2, pi/2].
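+     To make the sign logic below concrete: with m = rint((|x| + pi/2)/pi)
+     and r = |x| - (m - 0.5)*pi in [-pi/2, pi/2], we have
+     |x| = m*pi - pi/2 + r, hence cos(x) = cos(|x|) = (-1)^m sin(r). The low
+     bit of m, extracted below as `odd`, applies the (-1)^m factor by XORing
+     the sign bit of the sin(r) approximation.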
*/ + .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), + V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), + V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), + V2 (-0x1.9e9540300a1p-41) }, + .inv_pi = V2 (0x1.45f306dc9c883p-2), + .half_pi = V2 (0x1.921fb54442d18p+0), + .pi_1 = V2 (0x1.921fb54442d18p+1), + .pi_2 = V2 (0x1.1a62633145c06p-53), + .pi_3 = V2 (0x1.c1cd129024e09p-106), + .shift = V2 (0x1.8p52), + .range_val = V2 (0x1p23) +}; + +#define C(i) d->poly[i] + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) +{ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); + return v_call_f64 (cos, x, y, cmp); +} + +float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t n, r, r2, r3, r4, t1, t2, t3, y; + uint64x2_t odd, cmp; + +#if WANT_SIMD_EXCEPT + r = vabsq_f64 (x); + cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r), + vreinterpretq_u64_f64 (d->range_val)); + if (unlikely (v_any_u64 (cmp))) + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + special-case handler later. */ + r = vbslq_f64 (cmp, v_f64 (1.0), r); +#else + cmp = vcageq_f64 (x, d->range_val); + r = x; +#endif + + /* n = rint((|x|+pi/2)/pi) - 0.5. */ + n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi)); + odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); + n = vsubq_f64 (n, d->shift); + n = vsubq_f64 (n, v_f64 (0.5)); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f64 (r, d->pi_1, n); + r = vfmsq_f64 (r, d->pi_2, n); + r = vfmsq_f64 (r, d->pi_3, n); + + /* sin(r) poly approx. */ + r2 = vmulq_f64 (r, r); + r3 = vmulq_f64 (r2, r); + r4 = vmulq_f64 (r2, r2); + + t1 = vfmaq_f64 (C (4), C (5), r2); + t2 = vfmaq_f64 (C (2), C (3), r2); + t3 = vfmaq_f64 (C (0), C (1), r2); + + y = vfmaq_f64 (t1, C (6), r4); + y = vfmaq_f64 (t2, y, r4); + y = vfmaq_f64 (t3, y, r4); + y = vfmaq_f64 (r, y, r3); + + if (unlikely (v_any_u64 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); +} diff --git a/math/aarch64/v_cosf.c b/math/aarch64/v_cosf.c new file mode 100644 index 0000000000000000000000000000000000000000..b9890b2998ad3c260a6849d980cf3f69b4453ec4 --- /dev/null +++ b/math/aarch64/v_cosf.c @@ -0,0 +1,82 @@ +/* + * Single-precision vector cos function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[4]; + float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3; +} data = { + /* 1.886 ulp error. */ + .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), + V4 (0x1.5b2e76p-19f) }, + + .pi_1 = V4 (0x1.921fb6p+1f), + .pi_2 = V4 (-0x1.777a5cp-24f), + .pi_3 = V4 (-0x1.ee59dap-49f), + + .inv_pi = V4 (0x1.45f306p-2f), + .shift = V4 (0x1.8p+23f), + .half_pi = V4 (0x1.921fb6p0f), + .range_val = V4 (0x1p20f) +}; + +#define C(i) d->poly[i] + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) +{ + /* Fall back to scalar code. 
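+     Here v_call_f32 applies the scalar cosf to the lanes selected by cmp and
+     keeps the vector result y in the remaining lanes, so scalar error
+     handling (fenv, errno) runs only for the special lanes.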
*/
+  y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+  return v_call_f32 (cosf, x, y, cmp);
+}
+
+float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  float32x4_t n, r, r2, r3, y;
+  uint32x4_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+  r = vabsq_f32 (x);
+  cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r),
+		   vreinterpretq_u32_f32 (d->range_val));
+  if (unlikely (v_any_u32 (cmp)))
+    /* If fenv exceptions are to be triggered correctly, set any special lanes
+       to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by the
+       special-case handler later.  */
+    r = vbslq_f32 (cmp, v_f32 (1.0f), r);
+#else
+  cmp = vcageq_f32 (x, d->range_val);
+  r = x;
+#endif
+
+  /* n = rint((|x|+pi/2)/pi) - 0.5.  */
+  n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi));
+  odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
+  n = vsubq_f32 (n, d->shift);
+  n = vsubq_f32 (n, v_f32 (0.5f));
+
+  /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2).  */
+  r = vfmsq_f32 (r, d->pi_1, n);
+  r = vfmsq_f32 (r, d->pi_2, n);
+  r = vfmsq_f32 (r, d->pi_3, n);
+
+  /* y = sin(r).  */
+  r2 = vmulq_f32 (r, r);
+  r3 = vmulq_f32 (r2, r);
+  y = vfmaq_f32 (C (2), C (3), r2);
+  y = vfmaq_f32 (C (1), y, r2);
+  y = vfmaq_f32 (C (0), y, r2);
+  y = vfmaq_f32 (r, y, r3);
+
+  if (unlikely (v_any_u32 (cmp)))
+    return special_case (x, y, odd, cmp);
+  return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+}
diff --git a/math/aarch64/v_exp.c b/math/aarch64/v_exp.c
new file mode 100644
index 0000000000000000000000000000000000000000..bc5609faf4fc3597a5ec3a1080a12e843417bcc7
--- /dev/null
+++ b/math/aarch64/v_exp.c
@@ -0,0 +1,125 @@
+/*
+ * Double-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define IndexMask (N - 1)
+
+const static volatile struct
+{
+  float64x2_t poly[3];
+  float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
+#if !WANT_SIMD_EXCEPT
+  float64x2_t special_bound, scale_thresh;
+#endif
+} data = {
+  /* maxerr: 1.88 +0.5 ulp
+     rel error: 1.4337*2^-53
+     abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ].  */
+  .poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3),
+	    V2 (0x1.55555da646206p-5) },
+#if !WANT_SIMD_EXCEPT
+  .scale_thresh = V2 (163840.0), /* 1280.0 * N.  */
+  .special_bound = V2 (704.0),
+#endif
+  .inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2.  */
+  .ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N.  */
+  .ln2_lo = V2 (0x1.abc9e3b39803f3p-63),
+  .shift = V2 (0x1.8p+52)
+};
+
+#define C(i) data.poly[i]
+#define Tab __v_exp_data
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511).  */
+# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9).  */
+# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound.  */
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
+{
+  /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+     routine for special lanes.  */
+  return v_call_f64 (exp, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513.  */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0).  */
+# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769.  */
+# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254.
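+   To see why the split in special_case below is exact: with b = 0,
+   asuint (s1) + asuint (s2) = asuint (s) + (SpecialBias1 - SpecialBias2)
+   = asuint (s) + asuint (1.0), and adding asuint (1.0) to the biased
+   exponent is multiplication by 1.0, so s1 * s2 == s. For n < 0,
+   b = SpecialOffset moves a factor of 2^1536 from s1 into s2 so that each
+   factor stays representable.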
*/ + +static inline float64x2_t VPCS_ATTR +special_case (float64x2_t s, float64x2_t y, float64x2_t n) +{ + /* 2^(n/N) may overflow, break it up into s1*s2. */ + uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset); + float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b)); + float64x2_t s2 = vreinterpretq_f64_u64 ( + vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b)); + uint64x2_t cmp = vcagtq_f64 (n, data.scale_thresh); + float64x2_t r1 = vmulq_f64 (s1, s1); + float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1); + return vbslq_f64 (cmp, r1, r0); +} + +#endif + +float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x) +{ + float64x2_t n, r, r2, s, y, z; + uint64x2_t cmp, u, e; + +#if WANT_SIMD_EXCEPT + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special_case to fix special lanes later. This is only necessary if fenv + exceptions are to be triggered correctly. */ + float64x2_t xm = x; + uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x)); + cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound); + if (unlikely (v_any_u64 (cmp))) + x = vbslq_f64 (cmp, v_f64 (1), x); +#else + cmp = vcagtq_f64 (x, data.special_bound); +#endif + + /* n = round(x/(ln2/N)). */ + z = vfmaq_f64 (data.shift, x, data.inv_ln2); + u = vreinterpretq_u64_f64 (z); + n = vsubq_f64 (z, data.shift); + + /* r = x - n*ln2/N. */ + r = x; + r = vfmsq_f64 (r, data.ln2_hi, n); + r = vfmsq_f64 (r, data.ln2_lo, n); + + e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS); + + /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4. */ + r2 = vmulq_f64 (r, r); + y = vfmaq_f64 (C (0), C (1), r); + y = vfmaq_f64 (y, C (2), r2); + y = vfmaq_f64 (r, y, r2); + + /* s = 2^(n/N). */ + u = (uint64x2_t){ Tab[u[0] & IndexMask], Tab[u[1] & IndexMask] }; + s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); + + if (unlikely (v_any_u64 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f64 (s, y, s), cmp); +#else + return special_case (s, y, n); +#endif + + return vfmaq_f64 (s, y, s); +} diff --git a/math/aarch64/v_exp2f.c b/math/aarch64/v_exp2f.c new file mode 100644 index 0000000000000000000000000000000000000000..e402205e98e6bea310877d6d8b9b5f014e16c47a --- /dev/null +++ b/math/aarch64/v_exp2f.c @@ -0,0 +1,113 @@ +/* + * Single-precision vector 2^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[5]; + uint32x4_t exponent_bias; +#if !WANT_SIMD_EXCEPT + float32x4_t special_bound, scale_thresh; +#endif +} data = { + /* maxerr: 1.962 ulp. */ + .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f), + V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) }, + .exponent_bias = V4 (0x3f800000), +#if !WANT_SIMD_EXCEPT + .special_bound = V4 (126.0f), + .scale_thresh = V4 (192.0f), +#endif +}; + +#define C(i) d->poly[i] + +#if WANT_SIMD_EXCEPT + +# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ +# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */ +# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */ + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine for special lanes. 
*/ + return v_call_f32 (exp2f, x, y, cmp); +} + +#else + +# define SpecialOffset v_u32 (0x82000000) +# define SpecialBias v_u32 (0x7f000000) + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); + float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); + /* Similar to r1 but avoids double rounding in the subnormal range. */ + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); + float32x4_t r = vbslq_f32 (cmp1, r1, r0); + return vbslq_f32 (cmp2, r2, r); +} + +#endif + +float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, scale, p, q, poly; + uint32x4_t cmp, e; + +#if WANT_SIMD_EXCEPT + /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */ + uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); + cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); + float32x4_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special_case to fix special lanes later. This is only necessary if fenv + exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = vbslq_f32 (cmp, v_f32 (1), x); +#endif + + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. */ + n = vrndaq_f32 (x); + r = vsubq_f32 (x, n); + e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); + scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + +#if !WANT_SIMD_EXCEPT + cmp = vcagtq_f32 (n, d->special_bound); +#endif + + r2 = vmulq_f32 (r, r); + p = vfmaq_f32 (C (1), C (0), r); + q = vfmaq_f32 (C (3), C (2), r); + q = vfmaq_f32 (q, p, r2); + p = vmulq_f32 (C (4), r); + poly = vfmaq_f32 (p, q, r2); + + if (unlikely (v_any_u32 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); +#else + return special_case (poly, n, e, cmp, scale, d); +#endif + + return vfmaq_f32 (scale, poly, scale); +} diff --git a/math/v_exp2f_1u.c b/math/aarch64/v_exp2f_1u.c similarity index 43% rename from math/v_exp2f_1u.c rename to math/aarch64/v_exp2f_1u.c index 1caa14d9bffffbb2d0cc47ac6470b12701732f67..ba6b02fbb4bcbd9c215d8326dd74f2e4bbadc18b 100644 --- a/math/v_exp2f_1u.c +++ b/math/aarch64/v_exp2f_1u.c @@ -1,13 +1,12 @@ /* * Single-precision vector 2^x function. * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" -#if V_SUPPORTED static const float Poly[] = { /* maxerr: 0.878 ulp. */ @@ -25,51 +24,49 @@ static const float Poly[] = { #define Ln2hi v_f32 (0x1.62e4p-1f) #define Ln2lo v_f32 (0x1.7f7d1cp-20f) -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn) +static float32x4_t VPCS_ATTR NOINLINE +specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn) { /* 2^n may overflow, break it up into s1*s2. 
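     A worked example (hypothetical input, constants from the code below):
     for n = -130, e = (-130) << 23 = 0xbf000000 (mod 2^32) and
     b = 0x83000000, so s1 = asfloat (0x7f000000 + b) = asfloat (0x02000000)
     = 0x1p-123 and s2 = asfloat (e - b) = asfloat (0x3c000000) = 0x1p-7;
     then s1 * s2 = 0x1p-130 = 2^n, reachable even though 2^-130 itself is
     subnormal.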
*/ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f)); - v_f32_t r1 = s1 * s1; - v_f32_t r0 = poly * s1 * s2; - return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0))); + uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000); + float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b); + float32x4_t s2 = vreinterpretq_f32_u32 (e - b); + uint32x4_t cmp = absn > v_f32 (192.0f); + float32x4_t r1 = s1 * s1; + float32x4_t r0 = poly * s1 * s2; + return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) + | (~cmp & vreinterpretq_u32_f32 (r0))); } -VPCS_ATTR -v_f32_t -V_NAME(exp2f_1u) (v_f32_t x) +float32x4_t VPCS_ATTR +_ZGVnN4v_exp2f_1u (float32x4_t x) { - v_f32_t n, r, scale, poly, absn; - v_u32_t cmp, e; + float32x4_t n, r, scale, poly, absn; + uint32x4_t cmp, e; /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] x = n + r, with r in [-1/2, 1/2]. */ #if 0 - v_f32_t z; + float32x4_t z; z = x + Shift; n = z - Shift; r = x - n; - e = v_as_u32_f32 (z) << 23; + e = vreinterpretq_u32_f32 (z) << 23; #else - n = v_round_f32 (x); + n = vrndaq_f32 (x); r = x - n; - e = v_as_u32_s32 (v_round_s32 (x)) << 23; + e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23; #endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); - poly = v_fma_f32 (C0, r, C1); - poly = v_fma_f32 (poly, r, C2); - poly = v_fma_f32 (poly, r, C3); - poly = v_fma_f32 (poly, r, C4); - poly = v_fma_f32 (poly, r, C5); - poly = v_fma_f32 (poly, r, v_f32 (1.0f)); + scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000)); + absn = vabsq_f32 (n); + cmp = absn > v_f32 (126.0f); + poly = vfmaq_f32 (C1, C0, r); + poly = vfmaq_f32 (C2, poly, r); + poly = vfmaq_f32 (C3, poly, r); + poly = vfmaq_f32 (C4, poly, r); + poly = vfmaq_f32 (C5, poly, r); + poly = vfmaq_f32 (v_f32 (1.0f), poly, r); if (unlikely (v_any_u32 (cmp))) return specialcase (poly, n, e, absn); return scale * poly; } -#endif diff --git a/math/aarch64/v_exp_data.c b/math/aarch64/v_exp_data.c new file mode 100644 index 0000000000000000000000000000000000000000..45f0848cac5b5bcf00b768c7f107e0400a8fab7a --- /dev/null +++ b/math/aarch64/v_exp_data.c @@ -0,0 +1,146 @@ +/* + * Lookup table for double-precision e^x vector function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +# define N (1 << V_EXP_TABLE_BITS) + +/* 2^(j/N), j=0..N. 
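+   Encoding note: each entry below is asuint64 (2^(j/N)) minus
+   (j << (52 - V_EXP_TABLE_BITS)), so a caller can rebuild s = 2^(n/N) with a
+   single add of e = u << (52 - V_EXP_TABLE_BITS), which restores j and folds
+   the integer part of n into the exponent at once. E.g. for N == 128,
+   entry 64 is asuint64 (0x1.6a09e667f3bcdp+0) - (64 << 45)
+   = 0x3feea09e667f3bcd.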
*/ +const uint64_t __v_exp_data[] = { +# if N == 128 + 0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061, + 0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de, + 0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f, + 0x3fefac922b7247f7, 0x3fefa3ec32d3d1a2, 0x3fef9b66affed31b, + 0x3fef9301d0125b51, 0x3fef8abdc06c31cc, 0x3fef829aaea92de0, + 0x3fef7a98c8a58e51, 0x3fef72b83c7d517b, 0x3fef6af9388c8dea, + 0x3fef635beb6fcb75, 0x3fef5be084045cd4, 0x3fef54873168b9aa, + 0x3fef4d5022fcd91d, 0x3fef463b88628cd6, 0x3fef3f49917ddc96, + 0x3fef387a6e756238, 0x3fef31ce4fb2a63f, 0x3fef2b4565e27cdd, + 0x3fef24dfe1f56381, 0x3fef1e9df51fdee1, 0x3fef187fd0dad990, + 0x3fef1285a6e4030b, 0x3fef0cafa93e2f56, 0x3fef06fe0a31b715, + 0x3fef0170fc4cd831, 0x3feefc08b26416ff, 0x3feef6c55f929ff1, + 0x3feef1a7373aa9cb, 0x3feeecae6d05d866, 0x3feee7db34e59ff7, + 0x3feee32dc313a8e5, 0x3feedea64c123422, 0x3feeda4504ac801c, + 0x3feed60a21f72e2a, 0x3feed1f5d950a897, 0x3feece086061892d, + 0x3feeca41ed1d0057, 0x3feec6a2b5c13cd0, 0x3feec32af0d7d3de, + 0x3feebfdad5362a27, 0x3feebcb299fddd0d, 0x3feeb9b2769d2ca7, + 0x3feeb6daa2cf6642, 0x3feeb42b569d4f82, 0x3feeb1a4ca5d920f, + 0x3feeaf4736b527da, 0x3feead12d497c7fd, 0x3feeab07dd485429, + 0x3feea9268a5946b7, 0x3feea76f15ad2148, 0x3feea5e1b976dc09, + 0x3feea47eb03a5585, 0x3feea34634ccc320, 0x3feea23882552225, + 0x3feea155d44ca973, 0x3feea09e667f3bcd, 0x3feea012750bdabf, + 0x3fee9fb23c651a2f, 0x3fee9f7df9519484, 0x3fee9f75e8ec5f74, + 0x3fee9f9a48a58174, 0x3fee9feb564267c9, 0x3feea0694fde5d3f, + 0x3feea11473eb0187, 0x3feea1ed0130c132, 0x3feea2f336cf4e62, + 0x3feea427543e1a12, 0x3feea589994cce13, 0x3feea71a4623c7ad, + 0x3feea8d99b4492ed, 0x3feeaac7d98a6699, 0x3feeace5422aa0db, + 0x3feeaf3216b5448c, 0x3feeb1ae99157736, 0x3feeb45b0b91ffc6, + 0x3feeb737b0cdc5e5, 0x3feeba44cbc8520f, 0x3feebd829fde4e50, + 0x3feec0f170ca07ba, 0x3feec49182a3f090, 0x3feec86319e32323, + 0x3feecc667b5de565, 0x3feed09bec4a2d33, 0x3feed503b23e255d, + 0x3feed99e1330b358, 0x3feede6b5579fdbf, 0x3feee36bbfd3f37a, + 0x3feee89f995ad3ad, 0x3feeee07298db666, 0x3feef3a2b84f15fb, + 0x3feef9728de5593a, 0x3feeff76f2fb5e47, 0x3fef05b030a1064a, + 0x3fef0c1e904bc1d2, 0x3fef12c25bd71e09, 0x3fef199bdd85529c, + 0x3fef20ab5fffd07a, 0x3fef27f12e57d14b, 0x3fef2f6d9406e7b5, + 0x3fef3720dcef9069, 0x3fef3f0b555dc3fa, 0x3fef472d4a07897c, + 0x3fef4f87080d89f2, 0x3fef5818dcfba487, 0x3fef60e316c98398, + 0x3fef69e603db3285, 0x3fef7321f301b460, 0x3fef7c97337b9b5f, + 0x3fef864614f5a129, 0x3fef902ee78b3ff6, 0x3fef9a51fbc74c83, + 0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27, + 0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14, + 0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1, +# elif N == 256 + 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335, + 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc, + 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574, + 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836, + 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383, + 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85, + 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2, + 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e, + 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc, + 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e, + 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b, + 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f, + 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4, + 0x3fef582f95281c6b, 
0x3fef54873168b9aa, 0x3fef50e75eb44027, + 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6, + 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1, + 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f, + 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29, + 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1, + 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f, + 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56, + 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd, + 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff, + 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b, + 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866, + 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4, + 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422, + 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024, + 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897, + 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232, + 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0, + 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7, + 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d, + 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee, + 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82, + 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2, + 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd, + 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03, + 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148, + 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4, + 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320, + 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6, + 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd, + 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645, + 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484, + 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a, + 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9, + 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6, + 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132, + 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491, + 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13, + 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21, + 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699, + 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778, + 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736, + 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2, + 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f, + 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2, + 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090, + 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e, + 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33, + 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052, + 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf, + 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774, + 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666, + 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1, + 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47, + 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f, + 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09, + 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c, + 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 
0x3fef27f12e57d14b, + 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db, + 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa, + 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968, + 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487, + 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075, + 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460, + 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17, + 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6, + 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740, + 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1, + 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a, + 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540, + 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89, + 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1, + 0x3feff9d96b2a23d9, +# endif +}; diff --git a/math/aarch64/v_expf.c b/math/aarch64/v_expf.c new file mode 100644 index 0000000000000000000000000000000000000000..34e8b6081bcd947effb06be781b8fba6bd95bbba --- /dev/null +++ b/math/aarch64/v_expf.c @@ -0,0 +1,122 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[5]; + float32x4_t shift, inv_ln2, ln2_hi, ln2_lo; + uint32x4_t exponent_bias; +#if !WANT_SIMD_EXCEPT + float32x4_t special_bound, scale_thresh; +#endif +} data = { + /* maxerr: 1.45358 +0.5 ulp. */ + .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), + V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, + .shift = V4 (0x1.8p23f), + .inv_ln2 = V4 (0x1.715476p+0f), + .ln2_hi = V4 (0x1.62e4p-1f), + .ln2_lo = V4 (0x1.7f7d1cp-20f), + .exponent_bias = V4 (0x3f800000), +#if !WANT_SIMD_EXCEPT + .special_bound = V4 (126.0f), + .scale_thresh = V4 (192.0f), +#endif +}; + +#define C(i) d->poly[i] + +#if WANT_SIMD_EXCEPT + +# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ +# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */ +# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */ + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine to special lanes. */ + return v_call_f32 (expf, x, y, cmp); +} + +#else + +# define SpecialOffset v_u32 (0x82000000) +# define SpecialBias v_u32 (0x7f000000) + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); + float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); + /* Similar to r1 but avoids double rounding in the subnormal range. 
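+     (With scale subnormal, r0 = fma (scale, poly, scale)
+     = scale * (1 + poly) rounds once, directly into the subnormal result;
+     the s1 * s2 path would round the intermediate product first and then
+     again on the final multiply.)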
*/ + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); + float32x4_t r = vbslq_f32 (cmp1, r1, r0); + return vbslq_f32 (cmp2, r2, r); +} + +#endif + +float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, scale, p, q, poly, z; + uint32x4_t cmp, e; + +#if WANT_SIMD_EXCEPT + /* asuint(x) - TinyBound >= BigBound - TinyBound. */ + cmp = vcgeq_u32 ( + vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)), + TinyBound), + SpecialBound); + float32x4_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special case handler to fix special lanes later. This is only necessary if + fenv exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = vbslq_f32 (cmp, v_f32 (1), x); +#endif + + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + z = vfmaq_f32 (d->shift, x, d->inv_ln2); + n = vsubq_f32 (z, d->shift); + r = vfmsq_f32 (x, n, d->ln2_hi); + r = vfmsq_f32 (r, n, d->ln2_lo); + e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); + scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + +#if !WANT_SIMD_EXCEPT + cmp = vcagtq_f32 (n, d->special_bound); +#endif + + r2 = vmulq_f32 (r, r); + p = vfmaq_f32 (C (1), C (0), r); + q = vfmaq_f32 (C (3), C (2), r); + q = vfmaq_f32 (q, p, r2); + p = vmulq_f32 (C (4), r); + poly = vfmaq_f32 (p, q, r2); + + if (unlikely (v_any_u32 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); +#else + return special_case (poly, n, e, cmp, scale, d); +#endif + + return vfmaq_f32 (scale, poly, scale); +} diff --git a/math/v_expf_1u.c b/math/aarch64/v_expf_1u.c similarity index 39% rename from math/v_expf_1u.c rename to math/aarch64/v_expf_1u.c index 023bd248c9ac9c89e88a9979d0d1a24197550f79..43d03fa34efab42e2ac666dd6c784c02b8fdf6ed 100644 --- a/math/v_expf_1u.c +++ b/math/aarch64/v_expf_1u.c @@ -1,13 +1,12 @@ /* * Single-precision vector e^x function. * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" -#if V_SUPPORTED static const float Poly[] = { /* maxerr: 0.36565 +0.5 ulp. */ @@ -28,53 +27,51 @@ static const float Poly[] = { #define Ln2hi v_f32 (0x1.62e4p-1f) #define Ln2lo v_f32 (0x1.7f7d1cp-20f) -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn) +static float32x4_t VPCS_ATTR NOINLINE +specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn) { /* 2^n may overflow, break it up into s1*s2. 
*/ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f)); - v_f32_t r1 = s1 * s1; - v_f32_t r0 = poly * s1 * s2; - return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0))); + uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000); + float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b); + float32x4_t s2 = vreinterpretq_f32_u32 (e - b); + uint32x4_t cmp = absn > v_f32 (192.0f); + float32x4_t r1 = s1 * s1; + float32x4_t r0 = poly * s1 * s2; + return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) + | (~cmp & vreinterpretq_u32_f32 (r0))); } -VPCS_ATTR -v_f32_t -V_NAME(expf_1u) (v_f32_t x) +float32x4_t VPCS_ATTR +_ZGVnN4v_expf_1u (float32x4_t x) { - v_f32_t n, r, scale, poly, absn, z; - v_u32_t cmp, e; + float32x4_t n, r, scale, poly, absn, z; + uint32x4_t cmp, e; /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ #if 1 - z = v_fma_f32 (x, InvLn2, Shift); + z = vfmaq_f32 (Shift, x, InvLn2); n = z - Shift; - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_f32 (z) << 23; + r = vfmaq_f32 (x, n, -Ln2hi); + r = vfmaq_f32 (r, n, -Ln2lo); + e = vreinterpretq_u32_f32 (z) << 23; #else z = x * InvLn2; - n = v_round_f32 (z); - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_s32 (v_round_s32 (z)) << 23; + n = vrndaq_f32 (z); + r = vfmaq_f32 (x, n, -Ln2hi); + r = vfmaq_f32 (r, n, -Ln2lo); + e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)) << 23; #endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); - poly = v_fma_f32 (C0, r, C1); - poly = v_fma_f32 (poly, r, C2); - poly = v_fma_f32 (poly, r, C3); - poly = v_fma_f32 (poly, r, C4); - poly = v_fma_f32 (poly, r, v_f32 (1.0f)); - poly = v_fma_f32 (poly, r, v_f32 (1.0f)); + scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000)); + absn = vabsq_f32 (n); + cmp = absn > v_f32 (126.0f); + poly = vfmaq_f32 (C1, C0, r); + poly = vfmaq_f32 (C2, poly, r); + poly = vfmaq_f32 (C3, poly, r); + poly = vfmaq_f32 (C4, poly, r); + poly = vfmaq_f32 (v_f32 (1.0f), poly, r); + poly = vfmaq_f32 (v_f32 (1.0f), poly, r); if (unlikely (v_any_u32 (cmp))) return specialcase (poly, n, e, absn); return scale * poly; } -#endif diff --git a/math/aarch64/v_log.c b/math/aarch64/v_log.c new file mode 100644 index 0000000000000000000000000000000000000000..1d1c1fa62c0423da2c6c402113da471af2df7540 --- /dev/null +++ b/math/aarch64/v_log.c @@ -0,0 +1,100 @@ +/* + * Double-precision vector log(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + uint64x2_t min_norm; + uint32x4_t special_bound; + float64x2_t poly[5]; + float64x2_t ln2; + uint64x2_t sign_exp_mask; +} data = { + /* Worst-case error: 1.17 + 0.5 ulp. + Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ + .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2), + V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3), + V2 (-0x1.554e550bd501ep-3) }, + .ln2 = V2 (0x1.62e42fefa39efp-1), + .min_norm = V2 (0x0010000000000000), + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. 
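+     Subtracting min_norm from asuint64 (x) wraps zero, subnormal and
+     negative inputs above this bound, so a single unsigned compare flags
+     tiny, huge, negative and non-finite lanes at once; vsubhn narrows the
+     64-bit difference to its top 32 bits so the compare can be done on
+     32-bit lanes.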
*/
+  .sign_exp_mask = V2 (0xfff0000000000000)
+};
+
+#define A(i) d->poly[i]
+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+#define Off v_u64 (0x3fe6900900000000)
+
+struct entry
+{
+  float64x2_t invc;
+  float64x2_t logc;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+  /* Since N is a power of 2, n % N = n & (N - 1).  */
+  struct entry e;
+  uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+  uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+  float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
+  float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+  e.invc = vuzp1q_f64 (e0, e1);
+  e.logc = vuzp2q_f64 (e0, e1);
+  return e;
+}
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2,
+	      uint32x2_t cmp)
+{
+  return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp));
+}
+
+float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  float64x2_t z, r, r2, p, y, kd, hi;
+  uint64x2_t ix, iz, tmp;
+  uint32x2_t cmp;
+  int64x2_t k;
+  struct entry e;
+
+  ix = vreinterpretq_u64_f64 (x);
+  cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
+		  vget_low_u32 (d->special_bound));
+
+  /* x = 2^k z, where z is in the range [Off, 2*Off) and exact.
+     The range is split into N subintervals.
+     The ith subinterval contains z and c is near its center.  */
+  tmp = vsubq_u64 (ix, Off);
+  k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift.  */
+  iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
+  z = vreinterpretq_f64_u64 (iz);
+  e = lookup (tmp);
+
+  /* log(x) = log1p(z/c-1) + log(c) + k*Ln2.  */
+  r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+  kd = vcvtq_f64_s64 (k);
+
+  /* hi = r + log(c) + k*Ln2.  */
+  hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
+  /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi.  */
+  r2 = vmulq_f64 (r, r);
+  y = vfmaq_f64 (A (2), A (3), r);
+  p = vfmaq_f64 (A (0), A (1), r);
+  y = vfmaq_f64 (y, A (4), r2);
+  y = vfmaq_f64 (p, y, r2);
+
+  if (unlikely (v_any_u32h (cmp)))
+    return special_case (x, y, hi, r2, cmp);
+  return vfmaq_f64 (hi, y, r2);
+}
diff --git a/math/aarch64/v_log_data.c b/math/aarch64/v_log_data.c
new file mode 100644
index 0000000000000000000000000000000000000000..82351bb14766f2fbf6095cbf2e214e99b45f217d
--- /dev/null
+++ b/math/aarch64/v_log_data.c
@@ -0,0 +1,156 @@
+/*
+ * Lookup table for double-precision log(x) vector function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+
+#define N (1 << V_LOG_TABLE_BITS)
+
+const struct v_log_data __v_log_data = {
+  /* Algorithm:
+
+	x = 2^k z
+	log(x) = k ln2 + log(c) + poly(z/c - 1)
+
+     where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,
+     N=128); log(c) and 1/c for the ith subinterval come from lookup tables:
+
+	table[i].invc = 1/c
+	table[i].logc = (double)log(c)
+
+     where c is near the center of the subinterval and is chosen by trying
+     several floating-point invc candidates around 1/center and selecting one
+     for which the error in (double)log(c) is minimized (< 0x1p-74), except
+     that the subinterval containing 1 and the one before it are tweaked to
+     avoid cancellation.
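+     Concretely: writing z = c (1 + r) with r = z * invc - 1,
+     log(x) = k ln2 + log(c) + log1p(r); with N = 128 subintervals and c near
+     each center, |r| stays below roughly 0x1p-8 (compare the rel-error
+     interval quoted in v_log.c), which is what lets a short log1p polynomial
+     in r reach the quoted worst-case error.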
*/ + .table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 }, + { 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 }, + { 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 }, + { 0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2 }, + { 0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2 }, + { 0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2 }, + { 0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2 }, + { 0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2 }, + { 0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2 }, + { 0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2 }, + { 0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2 }, + { 0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2 }, + { 0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2 }, + { 0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2 }, + { 0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2 }, + { 0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2 }, + { 0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2 }, + { 0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2 }, + { 0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2 }, + { 0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3 }, + { 0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3 }, + { 0x1.446f12b278001p+0, -0x1.e52e160484698p-3 }, + { 0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3 }, + { 0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3 }, + { 0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3 }, + { 0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3 }, + { 0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3 }, + { 0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3 }, + { 0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3 }, + { 0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3 }, + { 0x1.36987540fbf53p+0, -0x1.8be843d796044p-3 }, + { 0x1.352166b648f61p+0, -0x1.82395ecc477edp-3 }, + { 0x1.33adddb3eb575p+0, -0x1.7896240966422p-3 }, + { 0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3 }, + { 0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3 }, + { 0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3 }, + { 0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3 }, + { 0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3 }, + { 0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3 }, + { 0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3 }, + { 0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3 }, + { 0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3 }, + { 0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3 }, + { 0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3 }, + { 0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3 }, + { 0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4 }, + { 0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4 }, + { 0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4 }, + { 0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4 }, + { 0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4 }, + { 0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4 }, + { 0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4 }, + { 0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4 }, + { 0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4 }, + { 0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4 }, + { 0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4 }, + { 0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4 }, + { 0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4 }, + { 0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4 }, + { 0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4 }, + { 0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5 }, + { 0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5 }, + { 0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5 }, + { 0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5 }, + { 0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5 }, + { 0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5 }, + { 0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5 }, + { 0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5 }, + { 0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6 }, + { 
0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6 }, + { 0x1.05193497a7cc5p+0, -0x1.43183683400acp-6 }, + { 0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6 }, + { 0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7 }, + { 0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7 }, + { 0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9 }, + { 1.0, 0.0 }, + { 0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8 }, + { 0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7 }, + { 0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6 }, + { 0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6 }, + { 0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5 }, + { 0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5 }, + { 0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5 }, + { 0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5 }, + { 0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4 }, + { 0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4 }, + { 0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4 }, + { 0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4 }, + { 0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4 }, + { 0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4 }, + { 0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4 }, + { 0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4 }, + { 0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4 }, + { 0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3 }, + { 0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3 }, + { 0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3 }, + { 0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3 }, + { 0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3 }, + { 0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3 }, + { 0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3 }, + { 0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3 }, + { 0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3 }, + { 0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3 }, + { 0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3 }, + { 0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3 }, + { 0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3 }, + { 0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3 }, + { 0x1.9998e1480b618p-1, 0x1.c903161240163p-3 }, + { 0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3 }, + { 0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3 }, + { 0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3 }, + { 0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3 }, + { 0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2 }, + { 0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2 }, + { 0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2 }, + { 0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2 }, + { 0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2 }, + { 0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2 }, + { 0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2 }, + { 0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2 }, + { 0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2 }, + { 0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2 }, + { 0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2 }, + { 0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2 }, + { 0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2 }, + { 0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2 }, + { 0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2 }, + { 0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2 } } +}; diff --git a/math/aarch64/v_logf.c b/math/aarch64/v_logf.c new file mode 100644 index 0000000000000000000000000000000000000000..66ebbbcd2b5a840b8a194cb18139ee585f67208a --- /dev/null +++ b/math/aarch64/v_logf.c @@ -0,0 +1,74 @@ +/* + * Single-precision vector log function. + * + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+  uint32x4_t min_norm;
+  uint16x8_t special_bound;
+  float32x4_t poly[7];
+  float32x4_t ln2, tiny_bound;
+  uint32x4_t off, mantissa_mask;
+} data = {
+  /* 3.34 ulp error. */
+  .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
+            V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f),
+            V4 (-0x1.ffffc8p-2f) },
+  .ln2 = V4 (0x1.62e43p-1f),
+  .tiny_bound = V4 (0x1p-126),
+  .min_norm = V4 (0x00800000),
+  .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
+  .off = V4 (0x3f2aaaab), /* 0.666667. */
+  .mantissa_mask = V4 (0x007fffff)
+};
+
+#define P(i) d->poly[7 - i]
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p,
+              uint16x4_t cmp)
+{
+  /* Fall back to scalar code. */
+  return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
+}
+
+float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+  float32x4_t n, p, q, r, r2, y;
+  uint32x4_t u;
+  uint16x4_t cmp;
+
+  u = vreinterpretq_u32_f32 (x);
+  cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm),
+                  vget_low_u16 (d->special_bound));
+
+  /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
+  u = vsubq_u32 (u, d->off);
+  n = vcvtq_f32_s32 (
+      vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
+  u = vandq_u32 (u, d->mantissa_mask);
+  u = vaddq_u32 (u, d->off);
+  r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+
+  /* y = log(1+r) + n*ln2. */
+  r2 = vmulq_f32 (r, r);
+  /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
+  p = vfmaq_f32 (P (5), P (6), r);
+  q = vfmaq_f32 (P (3), P (4), r);
+  y = vfmaq_f32 (P (1), P (2), r);
+  p = vfmaq_f32 (p, P (7), r2);
+  q = vfmaq_f32 (q, p, r2);
+  y = vfmaq_f32 (y, q, r2);
+  p = vfmaq_f32 (r, d->ln2, n);
+
+  if (unlikely (v_any_u16h (cmp)))
+    return special_case (x, y, r2, p, cmp);
+  return vfmaq_f32 (p, y, r2);
+}
diff --git a/math/aarch64/v_math.h b/math/aarch64/v_math.h
new file mode 100644
index 0000000000000000000000000000000000000000..1dc9916c6fb076fd0c3d5074f5d156d2d952b4f2
--- /dev/null
+++ b/math/aarch64/v_math.h
@@ -0,0 +1,135 @@
+/*
+ * Vector math abstractions.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _V_MATH_H
+#define _V_MATH_H
+
+#if !__aarch64__
+# error "Cannot build without AArch64"
+#endif
+
+#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
+
+#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
+#define V_NAME_D1(fun) _ZGVnN2v_##fun
+#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
+#define V_NAME_D2(fun) _ZGVnN2vv_##fun
+
+#include <stdint.h>
+#include "../math_config.h"
+#include <arm_neon.h>
+
+/* Shorthand helpers for declaring constants. */
+# define V2(X) { X, X }
+# define V4(X) { X, X, X, X }
+# define V8(X) { X, X, X, X, X, X, X, X }
+
+static inline int
+v_any_u16h (uint16x4_t x)
+{
+  return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0;
+}
+
+static inline int
+v_lanes32 (void)
+{
+  return 4;
+}
+
+static inline float32x4_t
+v_f32 (float x)
+{
+  return (float32x4_t) V4 (x);
+}
+static inline uint32x4_t
+v_u32 (uint32_t x)
+{
+  return (uint32x4_t) V4 (x);
+}
+/* true if any elements of a v_cond result is non-zero. */
+static inline int
+v_any_u32 (uint32x4_t x)
+{
+  /* assume elements in x are either 0 or -1u.
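+     vpaddd_u64 below sums the two 64-bit halves into a scalar, so the
+     result is non-zero exactly when at least one lane is set.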
*/ + return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; +} +static inline int +v_any_u32h (uint32x2_t x) +{ + return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0; +} +static inline float32x4_t +v_lookup_f32 (const float *tab, uint32x4_t idx) +{ + return (float32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; +} +static inline uint32x4_t +v_lookup_u32 (const uint32_t *tab, uint32x4_t idx) +{ + return (uint32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; +} +static inline float32x4_t +v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p) +{ + return (float32x4_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], + p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; +} +static inline float32x4_t +v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2, + float32x4_t y, uint32x4_t p) +{ + return (float32x4_t){p[0] ? f (x1[0], x2[0]) : y[0], + p[1] ? f (x1[1], x2[1]) : y[1], + p[2] ? f (x1[2], x2[2]) : y[2], + p[3] ? f (x1[3], x2[3]) : y[3]}; +} + +static inline int +v_lanes64 (void) +{ + return 2; +} +static inline float64x2_t +v_f64 (double x) +{ + return (float64x2_t) V2 (x); +} +static inline uint64x2_t +v_u64 (uint64_t x) +{ + return (uint64x2_t) V2 (x); +} +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u64 (uint64x2_t x) +{ + /* assume elements in x are either 0 or -1u. */ + return vpaddd_u64 (x) != 0; +} +static inline float64x2_t +v_lookup_f64 (const double *tab, uint64x2_t idx) +{ + return (float64x2_t){tab[idx[0]], tab[idx[1]]}; +} +static inline uint64x2_t +v_lookup_u64 (const uint64_t *tab, uint64x2_t idx) +{ + return (uint64x2_t){tab[idx[0]], tab[idx[1]]}; +} +static inline float64x2_t +v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p) +{ + double p1 = p[1]; + double x1 = x[1]; + if (likely (p[0])) + y[0] = f (x[0]); + if (likely (p1)) + y[1] = f (x1); + return y; +} + +#endif diff --git a/math/v_pow.c b/math/aarch64/v_pow.c similarity index 35% rename from math/v_pow.c rename to math/aarch64/v_pow.c index a209d57f41cee70ac78bc4f418c385f481636025..734f1663a283d4ce068efc2526d0dd989ba5433b 100644 --- a/math/v_pow.c +++ b/math/aarch64/v_pow.c @@ -1,27 +1,22 @@ /* * Double-precision vector pow function. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "mathlib.h" #include "v_math.h" -#if V_SUPPORTED -VPCS_ATTR -v_f64_t -V_NAME(pow) (v_f64_t x, v_f64_t y) +float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) { - v_f64_t z; + float64x2_t z; for (int lane = 0; lane < v_lanes64 (); lane++) { - f64_t sx = v_get_f64 (x, lane); - f64_t sy = v_get_f64 (y, lane); - f64_t sz = pow (sx, sy); - v_set_f64 (&z, lane, sz); + double sx = x[lane]; + double sy = y[lane]; + double sz = pow (sx, sy); + z[lane] = sz; } return z; } -VPCS_ALIAS -#endif diff --git a/math/aarch64/v_powf.c b/math/aarch64/v_powf.c new file mode 100644 index 0000000000000000000000000000000000000000..3a4163ab05582b387e87245bd4de77e9b93f9ac1 --- /dev/null +++ b/math/aarch64/v_powf.c @@ -0,0 +1,148 @@ +/* + * Single-precision vector powf function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +#define Min v_u32 (0x00800000) +#define Max v_u32 (0x7f800000) +#define Thresh v_u32 (0x7f000000) /* Max - Min. 
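+   A single unsigned compare of (asuint (x) - Min) against this
+   threshold flags zero, subnormal, negative, inf and nan lanes at once.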
*/ +#define MantissaMask v_u32 (0x007fffff) + +#define A data.log2_poly +#define C data.exp2f_poly + +/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). */ +#define Off v_u32 (0x3f35d000) + +#define V_POWF_LOG2_TABLE_BITS 5 +#define V_EXP2F_TABLE_BITS 5 +#define Log2IdxMask v_u32 ((1 << V_POWF_LOG2_TABLE_BITS) - 1) +#define Scale ((double) (1 << V_EXP2F_TABLE_BITS)) + +static const struct +{ + struct + { + double invc, logc; + } log2_tab[1 << V_POWF_LOG2_TABLE_BITS]; + double log2_poly[4]; + uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS]; + double exp2f_poly[3]; +} data = { + .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale}, + {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale}, + {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale}, + {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale}, + {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale}, + {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale}, + {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale}, + {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale}, + {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale}, + {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale}, + {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale}, + {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale}, + {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale}, + {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale}, + {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale}, + {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale}, + {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale}, + {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale}, + {0x1p+0, 0x0p+0 * Scale}, + {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale}, + {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale}, + {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale}, + {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale}, + {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale}, + {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale}, + {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale}, + {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale}, + {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale}, + {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale}, + {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale}, + {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale}, + {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},}, + .log2_poly = { /* rel err: 1.5 * 2^-30. */ + -0x1.6ff5daa3b3d7cp-2 * Scale, 0x1.ec81d03c01aebp-2 * Scale, + -0x1.71547bb43f101p-1 * Scale, 0x1.7154764a815cbp0 * Scale,}, + .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, + 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa, + 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715, + 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, + 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, + 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, + 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db, + 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, + 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, + 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, + 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,}, + .exp2f_poly = { /* rel err: 1.69 * 2^-34. 
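+   Coefficients are stored pre-divided by Scale^3, Scale^2 and Scale so
+   the polynomial can be evaluated directly on the scaled remainder
+   r = ylogx - round (ylogx) without rescaling it first.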
*/ + 0x1.c6af84b912394p-5 / Scale / Scale / Scale, + 0x1.ebfce50fac4f3p-3 / Scale / Scale, + 0x1.62e42ff0c52d6p-1 / Scale}}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp) +{ + return v_call2_f32 (powf, x, y, ret, cmp); +} + +float32x4_t VPCS_ATTR V_NAME_F2 (pow) (float32x4_t x, float32x4_t y) +{ + uint32x4_t u = vreinterpretq_u32_f32 (x); + uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh); + uint32x4_t tmp = vsubq_u32 (u, Off); + uint32x4_t i = vandq_u32 (vshrq_n_u32 (tmp, (23 - V_POWF_LOG2_TABLE_BITS)), + Log2IdxMask); + uint32x4_t top = vbicq_u32 (tmp, MantissaMask); + uint32x4_t iz = vsubq_u32 (u, top); + int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top), + 23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */ + + float32x4_t ret; + for (int lane = 0; lane < 4; lane++) + { + /* Use double precision for each lane. */ + double invc = data.log2_tab[i[lane]].invc; + double logc = data.log2_tab[i[lane]].logc; + double z = (double) asfloat (iz[lane]); + + /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */ + double r = __builtin_fma (z, invc, -1.0); + double y0 = logc + (double) k[lane]; + + /* Polynomial to approximate log1p(r)/ln2. */ + double logx = A[0]; + logx = r * logx + A[1]; + logx = r * logx + A[2]; + logx = r * logx + A[3]; + logx = r * logx + y0; + double ylogx = y[lane] * logx; + cmp[lane] = (asuint64 (ylogx) >> 47 & 0xffff) + >= asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS)) >> 47 + ? 1 + : cmp[lane]; + + /* N*x = k + r with r in [-1/2, 1/2]. */ + double kd = round (ylogx); + uint64_t ki = lround (ylogx); + r = ylogx - kd; + + /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */ + uint64_t t = data.exp2f_tab[ki % (1 << V_EXP2F_TABLE_BITS)]; + t += ki << (52 - V_EXP2F_TABLE_BITS); + double s = asdouble (t); + double p = C[0]; + p = __builtin_fma (p, r, C[1]); + p = __builtin_fma (p, r, C[2]); + p = __builtin_fma (p, s * r, s); + + ret[lane] = p; + } + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, ret, cmp); + return ret; +} diff --git a/math/aarch64/v_sin.c b/math/aarch64/v_sin.c new file mode 100644 index 0000000000000000000000000000000000000000..04129c31133d62dcecedf832b4e410b5217a51a2 --- /dev/null +++ b/math/aarch64/v_sin.c @@ -0,0 +1,97 @@ +/* + * Double-precision vector sin function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float64x2_t poly[7]; + float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; +} data = { + .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), + V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), + V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), + V2 (-0x1.9e9540300a1p-41) }, + + .range_val = V2 (0x1p23), + .inv_pi = V2 (0x1.45f306dc9c883p-2), + .pi_1 = V2 (0x1.921fb54442d18p+1), + .pi_2 = V2 (0x1.1a62633145c06p-53), + .pi_3 = V2 (0x1.c1cd129024e09p-106), + .shift = V2 (0x1.8p52), +}; + +#if WANT_SIMD_EXCEPT +# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */ +# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */ +#endif + +#define C(i) d->poly[i] + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) +{ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); + return v_call_f64 (sin, x, y, cmp); +} + +/* Vector (AdvSIMD) sin approximation. 
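+   The argument is reduced to r = |x| - n*pi with n = rint(|x|/pi); the
+   polynomial then evaluates sin(r), and the sign is flipped via the
+   parity of n.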
+ Maximum observed error in [-pi/2, pi/2], where argument is not reduced, + is 2.87 ULP: + _ZGVnN2v_sin (0x1.921d5c6a07142p+0) got 0x1.fffffffa7dc02p-1 + want 0x1.fffffffa7dc05p-1 + Maximum observed error in the entire non-special domain ([-2^23, 2^23]) + is 3.22 ULP: + _ZGVnN2v_sin (0x1.5702447b6f17bp+22) got 0x1.ffdcd125c84fbp-3 + want 0x1.ffdcd125c84f8p-3. */ +float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t n, r, r2, r3, r4, y, t1, t2, t3; + uint64x2_t odd, cmp; + +#if WANT_SIMD_EXCEPT + /* Detect |x| <= TinyBound or |x| >= RangeVal. If fenv exceptions are to be + triggered correctly, set any special lanes to 1 (which is neutral w.r.t. + fenv). These lanes will be fixed by special-case handler later. */ + uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x)); + cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh); + r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x); +#else + r = x; + cmp = vcageq_f64 (x, d->range_val); +#endif + + /* n = rint(|x|/pi). */ + n = vfmaq_f64 (d->shift, d->inv_pi, r); + odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); + n = vsubq_f64 (n, d->shift); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f64 (r, d->pi_1, n); + r = vfmsq_f64 (r, d->pi_2, n); + r = vfmsq_f64 (r, d->pi_3, n); + + /* sin(r) poly approx. */ + r2 = vmulq_f64 (r, r); + r3 = vmulq_f64 (r2, r); + r4 = vmulq_f64 (r2, r2); + + t1 = vfmaq_f64 (C (4), C (5), r2); + t2 = vfmaq_f64 (C (2), C (3), r2); + t3 = vfmaq_f64 (C (0), C (1), r2); + + y = vfmaq_f64 (t1, C (6), r4); + y = vfmaq_f64 (t2, y, r4); + y = vfmaq_f64 (t3, y, r4); + y = vfmaq_f64 (r, y, r3); + + if (unlikely (v_any_u64 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); +} diff --git a/math/aarch64/v_sinf.c b/math/aarch64/v_sinf.c new file mode 100644 index 0000000000000000000000000000000000000000..336879844459f70accf8f2532407db6fc6810e69 --- /dev/null +++ b/math/aarch64/v_sinf.c @@ -0,0 +1,82 @@ +/* + * Single-precision vector sin function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[4]; + float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; +} data = { + /* 1.886 ulp error. */ + .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), + V4 (0x1.5b2e76p-19f) }, + + .pi_1 = V4 (0x1.921fb6p+1f), + .pi_2 = V4 (-0x1.777a5cp-24f), + .pi_3 = V4 (-0x1.ee59dap-49f), + + .inv_pi = V4 (0x1.45f306p-2f), + .shift = V4 (0x1.8p+23f), + .range_val = V4 (0x1p20f) +}; + +#if WANT_SIMD_EXCEPT +# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */ +# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */ +#endif + +#define C(i) d->poly[i] + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) +{ + /* Fall back to scalar code. 
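+     The parity bit of n is folded back into the sign of y before any
+     special lane is recomputed with scalar sinf.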
*/ + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); + return v_call_f32 (sinf, x, y, cmp); +} + +float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, y; + uint32x4_t odd, cmp; + +#if WANT_SIMD_EXCEPT + uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x)); + cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh); + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + special-case handler later. */ + r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x); +#else + r = x; + cmp = vcageq_f32 (x, d->range_val); +#endif + + /* n = rint(|x|/pi) */ + n = vfmaq_f32 (d->shift, d->inv_pi, r); + odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); + n = vsubq_f32 (n, d->shift); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ + r = vfmsq_f32 (r, d->pi_1, n); + r = vfmsq_f32 (r, d->pi_2, n); + r = vfmsq_f32 (r, d->pi_3, n); + + /* y = sin(r) */ + r2 = vmulq_f32 (r, r); + y = vfmaq_f32 (C (2), C (3), r2); + y = vfmaq_f32 (C (1), y, r2); + y = vfmaq_f32 (C (0), y, r2); + y = vfmaq_f32 (r, vmulq_f32 (y, r2), r); + + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); +} diff --git a/math/cosf.c b/math/cosf.c index f29f19474e230327f439da21eb0661e53bfaa1fe..6293ce8f1b7d6bc0d0a515bb07339b1e364a0c27 100644 --- a/math/cosf.c +++ b/math/cosf.c @@ -1,8 +1,8 @@ /* * Single-precision cos function. * - * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -22,7 +22,7 @@ cosf (float y) int n; const sincos_t *p = &__sincosf_table[0]; - if (abstop12 (y) < abstop12 (pio4)) + if (abstop12 (y) < abstop12 (pio4f)) { double x2 = x * x; diff --git a/math/erf.c b/math/erf.c index 12d7e5160df702ab10ff1ae5da5604c927e54372..5f9f40dda26434e314e4d141d84868b2d3b9c1f6 100644 --- a/math/erf.c +++ b/math/erf.c @@ -2,7 +2,7 @@ * Double-precision erf(x) function. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/erf_data.c b/math/erf_data.c index 807875bdd7f5db86ad3557c9c36c7afd93c07ca0..10cf1fae93e078c2636409318f91078931c443bf 100644 --- a/math/erf_data.c +++ b/math/erf_data.c @@ -2,7 +2,7 @@ * Shared data between erf and erfc. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/erff.c b/math/erff.c index a58e82565dc34745500197c469d7f2ea9ec1f71b..9fa476dbbab2d72299486163eaeb7f5676a7b040 100644 --- a/math/erff.c +++ b/math/erff.c @@ -2,7 +2,7 @@ * Single-precision erf(x) function. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/erff_data.c b/math/erff_data.c index fa6b1ef4dedbfe7bafe493aa7c0dc007174fe704..f822788d0dd8068b17dc84ac3204349e21b4f34d 100644 --- a/math/erff_data.c +++ b/math/erff_data.c @@ -2,7 +2,7 @@ * Data for approximation of erff. * * Copyright (c) 2019-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/exp.c b/math/exp.c index 7f5024cd8792144fe2681f1a60e297d405b9ea06..1de500c31f3ed08468b4e712fd3f7ea28e8a137e 100644 --- a/math/exp.c +++ b/math/exp.c @@ -2,7 +2,7 @@ * Double-precision e^x function. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/exp10.c b/math/exp10.c new file mode 100644 index 0000000000000000000000000000000000000000..0fbec4c694ca831797d96968fc881a87aaf93644 --- /dev/null +++ b/math/exp10.c @@ -0,0 +1,129 @@ +/* + * Double-precision 10^x function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << EXP_TABLE_BITS) +#define IndexMask (N - 1) +#define OFlowBound 0x1.34413509f79ffp8 /* log10(DBL_MAX). */ +#define UFlowBound -0x1.5ep+8 /* -350. */ +#define SmallTop 0x3c6 /* top12(0x1p-57). */ +#define BigTop 0x407 /* top12(0x1p8). */ +#define Thresh 0x41 /* BigTop - SmallTop. */ +#define Shift __exp_data.shift +#define C(i) __exp_data.exp10_poly[i] + +static double +special_case (uint64_t sbits, double_t tmp, uint64_t ki) +{ + double_t scale, y; + + if (ki - (1ull << 16) < 0x80000000) + { + /* The exponent of scale might have overflowed by 1. */ + sbits -= 1ull << 52; + scale = asdouble (sbits); + y = 2 * (scale + scale * tmp); + return check_oflow (eval_as_double (y)); + } + + /* n < 0, need special care in the subnormal range. */ + sbits += 1022ull << 52; + scale = asdouble (sbits); + y = scale + scale * tmp; + + if (y < 1.0) + { + /* Round y to the right precision before scaling it into the subnormal + range to avoid double rounding that can cause 0.5+E/2 ulp error where + E is the worst-case ulp error outside the subnormal range. So this + is only useful if the goal is better than 1 ulp worst-case error. */ + double_t lo = scale - y + scale * tmp; + double_t hi = 1.0 + y; + lo = 1.0 - hi + y + lo; + y = eval_as_double (hi + lo) - 1.0; + /* Avoid -0.0 with downward rounding. */ + if (WANT_ROUNDING && y == 0.0) + y = 0.0; + /* The underflow exception needs to be signaled explicitly. */ + force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); + } + y = 0x1p-1022 * y; + + return check_uflow (y); +} + +/* Double-precision 10^x approximation. Largest observed error is ~0.513 ULP. */ +double +exp10 (double x) +{ + uint64_t ix = asuint64 (x); + uint32_t abstop = (ix >> 52) & 0x7ff; + + if (unlikely (abstop - SmallTop >= Thresh)) + { + if (abstop - SmallTop >= 0x80000000) + /* Avoid spurious underflow for tiny x. + Note: 0 is common input. */ + return x + 1; + if (abstop == 0x7ff) + return ix == asuint64 (-INFINITY) ? 0.0 : x + 1.0; + if (x >= OFlowBound) + return __math_oflow (0); + if (x < UFlowBound) + return __math_uflow (0); + + /* Large x is special-cased below. */ + abstop = 0; + } + + /* Reduce x: z = x * N / log10(2), k = round(z). */ + double_t z = __exp_data.invlog10_2N * x; + double_t kd; + int64_t ki; +#if TOINT_INTRINSICS + kd = roundtoint (z); + ki = converttoint (z); +#else + kd = eval_as_double (z + Shift); + kd -= Shift; + ki = kd; +#endif + + /* r = x - k * log10(2), r in [-0.5, 0.5]. */ + double_t r = x; + r = __exp_data.neglog10_2hiN * kd + r; + r = __exp_data.neglog10_2loN * kd + r; + + /* exp10(x) = 2^(k/N) * 2^(r/N). + Approximate the two components separately. 
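+
+     This follows from 10^x = 2^(x*log2(10)): the reduction above picks
+     k = round(x*N*log2(10)), so the table supplies 2^(k/N) and the
+     polynomial only has to cover the small remainder.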
*/ + + /* s = 2^(k/N), using lookup table. */ + uint64_t e = ki << (52 - EXP_TABLE_BITS); + uint64_t i = (ki & IndexMask) * 2; + uint64_t u = __exp_data.tab[i + 1]; + uint64_t sbits = u + e; + + double_t tail = asdouble (__exp_data.tab[i]); + + /* 2^(r/N) ~= 1 + r * Poly(r). */ + double_t r2 = r * r; + double_t p = C (0) + r * C (1); + double_t y = C (2) + r * C (3); + y = y + r2 * C (4); + y = p + r2 * y; + y = tail + y * r; + + if (unlikely (abstop == 0)) + return special_case (sbits, y, ki); + + /* Assemble components: + y = 2^(r/N) * 2^(k/N) + ~= (y + 1) * s. */ + double_t s = asdouble (sbits); + return eval_as_double (s * y + s); +} diff --git a/math/exp2.c b/math/exp2.c index 35ab39f22ed5fcb0442c2fb84eea80ff95540fe2..a1eee44f1f4828b7fb9f133227e8b0e808f83788 100644 --- a/math/exp2.c +++ b/math/exp2.c @@ -2,7 +2,7 @@ * Double-precision 2^x function. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/exp2f.c b/math/exp2f.c index 94b32538aa0de9c7e47ea3df9a5c60b7851bbeed..776c3ddf76636a75b24de080ac9fde62eed642d8 100644 --- a/math/exp2f.c +++ b/math/exp2f.c @@ -2,7 +2,7 @@ * Single-precision 2^x function. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/exp2f_data.c b/math/exp2f_data.c index 3fb0ad11b15a4e387b91778ea2dc31faa1903bfa..f0cb7fccacd158e0a771e3c3cb7ea4847896c149 100644 --- a/math/exp2f_data.c +++ b/math/exp2f_data.c @@ -2,7 +2,7 @@ * Shared data between expf, exp2f and powf. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/exp_data.c b/math/exp_data.c index cba76832566f04cc100bd153da745a6a57d30faf..9df4e0b1a2bb9ccbb2c21deb23787323bcfce88d 100644 --- a/math/exp_data.c +++ b/math/exp_data.c @@ -2,7 +2,7 @@ * Shared data between exp, exp2 and pow. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" @@ -12,6 +12,7 @@ const struct exp_data __exp_data = { // N/ln2 .invln2N = 0x1.71547652b82fep0 * N, +.invlog10_2N = 0x1.a934f0979a371p1 * N, // -ln2/N #if N == 64 .negln2hiN = -0x1.62e42fefa0000p-7, @@ -26,6 +27,8 @@ const struct exp_data __exp_data = { .negln2hiN = -0x1.62e42fef80000p-10, .negln2loN = -0x1.1cf79abc9e3b4p-45, #endif +.neglog10_2hiN = -0x1.3441350ap-2 / N, +.neglog10_2loN = 0x1.0c0219dc1da99p-39 / N, // Used for rounding when !TOINT_INTRINSICS #if EXP_USE_TOINT_NARROW .shift = 0x1800000000.8p0, @@ -147,6 +150,24 @@ const struct exp_data __exp_data = { 0x1.3b2ab786ee1dap-7, #endif }, +.exp10_poly = { +#if EXP10_POLY_WIDE +/* Range is wider if using shift-based reduction: coeffs generated + using Remez in [-log10(2)/128, log10(2)/128 ]. */ +0x1.26bb1bbb55515p1, +0x1.53524c73cd32bp1, +0x1.0470591e1a108p1, +0x1.2bd77b12fe9a8p0, +0x1.14289fef24b78p-1 +#else +/* Coeffs generated using Remez in [-log10(2)/256, log10(2)/256 ]. 
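+   (Half of one table interval: with nearest rounding the reduced
+   argument stays within log10(2)/(2N) of zero.)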
*/ +0x1.26bb1bbb55516p1, +0x1.53524c73ce9fep1, +0x1.0470591ce4b26p1, +0x1.2bd76577fe684p0, +0x1.1446eeccd0efbp-1 +#endif +}, // 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N) // tab[2*k] = asuint64(T[k]) // tab[2*k+1] = asuint64(H[k]) - (k << 52)/N diff --git a/math/expf.c b/math/expf.c index 9b2f0c3d8c56c98d8e9d37d45143b713cb92e570..08a20d59e49145ab8ae0099c1bda89ab6cad0752 100644 --- a/math/expf.c +++ b/math/expf.c @@ -2,7 +2,7 @@ * Single-precision e^x function. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/include/mathlib.h b/math/include/mathlib.h index 279d829d8ea15acae38ae51ada3fa74f3920f7f5..64cbb9c1f8506eca4fc7bf0ccf9c2991b4663b06 100644 --- a/math/include/mathlib.h +++ b/math/include/mathlib.h @@ -1,8 +1,8 @@ /* * Public API. * - * Copyright (c) 2015-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2015-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _MATHLIB_H @@ -18,74 +18,33 @@ float cosf (float); void sincosf (float, float*, float*); double exp (double); +double exp10 (double); double exp2 (double); double log (double); double log2 (double); double pow (double, double); -/* Scalar functions using the vector algorithm with identical result. */ -float __s_sinf (float); -float __s_cosf (float); -float __s_expf (float); -float __s_expf_1u (float); -float __s_exp2f (float); -float __s_exp2f_1u (float); -float __s_logf (float); -float __s_powf (float, float); -double __s_sin (double); -double __s_cos (double); -double __s_exp (double); -double __s_log (double); -double __s_pow (double, double); - #if __aarch64__ -#if __GNUC__ >= 5 +# if __GNUC__ >= 5 typedef __Float32x4_t __f32x4_t; typedef __Float64x2_t __f64x2_t; -#elif __clang_major__*100+__clang_minor__ >= 305 +# elif __clang_major__*100+__clang_minor__ >= 305 typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t; typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; -#else -#error Unsupported compiler -#endif - -/* Vector functions following the base PCS. */ -__f32x4_t __v_sinf (__f32x4_t); -__f32x4_t __v_cosf (__f32x4_t); -__f32x4_t __v_expf (__f32x4_t); -__f32x4_t __v_expf_1u (__f32x4_t); -__f32x4_t __v_exp2f (__f32x4_t); -__f32x4_t __v_exp2f_1u (__f32x4_t); -__f32x4_t __v_logf (__f32x4_t); -__f32x4_t __v_powf (__f32x4_t, __f32x4_t); -__f64x2_t __v_sin (__f64x2_t); -__f64x2_t __v_cos (__f64x2_t); -__f64x2_t __v_exp (__f64x2_t); -__f64x2_t __v_log (__f64x2_t); -__f64x2_t __v_pow (__f64x2_t, __f64x2_t); +# else +# error Unsupported compiler +# endif -#if __GNUC__ >= 9 || __clang_major__ >= 8 -#define __vpcs __attribute__((__aarch64_vector_pcs__)) - -/* Vector functions following the vector PCS. 
*/ -__vpcs __f32x4_t __vn_sinf (__f32x4_t); -__vpcs __f32x4_t __vn_cosf (__f32x4_t); -__vpcs __f32x4_t __vn_expf (__f32x4_t); -__vpcs __f32x4_t __vn_expf_1u (__f32x4_t); -__vpcs __f32x4_t __vn_exp2f (__f32x4_t); -__vpcs __f32x4_t __vn_exp2f_1u (__f32x4_t); -__vpcs __f32x4_t __vn_logf (__f32x4_t); -__vpcs __f32x4_t __vn_powf (__f32x4_t, __f32x4_t); -__vpcs __f64x2_t __vn_sin (__f64x2_t); -__vpcs __f64x2_t __vn_cos (__f64x2_t); -__vpcs __f64x2_t __vn_exp (__f64x2_t); -__vpcs __f64x2_t __vn_log (__f64x2_t); -__vpcs __f64x2_t __vn_pow (__f64x2_t, __f64x2_t); +# if __GNUC__ >= 9 || __clang_major__ >= 8 +# undef __vpcs +# define __vpcs __attribute__((__aarch64_vector_pcs__)) /* Vector functions following the vector PCS using ABI names. */ __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_expf_1u (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_exp2f_1u (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t); @@ -94,7 +53,7 @@ __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t); __vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t); -#endif +# endif #endif #endif diff --git a/math/log.c b/math/log.c index d3b7bc60747c2ace661ed1885669b1ab763e4dd2..43dfc2a744f060f8ebe9a4b25fb8da0367070d5e 100644 --- a/math/log.c +++ b/math/log.c @@ -2,7 +2,7 @@ * Double-precision log(x) function. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/log2.c b/math/log2.c index 55102b7729696324f1f2afb4cf4cd89fbd06c034..3f9c21b0396263dd8274b252ffb4b1669e03ef18 100644 --- a/math/log2.c +++ b/math/log2.c @@ -2,7 +2,7 @@ * Double-precision log2(x) function. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/log2_data.c b/math/log2_data.c index 3fc9b47c1f03868c950cac77bcc28e552fbf411a..293bd7df4118b08a69b4d0ff3bcc65917c628a73 100644 --- a/math/log2_data.c +++ b/math/log2_data.c @@ -2,7 +2,7 @@ * Data for log2. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/log2f.c b/math/log2f.c index acb629e6846cf3b94f665bca351d93098fb543a3..0a44fa2024f60639c34a1ce06a7b5d4eb77b09c6 100644 --- a/math/log2f.c +++ b/math/log2f.c @@ -2,7 +2,7 @@ * Single-precision log2 function. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/log2f_data.c b/math/log2f_data.c index f3546d730abab682f5b6e81adeb2064ef9357ba4..4866ef7f8171e67f36f16e41abfc858173326ab6 100644 --- a/math/log2f_data.c +++ b/math/log2f_data.c @@ -2,7 +2,7 @@ * Data definition for log2f. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/log_data.c b/math/log_data.c index 96a098d42c160e9d8713e565e35bf8901183528d..3ecc1f40a8228d5e13438a51ac0b615392e268fb 100644 --- a/math/log_data.c +++ b/math/log_data.c @@ -2,7 +2,7 @@ * Data for log. * * Copyright (c) 2018, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/logf.c b/math/logf.c index cfbaee12df108750f6de0ca9f8dd30be7a17ff2b..820f74c3e66a7078f78d39a326aade89251894ee 100644 --- a/math/logf.c +++ b/math/logf.c @@ -1,8 +1,8 @@ /* * Single-precision log function. * - * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2017-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -57,7 +57,7 @@ logf (float x) tmp = ix - OFF; i = (tmp >> (23 - LOGF_TABLE_BITS)) % N; k = (int32_t) tmp >> 23; /* arithmetic shift */ - iz = ix - (tmp & 0x1ff << 23); + iz = ix - (tmp & 0xff800000); invc = T[i].invc; logc = T[i].logc; z = (double_t) asfloat (iz); diff --git a/math/logf_data.c b/math/logf_data.c index e8973ce4fedcbffc2d587bf73fd2afa3917331ca..04247684755fdf65d4a834920f30dbb4fe72d89b 100644 --- a/math/logf_data.c +++ b/math/logf_data.c @@ -2,7 +2,7 @@ * Data definition for logf. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/math_config.h b/math/math_config.h index e85104337048abdfb1f51302fe7b3d33ead2b06a..394aaebc48ac8a94e4ab15b23326a2b3de4e337d 100644 --- a/math/math_config.h +++ b/math/math_config.h @@ -1,8 +1,8 @@ /* * Configuration for math routines. * - * Copyright (c) 2017-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2017-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _MATH_CONFIG_H @@ -92,6 +92,17 @@ # define unlikely(x) (x) #endif +/* Return ptr but hide its value from the compiler so accesses through it + cannot be optimized based on the contents. */ +#define ptr_barrier(ptr) \ + ({ \ + __typeof (ptr) __ptr = (ptr); \ + __asm("" : "+r"(__ptr)); \ + __ptr; \ + }) + +/* Symbol renames to avoid libc conflicts. */ + #if HAVE_FAST_ROUND /* When set, the roundtoint and converttoint functions are provided with the semantics documented below. */ @@ -381,15 +392,22 @@ extern const struct powf_log2_data #define EXP_USE_TOINT_NARROW 0 #define EXP2_POLY_ORDER 5 #define EXP2_POLY_WIDE 0 +/* Wider exp10 polynomial necessary for good precision in non-nearest rounding + and !TOINT_INTRINSICS. */ +#define EXP10_POLY_WIDE 0 extern const struct exp_data { double invln2N; + double invlog10_2N; double shift; double negln2hiN; double negln2loN; + double neglog10_2hiN; + double neglog10_2loN; double poly[4]; /* Last four coefficients. */ double exp2_shift; double exp2_poly[EXP2_POLY_ORDER]; + double exp10_poly[5]; uint64_t tab[2*(1 << EXP_TABLE_BITS)]; } __exp_data HIDDEN; @@ -459,4 +477,16 @@ extern const struct erf_data double erfc_poly_F[ERFC_POLY_F_NCOEFFS]; } __erf_data HIDDEN; +#define V_EXP_TABLE_BITS 7 +extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN; + +#define V_LOG_TABLE_BITS 7 +extern const struct v_log_data +{ + struct + { + double invc, logc; + } table[1 << V_LOG_TABLE_BITS]; +} __v_log_data HIDDEN; + #endif diff --git a/math/math_err.c b/math/math_err.c index 1bf9538a1ab1d43ee26a20b8a57d2c129685fcd7..cfe072809cf43c2dcd700798469af446e576affa 100644 --- a/math/math_err.c +++ b/math/math_err.c @@ -2,7 +2,7 @@ * Double-precision math error handling. * * Copyright (c) 2018, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/math_errf.c b/math/math_errf.c index d5350b819ab1aa4c37f61616e2d54b77027520fd..4233918b1eaeef1e597d82e3242ce302d34e572c 100644 --- a/math/math_errf.c +++ b/math/math_errf.c @@ -2,7 +2,7 @@ * Single-precision math error handling. * * Copyright (c) 2017-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/pow.c b/math/pow.c index 86842c6abacd962b4df3f536229c977b9d167775..af719fe5ab105861f410eaaa0350692bfaa49346 100644 --- a/math/pow.c +++ b/math/pow.c @@ -2,7 +2,7 @@ * Double-precision x^y function. * * Copyright (c) 2018-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/pow_log_data.c b/math/pow_log_data.c index 45569c5cc0645171b2e88db7dacc186540f8614b..2a4c250d85c3b7715e84c513ed1e6c9daa628ce2 100644 --- a/math/pow_log_data.c +++ b/math/pow_log_data.c @@ -2,7 +2,7 @@ * Data for the log part of pow. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/powf.c b/math/powf.c index 6ba45d3852a50b1ae3decb93e291a6d285692e5e..05c80bb2eb670e032ec6ce4bc504ef0577d45ea8 100644 --- a/math/powf.c +++ b/math/powf.c @@ -2,7 +2,7 @@ * Single-precision pow function. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/powf_log2_data.c b/math/powf_log2_data.c index 97e0d98cdbab6ffa9358a9670acd5c1255c02799..243836a549fdb7d8daf14488b2403eea074f4594 100644 --- a/math/powf_log2_data.c +++ b/math/powf_log2_data.c @@ -2,7 +2,7 @@ * Data definition for powf. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/math/s_cos.c b/math/s_cos.c deleted file mode 100644 index 53a95b0adfde452cdfd9adb3fd315f314d080118..0000000000000000000000000000000000000000 --- a/math/s_cos.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_cos.c" diff --git a/math/s_cosf.c b/math/s_cosf.c deleted file mode 100644 index 914c02eba6516e924785351f166161a608520c30..0000000000000000000000000000000000000000 --- a/math/s_cosf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_cosf.c" diff --git a/math/s_exp.c b/math/s_exp.c deleted file mode 100644 index ac7246b2c100d474250533eae917b79e179ae13c..0000000000000000000000000000000000000000 --- a/math/s_exp.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_exp.c" diff --git a/math/s_exp2f.c b/math/s_exp2f.c deleted file mode 100644 index df7dfd680ff40d4a15c8fdff9dec438a9978ddd1..0000000000000000000000000000000000000000 --- a/math/s_exp2f.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_exp2f.c" diff --git a/math/s_exp2f_1u.c b/math/s_exp2f_1u.c deleted file mode 100644 index 5e3852b41d83710fe91f1dffcd97662a5bfe6d01..0000000000000000000000000000000000000000 --- a/math/s_exp2f_1u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_exp2f_1u.c" diff --git a/math/s_expf.c b/math/s_expf.c deleted file mode 100644 index 3492c460733d7a128deb55b4bb6db4eaa7092db4..0000000000000000000000000000000000000000 --- a/math/s_expf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_expf.c" diff --git a/math/s_expf_1u.c b/math/s_expf_1u.c deleted file mode 100644 index eb7bbcba5566a177dad04d70407cbeb4c99b3aee..0000000000000000000000000000000000000000 --- a/math/s_expf_1u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_expf_1u.c" diff --git a/math/s_log.c b/math/s_log.c deleted file mode 100644 index 23289cf948ecd9503653a7719bfcec1daf238289..0000000000000000000000000000000000000000 --- a/math/s_log.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_log.c" diff --git a/math/s_logf.c b/math/s_logf.c deleted file mode 100644 index 9399350fc1ee501f7e855ef1bf4ad53fd1c2d374..0000000000000000000000000000000000000000 --- a/math/s_logf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_logf.c" diff --git a/math/s_pow.c b/math/s_pow.c deleted file mode 100644 index 2e34c9f896d6d920937d12befad1fc9e4e0a1596..0000000000000000000000000000000000000000 --- a/math/s_pow.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_pow.c" diff --git a/math/s_powf.c b/math/s_powf.c deleted file mode 100644 index 6d91a4a72b3733ba435d3605ec4d5a880f33ce90..0000000000000000000000000000000000000000 --- a/math/s_powf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_powf.c" diff --git a/math/s_sin.c b/math/s_sin.c deleted file mode 100644 index 06982c2018c675c1b8eac362d7c17b07553da760..0000000000000000000000000000000000000000 --- a/math/s_sin.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_sin.c" diff --git a/math/s_sinf.c b/math/s_sinf.c deleted file mode 100644 index 68ca90853736f260b4b7f345928c1b7ee893f24c..0000000000000000000000000000000000000000 --- a/math/s_sinf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_sinf.c" diff --git a/math/sincosf.c b/math/sincosf.c index 9746f1c22e6c2b30a2003e649fcfd40ebd8bcc7c..446f21d60faf3a5b3203ac6abf4df89a77907ed6 100644 --- a/math/sincosf.c +++ b/math/sincosf.c @@ -1,8 +1,8 @@ /* * Single-precision sin/cos function. * - * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2021, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -22,7 +22,7 @@ sincosf (float y, float *sinp, float *cosp) int n; const sincos_t *p = &__sincosf_table[0]; - if (abstop12 (y) < abstop12 (pio4)) + if (abstop12 (y) < abstop12 (pio4f)) { double x2 = x * x; diff --git a/math/sincosf.h b/math/sincosf.h index 1e80fc9ba8e19cab265fc98ec325c9b3f17a998d..ec23ed7aeb2615e97ca26860c12452d548179ba4 100644 --- a/math/sincosf.h +++ b/math/sincosf.h @@ -1,8 +1,8 @@ /* * Header for sinf, cosf and sincosf. * - * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -12,7 +12,7 @@ /* 2PI * 2^-64. */ static const double pi63 = 0x1.921FB54442D18p-62; /* PI / 4. */ -static const double pio4 = 0x1.921FB54442D18p-1; +static const float pio4f = 0x1.921FB6p-1f; /* The constants and polynomials for sine and cosine. */ typedef struct diff --git a/math/sincosf_data.c b/math/sincosf_data.c index ab4ac4710feff2468cf9e55b04d4ad22dbc75233..22525290ab087a0f27e60f0c36731a5227da4baa 100644 --- a/math/sincosf_data.c +++ b/math/sincosf_data.c @@ -2,7 +2,7 @@ * Data definition for sinf, cosf and sincosf. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/sinf.c b/math/sinf.c index ddbc1daf74a9df1d90dad824f3c30d0460aafcc2..8dd8ae458794c51cd24a4e7623d16ccf49b0bcd9 100644 --- a/math/sinf.c +++ b/math/sinf.c @@ -1,8 +1,8 @@ /* * Single-precision sin function. * - * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -21,7 +21,7 @@ sinf (float y) int n; const sincos_t *p = &__sincosf_table[0]; - if (abstop12 (y) < abstop12 (pio4)) + if (abstop12 (y) < abstop12 (pio4f)) { s = x * x; diff --git a/math/test/mathbench.c b/math/test/mathbench.c index 0c17826e52961b3abd86b1e53ab3ec4a74d7ed8e..b2711e5a763ab4c4b13dfe23ce6abebd8d18d4da 100644 --- a/math/test/mathbench.c +++ b/math/test/mathbench.c @@ -1,8 +1,8 @@ /* * Microbenchmark for math functions. * - * Copyright (c) 2018-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #undef _GNU_SOURCE @@ -15,11 +15,6 @@ #include #include "mathlib.h" -#ifndef WANT_VMATH -/* Enable the build of vector math code. */ -# define WANT_VMATH 1 -#endif - /* Number of measurements, best result is reported. */ #define MEASURE 60 /* Array size. 
*/
@@ -34,8 +29,9 @@ static float Af[N];
 static long measurecount = MEASURE;
 static long itercount = ITER;
-#if __aarch64__ && WANT_VMATH
-typedef __f64x2_t v_double;
+#ifdef __vpcs
+#include <arm_neon.h>
+typedef float64x2_t v_double;
 #define v_double_len() 2
@@ -51,7 +47,7 @@ v_double_dup (double x)
   return (v_double){x, x};
 }
-typedef __f32x4_t v_float;
+typedef float32x4_t v_float;
 #define v_float_len() 4
@@ -76,141 +72,91 @@ typedef float v_float;
 #define v_float_len(x) 1
 #define v_float_load(x) (x)[0]
 #define v_float_dup(x) (x)
-#endif
-
-static double
-dummy (double x)
-{
-  return x;
-}
-
-static float
-dummyf (float x)
-{
-  return x;
-}
-
-#if WANT_VMATH
-#if __aarch64__
-static v_double
-__v_dummy (v_double x)
-{
-  return x;
-}
-static v_float
-__v_dummyf (v_float x)
-{
-  return x;
-}
-
-#ifdef __vpcs
-__vpcs static v_double
-__vn_dummy (v_double x)
-{
-  return x;
-}
+#endif
-__vpcs static v_float
-__vn_dummyf (v_float x)
-{
-  return x;
-}
+#if WANT_SVE_MATH
+#include <arm_sve.h>
+typedef svbool_t sv_bool;
+typedef svfloat64_t sv_double;
-__vpcs static v_float
-xy__vn_powf (v_float x)
-{
-  return __vn_powf (x, x);
-}
+#define sv_double_len() svcntd()
-__vpcs static v_float
-xy_Z_powf (v_float x)
+static inline sv_double
+sv_double_load (const double *p)
 {
-  return _ZGVnN4vv_powf (x, x);
+  svbool_t pg = svptrue_b64();
+  return svld1(pg, p);
 }
-__vpcs static v_double
-xy__vn_pow (v_double x)
+static inline sv_double
+sv_double_dup (double x)
 {
-  return __vn_pow (x, x);
+  return svdup_n_f64(x);
 }
-__vpcs static v_double
-xy_Z_pow (v_double x)
-{
-  return _ZGVnN2vv_pow (x, x);
-}
-#endif
+typedef svfloat32_t sv_float;
-static v_float
-xy__v_powf (v_float x)
-{
-  return __v_powf (x, x);
-}
+#define sv_float_len() svcntw()
-static v_double
-xy__v_pow (v_double x)
+static inline sv_float
+sv_float_load (const float *p)
 {
-  return __v_pow (x, x);
+  svbool_t pg = svptrue_b32();
+  return svld1(pg, p);
 }
-#endif
-static float
-xy__s_powf (float x)
+static inline sv_float
+sv_float_dup (float x)
 {
-  return __s_powf (x, x);
-}
-
-static double
-xy__s_pow (double x)
-{
-  return __s_pow (x, x);
+  return svdup_n_f32(x);
 }
+#else
+/* dummy definitions to make things compile.
*/ +#define sv_double_len(x) 1 +#define sv_float_len(x) 1 #endif static double -xypow (double x) +dummy (double x) { - return pow (x, x); + return x; } static float -xypowf (float x) +dummyf (float x) { - return powf (x, x); + return x; } - -static double -xpow (double x) +#ifdef __vpcs +__vpcs static v_double +__vn_dummy (v_double x) { - return pow (x, 23.4); + return x; } -static float -xpowf (float x) +__vpcs static v_float +__vn_dummyf (v_float x) { - return powf (x, 23.4f); + return x; } - -static double -ypow (double x) +#endif +#if WANT_SVE_MATH +static sv_double +__sv_dummy (sv_double x, sv_bool pg) { - return pow (2.34, x); + return x; } -static float -ypowf (float x) +static sv_float +__sv_dummyf (sv_float x, sv_bool pg) { - return powf (2.34f, x); + return x; } -static float -sincosf_wrap (float x) -{ - float s, c; - sincosf (x, &s, &c); - return s + c; -} +#endif + +#include "test/mathbench_wrappers.h" static const struct fun { @@ -223,127 +169,40 @@ static const struct fun { double (*d) (double); float (*f) (float); - v_double (*vd) (v_double); - v_float (*vf) (v_float); #ifdef __vpcs __vpcs v_double (*vnd) (v_double); __vpcs v_float (*vnf) (v_float); +#endif +#if WANT_SVE_MATH + sv_double (*svd) (sv_double, sv_bool); + sv_float (*svf) (sv_float, sv_bool); #endif } fun; } funtab[] = { #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}}, #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}}, -#define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}}, -#define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}}, #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}}, #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}}, +#define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}}, +#define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}}, D (dummy, 1.0, 2.0) -D (exp, -9.9, 9.9) -D (exp, 0.5, 1.0) -D (exp2, -9.9, 9.9) -D (log, 0.01, 11.1) -D (log, 0.999, 1.001) -D (log2, 0.01, 11.1) -D (log2, 0.999, 1.001) -{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, -D (xpow, 0.01, 11.1) -D (ypow, -9.9, 9.9) -D (erf, -6.0, 6.0) - F (dummyf, 1.0, 2.0) -F (expf, -9.9, 9.9) -F (exp2f, -9.9, 9.9) -F (logf, 0.01, 11.1) -F (log2f, 0.01, 11.1) -{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}}, -F (xpowf, 0.01, 11.1) -F (ypowf, -9.9, 9.9) -{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}}, -F (sinf, 0.1, 0.7) -F (sinf, 0.8, 3.1) -F (sinf, -3.1, 3.1) -F (sinf, 3.3, 33.3) -F (sinf, 100, 1000) -F (sinf, 1e6, 1e32) -F (cosf, 0.1, 0.7) -F (cosf, 0.8, 3.1) -F (cosf, -3.1, 3.1) -F (cosf, 3.3, 33.3) -F (cosf, 100, 1000) -F (cosf, 1e6, 1e32) -F (erff, -4.0, 4.0) -#if WANT_VMATH -D (__s_sin, -3.1, 3.1) -D (__s_cos, -3.1, 3.1) -D (__s_exp, -9.9, 9.9) -D (__s_log, 0.01, 11.1) -{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}}, -F (__s_expf, -9.9, 9.9) -F (__s_expf_1u, -9.9, 9.9) -F (__s_exp2f, -9.9, 9.9) -F (__s_exp2f_1u, -9.9, 9.9) -F (__s_logf, 0.01, 11.1) -{"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}}, -F (__s_sinf, -3.1, 3.1) -F (__s_cosf, -3.1, 3.1) -#if __aarch64__ -VD (__v_dummy, 1.0, 2.0) -VD (__v_sin, -3.1, 3.1) -VD (__v_cos, -3.1, 3.1) -VD (__v_exp, -9.9, 9.9) -VD (__v_log, 0.01, 11.1) -{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}}, -VF (__v_dummyf, 1.0, 2.0) -VF 
(__v_expf, -9.9, 9.9) -VF (__v_expf_1u, -9.9, 9.9) -VF (__v_exp2f, -9.9, 9.9) -VF (__v_exp2f_1u, -9.9, 9.9) -VF (__v_logf, 0.01, 11.1) -{"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}}, -VF (__v_sinf, -3.1, 3.1) -VF (__v_cosf, -3.1, 3.1) #ifdef __vpcs VND (__vn_dummy, 1.0, 2.0) -VND (__vn_exp, -9.9, 9.9) -VND (_ZGVnN2v_exp, -9.9, 9.9) -VND (__vn_log, 0.01, 11.1) -VND (_ZGVnN2v_log, 0.01, 11.1) -{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}}, -{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, -VND (__vn_sin, -3.1, 3.1) -VND (_ZGVnN2v_sin, -3.1, 3.1) -VND (__vn_cos, -3.1, 3.1) -VND (_ZGVnN2v_cos, -3.1, 3.1) VNF (__vn_dummyf, 1.0, 2.0) -VNF (__vn_expf, -9.9, 9.9) -VNF (_ZGVnN4v_expf, -9.9, 9.9) -VNF (__vn_expf_1u, -9.9, 9.9) -VNF (__vn_exp2f, -9.9, 9.9) -VNF (_ZGVnN4v_exp2f, -9.9, 9.9) -VNF (__vn_exp2f_1u, -9.9, 9.9) -VNF (__vn_logf, 0.01, 11.1) -VNF (_ZGVnN4v_logf, 0.01, 11.1) -{"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}}, -{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}}, -VNF (__vn_sinf, -3.1, 3.1) -VNF (_ZGVnN4v_sinf, -3.1, 3.1) -VNF (__vn_cosf, -3.1, 3.1) -VNF (_ZGVnN4v_cosf, -3.1, 3.1) -#endif #endif +#if WANT_SVE_MATH +SVD (__sv_dummy, 1.0, 2.0) +SVF (__sv_dummyf, 1.0, 2.0) #endif +#include "test/mathbench_funcs.h" {0}, #undef F #undef D -#undef VF -#undef VD #undef VNF #undef VND +#undef SVF +#undef SVD }; static void @@ -442,69 +301,75 @@ runf_latency (float f (float)) prev = f (Af[i] + prev * z); } +#ifdef __vpcs static void -run_v_thruput (v_double f (v_double)) +run_vn_thruput (__vpcs v_double f (v_double)) { for (int i = 0; i < N; i += v_double_len ()) f (v_double_load (A+i)); } static void -runf_v_thruput (v_float f (v_float)) +runf_vn_thruput (__vpcs v_float f (v_float)) { for (int i = 0; i < N; i += v_float_len ()) f (v_float_load (Af+i)); } static void -run_v_latency (v_double f (v_double)) +run_vn_latency (__vpcs v_double f (v_double)) { - v_double z = v_double_dup (zero); - v_double prev = z; + volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 }; + uint64x2_t sel = vsel; + v_double prev = v_double_dup (0); for (int i = 0; i < N; i += v_double_len ()) - prev = f (v_double_load (A+i) + prev * z); + prev = f (vbslq_f64 (sel, prev, v_double_load (A+i))); } static void -runf_v_latency (v_float f (v_float)) +runf_vn_latency (__vpcs v_float f (v_float)) { - v_float z = v_float_dup (zero); - v_float prev = z; + volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 }; + uint32x4_t sel = vsel; + v_float prev = v_float_dup (0); for (int i = 0; i < N; i += v_float_len ()) - prev = f (v_float_load (Af+i) + prev * z); + prev = f (vbslq_f32 (sel, prev, v_float_load (Af+i))); } +#endif -#ifdef __vpcs +#if WANT_SVE_MATH static void -run_vn_thruput (__vpcs v_double f (v_double)) +run_sv_thruput (sv_double f (sv_double, sv_bool)) { - for (int i = 0; i < N; i += v_double_len ()) - f (v_double_load (A+i)); + for (int i = 0; i < N; i += sv_double_len ()) + f (sv_double_load (A+i), svptrue_b64 ()); } static void -runf_vn_thruput (__vpcs v_float f (v_float)) +runf_sv_thruput (sv_float f (sv_float, sv_bool)) { - for (int i = 0; i < N; i += v_float_len ()) - f (v_float_load (Af+i)); + for (int i = 0; i < N; i += sv_float_len ()) + f (sv_float_load (Af+i), svptrue_b32 ()); } static void -run_vn_latency (__vpcs v_double f (v_double)) +run_sv_latency (sv_double f (sv_double, sv_bool)) { - v_double z = v_double_dup (zero); - v_double prev = z; - for (int i = 0; i < N; i += v_double_len ()) - prev = f (v_double_load (A+i) + prev * z); + volatile sv_bool vsel 
= svptrue_b64 (); + sv_bool sel = vsel; + sv_double prev = sv_double_dup (0); + for (int i = 0; i < N; i += sv_double_len ()) + prev = f (svsel_f64 (sel, sv_double_load (A+i), prev), svptrue_b64 ()); } static void -runf_vn_latency (__vpcs v_float f (v_float)) +runf_sv_latency (sv_float f (sv_float, sv_bool)) { - v_float z = v_float_dup (zero); - v_float prev = z; - for (int i = 0; i < N; i += v_float_len ()) - prev = f (v_float_load (Af+i) + prev * z); + volatile sv_bool vsel = svptrue_b32 (); + sv_bool sel = vsel; + sv_float prev = sv_float_dup (0); + for (int i = 0; i < N; i += sv_float_len ()) + prev = f (svsel_f32 (sel, sv_float_load (Af+i), prev), svptrue_b32 ()); } #endif @@ -539,10 +404,10 @@ bench1 (const struct fun *f, int type, double lo, double hi) const char *s = type == 't' ? "rthruput" : "latency"; int vlen = 1; - if (f->vec && f->prec == 'd') - vlen = v_double_len(); - else if (f->vec && f->prec == 'f') - vlen = v_float_len(); + if (f->vec == 'n') + vlen = f->prec == 'd' ? v_double_len() : v_float_len(); + else if (f->vec == 's') + vlen = f->prec == 'd' ? sv_double_len() : sv_float_len(); if (f->prec == 'd' && type == 't' && f->vec == 0) TIMEIT (run_thruput, f->fun.d); @@ -552,14 +417,6 @@ bench1 (const struct fun *f, int type, double lo, double hi) TIMEIT (runf_thruput, f->fun.f); else if (f->prec == 'f' && type == 'l' && f->vec == 0) TIMEIT (runf_latency, f->fun.f); - else if (f->prec == 'd' && type == 't' && f->vec == 'v') - TIMEIT (run_v_thruput, f->fun.vd); - else if (f->prec == 'd' && type == 'l' && f->vec == 'v') - TIMEIT (run_v_latency, f->fun.vd); - else if (f->prec == 'f' && type == 't' && f->vec == 'v') - TIMEIT (runf_v_thruput, f->fun.vf); - else if (f->prec == 'f' && type == 'l' && f->vec == 'v') - TIMEIT (runf_v_latency, f->fun.vf); #ifdef __vpcs else if (f->prec == 'd' && type == 't' && f->vec == 'n') TIMEIT (run_vn_thruput, f->fun.vnd); @@ -570,20 +427,32 @@ bench1 (const struct fun *f, int type, double lo, double hi) else if (f->prec == 'f' && type == 'l' && f->vec == 'n') TIMEIT (runf_vn_latency, f->fun.vnf); #endif +#if WANT_SVE_MATH + else if (f->prec == 'd' && type == 't' && f->vec == 's') + TIMEIT (run_sv_thruput, f->fun.svd); + else if (f->prec == 'd' && type == 'l' && f->vec == 's') + TIMEIT (run_sv_latency, f->fun.svd); + else if (f->prec == 'f' && type == 't' && f->vec == 's') + TIMEIT (runf_sv_thruput, f->fun.svf); + else if (f->prec == 'f' && type == 'l' && f->vec == 's') + TIMEIT (runf_sv_latency, f->fun.svf); +#endif if (type == 't') { ns100 = (100 * dt + itercount * N / 2) / (itercount * N); - printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s, + printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n", + f->name, s, (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), - (unsigned long long) dt, lo, hi); + (unsigned long long) dt, lo, hi, vlen); } else if (type == 'l') { ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen); - printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s, + printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n", + f->name, s, (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), - (unsigned long long) dt, lo, hi); + (unsigned long long) dt, lo, hi, vlen); } fflush (stdout); } diff --git a/math/test/mathbench_funcs.h b/math/test/mathbench_funcs.h new file mode 100644 index 0000000000000000000000000000000000000000..84c4e68650acbb1ded2e43dee5410b7c3e7224c4 --- /dev/null +++ b/math/test/mathbench_funcs.h @@ -0,0 +1,62 @@ +/* + * Function entries for 
mathbench. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +/* clang-format off */ +D (exp, -9.9, 9.9) +D (exp, 0.5, 1.0) +D (exp10, -9.9, 9.9) +D (exp2, -9.9, 9.9) +D (log, 0.01, 11.1) +D (log, 0.999, 1.001) +D (log2, 0.01, 11.1) +D (log2, 0.999, 1.001) +{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, +D (xpow, 0.01, 11.1) +D (ypow, -9.9, 9.9) +D (erf, -6.0, 6.0) + +F (expf, -9.9, 9.9) +F (exp2f, -9.9, 9.9) +F (logf, 0.01, 11.1) +F (log2f, 0.01, 11.1) +{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}}, +F (xpowf, 0.01, 11.1) +F (ypowf, -9.9, 9.9) +{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}}, +F (sinf, 0.1, 0.7) +F (sinf, 0.8, 3.1) +F (sinf, -3.1, 3.1) +F (sinf, 3.3, 33.3) +F (sinf, 100, 1000) +F (sinf, 1e6, 1e32) +F (cosf, 0.1, 0.7) +F (cosf, 0.8, 3.1) +F (cosf, -3.1, 3.1) +F (cosf, 3.3, 33.3) +F (cosf, 100, 1000) +F (cosf, 1e6, 1e32) +F (erff, -4.0, 4.0) +#ifdef __vpcs +VND (_ZGVnN2v_exp, -9.9, 9.9) +VND (_ZGVnN2v_log, 0.01, 11.1) +{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, +VND (_ZGVnN2v_sin, -3.1, 3.1) +VND (_ZGVnN2v_cos, -3.1, 3.1) +VNF (_ZGVnN4v_expf, -9.9, 9.9) +VNF (_ZGVnN4v_expf_1u, -9.9, 9.9) +VNF (_ZGVnN4v_exp2f, -9.9, 9.9) +VNF (_ZGVnN4v_exp2f_1u, -9.9, 9.9) +VNF (_ZGVnN4v_logf, 0.01, 11.1) +{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}}, +VNF (_ZGVnN4v_sinf, -3.1, 3.1) +VNF (_ZGVnN4v_cosf, -3.1, 3.1) +#endif + /* clang-format on */ diff --git a/math/test/mathbench_wrappers.h b/math/test/mathbench_wrappers.h new file mode 100644 index 0000000000000000000000000000000000000000..062b9db56de51a741a698e13a547184461b2ca2b --- /dev/null +++ b/math/test/mathbench_wrappers.h @@ -0,0 +1,66 @@ +/* + * Function wrappers for mathbench. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifdef __vpcs + +__vpcs static v_float +xy_Z_powf (v_float x) +{ + return _ZGVnN4vv_powf (x, x); +} + +__vpcs static v_double +xy_Z_pow (v_double x) +{ + return _ZGVnN2vv_pow (x, x); +} + +#endif + +static double +xypow (double x) +{ + return pow (x, x); +} + +static float +xypowf (float x) +{ + return powf (x, x); +} + +static double +xpow (double x) +{ + return pow (x, 23.4); +} + +static float +xpowf (float x) +{ + return powf (x, 23.4f); +} + +static double +ypow (double x) +{ + return pow (2.34, x); +} + +static float +ypowf (float x) +{ + return powf (2.34f, x); +} + +static float +sincosf_wrap (float x) +{ + float s, c; + sincosf (x, &s, &c); + return s + c; +} diff --git a/math/test/mathtest.c b/math/test/mathtest.c index 310896738e478481a9f91ff878957a1f86accc2e..cedccfd39455930bf51ffdbb638b1be9935b4d80 100644 --- a/math/test/mathtest.c +++ b/math/test/mathtest.c @@ -1,8 +1,8 @@ /* * mathtest.c - test rig for mathlib * - * Copyright (c) 1998-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 1998-2022, Arm Limited. 
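Aside on the rewritten latency runners above (run_vn_latency, run_sv_latency): each call's result is now chained into the next argument through a bit-select whose all-zero mask is loaded from a volatile, so the compiler can neither prove the mask constant nor delete the dependency on prev — and, unlike the old `input + prev * zero` chain, a select keeps the inputs bit-exact even when prev is Inf or NaN (Inf * 0 would have injected a NaN input). A minimal scalar sketch of the same idea, assuming only a unary double function and an input array (not part of the patch):

#include <stdint.h>

/* Serialise calls on a data dependency without perturbing the inputs.  */
static double
latency_sketch (double f (double), const double *a, int n)
{
  volatile uint64_t vsel = 0;   /* opaque to the optimizer */
  uint64_t sel = vsel;
  double prev = 0;
  for (int i = 0; i < n; i++)
    /* sel is always 0, so a[i] is passed through unchanged, but the
       compiler must assume prev can be selected and keep the chain.  */
    prev = f (sel ? prev : a[i]);
  return prev;
}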
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -196,9 +196,11 @@ int is_complex_rettype(int rettype) { #define TFUNCARM(arg,ret,name,tolerance) { t_func, arg, ret, (void*)& ARM_PREFIX(name), m_none, tolerance, #name } #define MFUNC(arg,ret,name,tolerance) { t_macro, arg, ret, NULL, m_##name, tolerance, #name } +#ifndef PL /* sincosf wrappers for easier testing. */ static float sincosf_sinf(float x) { float s,c; sincosf(x, &s, &c); return s; } static float sincosf_cosf(float x) { float s,c; sincosf(x, &s, &c); return c; } +#endif test_func tfuncs[] = { /* trigonometric */ @@ -218,9 +220,10 @@ test_func tfuncs[] = { TFUNCARM(at_s,rt_s, tanf, 4*ULPUNIT), TFUNCARM(at_s,rt_s, sinf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, cosf, 3*ULPUNIT/4), +#ifndef PL TFUNCARM(at_s,rt_s, sincosf_sinf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, sincosf_cosf, 3*ULPUNIT/4), - +#endif /* hyperbolic */ TFUNC(at_d, rt_d, atanh, 4*ULPUNIT), TFUNC(at_d, rt_d, asinh, 4*ULPUNIT), @@ -251,6 +254,7 @@ test_func tfuncs[] = { TFUNCARM(at_s,rt_s, expf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, exp2f, 3*ULPUNIT/4), TFUNC(at_s,rt_s, expm1f, ULPUNIT), + TFUNC(at_d,rt_d, exp10, ULPUNIT), /* power */ TFUNC(at_d2,rt_d, pow, 3*ULPUNIT/4), @@ -1018,6 +1022,7 @@ int runtest(testdetail t) { DO_DOP(d_arg1,op1r); DO_DOP(d_arg2,op2r); s_arg1.i = t.op1r[0]; s_arg2.i = t.op2r[0]; + s_res.i = 0; /* * Detect NaNs, infinities and denormals on input, and set a @@ -1152,22 +1157,25 @@ int runtest(testdetail t) { tresultr[0] = t.resultr[0]; tresultr[1] = t.resultr[1]; resultr[0] = d_res.i[dmsd]; resultr[1] = d_res.i[dlsd]; + resulti[0] = resulti[1] = 0; wres = 2; break; case rt_i: tresultr[0] = t.resultr[0]; resultr[0] = intres; + resulti[0] = 0; wres = 1; break; case rt_s: case rt_s2: tresultr[0] = t.resultr[0]; resultr[0] = s_res.i; + resulti[0] = 0; wres = 1; break; default: puts("unhandled rettype in runtest"); - wres = 0; + abort (); } if(t.resultc != rc_none) { int err = 0; diff --git a/math/test/rtest/dotest.c b/math/test/rtest/dotest.c index 6be79e1df0d1acef5a5c3861f1ab73058e10b836..5b3e9b4f18e467c536d989292b91706af1ff4f67 100644 --- a/math/test/rtest/dotest.c +++ b/math/test/rtest/dotest.c @@ -2,7 +2,7 @@ * dotest.c - actually generate mathlib test cases * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/test/rtest/intern.h b/math/test/rtest/intern.h index 12a9c749e18e1127eb27922ad30d13ac3cbd4d1c..3ebd7ddaf85d7b37c163ea8994250ac20408b396 100644 --- a/math/test/rtest/intern.h +++ b/math/test/rtest/intern.h @@ -2,7 +2,7 @@ * intern.h * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef mathtest_intern_h diff --git a/math/test/rtest/main.c b/math/test/rtest/main.c index 0d8ead891320a5c5afb3d72d7b7fbd82c5f6e540..3d533c946f79be126fc0b427a9578effed68179a 100644 --- a/math/test/rtest/main.c +++ b/math/test/rtest/main.c @@ -2,7 +2,7 @@ * main.c * * Copyright (c) 1999-2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/test/rtest/random.c b/math/test/rtest/random.c index 56123966b8c48f8acbeb1501d1e56d5b1d5e3e2c..1de32580b733d347d9a62cfe2521a8aca3622b87 100644 --- a/math/test/rtest/random.c +++ b/math/test/rtest/random.c @@ -2,7 +2,7 @@ * random.c - random number generator for producing mathlib test cases * * Copyright (c) 1998-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "types.h" diff --git a/math/test/rtest/random.h b/math/test/rtest/random.h index b4b22df82a3d768bdb8227f6731b0bce5d6ce843..0b477d72b2346b2adbcd7cc7fec20dd9ade42395 100644 --- a/math/test/rtest/random.h +++ b/math/test/rtest/random.h @@ -2,7 +2,7 @@ * random.h - header for random.c * * Copyright (c) 2009-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "types.h" diff --git a/math/test/rtest/semi.c b/math/test/rtest/semi.c index c9f0daf76508194f5443bfe9fccbbfb89fe8964d..70a7844a48d613d1726b529e1347ccaec6439c1a 100644 --- a/math/test/rtest/semi.c +++ b/math/test/rtest/semi.c @@ -2,7 +2,7 @@ * semi.c: test implementations of mathlib seminumerical functions * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/test/rtest/semi.h b/math/test/rtest/semi.h index 17dc4158fb51e87e465c76ca5c9192cfb0dee71b..7a1444e55d288c93f665769a4a6fea37d42d7d33 100644 --- a/math/test/rtest/semi.h +++ b/math/test/rtest/semi.h @@ -2,7 +2,7 @@ * semi.h: header for semi.c * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef test_semi_h diff --git a/math/test/rtest/types.h b/math/test/rtest/types.h index 53cd557fa4cf448d6d4f49dbd85cf8c514905d47..e15b4e06a0d4aac3a0595edf69bfe830f56d624c 100644 --- a/math/test/rtest/types.h +++ b/math/test/rtest/types.h @@ -2,7 +2,7 @@ * types.h * * Copyright (c) 2005-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef mathtest_types_h diff --git a/math/test/rtest/wrappers.c b/math/test/rtest/wrappers.c index de45ac5768d0f750c1ad48c15902a02df9a8336d..441017192ab48b8332415c666cbd6d29c87c1e08 100644 --- a/math/test/rtest/wrappers.c +++ b/math/test/rtest/wrappers.c @@ -2,7 +2,7 @@ * wrappers.c - wrappers to modify output of MPFR/MPC test functions * * Copyright (c) 2014-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/math/test/rtest/wrappers.h b/math/test/rtest/wrappers.h index 7b09c85a59f114af56f6ec7dd9e0e7c00bd43721..0a8a58777d8aed7ae1c2e1f489b822919aa53967 100644 --- a/math/test/rtest/wrappers.h +++ b/math/test/rtest/wrappers.h @@ -2,7 +2,7 @@ * wrappers.h - wrappers to modify output of MPFR/MPC test functions * * Copyright (c) 2014-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ typedef struct { diff --git a/math/test/runulp.sh b/math/test/runulp.sh index 0190d9ab27fb104de780d9101507a85ee9ff7a2e..e2e03e3ae76196e8f94f2cdadd3147b4f4fafdac 100755 --- a/math/test/runulp.sh +++ b/math/test/runulp.sh @@ -2,8 +2,8 @@ # ULP error check script. # -# Copyright (c) 2019-2020, Arm Limited. 
-# SPDX-License-Identifier: MIT +# Copyright (c) 2019-2023, Arm Limited. +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception #set -x set -eu @@ -72,6 +72,16 @@ t pow 0x1.ffffffffffff0p-1 0x1.0000000000008p0 x 0x1p60 0x1p68 50000 t pow 0x1.ffffffffff000p-1 0x1p0 x 0x1p50 0x1p52 50000 t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000 +L=0.02 +t exp10 0 0x1p-47 5000 +t exp10 -0 -0x1p-47 5000 +t exp10 0x1p-47 1 50000 +t exp10 -0x1p-47 -1 50000 +t exp10 1 0x1.34413509f79ffp8 50000 +t exp10 -1 -0x1.434e6420f4374p8 50000 +t exp10 0x1.34413509f79ffp8 inf 5000 +t exp10 -0x1.434e6420f4374p8 -inf 5000 + L=1.0 Ldir=0.9 t erf 0 0xffff000000000000 10000 @@ -143,15 +153,10 @@ Ldir=0.5 done # vector functions + Ldir=0.5 r='n' -flags="${ULPFLAGS:--q} -f" -runs= -check __s_exp 1 && runs=1 -runv= -check __v_exp 1 && runv=1 -runvn= -check __vn_exp 1 && runvn=1 +flags="${ULPFLAGS:--q}" range_exp=' 0 0xffff000000000000 10000 @@ -177,9 +182,10 @@ range_pow=' ' range_sin=' - 0 0xffff000000000000 10000 - 0x1p-4 0x1p4 400000 - -0x1p-23 0x1p23 400000 + 0 0x1p23 500000 + -0 -0x1p23 500000 + 0x1p23 inf 10000 + -0x1p23 -inf 10000 ' range_cos="$range_sin" @@ -199,9 +205,10 @@ range_logf=' ' range_sinf=' - 0 0xffff0000 10000 - 0x1p-4 0x1p4 300000 --0x1p-9 -0x1p9 300000 + 0 0x1p20 500000 + -0 -0x1p20 500000 + 0x1p20 inf 10000 + -0x1p20 -inf 10000 ' range_cosf="$range_sinf" @@ -229,9 +236,8 @@ L_sinf=1.4 L_cosf=1.4 L_powf=2.1 -while read G F R +while read G F D do - [ "$R" = 1 ] || continue case "$G" in \#*) continue ;; esac eval range="\${range_$G}" eval L="\${L_$G}" @@ -239,74 +245,35 @@ do do [ -n "$X" ] || continue case "$X" in \#*) continue ;; esac - t $F $X + disable_fenv="" + if [ -z "$WANT_SIMD_EXCEPT" ] || [ $WANT_SIMD_EXCEPT -eq 0 ]; then + # If library was built with SIMD exceptions + # disabled, disable fenv checking in ulp + # tool. Otherwise, fenv checking may still be + # disabled by adding -f to the end of the run + # line. 
+ disable_fenv="-f" + fi + t $D $disable_fenv $F $X done << EOF $range + EOF done << EOF # group symbol run -exp __s_exp $runs -exp __v_exp $runv -exp __vn_exp $runvn -exp _ZGVnN2v_exp $runvn - -log __s_log $runs -log __v_log $runv -log __vn_log $runvn -log _ZGVnN2v_log $runvn - -pow __s_pow $runs -pow __v_pow $runv -pow __vn_pow $runvn -pow _ZGVnN2vv_pow $runvn - -sin __s_sin $runs -sin __v_sin $runv -sin __vn_sin $runvn -sin _ZGVnN2v_sin $runvn - -cos __s_cos $runs -cos __v_cos $runv -cos __vn_cos $runvn -cos _ZGVnN2v_cos $runvn - -expf __s_expf $runs -expf __v_expf $runv -expf __vn_expf $runvn -expf _ZGVnN4v_expf $runvn - -expf_1u __s_expf_1u $runs -expf_1u __v_expf_1u $runv -expf_1u __vn_expf_1u $runvn - -exp2f __s_exp2f $runs -exp2f __v_exp2f $runv -exp2f __vn_exp2f $runvn -exp2f _ZGVnN4v_exp2f $runvn - -exp2f_1u __s_exp2f_1u $runs -exp2f_1u __v_exp2f_1u $runv -exp2f_1u __vn_exp2f_1u $runvn - -logf __s_logf $runs -logf __v_logf $runv -logf __vn_logf $runvn -logf _ZGVnN4v_logf $runvn - -sinf __s_sinf $runs -sinf __v_sinf $runv -sinf __vn_sinf $runvn -sinf _ZGVnN4v_sinf $runvn - -cosf __s_cosf $runs -cosf __v_cosf $runv -cosf __vn_cosf $runvn -cosf _ZGVnN4v_cosf $runvn - -powf __s_powf $runs -powf __v_powf $runv -powf __vn_powf $runvn -powf _ZGVnN4vv_powf $runvn +exp _ZGVnN2v_exp +log _ZGVnN2v_log +pow _ZGVnN2vv_pow -f +sin _ZGVnN2v_sin -z +cos _ZGVnN2v_cos +expf _ZGVnN4v_expf +expf_1u _ZGVnN4v_expf_1u -f +exp2f _ZGVnN4v_exp2f +exp2f_1u _ZGVnN4v_exp2f_1u -f +logf _ZGVnN4v_logf +sinf _ZGVnN4v_sinf -z +cosf _ZGVnN4v_cosf +powf _ZGVnN4vv_powf -f EOF [ 0 -eq $FAIL ] || { diff --git a/math/test/testcases/directed/cosf.tst b/math/test/testcases/directed/cosf.tst index 79160443f0990058f70bc0d03be6be545f3fd6f7..7ea0d45795a3647c73ea1de9f6cbf956a1ef7bb9 100644 --- a/math/test/testcases/directed/cosf.tst +++ b/math/test/testcases/directed/cosf.tst @@ -1,7 +1,7 @@ ; cosf.tst - Directed test cases for SP cosine ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=cosf op1=7fc00001 result=7fc00001 errno=0 func=cosf op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/erf.tst b/math/test/testcases/directed/erf.tst index 7fa4d1868c0eb1a27920eda1485ee7be4dbe0f01..12384cef0dd98e24ac842c45ed89012d9c1b0ef6 100644 --- a/math/test/testcases/directed/erf.tst +++ b/math/test/testcases/directed/erf.tst @@ -1,7 +1,7 @@ ; erf.tst - Directed test cases for erf ; ; Copyright (c) 2007-2020, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=erf op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=erf op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/erff.tst b/math/test/testcases/directed/erff.tst index d05b7b1119c46c21ce7d22d2d3f2cbebff6eae44..28f8fa37f5aa7db743d399ab2a6af3070b009fe8 100644 --- a/math/test/testcases/directed/erff.tst +++ b/math/test/testcases/directed/erff.tst @@ -1,7 +1,7 @@ ; erff.tst ; ; Copyright (c) 2007-2020, Arm Limited. 
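Reading the new table together with the loop above: a row such as `sin _ZGVnN2v_sin -z` ends up invoking the ulp tool roughly as

ulp -q -f -z _ZGVnN2v_sin 0 0x1p23 500000

(an illustrative expansion — the exact command comes from the `t` helper defined earlier in the script, outside this hunk). The third column simply carries per-function extra flags: -z engages the new ignore-zero-sign comparison added to ulp.c further below, and -f disables fenv checking for the entries marked with it.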
-; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=erff op1=7fc00001 result=7fc00001 errno=0 func=erff op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/exp.tst b/math/test/testcases/directed/exp.tst index 85d556cd1e00f75c3273e67d420adce2ea7849df..0bb2ef4579cc1c5313494ead5e0dc80f4a02153e 100644 --- a/math/test/testcases/directed/exp.tst +++ b/math/test/testcases/directed/exp.tst @@ -1,7 +1,7 @@ ; Directed test cases for exp ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=exp op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=exp op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/exp10.tst b/math/test/testcases/directed/exp10.tst new file mode 100644 index 0000000000000000000000000000000000000000..2cf4273bd1d718ef0332e78553e4c2c3c1cb5c2d --- /dev/null +++ b/math/test/testcases/directed/exp10.tst @@ -0,0 +1,15 @@ +; Directed test cases for exp10 +; +; Copyright (c) 2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=exp10 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=exp10 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=exp10 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=exp10 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=exp10 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=exp10 op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox +func=exp10 op1=fff00000.00000000 result=00000000.00000000 errno=0 +func=exp10 op1=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux +func=exp10 op1=00000000.00000000 result=3ff00000.00000000 errno=0 +func=exp10 op1=80000000.00000000 result=3ff00000.00000000 errno=0 diff --git a/math/test/testcases/directed/exp2.tst b/math/test/testcases/directed/exp2.tst index fa56c9f8be4b91598121f7f376e68968d806001d..7069f9010c8ccf6e5407aadf2a581b12ff736fe4 100644 --- a/math/test/testcases/directed/exp2.tst +++ b/math/test/testcases/directed/exp2.tst @@ -1,7 +1,7 @@ ; Directed test cases for exp2 ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=exp2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=exp2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/exp2f.tst b/math/test/testcases/directed/exp2f.tst index 38cfc3f78ac61dae04c0d0372110d3351e669848..6ca2eeab4e121e165703644bee54b5d855225886 100644 --- a/math/test/testcases/directed/exp2f.tst +++ b/math/test/testcases/directed/exp2f.tst @@ -1,7 +1,7 @@ ; exp2f.tst - Directed test cases for exp2f ; ; Copyright (c) 2017-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=exp2f op1=7fc00001 result=7fc00001 errno=0 func=exp2f op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/expf.tst b/math/test/testcases/directed/expf.tst index ff0f671c2656a94b17f96d8f9a683a6b8436f674..89ae8fe78e6c17cd5295230c4185a8d11b98849d 100644 --- a/math/test/testcases/directed/expf.tst +++ b/math/test/testcases/directed/expf.tst @@ -1,7 +1,7 @@ ; expf.tst - Directed test cases for expf ; ; Copyright (c) 2007-2019, Arm Limited. 
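The bounds in the new exp10 tests are not arbitrary: exp10 overflows just above log10(DBL_MAX) and flushes to zero just below log10 of the smallest subnormal, which is what the runulp.sh limits 0x1.34413509f79ffp8 and -0x1.434e6420f4374p8 encode, and why exp10.tst expects ERANGE with status ox/ux at +/-DBL_MAX. A quick way to reproduce the two thresholds (sketch, not part of the patch):

#include <float.h>
#include <math.h>
#include <stdio.h>

int
main (void)
{
  /* Largest x before exp10(x) overflows, and the underflow bound.  */
  printf ("%a\n", log10 (DBL_MAX));    /* ~  0x1.34413509f79fep+8 */
  printf ("%a\n", log10 (0x1p-1074)); /* ~ -0x1.434e6420f4374p+8 */
  return 0;
}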
-; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=expf op1=7fc00001 result=7fc00001 errno=0 func=expf op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/log.tst b/math/test/testcases/directed/log.tst index a0aa398cbf734396be64c61612f463524d283d15..686ea835645b9c7af857f5052efbb29c71493a20 100644 --- a/math/test/testcases/directed/log.tst +++ b/math/test/testcases/directed/log.tst @@ -1,7 +1,7 @@ ; Directed test cases for log ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=log op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/log2.tst b/math/test/testcases/directed/log2.tst index ff1286cbd53e8ebfba5db81b9d244a598eb9a6ac..361bddec374bb16da87dd6fa5dc10d0a2f1ff366 100644 --- a/math/test/testcases/directed/log2.tst +++ b/math/test/testcases/directed/log2.tst @@ -1,7 +1,7 @@ ; Directed test cases for log2 ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=log2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/log2f.tst b/math/test/testcases/directed/log2f.tst index 5832c4f08f1ecb6acf9fdbbb06f0ce75bac82f6d..5fce051cddba75e19eff4fd577a456a452554159 100644 --- a/math/test/testcases/directed/log2f.tst +++ b/math/test/testcases/directed/log2f.tst @@ -1,7 +1,7 @@ ; log2f.tst - Directed test cases for log2f ; ; Copyright (c) 2017-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log2f op1=7fc00001 result=7fc00001 errno=0 func=log2f op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/logf.tst b/math/test/testcases/directed/logf.tst index 6e68a36e0f6a29f8d0646f450ed3abf0aafca260..a6d1b9d5c51fa1b9b7e8c5eef465f2e4f5cfb6f5 100644 --- a/math/test/testcases/directed/logf.tst +++ b/math/test/testcases/directed/logf.tst @@ -1,7 +1,7 @@ ; logf.tst - Directed test cases for logf ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=logf op1=7fc00001 result=7fc00001 errno=0 func=logf op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/pow.tst b/math/test/testcases/directed/pow.tst index 19665817153d03ef84dc85fe3be375bd63d2dad5..879d12864afe5d2c3e98e1c07095d7f58fe68b3b 100644 --- a/math/test/testcases/directed/pow.tst +++ b/math/test/testcases/directed/pow.tst @@ -1,7 +1,7 @@ ; Directed test cases for pow ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=pow op1=00000000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=00000000.00000000 op2=00000000.00000001 result=00000000.00000000 errno=0 diff --git a/math/test/testcases/directed/powf.tst b/math/test/testcases/directed/powf.tst index 3fa8b110f8bcb97196dca92030271ceb376644a8..46d5224008710127eb93863f5b19196cdc89693d 100644 --- a/math/test/testcases/directed/powf.tst +++ b/math/test/testcases/directed/powf.tst @@ -1,7 +1,7 @@ ; powf.tst - Directed test cases for powf ; ; Copyright (c) 2007-2019, Arm Limited. 
-; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=powf op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i func=powf op1=7f800001 op2=ff800001 result=7fc00001 errno=0 status=i diff --git a/math/test/testcases/directed/sincosf.tst b/math/test/testcases/directed/sincosf.tst index 4b33d2291c660c034fed47522966599203bb8b6c..cddb346558ea3c16ad3ca47ff7c2e3789aa31816 100644 --- a/math/test/testcases/directed/sincosf.tst +++ b/math/test/testcases/directed/sincosf.tst @@ -1,7 +1,7 @@ ; Directed test cases for SP sincos ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=sincosf_sinf op1=7fc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/sinf.tst b/math/test/testcases/directed/sinf.tst index ded80b1598c6a3904ed8eb6baab351f493592bcc..041b13d5d6cbc5e56fb570126159a73b70d0d1ab 100644 --- a/math/test/testcases/directed/sinf.tst +++ b/math/test/testcases/directed/sinf.tst @@ -1,7 +1,7 @@ ; sinf.tst - Directed test cases for SP sine ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=sinf op1=7fc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/random/double.tst b/math/test/testcases/random/double.tst index c24ff80d5d95eccc799de5bd3dd0876b19ae8fb9..8e885d61722a0b5e871e12af0a0cc0f4558f6d5b 100644 --- a/math/test/testcases/random/double.tst +++ b/math/test/testcases/random/double.tst @@ -1,7 +1,7 @@ !! double.tst - Random test case specification for DP functions !! !! Copyright (c) 1999-2019, Arm Limited. -!! SPDX-License-Identifier: MIT +!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception test exp 10000 test exp2 10000 diff --git a/math/test/testcases/random/float.tst b/math/test/testcases/random/float.tst index d02a22750abe07b9b64b63a0caf9afcb3c83d50c..ea4a5a01521484b53a8413a7edfe673c110b2bae 100644 --- a/math/test/testcases/random/float.tst +++ b/math/test/testcases/random/float.tst @@ -1,7 +1,7 @@ !! single.tst - Random test case specification for SP functions !! !! Copyright (c) 1999-2019, Arm Limited. -!! SPDX-License-Identifier: MIT +!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception test sinf 10000 test cosf 10000 diff --git a/math/test/ulp.c b/math/test/ulp.c index 51479b87a0fde860e1584536fd13b8471cfca9a2..5ff29972e50ee01026e8f15af0e7c73008909bda 100644 --- a/math/test/ulp.c +++ b/math/test/ulp.c @@ -1,10 +1,11 @@ /* * ULP error checking tool for math functions. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#define _GNU_SOURCE #include #include #include @@ -23,11 +24,6 @@ # include #endif -#ifndef WANT_VMATH -/* Enable the build of vector math code. */ -# define WANT_VMATH 1 -#endif - static inline uint64_t asuint64 (double f) { @@ -212,73 +208,61 @@ struct conf unsigned long long n; double softlim; double errlim; + int ignore_zero_sign; }; -/* Wrappers for sincos. 
*/ -static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);} -static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);} -static double sincos_sin(double x) {(void)cos(x); return sin(x);} -static double sincos_cos(double x) {(void)sin(x); return cos(x);} -#if USE_MPFR -static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); } -static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); } -#endif - /* A bit of a hack: call vector functions twice with the same input in lane 0 but a different value in other lanes: once with an in-range value and then with a special case value. */ static int secondcall; /* Wrappers for vector functions. */ -#if __aarch64__ && WANT_VMATH +#ifdef __vpcs typedef __f32x4_t v_float; typedef __f64x2_t v_double; -static const float fv[2] = {1.0f, -INFINITY}; -static const double dv[2] = {1.0, -INFINITY}; +/* First element of fv and dv may be changed by -c argument. */ +static float fv[2] = {1.0f, -INFINITY}; +static double dv[2] = {1.0, -INFINITY}; static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; } static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; } - -static float v_sinf(float x) { return __v_sinf(argf(x))[0]; } -static float v_cosf(float x) { return __v_cosf(argf(x))[0]; } -static float v_expf_1u(float x) { return __v_expf_1u(argf(x))[0]; } -static float v_expf(float x) { return __v_expf(argf(x))[0]; } -static float v_exp2f_1u(float x) { return __v_exp2f_1u(argf(x))[0]; } -static float v_exp2f(float x) { return __v_exp2f(argf(x))[0]; } -static float v_logf(float x) { return __v_logf(argf(x))[0]; } -static float v_powf(float x, float y) { return __v_powf(argf(x),argf(y))[0]; } -static double v_sin(double x) { return __v_sin(argd(x))[0]; } -static double v_cos(double x) { return __v_cos(argd(x))[0]; } -static double v_exp(double x) { return __v_exp(argd(x))[0]; } -static double v_log(double x) { return __v_log(argd(x))[0]; } -static double v_pow(double x, double y) { return __v_pow(argd(x),argd(y))[0]; } -#ifdef __vpcs -static float vn_sinf(float x) { return __vn_sinf(argf(x))[0]; } -static float vn_cosf(float x) { return __vn_cosf(argf(x))[0]; } -static float vn_expf_1u(float x) { return __vn_expf_1u(argf(x))[0]; } -static float vn_expf(float x) { return __vn_expf(argf(x))[0]; } -static float vn_exp2f_1u(float x) { return __vn_exp2f_1u(argf(x))[0]; } -static float vn_exp2f(float x) { return __vn_exp2f(argf(x))[0]; } -static float vn_logf(float x) { return __vn_logf(argf(x))[0]; } -static float vn_powf(float x, float y) { return __vn_powf(argf(x),argf(y))[0]; } -static double vn_sin(double x) { return __vn_sin(argd(x))[0]; } -static double vn_cos(double x) { return __vn_cos(argd(x))[0]; } -static double vn_exp(double x) { return __vn_exp(argd(x))[0]; } -static double vn_log(double x) { return __vn_log(argd(x))[0]; } -static double vn_pow(double x, double y) { return __vn_pow(argd(x),argd(y))[0]; } -static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; } -static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; } -static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; } -static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; } -static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; } -static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; } -static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; } 
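The Z_* wrappers above, together with the `secondcall` flag, implement the double-call scheme described in the comment: lane 0 always carries the value under test, while the spare lanes take fv[secondcall]/dv[secondcall] — an in-range value on the first call, -INFINITY on the second — so that a special case in an unused lane cannot silently corrupt lane 0 (and -c now lets the in-range value be changed). A sketch of how the two calls fit together, assuming the surrounding ulp.c definitions of secondcall and argf; the helper name Z_sinf_bothlanes is made up for illustration:

static float
Z_sinf_bothlanes (float x)
{
  secondcall = 0;
  float first = _ZGVnN4v_sinf (argf (x))[0];   /* lanes: x, x, x, fv[0] */
  secondcall = 1;
  float second = _ZGVnN4v_sinf (argf (x))[0];  /* lanes: x, x, x, -INFINITY */
  if (first == second || (first != first && second != second))
    return first;
  return 0.0f / 0.0f;  /* NaN: lane 0 depended on the other lanes */
}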
-static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; } -static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; } -static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; } -static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; } +#if WANT_SVE_MATH +#include +typedef __SVFloat32_t sv_float; +typedef __SVFloat64_t sv_double; + +static inline sv_float svargf(float x) { + int n = svcntw(); + float base[n]; + for (int i=0; iname; f++) printf ("\t%s\n", f->name); @@ -768,6 +719,7 @@ main (int argc, char *argv[]) conf.fenv = 1; conf.softlim = 0; conf.errlim = INFINITY; + conf.ignore_zero_sign = 0; for (;;) { argc--; @@ -807,11 +759,22 @@ main (int argc, char *argv[]) { argc--; argv++; - if (argc < 1) + if (argc < 1 || argv[0][1] != '\0') usage (); conf.rc = argv[0][0]; } break; + case 'z': + conf.ignore_zero_sign = 1; + break; +#ifdef __vpcs + case 'c': + argc--; + argv++; + fv[0] = strtof(argv[0], 0); + dv[0] = strtod(argv[0], 0); + break; +#endif default: usage (); } @@ -837,7 +800,19 @@ main (int argc, char *argv[]) if (strcmp (argv[0], f->name) == 0) break; if (!f->name) - usage (); + { +#ifndef __vpcs + /* Ignore vector math functions if vector math is not supported. */ + if (strncmp (argv[0], "_ZGVnN", 6) == 0) + exit (0); +#endif +#if !WANT_SVE_MATH + if (strncmp (argv[0], "_ZGVsMxv", 8) == 0) + exit (0); +#endif + printf ("math function %s not supported\n", argv[0]); + exit (1); + } if (!f->singleprec && LDBL_MANT_DIG == DBL_MANT_DIG) conf.mpfr = 1; /* Use mpfr if long double has no extra precision. */ if (!USE_MPFR && conf.mpfr) diff --git a/math/test/ulp.h b/math/test/ulp.h index a0c301664321067789322ba64932130fffa37000..b0bc59aeef8ddbd712d731e3e8d6635254fa7e88 100644 --- a/math/test/ulp.h +++ b/math/test/ulp.h @@ -1,8 +1,8 @@ /* * Generic functions for ULP error estimation. * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* For each different math function type, @@ -37,7 +37,8 @@ static int RT(ulpscale_mpfr) (mpfr_t x, int t) /* Difference between exact result and closest real number that gets rounded to got, i.e. error before rounding, for a correctly rounded result the difference is 0. */ -static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r) +static double RT (ulperr) (RT (float) got, const struct RT (ret) * p, int r, + int ignore_zero_sign) { RT(float) want = p->y; RT(float) d; @@ -45,10 +46,18 @@ static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r) if (RT(asuint) (got) == RT(asuint) (want)) return 0.0; + if (isnan (got) && isnan (want)) + /* Ignore sign of NaN. */ + return RT (issignaling) (got) == RT (issignaling) (want) ? 0 : INFINITY; if (signbit (got) != signbit (want)) - /* May have false positives with NaN. */ - //return isnan(got) && isnan(want) ? 0 : INFINITY; - return INFINITY; + { + /* Fall through to ULP calculation if ignoring sign of zero and at + exactly one of want and got is non-zero. 
*/ + if (ignore_zero_sign && want == got) + return 0.0; + if (!ignore_zero_sign || (want != 0 && got != 0)) + return INFINITY; + } if (!isfinite (want) || !isfinite (got)) { if (isnan (got) != isnan (want)) @@ -114,8 +123,12 @@ static inline void T(call_fenv) (const struct fun *f, struct T(args) a, int r, static inline void T(call_nofenv) (const struct fun *f, struct T(args) a, int r, RT(float) * y, int *ex) { + if (r != FE_TONEAREST) + fesetround (r); *y = T(call) (f, a); *ex = 0; + if (r != FE_TONEAREST) + fesetround (FE_TONEAREST); } static inline int T(call_long_fenv) (const struct fun *f, struct T(args) a, @@ -155,8 +168,12 @@ static inline int T(call_long_nofenv) (const struct fun *f, struct T(args) a, int r, struct RT(ret) * p, RT(float) ygot, int exgot) { + if (r != FE_TONEAREST) + fesetround (r); RT(double) yl = T(call_long) (f, a); p->y = (RT(float)) yl; + if (r != FE_TONEAREST) + fesetround (FE_TONEAREST); if (RT(isok_nofenv) (ygot, p->y)) return 1; p->ulpexp = RT(ulpscale) (p->y); @@ -288,7 +305,7 @@ static int T(cmp) (const struct fun *f, struct gen *gen, if (!ok) { int print = 0; - double err = RT(ulperr) (ygot, &want, r); + double err = RT (ulperr) (ygot, &want, r, conf->ignore_zero_sign); double abserr = fabs (err); // TODO: count errors below accuracy limit. if (abserr > 0) diff --git a/math/test/ulp_funcs.h b/math/test/ulp_funcs.h new file mode 100644 index 0000000000000000000000000000000000000000..84f7927d393548617c480517b6709b875b0de70b --- /dev/null +++ b/math/test/ulp_funcs.h @@ -0,0 +1,40 @@ +/* + * Function entries for ulp. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +/* clang-format off */ + F1 (sin) + F1 (cos) + F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0) + F (sincosf_cosf, sincosf_cosf, sincos_cos, sincos_mpfr_cos, 1, 1, f1, 0) + F1 (exp) + F1 (exp2) + F1 (log) + F1 (log2) + F2 (pow) + F1 (erf) + D1 (exp) + D1 (exp10) + D1 (exp2) + D1 (log) + D1 (log2) + D2 (pow) + D1 (erf) +#ifdef __vpcs + F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1) + F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1) + F (_ZGVnN4v_expf_1u, Z_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) + F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1) + F (_ZGVnN4v_exp2f_1u, Z_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1) + F (_ZGVnN4v_exp2f, Z_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) + F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1) + F (_ZGVnN4vv_powf, Z_powf, pow, mpfr_pow, 2, 1, f2, 1) + F (_ZGVnN2v_sin, Z_sin, sinl, mpfr_sin, 1, 0, d1, 1) + F (_ZGVnN2v_cos, Z_cos, cosl, mpfr_cos, 1, 0, d1, 1) + F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1) + F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1) + F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1) +#endif +/* clang-format on */ diff --git a/math/test/ulp_wrappers.h b/math/test/ulp_wrappers.h new file mode 100644 index 0000000000000000000000000000000000000000..60dc3d6dd652875043118f36f38e64ebedf5ab4a --- /dev/null +++ b/math/test/ulp_wrappers.h @@ -0,0 +1,37 @@ +/* + * Function wrappers for ulp. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* clang-format off */ + +/* Wrappers for sincos. 
*/ +static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);} +static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);} +static double sincos_sin(double x) {(void)cos(x); return sin(x);} +static double sincos_cos(double x) {(void)sin(x); return cos(x);} +#if USE_MPFR +static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); } +static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); } +#endif + +/* Wrappers for vector functions. */ +#ifdef __vpcs +static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; } +static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; } +static float Z_expf_1u(float x) { return _ZGVnN4v_expf_1u(argf(x))[0]; } +static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; } +static float Z_exp2f_1u(float x) { return _ZGVnN4v_exp2f_1u(argf(x))[0]; } +static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; } +static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; } +static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; } +static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; } +static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; } +static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; } +static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; } +static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; } +#endif + +/* clang-format on */ diff --git a/math/tgamma128.c b/math/tgamma128.c new file mode 100644 index 0000000000000000000000000000000000000000..dda0da7e8adb4a7fa3b78826316f8fd8a4fae12a --- /dev/null +++ b/math/tgamma128.c @@ -0,0 +1,351 @@ +/* + * Implementation of the true gamma function (as opposed to lgamma) + * for 128-bit long double. + * + * Copyright (c) 2006,2009,2023 Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* + * This module implements the float128 gamma function under the name + * tgamma128. It's expected to be suitable for integration into system + * maths libraries under the standard name tgammal, if long double is + * 128-bit. Such a library will probably want to check the error + * handling and optimize the initial process of extracting the + * exponent, which is done here by simple and portable (but + * potentially slower) methods. + */ + +#include +#include +#include +#include + +#include "tgamma128.h" + +#define lenof(x) (sizeof(x)/sizeof(*(x))) + +/* + * Helper routine to evaluate a polynomial via Horner's rule + */ +static long double poly(const long double *coeffs, size_t n, long double x) +{ + long double result = coeffs[--n]; + + while (n > 0) + result = (result * x) + coeffs[--n]; + + return result; +} + +/* + * Compute sin(pi*x) / pi, for use in the reflection formula that + * relates gamma(-x) and gamma(x). + */ +static long double sin_pi_x_over_pi(long double x) +{ + int quo; + long double fracpart = remquol(x, 0.5L, &quo); + + long double sign = 1.0L; + if (quo & 2) + sign = -sign; + quo &= 1; + + if (quo == 0 && fabsl(fracpart) < 0x1.p-58L) { + /* For numbers this size, sin(pi*x) is so close to pi*x that + * sin(pi*x)/pi is indistinguishable from x in float128 */ + return sign * fracpart; + } + + if (quo == 0) { + return sign * sinl(pi*fracpart) / pi; + } else { + return sign * cosl(pi*fracpart) / pi; + } +} + +/* Return tgamma(x) on the assumption that x >= 8. 
*/ +static long double tgamma_large(long double x, + bool negative, long double negadjust) +{ + /* + * In this range we compute gamma(x) as x^(x-1/2) * e^-x * K, + * where K is a correction factor computed as a polynomial in 1/x. + * + * (Vaguely inspired by the form of the Lanczos approximation, but + * I tried the Lanczos approximation itself and it suffers badly + * from big cancellation leading to loss of significance.) + */ + long double t = 1/x; + long double p = poly(coeffs_large, lenof(coeffs_large), t); + + /* + * To avoid overflow in cases where x^(x-0.5) does overflow + * but gamma(x) does not, we split x^(x-0.5) in half and + * multiply back up _after_ multiplying the shrinking factor + * of exp(-(x-0.5)). + * + * Note that computing x-0.5 and (x-0.5)/2 is exact for the + * relevant range of x, so the only sources of error are pow + * and exp themselves, plus the multiplications. + */ + long double powhalf = powl(x, (x-0.5L)/2.0L); + long double expret = expl(-(x-0.5L)); + + if (!negative) { + return (expret * powhalf) * powhalf * p; + } else { + /* + * Apply the reflection formula as commented below, but + * carefully: negadjust has magnitude less than 1, so it can + * turn a case where gamma(+x) would overflow into a case + * where gamma(-x) doesn't underflow. Not only that, but the + * FP format has greater range in the tiny domain due to + * denormals. For both reasons, it's not good enough to + * compute the positive result and then adjust it. + */ + long double ret = 1 / ((expret * powhalf) * (x * negadjust) * p); + return ret / powhalf; + } +} + +/* Return tgamma(x) on the assumption that 0 <= x < 1/32. */ +static long double tgamma_tiny(long double x, + bool negative, long double negadjust) +{ + /* + * For x near zero, we use a polynomial approximation to + * g = 1/(x*gamma(x)), and then return 1/(g*x). + */ + long double g = poly(coeffs_tiny, lenof(coeffs_tiny), x); + if (!negative) + return 1.0L / (g*x); + else + return g / negadjust; +} + +/* Return tgamma(x) on the assumption that 0 <= x < 2^-113. */ +static long double tgamma_ultratiny(long double x, bool negative, + long double negadjust) +{ + /* On this interval, gamma can't even be distinguished from 1/x, + * so we skip the polynomial evaluation in tgamma_tiny, partly to + * save time and partly to avoid the tiny intermediate values + * setting the underflow exception flag. */ + if (!negative) + return 1.0L / x; + else + return 1.0L / negadjust; +} + +/* Return tgamma(x) on the assumption that 1 <= x <= 2. */ +static long double tgamma_central(long double x) +{ + /* + * In this central interval, our strategy is to finding the + * difference between x and the point where gamma has a minimum, + * and approximate based on that. + */ + + /* The difference between the input x and the minimum x. The first + * subtraction is expected to be exact, since x and min_hi have + * the same exponent (unless x=2, in which case it will still be + * exact). */ + long double t = (x - min_x_hi) - min_x_lo; + + /* + * Now use two different polynomials for the intervals [1,m] and + * [m,2]. + */ + long double p; + if (t < 0) + p = poly(coeffs_central_neg, lenof(coeffs_central_neg), -t); + else + p = poly(coeffs_central_pos, lenof(coeffs_central_pos), t); + + return (min_y_lo + p * (t*t)) + min_y_hi; +} + +long double tgamma128(long double x) +{ + /* + * Start by extracting the number's sign and exponent, and ruling + * out cases of non-normalized numbers. 
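For reference, the approximation tgamma_large computes above is effectively

  gamma(x) ~= x^(x-1/2) * exp(-(x-1/2)) * P(1/x)

and Stirling's series predicts the constant term of the correction factor: P(0) should equal sqrt(2*pi/e) ~= 1.5203469011. Indeed coeffs_large[0] = 0x1.8535745aa795...p+0 ~= 1.5203469 — a consistency check worked out here, not a statement made by the patch itself.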
+ * + * For an implementation integrated into a system libm, it would + * almost certainly be quicker to do this by direct bitwise access + * to the input float128 value, using whatever is the local idiom + * for knowing its endianness. + * + * Integration into a system libc may also need to worry about + * setting errno, if that's the locally preferred way to report + * math.h errors. + */ + int sign = signbit(x); + int exponent; + switch (fpclassify(x)) { + case FP_NAN: + return x+x; /* propagate QNaN, make SNaN throw an exception */ + case FP_ZERO: + return 1/x; /* divide by zero on purpose to indicate a pole */ + case FP_INFINITE: + if (sign) { + return x-x; /* gamma(-inf) has indeterminate sign, so provoke an + * IEEE invalid operation exception to indicate that */ + } + return x; /* but gamma(+inf) is just +inf with no error */ + case FP_SUBNORMAL: + exponent = -16384; + break; + default: + frexpl(x, &exponent); + exponent--; + break; + } + + bool negative = false; + long double negadjust = 0.0L; + + if (sign) { + /* + * Euler's reflection formula is + * + * gamma(1-x) gamma(x) = pi/sin(pi*x) + * + * pi + * => gamma(x) = -------------------- + * gamma(1-x) sin(pi*x) + * + * But computing 1-x is going to lose a lot of accuracy when x + * is very small, so instead we transform using the recurrence + * gamma(t+1)=t gamma(t). Setting t=-x, this gives us + * gamma(1-x) = -x gamma(-x), so we now have + * + * pi + * gamma(x) = ---------------------- + * -x gamma(-x) sin(pi*x) + * + * which relates gamma(x) to gamma(-x), which is much nicer, + * since x can be turned into -x without rounding. + */ + negadjust = sin_pi_x_over_pi(x); + negative = true; + x = -x; + + /* + * Now the ultimate answer we want is + * + * 1 / (gamma(x) * x * negadjust) + * + * where x is the positive value we've just turned it into. + * + * For some of the cases below, we'll compute gamma(x) + * normally and then compute this adjusted value afterwards. + * But for others, we can implement the reciprocal operation + * in this formula by _avoiding_ an inversion that the + * sub-case was going to do anyway. + */ + + if (negadjust == 0) { + /* + * Special case for negative integers. Applying the + * reflection formula would cause division by zero, but + * standards would prefer we treat this error case as an + * invalid operation and return NaN instead. (Possibly + * because otherwise you'd have to decide which sign of + * infinity to return, and unlike the x=0 case, there's no + * sign of zero available to disambiguate.) + */ + return negadjust / negadjust; + } + } + + /* + * Split the positive domain into various cases. For cases where + * we do the negative-number adjustment the usual way, we'll leave + * the answer in 'g' and drop out of the if statement. + */ + long double g; + + if (exponent >= 11) { + /* + * gamma of any positive value this large overflows, and gamma + * of any negative value underflows. 
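A hand-worked check of the reflection path above (not part of the patch): for input x = -0.5, negadjust = sin(-pi/2)/pi = -1/pi, and x is then flipped to +0.5. The [1/32,1) branch further below computes g = tgamma_central(1.5)/0.5 = gamma(0.5) = sqrt(pi), so the final reflection step returns

  1 / (g * x * negadjust) = 1 / (sqrt(pi) * 0.5 * (-1/pi)) = -2*sqrt(pi) ~= -3.5449

which is exactly tgamma(-0.5).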
+ */ + if (!negative) { + long double huge = 0x1p+12288L; + return huge * huge; /* provoke an overflow */ + } else { + long double tiny = 0x1p-12288L; + return tiny * tiny * negadjust; /* underflow, of the right sign */ + } + } else if (exponent >= 3) { + /* Negative-number adjustment happens inside here */ + return tgamma_large(x, negative, negadjust); + } else if (exponent < -113) { + /* Negative-number adjustment happens inside here */ + return tgamma_ultratiny(x, negative, negadjust); + } else if (exponent < -5) { + /* Negative-number adjustment happens inside here */ + return tgamma_tiny(x, negative, negadjust); + } else if (exponent == 0) { + g = tgamma_central(x); + } else if (exponent < 0) { + /* + * For x in [1/32,1) we range-reduce upwards to the interval + * [1,2), using the inverse of the normal recurrence formula: + * gamma(x) = gamma(x+1)/x. + */ + g = tgamma_central(1+x) / x; + } else { + /* + * For x in [2,8) we range-reduce downwards to the interval + * [1,2) by repeated application of the recurrence formula. + * + * Actually multiplying (x-1) by (x-2) by (x-3) and so on + * would introduce multiple ULPs of rounding error. We can get + * better accuracy by writing x = (k+1/2) + t, where k is an + * integer and |t|<1/2, and expanding out the obvious factor + * (x-1)(x-2)...(x-k+1) as a polynomial in t. + */ + long double mult; + int i = x; + if (i == 2) { /* x in [2,3) */ + mult = (x-1); + } else { + long double t = x - (i + 0.5L); + switch (i) { + /* E.g. for x=3.5+t, we want + * (x-1)(x-2) = (2.5+t)(1.5+t) = 3.75 + 4t + t^2 */ + case 3: + mult = 3.75L+t*(4.0L+t); + break; + case 4: + mult = 13.125L+t*(17.75L+t*(7.5L+t)); + break; + case 5: + mult = 59.0625L+t*(93.0L+t*(51.50L+t*(12.0L+t))); + break; + case 6: + mult = 324.84375L+t*(570.5625L+t*(376.250L+t*( + 117.5L+t*(17.5L+t)))); + break; + case 7: + mult = 2111.484375L+t*(4033.5L+t*(3016.1875L+t*( + 1140.0L+t*(231.25L+t*(24.0L+t))))); + break; + } + } + + g = tgamma_central(x - (i-1)) * mult; + } + + if (!negative) { + /* Positive domain: return g unmodified */ + return g; + } else { + /* Negative domain: apply the reflection formula as commented above */ + return 1.0L / (g * x * negadjust); + } +} diff --git a/math/tgamma128.h b/math/tgamma128.h new file mode 100644 index 0000000000000000000000000000000000000000..ced10c3cc34ca26bcb3d6d8b31899ef9c3f35b15 --- /dev/null +++ b/math/tgamma128.h @@ -0,0 +1,141 @@ +/* + * Polynomial coefficients and other constants for tgamma128.c. + * + * Copyright (c) 2006,2009,2023 Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* The largest positive value for which 128-bit tgamma does not overflow. 
*/ +static const long double max_x = 0x1.b6e3180cd66a5c4206f128ba77f4p+10L; + +/* Coefficients of the polynomial used in the tgamma_large() subroutine */ +static const long double coeffs_large[] = { + 0x1.8535745aa79569579b9eec0f3bbcp+0L, + 0x1.0378f83c6fb8f0e51269f2b4a973p-3L, + 0x1.59f6a05094f69686c3380f4e2783p-8L, + -0x1.0b291dee952a82764a4859b081a6p-8L, + -0x1.6dd301b2205bf936b5a3eaad0dbbp-12L, + 0x1.387a8b5f38dd77e7f139b1021e86p-10L, + 0x1.bca46637f65b13750c728cc29e40p-14L, + -0x1.d80401c00aef998c9e303151a51cp-11L, + -0x1.49cb6bb09f935a2053ccc2cf3711p-14L, + 0x1.4e950204437dcaf2be77f73a6f45p-10L, + 0x1.cb711a2d65f188bf60110934d6bep-14L, + -0x1.7d7ff4bc95dc7faefc5e767f70f1p-9L, + -0x1.0305ab9760cddb0d833e73766836p-12L, + 0x1.3ef6c84bf1cd5c3f65ac2693bb5bp-7L, + 0x1.bb4144740ad9290123fdcea684aap-11L, + -0x1.72ab4e88272a229bfafd192450f0p-5L, + 0x1.80c70ac6eb3b7a698983d25a62b8p-12L, + 0x1.e222791c6743ce3e3cae220fb236p-3L, + 0x1.1a2dca1c82a9326c52b465f7cb7ap-2L, + -0x1.9d204fa235a42cd901b123d2ad47p+1L, + 0x1.55b56d1158f77ddb1c95fc44ab02p+0L, + 0x1.37f900a11dbd892abd7dde533e2dp+5L, + -0x1.2da49f4188dd89cb958369ef2401p+7L, + 0x1.fdae5ec3ec6eb7dffc09edbe6c14p+7L, + -0x1.61433cebe649098c9611c4c7774ap+7L, +}; + +/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */ +static const long double coeffs_tiny[] = { + 0x1.0000000000000000000000000000p+0L, + 0x1.2788cfc6fb618f49a37c7f0201fep-1L, + -0x1.4fcf4026afa2dceb8490ade22796p-1L, + -0x1.5815e8fa27047c8f42b5d9217244p-5L, + 0x1.5512320b43fbe5dfa771333518f7p-3L, + -0x1.59af103c340927bffdd44f954bfcp-5L, + -0x1.3b4af28483e210479657e5543366p-7L, + 0x1.d919c527f6070bfce9b29c2ace9cp-8L, + -0x1.317112ce35337def3556a18aa178p-10L, + -0x1.c364fe77a6f27677b985b1fa2e1dp-13L, + 0x1.0c8a7a19a3fd40fe1f7e867efe7bp-13L, + -0x1.51cf9f090b5dc398ba86305e3634p-16L, + -0x1.4e80f64c04a339740de06ca9fa4ap-20L, + 0x1.241ddc2aef2ec20e58b08f2fda17p-20L, +}; + +/* The location within the interval [1,2] where gamma has a minimum. + * Specified as the sum of two 128-bit values, for extra precision. */ +static const long double min_x_hi = 0x1.762d86356be3f6e1a9c8865e0a4fp+0L; +static const long double min_x_lo = 0x1.ac54d7d218de21303a7c60f08840p-118L; + +/* The actual minimum value that gamma takes at that location. + * Again specified as the sum of two 128-bit values. 
*/ +static const long double min_y_hi = 0x1.c56dc82a74aee8d8851566d40f32p-1L; +static const long double min_y_lo = 0x1.8ed98685742c353ce55e5794686fp-114L; + +/* Coefficients of the polynomial used in the tgamma_central() subroutine + * for computing gamma on the interval [1,min_x] */ +static const long double coeffs_central_neg[] = { + 0x1.b6c53f7377b83839c8a292e43b69p-2L, + 0x1.0bae9f40c7d09ed76e732045850ap-3L, + 0x1.4981175e14d04c3530e51d01c5fep-3L, + 0x1.79f77aaf032c948af3a9edbd2061p-4L, + 0x1.1e97bd10821095a5b79fbfdfa1a3p-4L, + 0x1.8071ce0935e4dcf0b33b0fbec7c1p-5L, + 0x1.0b44c2f92982f887b55ec36dfdb0p-5L, + 0x1.6df1de1e178ef72ca7bd63d40870p-6L, + 0x1.f63f502bde27e81c0f5e13479b43p-7L, + 0x1.57fd67d901f40ea011353ad89a0ap-7L, + 0x1.d7151376eed187eb753e2273cafcp-8L, + 0x1.427162b5c6ff1d904c71ef53e37cp-8L, + 0x1.b954b8c3a56cf93e49ef6538928ap-9L, + 0x1.2dff2ec26a3ae5cd3aaccae7a09ep-9L, + 0x1.9d35250d9b9378d9b59df734537ap-10L, + 0x1.1b2c0c48b9855a28f6dbd6fdff3cp-10L, + 0x1.7e0db39bb99cdb52b028d9359380p-11L, + 0x1.2164b5e1d364a0b5eaf97c436aa7p-11L, + 0x1.27521cf5fd24dcdf43524e6add11p-13L, + 0x1.06461d62243bf9a826b42349672fp-10L, + -0x1.2b852abead28209b4e0c756dc46ep-9L, + 0x1.be673c11a72c826115ec6d286c14p-8L, + -0x1.fd9ce330c215c31fcd3cb53c42ebp-7L, + 0x1.fa362bd2dc68f41abef2d8600acdp-6L, + -0x1.a21585b2f52f8b23855de8e452edp-5L, + 0x1.1f234431ed032052fc92e64e0493p-4L, + -0x1.40d332476ca0199c60cdae3f9132p-4L, + 0x1.1d45dc665d86012eba2eea199cefp-4L, + -0x1.8491016cdd08dc9be7ade9b5fef3p-5L, + 0x1.7e7e2fbc6d49ad484300d6add324p-6L, + -0x1.e63fe3f874a37276a8d7d8b705ecp-8L, + 0x1.30a2a73944f8c84998314d69c23fp-10L, +}; + +/* Coefficients of the polynomial used in the tgamma_central() subroutine + * for computing gamma on the interval [min_x,2] */ +static const long double coeffs_central_pos[] = { + 0x1.b6c53f7377b83839c8a292e22aa2p-2L, + -0x1.0bae9f40c7d09ed76e72e1c955dep-3L, + 0x1.4981175e14d04c3530ee5e1ecebcp-3L, + -0x1.79f77aaf032c948ac983d77f3e07p-4L, + 0x1.1e97bd10821095ab7dc94936cc11p-4L, + -0x1.8071ce0935e4d7edef8cbf2a1cf1p-5L, + 0x1.0b44c2f929837fafef7b5d9e80f1p-5L, + -0x1.6df1de1e175fe2a51faa25cddbb4p-6L, + 0x1.f63f502be57d11aed2cfe90843ffp-7L, + -0x1.57fd67d852f230015b9f64770273p-7L, + 0x1.d715138adc07e5fce81077070357p-8L, + -0x1.4271618e9fda8992a667adb15f4fp-8L, + 0x1.b954d15d9eb772e80fdd760672d7p-9L, + -0x1.2dfe391241d3cb79c8c15182843dp-9L, + 0x1.9d44396fcd48451c3ba924cee814p-10L, + -0x1.1ac195fb99739e341589e39803e6p-10L, + 0x1.82e46127b68f002770826e25f146p-11L, + -0x1.089dacd90d9f41493119ac178359p-11L, + 0x1.6993c007b20394a057d21f3d37f8p-12L, + -0x1.ec43a709f4446560c099dec8e31bp-13L, + 0x1.4ba36322f4074e9add9450f003cap-13L, + -0x1.b3f83a977965ca1b7937bf5b34cap-14L, + 0x1.10af346abc09cb25a6d9fe810b6ep-14L, + -0x1.38d8ea1188f242f50203edc395bdp-15L, + 0x1.39add987a948ec56f62b721a4475p-16L, + -0x1.02a4e141f286c8a967e2df9bc9adp-17L, + 0x1.433b50af22425f546e87113062d7p-19L, + -0x1.0c7b73cb0013f00aafc103e8e382p-21L, + 0x1.b852de313ec38da2297f6deaa6b4p-25L, +}; + +/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine + */ +static const long double pi = 0x1.921fb54442d18469898cc51701b8p+1L; diff --git a/math/tools/cos.sollya b/math/tools/cos.sollya index bd72d6b7482089d27bce02848b85b074e4b737b3..6690adfcbb9b8e57cfb5e11ca73fa52594a8443c 100644 --- a/math/tools/cos.sollya +++ b/math/tools/cos.sollya @@ -1,7 +1,7 @@ // polynomial for approximating cos(x) // // Copyright (c) 2019, Arm Limited. 
-// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 8; // polynomial degree a = -pi/4; // interval diff --git a/math/tools/exp.sollya b/math/tools/exp.sollya index b7a462cda5a4f8efb571c3ce3c296d42bb7d7e98..0668bdb5b3d30a088e09b38f099824e91368a237 100644 --- a/math/tools/exp.sollya +++ b/math/tools/exp.sollya @@ -1,7 +1,7 @@ // polynomial for approximating e^x // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 5; // poly degree N = 128; // table entries diff --git a/math/tools/exp2.sollya b/math/tools/exp2.sollya index e760769601d40009575d6b121e969e7c09749acb..bd0a42d6bbcbc0c66157c423d19a2a26970eecd5 100644 --- a/math/tools/exp2.sollya +++ b/math/tools/exp2.sollya @@ -1,7 +1,7 @@ // polynomial for approximating 2^x // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception // exp2f parameters deg = 3; // poly degree diff --git a/math/tools/log.sollya b/math/tools/log.sollya index 6df4db44b6f30133e38fa46a0824ea1356313fb1..5288f557292570e5f54ef2e80083407e51c82c41 100644 --- a/math/tools/log.sollya +++ b/math/tools/log.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 12; // poly degree // |log(1+x)| > 0x1p-4 outside the interval diff --git a/math/tools/log2.sollya b/math/tools/log2.sollya index 4a364c0f111ff6acebfb0782b472b1500218187e..85811be5d90c9bb5acdee32f5dcfe6d3a2989514 100644 --- a/math/tools/log2.sollya +++ b/math/tools/log2.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log2(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 11; // poly degree // |log2(1+x)| > 0x1p-4 outside the interval diff --git a/math/tools/log2_abs.sollya b/math/tools/log2_abs.sollya index 82c4dac26fa128d98f0905b12166efee28f0f180..d018ba0145d24d0d095b4393ff9263bcff89cdb0 100644 --- a/math/tools/log2_abs.sollya +++ b/math/tools/log2_abs.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log2(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 7; // poly degree // interval ~= 1/(2*N), where N is the table entries diff --git a/math/tools/log_abs.sollya b/math/tools/log_abs.sollya index a2ac190fc49702e362decc43aafa5240d15730f5..5f9bfe41a6830f5a4ae4028bba223c0186d9c9ee 100644 --- a/math/tools/log_abs.sollya +++ b/math/tools/log_abs.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 6; // poly degree // interval ~= 1/(2*N), where N is the table entries diff --git a/math/tools/plot.py b/math/tools/plot.py index 6c8b89ff284b5a6e220d940fa06d0d56549a21c6..a0fa023225606e0b02afadede19c028d64d85d15 100755 --- a/math/tools/plot.py +++ b/math/tools/plot.py @@ -3,7 +3,7 @@ # ULP error plot tool. # # Copyright (c) 2019, Arm Limited. 
-# SPDX-License-Identifier: MIT
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 import numpy as np
 import matplotlib.pyplot as plt
diff --git a/math/tools/remez.jl b/math/tools/remez.jl
index 2ff436f5287ff2d426413f6817a966ac82990439..1deab67d0660a946fac4e38d6394bae0aaeb7c98 100755
--- a/math/tools/remez.jl
+++ b/math/tools/remez.jl
@@ -4,7 +4,7 @@
 # remez.jl - implementation of the Remez algorithm for polynomial approximation
 #
 # Copyright (c) 2015-2019, Arm Limited.
-# SPDX-License-Identifier: MIT
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 import Base.\
 
diff --git a/math/tools/sin.sollya b/math/tools/sin.sollya
index a6e851145c119e9a425e6af308d01b4022be44f5..a19300019867873928cb384f28d7ede5a46155dc 100644
--- a/math/tools/sin.sollya
+++ b/math/tools/sin.sollya
@@ -1,7 +1,7 @@
 // polynomial for approximating sin(x)
 //
 // Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 
 deg = 7; // polynomial degree
 a = -pi/4; // interval
diff --git a/math/tools/tgamma128_gen.jl b/math/tools/tgamma128_gen.jl
new file mode 100644
index 0000000000000000000000000000000000000000..da76e8b9b84ba8f5e0290db2e38e551b26c7c332
--- /dev/null
+++ b/math/tools/tgamma128_gen.jl
@@ -0,0 +1,212 @@
+# -*- julia -*-
+#
+# Generate tgamma128.h, containing polynomials and constants used by
+# tgamma128.c.
+#
+# Copyright (c) 2006,2009,2023 Arm Limited.
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+# This Julia program depends on the 'Remez' and 'SpecialFunctions'
+# library packages. To install them, run this at the interactive Julia
+# prompt:
+#
+#     import Pkg; Pkg.add(["Remez", "SpecialFunctions"])
+#
+# Tested on Julia 1.4.1 (Ubuntu 20.04) and 1.9.0 (22.04).
+
+import Printf
+import Remez
+import SpecialFunctions
+
+# Round a BigFloat to 128-bit long double and format it as a C99 hex
+# float literal.
+function quadhex(x)
+    sign = " "
+    if x < 0
+        sign = "-"
+        x = -x
+    end
+
+    exponent = BigInt(floor(log2(x)))
+    exponent = max(exponent, -16382)
+    @assert(exponent <= 16383) # else overflow
+
+    x /= BigFloat(2)^exponent
+    @assert(1 <= x < 2)
+    x *= BigFloat(2)^112
+    mantissa = BigInt(round(x))
+
+    mantstr = string(mantissa, base=16, pad=29)
+    return Printf.@sprintf("%s0x%s.%sp%+dL", sign, mantstr[1], mantstr[2:end],
+                           exponent)
+end
+
+# Round a BigFloat to 128-bit long double and return it still as a
+# BigFloat. rnd < 0 rounds the mantissa down, rnd > 0 rounds it up,
+# and rnd == 0 rounds to nearest.
+function quadval(x, rnd=0)
+    sign = +1
+    if x.sign < 0
+        sign = -1
+        x = -x
+    end
+
+    exponent = BigInt(floor(log2(x)))
+    exponent = max(exponent, -16382)
+    @assert(exponent <= 16383) # else overflow
+
+    x /= BigFloat(2)^exponent
+    @assert(1 <= x < 2)
+    x *= BigFloat(2)^112
+    if rnd < 0
+        mantissa = floor(x)
+    elseif rnd > 0
+        mantissa = ceil(x)
+    else
+        mantissa = round(x)
+    end
+
+    return sign * mantissa * BigFloat(2)^(exponent - 112)
+end
+
+# Output an array of BigFloats as a C array declaration.
+function dumparray(a, name)
+    println("static const long double ", name, "[] = {")
+    for x in a
+        println("    ", quadhex(x), ",")
+    end
+    println("};")
+end
+
+print("/*
+ * Polynomial coefficients and other constants for tgamma128.c.
+ *
+ * Copyright (c) 2006,2009,2023 Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+")
+
+Base.MPFR.setprecision(512)
+
+e = exp(BigFloat(1))
+
+print("
+/* The largest positive value for which 128-bit tgamma does not overflow.
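+ * (That is, the largest x with lgamma(x) < 16384*log(2), so that gamma(x)
+ * stays below 2^16384; found by bisection in tools/tgamma128_gen.jl.)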
*/ +") +lo = BigFloat("1000") +hi = BigFloat("2000") +while true + global lo + global hi + global max_x + + mid = (lo + hi) / 2 + if mid == lo || mid == hi + max_x = mid + break + end + if SpecialFunctions.logabsgamma(mid)[1] < 16384 * log(BigFloat(2)) + lo = mid + else + hi = mid + end +end +max_x = quadval(max_x, -1) +println("static const long double max_x = ", quadhex(max_x), ";") + +print(" +/* Coefficients of the polynomial used in the tgamma_large() subroutine */ +") +N, D, E, X = Remez.ratfn_minimax( + x -> x==0 ? sqrt(BigFloat(2)*pi/e) : + exp(SpecialFunctions.logabsgamma(1/x)[1] + + (1/x-0.5)*(1+log(x))), + (0, 1/BigFloat(8)), + 24, 0, + (x, y) -> 1/y +) +dumparray(N, "coeffs_large") + +print(" +/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */ +") +N, D, E, X = Remez.ratfn_minimax( + x -> x==0 ? 1 : 1/(x*SpecialFunctions.gamma(x)), + (0, 1/BigFloat(32)), + 13, 0, +) +dumparray(N, "coeffs_tiny") + +print(" +/* The location within the interval [1,2] where gamma has a minimum. + * Specified as the sum of two 128-bit values, for extra precision. */ +") +lo = BigFloat("1.4") +hi = BigFloat("1.5") +while true + global lo + global hi + global min_x + + mid = (lo + hi) / 2 + if mid == lo || mid == hi + min_x = mid + break + end + if SpecialFunctions.digamma(mid) < 0 + lo = mid + else + hi = mid + end +end +min_x_hi = quadval(min_x, -1) +println("static const long double min_x_hi = ", quadhex(min_x_hi), ";") +println("static const long double min_x_lo = ", quadhex(min_x - min_x_hi), ";") + +print(" +/* The actual minimum value that gamma takes at that location. + * Again specified as the sum of two 128-bit values. */ +") +min_y = SpecialFunctions.gamma(min_x) +min_y_hi = quadval(min_y, -1) +println("static const long double min_y_hi = ", quadhex(min_y_hi), ";") +println("static const long double min_y_lo = ", quadhex(min_y - min_y_hi), ";") + +function taylor_bodge(x) + # Taylor series generated by Wolfram Alpha for (gamma(min_x+x)-min_y)/x^2. + # Used in the Remez calls below for x values very near the origin, to avoid + # significance loss problems when trying to compute it directly via that + # formula (even in MPFR's extra precision). + return BigFloat("0.428486815855585429730209907810650582960483696962660010556335457558784421896667728014324097132413696263704801646004585959298743677879606168187061990204432200")+x*(-BigFloat("0.130704158939785761928008749242671025181542078105370084716141350308119418619652583986015464395882363802104154017741656168641240436089858504560718773026275797")+x*(BigFloat("0.160890753325112844190519489594363387594505844658437718135952967735294789599989664428071656484587979507034160383271974554122934842441540146372016567834062876")+x*(-BigFloat("0.092277030213334350126864106458600575084335085690780082222880945224248438672595248111704471182201673989215223667543694847795410779036800385804729955729659506")))) +end + +print(" +/* Coefficients of the polynomial used in the tgamma_central() subroutine + * for computing gamma on the interval [1,min_x] */ +") +N, D, E, X = Remez.ratfn_minimax( + x -> x < BigFloat(0x1p-64) ? taylor_bodge(-x) : + (SpecialFunctions.gamma(min_x - x) - min_y) / (x*x), + (0, min_x - 1), + 31, 0, + (x, y) -> x^2, +) +dumparray(N, "coeffs_central_neg") + +print(" +/* Coefficients of the polynomial used in the tgamma_central() subroutine + * for computing gamma on the interval [min_x,2] */ +") +N, D, E, X = Remez.ratfn_minimax( + x -> x < BigFloat(0x1p-64) ? 
taylor_bodge(x) : + (SpecialFunctions.gamma(min_x + x) - min_y) / (x*x), + (0, 2 - min_x), + 28, 0, + (x, y) -> x^2, +) +dumparray(N, "coeffs_central_pos") + +print(" +/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine + */ +") +println("static const long double pi = ", quadhex(BigFloat(pi)), ";") diff --git a/math/tools/v_exp.sollya b/math/tools/v_exp.sollya index c0abb63fb642a58ca023eb242a010b1a418e15fe..5fa7de7435a9863d3b9511cdff140977165a8333 100644 --- a/math/tools/v_exp.sollya +++ b/math/tools/v_exp.sollya @@ -1,7 +1,7 @@ // polynomial for approximating e^x // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 4; // poly degree N = 128; // table entries diff --git a/math/tools/v_log.sollya b/math/tools/v_log.sollya index cc3d2c4ae72a1b860313625a9771e2fc1e19e93b..d982524eb920f0e581fd2b9221d493364a26bd37 100644 --- a/math/tools/v_log.sollya +++ b/math/tools/v_log.sollya @@ -1,7 +1,7 @@ // polynomial used for __v_log(x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 6; // poly degree a = -0x1.fc1p-9; diff --git a/math/tools/v_sin.sollya b/math/tools/v_sin.sollya index 65cc9957c624a6fd09a32762d7f4d7296e0b8319..63b9d65a1ac35a14b98a8dcab6a00637d35db4fb 100644 --- a/math/tools/v_sin.sollya +++ b/math/tools/v_sin.sollya @@ -1,7 +1,7 @@ // polynomial for approximating sin(x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 15; // polynomial degree a = -pi/2; // interval diff --git a/math/v_cos.c b/math/v_cos.c deleted file mode 100644 index 20ba6bd0d0d9a4a5e98f56f0e374f22a88df2f7a..0000000000000000000000000000000000000000 --- a/math/v_cos.c +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Double-precision vector cos function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const double Poly[] = { -/* worst-case error is 3.5 ulp. - abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. */ --0x1.9f4a9c8b21dc9p-41, - 0x1.60e88a10163f2p-33, --0x1.ae6361b7254e7p-26, - 0x1.71de382e8d62bp-19, --0x1.a01a019aeb4ffp-13, - 0x1.111111110b25ep-7, --0x1.55555555554c3p-3, -}; - -#define C7 v_f64 (Poly[0]) -#define C6 v_f64 (Poly[1]) -#define C5 v_f64 (Poly[2]) -#define C4 v_f64 (Poly[3]) -#define C3 v_f64 (Poly[4]) -#define C2 v_f64 (Poly[5]) -#define C1 v_f64 (Poly[6]) - -#define InvPi v_f64 (0x1.45f306dc9c883p-2) -#define HalfPi v_f64 (0x1.921fb54442d18p+0) -#define Pi1 v_f64 (0x1.921fb54442d18p+1) -#define Pi2 v_f64 (0x1.1a62633145c06p-53) -#define Pi3 v_f64 (0x1.c1cd129024e09p-106) -#define Shift v_f64 (0x1.8p52) -#define RangeVal v_f64 (0x1p23) -#define AbsMask v_u64 (0x7fffffffffffffff) - -VPCS_ATTR -__attribute__ ((noinline)) static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - return v_call_f64 (cos, x, y, cmp); -} - -VPCS_ATTR -v_f64_t -V_NAME(cos) (v_f64_t x) -{ - v_f64_t n, r, r2, y; - v_u64_t odd, cmp; - - r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask); - cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal)); - - /* n = rint((|x|+pi/2)/pi) - 0.5. */ - n = v_fma_f64 (InvPi, r + HalfPi, Shift); - odd = v_as_u64_f64 (n) << 63; - n -= Shift; - n -= v_f64 (0.5); - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). 
*/ - r = v_fma_f64 (-Pi1, n, r); - r = v_fma_f64 (-Pi2, n, r); - r = v_fma_f64 (-Pi3, n, r); - - /* sin(r) poly approx. */ - r2 = r * r; - y = v_fma_f64 (C7, r2, C6); - y = v_fma_f64 (y, r2, C5); - y = v_fma_f64 (y, r2, C4); - y = v_fma_f64 (y, r2, C3); - y = v_fma_f64 (y, r2, C2); - y = v_fma_f64 (y, r2, C1); - y = v_fma_f64 (y * r2, r, r); - - /* sign. */ - y = v_as_f64_u64 (v_as_u64_f64 (y) ^ odd); - - if (unlikely (v_any_u64 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/math/v_cosf.c b/math/v_cosf.c deleted file mode 100644 index 150294b8845e735c06423bdbc9e78fe6bb567b06..0000000000000000000000000000000000000000 --- a/math/v_cosf.c +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Single-precision vector cos function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* 1.886 ulp error */ - 0x1.5b2e76p-19f, - -0x1.9f42eap-13f, - 0x1.110df4p-7f, - -0x1.555548p-3f, -}; -#define Pi1 v_f32 (0x1.921fb6p+1f) -#define Pi2 v_f32 (-0x1.777a5cp-24f) -#define Pi3 v_f32 (-0x1.ee59dap-49f) -#define A3 v_f32 (Poly[3]) -#define A5 v_f32 (Poly[2]) -#define A7 v_f32 (Poly[1]) -#define A9 v_f32 (Poly[0]) -#define RangeVal v_f32 (0x1p20f) -#define InvPi v_f32 (0x1.45f306p-2f) -#define Shift v_f32 (0x1.8p+23f) -#define AbsMask v_u32 (0x7fffffff) -#define HalfPi v_f32 (0x1.921fb6p0f) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) -{ - /* Fall back to scalar code. */ - return v_call_f32 (cosf, x, y, cmp); -} - -VPCS_ATTR -v_f32_t -V_NAME(cosf) (v_f32_t x) -{ - v_f32_t n, r, r2, y; - v_u32_t odd, cmp; - - r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask); - cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal)); - - /* n = rint((|x|+pi/2)/pi) - 0.5 */ - n = v_fma_f32 (InvPi, r + HalfPi, Shift); - odd = v_as_u32_f32 (n) << 31; - n -= Shift; - n -= v_f32 (0.5f); - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ - r = v_fma_f32 (-Pi1, n, r); - r = v_fma_f32 (-Pi2, n, r); - r = v_fma_f32 (-Pi3, n, r); - - /* y = sin(r) */ - r2 = r * r; - y = v_fma_f32 (A9, r2, A7); - y = v_fma_f32 (y, r2, A5); - y = v_fma_f32 (y, r2, A3); - y = v_fma_f32 (y * r2, r, r); - - /* sign fix */ - y = v_as_f32_u32 (v_as_u32_f32 (y) ^ odd); - - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/math/v_exp.c b/math/v_exp.c deleted file mode 100644 index e459d53fddd2509f6f8ddb69328615e8cc80b2e8..0000000000000000000000000000000000000000 --- a/math/v_exp.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Double-precision vector e^x function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED -#include "v_exp.h" - -#if V_EXP_TABLE_BITS == 7 -/* maxerr: 1.88 +0.5 ulp - rel error: 1.4337*2^-53 - abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */ -#define C1 v_f64 (0x1.ffffffffffd43p-2) -#define C2 v_f64 (0x1.55555c75adbb2p-3) -#define C3 v_f64 (0x1.55555da646206p-5) -#define InvLn2 v_f64 (0x1.71547652b82fep7) /* N/ln2. */ -#define Ln2hi v_f64 (0x1.62e42fefa39efp-8) /* ln2/N. */ -#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-63) -#elif V_EXP_TABLE_BITS == 8 -/* maxerr: 0.54 +0.5 ulp - rel error: 1.4318*2^-58 - abs error: 1.4299*2^-58 in [ -ln2/512, ln2/512 ]. 
*/ -#define C1 v_f64 (0x1.fffffffffffd4p-2) -#define C2 v_f64 (0x1.5555571d6b68cp-3) -#define C3 v_f64 (0x1.5555576a59599p-5) -#define InvLn2 v_f64 (0x1.71547652b82fep8) -#define Ln2hi v_f64 (0x1.62e42fefa39efp-9) -#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-64) -#endif - -#define N (1 << V_EXP_TABLE_BITS) -#define Tab __v_exp_data -#define IndexMask v_u64 (N - 1) -#define Shift v_f64 (0x1.8p+52) -#define Thres v_f64 (704.0) - -VPCS_ATTR -static v_f64_t -specialcase (v_f64_t s, v_f64_t y, v_f64_t n) -{ - v_f64_t absn = v_abs_f64 (n); - - /* 2^(n/N) may overflow, break it up into s1*s2. */ - v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000); - v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b); - v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b); - v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N)); - v_f64_t r1 = s1 * s1; - v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1; - return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0))); -} - -VPCS_ATTR -v_f64_t -V_NAME(exp) (v_f64_t x) -{ - v_f64_t n, r, r2, s, y, z; - v_u64_t cmp, u, e, i; - - cmp = v_cond_u64 (v_abs_f64 (x) > Thres); - - /* n = round(x/(ln2/N)). */ - z = v_fma_f64 (x, InvLn2, Shift); - u = v_as_u64_f64 (z); - n = z - Shift; - - /* r = x - n*ln2/N. */ - r = x; - r = v_fma_f64 (-Ln2hi, n, r); - r = v_fma_f64 (-Ln2lo, n, r); - - e = u << (52 - V_EXP_TABLE_BITS); - i = u & IndexMask; - - /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ - r2 = r * r; - y = v_fma_f64 (C2, r, C1); - y = v_fma_f64 (C3, r2, y); - y = v_fma_f64 (y, r2, r); - - /* s = 2^(n/N). */ - u = v_lookup_u64 (Tab, i); - s = v_as_f64_u64 (u + e); - - if (unlikely (v_any_u64 (cmp))) - return specialcase (s, y, n); - return v_fma_f64 (y, s, s); -} -VPCS_ALIAS -#endif diff --git a/math/v_exp.h b/math/v_exp.h deleted file mode 100644 index 305da19c0a53924f18007df499b0af2747b1cfa2..0000000000000000000000000000000000000000 --- a/math/v_exp.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Declarations for double-precision e^x vector function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "v_math.h" -#if WANT_VMATH - -#define V_EXP_TABLE_BITS 7 - -extern const u64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN; -#endif diff --git a/math/v_exp2f.c b/math/v_exp2f.c deleted file mode 100644 index e3ea5af3414dc848da0a12659444ff2626f6cfcc..0000000000000000000000000000000000000000 --- a/math/v_exp2f.c +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Single-precision vector 2^x function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* maxerr: 1.962 ulp. */ - 0x1.59977ap-10f, - 0x1.3ce9e4p-7f, - 0x1.c6bd32p-5f, - 0x1.ebf9bcp-3f, - 0x1.62e422p-1f, -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) - -#define Shift v_f32 (0x1.8p23f) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) -{ - /* 2^n may overflow, break it up into s1*s2. 
*/ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); - v_u32_t r2 = v_as_u32_f32 (s1 * s1); - v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); - /* Similar to r1 but avoids double rounding in the subnormal range. */ - v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); - return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); -} - -VPCS_ATTR -v_f32_t -V_NAME(exp2f) (v_f32_t x) -{ - v_f32_t n, r, r2, scale, p, q, poly, absn; - v_u32_t cmp, e; - - /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = n + r, with r in [-1/2, 1/2]. */ -#if 0 - v_f32_t z; - z = x + Shift; - n = z - Shift; - r = x - n; - e = v_as_u32_f32 (z) << 23; -#else - n = v_round_f32 (x); - r = x - n; - e = v_as_u32_s32 (v_round_s32 (x)) << 23; -#endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); - r2 = r * r; - p = v_fma_f32 (C0, r, C1); - q = v_fma_f32 (C2, r, C3); - q = v_fma_f32 (p, r2, q); - p = C4 * r; - poly = v_fma_f32 (q, r2, p); - if (unlikely (v_any_u32 (cmp))) - return specialcase (poly, n, e, absn, cmp, scale); - return v_fma_f32 (poly, scale, scale); -} -VPCS_ALIAS -#endif diff --git a/math/v_exp_data.c b/math/v_exp_data.c deleted file mode 100644 index 365355497e95026692d683d656b8b286e3594446..0000000000000000000000000000000000000000 --- a/math/v_exp_data.c +++ /dev/null @@ -1,403 +0,0 @@ -/* - * Lookup table for double-precision e^x vector function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "v_exp.h" -#if WANT_VMATH - -#define N (1 << V_EXP_TABLE_BITS) - -/* 2^(j/N), j=0..N. 
*/ -const u64_t __v_exp_data[] = { -#if N == 128 -0x3ff0000000000000, -0x3feff63da9fb3335, -0x3fefec9a3e778061, -0x3fefe315e86e7f85, -0x3fefd9b0d3158574, -0x3fefd06b29ddf6de, -0x3fefc74518759bc8, -0x3fefbe3ecac6f383, -0x3fefb5586cf9890f, -0x3fefac922b7247f7, -0x3fefa3ec32d3d1a2, -0x3fef9b66affed31b, -0x3fef9301d0125b51, -0x3fef8abdc06c31cc, -0x3fef829aaea92de0, -0x3fef7a98c8a58e51, -0x3fef72b83c7d517b, -0x3fef6af9388c8dea, -0x3fef635beb6fcb75, -0x3fef5be084045cd4, -0x3fef54873168b9aa, -0x3fef4d5022fcd91d, -0x3fef463b88628cd6, -0x3fef3f49917ddc96, -0x3fef387a6e756238, -0x3fef31ce4fb2a63f, -0x3fef2b4565e27cdd, -0x3fef24dfe1f56381, -0x3fef1e9df51fdee1, -0x3fef187fd0dad990, -0x3fef1285a6e4030b, -0x3fef0cafa93e2f56, -0x3fef06fe0a31b715, -0x3fef0170fc4cd831, -0x3feefc08b26416ff, -0x3feef6c55f929ff1, -0x3feef1a7373aa9cb, -0x3feeecae6d05d866, -0x3feee7db34e59ff7, -0x3feee32dc313a8e5, -0x3feedea64c123422, -0x3feeda4504ac801c, -0x3feed60a21f72e2a, -0x3feed1f5d950a897, -0x3feece086061892d, -0x3feeca41ed1d0057, -0x3feec6a2b5c13cd0, -0x3feec32af0d7d3de, -0x3feebfdad5362a27, -0x3feebcb299fddd0d, -0x3feeb9b2769d2ca7, -0x3feeb6daa2cf6642, -0x3feeb42b569d4f82, -0x3feeb1a4ca5d920f, -0x3feeaf4736b527da, -0x3feead12d497c7fd, -0x3feeab07dd485429, -0x3feea9268a5946b7, -0x3feea76f15ad2148, -0x3feea5e1b976dc09, -0x3feea47eb03a5585, -0x3feea34634ccc320, -0x3feea23882552225, -0x3feea155d44ca973, -0x3feea09e667f3bcd, -0x3feea012750bdabf, -0x3fee9fb23c651a2f, -0x3fee9f7df9519484, -0x3fee9f75e8ec5f74, -0x3fee9f9a48a58174, -0x3fee9feb564267c9, -0x3feea0694fde5d3f, -0x3feea11473eb0187, -0x3feea1ed0130c132, -0x3feea2f336cf4e62, -0x3feea427543e1a12, -0x3feea589994cce13, -0x3feea71a4623c7ad, -0x3feea8d99b4492ed, -0x3feeaac7d98a6699, -0x3feeace5422aa0db, -0x3feeaf3216b5448c, -0x3feeb1ae99157736, -0x3feeb45b0b91ffc6, -0x3feeb737b0cdc5e5, -0x3feeba44cbc8520f, -0x3feebd829fde4e50, -0x3feec0f170ca07ba, -0x3feec49182a3f090, -0x3feec86319e32323, -0x3feecc667b5de565, -0x3feed09bec4a2d33, -0x3feed503b23e255d, -0x3feed99e1330b358, -0x3feede6b5579fdbf, -0x3feee36bbfd3f37a, -0x3feee89f995ad3ad, -0x3feeee07298db666, -0x3feef3a2b84f15fb, -0x3feef9728de5593a, -0x3feeff76f2fb5e47, -0x3fef05b030a1064a, -0x3fef0c1e904bc1d2, -0x3fef12c25bd71e09, -0x3fef199bdd85529c, -0x3fef20ab5fffd07a, -0x3fef27f12e57d14b, -0x3fef2f6d9406e7b5, -0x3fef3720dcef9069, -0x3fef3f0b555dc3fa, -0x3fef472d4a07897c, -0x3fef4f87080d89f2, -0x3fef5818dcfba487, -0x3fef60e316c98398, -0x3fef69e603db3285, -0x3fef7321f301b460, -0x3fef7c97337b9b5f, -0x3fef864614f5a129, -0x3fef902ee78b3ff6, -0x3fef9a51fbc74c83, -0x3fefa4afa2a490da, -0x3fefaf482d8e67f1, -0x3fefba1bee615a27, -0x3fefc52b376bba97, -0x3fefd0765b6e4540, -0x3fefdbfdad9cbe14, -0x3fefe7c1819e90d8, -0x3feff3c22b8f71f1, -#elif N == 256 -0x3ff0000000000000, -0x3feffb1afa5abcbf, -0x3feff63da9fb3335, -0x3feff168143b0281, -0x3fefec9a3e778061, -0x3fefe7d42e11bbcc, -0x3fefe315e86e7f85, -0x3fefde5f72f654b1, -0x3fefd9b0d3158574, -0x3fefd50a0e3c1f89, -0x3fefd06b29ddf6de, -0x3fefcbd42b72a836, -0x3fefc74518759bc8, -0x3fefc2bdf66607e0, -0x3fefbe3ecac6f383, -0x3fefb9c79b1f3919, -0x3fefb5586cf9890f, -0x3fefb0f145e46c85, -0x3fefac922b7247f7, -0x3fefa83b23395dec, -0x3fefa3ec32d3d1a2, -0x3fef9fa55fdfa9c5, -0x3fef9b66affed31b, -0x3fef973028d7233e, -0x3fef9301d0125b51, -0x3fef8edbab5e2ab6, -0x3fef8abdc06c31cc, -0x3fef86a814f204ab, -0x3fef829aaea92de0, -0x3fef7e95934f312e, -0x3fef7a98c8a58e51, -0x3fef76a45471c3c2, -0x3fef72b83c7d517b, -0x3fef6ed48695bbc0, -0x3fef6af9388c8dea, -0x3fef672658375d2f, -0x3fef635beb6fcb75, -0x3fef5f99f8138a1c, 
-0x3fef5be084045cd4, -0x3fef582f95281c6b, -0x3fef54873168b9aa, -0x3fef50e75eb44027, -0x3fef4d5022fcd91d, -0x3fef49c18438ce4d, -0x3fef463b88628cd6, -0x3fef42be3578a819, -0x3fef3f49917ddc96, -0x3fef3bdda27912d1, -0x3fef387a6e756238, -0x3fef351ffb82140a, -0x3fef31ce4fb2a63f, -0x3fef2e85711ece75, -0x3fef2b4565e27cdd, -0x3fef280e341ddf29, -0x3fef24dfe1f56381, -0x3fef21ba7591bb70, -0x3fef1e9df51fdee1, -0x3fef1b8a66d10f13, -0x3fef187fd0dad990, -0x3fef157e39771b2f, -0x3fef1285a6e4030b, -0x3fef0f961f641589, -0x3fef0cafa93e2f56, -0x3fef09d24abd886b, -0x3fef06fe0a31b715, -0x3fef0432edeeb2fd, -0x3fef0170fc4cd831, -0x3feefeb83ba8ea32, -0x3feefc08b26416ff, -0x3feef96266e3fa2d, -0x3feef6c55f929ff1, -0x3feef431a2de883b, -0x3feef1a7373aa9cb, -0x3feeef26231e754a, -0x3feeecae6d05d866, -0x3feeea401b7140ef, -0x3feee7db34e59ff7, -0x3feee57fbfec6cf4, -0x3feee32dc313a8e5, -0x3feee0e544ede173, -0x3feedea64c123422, -0x3feedc70df1c5175, -0x3feeda4504ac801c, -0x3feed822c367a024, -0x3feed60a21f72e2a, -0x3feed3fb2709468a, -0x3feed1f5d950a897, -0x3feecffa3f84b9d4, -0x3feece086061892d, -0x3feecc2042a7d232, -0x3feeca41ed1d0057, -0x3feec86d668b3237, -0x3feec6a2b5c13cd0, -0x3feec4e1e192aed2, -0x3feec32af0d7d3de, -0x3feec17dea6db7d7, -0x3feebfdad5362a27, -0x3feebe41b817c114, -0x3feebcb299fddd0d, -0x3feebb2d81d8abff, -0x3feeb9b2769d2ca7, -0x3feeb8417f4531ee, -0x3feeb6daa2cf6642, -0x3feeb57de83f4eef, -0x3feeb42b569d4f82, -0x3feeb2e2f4f6ad27, -0x3feeb1a4ca5d920f, -0x3feeb070dde910d2, -0x3feeaf4736b527da, -0x3feeae27dbe2c4cf, -0x3feead12d497c7fd, -0x3feeac0827ff07cc, -0x3feeab07dd485429, -0x3feeaa11fba87a03, -0x3feea9268a5946b7, -0x3feea84590998b93, -0x3feea76f15ad2148, -0x3feea6a320dceb71, -0x3feea5e1b976dc09, -0x3feea52ae6cdf6f4, -0x3feea47eb03a5585, -0x3feea3dd1d1929fd, -0x3feea34634ccc320, -0x3feea2b9febc8fb7, -0x3feea23882552225, -0x3feea1c1c70833f6, -0x3feea155d44ca973, -0x3feea0f4b19e9538, -0x3feea09e667f3bcd, -0x3feea052fa75173e, -0x3feea012750bdabf, -0x3fee9fdcddd47645, -0x3fee9fb23c651a2f, -0x3fee9f9298593ae5, -0x3fee9f7df9519484, -0x3fee9f7466f42e87, -0x3fee9f75e8ec5f74, -0x3fee9f8286ead08a, -0x3fee9f9a48a58174, -0x3fee9fbd35d7cbfd, -0x3fee9feb564267c9, -0x3feea024b1ab6e09, -0x3feea0694fde5d3f, -0x3feea0b938ac1cf6, -0x3feea11473eb0187, -0x3feea17b0976cfdb, -0x3feea1ed0130c132, -0x3feea26a62ff86f0, -0x3feea2f336cf4e62, -0x3feea3878491c491, -0x3feea427543e1a12, -0x3feea4d2add106d9, -0x3feea589994cce13, -0x3feea64c1eb941f7, -0x3feea71a4623c7ad, -0x3feea7f4179f5b21, -0x3feea8d99b4492ed, -0x3feea9cad931a436, -0x3feeaac7d98a6699, -0x3feeabd0a478580f, -0x3feeace5422aa0db, -0x3feeae05bad61778, -0x3feeaf3216b5448c, -0x3feeb06a5e0866d9, -0x3feeb1ae99157736, -0x3feeb2fed0282c8a, -0x3feeb45b0b91ffc6, -0x3feeb5c353aa2fe2, -0x3feeb737b0cdc5e5, -0x3feeb8b82b5f98e5, -0x3feeba44cbc8520f, -0x3feebbdd9a7670b3, -0x3feebd829fde4e50, -0x3feebf33e47a22a2, -0x3feec0f170ca07ba, -0x3feec2bb4d53fe0d, -0x3feec49182a3f090, -0x3feec674194bb8d5, -0x3feec86319e32323, -0x3feeca5e8d07f29e, -0x3feecc667b5de565, -0x3feece7aed8eb8bb, -0x3feed09bec4a2d33, -0x3feed2c980460ad8, -0x3feed503b23e255d, -0x3feed74a8af46052, -0x3feed99e1330b358, -0x3feedbfe53c12e59, -0x3feede6b5579fdbf, -0x3feee0e521356eba, -0x3feee36bbfd3f37a, -0x3feee5ff3a3c2774, -0x3feee89f995ad3ad, -0x3feeeb4ce622f2ff, -0x3feeee07298db666, -0x3feef0ce6c9a8952, -0x3feef3a2b84f15fb, -0x3feef68415b749b1, -0x3feef9728de5593a, -0x3feefc6e29f1c52a, -0x3feeff76f2fb5e47, -0x3fef028cf22749e4, -0x3fef05b030a1064a, -0x3fef08e0b79a6f1f, -0x3fef0c1e904bc1d2, -0x3fef0f69c3f3a207, -0x3fef12c25bd71e09, 
-0x3fef16286141b33d, -0x3fef199bdd85529c, -0x3fef1d1cd9fa652c, -0x3fef20ab5fffd07a, -0x3fef244778fafb22, -0x3fef27f12e57d14b, -0x3fef2ba88988c933, -0x3fef2f6d9406e7b5, -0x3fef33405751c4db, -0x3fef3720dcef9069, -0x3fef3b0f2e6d1675, -0x3fef3f0b555dc3fa, -0x3fef43155b5bab74, -0x3fef472d4a07897c, -0x3fef4b532b08c968, -0x3fef4f87080d89f2, -0x3fef53c8eacaa1d6, -0x3fef5818dcfba487, -0x3fef5c76e862e6d3, -0x3fef60e316c98398, -0x3fef655d71ff6075, -0x3fef69e603db3285, -0x3fef6e7cd63a8315, -0x3fef7321f301b460, -0x3fef77d5641c0658, -0x3fef7c97337b9b5f, -0x3fef81676b197d17, -0x3fef864614f5a129, -0x3fef8b333b16ee12, -0x3fef902ee78b3ff6, -0x3fef953924676d76, -0x3fef9a51fbc74c83, -0x3fef9f7977cdb740, -0x3fefa4afa2a490da, -0x3fefa9f4867cca6e, -0x3fefaf482d8e67f1, -0x3fefb4aaa2188510, -0x3fefba1bee615a27, -0x3fefbf9c1cb6412a, -0x3fefc52b376bba97, -0x3fefcac948dd7274, -0x3fefd0765b6e4540, -0x3fefd632798844f8, -0x3fefdbfdad9cbe14, -0x3fefe1d802243c89, -0x3fefe7c1819e90d8, -0x3fefedba3692d514, -0x3feff3c22b8f71f1, -0x3feff9d96b2a23d9, -#endif -}; -#endif diff --git a/math/v_expf.c b/math/v_expf.c deleted file mode 100644 index d403e00534f068d81edefca1f48b6800cf7ab363..0000000000000000000000000000000000000000 --- a/math/v_expf.c +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Single-precision vector e^x function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* maxerr: 1.45358 +0.5 ulp. */ - 0x1.0e4020p-7f, - 0x1.573e2ep-5f, - 0x1.555e66p-3f, - 0x1.fffdb6p-2f, - 0x1.ffffecp-1f, -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) - -#define Shift v_f32 (0x1.8p23f) -#define InvLn2 v_f32 (0x1.715476p+0f) -#define Ln2hi v_f32 (0x1.62e4p-1f) -#define Ln2lo v_f32 (0x1.7f7d1cp-20f) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) -{ - /* 2^n may overflow, break it up into s1*s2. */ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); - v_u32_t r2 = v_as_u32_f32 (s1 * s1); - v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); - /* Similar to r1 but avoids double rounding in the subnormal range. */ - v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); - return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); -} - -VPCS_ATTR -v_f32_t -V_NAME(expf) (v_f32_t x) -{ - v_f32_t n, r, r2, scale, p, q, poly, absn, z; - v_u32_t cmp, e; - - /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/ -#if 1 - z = v_fma_f32 (x, InvLn2, Shift); - n = z - Shift; - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_f32 (z) << 23; -#else - z = x * InvLn2; - n = v_round_f32 (z); - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_s32 (v_round_s32 (z)) << 23; -#endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); - r2 = r * r; - p = v_fma_f32 (C0, r, C1); - q = v_fma_f32 (C2, r, C3); - q = v_fma_f32 (p, r2, q); - p = C4 * r; - poly = v_fma_f32 (q, r2, p); - if (unlikely (v_any_u32 (cmp))) - return specialcase (poly, n, e, absn, cmp, scale); - return v_fma_f32 (poly, scale, scale); -} -VPCS_ALIAS -#endif diff --git a/math/v_log.c b/math/v_log.c deleted file mode 100644 index d84c740d2b6b519a5572b41a2e4e91aba27b0477..0000000000000000000000000000000000000000 --- a/math/v_log.c +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Double-precision vector log(x) function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#include "v_log.h" -#if V_SUPPORTED - -/* Worst-case error: 1.17 + 0.5 ulp. */ - -static const f64_t Poly[] = { - /* rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ - -0x1.ffffffffffff7p-2, - 0x1.55555555170d4p-2, - -0x1.0000000399c27p-2, - 0x1.999b2e90e94cap-3, - -0x1.554e550bd501ep-3, -}; - -#define A0 v_f64 (Poly[0]) -#define A1 v_f64 (Poly[1]) -#define A2 v_f64 (Poly[2]) -#define A3 v_f64 (Poly[3]) -#define A4 v_f64 (Poly[4]) -#define Ln2 v_f64 (0x1.62e42fefa39efp-1) -#define N (1 << V_LOG_TABLE_BITS) -#define OFF v_u64 (0x3fe6900900000000) - -struct entry -{ - v_f64_t invc; - v_f64_t logc; -}; - -static inline struct entry -lookup (v_u64_t i) -{ - struct entry e; -#ifdef SCALAR - e.invc = __v_log_data[i].invc; - e.logc = __v_log_data[i].logc; -#else - e.invc[0] = __v_log_data[i[0]].invc; - e.logc[0] = __v_log_data[i[0]].logc; - e.invc[1] = __v_log_data[i[1]].invc; - e.logc[1] = __v_log_data[i[1]].logc; -#endif - return e; -} - -VPCS_ATTR -__attribute__ ((noinline)) static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - return v_call_f64 (log, x, y, cmp); -} - -VPCS_ATTR -v_f64_t -V_NAME(log) (v_f64_t x) -{ - v_f64_t z, r, r2, p, y, kd, hi; - v_u64_t ix, iz, tmp, top, i, cmp; - v_s64_t k; - struct entry e; - - ix = v_as_u64_f64 (x); - top = ix >> 48; - cmp = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010)); - - /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. - The range is split into N subintervals. - The ith subinterval contains z and c is near its center. */ - tmp = ix - OFF; - i = (tmp >> (52 - V_LOG_TABLE_BITS)) % N; - k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift */ - iz = ix - (tmp & v_u64 (0xfffULL << 52)); - z = v_as_f64_u64 (iz); - e = lookup (i); - - /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ - r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); - kd = v_to_f64_s64 (k); - - /* hi = r + log(c) + k*Ln2. */ - hi = v_fma_f64 (kd, Ln2, e.logc + r); - /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
*/ - r2 = r * r; - y = v_fma_f64 (A3, r, A2); - p = v_fma_f64 (A1, r, A0); - y = v_fma_f64 (A4, r2, y); - y = v_fma_f64 (y, r2, p); - y = v_fma_f64 (y, r2, hi); - - if (unlikely (v_any_u64 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/math/v_log.h b/math/v_log.h deleted file mode 100644 index bcc2fa6fa9305a936ae6b6e25997a27c2c4ab4e5..0000000000000000000000000000000000000000 --- a/math/v_log.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Declarations for double-precision log(x) vector function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "v_math.h" -#if WANT_VMATH - -#define V_LOG_TABLE_BITS 7 - -extern const struct v_log_data -{ - f64_t invc; - f64_t logc; -} __v_log_data[1 << V_LOG_TABLE_BITS] HIDDEN; -#endif diff --git a/math/v_log_data.c b/math/v_log_data.c deleted file mode 100644 index 97ee5b09c6a9c2b6b100f444fa16e7dd801e5c5b..0000000000000000000000000000000000000000 --- a/math/v_log_data.c +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Lookup table for double-precision log(x) vector function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "v_log.h" -#if WANT_VMATH - -#define N (1 << V_LOG_TABLE_BITS) - -/* Algorithm: - - x = 2^k z - log(x) = k ln2 + log(c) + poly(z/c - 1) - -where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,N=128) -and log(c) and 1/c for the ith subinterval comes from a lookup table: - - tab[i].invc = 1/c - tab[i].logc = (double)log(c) - -where c is near the center of the subinterval and is chosen by trying several -floating point invc candidates around 1/center and selecting one for which -the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval -that contains 1 and the previous one got tweaked to avoid cancellation. 
*/ -const struct v_log_data __v_log_data[N] = { -{0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2}, -{0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2}, -{0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2}, -{0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2}, -{0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2}, -{0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2}, -{0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2}, -{0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2}, -{0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2}, -{0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2}, -{0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2}, -{0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2}, -{0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2}, -{0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2}, -{0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2}, -{0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2}, -{0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2}, -{0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2}, -{0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2}, -{0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3}, -{0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3}, -{0x1.446f12b278001p+0, -0x1.e52e160484698p-3}, -{0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3}, -{0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3}, -{0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3}, -{0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3}, -{0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3}, -{0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3}, -{0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3}, -{0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3}, -{0x1.36987540fbf53p+0, -0x1.8be843d796044p-3}, -{0x1.352166b648f61p+0, -0x1.82395ecc477edp-3}, -{0x1.33adddb3eb575p+0, -0x1.7896240966422p-3}, -{0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3}, -{0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3}, -{0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3}, -{0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3}, -{0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3}, -{0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3}, -{0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3}, -{0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3}, -{0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3}, -{0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3}, -{0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3}, -{0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3}, -{0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4}, -{0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4}, -{0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4}, -{0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4}, -{0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4}, -{0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4}, -{0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4}, -{0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4}, -{0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4}, -{0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4}, -{0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4}, -{0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4}, -{0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4}, -{0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4}, -{0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4}, -{0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5}, -{0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5}, -{0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5}, -{0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5}, -{0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5}, -{0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5}, -{0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5}, -{0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5}, -{0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6}, -{0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6}, -{0x1.05193497a7cc5p+0, -0x1.43183683400acp-6}, -{0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6}, -{0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7}, 
-{0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7}, -{0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9}, -{1.0, 0.0}, -{0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8}, -{0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7}, -{0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6}, -{0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6}, -{0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5}, -{0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5}, -{0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5}, -{0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5}, -{0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4}, -{0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4}, -{0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4}, -{0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4}, -{0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4}, -{0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4}, -{0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4}, -{0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4}, -{0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4}, -{0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3}, -{0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3}, -{0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3}, -{0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3}, -{0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3}, -{0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3}, -{0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3}, -{0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3}, -{0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3}, -{0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3}, -{0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3}, -{0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3}, -{0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3}, -{0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3}, -{0x1.9998e1480b618p-1, 0x1.c903161240163p-3}, -{0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3}, -{0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3}, -{0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3}, -{0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3}, -{0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2}, -{0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2}, -{0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2}, -{0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2}, -{0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2}, -{0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2}, -{0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2}, -{0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2}, -{0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2}, -{0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2}, -{0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2}, -{0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2}, -{0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2}, -{0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2}, -{0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2}, -{0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2}, -}; -#endif diff --git a/math/v_logf.c b/math/v_logf.c deleted file mode 100644 index 7373192f03fae52c113eabcb69067019e6e2a70c..0000000000000000000000000000000000000000 --- a/math/v_logf.c +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Single-precision vector log function. - * - * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* 3.34 ulp error */ - -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f, - -0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f, -}; -#define P7 v_f32 (Poly[0]) -#define P6 v_f32 (Poly[1]) -#define P5 v_f32 (Poly[2]) -#define P4 v_f32 (Poly[3]) -#define P3 v_f32 (Poly[4]) -#define P2 v_f32 (Poly[5]) -#define P1 v_f32 (Poly[6]) - -#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218 */ -#define Min v_u32 (0x00800000) -#define Max v_u32 (0x7f800000) -#define Mask v_u32 (0x007fffff) -#define Off v_u32 (0x3f2aaaab) /* 0.666667 */ - -VPCS_ATTR -__attribute__ ((noinline)) static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) -{ - /* Fall back to scalar code. */ - return v_call_f32 (logf, x, y, cmp); -} - -VPCS_ATTR -v_f32_t -V_NAME(logf) (v_f32_t x) -{ - v_f32_t n, p, q, r, r2, y; - v_u32_t u, cmp; - - u = v_as_u32_f32 (x); - cmp = v_cond_u32 (u - Min >= Max - Min); - - /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3 */ - u -= Off; - n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend */ - u &= Mask; - u += Off; - r = v_as_f32_u32 (u) - v_f32 (1.0f); - - /* y = log(1+r) + n*ln2. */ - r2 = r * r; - /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ - p = v_fma_f32 (P6, r, P5); - q = v_fma_f32 (P4, r, P3); - y = v_fma_f32 (P2, r, P1); - p = v_fma_f32 (P7, r2, p); - q = v_fma_f32 (p, r2, q); - y = v_fma_f32 (q, r2, y); - p = v_fma_f32 (Ln2, n, r); - y = v_fma_f32 (y, r2, p); - - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/math/v_math.h b/math/v_math.h deleted file mode 100644 index f2cc4670bb9b8524c0318952b3e0a417a73746b1..0000000000000000000000000000000000000000 --- a/math/v_math.h +++ /dev/null @@ -1,641 +0,0 @@ -/* - * Vector math abstractions. - * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#ifndef _V_MATH_H -#define _V_MATH_H - -#ifndef WANT_VMATH -/* Enable the build of vector math code. */ -# define WANT_VMATH 1 -#endif -#if WANT_VMATH - -/* The goal of this header is to allow vector and scalar - build of the same algorithm, the provided intrinsic - wrappers are also vector length agnostic so they can - be implemented for SVE too (or other simd architectures) - and then the code should work on those targets too. */ - -#if SCALAR -#define V_NAME(x) __s_##x -#elif VPCS && __aarch64__ -#define V_NAME(x) __vn_##x -#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) -#else -#define V_NAME(x) __v_##x -#endif - -#ifndef VPCS_ATTR -#define VPCS_ATTR -#endif -#ifndef VPCS_ALIAS -#define VPCS_ALIAS -#endif - -#include -#include "math_config.h" - -typedef float f32_t; -typedef uint32_t u32_t; -typedef int32_t s32_t; -typedef double f64_t; -typedef uint64_t u64_t; -typedef int64_t s64_t; - -/* reinterpret as type1 from type2. 
*/ -static inline u32_t -as_u32_f32 (f32_t x) -{ - union { f32_t f; u32_t u; } r = {x}; - return r.u; -} -static inline f32_t -as_f32_u32 (u32_t x) -{ - union { u32_t u; f32_t f; } r = {x}; - return r.f; -} -static inline s32_t -as_s32_u32 (u32_t x) -{ - union { u32_t u; s32_t i; } r = {x}; - return r.i; -} -static inline u32_t -as_u32_s32 (s32_t x) -{ - union { s32_t i; u32_t u; } r = {x}; - return r.u; -} -static inline u64_t -as_u64_f64 (f64_t x) -{ - union { f64_t f; u64_t u; } r = {x}; - return r.u; -} -static inline f64_t -as_f64_u64 (u64_t x) -{ - union { u64_t u; f64_t f; } r = {x}; - return r.f; -} -static inline s64_t -as_s64_u64 (u64_t x) -{ - union { u64_t u; s64_t i; } r = {x}; - return r.i; -} -static inline u64_t -as_u64_s64 (s64_t x) -{ - union { s64_t i; u64_t u; } r = {x}; - return r.u; -} - -#if SCALAR -#define V_SUPPORTED 1 -typedef f32_t v_f32_t; -typedef u32_t v_u32_t; -typedef s32_t v_s32_t; -typedef f64_t v_f64_t; -typedef u64_t v_u64_t; -typedef s64_t v_s64_t; - -static inline int -v_lanes32 (void) -{ - return 1; -} - -static inline v_f32_t -v_f32 (f32_t x) -{ - return x; -} -static inline v_u32_t -v_u32 (u32_t x) -{ - return x; -} -static inline v_s32_t -v_s32 (s32_t x) -{ - return x; -} - -static inline f32_t -v_get_f32 (v_f32_t x, int i) -{ - return x; -} -static inline u32_t -v_get_u32 (v_u32_t x, int i) -{ - return x; -} -static inline s32_t -v_get_s32 (v_s32_t x, int i) -{ - return x; -} - -static inline void -v_set_f32 (v_f32_t *x, int i, f32_t v) -{ - *x = v; -} -static inline void -v_set_u32 (v_u32_t *x, int i, u32_t v) -{ - *x = v; -} -static inline void -v_set_s32 (v_s32_t *x, int i, s32_t v) -{ - *x = v; -} - -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u32 (v_u32_t x) -{ - return x != 0; -} -/* to wrap the result of relational operators. */ -static inline v_u32_t -v_cond_u32 (v_u32_t x) -{ - return x ? -1 : 0; -} -static inline v_f32_t -v_abs_f32 (v_f32_t x) -{ - return __builtin_fabsf (x); -} -static inline v_f32_t -v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) -{ - return __builtin_fmaf (x, y, z); -} -static inline v_f32_t -v_round_f32 (v_f32_t x) -{ - return __builtin_roundf (x); -} -static inline v_s32_t -v_round_s32 (v_f32_t x) -{ - return __builtin_lroundf (x); /* relies on -fno-math-errno. */ -} -/* convert to type1 from type2. */ -static inline v_f32_t -v_to_f32_s32 (v_s32_t x) -{ - return x; -} -static inline v_f32_t -v_to_f32_u32 (v_u32_t x) -{ - return x; -} -/* reinterpret as type1 from type2. 
*/ -static inline v_u32_t -v_as_u32_f32 (v_f32_t x) -{ - union { v_f32_t f; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_as_f32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_f32_t f; } r = {x}; - return r.f; -} -static inline v_s32_t -v_as_s32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_s32_t i; } r = {x}; - return r.i; -} -static inline v_u32_t -v_as_u32_s32 (v_s32_t x) -{ - union { v_s32_t i; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_lookup_f32 (const f32_t *tab, v_u32_t idx) -{ - return tab[idx]; -} -static inline v_u32_t -v_lookup_u32 (const u32_t *tab, v_u32_t idx) -{ - return tab[idx]; -} -static inline v_f32_t -v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) -{ - return f (x); -} -static inline v_f32_t -v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, - v_u32_t p) -{ - return f (x1, x2); -} - -static inline int -v_lanes64 (void) -{ - return 1; -} -static inline v_f64_t -v_f64 (f64_t x) -{ - return x; -} -static inline v_u64_t -v_u64 (u64_t x) -{ - return x; -} -static inline v_s64_t -v_s64 (s64_t x) -{ - return x; -} -static inline f64_t -v_get_f64 (v_f64_t x, int i) -{ - return x; -} -static inline void -v_set_f64 (v_f64_t *x, int i, f64_t v) -{ - *x = v; -} -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u64 (v_u64_t x) -{ - return x != 0; -} -/* to wrap the result of relational operators. */ -static inline v_u64_t -v_cond_u64 (v_u64_t x) -{ - return x ? -1 : 0; -} -static inline v_f64_t -v_abs_f64 (v_f64_t x) -{ - return __builtin_fabs (x); -} -static inline v_f64_t -v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) -{ - return __builtin_fma (x, y, z); -} -static inline v_f64_t -v_round_f64 (v_f64_t x) -{ - return __builtin_round (x); -} -static inline v_s64_t -v_round_s64 (v_f64_t x) -{ - return __builtin_lround (x); /* relies on -fno-math-errno. */ -} -/* convert to type1 from type2. */ -static inline v_f64_t -v_to_f64_s64 (v_s64_t x) -{ - return x; -} -static inline v_f64_t -v_to_f64_u64 (v_u64_t x) -{ - return x; -} -/* reinterpret as type1 from type2. 
*/ -static inline v_u64_t -v_as_u64_f64 (v_f64_t x) -{ - union { v_f64_t f; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_as_f64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_f64_t f; } r = {x}; - return r.f; -} -static inline v_s64_t -v_as_s64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_s64_t i; } r = {x}; - return r.i; -} -static inline v_u64_t -v_as_u64_s64 (v_s64_t x) -{ - union { v_s64_t i; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_lookup_f64 (const f64_t *tab, v_u64_t idx) -{ - return tab[idx]; -} -static inline v_u64_t -v_lookup_u64 (const u64_t *tab, v_u64_t idx) -{ - return tab[idx]; -} -static inline v_f64_t -v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) -{ - return f (x); -} - -#elif __aarch64__ -#define V_SUPPORTED 1 -#include -typedef float32x4_t v_f32_t; -typedef uint32x4_t v_u32_t; -typedef int32x4_t v_s32_t; -typedef float64x2_t v_f64_t; -typedef uint64x2_t v_u64_t; -typedef int64x2_t v_s64_t; - -static inline int -v_lanes32 (void) -{ - return 4; -} - -static inline v_f32_t -v_f32 (f32_t x) -{ - return (v_f32_t){x, x, x, x}; -} -static inline v_u32_t -v_u32 (u32_t x) -{ - return (v_u32_t){x, x, x, x}; -} -static inline v_s32_t -v_s32 (s32_t x) -{ - return (v_s32_t){x, x, x, x}; -} - -static inline f32_t -v_get_f32 (v_f32_t x, int i) -{ - return x[i]; -} -static inline u32_t -v_get_u32 (v_u32_t x, int i) -{ - return x[i]; -} -static inline s32_t -v_get_s32 (v_s32_t x, int i) -{ - return x[i]; -} - -static inline void -v_set_f32 (v_f32_t *x, int i, f32_t v) -{ - (*x)[i] = v; -} -static inline void -v_set_u32 (v_u32_t *x, int i, u32_t v) -{ - (*x)[i] = v; -} -static inline void -v_set_s32 (v_s32_t *x, int i, s32_t v) -{ - (*x)[i] = v; -} - -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u32 (v_u32_t x) -{ - /* assume elements in x are either 0 or -1u. */ - return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; -} -/* to wrap the result of relational operators. */ -static inline v_u32_t -v_cond_u32 (v_u32_t x) -{ - return x; -} -static inline v_f32_t -v_abs_f32 (v_f32_t x) -{ - return vabsq_f32 (x); -} -static inline v_f32_t -v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) -{ - return vfmaq_f32 (z, x, y); -} -static inline v_f32_t -v_round_f32 (v_f32_t x) -{ - return vrndaq_f32 (x); -} -static inline v_s32_t -v_round_s32 (v_f32_t x) -{ - return vcvtaq_s32_f32 (x); -} -/* convert to type1 from type2. */ -static inline v_f32_t -v_to_f32_s32 (v_s32_t x) -{ - return (v_f32_t){x[0], x[1], x[2], x[3]}; -} -static inline v_f32_t -v_to_f32_u32 (v_u32_t x) -{ - return (v_f32_t){x[0], x[1], x[2], x[3]}; -} -/* reinterpret as type1 from type2. */ -static inline v_u32_t -v_as_u32_f32 (v_f32_t x) -{ - union { v_f32_t f; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_as_f32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_f32_t f; } r = {x}; - return r.f; -} -static inline v_s32_t -v_as_s32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_s32_t i; } r = {x}; - return r.i; -} -static inline v_u32_t -v_as_u32_s32 (v_s32_t x) -{ - union { v_s32_t i; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_lookup_f32 (const f32_t *tab, v_u32_t idx) -{ - return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; -} -static inline v_u32_t -v_lookup_u32 (const u32_t *tab, v_u32_t idx) -{ - return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; -} -static inline v_f32_t -v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) -{ - return (v_f32_t){p[0] ? 
f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], - p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; -} -static inline v_f32_t -v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, - v_u32_t p) -{ - return ( - v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1], - p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]}; -} - -static inline int -v_lanes64 (void) -{ - return 2; -} -static inline v_f64_t -v_f64 (f64_t x) -{ - return (v_f64_t){x, x}; -} -static inline v_u64_t -v_u64 (u64_t x) -{ - return (v_u64_t){x, x}; -} -static inline v_s64_t -v_s64 (s64_t x) -{ - return (v_s64_t){x, x}; -} -static inline f64_t -v_get_f64 (v_f64_t x, int i) -{ - return x[i]; -} -static inline void -v_set_f64 (v_f64_t *x, int i, f64_t v) -{ - (*x)[i] = v; -} -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u64 (v_u64_t x) -{ - /* assume elements in x are either 0 or -1u. */ - return vpaddd_u64 (x) != 0; -} -/* to wrap the result of relational operators. */ -static inline v_u64_t -v_cond_u64 (v_u64_t x) -{ - return x; -} -static inline v_f64_t -v_abs_f64 (v_f64_t x) -{ - return vabsq_f64 (x); -} -static inline v_f64_t -v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) -{ - return vfmaq_f64 (z, x, y); -} -static inline v_f64_t -v_round_f64 (v_f64_t x) -{ - return vrndaq_f64 (x); -} -static inline v_s64_t -v_round_s64 (v_f64_t x) -{ - return vcvtaq_s64_f64 (x); -} -/* convert to type1 from type2. */ -static inline v_f64_t -v_to_f64_s64 (v_s64_t x) -{ - return (v_f64_t){x[0], x[1]}; -} -static inline v_f64_t -v_to_f64_u64 (v_u64_t x) -{ - return (v_f64_t){x[0], x[1]}; -} -/* reinterpret as type1 from type2. */ -static inline v_u64_t -v_as_u64_f64 (v_f64_t x) -{ - union { v_f64_t f; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_as_f64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_f64_t f; } r = {x}; - return r.f; -} -static inline v_s64_t -v_as_s64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_s64_t i; } r = {x}; - return r.i; -} -static inline v_u64_t -v_as_u64_s64 (v_s64_t x) -{ - union { v_s64_t i; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_lookup_f64 (const f64_t *tab, v_u64_t idx) -{ - return (v_f64_t){tab[idx[0]], tab[idx[1]]}; -} -static inline v_u64_t -v_lookup_u64 (const u64_t *tab, v_u64_t idx) -{ - return (v_u64_t){tab[idx[0]], tab[idx[1]]}; -} -static inline v_f64_t -v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) -{ - return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]}; -} -#endif - -#endif -#endif diff --git a/math/v_powf.c b/math/v_powf.c deleted file mode 100644 index fb80fa6f184688ee7396a12121604b12d9b1db1a..0000000000000000000000000000000000000000 --- a/math/v_powf.c +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Single-precision vector powf function. - * - * Copyright (c) 2019, Arm Limited. 
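The scalar fallback layer deleted above implements every vector operation on a single lane and does all bit-level reinterpretation through unions. A minimal self-contained sketch of that idiom in plain C (the names here are illustrative, mirroring the deleted v_as_* helpers):

/* Union type punning: writing one member and reading another
   reinterprets the bits unchanged, which is well defined in C,
   unlike pointer casts that can violate strict aliasing. */
#include <stdint.h>

static inline uint32_t
as_u32_f32 (float x)
{
  union { float f; uint32_t u; } r = { x };
  return r.u;
}

static inline float
as_f32_u32 (uint32_t x)
{
  union { uint32_t u; float f; } r = { .u = x };
  return r.f;
}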
- * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -#define Min v_u32 (0x00800000) -#define Max v_u32 (0x7f800000) -#define SBITS 5 -#define Tlog v__powf_log2_data.tab -#define Texp v__exp2f_data.tab -#define A v__powf_log2_data.poly -#define C v__exp2f_data.poly -#define LOGDEG 4 - -#if LOGDEG == 5 -/* 1.01 ulp */ -#define OFF v_u32 (0x3f330000) -#define TBITS 4 -#elif LOGDEG == 4 -/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2) */ -#define OFF v_u32 (0x3f35d000) -#define TBITS 5 -#endif - -#define V_EXP2F_TABLE_BITS SBITS -#define V_EXP2F_POLY_ORDER 3 -struct v_exp2f_data -{ - uint64_t tab[1 << V_EXP2F_TABLE_BITS]; - double poly[V_EXP2F_POLY_ORDER]; -}; - -#define V_POWF_LOG2_TABLE_BITS TBITS -#define V_POWF_LOG2_POLY_ORDER LOGDEG -#define SCALE ((double) (1 << SBITS)) -struct v_powf_log2_data -{ - struct - { - double invc, logc; - } tab[1 << V_POWF_LOG2_TABLE_BITS]; - double poly[V_POWF_LOG2_POLY_ORDER]; -}; - -static const struct v_powf_log2_data v__powf_log2_data = { -#if LOGDEG == 5 - .tab = { -{ 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 * SCALE }, -{ 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 * SCALE }, -{ 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 * SCALE }, -{ 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 * SCALE }, -{ 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 * SCALE }, -{ 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 * SCALE }, -{ 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 * SCALE }, -{ 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 * SCALE }, -{ 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 * SCALE }, -{ 0x1p+0, 0x0p+0 * SCALE }, -{ 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 * SCALE }, -{ 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 * SCALE }, -{ 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 * SCALE }, -{ 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 * SCALE }, -{ 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 * SCALE }, -{ 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 * SCALE }, - }, -/* rel err: 1.46 * 2^-32 */ - .poly = { -0x1.27616c9496e0bp-2 * SCALE, -0x1.71969a075c67ap-2 * SCALE, -0x1.ec70a6ca7baddp-2 * SCALE, -0x1.7154748bef6c8p-1 * SCALE, -0x1.71547652ab82bp0 * SCALE, - } -#elif LOGDEG == 4 - .tab = { -{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * SCALE}, -{0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * SCALE}, -{0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * SCALE}, -{0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * SCALE}, -{0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * SCALE}, -{0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * SCALE}, -{0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * SCALE}, -{0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * SCALE}, -{0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * SCALE}, -{0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * SCALE}, -{0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * SCALE}, -{0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * SCALE}, -{0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * SCALE}, -{0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * SCALE}, -{0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * SCALE}, -{0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * SCALE}, -{0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * SCALE}, -{0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * SCALE}, -{0x1p+0, 0x0p+0 * SCALE}, -{0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * SCALE}, -{0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * SCALE}, -{0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * SCALE}, -{0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * SCALE}, -{0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * SCALE}, -{0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 
* SCALE}, -{0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * SCALE}, -{0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * SCALE}, -{0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * SCALE}, -{0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * SCALE}, -{0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * SCALE}, -{0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * SCALE}, -{0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * SCALE}, - }, -/* rel err: 1.5 * 2^-30 */ - .poly = { - -0x1.6ff5daa3b3d7cp-2 * SCALE, - 0x1.ec81d03c01aebp-2 * SCALE, - -0x1.71547bb43f101p-1 * SCALE, - 0x1.7154764a815cbp0 * SCALE, - } -#endif -}; - -static const struct v_exp2f_data v__exp2f_data = { - .tab = { -0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51, -0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1, -0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, -0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585, -0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13, -0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, -0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069, -0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540, - }, -/* rel err: 1.69 * 2^-34 */ - .poly = { -0x1.c6af84b912394p-5/SCALE/SCALE/SCALE, 0x1.ebfce50fac4f3p-3/SCALE/SCALE, 0x1.62e42ff0c52d6p-1/SCALE - }, -}; - -VPCS_ATTR -__attribute__ ((noinline)) static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_f32_t ret, v_u32_t cmp) -{ - return v_call2_f32 (powf, x, y, ret, cmp); -} - -VPCS_ATTR -v_f32_t -V_NAME(powf) (v_f32_t x, v_f32_t y) -{ - v_u32_t u, tmp, cmp, i, top, iz; - v_s32_t k; - v_f32_t ret; - - u = v_as_u32_f32 (x); - cmp = v_cond_u32 (u - Min >= Max - Min); - tmp = u - OFF; - i = (tmp >> (23 - TBITS)) % (1 << TBITS); - top = tmp & 0xff800000; - iz = u - top; - k = v_as_s32_u32 (top) >> (23 - SBITS); /* arithmetic shift */ - - for (int lane = 0; lane < v_lanes32 (); lane++) - { - uint32_t si, siz; - int32_t sk; - float sy; - - /* Use double precision for each lane. */ - double invc, logc, z, r, p, y0, logx, ylogx, kd, s; - uint64_t ki, t; - - si = v_get_u32 (i, lane); - siz = v_get_u32 (iz, lane); - sk = v_get_s32 (k, lane); - sy = v_get_f32 (y, lane); - - invc = Tlog[si].invc; - logc = Tlog[si].logc; - z = (double) as_f32_u32 (siz); - - /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */ - r = __builtin_fma (z, invc, -1.0); - y0 = logc + (double) sk; - - /* Polynomial to approximate log1p(r)/ln2. */ -#if LOGDEG == 5 - logx = A[0]; - logx = r * logx + A[1]; - logx = r * logx + A[2]; - logx = r * logx + A[3]; - logx = r * logx + A[4]; - logx = r * logx + y0; -#elif LOGDEG == 4 - logx = A[0]; - logx = r * logx + A[1]; - logx = r * logx + A[2]; - logx = r * logx + A[3]; - logx = r * logx + y0; -#endif - ylogx = sy * logx; - v_set_u32 (&cmp, lane, - (as_u64_f64 (ylogx) >> 47 & 0xffff) - >= as_u64_f64 (126.0 * (1 << SBITS)) >> 47 - ? 
1 - : v_get_u32 (cmp, lane)); - - /* N*x = k + r with r in [-1/2, 1/2] */ -#if TOINT_INTRINSICS - kd = roundtoint (ylogx); /* k */ - ki = converttoint (ylogx); -#else -# define SHIFT 0x1.8p52 - kd = eval_as_double (ylogx + SHIFT); - ki = asuint64 (kd); - kd -= SHIFT; -#endif - r = ylogx - kd; - - /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ - t = Texp[ki % (1 << SBITS)]; - t += ki << (52 - SBITS); - s = as_f64_u64 (t); - p = C[0]; - p = __builtin_fma (p, r, C[1]); - p = __builtin_fma (p, r, C[2]); - p = __builtin_fma (p, s * r, s); - - v_set_f32 (&ret, lane, p); - } - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, ret, cmp); - return ret; -} -VPCS_ALIAS -#endif diff --git a/math/v_sin.c b/math/v_sin.c deleted file mode 100644 index 2b9ed059189ca0402c8ec93f915fa6d3ed11be88..0000000000000000000000000000000000000000 --- a/math/v_sin.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Double-precision vector sin function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const double Poly[] = { -/* worst-case error is 3.5 ulp. - abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. */ --0x1.9f4a9c8b21dc9p-41, - 0x1.60e88a10163f2p-33, --0x1.ae6361b7254e7p-26, - 0x1.71de382e8d62bp-19, --0x1.a01a019aeb4ffp-13, - 0x1.111111110b25ep-7, --0x1.55555555554c3p-3, -}; - -#define C7 v_f64 (Poly[0]) -#define C6 v_f64 (Poly[1]) -#define C5 v_f64 (Poly[2]) -#define C4 v_f64 (Poly[3]) -#define C3 v_f64 (Poly[4]) -#define C2 v_f64 (Poly[5]) -#define C1 v_f64 (Poly[6]) - -#define InvPi v_f64 (0x1.45f306dc9c883p-2) -#define Pi1 v_f64 (0x1.921fb54442d18p+1) -#define Pi2 v_f64 (0x1.1a62633145c06p-53) -#define Pi3 v_f64 (0x1.c1cd129024e09p-106) -#define Shift v_f64 (0x1.8p52) -#define RangeVal v_f64 (0x1p23) -#define AbsMask v_u64 (0x7fffffffffffffff) - -VPCS_ATTR -__attribute__ ((noinline)) static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - return v_call_f64 (sin, x, y, cmp); -} - -VPCS_ATTR -v_f64_t -V_NAME(sin) (v_f64_t x) -{ - v_f64_t n, r, r2, y; - v_u64_t sign, odd, cmp; - - r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask); - sign = v_as_u64_f64 (x) & ~AbsMask; - cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal)); - - /* n = rint(|x|/pi). */ - n = v_fma_f64 (InvPi, r, Shift); - odd = v_as_u64_f64 (n) << 63; - n -= Shift; - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ - r = v_fma_f64 (-Pi1, n, r); - r = v_fma_f64 (-Pi2, n, r); - r = v_fma_f64 (-Pi3, n, r); - - /* sin(r) poly approx. */ - r2 = r * r; - y = v_fma_f64 (C7, r2, C6); - y = v_fma_f64 (y, r2, C5); - y = v_fma_f64 (y, r2, C4); - y = v_fma_f64 (y, r2, C3); - y = v_fma_f64 (y, r2, C2); - y = v_fma_f64 (y, r2, C1); - y = v_fma_f64 (y * r2, r, r); - - /* sign. */ - y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign ^ odd); - - if (unlikely (v_any_u64 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/math/v_sinf.c b/math/v_sinf.c deleted file mode 100644 index e66bfce6d8aa4888cfe610d2c7250a144366091b..0000000000000000000000000000000000000000 --- a/math/v_sinf.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Single-precision vector sin function. - * - * Copyright (c) 2019, Arm Limited. 
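The v_powf kernel deleted above evaluates powf(x, y) as 2^(y*log2(x)), with log2 and exp2 each computed from a small table plus a low-degree polynomial in double precision per lane. A scalar sketch of the identity it builds on, using libm rather than the deleted tables (valid for finite x > 0 away from the special cases the real code funnels to scalar powf):

#include <math.h>

static float
powf_sketch (float x, float y)
{
  int k;
  double z = frexp ((double) x, &k);   /* x = z * 2^k, z in [0.5, 1) */
  double log2x = log2 (z) + k;         /* table + poly in the kernel */
  return (float) exp2 ((double) y * log2x);
}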
- * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* 1.886 ulp error */ - 0x1.5b2e76p-19f, - -0x1.9f42eap-13f, - 0x1.110df4p-7f, - -0x1.555548p-3f, -}; -#define Pi1 v_f32 (0x1.921fb6p+1f) -#define Pi2 v_f32 (-0x1.777a5cp-24f) -#define Pi3 v_f32 (-0x1.ee59dap-49f) -#define A3 v_f32 (Poly[3]) -#define A5 v_f32 (Poly[2]) -#define A7 v_f32 (Poly[1]) -#define A9 v_f32 (Poly[0]) -#define RangeVal v_f32 (0x1p20f) -#define InvPi v_f32 (0x1.45f306p-2f) -#define Shift v_f32 (0x1.8p+23f) -#define AbsMask v_u32 (0x7fffffff) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) -{ - /* Fall back to scalar code. */ - return v_call_f32 (sinf, x, y, cmp); -} - -VPCS_ATTR -v_f32_t -V_NAME(sinf) (v_f32_t x) -{ - v_f32_t n, r, r2, y; - v_u32_t sign, odd, cmp; - - r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask); - sign = v_as_u32_f32 (x) & ~AbsMask; - cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal)); - - /* n = rint(|x|/pi) */ - n = v_fma_f32 (InvPi, r, Shift); - odd = v_as_u32_f32 (n) << 31; - n -= Shift; - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ - r = v_fma_f32 (-Pi1, n, r); - r = v_fma_f32 (-Pi2, n, r); - r = v_fma_f32 (-Pi3, n, r); - - /* y = sin(r) */ - r2 = r * r; - y = v_fma_f32 (A9, r2, A7); - y = v_fma_f32 (y, r2, A5); - y = v_fma_f32 (y, r2, A3); - y = v_fma_f32 (y * r2, r, r); - - /* sign fix */ - y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign ^ odd); - - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/math/vn_cos.c b/math/vn_cos.c deleted file mode 100644 index b57a549eba68b3c9dba8a4f06a68fb80c73352c1..0000000000000000000000000000000000000000 --- a/math/vn_cos.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_cos. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_cos, _ZGVnN2v_cos) -#include "v_cos.c" -#endif diff --git a/math/vn_cosf.c b/math/vn_cosf.c deleted file mode 100644 index 6321d4620fa700ece0d12e0ccd2445fbd4a299ec..0000000000000000000000000000000000000000 --- a/math/vn_cosf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_cosf. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_cosf, _ZGVnN4v_cosf) -#include "v_cosf.c" -#endif diff --git a/math/vn_exp.c b/math/vn_exp.c deleted file mode 100644 index 06e269d41766bbc7040fdd92cde5782142db0d57..0000000000000000000000000000000000000000 --- a/math/vn_exp.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_exp. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_exp, _ZGVnN2v_exp) -#include "v_exp.c" -#endif diff --git a/math/vn_exp2f.c b/math/vn_exp2f.c deleted file mode 100644 index db9707e86f16f94ce8d05149a58efd6fa518de14..0000000000000000000000000000000000000000 --- a/math/vn_exp2f.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_exp2f. - * - * Copyright (c) 2019, Arm Limited. 
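The v_sin and v_sinf kernels deleted above share the same range-reduction scheme: round |x| to the nearest multiple of pi, then subtract that multiple using a three-term split of pi so the result stays accurate near zero. A scalar sketch with the constants copied from the deleted double-precision source (valid below the RangeVal bound, 0x1p23, beyond which the real code falls back to scalar sin):

#include <math.h>

static double
sin_sketch (double x)
{
  const double InvPi = 0x1.45f306dc9c883p-2;
  const double Pi1 = 0x1.921fb54442d18p+1;   /* pi, high part */
  const double Pi2 = 0x1.1a62633145c06p-53;  /* pi, mid part  */
  const double Pi3 = 0x1.c1cd129024e09p-106; /* pi, low part  */

  double r = fabs (x);
  double n = rint (r * InvPi);   /* nearest multiple of pi */
  r = fma (-Pi1, n, r);          /* r = |x| - n*pi, subtracted in   */
  r = fma (-Pi2, n, r);          /* three steps so cancellation     */
  r = fma (-Pi3, n, r);          /* error stays tiny near zero      */

  double y = sin (r);            /* odd polynomial in the real code */
  if ((long long) n & 1)         /* odd multiple of pi flips sign   */
    y = -y;
  return x < 0 ? -y : y;
}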
- * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_exp2f, _ZGVnN4v_exp2f) -#include "v_exp2f.c" -#endif diff --git a/math/vn_exp2f_1u.c b/math/vn_exp2f_1u.c deleted file mode 100644 index 17bd0abd7a60450f157462def7fb66b450044a75..0000000000000000000000000000000000000000 --- a/math/vn_exp2f_1u.c +++ /dev/null @@ -1,11 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_exp2f_1u. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#include "v_exp2f_1u.c" -#endif diff --git a/math/vn_expf.c b/math/vn_expf.c deleted file mode 100644 index 0652907225d94898aa9034b86bb2b361e0ea3586..0000000000000000000000000000000000000000 --- a/math/vn_expf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_expf. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf) -#include "v_expf.c" -#endif diff --git a/math/vn_expf_1u.c b/math/vn_expf_1u.c deleted file mode 100644 index 3be7768148225aa7756bd5f19a2dd026ab2d35f5..0000000000000000000000000000000000000000 --- a/math/vn_expf_1u.c +++ /dev/null @@ -1,11 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_expf_1u. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#include "v_expf_1u.c" -#endif diff --git a/math/vn_log.c b/math/vn_log.c deleted file mode 100644 index b58fe8ff820a7bb49aafb18d0d287c45d35f6aff..0000000000000000000000000000000000000000 --- a/math/vn_log.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_log. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_log, _ZGVnN2v_log) -#include "v_log.c" -#endif diff --git a/math/vn_logf.c b/math/vn_logf.c deleted file mode 100644 index cc5b8ae3ed55fec377883dd1dfabb4c678e3c48e..0000000000000000000000000000000000000000 --- a/math/vn_logf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_logf. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_logf, _ZGVnN4v_logf) -#include "v_logf.c" -#endif diff --git a/math/vn_pow.c b/math/vn_pow.c deleted file mode 100644 index 260950113b04016a2b8425b6a6333be1830248c1..0000000000000000000000000000000000000000 --- a/math/vn_pow.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_pow. - * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_pow, _ZGVnN2vv_pow) -#include "v_pow.c" -#endif diff --git a/math/vn_powf.c b/math/vn_powf.c deleted file mode 100644 index 095d07e337ad27d26699a4159be158a756e2d79a..0000000000000000000000000000000000000000 --- a/math/vn_powf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_powf. - * - * Copyright (c) 2019, Arm Limited. 
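The vn_*.c wrappers deleted above exist only to export each kernel under its AArch64 vector function ABI name: _ZGVnN2v_* for 2-lane double, _ZGVnN4v_* for 4-lane float, with "vv" marking two vector arguments. A hedged sketch of how a compiler can consume such symbols; the pragma and the -fopenmp-simd style of enabling it are toolchain assumptions, not part of this repository:

/* Declare that a 2-lane AdvSIMD variant of sin exists; a conforming
   compiler may then emit calls to _ZGVnN2v_sin from this loop. */
#pragma omp declare simd simdlen(2) notinbranch
double sin (double);

void
apply_sin (double *a, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = sin (a[i]);
}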
- * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_powf, _ZGVnN4vv_powf) -#include "v_powf.c" -#endif diff --git a/math/vn_sin.c b/math/vn_sin.c deleted file mode 100644 index 905c7962335029212e84676883f9e275b06c56a4..0000000000000000000000000000000000000000 --- a/math/vn_sin.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_sin. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_sin, _ZGVnN2v_sin) -#include "v_sin.c" -#endif diff --git a/math/vn_sinf.c b/math/vn_sinf.c deleted file mode 100644 index 1214e1a556385b12e1e90bf74ed3e5828f8182d5..0000000000000000000000000000000000000000 --- a/math/vn_sinf.c +++ /dev/null @@ -1,12 +0,0 @@ -/* - * AdvSIMD vector PCS variant of __v_sinf. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#include "mathlib.h" -#ifdef __vpcs -#define VPCS 1 -#define VPCS_ALIAS strong_alias (__vn_sinf, _ZGVnN4v_sinf) -#include "v_sinf.c" -#endif diff --git a/networking/Dir.mk b/networking/Dir.mk index b49610341171f43700b2af195fe7b4c7f2402af7..2589e0a1f91c47b76a50bf78e1c7aa01d3ec495f 100644 --- a/networking/Dir.mk +++ b/networking/Dir.mk @@ -1,7 +1,7 @@ # Makefile fragment - requires GNU make # # Copyright (c) 2019-2020, Arm Limited. -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception S := $(srcdir)/networking B := build/networking diff --git a/networking/aarch64/chksum_simd.c b/networking/aarch64/chksum_simd.c index 6d5be58b1f32d1d49482129a62d7c40e715f9d4f..90c00eb7cabe5a0f3e28b6e8f94c17e9f5750334 100644 --- a/networking/aarch64/chksum_simd.c +++ b/networking/aarch64/chksum_simd.c @@ -2,7 +2,7 @@ * AArch64-specific checksum implementation using NEON * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "networking.h" diff --git a/networking/arm/chksum_simd.c b/networking/arm/chksum_simd.c index 7f69adfc963c375221bf1d661f2b6f37e5fc56c9..ae08fe5dd0566632cfffdcf245c4d3915884cbd3 100644 --- a/networking/arm/chksum_simd.c +++ b/networking/arm/chksum_simd.c @@ -2,7 +2,7 @@ * Armv7-A specific checksum implementation using NEON * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "networking.h" diff --git a/networking/chksum.c b/networking/chksum.c index 95ce5baa94e43e9008e2b0750713cf0efb77e7ed..329482ffdcee963b4deed851ce56af0f0748b6b8 100644 --- a/networking/chksum.c +++ b/networking/chksum.c @@ -3,7 +3,7 @@ * This sum is often used as a simple checksum in networking. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "networking.h" diff --git a/networking/chksum_common.h b/networking/chksum_common.h index 958c8cc0742e7fb2b58e2bda236f836f69715ee9..16f0f6c11df7015ed0a87e0032685a69c74c154f 100644 --- a/networking/chksum_common.h +++ b/networking/chksum_common.h @@ -2,7 +2,7 @@ * Common code for checksum implementations * * Copyright (c) 2020, Arm Limited. 
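The networking files relicensed above compute a ones' complement sum over 16-bit words, the primitive behind the Internet checksum. A minimal portable sketch of that sum (RFC 1071 style; this is the generic word-at-a-time form, not the repository's SIMD-optimized implementation, and it assumes an even byte count):

#include <stddef.h>
#include <stdint.h>

static uint16_t
ones_complement_sum (const uint16_t *words, size_t n)
{
  uint32_t sum = 0;
  for (size_t i = 0; i < n; i++)
    sum += words[i];
  while (sum >> 16)                      /* fold end-around carries */
    sum = (sum & 0xffff) + (sum >> 16);
  return (uint16_t) sum;
}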
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef CHKSUM_COMMON_H diff --git a/networking/include/networking.h b/networking/include/networking.h index a88feff883394ef5c4d7bb840813d5af7f584e90..297dd4bfab0234ceabf663f5e39552b1e08f63ac 100644 --- a/networking/include/networking.h +++ b/networking/include/networking.h @@ -2,7 +2,7 @@ * Public API. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ unsigned short __chksum (const void *, unsigned int); diff --git a/networking/test/chksum.c b/networking/test/chksum.c index 41b98120f2758b54b8d13122caffb00224cc3139..239b5b88777be2a4870b4fd65fc29ddadc5ba11a 100644 --- a/networking/test/chksum.c +++ b/networking/test/chksum.c @@ -2,7 +2,7 @@ * Ones' complement checksum test & benchmark * * Copyright (c) 2016-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define _GNU_SOURCE diff --git a/string/Dir.mk b/string/Dir.mk index cf3453f7580d381464b4ebb5eacfe1306a427822..40ff5acc093e9d042afdcb6748aa540da6970816 100644 --- a/string/Dir.mk +++ b/string/Dir.mk @@ -1,7 +1,7 @@ # Makefile fragment - requires GNU make # # Copyright (c) 2019-2021, Arm Limited. -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception S := $(srcdir)/string B := build/string diff --git a/string/README.contributors b/string/README.contributors new file mode 100644 index 0000000000000000000000000000000000000000..0b4a51b563669a48e24d35135eb4ef50293ef2af --- /dev/null +++ b/string/README.contributors @@ -0,0 +1,30 @@ +STYLE REQUIREMENTS +================== + +1. Most code in this sub-directory is expected to be upstreamed into glibc so + the GNU Coding Standard and glibc specific conventions should be followed + to ease upstreaming. + +2. ABI and symbols: the code should be written so it is suitable for inclusion + into a libc with minimal changes. This e.g. means that internal symbols + should be hidden and in the implementation reserved namespace according to + ISO C and POSIX rules. If possible the built shared libraries and static + library archives should be usable to override libc symbols at link time (or + at runtime via LD_PRELOAD). This requires the symbols to follow the glibc ABI + (other than symbol versioning), this cannot be done reliably for static + linking so this is a best effort requirement. + +3. API: include headers should be suitable for benchmarking and testing code + and should not conflict with libc headers. + + +CONTRIBUTION GUIDELINES FOR string SUB-DIRECTORY +================================================ +1. Code: + - The assumptions of the code must be clearly documented. + + - Assembly style should be consistent across different implementations. + + +2. Performance: + - Benchmarking is needed on several microarchitectures. diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S index 84339f73cf23770b991c15e62eaba4b186a3201e..207e22950c6d3c4e42c20460cf4a3b1d7fde9eec 100644 --- a/string/aarch64/__mtag_tag_region.S +++ b/string/aarch64/__mtag_tag_region.S @@ -1,8 +1,8 @@ /* * __mtag_tag_region - tag memory * - * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2021-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -15,7 +15,7 @@ * The memory region may remain untagged if tagging is not enabled. */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_MEMORY_TAGGING diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S index f58364ca6fcb8c11b548b4288efdd21c716d5866..44b8e0114f4265d1ba02acb0a8622ca27a9a6973 100644 --- a/string/aarch64/__mtag_tag_zero_region.S +++ b/string/aarch64/__mtag_tag_zero_region.S @@ -1,8 +1,8 @@ /* * __mtag_tag_zero_region - tag memory and fill it with zero bytes * - * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -15,7 +15,7 @@ * The memory region may remain untagged if tagging is not enabled. */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_MEMORY_TAGGING diff --git a/string/aarch64/asmdefs.h b/string/aarch64/asmdefs.h new file mode 100644 index 0000000000000000000000000000000000000000..131b95e1fea98f789a678ce075846425cf0a24e6 --- /dev/null +++ b/string/aarch64/asmdefs.h @@ -0,0 +1,106 @@ +/* + * Macros for asm code. AArch64 version. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _ASMDEFS_H +#define _ASMDEFS_H + +/* Branch Target Identification support. */ +#define BTI_C hint 34 +#define BTI_J hint 36 +/* Return address signing support (pac-ret). */ +#define PACIASP hint 25; .cfi_window_save +#define AUTIASP hint 29; .cfi_window_save + +/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ +#define FEATURE_1_AND 0xc0000000 +#define FEATURE_1_BTI 1 +#define FEATURE_1_PAC 2 + +/* Add a NT_GNU_PROPERTY_TYPE_0 note. */ +#ifdef __ILP32__ +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a"; \ + .p2align 2; \ + .word 4; \ + .word 12; \ + .word 5; \ + .asciz "GNU"; \ + .word type; \ + .word 4; \ + .word value; \ + .text +#else +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a"; \ + .p2align 3; \ + .word 4; \ + .word 16; \ + .word 5; \ + .asciz "GNU"; \ + .word type; \ + .word 4; \ + .word value; \ + .word 0; \ + .text +#endif + +/* If set then the GNU Property Note section will be added to + mark objects to support BTI and PAC-RET. */ +#ifndef WANT_GNU_PROPERTY +#define WANT_GNU_PROPERTY 1 +#endif + +#if WANT_GNU_PROPERTY +/* Add property note with supported features to all asm files.
*/ +GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) +#endif + +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type name,%function; \ + .align alignment; \ + name: \ + .cfi_startproc; \ + BTI_C; + +#define ENTRY(name) ENTRY_ALIGN(name, 6) + +#define ENTRY_ALIAS(name) \ + .global name; \ + .type name,%function; \ + name: + +#define END(name) \ + .cfi_endproc; \ + .size name, .-name; + +#define L(l) .L ## l + +#ifdef __ILP32__ + /* Sanitize padding bits of pointer arguments as per aapcs64 */ +#define PTR_ARG(n) mov w##n, w##n +#else +#define PTR_ARG(n) +#endif + +#ifdef __ILP32__ + /* Sanitize padding bits of size arguments as per aapcs64 */ +#define SIZE_ARG(n) mov w##n, w##n +#else +#define SIZE_ARG(n) +#endif + +/* Compiler supports SVE instructions */ +#ifndef HAVE_SVE +# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5) +# define HAVE_SVE 1 +# else +# define HAVE_SVE 0 +# endif +#endif + +#endif diff --git a/string/aarch64/check-arch.S b/string/aarch64/check-arch.S index 5a54242d7de62303fe852f099f7025f32eac9f63..131b7fa36ec2dda1154e4435f53c5f45d6af1baf 100644 --- a/string/aarch64/check-arch.S +++ b/string/aarch64/check-arch.S @@ -1,8 +1,8 @@ /* * check ARCH setting. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if !__aarch64__ @@ -10,4 +10,4 @@ #endif /* Include for GNU property notes. */ -#include "../asmdefs.h" +#include "asmdefs.h" diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S index c2e967d1004e06e372725f5cc8ddb95aeb629aa2..948c3cbc7dd43a773d035c9fcf364d994fe3b5a8 100644 --- a/string/aarch64/memchr-mte.S +++ b/string/aarch64/memchr-mte.S @@ -1,8 +1,8 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -23,25 +23,21 @@ #define synd x5 #define shift x6 #define tmp x7 -#define wtmp w7 #define vrepchr v0 #define qdata q1 #define vdata v1 #define vhas_chr v2 -#define vrepmask v3 -#define vend v4 -#define dend d4 +#define vend v3 +#define dend d3 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. 
*/ ENTRY (__memchr_aarch64_mte) PTR_ARG (0) @@ -50,55 +46,53 @@ ENTRY (__memchr_aarch64_mte) cbz cntin, L(nomatch) ld1 {vdata.16b}, [src] dup vrepchr.16b, chrin - mov wtmp, 0xf00f - dup vrepmask.8h, wtmp cmeq vhas_chr.16b, vdata.16b, vrepchr.16b lsl shift, srcin, 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(start_loop) rbit synd, synd clz synd, synd - add result, srcin, synd, lsr 2 cmp cntin, synd, lsr 2 + add result, srcin, synd, lsr 2 csel result, result, xzr, hi ret + .p2align 3 L(start_loop): sub tmp, src, srcin - add tmp, tmp, 16 + add tmp, tmp, 17 subs cntrem, cntin, tmp - b.ls L(nomatch) + b.lo L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - add tmp, cntrem, 15 - tbnz tmp, 4, L(loop32_2) - + tbz cntrem, 4, L(loop32_2) + sub src, src, 16 .p2align 4 L(loop32): - ldr qdata, [src, 16]! + ldr qdata, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src, 16]! - subs cntrem, cntrem, 32 + ldr qdata, [src, 16] cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - b.ls L(end) + subs cntrem, cntrem, 32 + b.lo L(end_2) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) +L(end_2): + add src, src, 16 L(end): - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + sub cntrem, src, srcin fmov synd, dend - add tmp, srcin, cntin - sub cntrem, tmp, src + sub cntrem, cntin, cntrem #ifndef __AARCH64EB__ rbit synd, synd #endif diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S index c22e6596f19bdde2e6ced26a3ca11e99c0c5b7f5..b851cf31f2383e874c96b24ba82006d82e52f060 100644 --- a/string/aarch64/memchr-sve.S +++ b/string/aarch64/memchr-sve.S @@ -1,11 +1,11 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S index 353f0d1eac53098f8b8e921d12af1404ec2cf96c..fe6cfe2bc0e28d56100536ec25186f0543b03897 100644 --- a/string/aarch64/memchr.S +++ b/string/aarch64/memchr.S @@ -1,8 +1,8 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S index 78c5ecaa4cdcba0b826d62369d40f18afa8313d9..d52ce4555344e5b2fcca1ebcbc8b99651c0097fb 100644 --- a/string/aarch64/memcmp-sve.S +++ b/string/aarch64/memcmp-sve.S @@ -1,11 +1,11 @@ /* * memcmp - compare memory * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. 
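The nibble-mask scheme described in the memchr-mte comment above replaces the old vrepmask/and/addp reduction with a single shift-right-and-narrow. In NEON intrinsics, one 16-byte step looks roughly like this (a sketch of the technique, not the deleted code path):

#include <arm_neon.h>
#include <stdint.h>

static inline uint64_t
match_mask (const uint8_t *p, uint8_t c)
{
  uint8x16_t data = vld1q_u8 (p);
  uint8x16_t eq = vceqq_u8 (data, vdupq_n_u8 (c)); /* 0xff where p[i]==c */
  /* shrn: view as eight u16 lanes, shift right by 4 and narrow to eight
     bytes, so every input byte contributes exactly 4 bits of the mask. */
  uint8x8_t nibbles = vshrn_n_u16 (vreinterpretq_u16_u8 (eq), 4);
  return vget_lane_u64 (vreinterpret_u64_u8 (nibbles), 0);
}

/* On little-endian, the first match sits at byte __builtin_ctzll (mask) >> 2,
   matching the rbit+clz sequence in the assembly. */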
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S index 3b1026642eee805ca31d7f88b13eac082ce4b726..35135e72cc8e5324ade0a2443dc17fa1098142d6 100644 --- a/string/aarch64/memcmp.S +++ b/string/aarch64/memcmp.S @@ -1,103 +1,84 @@ /* memcmp - compare memory * - * Copyright (c) 2013-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2013-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses. + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. */ -#include "../asmdefs.h" +#include "asmdefs.h" -/* Parameters and result. */ -#define src1 x0 -#define src2 x1 -#define limit x2 -#define result w0 +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result w0 + +#define data1 x3 +#define data1w w3 +#define data2 x4 +#define data2w w4 +#define data3 x5 +#define data3w w5 +#define data4 x6 +#define data4w w6 +#define tmp x6 +#define src1end x7 +#define src2end x8 -/* Internal variables. */ -#define data1 x3 -#define data1w w3 -#define data1h x4 -#define data2 x5 -#define data2w w5 -#define data2h x6 -#define tmp1 x7 -#define tmp2 x8 ENTRY (__memcmp_aarch64) PTR_ARG (0) PTR_ARG (1) SIZE_ARG (2) - subs limit, limit, 8 - b.lo L(less8) - - ldr data1, [src1], 8 - ldr data2, [src2], 8 - cmp data1, data2 - b.ne L(return) - - subs limit, limit, 8 - b.gt L(more16) - ldr data1, [src1, limit] - ldr data2, [src2, limit] - b L(return) - -L(more16): - ldr data1, [src1], 8 - ldr data2, [src2], 8 - cmp data1, data2 - bne L(return) - - /* Jump directly to comparing the last 16 bytes for 32 byte (or less) - strings. */ - subs limit, limit, 16 + cmp limit, 16 + b.lo L(less16) + ldp data1, data3, [src1] + ldp data2, data4, [src2] + ccmp data1, data2, 0, ne + ccmp data3, data4, 0, eq + b.ne L(return2) + + add src1end, src1, limit + add src2end, src2, limit + cmp limit, 32 b.ls L(last_bytes) + cmp limit, 160 + b.hs L(loop_align) + sub limit, limit, 32 - /* We overlap loads between 0-32 bytes at either side of SRC1 when we - try to align, so limit it only to strings larger than 128 bytes. */ - cmp limit, 96 - b.ls L(loop16) - - /* Align src1 and adjust src2 with bytes not yet done. */ - and tmp1, src1, 15 - add limit, limit, tmp1 - sub src1, src1, tmp1 - sub src2, src2, tmp1 - - /* Loop performing 16 bytes per iteration using aligned src1. - Limit is pre-decremented by 16 and must be larger than zero. - Exit if <= 16 bytes left to do or if the data is not equal. */ .p2align 4 -L(loop16): - ldp data1, data1h, [src1], 16 - ldp data2, data2h, [src2], 16 - subs limit, limit, 16 - ccmp data1, data2, 0, hi - ccmp data1h, data2h, 0, eq - b.eq L(loop16) - +L(loop32): + ldp data1, data3, [src1, 16] + ldp data2, data4, [src2, 16] cmp data1, data2 - bne L(return) - mov data1, data1h - mov data2, data2h + ccmp data3, data4, 0, eq + b.ne L(return2) + cmp limit, 16 + b.ls L(last_bytes) + + ldp data1, data3, [src1, 32] + ldp data2, data4, [src2, 32] cmp data1, data2 - bne L(return) + ccmp data3, data4, 0, eq + b.ne L(return2) + add src1, src1, 32 + add src2, src2, 32 +L(last64): + subs limit, limit, 32 + b.hi L(loop32) /* Compare last 1-16 bytes using unaligned access. 
*/ L(last_bytes): - add src1, src1, limit - add src2, src2, limit - ldp data1, data1h, [src1] - ldp data2, data2h, [src2] - cmp data1, data2 - bne L(return) - mov data1, data1h - mov data2, data2h + ldp data1, data3, [src1end, -16] + ldp data2, data4, [src2end, -16] +L(return2): cmp data1, data2 + csel data1, data1, data3, ne + csel data2, data2, data4, ne /* Compare data bytes and set return value to 0, -1 or 1. */ L(return): @@ -105,33 +86,105 @@ L(return): rev data1, data1 rev data2, data2 #endif - cmp data1, data2 -L(ret_eq): + cmp data1, data2 cset result, ne cneg result, result, lo ret .p2align 4 - /* Compare up to 8 bytes. Limit is [-8..-1]. */ +L(less16): + add src1end, src1, limit + add src2end, src2, limit + tbz limit, 3, L(less8) + ldr data1, [src1] + ldr data2, [src2] + ldr data3, [src1end, -8] + ldr data4, [src2end, -8] + b L(return2) + + .p2align 4 L(less8): - adds limit, limit, 4 - b.lo L(less4) - ldr data1w, [src1], 4 - ldr data2w, [src2], 4 + tbz limit, 2, L(less4) + ldr data1w, [src1] + ldr data2w, [src2] + ldr data3w, [src1end, -4] + ldr data4w, [src2end, -4] + b L(return2) + +L(less4): + tbz limit, 1, L(less2) + ldrh data1w, [src1] + ldrh data2w, [src2] cmp data1w, data2w b.ne L(return) - sub limit, limit, 4 -L(less4): - adds limit, limit, 4 - beq L(ret_eq) -L(byte_loop): - ldrb data1w, [src1], 1 - ldrb data2w, [src2], 1 - subs limit, limit, 1 - ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ - b.eq L(byte_loop) +L(less2): + mov result, 0 + tbz limit, 0, L(return_zero) + ldrb data1w, [src1end, -1] + ldrb data2w, [src2end, -1] sub result, data1w, data2w +L(return_zero): ret -END (__memcmp_aarch64) +L(loop_align): + ldp data1, data3, [src1, 16] + ldp data2, data4, [src2, 16] + cmp data1, data2 + ccmp data3, data4, 0, eq + b.ne L(return2) + + /* Align src2 and adjust src1, src2 and limit. */ + and tmp, src2, 15 + sub tmp, tmp, 16 + sub src2, src2, tmp + add limit, limit, tmp + sub src1, src1, tmp + sub limit, limit, 64 + 16 + + .p2align 4 +L(loop64): + ldr q0, [src1, 16] + ldr q1, [src2, 16] + subs limit, limit, 64 + ldr q2, [src1, 32] + ldr q3, [src2, 32] + eor v0.16b, v0.16b, v1.16b + eor v1.16b, v2.16b, v3.16b + ldr q2, [src1, 48] + ldr q3, [src2, 48] + umaxp v0.16b, v0.16b, v1.16b + ldr q4, [src1, 64]! + ldr q5, [src2, 64]! + eor v1.16b, v2.16b, v3.16b + eor v2.16b, v4.16b, v5.16b + umaxp v1.16b, v1.16b, v2.16b + umaxp v0.16b, v0.16b, v1.16b + umaxp v0.16b, v0.16b, v0.16b + fmov tmp, d0 + ccmp tmp, 0, 0, hi + b.eq L(loop64) + + /* If equal, process last 1-64 bytes using scalar loop. */ + add limit, limit, 64 + 16 + cbz tmp, L(last64) + + /* Determine the 8-byte aligned offset of the first difference. */ +#ifdef __AARCH64EB__ + rev16 tmp, tmp +#endif + rev tmp, tmp + clz tmp, tmp + bic tmp, tmp, 7 + sub tmp, tmp, 48 + ldr data1, [src1, tmp] + ldr data2, [src2, tmp] +#ifndef __AARCH64EB__ + rev data1, data1 + rev data2, data2 +#endif + mov result, 1 + cmp data1, data2 + cneg result, result, lo + ret +END (__memcmp_aarch64) diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S index f97f2c3047b96e489ff97395173f2069469144e0..e6527d0dac2c48c3b313f25a5d61314df87871e0 100644 --- a/string/aarch64/memcpy-advsimd.S +++ b/string/aarch64/memcpy-advsimd.S @@ -1,8 +1,8 @@ /* * memcpy - copy memory area * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. 
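The rewritten memcmp above resolves a mismatch by byte-reversing both words so the first differing byte becomes the most significant, then doing a single unsigned compare. A scalar sketch of that return-value computation for the little-endian case (the asm produces -1/0/1 via cset/cneg):

#include <stdint.h>

static int
cmp_words (uint64_t data1, uint64_t data2)
{
  data1 = __builtin_bswap64 (data1);   /* the 'rev' instructions */
  data2 = __builtin_bswap64 (data2);
  if (data1 == data2)
    return 0;
  return data1 < data2 ? -1 : 1;
}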
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define src x1 diff --git a/string/aarch64/memcpy-mops.S b/string/aarch64/memcpy-mops.S new file mode 100644 index 0000000000000000000000000000000000000000..b45c31418717cd1e5cc7f29dd42aceab31d784c8 --- /dev/null +++ b/string/aarch64/memcpy-mops.S @@ -0,0 +1,21 @@ +/* + * memcpy using MOPS extension. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "asmdefs.h" + +ENTRY (__memcpy_aarch64_mops) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + + mov x3, x0 + .inst 0x19010443 /* cpyfp [x3]!, [x1]!, x2! */ + .inst 0x19410443 /* cpyfm [x3]!, [x1]!, x2! */ + .inst 0x19810443 /* cpyfe [x3]!, [x1]!, x2! */ + ret + +END (__memcpy_aarch64_mops) diff --git a/string/aarch64/memcpy-sve.S b/string/aarch64/memcpy-sve.S new file mode 100644 index 0000000000000000000000000000000000000000..e8a946d7db37f44fa8b819be5cf81fe0ee5f719d --- /dev/null +++ b/string/aarch64/memcpy-sve.S @@ -0,0 +1,177 @@ +/* + * memcpy - copy memory area + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses. + * + */ + +#include "asmdefs.h" + +#ifdef HAVE_SVE + +.arch armv8-a+sve + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define tmp1 x6 +#define vlen x6 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 +#define H_q q7 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + SVE vectors are used to speedup small copies. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The source pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. +*/ + +ENTRY_ALIAS (__memmove_aarch64_sve) +ENTRY (__memcpy_aarch64_sve) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + + cmp count, 128 + b.hi L(copy_long) + cntb vlen + cmp count, vlen, lsl 1 + b.hi L(copy32_128) + + whilelo p0.b, xzr, count + whilelo p1.b, vlen, count + ld1b z0.b, p0/z, [src, 0, mul vl] + ld1b z1.b, p1/z, [src, 1, mul vl] + st1b z0.b, p0, [dstin, 0, mul vl] + st1b z1.b, p1, [dstin, 1, mul vl] + ret + + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + add srcend, src, count + add dstend, dstin, count + ldp A_q, B_q, [src] + ldp C_q, D_q, [srcend, -32] + cmp count, 64 + b.hi L(copy128) + stp A_q, B_q, [dstin] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_q, F_q, [src, 32] + cmp count, 96 + b.ls L(copy96) + ldp G_q, H_q, [srcend, -64] + stp G_q, H_q, [dstend, -64] +L(copy96): + stp A_q, B_q, [dstin] + stp E_q, F_q, [dstin, 32] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy more than 128 bytes. */ +L(copy_long): + add srcend, src, count + add dstend, dstin, count + + /* Use backwards copy if there is an overlap. 
*/ + sub tmp1, dstin, src + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align src to 16-byte alignment. */ + ldr D_q, [src] + and tmp1, src, 15 + bic src, src, 15 + sub dst, dstin, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_q, B_q, [src, 16] + str D_q, [dstin] + ldp C_q, D_q, [src, 48] + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end) +L(loop64): + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [src, 80] + stp C_q, D_q, [dst, 48] + ldp C_q, D_q, [src, 112] + add src, src, 64 + add dst, dst, 64 + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_q, F_q, [srcend, -64] + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [srcend, -32] + stp C_q, D_q, [dst, 48] + stp E_q, F_q, [dstend, -64] + stp A_q, B_q, [dstend, -32] + ret + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align srcend to 16-byte alignment. */ +L(copy_long_backwards): + cbz tmp1, L(return) + ldr D_q, [srcend, -16] + and tmp1, srcend, 15 + bic srcend, srcend, 15 + sub count, count, tmp1 + ldp A_q, B_q, [srcend, -32] + str D_q, [dstend, -16] + ldp C_q, D_q, [srcend, -64] + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + str B_q, [dstend, -16] + str A_q, [dstend, -32] + ldp A_q, B_q, [srcend, -96] + str D_q, [dstend, -48] + str C_q, [dstend, -64]! + ldp C_q, D_q, [srcend, -128] + sub srcend, srcend, 64 + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. */ +L(copy64_from_start): + ldp E_q, F_q, [src, 32] + stp A_q, B_q, [dstend, -32] + ldp A_q, B_q, [src] + stp C_q, D_q, [dstend, -64] + stp E_q, F_q, [dstin, 32] + stp A_q, B_q, [dstin] +L(return): + ret + +END (__memcpy_aarch64_sve) + +#endif diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S index 8a967cdf4d2b5c014ce0737c19e4884297cd18b7..2b1a592feb39b5c831a908bda3f42bf3f9fc44ab 100644 --- a/string/aarch64/memcpy.S +++ b/string/aarch64/memcpy.S @@ -1,8 +1,8 @@ /* * memcpy - copy memory area * - * Copyright (c) 2012-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define src x1 diff --git a/string/aarch64/memmove-mops.S b/string/aarch64/memmove-mops.S new file mode 100644 index 0000000000000000000000000000000000000000..6c73017bb16f00ded1eaaaa5bf61fe9e68de5e9c --- /dev/null +++ b/string/aarch64/memmove-mops.S @@ -0,0 +1,21 @@ +/* + * memmove using MOPS extension. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "asmdefs.h" + +ENTRY (__memmove_aarch64_mops) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + + mov x3, x0 + .inst 0x1d010443 /* cpyp [x3]!, [x1]!, x2! */ + .inst 0x1d410443 /* cpym [x3]!, [x1]!, x2! */ + .inst 0x1d810443 /* cpye [x3]!, [x1]!, x2! */ + ret + +END (__memmove_aarch64_mops) diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S index 7b4be847cecbf93820be6ca931cf6b4569bf382f..6418bdf56f414880540632cd8c8257ed3d95d6d2 100644 --- a/string/aarch64/memrchr.S +++ b/string/aarch64/memrchr.S @@ -1,8 +1,8 @@ /* * memrchr - find last character in a memory zone. * - * Copyright (c) 2020, Arm Limited. 
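The new memcpy-sve above uses predicated loads and stores so that copies of up to two vectors need no scalar tail loop. The whilelo/ld1b/st1b sequence maps to ACLE intrinsics roughly as follows (a sketch assuming <arm_sve.h> and SVE codegen are available):

#include <arm_sve.h>
#include <stddef.h>
#include <stdint.h>

static void
sve_small_copy (uint8_t *dst, const uint8_t *src, size_t count)
{
  uint64_t vl = svcntb ();   /* bytes per SVE vector, unknown at build */
  /* Predicates cover lanes [0, count) and [vl, count); inactive lanes
     load zero and store nothing, so any count <= 2*vl is handled. */
  svbool_t p0 = svwhilelt_b8_u64 (0, count);
  svbool_t p1 = svwhilelt_b8_u64 (vl, count);
  svst1_u8 (p0, dst, svld1_u8 (p0, src));
  svst1_u8 (p1, dst + vl, svld1_u8 (p1, src + vl));
}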
- * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -23,7 +23,6 @@ #define synd x5 #define shift x6 #define tmp x7 -#define wtmp w7 #define end x8 #define endm1 x9 @@ -31,19 +30,16 @@ #define qdata q1 #define vdata v1 #define vhas_chr v2 -#define vrepmask v3 -#define vend v4 -#define dend d4 +#define vend v3 +#define dend d3 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. */ ENTRY (__memrchr_aarch64) PTR_ARG (0) @@ -53,12 +49,9 @@ ENTRY (__memrchr_aarch64) cbz cntin, L(nomatch) ld1 {vdata.16b}, [src] dup vrepchr.16b, chrin - mov wtmp, 0xf00f - dup vrepmask.8h, wtmp cmeq vhas_chr.16b, vdata.16b, vrepchr.16b neg shift, end, lsl 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsl synd, synd, shift cbz synd, L(start_loop) @@ -69,34 +62,36 @@ ENTRY (__memrchr_aarch64) csel result, result, xzr, hi ret + nop L(start_loop): - sub tmp, end, src - subs cntrem, cntin, tmp + subs cntrem, src, srcin b.ls L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - add tmp, cntrem, 15 - tbnz tmp, 4, L(loop32_2) + sub cntrem, cntrem, 1 + tbz cntrem, 4, L(loop32_2) + add src, src, 16 - .p2align 4 + .p2align 5 L(loop32): - ldr qdata, [src, -16]! + ldr qdata, [src, -32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src, -16]! + ldr qdata, [src, -16] subs cntrem, cntrem, 32 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - b.ls L(end) + b.lo L(end_2) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) +L(end_2): + sub src, src, 16 L(end): - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend add tmp, src, 15 diff --git a/string/aarch64/memset-mops.S b/string/aarch64/memset-mops.S new file mode 100644 index 0000000000000000000000000000000000000000..ec791493bae9c019b92374f0920edbec00b10507 --- /dev/null +++ b/string/aarch64/memset-mops.S @@ -0,0 +1,20 @@ +/* + * memset using MOPS extension. + * + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "asmdefs.h" + +ENTRY (__memset_aarch64_mops) + PTR_ARG (0) + SIZE_ARG (2) + + mov x3, x0 + .inst 0x19c10443 /* setp [x3]!, x2!, x1 */ + .inst 0x19c14443 /* setm [x3]!, x2!, x1 */ + .inst 0x19c18443 /* sete [x3]!, x2!, x1 */ + ret + +END (__memset_aarch64_mops) diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S index 9fcd97579913b025028f6728098ebd570992cb7d..553b0fcaefea5e5ae60c4ef583b80dc81f165ae6 100644 --- a/string/aarch64/memset.S +++ b/string/aarch64/memset.S @@ -1,8 +1,8 @@ /* * memset - fill memory with a constant byte * - * Copyright (c) 2012-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define val x1 diff --git a/string/aarch64/stpcpy-mte.S b/string/aarch64/stpcpy-mte.S deleted file mode 100644 index f1c7119065152def69dabaa5edfd92ada06685f1..0000000000000000000000000000000000000000 --- a/string/aarch64/stpcpy-mte.S +++ /dev/null @@ -1,10 +0,0 @@ -/* - * stpcpy - copy a string returning pointer to end. - * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#define BUILD_STPCPY 1 - -#include "strcpy-mte.S" diff --git a/string/aarch64/stpcpy-sve.S b/string/aarch64/stpcpy-sve.S index 82dd9717b0a0af44d7a14bee1ff9de16df7a6535..5d3f14b86026882d092f567f598ed46fdbe9447f 100644 --- a/string/aarch64/stpcpy-sve.S +++ b/string/aarch64/stpcpy-sve.S @@ -2,7 +2,7 @@ * stpcpy - copy a string returning pointer to end. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define BUILD_STPCPY 1 diff --git a/string/aarch64/stpcpy.S b/string/aarch64/stpcpy.S index 4f62aa46238987bbbd634b3fb794433d7bd74965..155c68d75a7b23a7c4f1be9cf864a7c1f1287ccd 100644 --- a/string/aarch64/stpcpy.S +++ b/string/aarch64/stpcpy.S @@ -2,7 +2,7 @@ * stpcpy - copy a string returning pointer to end. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define BUILD_STPCPY 1 diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S index dcb0e46258709760e7ef1c7d81e47a86457a2846..6ec08f7acc766b652cee0c340541f74ac01cebd7 100644 --- a/string/aarch64/strchr-mte.S +++ b/string/aarch64/strchr-mte.S @@ -1,8 +1,8 @@ /* * strchr - find a character in a string * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -19,8 +19,7 @@ #define src x2 #define tmp1 x1 -#define wtmp2 w3 -#define tmp3 x3 +#define tmp2 x3 #define vrepchr v0 #define vdata v1 @@ -28,39 +27,30 @@ #define vhas_nul v2 #define vhas_chr v3 #define vrepmask v4 -#define vrepmask2 v5 -#define vend v6 -#define dend d6 +#define vend v5 +#define dend d5 /* Core algorithm. For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-1 are set if the relevant byte matched the - requested character, bits 2-3 are set if the byte is NUL (or matched), and - bits 4-7 are not used and must be zero if none of bits 0-3 are set). 
Odd - bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits - in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + per byte. Bits 0-1 are set if the relevant byte matched the requested + character, bits 2-3 are set if the byte is NUL or matched. Count trailing + zeroes gives the position of the matching byte if it is a multiple of 4. + If it is not a multiple of 4, there was no match. */ ENTRY (__strchr_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin ld1 {vdata.16b}, [src] - mov wtmp2, 0x3003 - dup vrepmask.8h, wtmp2 + movi vrepmask.16b, 0x33 cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - mov wtmp2, 0xf00f - dup vrepmask2.8h, wtmp2 - bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - lsl tmp3, srcin, 2 - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ - + lsl tmp2, srcin, 2 + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov tmp1, dend - lsr tmp1, tmp1, tmp3 + lsr tmp1, tmp1, tmp2 cbz tmp1, L(loop) rbit tmp1, tmp1 @@ -74,28 +64,34 @@ ENTRY (__strchr_aarch64_mte) .p2align 4 L(loop): - ldr qdata, [src, 16]! + ldr qdata, [src, 16] + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov tmp1, dend + cbnz tmp1, L(end) + ldr qdata, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov tmp1, dend cbz tmp1, L(loop) + sub src, src, 16 +L(end): #ifdef __AARCH64EB__ bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov tmp1, dend #else bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov tmp1, dend rbit tmp1, tmp1 #endif + add src, src, 16 clz tmp1, tmp1 - /* Tmp1 is an even multiple of 2 if the target character was - found first. Otherwise we've found the end of string. */ + /* Tmp1 is a multiple of 4 if the target character was found. */ tst tmp1, 2 add result, src, tmp1, lsr 2 csel result, result, xzr, eq diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S index 13ba9f44f9c5a3dd716252b0459955cfe12c3b18..ff075167bfefb7dcf66869626c28c7d58163ab7f 100644 --- a/string/aarch64/strchr-sve.S +++ b/string/aarch64/strchr-sve.S @@ -1,11 +1,11 @@ /* * strchr/strchrnul - find a character in a string * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S index 1063cbfd77aa817ed1502e0b2c39643fb102c16b..37193bd947a73dbf7167e3b10d5ddb8e2510dd31 100644 --- a/string/aarch64/strchr.S +++ b/string/aarch64/strchr.S @@ -1,8 +1,8 @@ /* * strchr - find a character in a string * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. 
 */
-#include "../asmdefs.h"
+#include "asmdefs.h"

 /* Arguments and results.  */
 #define srcin	x0
diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S
index 1b0d0a63094c6567c3ee3654b416635f28a8acfd..543ee88bb285852eb6a7cf22cf25480f6403c98d 100644
--- a/string/aarch64/strchrnul-mte.S
+++ b/string/aarch64/strchrnul-mte.S
@@ -1,8 +1,8 @@
 /*
  * strchrnul - find a character or nul in a string
  *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */

 /* Assumptions:
@@ -11,7 +11,7 @@
  * MTE compatible.
  */

-#include "../asmdefs.h"
+#include "asmdefs.h"

 #define srcin	x0
 #define chrin	w1
@@ -20,38 +20,32 @@
 #define src	x2
 #define tmp1	x1
 #define tmp2	x3
-#define tmp2w	w3

 #define vrepchr	v0
 #define vdata	v1
 #define qdata	q1
 #define vhas_nul	v2
 #define vhas_chr	v3
-#define vrepmask	v4
-#define vend	v5
-#define dend	d5
+#define vend	v4
+#define dend	d4

-/* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four
+   bits per byte. We take four bits from each comparison byte using a
+   shift-right-and-narrow by 4 (shrn) instruction. Since the bits in the
+   nibble mask reflect the order in which things occur in the original
+   string, counting leading zeros (after a bit reverse on little-endian)
+   identifies exactly which byte matched.  */

 ENTRY (__strchrnul_aarch64_mte)
	PTR_ARG (0)
	bic	src, srcin, 15
	dup	vrepchr.16b, chrin
	ld1	{vdata.16b}, [src]
-	mov	tmp2w, 0xf00f
-	dup	vrepmask.8h, tmp2w
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
	lsl	tmp2, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4	/* 128->64 */
	fmov	tmp1, dend
	lsr	tmp1, tmp1, tmp2	/* Mask padding bits.  */
	cbz	tmp1, L(loop)
@@ -63,15 +57,22 @@ ENTRY (__strchrnul_aarch64_mte)
	.p2align 4
 L(loop):
-	ldr	qdata, [src, 16]!
+	ldr	qdata, [src, 16]
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b
+	fmov	tmp1, dend
+	cbnz	tmp1, L(end)
+	ldr	qdata, [src, 32]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b
	fmov	tmp1, dend
	cbz	tmp1, L(loop)
-
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
+	sub	src, src, 16
+L(end):
+	shrn	vend.8b, vhas_chr.8h, 4	/* 128->64 */
+	add	src, src, 16
	fmov	tmp1, dend
 #ifndef __AARCH64EB__
	rbit	tmp1, tmp1
 #endif
diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/strchrnul-sve.S
index 428ff1a3d008325778eccc4e9fe1ec99bfc70bb5..0005f9177514082544bc0f5f5a245ef5632430a7 100644
--- a/string/aarch64/strchrnul-sve.S
+++ b/string/aarch64/strchrnul-sve.S
@@ -2,7 +2,7 @@
  * strchrnul - find a character or nul in a string
  *
  * Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define BUILD_STRCHRNUL diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S index a4230d919b478d3001d412a7b3574f7ec94d2fb1..666e8d0304c16d4f9ebb8fa443670a40673b934b 100644 --- a/string/aarch64/strchrnul.S +++ b/string/aarch64/strchrnul.S @@ -1,8 +1,8 @@ /* * strchrnul - find a character or nul in a string * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S deleted file mode 100644 index 12d1a6b51dd3442ca89ba7994569ce9e54b0e351..0000000000000000000000000000000000000000 --- a/string/aarch64/strcmp-mte.S +++ /dev/null @@ -1,189 +0,0 @@ -/* - * strcmp - compare two strings - * - * Copyright (c) 2012-2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ - - -/* Assumptions: - * - * ARMv8-a, AArch64. - * MTE compatible. - */ - -#include "../asmdefs.h" - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f - -#define src1 x0 -#define src2 x1 -#define result x0 - -#define data1 x2 -#define data1w w2 -#define data2 x3 -#define data2w w3 -#define has_nul x4 -#define diff x5 -#define off1 x5 -#define syndrome x6 -#define tmp x6 -#define data3 x7 -#define zeroones x8 -#define shift x9 -#define off2 x10 - -/* On big-endian early bytes are at MSB and on little-endian LSB. - LS_FW means shifting towards early bytes. */ -#ifdef __AARCH64EB__ -# define LS_FW lsl -#else -# define LS_FW lsr -#endif - -/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. - Since carry propagation makes 0x1 bytes before a NUL byte appear - NUL too in big-endian, byte-reverse the data before the NUL check. */ - - -ENTRY (__strcmp_aarch64_mte) - PTR_ARG (0) - PTR_ARG (1) - sub off2, src2, src1 - mov zeroones, REP8_01 - and tmp, src1, 7 - tst off2, 7 - b.ne L(misaligned8) - cbnz tmp, L(mutual_align) - - .p2align 4 - -L(loop_aligned): - ldr data2, [src1, off2] - ldr data1, [src1], 8 -L(start_realigned): -#ifdef __AARCH64EB__ - rev tmp, data1 - sub has_nul, tmp, zeroones - orr tmp, tmp, REP8_7f -#else - sub has_nul, data1, zeroones - orr tmp, data1, REP8_7f -#endif - bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ - ccmp data1, data2, 0, eq - b.eq L(loop_aligned) -#ifdef __AARCH64EB__ - rev has_nul, has_nul -#endif - eor diff, data1, data2 - orr syndrome, diff, has_nul -L(end): -#ifndef __AARCH64EB__ - rev syndrome, syndrome - rev data1, data1 - rev data2, data2 -#endif - clz shift, syndrome - /* The most-significant-non-zero bit of the syndrome marks either the - first bit that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ - lsl data1, data1, shift - lsl data2, data2, shift - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, 56 - sub result, data1, data2, lsr 56 - ret - - .p2align 4 - -L(mutual_align): - /* Sources are mutually aligned, but are not currently at an - alignment boundary. 
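The fix-up described next (round the addresses down, then mask off the leading bytes) can be sketched in C; little-endian shown, with off = src & 7 known to be nonzero on this path:

    // Force the bytes loaded from before the string start to 0xff so they
    // can never look like a NUL or like a difference (LE byte order).
    uint64_t junk = ~0ULL >> (64 - 8 * off);   // low 'off' bytes all-ones
    data1 |= junk;
    data2 |= junk;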
Round down the addresses and then mask off - the bytes that precede the start point. */ - bic src1, src1, 7 - ldr data2, [src1, off2] - ldr data1, [src1], 8 - neg shift, src2, lsl 3 /* Bits to alignment -64. */ - mov tmp, -1 - LS_FW tmp, tmp, shift - orr data1, data1, tmp - orr data2, data2, tmp - b L(start_realigned) - -L(misaligned8): - /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always - checking to make sure that we don't access beyond the end of SRC2. */ - cbz tmp, L(src1_aligned) -L(do_misaligned): - ldrb data1w, [src1], 1 - ldrb data2w, [src2], 1 - cmp data1w, 0 - ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ - b.ne L(done) - tst src1, 7 - b.ne L(do_misaligned) - -L(src1_aligned): - neg shift, src2, lsl 3 - bic src2, src2, 7 - ldr data3, [src2], 8 -#ifdef __AARCH64EB__ - rev data3, data3 -#endif - lsr tmp, zeroones, shift - orr data3, data3, tmp - sub has_nul, data3, zeroones - orr tmp, data3, REP8_7f - bics has_nul, has_nul, tmp - b.ne L(tail) - - sub off1, src2, src1 - - .p2align 4 - -L(loop_unaligned): - ldr data3, [src1, off1] - ldr data2, [src1, off2] -#ifdef __AARCH64EB__ - rev data3, data3 -#endif - sub has_nul, data3, zeroones - orr tmp, data3, REP8_7f - ldr data1, [src1], 8 - bics has_nul, has_nul, tmp - ccmp data1, data2, 0, eq - b.eq L(loop_unaligned) - - lsl tmp, has_nul, shift -#ifdef __AARCH64EB__ - rev tmp, tmp -#endif - eor diff, data1, data2 - orr syndrome, diff, tmp - cbnz syndrome, L(end) -L(tail): - ldr data1, [src1] - neg shift, shift - lsr data2, data3, shift - lsr has_nul, has_nul, shift -#ifdef __AARCH64EB__ - rev data2, data2 - rev has_nul, has_nul -#endif - eor diff, data1, data2 - orr syndrome, diff, has_nul - b L(end) - -L(done): - sub result, data1, data2 - ret - -END (__strcmp_aarch64_mte) - diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S index e6d2da5411cac58a14b62d4767022a0c22b87ecc..eaf909a378f1f52dcf180e1f10a82dba071d94c9 100644 --- a/string/aarch64/strcmp-sve.S +++ b/string/aarch64/strcmp-sve.S @@ -1,11 +1,11 @@ /* * __strcmp_aarch64_sve - compare two strings * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S index 7714ebf5577d84a279f911914f5f7f28d41f3e8c..137a9aa06681a3c6d00062c88cddf8b9a227c220 100644 --- a/string/aarch64/strcmp.S +++ b/string/aarch64/strcmp.S @@ -1,168 +1,184 @@ /* * strcmp - compare two strings * - * Copyright (c) 2012-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ + /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64. + * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 -/* Parameters and result. */ #define src1 x0 #define src2 x1 #define result x0 -/* Internal variables. */ #define data1 x2 #define data1w w2 #define data2 x3 #define data2w w3 #define has_nul x4 #define diff x5 +#define off1 x5 #define syndrome x6 -#define tmp1 x7 -#define tmp2 x8 -#define tmp3 x9 -#define zeroones x10 -#define pos x11 +#define tmp x6 +#define data3 x7 +#define zeroones x8 +#define shift x9 +#define off2 x10 + +/* On big-endian early bytes are at MSB and on little-endian LSB. 
+ LS_FW means shifting towards early bytes. */ +#ifdef __AARCH64EB__ +# define LS_FW lsl +#else +# define LS_FW lsr +#endif + +/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. + Since carry propagation makes 0x1 bytes before a NUL byte appear + NUL too in big-endian, byte-reverse the data before the NUL check. */ + - /* Start of performance-critical section -- one 64B cache line. */ ENTRY (__strcmp_aarch64) PTR_ARG (0) PTR_ARG (1) - eor tmp1, src1, src2 - mov zeroones, #REP8_01 - tst tmp1, #7 + sub off2, src2, src1 + mov zeroones, REP8_01 + and tmp, src1, 7 + tst off2, 7 b.ne L(misaligned8) - ands tmp1, src1, #7 - b.ne L(mutual_align) - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ + cbnz tmp, L(mutual_align) + + .p2align 4 + L(loop_aligned): - ldr data1, [src1], #8 - ldr data2, [src2], #8 + ldr data2, [src1, off2] + ldr data1, [src1], 8 L(start_realigned): - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ +#ifdef __AARCH64EB__ + rev tmp, data1 + sub has_nul, tmp, zeroones + orr tmp, tmp, REP8_7f +#else + sub has_nul, data1, zeroones + orr tmp, data1, REP8_7f +#endif + bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ + ccmp data1, data2, 0, eq + b.eq L(loop_aligned) +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + eor diff, data1, data2 orr syndrome, diff, has_nul - cbz syndrome, L(loop_aligned) - /* End of performance-critical section -- one 64B cache line. */ - L(end): -#ifndef __AARCH64EB__ +#ifndef __AARCH64EB__ rev syndrome, syndrome rev data1, data1 - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ - clz pos, syndrome rev data2, data2 - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret -#else - /* For big-endian we cannot use the trick with the syndrome value - as carry-propagation can corrupt the upper bits if the trailing - bytes in the string contain 0x01. */ - /* However, if there is no NUL byte in the dword, we can generate - the result directly. We can't just subtract the bytes as the - MSB might be significant. */ - cbnz has_nul, 1f - cmp data1, data2 - cset result, ne - cneg result, result, lo - ret -1: - /* Re-compute the NUL-byte detection, using a byte-reversed value. */ - rev tmp3, data1 - sub tmp1, tmp3, zeroones - orr tmp2, tmp3, #REP8_7f - bic has_nul, tmp1, tmp2 - rev has_nul, has_nul - orr syndrome, diff, has_nul - clz pos, syndrome - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. +#endif + clz shift, syndrome + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. Shifting left now will bring the critical information into the top bits. 
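In C, this little-endian end-game looks roughly as follows; a sketch only, and it assumes the caller reaches it with a nonzero syndrome:

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    // Final <0 / 0 / >0 answer once a NUL or difference is known to exist.
    static int strcmp_tail(uint64_t data1, uint64_t data2)
    {
        uint64_t has_nul = (data1 - REP8_01) & ~(data1 | REP8_7f);
        uint64_t synd = __builtin_bswap64((data1 ^ data2) | has_nul);
        int shift = __builtin_clzll(synd);          // synd != 0 assumed
        uint64_t a = __builtin_bswap64(data1) << shift;
        uint64_t b = __builtin_bswap64(data2) << shift;
        return (int)(a >> 56) - (int)(b >> 56);     // unsigned char compare
    }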
*/ - lsl data1, data1, pos - lsl data2, data2, pos + lsl data1, data1, shift + lsl data2, data2, shift /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 + lsr data1, data1, 56 + sub result, data1, data2, lsr 56 ret -#endif + + .p2align 4 L(mutual_align): /* Sources are mutually aligned, but are not currently at an alignment boundary. Round down the addresses and then mask off - the bytes that preceed the start point. */ - bic src1, src1, #7 - bic src2, src2, #7 - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - ldr data1, [src1], #8 - neg tmp1, tmp1 /* Bits to alignment -64. */ - ldr data2, [src2], #8 - mov tmp2, #~0 -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#endif - orr data1, data1, tmp2 - orr data2, data2, tmp2 + the bytes that precede the start point. */ + bic src1, src1, 7 + ldr data2, [src1, off2] + ldr data1, [src1], 8 + neg shift, src2, lsl 3 /* Bits to alignment -64. */ + mov tmp, -1 + LS_FW tmp, tmp, shift + orr data1, data1, tmp + orr data2, data2, tmp b L(start_realigned) L(misaligned8): /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always - checking to make sure that we don't access beyond page boundary in - SRC2. */ - tst src1, #7 - b.eq L(loop_misaligned) + checking to make sure that we don't access beyond the end of SRC2. */ + cbz tmp, L(src1_aligned) L(do_misaligned): - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - cmp data1w, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + cmp data1w, 0 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ b.ne L(done) - tst src1, #7 + tst src1, 7 b.ne L(do_misaligned) -L(loop_misaligned): - /* Test if we are within the last dword of the end of a 4K page. If - yes then jump back to the misaligned loop to copy a byte at a time. */ - and tmp1, src2, #0xff8 - eor tmp1, tmp1, #0xff8 - cbz tmp1, L(do_misaligned) - ldr data1, [src1], #8 - ldr data2, [src2], #8 - - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. 
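For reference, the byte-at-a-time prologue above (L(do_misaligned)) is this loop in C; the cmp/ccmp pair fuses the two exit tests so each byte costs a single conditional branch (sketch):

    // Compare byte-wise until src1 reaches 8-byte alignment or we finish.
    do {
        unsigned char c1 = *s1++, c2 = *s2++;
        if (c1 == 0 || c1 != c2)
            return c1 - c2;                         // L(done)
    } while ((uintptr_t)s1 & 7);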
*/ +L(src1_aligned): + neg shift, src2, lsl 3 + bic src2, src2, 7 + ldr data3, [src2], 8 +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + lsr tmp, zeroones, shift + orr data3, data3, tmp + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + bics has_nul, has_nul, tmp + b.ne L(tail) + + sub off1, src2, src1 + + .p2align 4 + +L(loop_unaligned): + ldr data3, [src1, off1] + ldr data2, [src1, off2] +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + ldr data1, [src1], 8 + bics has_nul, has_nul, tmp + ccmp data1, data2, 0, eq + b.eq L(loop_unaligned) + + lsl tmp, has_nul, shift +#ifdef __AARCH64EB__ + rev tmp, tmp +#endif + eor diff, data1, data2 + orr syndrome, diff, tmp + cbnz syndrome, L(end) +L(tail): + ldr data1, [src1] + neg shift, shift + lsr data2, data3, shift + lsr has_nul, has_nul, shift +#ifdef __AARCH64EB__ + rev data2, data2 + rev has_nul, has_nul +#endif + eor diff, data1, data2 orr syndrome, diff, has_nul - cbz syndrome, L(loop_misaligned) b L(end) L(done): diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S deleted file mode 100644 index 88c222d61e53ad6841b10ef2b874852df203d800..0000000000000000000000000000000000000000 --- a/string/aarch64/strcpy-mte.S +++ /dev/null @@ -1,161 +0,0 @@ -/* - * strcpy/stpcpy - copy a string returning pointer to start/end. - * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -/* Assumptions: - * - * ARMv8-a, AArch64, Advanced SIMD. - * MTE compatible. - */ - -#include "../asmdefs.h" - -#define dstin x0 -#define srcin x1 -#define result x0 - -#define src x2 -#define dst x3 -#define len x4 -#define synd x4 -#define tmp x5 -#define wtmp w5 -#define shift x5 -#define data1 x6 -#define dataw1 w6 -#define data2 x7 -#define dataw2 w7 - -#define dataq q0 -#define vdata v0 -#define vhas_nul v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 -#define dataq2 q1 - -#ifdef BUILD_STPCPY -# define STRCPY __stpcpy_aarch64_mte -# define IFSTPCPY(X,...) X,__VA_ARGS__ -#else -# define STRCPY __strcpy_aarch64_mte -# define IFSTPCPY(X,...) -#endif - -/* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ - -ENTRY (STRCPY) - PTR_ARG (0) - PTR_ARG (1) - bic src, srcin, 15 - mov wtmp, 0xf00f - ld1 {vdata.16b}, [src] - dup vrepmask.8h, wtmp - cmeq vhas_nul.16b, vdata.16b, 0 - lsl shift, srcin, 2 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - lsr synd, synd, shift - cbnz synd, L(tail) - - ldr dataq, [src, 16]! 
- cmeq vhas_nul.16b, vdata.16b, 0 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - cbz synd, L(start_loop) - -#ifndef __AARCH64EB__ - rbit synd, synd -#endif - sub tmp, src, srcin - clz len, synd - add len, tmp, len, lsr 2 - tbz len, 4, L(less16) - sub tmp, len, 15 - ldr dataq, [srcin] - ldr dataq2, [srcin, tmp] - str dataq, [dstin] - str dataq2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) - ret - - .p2align 4,,8 -L(tail): - rbit synd, synd - clz len, synd - lsr len, len, 2 - - .p2align 4 -L(less16): - tbz len, 3, L(less8) - sub tmp, len, 7 - ldr data1, [srcin] - ldr data2, [srcin, tmp] - str data1, [dstin] - str data2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) - ret - - .p2align 4 -L(less8): - subs tmp, len, 3 - b.lo L(less4) - ldr dataw1, [srcin] - ldr dataw2, [srcin, tmp] - str dataw1, [dstin] - str dataw2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) - ret - -L(less4): - cbz len, L(zerobyte) - ldrh dataw1, [srcin] - strh dataw1, [dstin] -L(zerobyte): - strb wzr, [dstin, len] - IFSTPCPY (add result, dstin, len) - ret - - .p2align 4 -L(start_loop): - sub len, src, srcin - ldr dataq2, [srcin] - add dst, dstin, len - str dataq2, [dstin] - - .p2align 5 -L(loop): - str dataq, [dst], 16 - ldr dataq, [src, 16]! - cmeq vhas_nul.16b, vdata.16b, 0 - umaxp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - cbz synd, L(loop) - - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ - fmov synd, dend -#ifndef __AARCH64EB__ - rbit synd, synd -#endif - clz len, synd - lsr len, len, 2 - sub tmp, len, 15 - ldr dataq, [src, tmp] - str dataq, [dst, tmp] - IFSTPCPY (add result, dst, len) - ret - -END (STRCPY) diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S index f515462e09ae768dbc921ba2928150dd5a98c6e7..00e72dce4451b3ead0e83c2c90832088ab79fb50 100644 --- a/string/aarch64/strcpy-sve.S +++ b/string/aarch64/strcpy-sve.S @@ -1,11 +1,11 @@ /* * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S index 6e9ed424b693919e95f7fbe8569fc9024633715a..97ae37ea422973e3eeea510bf63c3f314ff574d3 100644 --- a/string/aarch64/strcpy.S +++ b/string/aarch64/strcpy.S @@ -1,311 +1,156 @@ /* * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Copyright (c) 2013-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses, min page size 4k. + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" -/* To build as stpcpy, define BUILD_STPCPY before compiling this file. - - To test the page crossing code path more thoroughly, compile with - -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower - entry path. This option is not intended for production use. */ - -/* Arguments and results. */ #define dstin x0 #define srcin x1 +#define result x0 -/* Locals and temporaries. 
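The deleted L(less16)/L(less8) paths above show a trick that the new strcpy below keeps: once the length is known, short strings are copied with two overlapping fixed-size accesses rather than a byte loop. A C sketch for the case where len, the index of the NUL, is 8..15:

    #include <string.h>

    // Two overlapping 8-byte copies cover bytes 0..len, NUL included,
    // whenever 8 <= len <= 15 (len+1 bytes copied in total).
    memcpy(dst, src, 8);
    memcpy(dst + len - 7, src + len - 7, 8);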
*/ #define src x2 #define dst x3 -#define data1 x4 -#define data1w w4 -#define data2 x5 -#define data2w w5 -#define has_nul1 x6 -#define has_nul2 x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define tmp4 x11 -#define zeroones x12 -#define data1a x13 -#define data2a x14 -#define pos x15 -#define len x16 -#define to_align x17 +#define len x4 +#define synd x4 +#define tmp x5 +#define shift x5 +#define data1 x6 +#define dataw1 w6 +#define data2 x7 +#define dataw2 w7 + +#define dataq q0 +#define vdata v0 +#define vhas_nul v1 +#define vend v2 +#define dend d2 +#define dataq2 q1 #ifdef BUILD_STPCPY -#define STRCPY __stpcpy_aarch64 -#else -#define STRCPY __strcpy_aarch64 -#endif - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 - - /* AArch64 systems have a minimum page size of 4k. We can do a quick - page size check for crossing this boundary on entry and if we - do not, then we can short-circuit much of the entry code. We - expect early page-crossing strings to be rare (probability of - 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite - predictable, even with random strings. - - We don't bother checking for larger page sizes, the cost of setting - up the correct page size is just not worth the extra gain from - a small reduction in the cases taking the slow path. Note that - we only care about whether the first fetch, which may be - misaligned, crosses a page boundary - after that we move to aligned - fetches for the remainder of the string. */ - -#ifdef STRCPY_TEST_PAGE_CROSS - /* Make everything that isn't Qword aligned look like a page cross. */ -#define MIN_PAGE_P2 4 +# define STRCPY __stpcpy_aarch64 +# define IFSTPCPY(X,...) X,__VA_ARGS__ #else -#define MIN_PAGE_P2 12 +# define STRCPY __strcpy_aarch64 +# define IFSTPCPY(X,...) #endif -#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) +/* + Core algorithm: + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. */ ENTRY (STRCPY) PTR_ARG (0) PTR_ARG (1) - /* For moderately short strings, the fastest way to do the copy is to - calculate the length of the string in the same way as strlen, then - essentially do a memcpy of the result. This avoids the need for - multiple byte copies and further means that by the time we - reach the bulk copy loop we know we can always use DWord - accesses. We expect __strcpy_aarch64 to rarely be called repeatedly - with the same source string, so branch prediction is likely to - always be difficult - we mitigate against this by preferring - conditional select operations over branches whenever this is - feasible. */ - and tmp2, srcin, #(MIN_PAGE_SIZE - 1) - mov zeroones, #REP8_01 - and to_align, srcin, #15 - cmp tmp2, #(MIN_PAGE_SIZE - 16) - neg tmp1, to_align - /* The first fetch will straddle a (possible) page boundary iff - srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte - aligned string will never fail the page align check, so will - always take the fast path. 
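Concretely, the page-cross test mentioned here is one AND and one compare; a sketch under the stated 4 KB minimum page size:

    // True iff a 16-byte load from 's' would straddle a 4 KB page boundary.
    int page_cross = ((uintptr_t)s & 4095) > 4096 - 16;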
*/ - b.gt L(page_cross) - -L(page_cross_ok): - ldp data1, data2, [srcin] -#ifdef __AARCH64EB__ - /* Because we expect the end to be found within 16 characters - (profiling shows this is the most common case), it's worth - swapping the bytes now to save having to recalculate the - termination syndrome later. We preserve data1 and data2 - so that we can re-use the values later on. */ - rev tmp2, data1 - sub tmp1, tmp2, zeroones - orr tmp2, tmp2, #REP8_7f - bics has_nul1, tmp1, tmp2 - b.ne L(fp_le8) - rev tmp4, data2 - sub tmp3, tmp4, zeroones - orr tmp4, tmp4, #REP8_7f -#else - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bics has_nul1, tmp1, tmp2 - b.ne L(fp_le8) - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f + bic src, srcin, 15 + ld1 {vdata.16b}, [src] + cmeq vhas_nul.16b, vdata.16b, 0 + lsl shift, srcin, 2 + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + lsr synd, synd, shift + cbnz synd, L(tail) + + ldr dataq, [src, 16]! + cmeq vhas_nul.16b, vdata.16b, 0 + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + cbz synd, L(start_loop) + +#ifndef __AARCH64EB__ + rbit synd, synd #endif - bics has_nul2, tmp3, tmp4 - b.eq L(bulk_entry) + sub tmp, src, srcin + clz len, synd + add len, tmp, len, lsr 2 + tbz len, 4, L(less16) + sub tmp, len, 15 + ldr dataq, [srcin] + ldr dataq2, [srcin, tmp] + str dataq, [dstin] + str dataq2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret - /* The string is short (<=16 bytes). We don't know exactly how - short though, yet. Work out the exact length so that we can - quickly select the optimal copy strategy. */ -L(fp_gt8): - rev has_nul2, has_nul2 - clz pos, has_nul2 - mov tmp2, #56 - add dst, dstin, pos, lsr #3 /* Bits to bytes. */ - sub pos, tmp2, pos -#ifdef __AARCH64EB__ - lsr data2, data2, pos -#else - lsl data2, data2, pos -#endif - str data2, [dst, #1] +L(tail): + rbit synd, synd + clz len, synd + lsr len, len, 2 +L(less16): + tbz len, 3, L(less8) + sub tmp, len, 7 + ldr data1, [srcin] + ldr data2, [srcin, tmp] str data1, [dstin] -#ifdef BUILD_STPCPY - add dstin, dst, #8 -#endif + str data2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) ret -L(fp_le8): - rev has_nul1, has_nul1 - clz pos, has_nul1 - add dst, dstin, pos, lsr #3 /* Bits to bytes. */ - subs tmp2, pos, #24 /* Pos in bits. */ - b.lt L(fp_lt4) -#ifdef __AARCH64EB__ - mov tmp2, #56 - sub pos, tmp2, pos - lsr data2, data1, pos - lsr data1, data1, #32 -#else - lsr data2, data1, tmp2 -#endif - /* 4->7 bytes to copy. */ - str data2w, [dst, #-3] - str data1w, [dstin] -#ifdef BUILD_STPCPY - mov dstin, dst -#endif + .p2align 4 +L(less8): + subs tmp, len, 3 + b.lo L(less4) + ldr dataw1, [srcin] + ldr dataw2, [srcin, tmp] + str dataw1, [dstin] + str dataw2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) ret -L(fp_lt4): - cbz pos, L(fp_lt2) - /* 2->3 bytes to copy. */ -#ifdef __AARCH64EB__ - lsr data1, data1, #48 -#endif - strh data1w, [dstin] - /* Fall-through, one byte (max) to go. */ -L(fp_lt2): - /* Null-terminated string. Last character must be zero! */ - strb wzr, [dst] -#ifdef BUILD_STPCPY - mov dstin, dst -#endif - ret - - .p2align 6 - /* Aligning here ensures that the entry code and main loop all lies - within one 64-byte cache line. */ -L(bulk_entry): - sub to_align, to_align, #16 - stp data1, data2, [dstin] - sub src, srcin, to_align - sub dst, dstin, to_align - b L(entry_no_page_cross) - - /* The inner loop deals with two Dwords at a time. 
This has a - slightly higher start-up cost, but we should win quite quickly, - especially on cores with a high number of issue slots per - cycle, as we get much better parallelism out of the operations. */ -L(main_loop): - stp data1, data2, [dst], #16 -L(entry_no_page_cross): - ldp data1, data2, [src], #16 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bics has_nul2, tmp3, tmp4 - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq L(main_loop) - /* Since we know we are copying at least 16 bytes, the fastest way - to deal with the tail is to determine the location of the - trailing NUL, then (re)copy the 16 bytes leading up to that. */ - cmp has_nul1, #0 -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul directly. The - easiest way to get the correct byte is to byte-swap the data - and calculate the syndrome a second time. */ - csel data1, data1, data2, ne - rev data1, data1 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bic has_nul1, tmp1, tmp2 -#else - csel has_nul1, has_nul1, has_nul2, ne -#endif - rev has_nul1, has_nul1 - clz pos, has_nul1 - add tmp1, pos, #72 - add pos, pos, #8 - csel pos, pos, tmp1, ne - add src, src, pos, lsr #3 - add dst, dst, pos, lsr #3 - ldp data1, data2, [src, #-32] - stp data1, data2, [dst, #-16] -#ifdef BUILD_STPCPY - sub dstin, dst, #1 -#endif +L(less4): + cbz len, L(zerobyte) + ldrh dataw1, [srcin] + strh dataw1, [dstin] +L(zerobyte): + strb wzr, [dstin, len] + IFSTPCPY (add result, dstin, len) ret -L(page_cross): - bic src, srcin, #15 - /* Start by loading two words at [srcin & ~15], then forcing the - bytes that precede srcin to 0xff. This means they never look - like termination bytes. */ - ldp data1, data2, [src] - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - tst to_align, #7 - csetm tmp2, ne -#ifdef __AARCH64EB__ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ + .p2align 4 +L(start_loop): + sub tmp, srcin, dstin + ldr dataq2, [srcin] + sub dst, src, tmp + str dataq2, [dstin] +L(loop): + str dataq, [dst], 32 + ldr dataq, [src, 16] + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loopend) + str dataq, [dst, -16] + ldr dataq, [src, 32]! + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(loop) + add dst, dst, 16 +L(loopend): + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ + fmov synd, dend + sub dst, dst, 31 +#ifndef __AARCH64EB__ + rbit synd, synd #endif - orr data1, data1, tmp2 - orr data2a, data2, tmp2 - cmp to_align, #8 - csinv data1, data1, xzr, lt - csel data2, data2, data2a, lt - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bics has_nul2, tmp3, tmp4 - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq L(page_cross_ok) - /* We now need to make data1 and data2 look like they've been - loaded directly from srcin. Do a rotate on the 128-bit value. */ - lsl tmp1, to_align, #3 /* Bytes->bits. 
*/ - neg tmp2, to_align, lsl #3 -#ifdef __AARCH64EB__ - lsl data1a, data1, tmp1 - lsr tmp4, data2, tmp2 - lsl data2, data2, tmp1 - orr tmp4, tmp4, data1a - cmp to_align, #8 - csel data1, tmp4, data2, lt - rev tmp2, data1 - rev tmp4, data2 - sub tmp1, tmp2, zeroones - orr tmp2, tmp2, #REP8_7f - sub tmp3, tmp4, zeroones - orr tmp4, tmp4, #REP8_7f -#else - lsr data1a, data1, tmp1 - lsl tmp4, data2, tmp2 - lsr data2, data2, tmp1 - orr tmp4, tmp4, data1a - cmp to_align, #8 - csel data1, tmp4, data2, lt - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f -#endif - bic has_nul1, tmp1, tmp2 - cbnz has_nul1, L(fp_le8) - bic has_nul2, tmp3, tmp4 - b L(fp_gt8) + clz len, synd + lsr len, len, 2 + add dst, dst, len + ldr dataq, [dst, tmp] + str dataq, [dst] + IFSTPCPY (add result, dst, 15) + ret END (STRCPY) - diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S index 7cf41d5c1eac995332ae42bbaf962116eb32457d..77235797f7c54fe5af374120f76362148b11ce0f 100644 --- a/string/aarch64/strlen-mte.S +++ b/string/aarch64/strlen-mte.S @@ -1,8 +1,8 @@ /* * strlen - calculate the length of a string. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define result x0 @@ -19,35 +19,26 @@ #define src x1 #define synd x2 #define tmp x3 -#define wtmp w3 #define shift x4 #define data q0 #define vdata v0 #define vhas_nul v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 +#define vend v2 +#define dend d2 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + Process the string in 16-byte aligned chunks. Compute a 64-bit mask with + four bits per byte using the shrn instruction. A count trailing zeros then + identifies the first zero byte. */ ENTRY (__strlen_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 - mov wtmp, 0xf00f ld1 {vdata.16b}, [src] - dup vrepmask.8h, wtmp cmeq vhas_nul.16b, vdata.16b, 0 lsl shift, srcin, 2 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(loop) @@ -59,19 +50,25 @@ ENTRY (__strlen_aarch64_mte) .p2align 5 L(loop): - ldr data, [src, 16]! + ldr data, [src, 16] + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loop_end) + ldr data, [src, 32]! 
cmeq vhas_nul.16b, vdata.16b, 0 umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop) - - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + sub src, src, 16 +L(loop_end): + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ sub result, src, srcin fmov synd, dend #ifndef __AARCH64EB__ rbit synd, synd #endif + add result, result, 16 clz tmp, synd add result, result, tmp, lsr 2 ret diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S index 2392493f1a3c4c79b67f790bfa064766253e55e7..12ebbdba5c93ae99a195dd8abef828c2b7804982 100644 --- a/string/aarch64/strlen-sve.S +++ b/string/aarch64/strlen-sve.S @@ -1,11 +1,11 @@ /* * __strlen_aarch64_sve - compute the length of a string * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S index a1b164a49238243419c89a365dd6757f9e9be7cd..6f6f08f636b248abc9c9b2e847545588efca281b 100644 --- a/string/aarch64/strlen.S +++ b/string/aarch64/strlen.S @@ -1,8 +1,8 @@ /* * strlen - calculate the length of a string. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Not MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define len x0 @@ -36,6 +36,7 @@ #define tmp x2 #define tmpw w2 #define synd x3 +#define syndw w3 #define shift x4 /* For the first 32 bytes, NUL detection works on the principle that @@ -110,7 +111,6 @@ ENTRY (__strlen_aarch64) add len, len, tmp1, lsr 3 ret - .p2align 3 /* Look for a NUL byte at offset 16..31 in the string. */ L(bytes16_31): ldp data1, data2, [srcin, 16] @@ -138,6 +138,7 @@ L(bytes16_31): add len, len, tmp1, lsr 3 ret + nop L(loop_entry): bic src, srcin, 31 @@ -153,18 +154,12 @@ L(loop): /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */ cmeq maskv.16b, datav1.16b, 0 sub len, src, srcin - tst synd, 0xffffffff - b.ne 1f + cbnz syndw, 1f cmeq maskv.16b, datav2.16b, 0 add len, len, 16 1: /* Generate a bitmask and compute correct byte offset. */ -#ifdef __AARCH64EB__ - bic maskv.8h, 0xf0 -#else - bic maskv.8h, 0x0f, lsl 8 -#endif - umaxp maskv.16b, maskv.16b, maskv.16b + shrn maskv.8b, maskv.8h, 4 fmov synd, maskd #ifndef __AARCH64EB__ rbit synd, synd @@ -173,8 +168,6 @@ L(loop): add len, len, tmp, lsr 2 ret - .p2align 4 - L(page_cross): bic src, srcin, 31 mov tmpw, 0x0c03 diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S deleted file mode 100644 index c9d6fc8a158beca38419a6ccf82cd8573394f7b6..0000000000000000000000000000000000000000 --- a/string/aarch64/strncmp-mte.S +++ /dev/null @@ -1,307 +0,0 @@ -/* - * strncmp - compare two strings - * - * Copyright (c) 2013-2021, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -/* Assumptions: - * - * ARMv8-a, AArch64 - */ - -#include "../asmdefs.h" - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f - -/* Parameters and result. */ -#define src1 x0 -#define src2 x1 -#define limit x2 -#define result x0 - -/* Internal variables. 
*/ -#define data1 x3 -#define data1w w3 -#define data2 x4 -#define data2w w4 -#define has_nul x5 -#define diff x6 -#define syndrome x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define zeroones x11 -#define pos x12 -#define mask x13 -#define endloop x14 -#define count mask -#define offset pos -#define neg_offset x15 - -/* Define endian dependent shift operations. - On big-endian early bytes are at MSB and on little-endian LSB. - LS_FW means shifting towards early bytes. - LS_BK means shifting towards later bytes. - */ -#ifdef __AARCH64EB__ -#define LS_FW lsl -#define LS_BK lsr -#else -#define LS_FW lsr -#define LS_BK lsl -#endif - -ENTRY (__strncmp_aarch64_mte) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - cbz limit, L(ret0) - eor tmp1, src1, src2 - mov zeroones, #REP8_01 - tst tmp1, #7 - and count, src1, #7 - b.ne L(misaligned8) - cbnz count, L(mutual_align) - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - .p2align 4 -L(loop_aligned): - ldr data1, [src1], #8 - ldr data2, [src2], #8 -L(start_realigned): - subs limit, limit, #8 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, hi /* Last Dword or differences. */ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - ccmp endloop, #0, #0, eq - b.eq L(loop_aligned) - /* End of main loop */ - -L(full_check): -#ifndef __AARCH64EB__ - orr syndrome, diff, has_nul - add limit, limit, 8 /* Rewind limit to before last subs. */ -L(syndrome_check): - /* Limit was reached. Check if the NUL byte or the difference - is before the limit. */ - rev syndrome, syndrome - rev data1, data1 - clz pos, syndrome - rev data2, data2 - lsl data1, data1, pos - cmp limit, pos, lsr #3 - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - csel result, result, xzr, hi - ret -#else - /* Not reached the limit, must have found the end or a diff. */ - tbz limit, #63, L(not_limit) - add tmp1, limit, 8 - cbz limit, L(not_limit) - - lsl limit, tmp1, #3 /* Bits -> bytes. */ - mov mask, #~0 - lsr mask, mask, limit - bic data1, data1, mask - bic data2, data2, mask - - /* Make sure that the NUL byte is marked in the syndrome. */ - orr has_nul, has_nul, mask - -L(not_limit): - /* For big-endian we cannot use the trick with the syndrome value - as carry-propagation can corrupt the upper bits if the trailing - bytes in the string contain 0x01. */ - /* However, if there is no NUL byte in the dword, we can generate - the result directly. We can't just subtract the bytes as the - MSB might be significant. */ - cbnz has_nul, 1f - cmp data1, data2 - cset result, ne - cneg result, result, lo - ret -1: - /* Re-compute the NUL-byte detection, using a byte-reversed value. */ - rev tmp3, data1 - sub tmp1, tmp3, zeroones - orr tmp2, tmp3, #REP8_7f - bic has_nul, tmp1, tmp2 - rev has_nul, has_nul - orr syndrome, diff, has_nul - clz pos, syndrome - /* The most-significant-non-zero bit of the syndrome marks either the - first bit that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. 
*/ -L(end_quick): - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret -#endif - -L(mutual_align): - /* Sources are mutually aligned, but are not currently at an - alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. - We also need to adjust the limit calculations, but without - overflowing if the limit is near ULONG_MAX. */ - bic src1, src1, #7 - bic src2, src2, #7 - ldr data1, [src1], #8 - neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ - ldr data2, [src2], #8 - mov tmp2, #~0 - LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ - /* Adjust the limit and ensure it doesn't overflow. */ - adds limit, limit, count - csinv limit, limit, xzr, lo - orr data1, data1, tmp2 - orr data2, data2, tmp2 - b L(start_realigned) - - .p2align 4 - /* Don't bother with dwords for up to 16 bytes. */ -L(misaligned8): - cmp limit, #16 - b.hs L(try_misaligned_words) - -L(byte_loop): - /* Perhaps we can do better than this. */ - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs limit, limit, #1 - ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq L(byte_loop) -L(done): - sub result, data1, data2 - ret - /* Align the SRC1 to a dword by doing a bytewise compare and then do - the dword loop. */ -L(try_misaligned_words): - cbz count, L(src1_aligned) - - neg count, count - and count, count, #7 - sub limit, limit, count - -L(page_end_loop): - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - cmp data1w, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.ne L(done) - subs count, count, #1 - b.hi L(page_end_loop) - - /* The following diagram explains the comparison of misaligned strings. - The bytes are shown in natural order. For little-endian, it is - reversed in the registers. The "x" bytes are before the string. - The "|" separates data that is loaded at one time. - src1 | a a a a a a a a | b b b c c c c c | . . . - src2 | x x x x x a a a a a a a a b b b | c c c c c . . . - - After shifting in each step, the data looks like this: - STEP_A STEP_B STEP_C - data1 a a a a a a a a b b b c c c c c b b b c c c c c - data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c - - The bytes with "0" are eliminated from the syndrome via mask. - - Align SRC2 down to 16 bytes. This way we can read 16 bytes at a - time from SRC2. The comparison happens in 3 steps. After each step - the loop can exit, or read from SRC1 or SRC2. */ -L(src1_aligned): - /* Calculate offset from 8 byte alignment to string start in bits. No - need to mask offset since shifts are ignoring upper bits. */ - lsl offset, src2, #3 - bic src2, src2, #0xf - mov mask, -1 - neg neg_offset, offset - ldr data1, [src1], #8 - ldp tmp1, tmp2, [src2], #16 - LS_BK mask, mask, neg_offset - and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ - /* Skip the first compare if data in tmp1 is irrelevant. */ - tbnz offset, 6, L(misaligned_mid_loop) - -L(loop_misaligned): - /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ - LS_FW data2, tmp1, offset - LS_BK tmp1, tmp2, neg_offset - subs limit, limit, #8 - orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ - sub has_nul, data1, zeroones - eor diff, data1, data2 /* Non-zero if differences found. 
*/ - orr tmp3, data1, #REP8_7f - csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ - bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ - orr tmp3, endloop, has_nul - cbnz tmp3, L(full_check) - - ldr data1, [src1], #8 -L(misaligned_mid_loop): - /* STEP_B: Compare first part of data1 to second part of tmp2. */ - LS_FW data2, tmp2, offset -#ifdef __AARCH64EB__ - /* For big-endian we do a byte reverse to avoid carry-propagation - problem described above. This way we can reuse the has_nul in the - next step and also use syndrome value trick at the end. */ - rev tmp3, data1 - #define data1_fixed tmp3 -#else - #define data1_fixed data1 -#endif - sub has_nul, data1_fixed, zeroones - orr tmp3, data1_fixed, #REP8_7f - eor diff, data2, data1 /* Non-zero if differences found. */ - bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ -#ifdef __AARCH64EB__ - rev has_nul, has_nul -#endif - cmp limit, neg_offset, lsr #3 - orr syndrome, diff, has_nul - bic syndrome, syndrome, mask /* Ignore later bytes. */ - csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ - cbnz tmp3, L(syndrome_check) - - /* STEP_C: Compare second part of data1 to first part of tmp1. */ - ldp tmp1, tmp2, [src2], #16 - cmp limit, #8 - LS_BK data2, tmp1, neg_offset - eor diff, data2, data1 /* Non-zero if differences found. */ - orr syndrome, diff, has_nul - and syndrome, syndrome, mask /* Ignore earlier bytes. */ - csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ - cbnz tmp3, L(syndrome_check) - - ldr data1, [src1], #8 - sub limit, limit, #8 - b L(loop_misaligned) - -#ifdef __AARCH64EB__ -L(syndrome_check): - clz pos, syndrome - cmp pos, limit, lsl #3 - b.lo L(end_quick) -#endif - -L(ret0): - mov result, #0 - ret -END(__strncmp_aarch64_mte) - diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S index 234190e245b0ba30f6257fad70b9fcbc4ce767cd..6a9e9f7b6437fdab851d5a4a4651b3f4922bf06b 100644 --- a/string/aarch64/strncmp-sve.S +++ b/string/aarch64/strncmp-sve.S @@ -1,11 +1,11 @@ /* * strncmp - compare two strings with limit * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S index 738b6539cab647129d801a21bb7b88876b37c070..128a10c52bb175436312c6326030c4d34cc4190f 100644 --- a/string/aarch64/strncmp.S +++ b/string/aarch64/strncmp.S @@ -1,20 +1,20 @@ /* * strncmp - compare two strings * - * Copyright (c) 2013-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2013-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64. + * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 /* Parameters and result. */ #define src1 x0 @@ -35,10 +35,24 @@ #define tmp3 x10 #define zeroones x11 #define pos x12 -#define limit_wd x13 -#define mask x14 -#define endloop x15 +#define mask x13 +#define endloop x14 #define count mask +#define offset pos +#define neg_offset x15 + +/* Define endian dependent shift operations. + On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. + LS_BK means shifting towards later bytes. 
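These two macros are what let the misaligned loop further down rebuild an unaligned 8-byte view of SRC2 from two aligned loads. A little-endian sketch, where off is the bit offset of the string start inside the 16-byte window and 0 < off < 64:

    // STEP_A of L(loop_misaligned): on little-endian LS_FW is '>>' and
    // LS_BK is '<<', so 8 unaligned bytes come from two aligned words.
    uint64_t data2 = (tmp1 >> off) | (tmp2 << (64 - off));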
+ */ +#ifdef __AARCH64EB__ +#define LS_FW lsl +#define LS_BK lsr +#else +#define LS_FW lsr +#define LS_BK lsl +#endif ENTRY (__strncmp_aarch64) PTR_ARG (0) @@ -51,9 +65,6 @@ ENTRY (__strncmp_aarch64) and count, src1, #7 b.ne L(misaligned8) cbnz count, L(mutual_align) - /* Calculate the number of full and partial words -1. */ - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ - lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and @@ -63,56 +74,52 @@ L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 L(start_realigned): - subs limit_wd, limit_wd, #1 + subs limit, limit, #8 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, pl /* Last Dword or differences. */ + csinv endloop, diff, xzr, hi /* Last Dword or differences. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq b.eq L(loop_aligned) /* End of main loop */ - /* Not reached the limit, must have found the end or a diff. */ - tbz limit_wd, #63, L(not_limit) - - /* Limit % 8 == 0 => all bytes significant. */ - ands limit, limit, #7 - b.eq L(not_limit) - - lsl limit, limit, #3 /* Bits -> bytes. */ - mov mask, #~0 -#ifdef __AARCH64EB__ - lsr mask, mask, limit -#else - lsl mask, mask, limit -#endif - bic data1, data1, mask - bic data2, data2, mask - - /* Make sure that the NUL byte is marked in the syndrome. */ - orr has_nul, has_nul, mask - -L(not_limit): +L(full_check): +#ifndef __AARCH64EB__ orr syndrome, diff, has_nul - -#ifndef __AARCH64EB__ + add limit, limit, 8 /* Rewind limit to before last subs. */ +L(syndrome_check): + /* Limit was reached. Check if the NUL byte or the difference + is before the limit. */ rev syndrome, syndrome rev data1, data1 - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ clz pos, syndrome rev data2, data2 lsl data1, data1, pos + cmp limit, pos, lsr #3 lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. */ lsr data1, data1, #56 sub result, data1, data2, lsr #56 + csel result, result, xzr, hi ret #else + /* Not reached the limit, must have found the end or a diff. */ + tbz limit, #63, L(not_limit) + add tmp1, limit, 8 + cbz limit, L(not_limit) + + lsl limit, tmp1, #3 /* Bits -> bytes. */ + mov mask, #~0 + lsr mask, mask, limit + bic data1, data1, mask + bic data2, data2, mask + + /* Make sure that the NUL byte is marked in the syndrome. */ + orr has_nul, has_nul, mask + +L(not_limit): /* For big-endian we cannot use the trick with the syndrome value as carry-propagation can corrupt the upper bits if the trailing bytes in the string contain 0x01. */ @@ -133,10 +140,11 @@ L(not_limit): rev has_nul, has_nul orr syndrome, diff, has_nul clz pos, syndrome - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. Shifting left now will bring the critical information into the top bits. 
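The limit gating in L(syndrome_check) above reduces to one comparison in C (little-endian sketch; pos is the leading-zero count of the byte-reversed syndrome, so a bit index):

    // Report the mismatch or NUL only if it lies before the limit.
    return (limit > (uint64_t)(pos >> 3)) ? result : 0;   // csel ..., hi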
*/ +L(end_quick): lsl data1, data1, pos lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then @@ -158,22 +166,12 @@ L(mutual_align): neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ ldr data2, [src2], #8 mov tmp2, #~0 - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */ -#endif - and tmp3, limit_wd, #7 - lsr limit_wd, limit_wd, #3 - /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ - add limit, limit, count - add tmp3, tmp3, count + LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ + /* Adjust the limit and ensure it doesn't overflow. */ + adds limit, limit, count + csinv limit, limit, xzr, lo orr data1, data1, tmp2 orr data2, data2, tmp2 - add limit_wd, limit_wd, tmp3, lsr #3 b L(start_realigned) .p2align 4 @@ -196,13 +194,11 @@ L(done): /* Align the SRC1 to a dword by doing a bytewise compare and then do the dword loop. */ L(try_misaligned_words): - lsr limit_wd, limit, #3 - cbz count, L(do_misaligned) + cbz count, L(src1_aligned) neg count, count and count, count, #7 sub limit, limit, count - lsr limit_wd, limit, #3 L(page_end_loop): ldrb data1w, [src1], #1 @@ -213,48 +209,100 @@ L(page_end_loop): subs count, count, #1 b.hi L(page_end_loop) -L(do_misaligned): - /* Prepare ourselves for the next page crossing. Unlike the aligned - loop, we fetch 1 less dword because we risk crossing bounds on - SRC2. */ - mov count, #8 - subs limit_wd, limit_wd, #1 - b.lo L(done_loop) -L(loop_misaligned): - and tmp2, src2, #0xff8 - eor tmp2, tmp2, #0xff8 - cbz tmp2, L(page_end_loop) + /* The following diagram explains the comparison of misaligned strings. + The bytes are shown in natural order. For little-endian, it is + reversed in the registers. The "x" bytes are before the string. + The "|" separates data that is loaded at one time. + src1 | a a a a a a a a | b b b c c c c c | . . . + src2 | x x x x x a a a a a a a a b b b | c c c c c . . . + + After shifting in each step, the data looks like this: + STEP_A STEP_B STEP_C + data1 a a a a a a a a b b b c c c c c b b b c c c c c + data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c + The bytes with "0" are eliminated from the syndrome via mask. + + Align SRC2 down to 16 bytes. This way we can read 16 bytes at a + time from SRC2. The comparison happens in 3 steps. After each step + the loop can exit, or read from SRC1 or SRC2. */ +L(src1_aligned): + /* Calculate offset from 8 byte alignment to string start in bits. No + need to mask offset since shifts are ignoring upper bits. */ + lsl offset, src2, #3 + bic src2, src2, #0xf + mov mask, -1 + neg neg_offset, offset ldr data1, [src1], #8 - ldr data2, [src2], #8 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - ccmp diff, #0, #0, eq - b.ne L(not_limit) - subs limit_wd, limit_wd, #1 - b.pl L(loop_misaligned) + ldp tmp1, tmp2, [src2], #16 + LS_BK mask, mask, neg_offset + and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ + /* Skip the first compare if data in tmp1 is irrelevant. */ + tbnz offset, 6, L(misaligned_mid_loop) -L(done_loop): - /* We found a difference or a NULL before the limit was reached. 
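Note how the new mutual-align entry widens the limit with an adds/csinv pair; in C this is simply a saturating add (sketch, GCC/Clang builtins assumed):

    // Add the alignment slack without wrapping past SIZE_MAX
    // (adds sets C on unsigned overflow; csinv then supplies ~0).
    if (__builtin_add_overflow(limit, count, &limit))
        limit = SIZE_MAX;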
*/ - and limit, limit, #7 - cbz limit, L(not_limit) - /* Read the last word. */ - sub src1, src1, 8 - sub src2, src2, 8 - ldr data1, [src1, limit] - ldr data2, [src2, limit] - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f +L(loop_misaligned): + /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ + LS_FW data2, tmp1, offset + LS_BK tmp1, tmp2, neg_offset + subs limit, limit, #8 + orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ + sub has_nul, data1, zeroones eor diff, data1, data2 /* Non-zero if differences found. */ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - ccmp diff, #0, #0, eq - b.ne L(not_limit) + orr tmp3, data1, #REP8_7f + csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ + orr tmp3, endloop, has_nul + cbnz tmp3, L(full_check) + + ldr data1, [src1], #8 +L(misaligned_mid_loop): + /* STEP_B: Compare first part of data1 to second part of tmp2. */ + LS_FW data2, tmp2, offset +#ifdef __AARCH64EB__ + /* For big-endian we do a byte reverse to avoid carry-propagation + problem described above. This way we can reuse the has_nul in the + next step and also use syndrome value trick at the end. */ + rev tmp3, data1 + #define data1_fixed tmp3 +#else + #define data1_fixed data1 +#endif + sub has_nul, data1_fixed, zeroones + orr tmp3, data1_fixed, #REP8_7f + eor diff, data2, data1 /* Non-zero if differences found. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + cmp limit, neg_offset, lsr #3 + orr syndrome, diff, has_nul + bic syndrome, syndrome, mask /* Ignore later bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + /* STEP_C: Compare second part of data1 to first part of tmp1. */ + ldp tmp1, tmp2, [src2], #16 + cmp limit, #8 + LS_BK data2, tmp1, neg_offset + eor diff, data2, data1 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + and syndrome, syndrome, mask /* Ignore earlier bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + ldr data1, [src1], #8 + sub limit, limit, #8 + b L(loop_misaligned) + +#ifdef __AARCH64EB__ +L(syndrome_check): + clz pos, syndrome + cmp pos, limit, lsl #3 + b.lo L(end_quick) +#endif L(ret0): mov result, #0 ret - -END ( __strncmp_aarch64) +END(__strncmp_aarch64) diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S index 5b9ebf7763bc2491011641702eac4dbc32f45482..6c43dc427da7a9279ed400a6186bbd162cc10148 100644 --- a/string/aarch64/strnlen-sve.S +++ b/string/aarch64/strnlen-sve.S @@ -1,11 +1,11 @@ /* * strnlen - calculate the length of a string with limit. * - * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S index 48d2495d2082be8318c88148eb21d00ee6f0b421..f2090a7485a5646dec85bfb0b4fce421471adb13 100644 --- a/string/aarch64/strnlen.S +++ b/string/aarch64/strnlen.S @@ -1,8 +1,8 @@ /* * strnlen - calculate the length of a string with limit. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define cntin x1 @@ -20,39 +20,30 @@ #define src x2 #define synd x3 #define shift x4 -#define wtmp w4 #define tmp x4 #define cntrem x5 #define qdata q0 #define vdata v0 #define vhas_chr v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 +#define vend v2 +#define dend d2 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + Process the string in 16-byte aligned chunks. Compute a 64-bit mask with + four bits per byte using the shrn instruction. A count trailing zeros then + identifies the first zero byte. */ ENTRY (__strnlen_aarch64) PTR_ARG (0) SIZE_ARG (1) bic src, srcin, 15 - mov wtmp, 0xf00f cbz cntin, L(nomatch) - ld1 {vdata.16b}, [src], 16 - dup vrepmask.8h, wtmp + ld1 {vdata.16b}, [src] cmeq vhas_chr.16b, vdata.16b, 0 lsl shift, srcin, 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(start_loop) @@ -64,37 +55,40 @@ L(finish): csel result, cntin, result, ls ret +L(nomatch): + mov result, cntin + ret + L(start_loop): sub tmp, src, srcin + add tmp, tmp, 17 subs cntrem, cntin, tmp - b.ls L(nomatch) + b.lo L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - add tmp, cntrem, 15 - tbnz tmp, 4, L(loop32_2) - + tbz cntrem, 4, L(loop32_2) + sub src, src, 16 .p2align 5 L(loop32): - ldr qdata, [src], 16 + ldr qdata, [src, 32]! cmeq vhas_chr.16b, vdata.16b, 0 umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src], 16 + ldr qdata, [src, 16] subs cntrem, cntrem, 32 cmeq vhas_chr.16b, vdata.16b, 0 - b.ls L(end) + b.lo L(end_2) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) - +L(end_2): + add src, src, 16 L(end): - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ - sub src, src, 16 - mov synd, vend.d[0] + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ sub result, src, srcin + fmov synd, dend #ifndef __AARCH64EB__ rbit synd, synd #endif @@ -104,9 +98,5 @@ L(end): csel result, cntin, result, ls ret -L(nomatch): - mov result, cntin - ret - END (__strnlen_aarch64) diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S index 1e4fb1a68f7e8bc21a65f5925194f5d188d01e7c..bb61ab9ad4e7c5d5966daa950d7ef2c2dec4726d 100644 --- a/string/aarch64/strrchr-mte.S +++ b/string/aarch64/strrchr-mte.S @@ -1,8 +1,8 @@ /* * strrchr - find last position of a character in a string. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. 
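[The rewritten strnlen above replaces the bitmask/addp reduction with a single shrn, exactly as its new "Core algorithm" comment states: the 16 comparison bytes narrow into a 64-bit syndrome with four bits per byte, and counting trailing zeros locates the first NUL. A scalar C model of that syndrome, for illustration only (first_nul_index is a made-up name; __builtin_ctzll assumes a GNU-compatible compiler, which the rest of this codebase already does):

#include <stdint.h>

static int
first_nul_index (const unsigned char chunk[16])
{
  uint64_t synd = 0;
  /* Each zero byte contributes a 0xf nibble, mirroring what the
     NEON cmeq + shrn pair produces in one step.  */
  for (int i = 0; i < 16; i++)
    if (chunk[i] == 0)
      synd |= 0xfULL << (4 * i);
  if (synd == 0)
    return -1;                          /* no NUL in this chunk */
  return __builtin_ctzll (synd) / 4;    /* 4 syndrome bits per byte */
}
]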
*/ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -19,7 +19,6 @@ #define src x2 #define tmp x3 -#define wtmp w3 #define synd x3 #define shift x4 #define src_match x4 @@ -31,7 +30,6 @@ #define vhas_nul v2 #define vhas_chr v3 #define vrepmask v4 -#define vrepmask2 v5 #define vend v5 #define dend d5 @@ -47,55 +45,67 @@ ENTRY (__strrchr_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin - mov wtmp, 0x3003 - dup vrepmask.8h, wtmp - tst srcin, 15 - beq L(loop1) - - ld1 {vdata.16b}, [src], 16 + movi vrepmask.16b, 0x33 + ld1 {vdata.16b}, [src] cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - mov wtmp, 0xf00f - dup vrepmask2.8h, wtmp bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 lsl shift, srcin, 2 fmov synd, dend lsr synd, synd, shift lsl synd, synd, shift ands nul_match, synd, 0xcccccccccccccccc bne L(tail) - cbnz synd, L(loop2) + cbnz synd, L(loop2_start) - .p2align 5 + .p2align 4 L(loop1): - ld1 {vdata.16b}, [src], 16 + ldr q1, [src, 16] + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loop1_end) + ldr q1, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop1) - + sub src, src, 16 +L(loop1_end): + add src, src, 16 cmeq vhas_nul.16b, vdata.16b, 0 +#ifdef __AARCH64EB__ + bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + rbit synd, synd +#else bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - bic vhas_nul.8h, 0x0f, lsl 8 - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 fmov synd, dend +#endif ands nul_match, synd, 0xcccccccccccccccc - beq L(loop2) - + beq L(loop2_start) L(tail): sub nul_match, nul_match, 1 and chr_match, synd, 0x3333333333333333 ands chr_match, chr_match, nul_match - sub result, src, 1 + add result, src, 15 clz tmp, chr_match sub result, result, tmp, lsr 2 csel result, result, xzr, ne ret .p2align 4 + nop + nop +L(loop2_start): + add src, src, 16 + bic vrepmask.8h, 0xf0 + L(loop2): cmp synd, 0 csel src_match, src, src_match, ne diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S index d36d69af37fd71a23f656ae0c5bc87f719bd3073..825a7384cfc11831455e5544408e9d5faf8ce57f 100644 --- a/string/aarch64/strrchr-sve.S +++ b/string/aarch64/strrchr-sve.S @@ -1,11 +1,11 @@ /* * strrchr - find the last of a character in a string * - * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S index 56185ff534e3915d3ada2c025b2943489b9b2d7b..bf9cb297b6cb3f4bc539594edb7e4a6cddc96f20 100644 --- a/string/aarch64/strrchr.S +++ b/string/aarch64/strrchr.S @@ -1,8 +1,8 @@ /* * strrchr - find last position of a character in a string. * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. 
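[In the strrchr-mte hunks above, the 0x33 repmask interleaves two events into one syndrome: within each 4-bit nibble the low bit pair flags a character match and the high pair flags a NUL (hence the 0x3333... and 0xcccc... masks). That turns "last match before the terminator" into a pure bit trick, modeled here in C for illustration (last_match_before_nul is a made-up name):

#include <stdint.h>

static int
last_match_before_nul (uint64_t synd)
{
  uint64_t nul_match = synd & 0xccccccccccccccccULL;  /* NUL bits  */
  uint64_t chr_match = synd & 0x3333333333333333ULL;  /* char bits */
  /* Subtracting 1 from the lowest NUL bit sets every bit below it,
     i.e. a mask covering all bytes before the terminator.  */
  if (nul_match)
    chr_match &= nul_match - 1;
  if (chr_match == 0)
    return -1;
  /* The highest surviving bit, divided by 4, is the byte index of
     the last match within the 16-byte chunk.  */
  return (63 - __builtin_clzll (chr_match)) / 4;
}
]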
*/ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c index d5d4ea7e0309a0a9e00dca54048cbb8dc7bb4c00..e070be586b528dc57d40f709e93ad2e10c34f053 100644 --- a/string/bench/memcpy.c +++ b/string/bench/memcpy.c @@ -1,8 +1,8 @@ /* * memcpy benchmark. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define _GNU_SOURCE @@ -13,14 +13,15 @@ #include "stringlib.h" #include "benchlib.h" -#define ITERS 5000 +#define ITERS 5000 #define ITERS2 20000000 -#define ITERS3 500000 -#define MAX_COPIES 8192 -#define SIZE (256*1024) +#define ITERS3 200000 +#define NUM_TESTS 16384 +#define MIN_SIZE 32768 +#define MAX_SIZE (1024 * 1024) -static uint8_t a[SIZE + 4096] __attribute__((__aligned__(64))); -static uint8_t b[SIZE + 4096] __attribute__((__aligned__(64))); +static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64))); +static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64))); #define F(x) {#x, x}, @@ -30,15 +31,21 @@ static const struct fun void *(*fun)(void *, const void *, size_t); } funtab[] = { - F(memcpy) #if __aarch64__ F(__memcpy_aarch64) # if __ARM_NEON F(__memcpy_aarch64_simd) # endif +# if __ARM_FEATURE_SVE + F(__memcpy_aarch64_sve) +# endif +# if WANT_MOPS + F(__memcpy_aarch64_mops) +# endif #elif __arm__ F(__memcpy_arm) #endif + F(memcpy) #undef F {0, 0} }; @@ -109,7 +116,7 @@ typedef struct uint64_t len : 16; } copy_t; -static copy_t copy[MAX_COPIES]; +static copy_t test_arr[NUM_TESTS]; typedef char *(*proto_t) (char *, const char *, size_t); @@ -140,14 +147,14 @@ init_copies (size_t max_size) size_t total = 0; /* Create a random set of copies with the given size and alignment distributions. 
*/ - for (int i = 0; i < MAX_COPIES; i++) + for (int i = 0; i < NUM_TESTS; i++) { - copy[i].dst = (rand32 (0) & (max_size - 1)); - copy[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK]; - copy[i].src = (rand32 (0) & (max_size - 1)); - copy[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK]; - copy[i].len = size_arr[rand32 (0) & SIZE_MASK]; - total += copy[i].len; + test_arr[i].dst = (rand32 (0) & (max_size - 1)); + test_arr[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK]; + test_arr[i].src = (rand32 (0) & (max_size - 1)); + test_arr[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK]; + test_arr[i].len = size_arr[rand32 (0) & SIZE_MASK]; + total += test_arr[i].len; } return total; @@ -160,25 +167,27 @@ int main (void) memset (a, 1, sizeof (a)); memset (b, 2, sizeof (b)); - printf("Random memcpy:\n"); + printf("Random memcpy (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { size_t total = 0; uint64_t tsum = 0; - printf ("%22s (B/ns) ", funtab[f].name); + printf ("%22s ", funtab[f].name); rand32 (0x12345678); - for (int size = 16384; size <= SIZE; size *= 2) + for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) { size_t copy_size = init_copies (size) * ITERS; - for (int c = 0; c < MAX_COPIES; c++) - funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len); + for (int c = 0; c < NUM_TESTS; c++) + funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src, + test_arr[c].len); uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS; i++) - for (int c = 0; c < MAX_COPIES; c++) - funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len); + for (int c = 0; c < NUM_TESTS; c++) + funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src, + test_arr[c].len); t = clock_get_ns () - t; total += copy_size; tsum += t; @@ -187,74 +196,147 @@ int main (void) printf( "avg %.2f\n", (double)total / tsum); } - printf ("\nMedium memcpy:\n"); + size_t total = 0; + uint64_t tsum = 0; + printf ("%22s ", "memcpy_call"); + rand32 (0x12345678); + + for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) + { + size_t copy_size = init_copies (size) * ITERS; + + for (int c = 0; c < NUM_TESTS; c++) + memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < NUM_TESTS; c++) + memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); + t = clock_get_ns () - t; + total += copy_size; + tsum += t; + printf ("%dK: %.2f ", size / 1024, (double)copy_size / t); + } + printf( "avg %.2f\n", (double)total / tsum); + + + printf ("\nAligned medium memcpy (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s (B/ns) ", funtab[f].name); + printf ("%22s ", funtab[f].name); - for (int size = 16; size <= 512; size *= 2) + for (int size = 8; size <= 512; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS2; i++) funtab[f].fun (b, a, size); t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 
'B' : 'K', (double)size * ITERS2 / t); + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); } printf ("\n"); } - printf ("\nLarge memcpy:\n"); + printf ("%22s ", "memcpy_call"); + for (int size = 8; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + memcpy (b, a, size); + t = clock_get_ns () - t; + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + } + printf ("\n"); + + + printf ("\nUnaligned medium memcpy (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s (B/ns) ", funtab[f].name); + printf ("%22s ", funtab[f].name); - for (int size = 1024; size <= 32768; size *= 2) + for (int size = 8; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + funtab[f].fun (b + 3, a + 1, size); + t = clock_get_ns () - t; + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + } + printf ("\n"); + } + + printf ("%22s ", "memcpy_call"); + for (int size = 8; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + memcpy (b + 3, a + 1, size); + t = clock_get_ns () - t; + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + } + printf ("\n"); + + + printf ("\nLarge memcpy (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s ", funtab[f].name); + + for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (b, a, size); t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); } - printf ("\nUnaligned forwards memmove:\n"); + printf ("%22s ", "memcpy_call"); + for (int size = 1024; size <= 65536; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + memcpy (b, a, size); + t = clock_get_ns () - t; + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); + } + printf ("\n"); + + + printf ("\nUnaligned forwards memmove (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s (B/ns) ", funtab[f].name); + printf ("%22s ", funtab[f].name); - for (int size = 1024; size <= 32768; size *= 2) + for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (a, a + 256 + (i & 31), size); t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); } - printf ("\nUnaligned backwards memmove:\n"); + printf ("\nUnaligned backwards memmove (bytes/ns):\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s (B/ns) ", funtab[f].name); + printf ("%22s ", funtab[f].name); - for (int size = 1024; size <= 32768; size *= 2) + for (int size = 1024; size <= 65536; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (a + 256 + (i & 31), a, size); t = clock_get_ns () - t; - printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, - size < 1024 ? 
'B' : 'K', (double)size * ITERS3 / t); + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); } printf ("\n"); } + printf ("\n"); return 0; } diff --git a/string/bench/memset.c b/string/bench/memset.c new file mode 100644 index 0000000000000000000000000000000000000000..990e23ba9a368bb28960d4211b1d3e3f4d96dee4 --- /dev/null +++ b/string/bench/memset.c @@ -0,0 +1,243 @@ +/* + * memset benchmark. + * + * Copyright (c) 2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include "stringlib.h" +#include "benchlib.h" + +#define ITERS 5000 +#define ITERS2 20000000 +#define ITERS3 1000000 +#define NUM_TESTS 16384 +#define MIN_SIZE 32768 +#define MAX_SIZE (1024 * 1024) + +static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(64))); + +#define F(x) {#x, x}, + +static const struct fun +{ + const char *name; + void *(*fun)(void *, int, size_t); +} funtab[] = +{ +#if __aarch64__ + F(__memset_aarch64) +#elif __arm__ + F(__memset_arm) +#endif + F(memset) +#undef F + {0, 0} +}; + +typedef struct { uint32_t offset : 20, len : 12; } memset_test_t; +static memset_test_t test_arr[NUM_TESTS]; + +typedef struct { uint16_t size; uint16_t freq; } freq_data_t; +typedef struct { uint8_t align; uint16_t freq; } align_data_t; + +#define SIZE_NUM 65536 +#define SIZE_MASK (SIZE_NUM-1) +static uint8_t len_arr[SIZE_NUM]; + +/* Frequency data for memset sizes up to 4096 based on SPEC2017. */ +static freq_data_t memset_len_freq[] = +{ +{40,28817}, {32,15336}, { 16,3823}, {296,3545}, { 24,3454}, { 8,1412}, +{292,1202}, { 48, 927}, { 12, 613}, { 11, 539}, {284, 493}, {108, 414}, +{ 88, 380}, { 20, 295}, {312, 271}, { 72, 233}, { 2, 200}, { 4, 192}, +{ 15, 180}, { 14, 174}, { 13, 160}, { 56, 151}, { 36, 144}, { 64, 140}, +{4095,133}, { 10, 130}, { 9, 124}, { 3, 124}, { 28, 120}, { 0, 118}, +{288, 110}, {1152, 96}, {104, 90}, { 1, 86}, {832, 76}, {248, 74}, +{1024, 69}, {120, 64}, {512, 63}, {384, 60}, { 6, 59}, { 80, 54}, +{ 17, 50}, { 7, 49}, {520, 47}, {2048, 39}, {256, 37}, {864, 33}, +{1440, 28}, { 22, 27}, {2056, 24}, {260, 23}, { 68, 23}, { 5, 22}, +{ 18, 21}, {200, 18}, {2120, 18}, { 60, 17}, { 52, 16}, {336, 15}, +{ 44, 13}, {192, 13}, {160, 12}, {2064, 12}, {128, 12}, { 76, 11}, +{164, 11}, {152, 10}, {136, 9}, {488, 7}, { 96, 6}, {560, 6}, +{1016, 6}, {112, 5}, {232, 5}, {168, 5}, {952, 5}, {184, 5}, +{144, 4}, {252, 4}, { 84, 3}, {960, 3}, {3808, 3}, {244, 3}, +{280, 3}, {224, 3}, {156, 3}, {1088, 3}, {440, 3}, {216, 2}, +{304, 2}, { 23, 2}, { 25, 2}, { 26, 2}, {264, 2}, {328, 2}, +{1096, 2}, {240, 2}, {1104, 2}, {704, 2}, {1664, 2}, {360, 2}, +{808, 1}, {544, 1}, {236, 1}, {720, 1}, {368, 1}, {424, 1}, +{640, 1}, {1112, 1}, {552, 1}, {272, 1}, {776, 1}, {376, 1}, +{ 92, 1}, {536, 1}, {824, 1}, {496, 1}, {760, 1}, {792, 1}, +{504, 1}, {344, 1}, {1816, 1}, {880, 1}, {176, 1}, {320, 1}, +{352, 1}, {2008, 1}, {208, 1}, {408, 1}, {228, 1}, {2072, 1}, +{568, 1}, {220, 1}, {616, 1}, {600, 1}, {392, 1}, {696, 1}, +{2144, 1}, {1280, 1}, {2136, 1}, {632, 1}, {584, 1}, {456, 1}, +{472, 1}, {3440, 1}, {2088, 1}, {680, 1}, {2928, 1}, {212, 1}, +{648, 1}, {1752, 1}, {664, 1}, {3512, 1}, {1032, 1}, {528, 1}, +{4072, 1}, {204, 1}, {2880, 1}, {3392, 1}, {712, 1}, { 59, 1}, +{736, 1}, {592, 1}, {2520, 1}, {744, 1}, {196, 1}, {172, 1}, +{728, 1}, {2040, 1}, {1192, 1}, {3600, 1}, {0, 0} +}; + +#define ALIGN_NUM 1024 +#define ALIGN_MASK (ALIGN_NUM-1) +static uint8_t align_arr[ALIGN_NUM]; + +/* Alignment data for 
memset based on SPEC2017. */ +static align_data_t memset_align_freq[] = +{ + {16, 338}, {8, 307}, {32, 148}, {64, 131}, {4, 72}, {1, 23}, {2, 5}, {0, 0} +}; + +static void +init_memset_distribution (void) +{ + int i, j, freq, size, n; + + for (n = i = 0; (freq = memset_len_freq[i].freq) != 0; i++) + for (j = 0, size = memset_len_freq[i].size; j < freq; j++) + len_arr[n++] = size; + assert (n == SIZE_NUM); + + for (n = i = 0; (freq = memset_align_freq[i].freq) != 0; i++) + for (j = 0, size = memset_align_freq[i].align; j < freq; j++) + align_arr[n++] = size - 1; + assert (n == ALIGN_NUM); +} + +static size_t +init_memset (size_t max_size) +{ + size_t total = 0; + /* Create a random set of memsets with the given size and alignment + distributions. */ + for (int i = 0; i < NUM_TESTS; i++) + { + test_arr[i].offset = (rand32 (0) & (max_size - 1)); + test_arr[i].offset &= ~align_arr[rand32 (0) & ALIGN_MASK]; + test_arr[i].len = len_arr[rand32 (0) & SIZE_MASK]; + total += test_arr[i].len; + } + + return total; +} + + +int main (void) +{ + init_memset_distribution (); + + memset (a, 1, sizeof (a)); + + printf("Random memset (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + size_t total_size = 0; + uint64_t tsum = 0; + printf ("%22s ", funtab[f].name); + rand32 (0x12345678); + + for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) + { + size_t memset_size = init_memset (size) * ITERS; + + for (int c = 0; c < NUM_TESTS; c++) + funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len); + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < NUM_TESTS; c++) + funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len); + t = clock_get_ns () - t; + total_size += memset_size; + tsum += t; + printf ("%dK: %.2f ", size / 1024, (double)memset_size / t); + } + printf( "avg %.2f\n", (double)total_size / tsum); + } + + size_t total_size = 0; + uint64_t tsum = 0; + printf ("%22s ", "memset_call"); + rand32 (0x12345678); + + for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) + { + size_t memset_size = init_memset (size) * ITERS; + + for (int c = 0; c < NUM_TESTS; c++) + memset (a + test_arr[c].offset, 0, test_arr[c].len); + + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS; i++) + for (int c = 0; c < NUM_TESTS; c++) + memset (a + test_arr[c].offset, 0, test_arr[c].len); + t = clock_get_ns () - t; + total_size += memset_size; + tsum += t; + printf ("%dK: %.2f ", size / 1024, (double)memset_size / t); + } + printf( "avg %.2f\n", (double)total_size / tsum); + + + printf ("\nMedium memset (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s ", funtab[f].name); + + for (int size = 8; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + funtab[f].fun (a, 0, size); + t = clock_get_ns () - t; + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + } + printf ("\n"); + } + + printf ("%22s ", "memset_call"); + for (int size = 8; size <= 512; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS2; i++) + memset (a, 0, size); + t = clock_get_ns () - t; + printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + } + + + printf ("\nLarge memset (bytes/ns):\n"); + for (int f = 0; funtab[f].name != 0; f++) + { + printf ("%22s ", funtab[f].name); + + for (int size = 1024; size <= 65536; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + funtab[f].fun (a, 0, size); + t = clock_get_ns () - t; + printf ("%dK: %.2f ", size / 1024, 
(double)size * ITERS3 / t); + } + printf ("\n"); + } + + printf ("%22s ", "memset_call"); + for (int size = 1024; size <= 65536; size *= 2) + { + uint64_t t = clock_get_ns (); + for (int i = 0; i < ITERS3; i++) + memset (a, 0, size); + t = clock_get_ns () - t; + printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); + } + printf ("\n\n"); + + return 0; +} diff --git a/string/bench/strlen.c b/string/bench/strlen.c index cc0f04bee5471a4c623e047f773bde10f0e8aac7..f05d0d5b89e6f1c689d38ea45d2feefb99bf5f82 100644 --- a/string/bench/strlen.c +++ b/string/bench/strlen.c @@ -1,8 +1,8 @@ /* * strlen benchmark. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define _GNU_SOURCE @@ -13,10 +13,10 @@ #include "stringlib.h" #include "benchlib.h" -#define ITERS 2000 +#define ITERS 5000 #define ITERS2 20000000 #define ITERS3 2000000 -#define NUM_STRLEN 16384 +#define NUM_TESTS 16384 #define MAX_ALIGN 32 #define MAX_STRLEN 256 @@ -49,7 +49,7 @@ static const struct fun }; #undef F -static uint16_t strlen_tests[NUM_STRLEN]; +static uint16_t strlen_tests[NUM_TESTS]; typedef struct { uint16_t size; uint16_t freq; } freq_data_t; typedef struct { uint8_t align; uint16_t freq; } align_data_t; @@ -117,7 +117,7 @@ init_strlen_tests (void) /* Create a random set of strlen input strings using the string length and alignment distributions. */ - for (int n = 0; n < NUM_STRLEN; n++) + for (int n = 0; n < NUM_TESTS; n++) { int align = strlen_align_arr[rand32 (0) & ALIGN_MASK]; int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK]; @@ -141,14 +141,14 @@ int main (void) size_t res = 0, strlen_size = 0, mask = maskv; printf ("%22s ", funtab[f].name); - for (int c = 0; c < NUM_STRLEN; c++) + for (int c = 0; c < NUM_TESTS; c++) strlen_size += funtab[f].fun (a + strlen_tests[c]); strlen_size *= ITERS; /* Measure latency of strlen result with (res & mask). */ uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS; i++) - for (int c = 0; c < NUM_STRLEN; c++) + for (int c = 0; c < NUM_TESTS; c++) res = funtab[f].fun (a + strlen_tests[c] + (res & mask)); t = clock_get_ns () - t; printf ("%.2f\n", (double)strlen_size / t); diff --git a/string/include/benchlib.h b/string/include/benchlib.h index 0f2ce2eb6bce2685432d4207f987f3896c4b8363..f1bbea388cd217981dbf6513a1c0a1fadbc894bc 100644 --- a/string/include/benchlib.h +++ b/string/include/benchlib.h @@ -2,7 +2,7 @@ * Benchmark support functions. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/include/stringlib.h b/string/include/stringlib.h index 378c3cd2d64590c05aa1cb80f6ba2559be017d2d..650c52cbda786613bbd5daf64a827903b54bb3ba 100644 --- a/string/include/stringlib.h +++ b/string/include/stringlib.h @@ -1,8 +1,8 @@ /* * Public API. * - * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. 
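[The SPEC2017 frequency tables in the new memset benchmark above, and the equivalent tables in the memcpy and strlen benchmarks, all feed the same sampling scheme: each (value, frequency) pair is expanded into that many slots of a power-of-two lookup table, after which a masked random number draws a value with the intended probability in O(1). A toy-sized sketch with made-up table contents follows; the patch itself uses 65536 size slots and 1024 alignment slots, and rand32 is the repo's benchlib PRNG.

#include <assert.h>
#include <stdint.h>

#define TABLE_NUM  16
#define TABLE_MASK (TABLE_NUM - 1)

typedef struct { uint16_t size; uint16_t freq; } freq_data_t;

static freq_data_t example_freq[] = { {32, 8}, {16, 4}, {8, 4}, {0, 0} };
static uint8_t lookup[TABLE_NUM];

static void
init_distribution (void)
{
  int n = 0;
  for (int i = 0; example_freq[i].freq != 0; i++)
    for (int j = 0; j < example_freq[i].freq; j++)
      lookup[n++] = example_freq[i].size;
  assert (n == TABLE_NUM);  /* frequencies must sum to the table size */
}

After init_distribution (), lookup[rand32 (0) & TABLE_MASK] yields 32 half the time and 16 or 8 a quarter of the time each.]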
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -29,19 +29,17 @@ size_t __strlen_aarch64 (const char *); size_t __strnlen_aarch64 (const char *, size_t); int __strncmp_aarch64 (const char *, const char *, size_t); void * __memchr_aarch64_mte (const void *, int, size_t); -char *__strcpy_aarch64_mte (char *__restrict, const char *__restrict); -char *__stpcpy_aarch64_mte (char *__restrict, const char *__restrict); char *__strchr_aarch64_mte (const char *, int); char * __strchrnul_aarch64_mte (const char *, int ); size_t __strlen_aarch64_mte (const char *); char *__strrchr_aarch64_mte (const char *, int); -int __strcmp_aarch64_mte (const char *, const char *); -int __strncmp_aarch64_mte (const char *, const char *, size_t); #if __ARM_NEON void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t); void *__memmove_aarch64_simd (void *, const void *, size_t); #endif # if __ARM_FEATURE_SVE +void *__memcpy_aarch64_sve (void *__restrict, const void *__restrict, size_t); +void *__memmove_aarch64_sve (void *__restrict, const void *__restrict, size_t); void *__memchr_aarch64_sve (const void *, int, size_t); int __memcmp_aarch64_sve (const void *, const void *, size_t); char *__strchr_aarch64_sve (const char *, int); @@ -54,6 +52,11 @@ size_t __strlen_aarch64_sve (const char *); size_t __strnlen_aarch64_sve (const char *, size_t); int __strncmp_aarch64_sve (const char *, const char *, size_t); # endif +# if WANT_MOPS +void *__memcpy_aarch64_mops (void *__restrict, const void *__restrict, size_t); +void *__memmove_aarch64_mops (void *__restrict, const void *__restrict, size_t); +void *__memset_aarch64_mops (void *, int, size_t); +# endif # if __ARM_FEATURE_MEMORY_TAGGING void *__mtag_tag_region (void *, size_t); void *__mtag_tag_zero_region (void *, size_t); diff --git a/string/test/__mtag_tag_region.c b/string/test/__mtag_tag_region.c index d8c02d92d626a6e754b756cdcb17945e6a6a14ad..c45fa6662a77bbdab77fe6998ffb3830952016fa 100644 --- a/string/test/__mtag_tag_region.c +++ b/string/test/__mtag_tag_region.c @@ -2,7 +2,7 @@ * __mtag_tag_region test. * * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST diff --git a/string/test/__mtag_tag_zero_region.c b/string/test/__mtag_tag_zero_region.c index 221c223a2f3105ab02c7b21b9560a81bddf4355d..a4a7861620d1f4db8eedc438cae77aa8145040d7 100644 --- a/string/test/__mtag_tag_zero_region.c +++ b/string/test/__mtag_tag_zero_region.c @@ -2,7 +2,7 @@ * __mtag_tag_zero_region test. * * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST diff --git a/string/test/memchr.c b/string/test/memchr.c index 0ff77f5710bf2d413b5e1f9a4c5243e0fe945c2c..c6a94481c0adbaeaf27b81c0d18643a25236f623 100644 --- a/string/test/memchr.c +++ b/string/test/memchr.c @@ -2,7 +2,7 @@ * memchr test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/test/memcmp.c b/string/test/memcmp.c index 7a7cf9cff35af2c22248dfd21609b7e83af68976..f9236b83a60d446315cbc5ddb27f03458d50b538 100644 --- a/string/test/memcmp.c +++ b/string/test/memcmp.c @@ -2,7 +2,7 @@ * memcmp test. * * Copyright (c) 2019-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/test/memcpy.c b/string/test/memcpy.c index ce0ceeef5ee844e5feadaf2cb18020436e1e9b12..0c2c75a29e2d45c13a6d900a4a8e21984266b2d8 100644 --- a/string/test/memcpy.c +++ b/string/test/memcpy.c @@ -1,8 +1,8 @@ /* * memcpy test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -28,6 +28,12 @@ static const struct fun # if __ARM_NEON F(__memcpy_aarch64_simd, 1) # endif +# if __ARM_FEATURE_SVE + F(__memcpy_aarch64_sve, 1) +# endif +# if WANT_MOPS + F(__memcpy_aarch64_mops, 1) +# endif #elif __arm__ F(__memcpy_arm, 0) #endif diff --git a/string/test/memmove.c b/string/test/memmove.c index 689b68c98af264c8d5e485e7134a0f216fce555c..a5149d74465dad744ec85bee844f053b8727739c 100644 --- a/string/test/memmove.c +++ b/string/test/memmove.c @@ -1,8 +1,8 @@ /* * memmove test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -28,6 +28,12 @@ static const struct fun # if __ARM_NEON F(__memmove_aarch64_simd, 1) # endif +# if __ARM_FEATURE_SVE + F(__memmove_aarch64_sve, 1) +# endif +# if WANT_MOPS + F(__memmove_aarch64_mops, 1) +# endif #endif {0, 0, 0} // clang-format on diff --git a/string/test/memrchr.c b/string/test/memrchr.c index adf96f049cc938ee48cf51c1a1fea94ac73af60a..4171a56daefd6596cc453d075292960db6225d0f 100644 --- a/string/test/memrchr.c +++ b/string/test/memrchr.c @@ -2,7 +2,7 @@ * memchr test. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE diff --git a/string/test/memset.c b/string/test/memset.c index f1721442dbaf83f682859526632655c7ad65cd75..3489e2986a71c18e40d1a08d069664b0149de415 100644 --- a/string/test/memset.c +++ b/string/test/memset.c @@ -2,7 +2,7 @@ * memset test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -25,6 +25,9 @@ static const struct fun F(memset, 0) #if __aarch64__ F(__memset_aarch64, 1) +# if WANT_MOPS + F(__memset_aarch64_mops, 1) +# endif #elif __arm__ F(__memset_arm, 0) #endif diff --git a/string/test/mte.h b/string/test/mte.h index e67cbd9d2d400ac1b6bbb4ce815073f483fdb20b..40b0ecf6c194df67a51a14bbe6d3a262dc441590 100644 --- a/string/test/mte.h +++ b/string/test/mte.h @@ -2,7 +2,7 @@ * Memory tagging testing code. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef __TEST_MTE_H diff --git a/string/test/stpcpy.c b/string/test/stpcpy.c index 1827e68c9a30e75b75e467968c75cca7e4f54dc8..0300892a1f3ccaf0dc35ea0b3e85bca861ce99cc 100644 --- a/string/test/stpcpy.c +++ b/string/test/stpcpy.c @@ -1,8 +1,8 @@ /* * stpcpy test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. 
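[Most of the test hunks in this patch edit the same F(x) function-table idiom: a stringizing macro builds a NULL-terminated array of (name, function) pairs, and preprocessor guards decide which implementations enter the table. A minimal standalone model of the pattern, with only the always-available memcpy as an entry (the real tables add __memcpy_aarch64 and friends under the guards shown in the hunks above):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define F(x) { #x, x },

static const struct fun
{
  const char *name;
  void *(*fun) (void *, const void *, size_t);
} funtab[] =
{
  F(memcpy)   /* baseline entry; variants are added under #if guards */
#undef F
  { 0, 0 }
};

int
main (void)
{
  char dst[8], src[8] = "abcdefg";
  for (int f = 0; funtab[f].name != 0; f++)
    printf ("%s -> %s\n", funtab[f].name,
            (char *) funtab[f].fun (dst, src, sizeof src));
  return 0;
}
]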
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE @@ -28,8 +28,7 @@ static const struct fun // clang-format off F(stpcpy, 0) #if __aarch64__ - F(__stpcpy_aarch64, 0) - F(__stpcpy_aarch64_mte, 1) + F(__stpcpy_aarch64, 1) # if __ARM_FEATURE_SVE F(__stpcpy_aarch64_sve, 1) # endif diff --git a/string/test/strchr.c b/string/test/strchr.c index f3ae982ef0adf0850986741f84e9f63d131d9cfe..66180acfb57c6b824bcd39b8e23bada7ab3904a7 100644 --- a/string/test/strchr.c +++ b/string/test/strchr.c @@ -2,7 +2,7 @@ * strchr test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/test/strchrnul.c b/string/test/strchrnul.c index 6c30ab2123f16aac57b59896740e859018fb3bf0..aad0bf59da664e02495e82ab014b19fb81b3576b 100644 --- a/string/test/strchrnul.c +++ b/string/test/strchrnul.c @@ -2,7 +2,7 @@ * strchrnul test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE diff --git a/string/test/strcmp.c b/string/test/strcmp.c index d57b54ed50a8a5e8b742805444510ec98a62851d..4aa95f4f2f1dd6e00fc97082abf8994f5fce2643 100644 --- a/string/test/strcmp.c +++ b/string/test/strcmp.c @@ -1,8 +1,8 @@ /* * strcmp test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -24,8 +24,7 @@ static const struct fun // clang-format off F(strcmp, 0) #if __aarch64__ - F(__strcmp_aarch64, 0) - F(__strcmp_aarch64_mte, 1) + F(__strcmp_aarch64, 1) # if __ARM_FEATURE_SVE F(__strcmp_aarch64_sve, 1) # endif diff --git a/string/test/strcpy.c b/string/test/strcpy.c index e84cace9c8c610e6f03892be2eb8fc3c92d537ea..af297f90396a95d6b88cbf0357aa1860d862f62c 100644 --- a/string/test/strcpy.c +++ b/string/test/strcpy.c @@ -1,8 +1,8 @@ /* * strcpy test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -24,8 +24,7 @@ static const struct fun // clang-format off F(strcpy, 0) #if __aarch64__ - F(__strcpy_aarch64, 0) - F(__strcpy_aarch64_mte, 1) + F(__strcpy_aarch64, 1) # if __ARM_FEATURE_SVE F(__strcpy_aarch64_sve, 1) # endif diff --git a/string/test/stringtest.h b/string/test/stringtest.h index fe855fc217369099ab10af634f392517edf89f66..6bb7e1fdfeca2d291cfba0d254d564ed3c51d57b 100644 --- a/string/test/stringtest.h +++ b/string/test/stringtest.h @@ -2,7 +2,7 @@ * Common string test code. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/test/strlen.c b/string/test/strlen.c index 6278380f26df71b5742944cca66d4a7568957ea6..47ef3dcf0ef0c94adf16d07d77c341038a125389 100644 --- a/string/test/strlen.c +++ b/string/test/strlen.c @@ -1,15 +1,14 @@ /* * strlen test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include #include #include #include -#include #include #include "mte.h" #include "stringlib.h" diff --git a/string/test/strncmp.c b/string/test/strncmp.c index 018a8a431ab8ca55110b814e0e089fde6f199772..4bbab6f934509708d760b7cf99d8fbf8c57b21e7 100644 --- a/string/test/strncmp.c +++ b/string/test/strncmp.c @@ -1,8 +1,8 @@ /* * strncmp test. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include @@ -24,8 +24,7 @@ static const struct fun // clang-format off F(strncmp, 0) #if __aarch64__ - F(__strncmp_aarch64, 0) - F(__strncmp_aarch64_mte, 1) + F(__strncmp_aarch64, 1) # if __ARM_FEATURE_SVE F(__strncmp_aarch64_sve, 1) # endif diff --git a/string/test/strnlen.c b/string/test/strnlen.c index 0dea00eaf8e3dc41bc465aa201a312e3a85bf230..a800fd1993cdc21a9023fb5eabfb50781a2b9d70 100644 --- a/string/test/strnlen.c +++ b/string/test/strnlen.c @@ -2,7 +2,7 @@ * strnlen test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _GNU_SOURCE diff --git a/string/test/strrchr.c b/string/test/strrchr.c index fedbdc52fcc1151ffbbd168ef3bd1cb42c700ff0..580ca497f8a46b1ae92d1e3288b29d2d13178ccf 100644 --- a/string/test/strrchr.c +++ b/string/test/strrchr.c @@ -2,7 +2,7 @@ * strrchr test. * * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include diff --git a/string/x86_64/check-arch.S b/string/x86_64/check-arch.S index 26ade0a0c7db635acdbb3bd9592fee3ce9ec540d..5afcf7b7ee548aa275f105f72714d390da4d076a 100644 --- a/string/x86_64/check-arch.S +++ b/string/x86_64/check-arch.S @@ -2,7 +2,7 @@ * check ARCH setting. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if !__x86_64__
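[One measurement detail worth calling out from the strlen benchmark changes earlier in this patch: the timing loop folds the previous result, ANDed with a zero mask, into the next input pointer. The added term is always zero, but because the mask is a run-time value, neither the compiler nor the CPU can treat the calls as independent, so the loop measures per-call latency rather than throughput. A self-contained sketch of the idiom; strlen_latency and now_ns are made-up names, and the repo itself uses clock_get_ns and distribution-driven offsets.

#include <stdint.h>
#include <string.h>

static uint64_t
strlen_latency (const char *buf, const uint16_t *offs, int n,
                size_t mask,            /* pass 0 at run time */
                uint64_t (*now_ns) (void))
{
  size_t res = 0;
  uint64_t t = now_ns ();
  for (int c = 0; c < n; c++)
    /* (res & mask) is always 0, but it chains each call to the
       previous result, serializing the loop.  */
    res = strlen (buf + offs[c] + (res & mask));
  t = now_ns () - t;
  return t + (res & mask);  /* keep res live so the loop isn't elided */
}
]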