From 2164faec8a22d2934ae775587c38fb15e5852e51 Mon Sep 17 00:00:00 2001 From: openharmony_ci <120357966@qq.com> Date: Sat, 9 Dec 2023 09:54:06 +0000 Subject: [PATCH] =?UTF-8?q?=E5=9B=9E=E9=80=80=20'Pull=20Request=20!22=20:?= =?UTF-8?q?=20=E5=B0=86optimized-routine=E4=BB=8E21.02=E5=8D=87=E7=BA=A7?= =?UTF-8?q?=E8=87=B323.01=E7=89=88=E6=9C=AC'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- LICENSE | 230 +------- Makefile | 7 +- OAT.xml | 39 +- README | 14 +- config.mk.dist | 25 +- math/Dir.mk | 17 +- math/README.contributors | 78 --- math/aarch64/v_cos.c | 87 --- math/aarch64/v_cosf.c | 82 --- math/aarch64/v_exp.c | 125 ----- math/aarch64/v_exp2f.c | 113 ---- math/aarch64/v_exp_data.c | 146 ------ math/aarch64/v_expf.c | 122 ----- math/aarch64/v_log.c | 100 ---- math/aarch64/v_log_data.c | 156 ------ math/aarch64/v_logf.c | 74 --- math/aarch64/v_math.h | 135 ----- math/aarch64/v_powf.c | 148 ------ math/aarch64/v_sin.c | 97 ---- math/aarch64/v_sinf.c | 82 --- math/cosf.c | 6 +- math/erf.c | 2 +- math/erf_data.c | 2 +- math/erff.c | 2 +- math/erff_data.c | 2 +- math/exp.c | 2 +- math/exp10.c | 129 ----- math/exp2.c | 2 +- math/exp2f.c | 2 +- math/exp2f_data.c | 2 +- math/exp_data.c | 23 +- math/expf.c | 2 +- math/include/mathlib.h | 69 ++- math/log.c | 2 +- math/log2.c | 2 +- math/log2_data.c | 2 +- math/log2f.c | 2 +- math/log2f_data.c | 2 +- math/log_data.c | 2 +- math/logf.c | 6 +- math/logf_data.c | 2 +- math/math_config.h | 34 +- math/math_err.c | 2 +- math/math_errf.c | 2 +- math/pow.c | 2 +- math/pow_log_data.c | 2 +- math/powf.c | 2 +- math/powf_log2_data.c | 2 +- math/s_cos.c | 6 + math/s_cosf.c | 6 + math/s_exp.c | 6 + math/s_exp2f.c | 6 + math/s_exp2f_1u.c | 6 + math/s_expf.c | 6 + math/s_expf_1u.c | 6 + math/s_log.c | 6 + math/s_logf.c | 6 + math/s_pow.c | 6 + math/s_powf.c | 6 + math/s_sin.c | 6 + math/s_sinf.c | 6 + math/sincosf.c | 6 +- math/sincosf.h | 6 +- math/sincosf_data.c | 2 +- math/sinf.c | 6 +- math/test/mathbench.c | 369 ++++++++----- math/test/mathbench_funcs.h | 62 --- math/test/mathbench_wrappers.h | 66 --- math/test/mathtest.c | 16 +- math/test/rtest/dotest.c | 2 +- math/test/rtest/intern.h | 2 +- math/test/rtest/main.c | 2 +- math/test/rtest/random.c | 2 +- math/test/rtest/random.h | 2 +- math/test/rtest/semi.c | 2 +- math/test/rtest/semi.h | 2 +- math/test/rtest/types.h | 2 +- math/test/rtest/wrappers.c | 2 +- math/test/rtest/wrappers.h | 2 +- math/test/runulp.sh | 127 +++-- math/test/testcases/directed/cosf.tst | 2 +- math/test/testcases/directed/erf.tst | 2 +- math/test/testcases/directed/erff.tst | 2 +- math/test/testcases/directed/exp.tst | 2 +- math/test/testcases/directed/exp10.tst | 15 - math/test/testcases/directed/exp2.tst | 2 +- math/test/testcases/directed/exp2f.tst | 2 +- math/test/testcases/directed/expf.tst | 2 +- math/test/testcases/directed/log.tst | 2 +- math/test/testcases/directed/log2.tst | 2 +- math/test/testcases/directed/log2f.tst | 2 +- math/test/testcases/directed/logf.tst | 2 +- math/test/testcases/directed/pow.tst | 2 +- math/test/testcases/directed/powf.tst | 2 +- math/test/testcases/directed/sincosf.tst | 2 +- math/test/testcases/directed/sinf.tst | 2 +- math/test/testcases/random/double.tst | 2 +- math/test/testcases/random/float.tst | 2 +- math/test/ulp.c | 245 +++++---- math/test/ulp.h | 31 +- math/test/ulp_funcs.h | 40 -- math/test/ulp_wrappers.h | 37 -- math/tgamma128.c | 351 ------------- math/tgamma128.h | 141 ----- math/tools/cos.sollya | 2 +- math/tools/exp.sollya | 2 +- 
math/tools/exp2.sollya | 2 +- math/tools/log.sollya | 2 +- math/tools/log2.sollya | 2 +- math/tools/log2_abs.sollya | 2 +- math/tools/log_abs.sollya | 2 +- math/tools/plot.py | 2 +- math/tools/remez.jl | 2 +- math/tools/sin.sollya | 2 +- math/tools/tgamma128_gen.jl | 212 -------- math/tools/v_exp.sollya | 2 +- math/tools/v_log.sollya | 2 +- math/tools/v_sin.sollya | 2 +- math/v_cos.c | 87 +++ math/v_cosf.c | 76 +++ math/v_exp.c | 94 ++++ math/v_exp.h | 14 + math/v_exp2f.c | 78 +++ math/{aarch64 => }/v_exp2f_1u.c | 61 ++- math/v_exp_data.c | 403 ++++++++++++++ math/v_expf.c | 83 +++ math/{aarch64 => }/v_expf_1u.c | 69 +-- math/v_log.c | 104 ++++ math/v_log.h | 18 + math/v_log_data.c | 158 ++++++ math/v_logf.c | 73 +++ math/v_math.h | 641 +++++++++++++++++++++++ math/{aarch64 => }/v_pow.c | 21 +- math/v_powf.c | 235 +++++++++ math/v_sin.c | 86 +++ math/v_sinf.c | 75 +++ math/vn_cos.c | 12 + math/vn_cosf.c | 12 + math/vn_exp.c | 12 + math/vn_exp2f.c | 12 + math/vn_exp2f_1u.c | 11 + math/vn_expf.c | 12 + math/vn_expf_1u.c | 11 + math/vn_log.c | 12 + math/vn_logf.c | 12 + math/vn_pow.c | 12 + math/vn_powf.c | 12 + math/vn_sin.c | 12 + math/vn_sinf.c | 12 + networking/Dir.mk | 2 +- networking/aarch64/chksum_simd.c | 2 +- networking/arm/chksum_simd.c | 2 +- networking/chksum.c | 2 +- networking/chksum_common.h | 2 +- networking/include/networking.h | 2 +- networking/test/chksum.c | 2 +- string/Dir.mk | 2 +- string/README.contributors | 30 -- string/aarch64/__mtag_tag_region.S | 6 +- string/aarch64/__mtag_tag_zero_region.S | 6 +- string/aarch64/asmdefs.h | 106 ---- string/aarch64/check-arch.S | 6 +- string/aarch64/memchr-mte.S | 58 +- string/aarch64/memchr-sve.S | 6 +- string/aarch64/memchr.S | 6 +- string/aarch64/memcmp-sve.S | 6 +- string/aarch64/memcmp.S | 239 ++++----- string/aarch64/memcpy-advsimd.S | 6 +- string/aarch64/memcpy-mops.S | 21 - string/aarch64/memcpy-sve.S | 177 ------- string/aarch64/memcpy.S | 6 +- string/aarch64/memmove-mops.S | 21 - string/aarch64/memrchr.S | 51 +- string/aarch64/memset-mops.S | 20 - string/aarch64/memset.S | 6 +- string/aarch64/stpcpy-mte.S | 10 + string/aarch64/stpcpy-sve.S | 2 +- string/aarch64/stpcpy.S | 2 +- string/aarch64/strchr-mte.S | 58 +- string/aarch64/strchr-sve.S | 6 +- string/aarch64/strchr.S | 6 +- string/aarch64/strchrnul-mte.S | 47 +- string/aarch64/strchrnul-sve.S | 2 +- string/aarch64/strchrnul.S | 6 +- string/aarch64/strcmp-mte.S | 189 +++++++ string/aarch64/strcmp-sve.S | 6 +- string/aarch64/strcmp.S | 238 ++++----- string/aarch64/strcpy-mte.S | 161 ++++++ string/aarch64/strcpy-sve.S | 6 +- string/aarch64/strcpy.S | 395 +++++++++----- string/aarch64/strlen-mte.S | 41 +- string/aarch64/strlen-sve.S | 6 +- string/aarch64/strlen.S | 21 +- string/aarch64/strncmp-mte.S | 307 +++++++++++ string/aarch64/strncmp-sve.S | 6 +- string/aarch64/strncmp.S | 238 ++++----- string/aarch64/strnlen-sve.S | 6 +- string/aarch64/strnlen.S | 60 ++- string/aarch64/strrchr-mte.S | 58 +- string/aarch64/strrchr-sve.S | 6 +- string/aarch64/strrchr.S | 6 +- string/bench/memcpy.c | 170 ++---- string/bench/memset.c | 243 --------- string/bench/strlen.c | 16 +- string/include/benchlib.h | 2 +- string/include/stringlib.h | 15 +- string/test/__mtag_tag_region.c | 2 +- string/test/__mtag_tag_zero_region.c | 2 +- string/test/memchr.c | 2 +- string/test/memcmp.c | 2 +- string/test/memcpy.c | 10 +- string/test/memmove.c | 10 +- string/test/memrchr.c | 2 +- string/test/memset.c | 5 +- string/test/mte.h | 2 +- string/test/stpcpy.c | 7 +- string/test/strchr.c | 2 +- 
string/test/strchrnul.c | 2 +- string/test/strcmp.c | 7 +- string/test/strcpy.c | 7 +- string/test/stringtest.h | 2 +- string/test/strlen.c | 5 +- string/test/strncmp.c | 7 +- string/test/strnlen.c | 2 +- string/test/strrchr.c | 2 +- string/x86_64/check-arch.S | 2 +- 226 files changed, 4783 insertions(+), 5035 deletions(-) delete mode 100644 math/README.contributors delete mode 100644 math/aarch64/v_cos.c delete mode 100644 math/aarch64/v_cosf.c delete mode 100644 math/aarch64/v_exp.c delete mode 100644 math/aarch64/v_exp2f.c delete mode 100644 math/aarch64/v_exp_data.c delete mode 100644 math/aarch64/v_expf.c delete mode 100644 math/aarch64/v_log.c delete mode 100644 math/aarch64/v_log_data.c delete mode 100644 math/aarch64/v_logf.c delete mode 100644 math/aarch64/v_math.h delete mode 100644 math/aarch64/v_powf.c delete mode 100644 math/aarch64/v_sin.c delete mode 100644 math/aarch64/v_sinf.c delete mode 100644 math/exp10.c create mode 100644 math/s_cos.c create mode 100644 math/s_cosf.c create mode 100644 math/s_exp.c create mode 100644 math/s_exp2f.c create mode 100644 math/s_exp2f_1u.c create mode 100644 math/s_expf.c create mode 100644 math/s_expf_1u.c create mode 100644 math/s_log.c create mode 100644 math/s_logf.c create mode 100644 math/s_pow.c create mode 100644 math/s_powf.c create mode 100644 math/s_sin.c create mode 100644 math/s_sinf.c delete mode 100644 math/test/mathbench_funcs.h delete mode 100644 math/test/mathbench_wrappers.h delete mode 100644 math/test/testcases/directed/exp10.tst delete mode 100644 math/test/ulp_funcs.h delete mode 100644 math/test/ulp_wrappers.h delete mode 100644 math/tgamma128.c delete mode 100644 math/tgamma128.h delete mode 100644 math/tools/tgamma128_gen.jl create mode 100644 math/v_cos.c create mode 100644 math/v_cosf.c create mode 100644 math/v_exp.c create mode 100644 math/v_exp.h create mode 100644 math/v_exp2f.c rename math/{aarch64 => }/v_exp2f_1u.c (43%) create mode 100644 math/v_exp_data.c create mode 100644 math/v_expf.c rename math/{aarch64 => }/v_expf_1u.c (39%) create mode 100644 math/v_log.c create mode 100644 math/v_log.h create mode 100644 math/v_log_data.c create mode 100644 math/v_logf.c create mode 100644 math/v_math.h rename math/{aarch64 => }/v_pow.c (35%) create mode 100644 math/v_powf.c create mode 100644 math/v_sin.c create mode 100644 math/v_sinf.c create mode 100644 math/vn_cos.c create mode 100644 math/vn_cosf.c create mode 100644 math/vn_exp.c create mode 100644 math/vn_exp2f.c create mode 100644 math/vn_exp2f_1u.c create mode 100644 math/vn_expf.c create mode 100644 math/vn_expf_1u.c create mode 100644 math/vn_log.c create mode 100644 math/vn_logf.c create mode 100644 math/vn_pow.c create mode 100644 math/vn_powf.c create mode 100644 math/vn_sin.c create mode 100644 math/vn_sinf.c delete mode 100644 string/README.contributors delete mode 100644 string/aarch64/asmdefs.h delete mode 100644 string/aarch64/memcpy-mops.S delete mode 100644 string/aarch64/memcpy-sve.S delete mode 100644 string/aarch64/memmove-mops.S delete mode 100644 string/aarch64/memset-mops.S create mode 100644 string/aarch64/stpcpy-mte.S create mode 100644 string/aarch64/strcmp-mte.S create mode 100644 string/aarch64/strcpy-mte.S create mode 100644 string/aarch64/strncmp-mte.S delete mode 100644 string/bench/memset.c diff --git a/LICENSE b/LICENSE index 20a4b77..2543b82 100644 --- a/LICENSE +++ b/LICENSE @@ -1,11 +1,6 @@ -MIT OR Apache-2.0 WITH LLVM-exception -===================================== - - MIT License ------------ -Copyright (c) 1999-2022, 
Arm Limited. +Copyright (c) 1999-2019, Arm Limited. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -24,226 +19,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -Apache-2.0 WITH LLVM-exception ------------------------------- - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. 
The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - - ---- LLVM Exceptions to the Apache 2.0 License ---- - -As an exception, if, as a result of your compiling your source code, portions -of this Software are embedded into an Object form of such source code, you -may redistribute such embedded portions in such Object form without complying -with the conditions of Sections 4(a), 4(b) and 4(d) of the License. - -In addition, if you combine or link compiled forms of this Software with -software that is licensed under the GPLv2 ("Combined Software") and if a -court of competent jurisdiction determines that the patent provision (Section -3), the indemnity provision (Section 9) or other Section of the License -conflicts with the conditions of the GPLv2, you may retroactively and -prospectively choose to deem waived or otherwise exclude such Section(s) of -the License, but only in their entirety and only with respect to the Combined -Software. diff --git a/Makefile b/Makefile index c487896..169f89e 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Makefile - requires GNU make # -# Copyright (c) 2018-2022, Arm Limited. -# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +# Copyright (c) 2018-2020, Arm Limited. +# SPDX-License-Identifier: MIT srcdir = . prefix = /usr @@ -11,7 +11,6 @@ includedir = $(prefix)/include # Configure these in config.mk, do not make changes in this file. SUBS = math string networking -PLSUBS = math HOST_CC = cc HOST_CFLAGS = -std=c99 -O2 HOST_LDFLAGS = @@ -21,7 +20,6 @@ CPPFLAGS = CFLAGS = -std=c99 -O2 CFLAGS_SHARED = -fPIC CFLAGS_ALL = -Ibuild/include $(CPPFLAGS) $(CFLAGS) -CFLAGS_PL = -Ibuild/pl/include $(CPPFLAGS) $(CFLAGS) -DPL LDFLAGS = LDLIBS = AR = $(CROSS_COMPILE)ar @@ -53,7 +51,6 @@ $(DIRS): mkdir -p $@ $(filter %.os,$(ALL_FILES)): CFLAGS_ALL += $(CFLAGS_SHARED) -$(filter %.os,$(ALL_FILES)): CFLAGS_PL += $(CFLAGS_SHARED) build/%.o: $(srcdir)/%.S $(CC) $(CFLAGS_ALL) -c -o $@ $< diff --git a/OAT.xml b/OAT.xml index ab48a78..71acb93 100644 --- a/OAT.xml +++ b/OAT.xml @@ -19,7 +19,7 @@ policylist: 1. policy: If the OAT-Default.xml policies do not meet your requirements, please add policies here. 2. policyitem: The fields type, name, path, desc is required, and the fields rule, group, filefilter is optional,the default value is: - + 3. 
policyitem type: "compatibility" is used to check license compatibility in the specified path; "license" is used to check source license header in the specified path; @@ -49,43 +49,10 @@ All configurations in this file will be merged to OAT-Default.xml, if you have a - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - - + diff --git a/README b/README index a2143a2..9e1a34f 100644 --- a/README +++ b/README @@ -2,17 +2,14 @@ Arm Optimized Routines ---------------------- This repository contains implementations of library functions -provided by Arm. The outbound license is available under a dual -license, at the user’s election, as reflected in the LICENSE file. -Contributions to this project are accepted, but Contributors have -to sign an Assignment Agreement, please follow the instructions in +provided by Arm under MIT License (See LICENSE). Contributions +to this project are accepted, but Contributors have to sign an +Assignment Agreement, please follow the instructions in contributor-agreement.pdf. This is needed so upstreaming code -to projects that require copyright assignment is possible. Further -contribution requirements are documented in README.contributors of -the appropriate subdirectory. +to projects that require copyright assignment is possible. Regular quarterly releases are tagged as vYY.MM, the latest -release is v23.01. +release is v21.02. Source code layout: @@ -27,7 +24,6 @@ networking/test/ - networking test and benchmark related sources. string/ - string routines subproject sources. string/include/ - string library public headers. string/test/ - string test and benchmark related sources. -pl/... - separately maintained performance library code. The steps to build the target libraries and run the tests: diff --git a/config.mk.dist b/config.mk.dist index c4a6dba..177e1ac 100644 --- a/config.mk.dist +++ b/config.mk.dist @@ -1,14 +1,11 @@ # Example config.mk # -# Copyright (c) 2018-2022, Arm Limited. -# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +# Copyright (c) 2018-2020, Arm Limited. +# SPDX-License-Identifier: MIT # Subprojects to build SUBS = math string networking -# Subsubprojects to build if subproject pl is built -PLSUBS = math - # Target architecture: aarch64, arm or x86_64 ARCH = aarch64 @@ -59,22 +56,8 @@ math-cflags += -ffp-contract=fast -fno-math-errno # Use with clang. #math-cflags += -ffp-contract=fast -# Disable/enable SVE vector math code and tests -WANT_SVE_MATH = 0 -ifeq ($(WANT_SVE_MATH), 1) - math-cflags += -march=armv8.2-a+sve -endif -math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH) - -# If defined to 1, set errno in math functions according to ISO C. Many math -# libraries do not set errno, so this is 0 by default. It may need to be -# set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0. -WANT_ERRNO = 0 -math-cflags += -DWANT_ERRNO=$(WANT_ERRNO) - -# If set to 1, set fenv in vector math routines. -WANT_SIMD_EXCEPT = 0 -math-cflags += -DWANT_SIMD_EXCEPT=$(WANT_SIMD_EXCEPT) +# Disable vector math code +#math-cflags += -DWANT_VMATH=0 # Disable fenv checks #math-ulpflags = -q -f diff --git a/math/Dir.mk b/math/Dir.mk index d6385d2..3b841ab 100644 --- a/math/Dir.mk +++ b/math/Dir.mk @@ -1,14 +1,12 @@ # Makefile fragment - requires GNU make # -# Copyright (c) 2019-2022, Arm Limited. -# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +# Copyright (c) 2019, Arm Limited. 
+# SPDX-License-Identifier: MIT S := $(srcdir)/math B := build/math math-lib-srcs := $(wildcard $(S)/*.[cS]) -math-lib-srcs += $(wildcard $(S)/$(ARCH)/*.[cS]) - math-test-srcs := \ $(S)/test/mathtest.c \ $(S)/test/mathbench.c \ @@ -17,7 +15,6 @@ math-test-srcs := \ math-test-host-srcs := $(wildcard $(S)/test/rtest/*.[cS]) math-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h)) -math-test-includes := $(patsubst $(S)/%,build/include/%,$(wildcard $(S)/test/*.h)) math-libs := \ build/lib/libmathlib.so \ @@ -45,11 +42,10 @@ math-files := \ $(math-tools) \ $(math-host-tools) \ $(math-includes) \ - $(math-test-includes) \ -all-math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes) +all-math: $(math-libs) $(math-tools) $(math-includes) -$(math-objs): $(math-includes) $(math-test-includes) +$(math-objs): $(math-includes) $(math-objs): CFLAGS_ALL += $(math-cflags) $(B)/test/mathtest.o: CFLAGS_ALL += -fmath-errno $(math-host-objs): CC = $(HOST_CC) @@ -87,9 +83,6 @@ build/bin/ulp: $(B)/test/ulp.o build/lib/libmathlib.a build/include/%.h: $(S)/include/%.h cp $< $@ -build/include/test/%.h: $(S)/test/%.h - cp $< $@ - build/bin/%.sh: $(S)/test/%.sh cp $< $@ @@ -103,7 +96,7 @@ check-math-rtest: $(math-host-tools) $(math-tools) cat $(math-rtests) | build/bin/rtest | $(EMULATOR) build/bin/mathtest $(math-testflags) check-math-ulp: $(math-tools) - ULPFLAGS="$(math-ulpflags)" WANT_SIMD_EXCEPT="$(WANT_SIMD_EXCEPT)" build/bin/runulp.sh $(EMULATOR) + ULPFLAGS="$(math-ulpflags)" build/bin/runulp.sh $(EMULATOR) check-math: check-math-test check-math-rtest check-math-ulp diff --git a/math/README.contributors b/math/README.contributors deleted file mode 100644 index 33e7ba3..0000000 --- a/math/README.contributors +++ /dev/null @@ -1,78 +0,0 @@ -STYLE REQUIREMENTS -================== - -1. Most code in this sub-directory is expected to be upstreamed into glibc so - the GNU Coding Standard and glibc specific conventions should be followed - to ease upstreaming. - -2. ABI and symbols: the code should be written so it is suitable for inclusion - into a libc with minimal changes. This e.g. means that internal symbols - should be hidden and in the implementation reserved namespace according to - ISO C and POSIX rules. If possible the built shared libraries and static - library archives should be usable to override libc symbols at link time (or - at runtime via LD_PRELOAD). This requires the symbols to follow the glibc ABI - (other than symbol versioning), this cannot be done reliably for static - linking so this is a best effort requirement. - -3. API: include headers should be suitable for benchmarking and testing code - and should not conflict with libc headers. - - -CONTRIBUTION GUIDELINES FOR math SUB-DIRECTORY -============================================== - -1. Math functions have quality and performance requirements. - -2. Quality: - - Worst-case ULP error should be small in the entire input domain (for most - common double precision scalar functions the target is < 0.66 ULP error, - and < 1 ULP for single precision, even performance optimized function - variant should not have > 5 ULP error if the goal is to be a drop in - replacement for a standard math function), this should be tested - statistically (or on all inputs if possible in reasonable amount of time). - The ulp tool is for this and runulp.sh should be updated for new functions. - - - All standard rounding modes need to be supported but in non-default rounding - modes the quality requirement can be relaxed. 
(Non-nearest rounded - computation can be slow and inaccurate but has to be correct for conformance - reasons.) - - - Special cases and error handling need to follow ISO C Annex F requirements, - POSIX requirements, IEEE 754-2008 requirements and Glibc requiremnts: - https://www.gnu.org/software/libc/manual/html_mono/libc.html#Errors-in-Math-Functions - this should be tested by direct tests (glibc test system may be used for it). - - - Error handling code should be decoupled from the approximation code as much - as possible. (There are helper functions, these take care of errno as well - as exception raising.) - - - Vector math code does not need to work in non-nearest rounding mode and error - handling side effects need not happen (fenv exceptions and errno), but the - result should be correct (within quality requirements, which are lower for - vector code than for scalar code). - - - Error bounds of the approximation should be clearly documented. - - - The code should build and pass tests on arm, aarch64 and x86_64 GNU linux - systems. (Routines and features can be disabled on specific targets, but - the build must complete). On aarch64, both little- and big-endian targets - are supported as well as valid combinations of architecture extensions. - The configurations that should be tested depend on the contribution. - -3. Performance: - - Common math code should be benchmarked on modern aarch64 microarchitectures - over typical inputs. - - - Performance improvements should be documented (relative numbers can be - published; it is enough to use the mathbench microbenchmark tool which should - be updated for new functions). - - - Attention should be paid to the compilation flags: for aarch64 fma - contraction should be on and math errno turned off so some builtins can be - inlined. - - - The code should be reasonably performant on x86_64 too, e.g. some rounding - instructions and fma may not be available on x86_64, such builtins turn into - libc calls with slow code. Such slowdown is not acceptable, a faster fallback - should be present: glibc and bionic use the same code on all targets. (This - does not apply to vector math code). diff --git a/math/aarch64/v_cos.c b/math/aarch64/v_cos.c deleted file mode 100644 index 9a73575..0000000 --- a/math/aarch64/v_cos.c +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Double-precision vector cos function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const struct data -{ - float64x2_t poly[7]; - float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3; -} data = { - /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. 
*/ - .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), - V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), - V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), - V2 (-0x1.9e9540300a1p-41) }, - .inv_pi = V2 (0x1.45f306dc9c883p-2), - .half_pi = V2 (0x1.921fb54442d18p+0), - .pi_1 = V2 (0x1.921fb54442d18p+1), - .pi_2 = V2 (0x1.1a62633145c06p-53), - .pi_3 = V2 (0x1.c1cd129024e09p-106), - .shift = V2 (0x1.8p52), - .range_val = V2 (0x1p23) -}; - -#define C(i) d->poly[i] - -static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) -{ - y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); - return v_call_f64 (cos, x, y, cmp); -} - -float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x) -{ - const struct data *d = ptr_barrier (&data); - float64x2_t n, r, r2, r3, r4, t1, t2, t3, y; - uint64x2_t odd, cmp; - -#if WANT_SIMD_EXCEPT - r = vabsq_f64 (x); - cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r), - vreinterpretq_u64_f64 (d->range_val)); - if (unlikely (v_any_u64 (cmp))) - /* If fenv exceptions are to be triggered correctly, set any special lanes - to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by - special-case handler later. */ - r = vbslq_f64 (cmp, v_f64 (1.0), r); -#else - cmp = vcageq_f64 (x, d->range_val); - r = x; -#endif - - /* n = rint((|x|+pi/2)/pi) - 0.5. */ - n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi)); - odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); - n = vsubq_f64 (n, d->shift); - n = vsubq_f64 (n, v_f64 (0.5)); - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ - r = vfmsq_f64 (r, d->pi_1, n); - r = vfmsq_f64 (r, d->pi_2, n); - r = vfmsq_f64 (r, d->pi_3, n); - - /* sin(r) poly approx. */ - r2 = vmulq_f64 (r, r); - r3 = vmulq_f64 (r2, r); - r4 = vmulq_f64 (r2, r2); - - t1 = vfmaq_f64 (C (4), C (5), r2); - t2 = vfmaq_f64 (C (2), C (3), r2); - t3 = vfmaq_f64 (C (0), C (1), r2); - - y = vfmaq_f64 (t1, C (6), r4); - y = vfmaq_f64 (t2, y, r4); - y = vfmaq_f64 (t3, y, r4); - y = vfmaq_f64 (r, y, r3); - - if (unlikely (v_any_u64 (cmp))) - return special_case (x, y, odd, cmp); - return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); -} diff --git a/math/aarch64/v_cosf.c b/math/aarch64/v_cosf.c deleted file mode 100644 index b9890b2..0000000 --- a/math/aarch64/v_cosf.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Single-precision vector cos function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const struct data -{ - float32x4_t poly[4]; - float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3; -} data = { - /* 1.886 ulp error. */ - .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), - V4 (0x1.5b2e76p-19f) }, - - .pi_1 = V4 (0x1.921fb6p+1f), - .pi_2 = V4 (-0x1.777a5cp-24f), - .pi_3 = V4 (-0x1.ee59dap-49f), - - .inv_pi = V4 (0x1.45f306p-2f), - .shift = V4 (0x1.8p+23f), - .half_pi = V4 (0x1.921fb6p0f), - .range_val = V4 (0x1p20f) -}; - -#define C(i) d->poly[i] - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) -{ - /* Fall back to scalar code. 
*/ - y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); - return v_call_f32 (cosf, x, y, cmp); -} - -float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x) -{ - const struct data *d = ptr_barrier (&data); - float32x4_t n, r, r2, r3, y; - uint32x4_t odd, cmp; - -#if WANT_SIMD_EXCEPT - r = vabsq_f32 (x); - cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r), - vreinterpretq_u32_f32 (d->range_val)); - if (unlikely (v_any_u32 (cmp))) - /* If fenv exceptions are to be triggered correctly, set any special lanes - to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by - special-case handler later. */ - r = vbslq_f32 (cmp, v_f32 (1.0f), r); -#else - cmp = vcageq_f32 (x, d->range_val); - r = x; -#endif - - /* n = rint((|x|+pi/2)/pi) - 0.5. */ - n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi)); - odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); - n = vsubq_f32 (n, d->shift); - n = vsubq_f32 (n, v_f32 (0.5f)); - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ - r = vfmsq_f32 (r, d->pi_1, n); - r = vfmsq_f32 (r, d->pi_2, n); - r = vfmsq_f32 (r, d->pi_3, n); - - /* y = sin(r). */ - r2 = vmulq_f32 (r, r); - r3 = vmulq_f32 (r2, r); - y = vfmaq_f32 (C (2), C (3), r2); - y = vfmaq_f32 (C (1), y, r2); - y = vfmaq_f32 (C (0), y, r2); - y = vfmaq_f32 (r, y, r3); - - if (unlikely (v_any_u32 (cmp))) - return special_case (x, y, odd, cmp); - return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); -} diff --git a/math/aarch64/v_exp.c b/math/aarch64/v_exp.c deleted file mode 100644 index bc5609f..0000000 --- a/math/aarch64/v_exp.c +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Double-precision vector e^x function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -#define N (1 << V_EXP_TABLE_BITS) -#define IndexMask (N - 1) - -const static volatile struct -{ - float64x2_t poly[3]; - float64x2_t inv_ln2, ln2_hi, ln2_lo, shift; -#if !WANT_SIMD_EXCEPT - float64x2_t special_bound, scale_thresh; -#endif -} data = { - /* maxerr: 1.88 +0.5 ulp - rel error: 1.4337*2^-53 - abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */ - .poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3), - V2 (0x1.55555da646206p-5) }, -#if !WANT_SIMD_EXCEPT - .scale_thresh = V2 (163840.0), /* 1280.0 * N. */ - .special_bound = V2 (704.0), -#endif - .inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2. */ - .ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N. */ - .ln2_lo = V2 (0x1.abc9e3b39803f3p-63), - .shift = V2 (0x1.8p+52) -}; - -#define C(i) data.poly[i] -#define Tab __v_exp_data - -#if WANT_SIMD_EXCEPT - -# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */ -# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */ -# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */ - -static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp) -{ - /* If fenv exceptions are to be triggered correctly, fall back to the scalar - routine to special lanes. */ - return v_call_f64 (exp, x, y, cmp); -} - -#else - -# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */ -/* SpecialBias1 + SpecialBias1 = asuint(1.0). */ -# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */ -# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. 
*/ - -static inline float64x2_t VPCS_ATTR -special_case (float64x2_t s, float64x2_t y, float64x2_t n) -{ - /* 2^(n/N) may overflow, break it up into s1*s2. */ - uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset); - float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b)); - float64x2_t s2 = vreinterpretq_f64_u64 ( - vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b)); - uint64x2_t cmp = vcagtq_f64 (n, data.scale_thresh); - float64x2_t r1 = vmulq_f64 (s1, s1); - float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1); - return vbslq_f64 (cmp, r1, r0); -} - -#endif - -float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x) -{ - float64x2_t n, r, r2, s, y, z; - uint64x2_t cmp, u, e; - -#if WANT_SIMD_EXCEPT - /* If any lanes are special, mask them with 1 and retain a copy of x to allow - special_case to fix special lanes later. This is only necessary if fenv - exceptions are to be triggered correctly. */ - float64x2_t xm = x; - uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x)); - cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound); - if (unlikely (v_any_u64 (cmp))) - x = vbslq_f64 (cmp, v_f64 (1), x); -#else - cmp = vcagtq_f64 (x, data.special_bound); -#endif - - /* n = round(x/(ln2/N)). */ - z = vfmaq_f64 (data.shift, x, data.inv_ln2); - u = vreinterpretq_u64_f64 (z); - n = vsubq_f64 (z, data.shift); - - /* r = x - n*ln2/N. */ - r = x; - r = vfmsq_f64 (r, data.ln2_hi, n); - r = vfmsq_f64 (r, data.ln2_lo, n); - - e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS); - - /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4. */ - r2 = vmulq_f64 (r, r); - y = vfmaq_f64 (C (0), C (1), r); - y = vfmaq_f64 (y, C (2), r2); - y = vfmaq_f64 (r, y, r2); - - /* s = 2^(n/N). */ - u = (uint64x2_t){ Tab[u[0] & IndexMask], Tab[u[1] & IndexMask] }; - s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); - - if (unlikely (v_any_u64 (cmp))) -#if WANT_SIMD_EXCEPT - return special_case (xm, vfmaq_f64 (s, y, s), cmp); -#else - return special_case (s, y, n); -#endif - - return vfmaq_f64 (s, y, s); -} diff --git a/math/aarch64/v_exp2f.c b/math/aarch64/v_exp2f.c deleted file mode 100644 index e402205..0000000 --- a/math/aarch64/v_exp2f.c +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Single-precision vector 2^x function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const struct data -{ - float32x4_t poly[5]; - uint32x4_t exponent_bias; -#if !WANT_SIMD_EXCEPT - float32x4_t special_bound, scale_thresh; -#endif -} data = { - /* maxerr: 1.962 ulp. */ - .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f), - V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) }, - .exponent_bias = V4 (0x3f800000), -#if !WANT_SIMD_EXCEPT - .special_bound = V4 (126.0f), - .scale_thresh = V4 (192.0f), -#endif -}; - -#define C(i) d->poly[i] - -#if WANT_SIMD_EXCEPT - -# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ -# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */ -# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */ - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) -{ - /* If fenv exceptions are to be triggered correctly, fall back to the scalar - routine for special lanes. 
*/ - return v_call_f32 (exp2f, x, y, cmp); -} - -#else - -# define SpecialOffset v_u32 (0x82000000) -# define SpecialBias v_u32 (0x7f000000) - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, - float32x4_t scale, const struct data *d) -{ - /* 2^n may overflow, break it up into s1*s2. */ - uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); - float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); - float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); - uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); - float32x4_t r2 = vmulq_f32 (s1, s1); - float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); - /* Similar to r1 but avoids double rounding in the subnormal range. */ - float32x4_t r0 = vfmaq_f32 (scale, poly, scale); - float32x4_t r = vbslq_f32 (cmp1, r1, r0); - return vbslq_f32 (cmp2, r2, r); -} - -#endif - -float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x) -{ - const struct data *d = ptr_barrier (&data); - float32x4_t n, r, r2, scale, p, q, poly; - uint32x4_t cmp, e; - -#if WANT_SIMD_EXCEPT - /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */ - uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); - cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); - float32x4_t xm = x; - /* If any lanes are special, mask them with 1 and retain a copy of x to allow - special_case to fix special lanes later. This is only necessary if fenv - exceptions are to be triggered correctly. */ - if (unlikely (v_any_u32 (cmp))) - x = vbslq_f32 (cmp, v_f32 (1), x); -#endif - - /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = n + r, with r in [-1/2, 1/2]. */ - n = vrndaq_f32 (x); - r = vsubq_f32 (x, n); - e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); - scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); - -#if !WANT_SIMD_EXCEPT - cmp = vcagtq_f32 (n, d->special_bound); -#endif - - r2 = vmulq_f32 (r, r); - p = vfmaq_f32 (C (1), C (0), r); - q = vfmaq_f32 (C (3), C (2), r); - q = vfmaq_f32 (q, p, r2); - p = vmulq_f32 (C (4), r); - poly = vfmaq_f32 (p, q, r2); - - if (unlikely (v_any_u32 (cmp))) -#if WANT_SIMD_EXCEPT - return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); -#else - return special_case (poly, n, e, cmp, scale, d); -#endif - - return vfmaq_f32 (scale, poly, scale); -} diff --git a/math/aarch64/v_exp_data.c b/math/aarch64/v_exp_data.c deleted file mode 100644 index 45f0848..0000000 --- a/math/aarch64/v_exp_data.c +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Lookup table for double-precision e^x vector function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" - -# define N (1 << V_EXP_TABLE_BITS) - -/* 2^(j/N), j=0..N. 
*/ -const uint64_t __v_exp_data[] = { -# if N == 128 - 0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061, - 0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de, - 0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f, - 0x3fefac922b7247f7, 0x3fefa3ec32d3d1a2, 0x3fef9b66affed31b, - 0x3fef9301d0125b51, 0x3fef8abdc06c31cc, 0x3fef829aaea92de0, - 0x3fef7a98c8a58e51, 0x3fef72b83c7d517b, 0x3fef6af9388c8dea, - 0x3fef635beb6fcb75, 0x3fef5be084045cd4, 0x3fef54873168b9aa, - 0x3fef4d5022fcd91d, 0x3fef463b88628cd6, 0x3fef3f49917ddc96, - 0x3fef387a6e756238, 0x3fef31ce4fb2a63f, 0x3fef2b4565e27cdd, - 0x3fef24dfe1f56381, 0x3fef1e9df51fdee1, 0x3fef187fd0dad990, - 0x3fef1285a6e4030b, 0x3fef0cafa93e2f56, 0x3fef06fe0a31b715, - 0x3fef0170fc4cd831, 0x3feefc08b26416ff, 0x3feef6c55f929ff1, - 0x3feef1a7373aa9cb, 0x3feeecae6d05d866, 0x3feee7db34e59ff7, - 0x3feee32dc313a8e5, 0x3feedea64c123422, 0x3feeda4504ac801c, - 0x3feed60a21f72e2a, 0x3feed1f5d950a897, 0x3feece086061892d, - 0x3feeca41ed1d0057, 0x3feec6a2b5c13cd0, 0x3feec32af0d7d3de, - 0x3feebfdad5362a27, 0x3feebcb299fddd0d, 0x3feeb9b2769d2ca7, - 0x3feeb6daa2cf6642, 0x3feeb42b569d4f82, 0x3feeb1a4ca5d920f, - 0x3feeaf4736b527da, 0x3feead12d497c7fd, 0x3feeab07dd485429, - 0x3feea9268a5946b7, 0x3feea76f15ad2148, 0x3feea5e1b976dc09, - 0x3feea47eb03a5585, 0x3feea34634ccc320, 0x3feea23882552225, - 0x3feea155d44ca973, 0x3feea09e667f3bcd, 0x3feea012750bdabf, - 0x3fee9fb23c651a2f, 0x3fee9f7df9519484, 0x3fee9f75e8ec5f74, - 0x3fee9f9a48a58174, 0x3fee9feb564267c9, 0x3feea0694fde5d3f, - 0x3feea11473eb0187, 0x3feea1ed0130c132, 0x3feea2f336cf4e62, - 0x3feea427543e1a12, 0x3feea589994cce13, 0x3feea71a4623c7ad, - 0x3feea8d99b4492ed, 0x3feeaac7d98a6699, 0x3feeace5422aa0db, - 0x3feeaf3216b5448c, 0x3feeb1ae99157736, 0x3feeb45b0b91ffc6, - 0x3feeb737b0cdc5e5, 0x3feeba44cbc8520f, 0x3feebd829fde4e50, - 0x3feec0f170ca07ba, 0x3feec49182a3f090, 0x3feec86319e32323, - 0x3feecc667b5de565, 0x3feed09bec4a2d33, 0x3feed503b23e255d, - 0x3feed99e1330b358, 0x3feede6b5579fdbf, 0x3feee36bbfd3f37a, - 0x3feee89f995ad3ad, 0x3feeee07298db666, 0x3feef3a2b84f15fb, - 0x3feef9728de5593a, 0x3feeff76f2fb5e47, 0x3fef05b030a1064a, - 0x3fef0c1e904bc1d2, 0x3fef12c25bd71e09, 0x3fef199bdd85529c, - 0x3fef20ab5fffd07a, 0x3fef27f12e57d14b, 0x3fef2f6d9406e7b5, - 0x3fef3720dcef9069, 0x3fef3f0b555dc3fa, 0x3fef472d4a07897c, - 0x3fef4f87080d89f2, 0x3fef5818dcfba487, 0x3fef60e316c98398, - 0x3fef69e603db3285, 0x3fef7321f301b460, 0x3fef7c97337b9b5f, - 0x3fef864614f5a129, 0x3fef902ee78b3ff6, 0x3fef9a51fbc74c83, - 0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27, - 0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14, - 0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1, -# elif N == 256 - 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335, - 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc, - 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574, - 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836, - 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383, - 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85, - 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2, - 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e, - 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc, - 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e, - 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b, - 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f, - 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4, - 0x3fef582f95281c6b, 
0x3fef54873168b9aa, 0x3fef50e75eb44027, - 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6, - 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1, - 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f, - 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29, - 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1, - 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f, - 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56, - 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd, - 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff, - 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b, - 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866, - 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4, - 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422, - 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024, - 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897, - 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232, - 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0, - 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7, - 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d, - 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee, - 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82, - 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2, - 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd, - 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03, - 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148, - 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4, - 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320, - 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6, - 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd, - 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645, - 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484, - 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a, - 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9, - 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6, - 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132, - 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491, - 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13, - 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21, - 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699, - 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778, - 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736, - 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2, - 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f, - 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2, - 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090, - 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e, - 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33, - 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052, - 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf, - 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774, - 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666, - 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1, - 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47, - 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f, - 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09, - 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c, - 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 
0x3fef27f12e57d14b, - 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db, - 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa, - 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968, - 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487, - 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075, - 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460, - 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17, - 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6, - 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740, - 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1, - 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a, - 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540, - 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89, - 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1, - 0x3feff9d96b2a23d9, -# endif -}; diff --git a/math/aarch64/v_expf.c b/math/aarch64/v_expf.c deleted file mode 100644 index 34e8b60..0000000 --- a/math/aarch64/v_expf.c +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Single-precision vector e^x function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const struct data -{ - float32x4_t poly[5]; - float32x4_t shift, inv_ln2, ln2_hi, ln2_lo; - uint32x4_t exponent_bias; -#if !WANT_SIMD_EXCEPT - float32x4_t special_bound, scale_thresh; -#endif -} data = { - /* maxerr: 1.45358 +0.5 ulp. */ - .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), - V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, - .shift = V4 (0x1.8p23f), - .inv_ln2 = V4 (0x1.715476p+0f), - .ln2_hi = V4 (0x1.62e4p-1f), - .ln2_lo = V4 (0x1.7f7d1cp-20f), - .exponent_bias = V4 (0x3f800000), -#if !WANT_SIMD_EXCEPT - .special_bound = V4 (126.0f), - .scale_thresh = V4 (192.0f), -#endif -}; - -#define C(i) d->poly[i] - -#if WANT_SIMD_EXCEPT - -# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ -# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */ -# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */ - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) -{ - /* If fenv exceptions are to be triggered correctly, fall back to the scalar - routine to special lanes. */ - return v_call_f32 (expf, x, y, cmp); -} - -#else - -# define SpecialOffset v_u32 (0x82000000) -# define SpecialBias v_u32 (0x7f000000) - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, - float32x4_t scale, const struct data *d) -{ - /* 2^n may overflow, break it up into s1*s2. */ - uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); - float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); - float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); - uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); - float32x4_t r2 = vmulq_f32 (s1, s1); - float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); - /* Similar to r1 but avoids double rounding in the subnormal range. */ - float32x4_t r0 = vfmaq_f32 (scale, poly, scale); - float32x4_t r = vbslq_f32 (cmp1, r1, r0); - return vbslq_f32 (cmp2, r2, r); -} - -#endif - -float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x) -{ - const struct data *d = ptr_barrier (&data); - float32x4_t n, r, r2, scale, p, q, poly, z; - uint32x4_t cmp, e; - -#if WANT_SIMD_EXCEPT - /* asuint(x) - TinyBound >= BigBound - TinyBound. 
*/ - cmp = vcgeq_u32 ( - vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)), - TinyBound), - SpecialBound); - float32x4_t xm = x; - /* If any lanes are special, mask them with 1 and retain a copy of x to allow - special case handler to fix special lanes later. This is only necessary if - fenv exceptions are to be triggered correctly. */ - if (unlikely (v_any_u32 (cmp))) - x = vbslq_f32 (cmp, v_f32 (1), x); -#endif - - /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ - z = vfmaq_f32 (d->shift, x, d->inv_ln2); - n = vsubq_f32 (z, d->shift); - r = vfmsq_f32 (x, n, d->ln2_hi); - r = vfmsq_f32 (r, n, d->ln2_lo); - e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); - scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); - -#if !WANT_SIMD_EXCEPT - cmp = vcagtq_f32 (n, d->special_bound); -#endif - - r2 = vmulq_f32 (r, r); - p = vfmaq_f32 (C (1), C (0), r); - q = vfmaq_f32 (C (3), C (2), r); - q = vfmaq_f32 (q, p, r2); - p = vmulq_f32 (C (4), r); - poly = vfmaq_f32 (p, q, r2); - - if (unlikely (v_any_u32 (cmp))) -#if WANT_SIMD_EXCEPT - return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); -#else - return special_case (poly, n, e, cmp, scale, d); -#endif - - return vfmaq_f32 (scale, poly, scale); -} diff --git a/math/aarch64/v_log.c b/math/aarch64/v_log.c deleted file mode 100644 index 1d1c1fa..0000000 --- a/math/aarch64/v_log.c +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Double-precision vector log(x) function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const struct data -{ - uint64x2_t min_norm; - uint32x4_t special_bound; - float64x2_t poly[5]; - float64x2_t ln2; - uint64x2_t sign_exp_mask; -} data = { - /* Worst-case error: 1.17 + 0.5 ulp. - Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ - .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2), - V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3), - V2 (-0x1.554e550bd501ep-3) }, - .ln2 = V2 (0x1.62e42fefa39efp-1), - .min_norm = V2 (0x0010000000000000), - .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ - .sign_exp_mask = V2 (0xfff0000000000000) -}; - -#define A(i) d->poly[i] -#define N (1 << V_LOG_TABLE_BITS) -#define IndexMask (N - 1) -#define Off v_u64 (0x3fe6900900000000) - -struct entry -{ - float64x2_t invc; - float64x2_t logc; -}; - -static inline struct entry -lookup (uint64x2_t i) -{ - /* Since N is a power of 2, n % N = n & (N - 1). 
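The identity quoted in this comment is why IndexMask works: for a power-of-two table size, reduction modulo N is just a bit mask, and the index itself is taken from the mantissa bits directly below the exponent field. A small stand-alone illustration, where TABLE_BITS plays the role of V_LOG_TABLE_BITS and the assert merely demonstrates the identity:

#include <assert.h>
#include <stdint.h>

#define TABLE_BITS 7
#define TABLE_SIZE (1 << TABLE_BITS)

static inline uint64_t
log_table_index (uint64_t ix)
{
  /* The top TABLE_BITS mantissa bits pick one of TABLE_SIZE subintervals.  */
  return (ix >> (52 - TABLE_BITS)) & (TABLE_SIZE - 1);
}

static void
check_mask_identity (void)
{
  for (uint64_t n = 0; n < 4 * TABLE_SIZE; n++)
    assert (n % TABLE_SIZE == (n & (TABLE_SIZE - 1)));
}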
*/ - struct entry e; - uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; - uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; - float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc); - float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc); - e.invc = vuzp1q_f64 (e0, e1); - e.logc = vuzp2q_f64 (e0, e1); - return e; -} - -static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, - uint32x2_t cmp) -{ - return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp)); -} - -float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x) -{ - const struct data *d = ptr_barrier (&data); - float64x2_t z, r, r2, p, y, kd, hi; - uint64x2_t ix, iz, tmp; - uint32x2_t cmp; - int64x2_t k; - struct entry e; - - ix = vreinterpretq_u64_f64 (x); - cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm), - vget_low_u32 (d->special_bound)); - - /* x = 2^k z; where z is in range [Off,2*Off) and exact. - The range is split into N subintervals. - The ith subinterval contains z and c is near its center. */ - tmp = vsubq_u64 (ix, Off); - k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */ - iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); - z = vreinterpretq_f64_u64 (iz); - e = lookup (tmp); - - /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ - r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); - kd = vcvtq_f64_s64 (k); - - /* hi = r + log(c) + k*Ln2. */ - hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2); - /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ - r2 = vmulq_f64 (r, r); - y = vfmaq_f64 (A (2), A (3), r); - p = vfmaq_f64 (A (0), A (1), r); - y = vfmaq_f64 (y, A (4), r2); - y = vfmaq_f64 (p, y, r2); - - if (unlikely (v_any_u32h (cmp))) - return special_case (x, y, hi, r2, cmp); - return vfmaq_f64 (hi, y, r2); -} diff --git a/math/aarch64/v_log_data.c b/math/aarch64/v_log_data.c deleted file mode 100644 index 82351bb..0000000 --- a/math/aarch64/v_log_data.c +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Lookup table for double-precision log(x) vector function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" - -#define N (1 << V_LOG_TABLE_BITS) - -const struct v_log_data __v_log_data = { - /* Algorithm: - - x = 2^k z - log(x) = k ln2 + log(c) + poly(z/c - 1) - - where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1, - N=128) and log(c) and 1/c for the ith subinterval comes from lookup tables: - - table[i].invc = 1/c - table[i].logc = (double)log(c) - - where c is near the center of the subinterval and is chosen by trying several - floating point invc candidates around 1/center and selecting one for which - the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval - that contains 1 and the previous one got tweaked to avoid cancellation. 
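The decomposition described above can be checked against plain libm calls. The sketch below recomputes invc and logc from a chosen pivot c instead of reading the generated table, and uses log1p in place of the routine's polynomial; it is a reference for the mathematics only, not how the vector code evaluates it:

#include <math.h>

/* log(x) for x = 2^k * z, with c near the centre of z's subinterval.  */
static double
log_via_decomposition (double z, int k, double c)
{
  double invc = 1.0 / c;            /* plays the role of table[i].invc */
  double logc = log (c);            /* plays the role of table[i].logc */
  double r = z * invc - 1.0;        /* small residual, z/c - 1 */
  return k * 0x1.62e42fefa39efp-1   /* k * ln2 */
         + logc + log1p (r);
}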
*/ - .table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 }, - { 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 }, - { 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 }, - { 0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2 }, - { 0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2 }, - { 0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2 }, - { 0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2 }, - { 0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2 }, - { 0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2 }, - { 0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2 }, - { 0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2 }, - { 0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2 }, - { 0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2 }, - { 0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2 }, - { 0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2 }, - { 0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2 }, - { 0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2 }, - { 0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2 }, - { 0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2 }, - { 0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3 }, - { 0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3 }, - { 0x1.446f12b278001p+0, -0x1.e52e160484698p-3 }, - { 0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3 }, - { 0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3 }, - { 0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3 }, - { 0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3 }, - { 0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3 }, - { 0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3 }, - { 0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3 }, - { 0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3 }, - { 0x1.36987540fbf53p+0, -0x1.8be843d796044p-3 }, - { 0x1.352166b648f61p+0, -0x1.82395ecc477edp-3 }, - { 0x1.33adddb3eb575p+0, -0x1.7896240966422p-3 }, - { 0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3 }, - { 0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3 }, - { 0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3 }, - { 0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3 }, - { 0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3 }, - { 0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3 }, - { 0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3 }, - { 0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3 }, - { 0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3 }, - { 0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3 }, - { 0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3 }, - { 0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3 }, - { 0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4 }, - { 0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4 }, - { 0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4 }, - { 0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4 }, - { 0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4 }, - { 0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4 }, - { 0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4 }, - { 0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4 }, - { 0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4 }, - { 0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4 }, - { 0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4 }, - { 0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4 }, - { 0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4 }, - { 0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4 }, - { 0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4 }, - { 0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5 }, - { 0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5 }, - { 0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5 }, - { 0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5 }, - { 0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5 }, - { 0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5 }, - { 0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5 }, - { 0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5 }, - { 0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6 }, - { 
0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6 }, - { 0x1.05193497a7cc5p+0, -0x1.43183683400acp-6 }, - { 0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6 }, - { 0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7 }, - { 0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7 }, - { 0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9 }, - { 1.0, 0.0 }, - { 0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8 }, - { 0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7 }, - { 0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6 }, - { 0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6 }, - { 0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5 }, - { 0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5 }, - { 0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5 }, - { 0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5 }, - { 0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4 }, - { 0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4 }, - { 0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4 }, - { 0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4 }, - { 0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4 }, - { 0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4 }, - { 0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4 }, - { 0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4 }, - { 0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4 }, - { 0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3 }, - { 0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3 }, - { 0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3 }, - { 0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3 }, - { 0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3 }, - { 0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3 }, - { 0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3 }, - { 0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3 }, - { 0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3 }, - { 0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3 }, - { 0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3 }, - { 0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3 }, - { 0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3 }, - { 0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3 }, - { 0x1.9998e1480b618p-1, 0x1.c903161240163p-3 }, - { 0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3 }, - { 0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3 }, - { 0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3 }, - { 0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3 }, - { 0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2 }, - { 0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2 }, - { 0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2 }, - { 0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2 }, - { 0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2 }, - { 0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2 }, - { 0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2 }, - { 0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2 }, - { 0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2 }, - { 0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2 }, - { 0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2 }, - { 0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2 }, - { 0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2 }, - { 0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2 }, - { 0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2 }, - { 0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2 } } -}; diff --git a/math/aarch64/v_logf.c b/math/aarch64/v_logf.c deleted file mode 100644 index 66ebbbc..0000000 --- a/math/aarch64/v_logf.c +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Single-precision vector log function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const struct data -{ - uint32x4_t min_norm; - uint16x8_t special_bound; - float32x4_t poly[7]; - float32x4_t ln2, tiny_bound; - uint32x4_t off, mantissa_mask; -} data = { - /* 3.34 ulp error. 
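Error figures like the 3.34 ulp quoted here compare the float result against a higher-precision reference, measured in units of the spacing between adjacent floats at the reference value. A rough way to compute that figure for a single input is sketched below; the project's test harness is considerably more careful around exponent boundaries and rounding modes, so treat this only as an illustration of the unit:

#include <math.h>

static double
error_in_ulp (float got, double want)
{
  float w = (float) want;
  /* Spacing of representable floats just above the reference value.  */
  double one_ulp = (double) (nextafterf (w, INFINITY) - w);
  return ((double) got - want) / one_ulp;
}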
*/ - .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f), - V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f), - V4 (-0x1.ffffc8p-2f) }, - .ln2 = V4 (0x1.62e43p-1f), - .tiny_bound = V4 (0x1p-126), - .min_norm = V4 (0x00800000), - .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ - .off = V4 (0x3f2aaaab), /* 0.666667. */ - .mantissa_mask = V4 (0x007fffff) -}; - -#define P(i) d->poly[7 - i] - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p, - uint16x4_t cmp) -{ - /* Fall back to scalar code. */ - return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp)); -} - -float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x) -{ - const struct data *d = ptr_barrier (&data); - float32x4_t n, p, q, r, r2, y; - uint32x4_t u; - uint16x4_t cmp; - - u = vreinterpretq_u32_f32 (x); - cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm), - vget_low_u16 (d->special_bound)); - - /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u = vsubq_u32 (u, d->off); - n = vcvtq_f32_s32 ( - vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ - u = vandq_u32 (u, d->mantissa_mask); - u = vaddq_u32 (u, d->off); - r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); - - /* y = log(1+r) + n*ln2. */ - r2 = vmulq_f32 (r, r); - /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ - p = vfmaq_f32 (P (5), P (6), r); - q = vfmaq_f32 (P (3), P (4), r); - y = vfmaq_f32 (P (1), P (2), r); - p = vfmaq_f32 (p, P (7), r2); - q = vfmaq_f32 (q, p, r2); - y = vfmaq_f32 (y, q, r2); - p = vfmaq_f32 (r, d->ln2, n); - - if (unlikely (v_any_u16h (cmp))) - return special_case (x, y, r2, p, cmp); - return vfmaq_f32 (p, y, r2); -} diff --git a/math/aarch64/v_math.h b/math/aarch64/v_math.h deleted file mode 100644 index 1dc9916..0000000 --- a/math/aarch64/v_math.h +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Vector math abstractions. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef _V_MATH_H -#define _V_MATH_H - -#if !__aarch64__ -# error "Cannot build without AArch64" -#endif - -#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) - -#define V_NAME_F1(fun) _ZGVnN4v_##fun##f -#define V_NAME_D1(fun) _ZGVnN2v_##fun -#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f -#define V_NAME_D2(fun) _ZGVnN2vv_##fun - -#include -#include "../math_config.h" -#include - -/* Shorthand helpers for declaring constants. */ -# define V2(X) { X, X } -# define V4(X) { X, X, X, X } -# define V8(X) { X, X, X, X, X, X, X, X } - -static inline int -v_any_u16h (uint16x4_t x) -{ - return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0; -} - -static inline int -v_lanes32 (void) -{ - return 4; -} - -static inline float32x4_t -v_f32 (float x) -{ - return (float32x4_t) V4 (x); -} -static inline uint32x4_t -v_u32 (uint32_t x) -{ - return (uint32x4_t) V4 (x); -} -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u32 (uint32x4_t x) -{ - /* assume elements in x are either 0 or -1u. 
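Because comparison results are all-zeros or all-ones in each lane, "is any lane set" reduces to checking whether the OR (or sum) of the lanes is non-zero, which is what the pairwise add below does in a single instruction. A portable stand-in using a plain array instead of a NEON vector:

#include <stdint.h>

static inline int
any_lane_set (const uint32_t lanes[4])
{
  /* Each lane is 0 or 0xffffffff, so OR-ing them loses no information.  */
  return (lanes[0] | lanes[1] | lanes[2] | lanes[3]) != 0;
}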
*/ - return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; -} -static inline int -v_any_u32h (uint32x2_t x) -{ - return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0; -} -static inline float32x4_t -v_lookup_f32 (const float *tab, uint32x4_t idx) -{ - return (float32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; -} -static inline uint32x4_t -v_lookup_u32 (const uint32_t *tab, uint32x4_t idx) -{ - return (uint32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; -} -static inline float32x4_t -v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p) -{ - return (float32x4_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], - p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; -} -static inline float32x4_t -v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2, - float32x4_t y, uint32x4_t p) -{ - return (float32x4_t){p[0] ? f (x1[0], x2[0]) : y[0], - p[1] ? f (x1[1], x2[1]) : y[1], - p[2] ? f (x1[2], x2[2]) : y[2], - p[3] ? f (x1[3], x2[3]) : y[3]}; -} - -static inline int -v_lanes64 (void) -{ - return 2; -} -static inline float64x2_t -v_f64 (double x) -{ - return (float64x2_t) V2 (x); -} -static inline uint64x2_t -v_u64 (uint64_t x) -{ - return (uint64x2_t) V2 (x); -} -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u64 (uint64x2_t x) -{ - /* assume elements in x are either 0 or -1u. */ - return vpaddd_u64 (x) != 0; -} -static inline float64x2_t -v_lookup_f64 (const double *tab, uint64x2_t idx) -{ - return (float64x2_t){tab[idx[0]], tab[idx[1]]}; -} -static inline uint64x2_t -v_lookup_u64 (const uint64_t *tab, uint64x2_t idx) -{ - return (uint64x2_t){tab[idx[0]], tab[idx[1]]}; -} -static inline float64x2_t -v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p) -{ - double p1 = p[1]; - double x1 = x[1]; - if (likely (p[0])) - y[0] = f (x[0]); - if (likely (p1)) - y[1] = f (x1); - return y; -} - -#endif diff --git a/math/aarch64/v_powf.c b/math/aarch64/v_powf.c deleted file mode 100644 index 3a4163a..0000000 --- a/math/aarch64/v_powf.c +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Single-precision vector powf function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "v_math.h" - -#define Min v_u32 (0x00800000) -#define Max v_u32 (0x7f800000) -#define Thresh v_u32 (0x7f000000) /* Max - Min. */ -#define MantissaMask v_u32 (0x007fffff) - -#define A data.log2_poly -#define C data.exp2f_poly - -/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). 
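The error budget in this comment reflects how the routine is structured: log2(x) and the scaling by y are computed in double precision per lane and then fed to a small exp2 evaluation, so the relative errors of the two stages combine (scaled by the exponent range) on top of the final 0.5 ulp rounding. Stripped of the tables and polynomials, the underlying identity is just the one below; special cases (x <= 0, infinities, NaNs) are routed to the scalar powf instead:

#include <math.h>

/* Reference shape of the computation for finite x > 0.  */
static float
powf_reference (float x, float y)
{
  double l = log2 ((double) x);           /* per-lane double-precision log2 */
  return (float) exp2 ((double) y * l);   /* 2^(y*log2(x)) = x^y */
}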
*/ -#define Off v_u32 (0x3f35d000) - -#define V_POWF_LOG2_TABLE_BITS 5 -#define V_EXP2F_TABLE_BITS 5 -#define Log2IdxMask v_u32 ((1 << V_POWF_LOG2_TABLE_BITS) - 1) -#define Scale ((double) (1 << V_EXP2F_TABLE_BITS)) - -static const struct -{ - struct - { - double invc, logc; - } log2_tab[1 << V_POWF_LOG2_TABLE_BITS]; - double log2_poly[4]; - uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS]; - double exp2f_poly[3]; -} data = { - .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale}, - {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale}, - {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale}, - {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale}, - {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale}, - {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale}, - {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale}, - {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale}, - {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale}, - {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale}, - {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale}, - {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale}, - {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale}, - {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale}, - {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale}, - {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale}, - {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale}, - {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale}, - {0x1p+0, 0x0p+0 * Scale}, - {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale}, - {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale}, - {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale}, - {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale}, - {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale}, - {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale}, - {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale}, - {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale}, - {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale}, - {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale}, - {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale}, - {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale}, - {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},}, - .log2_poly = { /* rel err: 1.5 * 2^-30. */ - -0x1.6ff5daa3b3d7cp-2 * Scale, 0x1.ec81d03c01aebp-2 * Scale, - -0x1.71547bb43f101p-1 * Scale, 0x1.7154764a815cbp0 * Scale,}, - .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, - 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa, - 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715, - 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, - 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, - 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, - 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db, - 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, - 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, - 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, - 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,}, - .exp2f_poly = { /* rel err: 1.69 * 2^-34. 
*/ - 0x1.c6af84b912394p-5 / Scale / Scale / Scale, - 0x1.ebfce50fac4f3p-3 / Scale / Scale, - 0x1.62e42ff0c52d6p-1 / Scale}}; - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp) -{ - return v_call2_f32 (powf, x, y, ret, cmp); -} - -float32x4_t VPCS_ATTR V_NAME_F2 (pow) (float32x4_t x, float32x4_t y) -{ - uint32x4_t u = vreinterpretq_u32_f32 (x); - uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh); - uint32x4_t tmp = vsubq_u32 (u, Off); - uint32x4_t i = vandq_u32 (vshrq_n_u32 (tmp, (23 - V_POWF_LOG2_TABLE_BITS)), - Log2IdxMask); - uint32x4_t top = vbicq_u32 (tmp, MantissaMask); - uint32x4_t iz = vsubq_u32 (u, top); - int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top), - 23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */ - - float32x4_t ret; - for (int lane = 0; lane < 4; lane++) - { - /* Use double precision for each lane. */ - double invc = data.log2_tab[i[lane]].invc; - double logc = data.log2_tab[i[lane]].logc; - double z = (double) asfloat (iz[lane]); - - /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */ - double r = __builtin_fma (z, invc, -1.0); - double y0 = logc + (double) k[lane]; - - /* Polynomial to approximate log1p(r)/ln2. */ - double logx = A[0]; - logx = r * logx + A[1]; - logx = r * logx + A[2]; - logx = r * logx + A[3]; - logx = r * logx + y0; - double ylogx = y[lane] * logx; - cmp[lane] = (asuint64 (ylogx) >> 47 & 0xffff) - >= asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS)) >> 47 - ? 1 - : cmp[lane]; - - /* N*x = k + r with r in [-1/2, 1/2]. */ - double kd = round (ylogx); - uint64_t ki = lround (ylogx); - r = ylogx - kd; - - /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */ - uint64_t t = data.exp2f_tab[ki % (1 << V_EXP2F_TABLE_BITS)]; - t += ki << (52 - V_EXP2F_TABLE_BITS); - double s = asdouble (t); - double p = C[0]; - p = __builtin_fma (p, r, C[1]); - p = __builtin_fma (p, r, C[2]); - p = __builtin_fma (p, s * r, s); - - ret[lane] = p; - } - if (unlikely (v_any_u32 (cmp))) - return special_case (x, y, ret, cmp); - return ret; -} diff --git a/math/aarch64/v_sin.c b/math/aarch64/v_sin.c deleted file mode 100644 index 04129c3..0000000 --- a/math/aarch64/v_sin.c +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Double-precision vector sin function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const struct data -{ - float64x2_t poly[7]; - float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; -} data = { - .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), - V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), - V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), - V2 (-0x1.9e9540300a1p-41) }, - - .range_val = V2 (0x1p23), - .inv_pi = V2 (0x1.45f306dc9c883p-2), - .pi_1 = V2 (0x1.921fb54442d18p+1), - .pi_2 = V2 (0x1.1a62633145c06p-53), - .pi_3 = V2 (0x1.c1cd129024e09p-106), - .shift = V2 (0x1.8p52), -}; - -#if WANT_SIMD_EXCEPT -# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */ -# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */ -#endif - -#define C(i) d->poly[i] - -static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) -{ - y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); - return v_call_f64 (sin, x, y, cmp); -} - -/* Vector (AdvSIMD) sin approximation. 
- Maximum observed error in [-pi/2, pi/2], where argument is not reduced, - is 2.87 ULP: - _ZGVnN2v_sin (0x1.921d5c6a07142p+0) got 0x1.fffffffa7dc02p-1 - want 0x1.fffffffa7dc05p-1 - Maximum observed error in the entire non-special domain ([-2^23, 2^23]) - is 3.22 ULP: - _ZGVnN2v_sin (0x1.5702447b6f17bp+22) got 0x1.ffdcd125c84fbp-3 - want 0x1.ffdcd125c84f8p-3. */ -float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x) -{ - const struct data *d = ptr_barrier (&data); - float64x2_t n, r, r2, r3, r4, y, t1, t2, t3; - uint64x2_t odd, cmp; - -#if WANT_SIMD_EXCEPT - /* Detect |x| <= TinyBound or |x| >= RangeVal. If fenv exceptions are to be - triggered correctly, set any special lanes to 1 (which is neutral w.r.t. - fenv). These lanes will be fixed by special-case handler later. */ - uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x)); - cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh); - r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x); -#else - r = x; - cmp = vcageq_f64 (x, d->range_val); -#endif - - /* n = rint(|x|/pi). */ - n = vfmaq_f64 (d->shift, d->inv_pi, r); - odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); - n = vsubq_f64 (n, d->shift); - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ - r = vfmsq_f64 (r, d->pi_1, n); - r = vfmsq_f64 (r, d->pi_2, n); - r = vfmsq_f64 (r, d->pi_3, n); - - /* sin(r) poly approx. */ - r2 = vmulq_f64 (r, r); - r3 = vmulq_f64 (r2, r); - r4 = vmulq_f64 (r2, r2); - - t1 = vfmaq_f64 (C (4), C (5), r2); - t2 = vfmaq_f64 (C (2), C (3), r2); - t3 = vfmaq_f64 (C (0), C (1), r2); - - y = vfmaq_f64 (t1, C (6), r4); - y = vfmaq_f64 (t2, y, r4); - y = vfmaq_f64 (t3, y, r4); - y = vfmaq_f64 (r, y, r3); - - if (unlikely (v_any_u64 (cmp))) - return special_case (x, y, odd, cmp); - return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); -} diff --git a/math/aarch64/v_sinf.c b/math/aarch64/v_sinf.c deleted file mode 100644 index 3368798..0000000 --- a/math/aarch64/v_sinf.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Single-precision vector sin function. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "mathlib.h" -#include "v_math.h" - -static const struct data -{ - float32x4_t poly[4]; - float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; -} data = { - /* 1.886 ulp error. */ - .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), - V4 (0x1.5b2e76p-19f) }, - - .pi_1 = V4 (0x1.921fb6p+1f), - .pi_2 = V4 (-0x1.777a5cp-24f), - .pi_3 = V4 (-0x1.ee59dap-49f), - - .inv_pi = V4 (0x1.45f306p-2f), - .shift = V4 (0x1.8p+23f), - .range_val = V4 (0x1p20f) -}; - -#if WANT_SIMD_EXCEPT -# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */ -# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */ -#endif - -#define C(i) d->poly[i] - -static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) -{ - /* Fall back to scalar code. */ - y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); - return v_call_f32 (sinf, x, y, cmp); -} - -float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x) -{ - const struct data *d = ptr_barrier (&data); - float32x4_t n, r, r2, y; - uint32x4_t odd, cmp; - -#if WANT_SIMD_EXCEPT - uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x)); - cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh); - /* If fenv exceptions are to be triggered correctly, set any special lanes - to 1 (which is neutral w.r.t. fenv). 
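Both the double-precision routine above and the single-precision one below reduce the argument the same way: n = rint(x/pi) determines the quadrant sign, and pi is split into three parts so the subtractions stay accurate even for large |x|. A scalar sketch of that step, using the constants from the double-precision table above; the vector code uses fused multiply-subtracts, so this unfused version is slightly less accurate:

#include <math.h>

static double
reduce_mod_pi (double x, double *n_out)
{
  const double pi_1 = 0x1.921fb54442d18p+1;
  const double pi_2 = 0x1.1a62633145c06p-53;
  const double pi_3 = 0x1.c1cd129024e09p-106;
  double n = rint (x * 0x1.45f306dc9c883p-2);   /* rint(x/pi) */
  double r = x;
  r = r - n * pi_1;   /* high part of n*pi */
  r = r - n * pi_2;   /* correction terms */
  r = r - n * pi_3;
  *n_out = n;         /* odd n means the sign of sin flips */
  return r;           /* r lands roughly in [-pi/2, pi/2] */
}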
These lanes will be fixed by - special-case handler later. */ - r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x); -#else - r = x; - cmp = vcageq_f32 (x, d->range_val); -#endif - - /* n = rint(|x|/pi) */ - n = vfmaq_f32 (d->shift, d->inv_pi, r); - odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); - n = vsubq_f32 (n, d->shift); - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ - r = vfmsq_f32 (r, d->pi_1, n); - r = vfmsq_f32 (r, d->pi_2, n); - r = vfmsq_f32 (r, d->pi_3, n); - - /* y = sin(r) */ - r2 = vmulq_f32 (r, r); - y = vfmaq_f32 (C (2), C (3), r2); - y = vfmaq_f32 (C (1), y, r2); - y = vfmaq_f32 (C (0), y, r2); - y = vfmaq_f32 (r, vmulq_f32 (y, r2), r); - - if (unlikely (v_any_u32 (cmp))) - return special_case (x, y, odd, cmp); - return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); -} diff --git a/math/cosf.c b/math/cosf.c index 6293ce8..f29f194 100644 --- a/math/cosf.c +++ b/math/cosf.c @@ -1,8 +1,8 @@ /* * Single-precision cos function. * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2019, Arm Limited. + * SPDX-License-Identifier: MIT */ #include @@ -22,7 +22,7 @@ cosf (float y) int n; const sincos_t *p = &__sincosf_table[0]; - if (abstop12 (y) < abstop12 (pio4f)) + if (abstop12 (y) < abstop12 (pio4)) { double x2 = x * x; diff --git a/math/erf.c b/math/erf.c index 5f9f40d..12d7e51 100644 --- a/math/erf.c +++ b/math/erf.c @@ -2,7 +2,7 @@ * Double-precision erf(x) function. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/erf_data.c b/math/erf_data.c index 10cf1fa..807875b 100644 --- a/math/erf_data.c +++ b/math/erf_data.c @@ -2,7 +2,7 @@ * Shared data between erf and erfc. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/erff.c b/math/erff.c index 9fa476d..a58e825 100644 --- a/math/erff.c +++ b/math/erff.c @@ -2,7 +2,7 @@ * Single-precision erf(x) function. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/erff_data.c b/math/erff_data.c index f822788..fa6b1ef 100644 --- a/math/erff_data.c +++ b/math/erff_data.c @@ -2,7 +2,7 @@ * Data for approximation of erff. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/exp.c b/math/exp.c index 1de500c..7f5024c 100644 --- a/math/exp.c +++ b/math/exp.c @@ -2,7 +2,7 @@ * Double-precision e^x function. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/exp10.c b/math/exp10.c deleted file mode 100644 index 0fbec4c..0000000 --- a/math/exp10.c +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Double-precision 10^x function. - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "math_config.h" - -#define N (1 << EXP_TABLE_BITS) -#define IndexMask (N - 1) -#define OFlowBound 0x1.34413509f79ffp8 /* log10(DBL_MAX). */ -#define UFlowBound -0x1.5ep+8 /* -350. */ -#define SmallTop 0x3c6 /* top12(0x1p-57). 
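top12 here means the top 12 bits of the double's bit pattern, i.e. the sign and biased exponent; exp10 classifies the magnitude of x entirely from that value, as the abstop computation further down shows (it masks the sign off with 0x7ff). A one-line helper equivalent to that classification step:

#include <stdint.h>
#include <string.h>

static inline uint32_t
top12 (double x)
{
  uint64_t u;
  memcpy (&u, &x, sizeof u);
  return (uint32_t) (u >> 52);   /* sign bit + 11 exponent bits */
}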
*/ -#define BigTop 0x407 /* top12(0x1p8). */ -#define Thresh 0x41 /* BigTop - SmallTop. */ -#define Shift __exp_data.shift -#define C(i) __exp_data.exp10_poly[i] - -static double -special_case (uint64_t sbits, double_t tmp, uint64_t ki) -{ - double_t scale, y; - - if (ki - (1ull << 16) < 0x80000000) - { - /* The exponent of scale might have overflowed by 1. */ - sbits -= 1ull << 52; - scale = asdouble (sbits); - y = 2 * (scale + scale * tmp); - return check_oflow (eval_as_double (y)); - } - - /* n < 0, need special care in the subnormal range. */ - sbits += 1022ull << 52; - scale = asdouble (sbits); - y = scale + scale * tmp; - - if (y < 1.0) - { - /* Round y to the right precision before scaling it into the subnormal - range to avoid double rounding that can cause 0.5+E/2 ulp error where - E is the worst-case ulp error outside the subnormal range. So this - is only useful if the goal is better than 1 ulp worst-case error. */ - double_t lo = scale - y + scale * tmp; - double_t hi = 1.0 + y; - lo = 1.0 - hi + y + lo; - y = eval_as_double (hi + lo) - 1.0; - /* Avoid -0.0 with downward rounding. */ - if (WANT_ROUNDING && y == 0.0) - y = 0.0; - /* The underflow exception needs to be signaled explicitly. */ - force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); - } - y = 0x1p-1022 * y; - - return check_uflow (y); -} - -/* Double-precision 10^x approximation. Largest observed error is ~0.513 ULP. */ -double -exp10 (double x) -{ - uint64_t ix = asuint64 (x); - uint32_t abstop = (ix >> 52) & 0x7ff; - - if (unlikely (abstop - SmallTop >= Thresh)) - { - if (abstop - SmallTop >= 0x80000000) - /* Avoid spurious underflow for tiny x. - Note: 0 is common input. */ - return x + 1; - if (abstop == 0x7ff) - return ix == asuint64 (-INFINITY) ? 0.0 : x + 1.0; - if (x >= OFlowBound) - return __math_oflow (0); - if (x < UFlowBound) - return __math_uflow (0); - - /* Large x is special-cased below. */ - abstop = 0; - } - - /* Reduce x: z = x * N / log10(2), k = round(z). */ - double_t z = __exp_data.invlog10_2N * x; - double_t kd; - int64_t ki; -#if TOINT_INTRINSICS - kd = roundtoint (z); - ki = converttoint (z); -#else - kd = eval_as_double (z + Shift); - kd -= Shift; - ki = kd; -#endif - - /* r = x - k * log10(2), r in [-0.5, 0.5]. */ - double_t r = x; - r = __exp_data.neglog10_2hiN * kd + r; - r = __exp_data.neglog10_2loN * kd + r; - - /* exp10(x) = 2^(k/N) * 2^(r/N). - Approximate the two components separately. */ - - /* s = 2^(k/N), using lookup table. */ - uint64_t e = ki << (52 - EXP_TABLE_BITS); - uint64_t i = (ki & IndexMask) * 2; - uint64_t u = __exp_data.tab[i + 1]; - uint64_t sbits = u + e; - - double_t tail = asdouble (__exp_data.tab[i]); - - /* 2^(r/N) ~= 1 + r * Poly(r). */ - double_t r2 = r * r; - double_t p = C (0) + r * C (1); - double_t y = C (2) + r * C (3); - y = y + r2 * C (4); - y = p + r2 * y; - y = tail + y * r; - - if (unlikely (abstop == 0)) - return special_case (sbits, y, ki); - - /* Assemble components: - y = 2^(r/N) * 2^(k/N) - ~= (y + 1) * s. */ - double_t s = asdouble (sbits); - return eval_as_double (s * y + s); -} diff --git a/math/exp2.c b/math/exp2.c index a1eee44..35ab39f 100644 --- a/math/exp2.c +++ b/math/exp2.c @@ -2,7 +2,7 @@ * Double-precision 2^x function. * * Copyright (c) 2018-2019, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/exp2f.c b/math/exp2f.c index 776c3dd..94b3253 100644 --- a/math/exp2f.c +++ b/math/exp2f.c @@ -2,7 +2,7 @@ * Single-precision 2^x function. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/exp2f_data.c b/math/exp2f_data.c index f0cb7fc..3fb0ad1 100644 --- a/math/exp2f_data.c +++ b/math/exp2f_data.c @@ -2,7 +2,7 @@ * Shared data between expf, exp2f and powf. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/exp_data.c b/math/exp_data.c index 9df4e0b..cba7683 100644 --- a/math/exp_data.c +++ b/math/exp_data.c @@ -2,7 +2,7 @@ * Shared data between exp, exp2 and pow. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" @@ -12,7 +12,6 @@ const struct exp_data __exp_data = { // N/ln2 .invln2N = 0x1.71547652b82fep0 * N, -.invlog10_2N = 0x1.a934f0979a371p1 * N, // -ln2/N #if N == 64 .negln2hiN = -0x1.62e42fefa0000p-7, @@ -27,8 +26,6 @@ const struct exp_data __exp_data = { .negln2hiN = -0x1.62e42fef80000p-10, .negln2loN = -0x1.1cf79abc9e3b4p-45, #endif -.neglog10_2hiN = -0x1.3441350ap-2 / N, -.neglog10_2loN = 0x1.0c0219dc1da99p-39 / N, // Used for rounding when !TOINT_INTRINSICS #if EXP_USE_TOINT_NARROW .shift = 0x1800000000.8p0, @@ -150,24 +147,6 @@ const struct exp_data __exp_data = { 0x1.3b2ab786ee1dap-7, #endif }, -.exp10_poly = { -#if EXP10_POLY_WIDE -/* Range is wider if using shift-based reduction: coeffs generated - using Remez in [-log10(2)/128, log10(2)/128 ]. */ -0x1.26bb1bbb55515p1, -0x1.53524c73cd32bp1, -0x1.0470591e1a108p1, -0x1.2bd77b12fe9a8p0, -0x1.14289fef24b78p-1 -#else -/* Coeffs generated using Remez in [-log10(2)/256, log10(2)/256 ]. */ -0x1.26bb1bbb55516p1, -0x1.53524c73ce9fep1, -0x1.0470591ce4b26p1, -0x1.2bd76577fe684p0, -0x1.1446eeccd0efbp-1 -#endif -}, // 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N) // tab[2*k] = asuint64(T[k]) // tab[2*k+1] = asuint64(H[k]) - (k << 52)/N diff --git a/math/expf.c b/math/expf.c index 08a20d5..9b2f0c3 100644 --- a/math/expf.c +++ b/math/expf.c @@ -2,7 +2,7 @@ * Single-precision e^x function. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/include/mathlib.h b/math/include/mathlib.h index 64cbb9c..279d829 100644 --- a/math/include/mathlib.h +++ b/math/include/mathlib.h @@ -1,8 +1,8 @@ /* * Public API. * - * Copyright (c) 2015-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2015-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #ifndef _MATHLIB_H @@ -18,33 +18,74 @@ float cosf (float); void sincosf (float, float*, float*); double exp (double); -double exp10 (double); double exp2 (double); double log (double); double log2 (double); double pow (double, double); +/* Scalar functions using the vector algorithm with identical result. 
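A brief caller-side note on what this comment promises: the __s_* entry points run the same algorithm as the corresponding vector routines, one element at a time, so they can serve as a scalar reference when checking individual vector lanes. The helper below is purely hypothetical and not part of the header; it only illustrates that the generic scalar routine and the vector-algorithm scalar routine may legitimately differ in the last ulp:

#include <math.h>
#include "mathlib.h"

static int
results_differ (float x)
{
  /* expf is the generic scalar routine; __s_expf follows the vector
     algorithm, so the two may differ slightly while both stay in spec.  */
  return __s_expf (x) != expf (x);
}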
*/ +float __s_sinf (float); +float __s_cosf (float); +float __s_expf (float); +float __s_expf_1u (float); +float __s_exp2f (float); +float __s_exp2f_1u (float); +float __s_logf (float); +float __s_powf (float, float); +double __s_sin (double); +double __s_cos (double); +double __s_exp (double); +double __s_log (double); +double __s_pow (double, double); + #if __aarch64__ -# if __GNUC__ >= 5 +#if __GNUC__ >= 5 typedef __Float32x4_t __f32x4_t; typedef __Float64x2_t __f64x2_t; -# elif __clang_major__*100+__clang_minor__ >= 305 +#elif __clang_major__*100+__clang_minor__ >= 305 typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t; typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; -# else -# error Unsupported compiler -# endif +#else +#error Unsupported compiler +#endif + +/* Vector functions following the base PCS. */ +__f32x4_t __v_sinf (__f32x4_t); +__f32x4_t __v_cosf (__f32x4_t); +__f32x4_t __v_expf (__f32x4_t); +__f32x4_t __v_expf_1u (__f32x4_t); +__f32x4_t __v_exp2f (__f32x4_t); +__f32x4_t __v_exp2f_1u (__f32x4_t); +__f32x4_t __v_logf (__f32x4_t); +__f32x4_t __v_powf (__f32x4_t, __f32x4_t); +__f64x2_t __v_sin (__f64x2_t); +__f64x2_t __v_cos (__f64x2_t); +__f64x2_t __v_exp (__f64x2_t); +__f64x2_t __v_log (__f64x2_t); +__f64x2_t __v_pow (__f64x2_t, __f64x2_t); -# if __GNUC__ >= 9 || __clang_major__ >= 8 -# undef __vpcs -# define __vpcs __attribute__((__aarch64_vector_pcs__)) +#if __GNUC__ >= 9 || __clang_major__ >= 8 +#define __vpcs __attribute__((__aarch64_vector_pcs__)) + +/* Vector functions following the vector PCS. */ +__vpcs __f32x4_t __vn_sinf (__f32x4_t); +__vpcs __f32x4_t __vn_cosf (__f32x4_t); +__vpcs __f32x4_t __vn_expf (__f32x4_t); +__vpcs __f32x4_t __vn_expf_1u (__f32x4_t); +__vpcs __f32x4_t __vn_exp2f (__f32x4_t); +__vpcs __f32x4_t __vn_exp2f_1u (__f32x4_t); +__vpcs __f32x4_t __vn_logf (__f32x4_t); +__vpcs __f32x4_t __vn_powf (__f32x4_t, __f32x4_t); +__vpcs __f64x2_t __vn_sin (__f64x2_t); +__vpcs __f64x2_t __vn_cos (__f64x2_t); +__vpcs __f64x2_t __vn_exp (__f64x2_t); +__vpcs __f64x2_t __vn_log (__f64x2_t); +__vpcs __f64x2_t __vn_pow (__f64x2_t, __f64x2_t); /* Vector functions following the vector PCS using ABI names. */ __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t); -__vpcs __f32x4_t _ZGVnN4v_expf_1u (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t); -__vpcs __f32x4_t _ZGVnN4v_exp2f_1u (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t); @@ -53,7 +94,7 @@ __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t); __vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t); -# endif +#endif #endif #endif diff --git a/math/log.c b/math/log.c index 43dfc2a..d3b7bc6 100644 --- a/math/log.c +++ b/math/log.c @@ -2,7 +2,7 @@ * Double-precision log(x) function. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/log2.c b/math/log2.c index 3f9c21b..55102b7 100644 --- a/math/log2.c +++ b/math/log2.c @@ -2,7 +2,7 @@ * Double-precision log2(x) function. * * Copyright (c) 2018-2019, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/log2_data.c b/math/log2_data.c index 293bd7d..3fc9b47 100644 --- a/math/log2_data.c +++ b/math/log2_data.c @@ -2,7 +2,7 @@ * Data for log2. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/log2f.c b/math/log2f.c index 0a44fa2..acb629e 100644 --- a/math/log2f.c +++ b/math/log2f.c @@ -2,7 +2,7 @@ * Single-precision log2 function. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/log2f_data.c b/math/log2f_data.c index 4866ef7..f3546d7 100644 --- a/math/log2f_data.c +++ b/math/log2f_data.c @@ -2,7 +2,7 @@ * Data definition for log2f. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/log_data.c b/math/log_data.c index 3ecc1f4..96a098d 100644 --- a/math/log_data.c +++ b/math/log_data.c @@ -2,7 +2,7 @@ * Data for log. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/logf.c b/math/logf.c index 820f74c..cfbaee1 100644 --- a/math/logf.c +++ b/math/logf.c @@ -1,8 +1,8 @@ /* * Single-precision log function. * - * Copyright (c) 2017-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2017-2019, Arm Limited. + * SPDX-License-Identifier: MIT */ #include @@ -57,7 +57,7 @@ logf (float x) tmp = ix - OFF; i = (tmp >> (23 - LOGF_TABLE_BITS)) % N; k = (int32_t) tmp >> 23; /* arithmetic shift */ - iz = ix - (tmp & 0xff800000); + iz = ix - (tmp & 0x1ff << 23); invc = T[i].invc; logc = T[i].logc; z = (double_t) asfloat (iz); diff --git a/math/logf_data.c b/math/logf_data.c index 0424768..e8973ce 100644 --- a/math/logf_data.c +++ b/math/logf_data.c @@ -2,7 +2,7 @@ * Data definition for logf. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/math_config.h b/math/math_config.h index 394aaeb..e851043 100644 --- a/math/math_config.h +++ b/math/math_config.h @@ -1,8 +1,8 @@ /* * Configuration for math routines. * - * Copyright (c) 2017-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2017-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #ifndef _MATH_CONFIG_H @@ -92,17 +92,6 @@ # define unlikely(x) (x) #endif -/* Return ptr but hide its value from the compiler so accesses through it - cannot be optimized based on the contents. */ -#define ptr_barrier(ptr) \ - ({ \ - __typeof (ptr) __ptr = (ptr); \ - __asm("" : "+r"(__ptr)); \ - __ptr; \ - }) - -/* Symbol renames to avoid libc conflicts. */ - #if HAVE_FAST_ROUND /* When set, the roundtoint and converttoint functions are provided with the semantics documented below. */ @@ -392,22 +381,15 @@ extern const struct powf_log2_data #define EXP_USE_TOINT_NARROW 0 #define EXP2_POLY_ORDER 5 #define EXP2_POLY_WIDE 0 -/* Wider exp10 polynomial necessary for good precision in non-nearest rounding - and !TOINT_INTRINSICS. 
*/ -#define EXP10_POLY_WIDE 0 extern const struct exp_data { double invln2N; - double invlog10_2N; double shift; double negln2hiN; double negln2loN; - double neglog10_2hiN; - double neglog10_2loN; double poly[4]; /* Last four coefficients. */ double exp2_shift; double exp2_poly[EXP2_POLY_ORDER]; - double exp10_poly[5]; uint64_t tab[2*(1 << EXP_TABLE_BITS)]; } __exp_data HIDDEN; @@ -477,16 +459,4 @@ extern const struct erf_data double erfc_poly_F[ERFC_POLY_F_NCOEFFS]; } __erf_data HIDDEN; -#define V_EXP_TABLE_BITS 7 -extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN; - -#define V_LOG_TABLE_BITS 7 -extern const struct v_log_data -{ - struct - { - double invc, logc; - } table[1 << V_LOG_TABLE_BITS]; -} __v_log_data HIDDEN; - #endif diff --git a/math/math_err.c b/math/math_err.c index cfe0728..1bf9538 100644 --- a/math/math_err.c +++ b/math/math_err.c @@ -2,7 +2,7 @@ * Double-precision math error handling. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/math_errf.c b/math/math_errf.c index 4233918..d5350b8 100644 --- a/math/math_errf.c +++ b/math/math_errf.c @@ -2,7 +2,7 @@ * Single-precision math error handling. * * Copyright (c) 2017-2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/pow.c b/math/pow.c index af719fe..86842c6 100644 --- a/math/pow.c +++ b/math/pow.c @@ -2,7 +2,7 @@ * Double-precision x^y function. * * Copyright (c) 2018-2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/pow_log_data.c b/math/pow_log_data.c index 2a4c250..45569c5 100644 --- a/math/pow_log_data.c +++ b/math/pow_log_data.c @@ -2,7 +2,7 @@ * Data for the log part of pow. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/powf.c b/math/powf.c index 05c80bb..6ba45d3 100644 --- a/math/powf.c +++ b/math/powf.c @@ -2,7 +2,7 @@ * Single-precision pow function. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/powf_log2_data.c b/math/powf_log2_data.c index 243836a..97e0d98 100644 --- a/math/powf_log2_data.c +++ b/math/powf_log2_data.c @@ -2,7 +2,7 @@ * Data definition for powf. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "math_config.h" diff --git a/math/s_cos.c b/math/s_cos.c new file mode 100644 index 0000000..53a95b0 --- /dev/null +++ b/math/s_cos.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_cos.c" diff --git a/math/s_cosf.c b/math/s_cosf.c new file mode 100644 index 0000000..914c02e --- /dev/null +++ b/math/s_cosf.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_cosf.c" diff --git a/math/s_exp.c b/math/s_exp.c new file mode 100644 index 0000000..ac7246b --- /dev/null +++ b/math/s_exp.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_exp.c" diff --git a/math/s_exp2f.c b/math/s_exp2f.c new file mode 100644 index 0000000..df7dfd6 --- /dev/null +++ b/math/s_exp2f.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_exp2f.c" diff --git a/math/s_exp2f_1u.c b/math/s_exp2f_1u.c new file mode 100644 index 0000000..5e3852b --- /dev/null +++ b/math/s_exp2f_1u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_exp2f_1u.c" diff --git a/math/s_expf.c b/math/s_expf.c new file mode 100644 index 0000000..3492c46 --- /dev/null +++ b/math/s_expf.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_expf.c" diff --git a/math/s_expf_1u.c b/math/s_expf_1u.c new file mode 100644 index 0000000..eb7bbcb --- /dev/null +++ b/math/s_expf_1u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_expf_1u.c" diff --git a/math/s_log.c b/math/s_log.c new file mode 100644 index 0000000..23289cf --- /dev/null +++ b/math/s_log.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_log.c" diff --git a/math/s_logf.c b/math/s_logf.c new file mode 100644 index 0000000..9399350 --- /dev/null +++ b/math/s_logf.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_logf.c" diff --git a/math/s_pow.c b/math/s_pow.c new file mode 100644 index 0000000..2e34c9f --- /dev/null +++ b/math/s_pow.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_pow.c" diff --git a/math/s_powf.c b/math/s_powf.c new file mode 100644 index 0000000..6d91a4a --- /dev/null +++ b/math/s_powf.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_powf.c" diff --git a/math/s_sin.c b/math/s_sin.c new file mode 100644 index 0000000..06982c2 --- /dev/null +++ b/math/s_sin.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_sin.c" diff --git a/math/s_sinf.c b/math/s_sinf.c new file mode 100644 index 0000000..68ca908 --- /dev/null +++ b/math/s_sinf.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_sinf.c" diff --git a/math/sincosf.c b/math/sincosf.c index 446f21d..9746f1c 100644 --- a/math/sincosf.c +++ b/math/sincosf.c @@ -1,8 +1,8 @@ /* * Single-precision sin/cos function. * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2019, Arm Limited. + * SPDX-License-Identifier: MIT */ #include @@ -22,7 +22,7 @@ sincosf (float y, float *sinp, float *cosp) int n; const sincos_t *p = &__sincosf_table[0]; - if (abstop12 (y) < abstop12 (pio4f)) + if (abstop12 (y) < abstop12 (pio4)) { double x2 = x * x; diff --git a/math/sincosf.h b/math/sincosf.h index ec23ed7..1e80fc9 100644 --- a/math/sincosf.h +++ b/math/sincosf.h @@ -1,8 +1,8 @@ /* * Header for sinf, cosf and sincosf. * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018, Arm Limited. 
+ * SPDX-License-Identifier: MIT */ #include @@ -12,7 +12,7 @@ /* 2PI * 2^-64. */ static const double pi63 = 0x1.921FB54442D18p-62; /* PI / 4. */ -static const float pio4f = 0x1.921FB6p-1f; +static const double pio4 = 0x1.921FB54442D18p-1; /* The constants and polynomials for sine and cosine. */ typedef struct diff --git a/math/sincosf_data.c b/math/sincosf_data.c index 2252529..ab4ac47 100644 --- a/math/sincosf_data.c +++ b/math/sincosf_data.c @@ -2,7 +2,7 @@ * Data definition for sinf, cosf and sincosf. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/sinf.c b/math/sinf.c index 8dd8ae4..ddbc1da 100644 --- a/math/sinf.c +++ b/math/sinf.c @@ -1,8 +1,8 @@ /* * Single-precision sin function. * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2019, Arm Limited. + * SPDX-License-Identifier: MIT */ #include @@ -21,7 +21,7 @@ sinf (float y) int n; const sincos_t *p = &__sincosf_table[0]; - if (abstop12 (y) < abstop12 (pio4f)) + if (abstop12 (y) < abstop12 (pio4)) { s = x * x; diff --git a/math/test/mathbench.c b/math/test/mathbench.c index b2711e5..0c17826 100644 --- a/math/test/mathbench.c +++ b/math/test/mathbench.c @@ -1,8 +1,8 @@ /* * Microbenchmark for math functions. * - * Copyright (c) 2018-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #undef _GNU_SOURCE @@ -15,6 +15,11 @@ #include #include "mathlib.h" +#ifndef WANT_VMATH +/* Enable the build of vector math code. */ +# define WANT_VMATH 1 +#endif + /* Number of measurements, best result is reported. */ #define MEASURE 60 /* Array size. 
*/ @@ -29,9 +34,8 @@ static float Af[N]; static long measurecount = MEASURE; static long itercount = ITER; -#ifdef __vpcs -#include -typedef float64x2_t v_double; +#if __aarch64__ && WANT_VMATH +typedef __f64x2_t v_double; #define v_double_len() 2 @@ -47,7 +51,7 @@ v_double_dup (double x) return (v_double){x, x}; } -typedef float32x4_t v_float; +typedef __f32x4_t v_float; #define v_float_len() 4 @@ -72,91 +76,141 @@ typedef float v_float; #define v_float_len(x) 1 #define v_float_load(x) (x)[0] #define v_float_dup(x) (x) - #endif -#if WANT_SVE_MATH -#include -typedef svbool_t sv_bool; -typedef svfloat64_t sv_double; +static double +dummy (double x) +{ + return x; +} -#define sv_double_len() svcntd() +static float +dummyf (float x) +{ + return x; +} -static inline sv_double -sv_double_load (const double *p) +#if WANT_VMATH +#if __aarch64__ +static v_double +__v_dummy (v_double x) { - svbool_t pg = svptrue_b64(); - return svld1(pg, p); + return x; } -static inline sv_double -sv_double_dup (double x) +static v_float +__v_dummyf (v_float x) { - return svdup_n_f64(x); + return x; } -typedef svfloat32_t sv_float; +#ifdef __vpcs +__vpcs static v_double +__vn_dummy (v_double x) +{ + return x; +} -#define sv_float_len() svcntw() +__vpcs static v_float +__vn_dummyf (v_float x) +{ + return x; +} -static inline sv_float -sv_float_load (const float *p) +__vpcs static v_float +xy__vn_powf (v_float x) { - svbool_t pg = svptrue_b32(); - return svld1(pg, p); + return __vn_powf (x, x); } -static inline sv_float -sv_float_dup (float x) +__vpcs static v_float +xy_Z_powf (v_float x) { - return svdup_n_f32(x); + return _ZGVnN4vv_powf (x, x); +} + +__vpcs static v_double +xy__vn_pow (v_double x) +{ + return __vn_pow (x, x); +} + +__vpcs static v_double +xy_Z_pow (v_double x) +{ + return _ZGVnN2vv_pow (x, x); } -#else -/* dummy definitions to make things compile. 
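For orientation, the harness further down times every entry in two ways: a throughput loop that issues independent calls, and a latency loop that threads the previous result back into the next argument (multiplied by a zero the compiler cannot see through) so the calls must execute serially. A condensed sketch of the two scalar loops, with opaque_zero standing in for the harness's own zero variable:

static volatile double opaque_zero;   /* stand-in, assumed always 0 */

static void
bench_thruput (double f (double), const double *a, int n)
{
  /* Calls are independent, so the core can overlap them.  */
  for (int i = 0; i < n; i++)
    f (a[i]);
}

static void
bench_latency (double f (double), const double *a, int n)
{
  double z = opaque_zero;   /* always 0, but not a compile-time constant */
  double prev = z;
  /* prev * z is 0, yet each call still has to wait for the previous result.  */
  for (int i = 0; i < n; i++)
    prev = f (a[i] + prev * z);
}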
*/ -#define sv_double_len(x) 1 -#define sv_float_len(x) 1 #endif -static double -dummy (double x) +static v_float +xy__v_powf (v_float x) { - return x; + return __v_powf (x, x); } -static float -dummyf (float x) +static v_double +xy__v_pow (v_double x) { - return x; + return __v_pow (x, x); } -#ifdef __vpcs -__vpcs static v_double -__vn_dummy (v_double x) +#endif + +static float +xy__s_powf (float x) { - return x; + return __s_powf (x, x); } -__vpcs static v_float -__vn_dummyf (v_float x) +static double +xy__s_pow (double x) { - return x; + return __s_pow (x, x); } #endif -#if WANT_SVE_MATH -static sv_double -__sv_dummy (sv_double x, sv_bool pg) + +static double +xypow (double x) { - return x; + return pow (x, x); } -static sv_float -__sv_dummyf (sv_float x, sv_bool pg) +static float +xypowf (float x) { - return x; + return powf (x, x); } -#endif +static double +xpow (double x) +{ + return pow (x, 23.4); +} + +static float +xpowf (float x) +{ + return powf (x, 23.4f); +} + +static double +ypow (double x) +{ + return pow (2.34, x); +} + +static float +ypowf (float x) +{ + return powf (2.34f, x); +} -#include "test/mathbench_wrappers.h" +static float +sincosf_wrap (float x) +{ + float s, c; + sincosf (x, &s, &c); + return s + c; +} static const struct fun { @@ -169,40 +223,127 @@ static const struct fun { double (*d) (double); float (*f) (float); + v_double (*vd) (v_double); + v_float (*vf) (v_float); #ifdef __vpcs __vpcs v_double (*vnd) (v_double); __vpcs v_float (*vnf) (v_float); -#endif -#if WANT_SVE_MATH - sv_double (*svd) (sv_double, sv_bool); - sv_float (*svf) (sv_float, sv_bool); #endif } fun; } funtab[] = { #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}}, #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}}, +#define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}}, +#define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}}, #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}}, #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}}, -#define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}}, -#define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}}, D (dummy, 1.0, 2.0) +D (exp, -9.9, 9.9) +D (exp, 0.5, 1.0) +D (exp2, -9.9, 9.9) +D (log, 0.01, 11.1) +D (log, 0.999, 1.001) +D (log2, 0.01, 11.1) +D (log2, 0.999, 1.001) +{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, +D (xpow, 0.01, 11.1) +D (ypow, -9.9, 9.9) +D (erf, -6.0, 6.0) + F (dummyf, 1.0, 2.0) +F (expf, -9.9, 9.9) +F (exp2f, -9.9, 9.9) +F (logf, 0.01, 11.1) +F (log2f, 0.01, 11.1) +{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}}, +F (xpowf, 0.01, 11.1) +F (ypowf, -9.9, 9.9) +{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}}, +F (sinf, 0.1, 0.7) +F (sinf, 0.8, 3.1) +F (sinf, -3.1, 3.1) +F (sinf, 3.3, 33.3) +F (sinf, 100, 1000) +F (sinf, 1e6, 1e32) +F (cosf, 0.1, 0.7) +F (cosf, 0.8, 3.1) +F (cosf, -3.1, 3.1) +F (cosf, 3.3, 33.3) +F (cosf, 100, 1000) +F (cosf, 1e6, 1e32) +F (erff, -4.0, 4.0) +#if WANT_VMATH +D (__s_sin, -3.1, 3.1) +D (__s_cos, -3.1, 3.1) +D (__s_exp, -9.9, 9.9) +D (__s_log, 0.01, 11.1) +{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}}, +F (__s_expf, -9.9, 9.9) +F (__s_expf_1u, -9.9, 9.9) +F (__s_exp2f, -9.9, 9.9) +F (__s_exp2f_1u, -9.9, 9.9) +F (__s_logf, 0.01, 11.1) 
+{"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}}, +F (__s_sinf, -3.1, 3.1) +F (__s_cosf, -3.1, 3.1) +#if __aarch64__ +VD (__v_dummy, 1.0, 2.0) +VD (__v_sin, -3.1, 3.1) +VD (__v_cos, -3.1, 3.1) +VD (__v_exp, -9.9, 9.9) +VD (__v_log, 0.01, 11.1) +{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}}, +VF (__v_dummyf, 1.0, 2.0) +VF (__v_expf, -9.9, 9.9) +VF (__v_expf_1u, -9.9, 9.9) +VF (__v_exp2f, -9.9, 9.9) +VF (__v_exp2f_1u, -9.9, 9.9) +VF (__v_logf, 0.01, 11.1) +{"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}}, +VF (__v_sinf, -3.1, 3.1) +VF (__v_cosf, -3.1, 3.1) #ifdef __vpcs VND (__vn_dummy, 1.0, 2.0) +VND (__vn_exp, -9.9, 9.9) +VND (_ZGVnN2v_exp, -9.9, 9.9) +VND (__vn_log, 0.01, 11.1) +VND (_ZGVnN2v_log, 0.01, 11.1) +{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}}, +{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, +VND (__vn_sin, -3.1, 3.1) +VND (_ZGVnN2v_sin, -3.1, 3.1) +VND (__vn_cos, -3.1, 3.1) +VND (_ZGVnN2v_cos, -3.1, 3.1) VNF (__vn_dummyf, 1.0, 2.0) +VNF (__vn_expf, -9.9, 9.9) +VNF (_ZGVnN4v_expf, -9.9, 9.9) +VNF (__vn_expf_1u, -9.9, 9.9) +VNF (__vn_exp2f, -9.9, 9.9) +VNF (_ZGVnN4v_exp2f, -9.9, 9.9) +VNF (__vn_exp2f_1u, -9.9, 9.9) +VNF (__vn_logf, 0.01, 11.1) +VNF (_ZGVnN4v_logf, 0.01, 11.1) +{"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}}, +{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}}, +VNF (__vn_sinf, -3.1, 3.1) +VNF (_ZGVnN4v_sinf, -3.1, 3.1) +VNF (__vn_cosf, -3.1, 3.1) +VNF (_ZGVnN4v_cosf, -3.1, 3.1) +#endif #endif -#if WANT_SVE_MATH -SVD (__sv_dummy, 1.0, 2.0) -SVF (__sv_dummyf, 1.0, 2.0) #endif -#include "test/mathbench_funcs.h" {0}, #undef F #undef D +#undef VF +#undef VD #undef VNF #undef VND -#undef SVF -#undef SVD }; static void @@ -301,75 +442,69 @@ runf_latency (float f (float)) prev = f (Af[i] + prev * z); } -#ifdef __vpcs static void -run_vn_thruput (__vpcs v_double f (v_double)) +run_v_thruput (v_double f (v_double)) { for (int i = 0; i < N; i += v_double_len ()) f (v_double_load (A+i)); } static void -runf_vn_thruput (__vpcs v_float f (v_float)) +runf_v_thruput (v_float f (v_float)) { for (int i = 0; i < N; i += v_float_len ()) f (v_float_load (Af+i)); } static void -run_vn_latency (__vpcs v_double f (v_double)) +run_v_latency (v_double f (v_double)) { - volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 }; - uint64x2_t sel = vsel; - v_double prev = v_double_dup (0); + v_double z = v_double_dup (zero); + v_double prev = z; for (int i = 0; i < N; i += v_double_len ()) - prev = f (vbslq_f64 (sel, prev, v_double_load (A+i))); + prev = f (v_double_load (A+i) + prev * z); } static void -runf_vn_latency (__vpcs v_float f (v_float)) +runf_v_latency (v_float f (v_float)) { - volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 }; - uint32x4_t sel = vsel; - v_float prev = v_float_dup (0); + v_float z = v_float_dup (zero); + v_float prev = z; for (int i = 0; i < N; i += v_float_len ()) - prev = f (vbslq_f32 (sel, prev, v_float_load (Af+i))); + prev = f (v_float_load (Af+i) + prev * z); } -#endif -#if WANT_SVE_MATH +#ifdef __vpcs static void -run_sv_thruput (sv_double f (sv_double, sv_bool)) +run_vn_thruput (__vpcs v_double f (v_double)) { - for (int i = 0; i < N; i += sv_double_len ()) - f (sv_double_load (A+i), svptrue_b64 ()); + for (int i = 0; i < N; i += v_double_len ()) + f (v_double_load (A+i)); } static void -runf_sv_thruput (sv_float f (sv_float, sv_bool)) +runf_vn_thruput (__vpcs v_float f (v_float)) { - for (int i = 0; i < N; i += sv_float_len ()) - f (sv_float_load (Af+i), svptrue_b32 ()); + for (int i = 0; i < N; i 
+= v_float_len ()) + f (v_float_load (Af+i)); } static void -run_sv_latency (sv_double f (sv_double, sv_bool)) +run_vn_latency (__vpcs v_double f (v_double)) { - volatile sv_bool vsel = svptrue_b64 (); - sv_bool sel = vsel; - sv_double prev = sv_double_dup (0); - for (int i = 0; i < N; i += sv_double_len ()) - prev = f (svsel_f64 (sel, sv_double_load (A+i), prev), svptrue_b64 ()); + v_double z = v_double_dup (zero); + v_double prev = z; + for (int i = 0; i < N; i += v_double_len ()) + prev = f (v_double_load (A+i) + prev * z); } static void -runf_sv_latency (sv_float f (sv_float, sv_bool)) +runf_vn_latency (__vpcs v_float f (v_float)) { - volatile sv_bool vsel = svptrue_b32 (); - sv_bool sel = vsel; - sv_float prev = sv_float_dup (0); - for (int i = 0; i < N; i += sv_float_len ()) - prev = f (svsel_f32 (sel, sv_float_load (Af+i), prev), svptrue_b32 ()); + v_float z = v_float_dup (zero); + v_float prev = z; + for (int i = 0; i < N; i += v_float_len ()) + prev = f (v_float_load (Af+i) + prev * z); } #endif @@ -404,10 +539,10 @@ bench1 (const struct fun *f, int type, double lo, double hi) const char *s = type == 't' ? "rthruput" : "latency"; int vlen = 1; - if (f->vec == 'n') - vlen = f->prec == 'd' ? v_double_len() : v_float_len(); - else if (f->vec == 's') - vlen = f->prec == 'd' ? sv_double_len() : sv_float_len(); + if (f->vec && f->prec == 'd') + vlen = v_double_len(); + else if (f->vec && f->prec == 'f') + vlen = v_float_len(); if (f->prec == 'd' && type == 't' && f->vec == 0) TIMEIT (run_thruput, f->fun.d); @@ -417,6 +552,14 @@ bench1 (const struct fun *f, int type, double lo, double hi) TIMEIT (runf_thruput, f->fun.f); else if (f->prec == 'f' && type == 'l' && f->vec == 0) TIMEIT (runf_latency, f->fun.f); + else if (f->prec == 'd' && type == 't' && f->vec == 'v') + TIMEIT (run_v_thruput, f->fun.vd); + else if (f->prec == 'd' && type == 'l' && f->vec == 'v') + TIMEIT (run_v_latency, f->fun.vd); + else if (f->prec == 'f' && type == 't' && f->vec == 'v') + TIMEIT (runf_v_thruput, f->fun.vf); + else if (f->prec == 'f' && type == 'l' && f->vec == 'v') + TIMEIT (runf_v_latency, f->fun.vf); #ifdef __vpcs else if (f->prec == 'd' && type == 't' && f->vec == 'n') TIMEIT (run_vn_thruput, f->fun.vnd); @@ -427,32 +570,20 @@ bench1 (const struct fun *f, int type, double lo, double hi) else if (f->prec == 'f' && type == 'l' && f->vec == 'n') TIMEIT (runf_vn_latency, f->fun.vnf); #endif -#if WANT_SVE_MATH - else if (f->prec == 'd' && type == 't' && f->vec == 's') - TIMEIT (run_sv_thruput, f->fun.svd); - else if (f->prec == 'd' && type == 'l' && f->vec == 's') - TIMEIT (run_sv_latency, f->fun.svd); - else if (f->prec == 'f' && type == 't' && f->vec == 's') - TIMEIT (runf_sv_thruput, f->fun.svf); - else if (f->prec == 'f' && type == 'l' && f->vec == 's') - TIMEIT (runf_sv_latency, f->fun.svf); -#endif if (type == 't') { ns100 = (100 * dt + itercount * N / 2) / (itercount * N); - printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n", - f->name, s, + printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s, (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), - (unsigned long long) dt, lo, hi, vlen); + (unsigned long long) dt, lo, hi); } else if (type == 'l') { ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen); - printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n", - f->name, s, + printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s, (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), - (unsigned long long) dt, lo, hi, vlen); 
+ (unsigned long long) dt, lo, hi); } fflush (stdout); } diff --git a/math/test/mathbench_funcs.h b/math/test/mathbench_funcs.h deleted file mode 100644 index 84c4e68..0000000 --- a/math/test/mathbench_funcs.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Function entries for mathbench. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -/* clang-format off */ -D (exp, -9.9, 9.9) -D (exp, 0.5, 1.0) -D (exp10, -9.9, 9.9) -D (exp2, -9.9, 9.9) -D (log, 0.01, 11.1) -D (log, 0.999, 1.001) -D (log2, 0.01, 11.1) -D (log2, 0.999, 1.001) -{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, -D (xpow, 0.01, 11.1) -D (ypow, -9.9, 9.9) -D (erf, -6.0, 6.0) - -F (expf, -9.9, 9.9) -F (exp2f, -9.9, 9.9) -F (logf, 0.01, 11.1) -F (log2f, 0.01, 11.1) -{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}}, -F (xpowf, 0.01, 11.1) -F (ypowf, -9.9, 9.9) -{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}}, -F (sinf, 0.1, 0.7) -F (sinf, 0.8, 3.1) -F (sinf, -3.1, 3.1) -F (sinf, 3.3, 33.3) -F (sinf, 100, 1000) -F (sinf, 1e6, 1e32) -F (cosf, 0.1, 0.7) -F (cosf, 0.8, 3.1) -F (cosf, -3.1, 3.1) -F (cosf, 3.3, 33.3) -F (cosf, 100, 1000) -F (cosf, 1e6, 1e32) -F (erff, -4.0, 4.0) -#ifdef __vpcs -VND (_ZGVnN2v_exp, -9.9, 9.9) -VND (_ZGVnN2v_log, 0.01, 11.1) -{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, -VND (_ZGVnN2v_sin, -3.1, 3.1) -VND (_ZGVnN2v_cos, -3.1, 3.1) -VNF (_ZGVnN4v_expf, -9.9, 9.9) -VNF (_ZGVnN4v_expf_1u, -9.9, 9.9) -VNF (_ZGVnN4v_exp2f, -9.9, 9.9) -VNF (_ZGVnN4v_exp2f_1u, -9.9, 9.9) -VNF (_ZGVnN4v_logf, 0.01, 11.1) -{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}}, -VNF (_ZGVnN4v_sinf, -3.1, 3.1) -VNF (_ZGVnN4v_cosf, -3.1, 3.1) -#endif - /* clang-format on */ diff --git a/math/test/mathbench_wrappers.h b/math/test/mathbench_wrappers.h deleted file mode 100644 index 062b9db..0000000 --- a/math/test/mathbench_wrappers.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Function wrappers for mathbench. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifdef __vpcs - -__vpcs static v_float -xy_Z_powf (v_float x) -{ - return _ZGVnN4vv_powf (x, x); -} - -__vpcs static v_double -xy_Z_pow (v_double x) -{ - return _ZGVnN2vv_pow (x, x); -} - -#endif - -static double -xypow (double x) -{ - return pow (x, x); -} - -static float -xypowf (float x) -{ - return powf (x, x); -} - -static double -xpow (double x) -{ - return pow (x, 23.4); -} - -static float -xpowf (float x) -{ - return powf (x, 23.4f); -} - -static double -ypow (double x) -{ - return pow (2.34, x); -} - -static float -ypowf (float x) -{ - return powf (2.34f, x); -} - -static float -sincosf_wrap (float x) -{ - float s, c; - sincosf (x, &s, &c); - return s + c; -} diff --git a/math/test/mathtest.c b/math/test/mathtest.c index cedccfd..3108967 100644 --- a/math/test/mathtest.c +++ b/math/test/mathtest.c @@ -1,8 +1,8 @@ /* * mathtest.c - test rig for mathlib * - * Copyright (c) 1998-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 1998-2019, Arm Limited. 
+ * SPDX-License-Identifier: MIT */ #include @@ -196,11 +196,9 @@ int is_complex_rettype(int rettype) { #define TFUNCARM(arg,ret,name,tolerance) { t_func, arg, ret, (void*)& ARM_PREFIX(name), m_none, tolerance, #name } #define MFUNC(arg,ret,name,tolerance) { t_macro, arg, ret, NULL, m_##name, tolerance, #name } -#ifndef PL /* sincosf wrappers for easier testing. */ static float sincosf_sinf(float x) { float s,c; sincosf(x, &s, &c); return s; } static float sincosf_cosf(float x) { float s,c; sincosf(x, &s, &c); return c; } -#endif test_func tfuncs[] = { /* trigonometric */ @@ -220,10 +218,9 @@ test_func tfuncs[] = { TFUNCARM(at_s,rt_s, tanf, 4*ULPUNIT), TFUNCARM(at_s,rt_s, sinf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, cosf, 3*ULPUNIT/4), -#ifndef PL TFUNCARM(at_s,rt_s, sincosf_sinf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, sincosf_cosf, 3*ULPUNIT/4), -#endif + /* hyperbolic */ TFUNC(at_d, rt_d, atanh, 4*ULPUNIT), TFUNC(at_d, rt_d, asinh, 4*ULPUNIT), @@ -254,7 +251,6 @@ test_func tfuncs[] = { TFUNCARM(at_s,rt_s, expf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, exp2f, 3*ULPUNIT/4), TFUNC(at_s,rt_s, expm1f, ULPUNIT), - TFUNC(at_d,rt_d, exp10, ULPUNIT), /* power */ TFUNC(at_d2,rt_d, pow, 3*ULPUNIT/4), @@ -1022,7 +1018,6 @@ int runtest(testdetail t) { DO_DOP(d_arg1,op1r); DO_DOP(d_arg2,op2r); s_arg1.i = t.op1r[0]; s_arg2.i = t.op2r[0]; - s_res.i = 0; /* * Detect NaNs, infinities and denormals on input, and set a @@ -1157,25 +1152,22 @@ int runtest(testdetail t) { tresultr[0] = t.resultr[0]; tresultr[1] = t.resultr[1]; resultr[0] = d_res.i[dmsd]; resultr[1] = d_res.i[dlsd]; - resulti[0] = resulti[1] = 0; wres = 2; break; case rt_i: tresultr[0] = t.resultr[0]; resultr[0] = intres; - resulti[0] = 0; wres = 1; break; case rt_s: case rt_s2: tresultr[0] = t.resultr[0]; resultr[0] = s_res.i; - resulti[0] = 0; wres = 1; break; default: puts("unhandled rettype in runtest"); - abort (); + wres = 0; } if(t.resultc != rc_none) { int err = 0; diff --git a/math/test/rtest/dotest.c b/math/test/rtest/dotest.c index 5b3e9b4..6be79e1 100644 --- a/math/test/rtest/dotest.c +++ b/math/test/rtest/dotest.c @@ -2,7 +2,7 @@ * dotest.c - actually generate mathlib test cases * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/test/rtest/intern.h b/math/test/rtest/intern.h index 3ebd7dd..12a9c74 100644 --- a/math/test/rtest/intern.h +++ b/math/test/rtest/intern.h @@ -2,7 +2,7 @@ * intern.h * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #ifndef mathtest_intern_h diff --git a/math/test/rtest/main.c b/math/test/rtest/main.c index 3d533c9..0d8ead8 100644 --- a/math/test/rtest/main.c +++ b/math/test/rtest/main.c @@ -2,7 +2,7 @@ * main.c * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/test/rtest/random.c b/math/test/rtest/random.c index 1de3258..5612396 100644 --- a/math/test/rtest/random.c +++ b/math/test/rtest/random.c @@ -2,7 +2,7 @@ * random.c - random number generator for producing mathlib test cases * * Copyright (c) 1998-2019, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "types.h" diff --git a/math/test/rtest/random.h b/math/test/rtest/random.h index 0b477d7..b4b22df 100644 --- a/math/test/rtest/random.h +++ b/math/test/rtest/random.h @@ -2,7 +2,7 @@ * random.h - header for random.c * * Copyright (c) 2009-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "types.h" diff --git a/math/test/rtest/semi.c b/math/test/rtest/semi.c index 70a7844..c9f0daf 100644 --- a/math/test/rtest/semi.c +++ b/math/test/rtest/semi.c @@ -2,7 +2,7 @@ * semi.c: test implementations of mathlib seminumerical functions * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/test/rtest/semi.h b/math/test/rtest/semi.h index 7a1444e..17dc415 100644 --- a/math/test/rtest/semi.h +++ b/math/test/rtest/semi.h @@ -2,7 +2,7 @@ * semi.h: header for semi.c * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #ifndef test_semi_h diff --git a/math/test/rtest/types.h b/math/test/rtest/types.h index e15b4e0..53cd557 100644 --- a/math/test/rtest/types.h +++ b/math/test/rtest/types.h @@ -2,7 +2,7 @@ * types.h * * Copyright (c) 2005-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #ifndef mathtest_types_h diff --git a/math/test/rtest/wrappers.c b/math/test/rtest/wrappers.c index 4410171..de45ac5 100644 --- a/math/test/rtest/wrappers.c +++ b/math/test/rtest/wrappers.c @@ -2,7 +2,7 @@ * wrappers.c - wrappers to modify output of MPFR/MPC test functions * * Copyright (c) 2014-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/math/test/rtest/wrappers.h b/math/test/rtest/wrappers.h index 0a8a587..7b09c85 100644 --- a/math/test/rtest/wrappers.h +++ b/math/test/rtest/wrappers.h @@ -2,7 +2,7 @@ * wrappers.h - wrappers to modify output of MPFR/MPC test functions * * Copyright (c) 2014-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ typedef struct { diff --git a/math/test/runulp.sh b/math/test/runulp.sh index e2e03e3..0190d9a 100755 --- a/math/test/runulp.sh +++ b/math/test/runulp.sh @@ -2,8 +2,8 @@ # ULP error check script. # -# Copyright (c) 2019-2023, Arm Limited. -# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +# Copyright (c) 2019-2020, Arm Limited. 
+# SPDX-License-Identifier: MIT #set -x set -eu @@ -72,16 +72,6 @@ t pow 0x1.ffffffffffff0p-1 0x1.0000000000008p0 x 0x1p60 0x1p68 50000 t pow 0x1.ffffffffff000p-1 0x1p0 x 0x1p50 0x1p52 50000 t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000 -L=0.02 -t exp10 0 0x1p-47 5000 -t exp10 -0 -0x1p-47 5000 -t exp10 0x1p-47 1 50000 -t exp10 -0x1p-47 -1 50000 -t exp10 1 0x1.34413509f79ffp8 50000 -t exp10 -1 -0x1.434e6420f4374p8 50000 -t exp10 0x1.34413509f79ffp8 inf 5000 -t exp10 -0x1.434e6420f4374p8 -inf 5000 - L=1.0 Ldir=0.9 t erf 0 0xffff000000000000 10000 @@ -153,10 +143,15 @@ Ldir=0.5 done # vector functions - Ldir=0.5 r='n' -flags="${ULPFLAGS:--q}" +flags="${ULPFLAGS:--q} -f" +runs= +check __s_exp 1 && runs=1 +runv= +check __v_exp 1 && runv=1 +runvn= +check __vn_exp 1 && runvn=1 range_exp=' 0 0xffff000000000000 10000 @@ -182,10 +177,9 @@ range_pow=' ' range_sin=' - 0 0x1p23 500000 - -0 -0x1p23 500000 - 0x1p23 inf 10000 - -0x1p23 -inf 10000 + 0 0xffff000000000000 10000 + 0x1p-4 0x1p4 400000 + -0x1p-23 0x1p23 400000 ' range_cos="$range_sin" @@ -205,10 +199,9 @@ range_logf=' ' range_sinf=' - 0 0x1p20 500000 - -0 -0x1p20 500000 - 0x1p20 inf 10000 - -0x1p20 -inf 10000 + 0 0xffff0000 10000 + 0x1p-4 0x1p4 300000 +-0x1p-9 -0x1p9 300000 ' range_cosf="$range_sinf" @@ -236,8 +229,9 @@ L_sinf=1.4 L_cosf=1.4 L_powf=2.1 -while read G F D +while read G F R do + [ "$R" = 1 ] || continue case "$G" in \#*) continue ;; esac eval range="\${range_$G}" eval L="\${L_$G}" @@ -245,35 +239,74 @@ do do [ -n "$X" ] || continue case "$X" in \#*) continue ;; esac - disable_fenv="" - if [ -z "$WANT_SIMD_EXCEPT" ] || [ $WANT_SIMD_EXCEPT -eq 0 ]; then - # If library was built with SIMD exceptions - # disabled, disable fenv checking in ulp - # tool. Otherwise, fenv checking may still be - # disabled by adding -f to the end of the run - # line. 
- disable_fenv="-f" - fi - t $D $disable_fenv $F $X + t $F $X done << EOF $range - EOF done << EOF # group symbol run -exp _ZGVnN2v_exp -log _ZGVnN2v_log -pow _ZGVnN2vv_pow -f -sin _ZGVnN2v_sin -z -cos _ZGVnN2v_cos -expf _ZGVnN4v_expf -expf_1u _ZGVnN4v_expf_1u -f -exp2f _ZGVnN4v_exp2f -exp2f_1u _ZGVnN4v_exp2f_1u -f -logf _ZGVnN4v_logf -sinf _ZGVnN4v_sinf -z -cosf _ZGVnN4v_cosf -powf _ZGVnN4vv_powf -f +exp __s_exp $runs +exp __v_exp $runv +exp __vn_exp $runvn +exp _ZGVnN2v_exp $runvn + +log __s_log $runs +log __v_log $runv +log __vn_log $runvn +log _ZGVnN2v_log $runvn + +pow __s_pow $runs +pow __v_pow $runv +pow __vn_pow $runvn +pow _ZGVnN2vv_pow $runvn + +sin __s_sin $runs +sin __v_sin $runv +sin __vn_sin $runvn +sin _ZGVnN2v_sin $runvn + +cos __s_cos $runs +cos __v_cos $runv +cos __vn_cos $runvn +cos _ZGVnN2v_cos $runvn + +expf __s_expf $runs +expf __v_expf $runv +expf __vn_expf $runvn +expf _ZGVnN4v_expf $runvn + +expf_1u __s_expf_1u $runs +expf_1u __v_expf_1u $runv +expf_1u __vn_expf_1u $runvn + +exp2f __s_exp2f $runs +exp2f __v_exp2f $runv +exp2f __vn_exp2f $runvn +exp2f _ZGVnN4v_exp2f $runvn + +exp2f_1u __s_exp2f_1u $runs +exp2f_1u __v_exp2f_1u $runv +exp2f_1u __vn_exp2f_1u $runvn + +logf __s_logf $runs +logf __v_logf $runv +logf __vn_logf $runvn +logf _ZGVnN4v_logf $runvn + +sinf __s_sinf $runs +sinf __v_sinf $runv +sinf __vn_sinf $runvn +sinf _ZGVnN4v_sinf $runvn + +cosf __s_cosf $runs +cosf __v_cosf $runv +cosf __vn_cosf $runvn +cosf _ZGVnN4v_cosf $runvn + +powf __s_powf $runs +powf __v_powf $runv +powf __vn_powf $runvn +powf _ZGVnN4vv_powf $runvn EOF [ 0 -eq $FAIL ] || { diff --git a/math/test/testcases/directed/cosf.tst b/math/test/testcases/directed/cosf.tst index 7ea0d45..7916044 100644 --- a/math/test/testcases/directed/cosf.tst +++ b/math/test/testcases/directed/cosf.tst @@ -1,7 +1,7 @@ ; cosf.tst - Directed test cases for SP cosine ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=cosf op1=7fc00001 result=7fc00001 errno=0 func=cosf op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/erf.tst b/math/test/testcases/directed/erf.tst index 12384ce..7fa4d18 100644 --- a/math/test/testcases/directed/erf.tst +++ b/math/test/testcases/directed/erf.tst @@ -1,7 +1,7 @@ ; erf.tst - Directed test cases for erf ; ; Copyright (c) 2007-2020, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=erf op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=erf op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/erff.tst b/math/test/testcases/directed/erff.tst index 28f8fa3..d05b7b1 100644 --- a/math/test/testcases/directed/erff.tst +++ b/math/test/testcases/directed/erff.tst @@ -1,7 +1,7 @@ ; erff.tst ; ; Copyright (c) 2007-2020, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=erff op1=7fc00001 result=7fc00001 errno=0 func=erff op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/exp.tst b/math/test/testcases/directed/exp.tst index 0bb2ef4..85d556c 100644 --- a/math/test/testcases/directed/exp.tst +++ b/math/test/testcases/directed/exp.tst @@ -1,7 +1,7 @@ ; Directed test cases for exp ; ; Copyright (c) 2018-2019, Arm Limited. 
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=exp op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=exp op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/exp10.tst b/math/test/testcases/directed/exp10.tst deleted file mode 100644 index 2cf4273..0000000 --- a/math/test/testcases/directed/exp10.tst +++ /dev/null @@ -1,15 +0,0 @@ -; Directed test cases for exp10 -; -; Copyright (c) 2023, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - -func=exp10 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 -func=exp10 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 -func=exp10 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i -func=exp10 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i -func=exp10 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 -func=exp10 op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox -func=exp10 op1=fff00000.00000000 result=00000000.00000000 errno=0 -func=exp10 op1=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux -func=exp10 op1=00000000.00000000 result=3ff00000.00000000 errno=0 -func=exp10 op1=80000000.00000000 result=3ff00000.00000000 errno=0 diff --git a/math/test/testcases/directed/exp2.tst b/math/test/testcases/directed/exp2.tst index 7069f90..fa56c9f 100644 --- a/math/test/testcases/directed/exp2.tst +++ b/math/test/testcases/directed/exp2.tst @@ -1,7 +1,7 @@ ; Directed test cases for exp2 ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=exp2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=exp2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/exp2f.tst b/math/test/testcases/directed/exp2f.tst index 6ca2eea..38cfc3f 100644 --- a/math/test/testcases/directed/exp2f.tst +++ b/math/test/testcases/directed/exp2f.tst @@ -1,7 +1,7 @@ ; exp2f.tst - Directed test cases for exp2f ; ; Copyright (c) 2017-2019, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=exp2f op1=7fc00001 result=7fc00001 errno=0 func=exp2f op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/expf.tst b/math/test/testcases/directed/expf.tst index 89ae8fe..ff0f671 100644 --- a/math/test/testcases/directed/expf.tst +++ b/math/test/testcases/directed/expf.tst @@ -1,7 +1,7 @@ ; expf.tst - Directed test cases for expf ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=expf op1=7fc00001 result=7fc00001 errno=0 func=expf op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/log.tst b/math/test/testcases/directed/log.tst index 686ea83..a0aa398 100644 --- a/math/test/testcases/directed/log.tst +++ b/math/test/testcases/directed/log.tst @@ -1,7 +1,7 @@ ; Directed test cases for log ; ; Copyright (c) 2018-2019, Arm Limited. 
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=log op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=log op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/log2.tst b/math/test/testcases/directed/log2.tst index 361bdde..ff1286c 100644 --- a/math/test/testcases/directed/log2.tst +++ b/math/test/testcases/directed/log2.tst @@ -1,7 +1,7 @@ ; Directed test cases for log2 ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=log2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/math/test/testcases/directed/log2f.tst b/math/test/testcases/directed/log2f.tst index 5fce051..5832c4f 100644 --- a/math/test/testcases/directed/log2f.tst +++ b/math/test/testcases/directed/log2f.tst @@ -1,7 +1,7 @@ ; log2f.tst - Directed test cases for log2f ; ; Copyright (c) 2017-2019, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=log2f op1=7fc00001 result=7fc00001 errno=0 func=log2f op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/logf.tst b/math/test/testcases/directed/logf.tst index a6d1b9d..6e68a36 100644 --- a/math/test/testcases/directed/logf.tst +++ b/math/test/testcases/directed/logf.tst @@ -1,7 +1,7 @@ ; logf.tst - Directed test cases for logf ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=logf op1=7fc00001 result=7fc00001 errno=0 func=logf op1=ffc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/pow.tst b/math/test/testcases/directed/pow.tst index 879d128..1966581 100644 --- a/math/test/testcases/directed/pow.tst +++ b/math/test/testcases/directed/pow.tst @@ -1,7 +1,7 @@ ; Directed test cases for pow ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=pow op1=00000000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=00000000.00000000 op2=00000000.00000001 result=00000000.00000000 errno=0 diff --git a/math/test/testcases/directed/powf.tst b/math/test/testcases/directed/powf.tst index 46d5224..3fa8b11 100644 --- a/math/test/testcases/directed/powf.tst +++ b/math/test/testcases/directed/powf.tst @@ -1,7 +1,7 @@ ; powf.tst - Directed test cases for powf ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=powf op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i func=powf op1=7f800001 op2=ff800001 result=7fc00001 errno=0 status=i diff --git a/math/test/testcases/directed/sincosf.tst b/math/test/testcases/directed/sincosf.tst index cddb346..4b33d22 100644 --- a/math/test/testcases/directed/sincosf.tst +++ b/math/test/testcases/directed/sincosf.tst @@ -1,7 +1,7 @@ ; Directed test cases for SP sincos ; ; Copyright (c) 2007-2019, Arm Limited. 
-; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=sincosf_sinf op1=7fc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/directed/sinf.tst b/math/test/testcases/directed/sinf.tst index 041b13d..ded80b1 100644 --- a/math/test/testcases/directed/sinf.tst +++ b/math/test/testcases/directed/sinf.tst @@ -1,7 +1,7 @@ ; sinf.tst - Directed test cases for SP sine ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +; SPDX-License-Identifier: MIT func=sinf op1=7fc00001 result=7fc00001 errno=0 diff --git a/math/test/testcases/random/double.tst b/math/test/testcases/random/double.tst index 8e885d6..c24ff80 100644 --- a/math/test/testcases/random/double.tst +++ b/math/test/testcases/random/double.tst @@ -1,7 +1,7 @@ !! double.tst - Random test case specification for DP functions !! !! Copyright (c) 1999-2019, Arm Limited. -!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +!! SPDX-License-Identifier: MIT test exp 10000 test exp2 10000 diff --git a/math/test/testcases/random/float.tst b/math/test/testcases/random/float.tst index ea4a5a0..d02a227 100644 --- a/math/test/testcases/random/float.tst +++ b/math/test/testcases/random/float.tst @@ -1,7 +1,7 @@ !! single.tst - Random test case specification for SP functions !! !! Copyright (c) 1999-2019, Arm Limited. -!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +!! SPDX-License-Identifier: MIT test sinf 10000 test cosf 10000 diff --git a/math/test/ulp.c b/math/test/ulp.c index 5ff2997..51479b8 100644 --- a/math/test/ulp.c +++ b/math/test/ulp.c @@ -1,11 +1,10 @@ /* * ULP error checking tool for math functions. * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ -#define _GNU_SOURCE #include #include #include @@ -24,6 +23,11 @@ # include #endif +#ifndef WANT_VMATH +/* Enable the build of vector math code. */ +# define WANT_VMATH 1 +#endif + static inline uint64_t asuint64 (double f) { @@ -208,61 +212,73 @@ struct conf unsigned long long n; double softlim; double errlim; - int ignore_zero_sign; }; +/* Wrappers for sincos. */ +static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);} +static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);} +static double sincos_sin(double x) {(void)cos(x); return sin(x);} +static double sincos_cos(double x) {(void)sin(x); return cos(x);} +#if USE_MPFR +static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); } +static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); } +#endif + /* A bit of a hack: call vector functions twice with the same input in lane 0 but a different value in other lanes: once with an in-range value and then with a special case value. */ static int secondcall; /* Wrappers for vector functions. */ -#ifdef __vpcs +#if __aarch64__ && WANT_VMATH typedef __f32x4_t v_float; typedef __f64x2_t v_double; -/* First element of fv and dv may be changed by -c argument. 
*/ -static float fv[2] = {1.0f, -INFINITY}; -static double dv[2] = {1.0, -INFINITY}; +static const float fv[2] = {1.0f, -INFINITY}; +static const double dv[2] = {1.0, -INFINITY}; static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; } static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; } -#if WANT_SVE_MATH -#include -typedef __SVFloat32_t sv_float; -typedef __SVFloat64_t sv_double; - -static inline sv_float svargf(float x) { - int n = svcntw(); - float base[n]; - for (int i=0; iname; f++) printf ("\t%s\n", f->name); @@ -719,7 +768,6 @@ main (int argc, char *argv[]) conf.fenv = 1; conf.softlim = 0; conf.errlim = INFINITY; - conf.ignore_zero_sign = 0; for (;;) { argc--; @@ -759,22 +807,11 @@ main (int argc, char *argv[]) { argc--; argv++; - if (argc < 1 || argv[0][1] != '\0') + if (argc < 1) usage (); conf.rc = argv[0][0]; } break; - case 'z': - conf.ignore_zero_sign = 1; - break; -#ifdef __vpcs - case 'c': - argc--; - argv++; - fv[0] = strtof(argv[0], 0); - dv[0] = strtod(argv[0], 0); - break; -#endif default: usage (); } @@ -800,19 +837,7 @@ main (int argc, char *argv[]) if (strcmp (argv[0], f->name) == 0) break; if (!f->name) - { -#ifndef __vpcs - /* Ignore vector math functions if vector math is not supported. */ - if (strncmp (argv[0], "_ZGVnN", 6) == 0) - exit (0); -#endif -#if !WANT_SVE_MATH - if (strncmp (argv[0], "_ZGVsMxv", 8) == 0) - exit (0); -#endif - printf ("math function %s not supported\n", argv[0]); - exit (1); - } + usage (); if (!f->singleprec && LDBL_MANT_DIG == DBL_MANT_DIG) conf.mpfr = 1; /* Use mpfr if long double has no extra precision. */ if (!USE_MPFR && conf.mpfr) diff --git a/math/test/ulp.h b/math/test/ulp.h index b0bc59a..a0c3016 100644 --- a/math/test/ulp.h +++ b/math/test/ulp.h @@ -1,8 +1,8 @@ /* * Generic functions for ULP error estimation. * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT */ /* For each different math function type, @@ -37,8 +37,7 @@ static int RT(ulpscale_mpfr) (mpfr_t x, int t) /* Difference between exact result and closest real number that gets rounded to got, i.e. error before rounding, for a correctly rounded result the difference is 0. */ -static double RT (ulperr) (RT (float) got, const struct RT (ret) * p, int r, - int ignore_zero_sign) +static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r) { RT(float) want = p->y; RT(float) d; @@ -46,18 +45,10 @@ static double RT (ulperr) (RT (float) got, const struct RT (ret) * p, int r, if (RT(asuint) (got) == RT(asuint) (want)) return 0.0; - if (isnan (got) && isnan (want)) - /* Ignore sign of NaN. */ - return RT (issignaling) (got) == RT (issignaling) (want) ? 0 : INFINITY; if (signbit (got) != signbit (want)) - { - /* Fall through to ULP calculation if ignoring sign of zero and at - exactly one of want and got is non-zero. */ - if (ignore_zero_sign && want == got) - return 0.0; - if (!ignore_zero_sign || (want != 0 && got != 0)) - return INFINITY; - } + /* May have false positives with NaN. */ + //return isnan(got) && isnan(want) ? 
0 : INFINITY; + return INFINITY; if (!isfinite (want) || !isfinite (got)) { if (isnan (got) != isnan (want)) @@ -123,12 +114,8 @@ static inline void T(call_fenv) (const struct fun *f, struct T(args) a, int r, static inline void T(call_nofenv) (const struct fun *f, struct T(args) a, int r, RT(float) * y, int *ex) { - if (r != FE_TONEAREST) - fesetround (r); *y = T(call) (f, a); *ex = 0; - if (r != FE_TONEAREST) - fesetround (FE_TONEAREST); } static inline int T(call_long_fenv) (const struct fun *f, struct T(args) a, @@ -168,12 +155,8 @@ static inline int T(call_long_nofenv) (const struct fun *f, struct T(args) a, int r, struct RT(ret) * p, RT(float) ygot, int exgot) { - if (r != FE_TONEAREST) - fesetround (r); RT(double) yl = T(call_long) (f, a); p->y = (RT(float)) yl; - if (r != FE_TONEAREST) - fesetround (FE_TONEAREST); if (RT(isok_nofenv) (ygot, p->y)) return 1; p->ulpexp = RT(ulpscale) (p->y); @@ -305,7 +288,7 @@ static int T(cmp) (const struct fun *f, struct gen *gen, if (!ok) { int print = 0; - double err = RT (ulperr) (ygot, &want, r, conf->ignore_zero_sign); + double err = RT(ulperr) (ygot, &want, r); double abserr = fabs (err); // TODO: count errors below accuracy limit. if (abserr > 0) diff --git a/math/test/ulp_funcs.h b/math/test/ulp_funcs.h deleted file mode 100644 index 84f7927..0000000 --- a/math/test/ulp_funcs.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Function entries for ulp. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -/* clang-format off */ - F1 (sin) - F1 (cos) - F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0) - F (sincosf_cosf, sincosf_cosf, sincos_cos, sincos_mpfr_cos, 1, 1, f1, 0) - F1 (exp) - F1 (exp2) - F1 (log) - F1 (log2) - F2 (pow) - F1 (erf) - D1 (exp) - D1 (exp10) - D1 (exp2) - D1 (log) - D1 (log2) - D2 (pow) - D1 (erf) -#ifdef __vpcs - F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1) - F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1) - F (_ZGVnN4v_expf_1u, Z_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) - F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1) - F (_ZGVnN4v_exp2f_1u, Z_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1) - F (_ZGVnN4v_exp2f, Z_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) - F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1) - F (_ZGVnN4vv_powf, Z_powf, pow, mpfr_pow, 2, 1, f2, 1) - F (_ZGVnN2v_sin, Z_sin, sinl, mpfr_sin, 1, 0, d1, 1) - F (_ZGVnN2v_cos, Z_cos, cosl, mpfr_cos, 1, 0, d1, 1) - F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1) - F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1) - F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1) -#endif -/* clang-format on */ diff --git a/math/test/ulp_wrappers.h b/math/test/ulp_wrappers.h deleted file mode 100644 index 60dc3d6..0000000 --- a/math/test/ulp_wrappers.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Function wrappers for ulp. - * - * Copyright (c) 2022-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -/* clang-format off */ - -/* Wrappers for sincos. 
*/ -static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);} -static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);} -static double sincos_sin(double x) {(void)cos(x); return sin(x);} -static double sincos_cos(double x) {(void)sin(x); return cos(x);} -#if USE_MPFR -static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); } -static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); } -#endif - -/* Wrappers for vector functions. */ -#ifdef __vpcs -static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; } -static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; } -static float Z_expf_1u(float x) { return _ZGVnN4v_expf_1u(argf(x))[0]; } -static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; } -static float Z_exp2f_1u(float x) { return _ZGVnN4v_exp2f_1u(argf(x))[0]; } -static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; } -static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; } -static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; } -static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; } -static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; } -static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; } -static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; } -static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; } -#endif - -/* clang-format on */ diff --git a/math/tgamma128.c b/math/tgamma128.c deleted file mode 100644 index dda0da7..0000000 --- a/math/tgamma128.c +++ /dev/null @@ -1,351 +0,0 @@ -/* - * Implementation of the true gamma function (as opposed to lgamma) - * for 128-bit long double. - * - * Copyright (c) 2006,2009,2023 Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -/* - * This module implements the float128 gamma function under the name - * tgamma128. It's expected to be suitable for integration into system - * maths libraries under the standard name tgammal, if long double is - * 128-bit. Such a library will probably want to check the error - * handling and optimize the initial process of extracting the - * exponent, which is done here by simple and portable (but - * potentially slower) methods. - */ - -#include -#include -#include -#include - -#include "tgamma128.h" - -#define lenof(x) (sizeof(x)/sizeof(*(x))) - -/* - * Helper routine to evaluate a polynomial via Horner's rule - */ -static long double poly(const long double *coeffs, size_t n, long double x) -{ - long double result = coeffs[--n]; - - while (n > 0) - result = (result * x) + coeffs[--n]; - - return result; -} - -/* - * Compute sin(pi*x) / pi, for use in the reflection formula that - * relates gamma(-x) and gamma(x). - */ -static long double sin_pi_x_over_pi(long double x) -{ - int quo; - long double fracpart = remquol(x, 0.5L, &quo); - - long double sign = 1.0L; - if (quo & 2) - sign = -sign; - quo &= 1; - - if (quo == 0 && fabsl(fracpart) < 0x1.p-58L) { - /* For numbers this size, sin(pi*x) is so close to pi*x that - * sin(pi*x)/pi is indistinguishable from x in float128 */ - return sign * fracpart; - } - - if (quo == 0) { - return sign * sinl(pi*fracpart) / pi; - } else { - return sign * cosl(pi*fracpart) / pi; - } -} - -/* Return tgamma(x) on the assumption that x >= 8. 
*/ -static long double tgamma_large(long double x, - bool negative, long double negadjust) -{ - /* - * In this range we compute gamma(x) as x^(x-1/2) * e^-x * K, - * where K is a correction factor computed as a polynomial in 1/x. - * - * (Vaguely inspired by the form of the Lanczos approximation, but - * I tried the Lanczos approximation itself and it suffers badly - * from big cancellation leading to loss of significance.) - */ - long double t = 1/x; - long double p = poly(coeffs_large, lenof(coeffs_large), t); - - /* - * To avoid overflow in cases where x^(x-0.5) does overflow - * but gamma(x) does not, we split x^(x-0.5) in half and - * multiply back up _after_ multiplying the shrinking factor - * of exp(-(x-0.5)). - * - * Note that computing x-0.5 and (x-0.5)/2 is exact for the - * relevant range of x, so the only sources of error are pow - * and exp themselves, plus the multiplications. - */ - long double powhalf = powl(x, (x-0.5L)/2.0L); - long double expret = expl(-(x-0.5L)); - - if (!negative) { - return (expret * powhalf) * powhalf * p; - } else { - /* - * Apply the reflection formula as commented below, but - * carefully: negadjust has magnitude less than 1, so it can - * turn a case where gamma(+x) would overflow into a case - * where gamma(-x) doesn't underflow. Not only that, but the - * FP format has greater range in the tiny domain due to - * denormals. For both reasons, it's not good enough to - * compute the positive result and then adjust it. - */ - long double ret = 1 / ((expret * powhalf) * (x * negadjust) * p); - return ret / powhalf; - } -} - -/* Return tgamma(x) on the assumption that 0 <= x < 1/32. */ -static long double tgamma_tiny(long double x, - bool negative, long double negadjust) -{ - /* - * For x near zero, we use a polynomial approximation to - * g = 1/(x*gamma(x)), and then return 1/(g*x). - */ - long double g = poly(coeffs_tiny, lenof(coeffs_tiny), x); - if (!negative) - return 1.0L / (g*x); - else - return g / negadjust; -} - -/* Return tgamma(x) on the assumption that 0 <= x < 2^-113. */ -static long double tgamma_ultratiny(long double x, bool negative, - long double negadjust) -{ - /* On this interval, gamma can't even be distinguished from 1/x, - * so we skip the polynomial evaluation in tgamma_tiny, partly to - * save time and partly to avoid the tiny intermediate values - * setting the underflow exception flag. */ - if (!negative) - return 1.0L / x; - else - return 1.0L / negadjust; -} - -/* Return tgamma(x) on the assumption that 1 <= x <= 2. */ -static long double tgamma_central(long double x) -{ - /* - * In this central interval, our strategy is to finding the - * difference between x and the point where gamma has a minimum, - * and approximate based on that. - */ - - /* The difference between the input x and the minimum x. The first - * subtraction is expected to be exact, since x and min_hi have - * the same exponent (unless x=2, in which case it will still be - * exact). */ - long double t = (x - min_x_hi) - min_x_lo; - - /* - * Now use two different polynomials for the intervals [1,m] and - * [m,2]. - */ - long double p; - if (t < 0) - p = poly(coeffs_central_neg, lenof(coeffs_central_neg), -t); - else - p = poly(coeffs_central_pos, lenof(coeffs_central_pos), t); - - return (min_y_lo + p * (t*t)) + min_y_hi; -} - -long double tgamma128(long double x) -{ - /* - * Start by extracting the number's sign and exponent, and ruling - * out cases of non-normalized numbers. 
- * - * For an implementation integrated into a system libm, it would - * almost certainly be quicker to do this by direct bitwise access - * to the input float128 value, using whatever is the local idiom - * for knowing its endianness. - * - * Integration into a system libc may also need to worry about - * setting errno, if that's the locally preferred way to report - * math.h errors. - */ - int sign = signbit(x); - int exponent; - switch (fpclassify(x)) { - case FP_NAN: - return x+x; /* propagate QNaN, make SNaN throw an exception */ - case FP_ZERO: - return 1/x; /* divide by zero on purpose to indicate a pole */ - case FP_INFINITE: - if (sign) { - return x-x; /* gamma(-inf) has indeterminate sign, so provoke an - * IEEE invalid operation exception to indicate that */ - } - return x; /* but gamma(+inf) is just +inf with no error */ - case FP_SUBNORMAL: - exponent = -16384; - break; - default: - frexpl(x, &exponent); - exponent--; - break; - } - - bool negative = false; - long double negadjust = 0.0L; - - if (sign) { - /* - * Euler's reflection formula is - * - * gamma(1-x) gamma(x) = pi/sin(pi*x) - * - * pi - * => gamma(x) = -------------------- - * gamma(1-x) sin(pi*x) - * - * But computing 1-x is going to lose a lot of accuracy when x - * is very small, so instead we transform using the recurrence - * gamma(t+1)=t gamma(t). Setting t=-x, this gives us - * gamma(1-x) = -x gamma(-x), so we now have - * - * pi - * gamma(x) = ---------------------- - * -x gamma(-x) sin(pi*x) - * - * which relates gamma(x) to gamma(-x), which is much nicer, - * since x can be turned into -x without rounding. - */ - negadjust = sin_pi_x_over_pi(x); - negative = true; - x = -x; - - /* - * Now the ultimate answer we want is - * - * 1 / (gamma(x) * x * negadjust) - * - * where x is the positive value we've just turned it into. - * - * For some of the cases below, we'll compute gamma(x) - * normally and then compute this adjusted value afterwards. - * But for others, we can implement the reciprocal operation - * in this formula by _avoiding_ an inversion that the - * sub-case was going to do anyway. - */ - - if (negadjust == 0) { - /* - * Special case for negative integers. Applying the - * reflection formula would cause division by zero, but - * standards would prefer we treat this error case as an - * invalid operation and return NaN instead. (Possibly - * because otherwise you'd have to decide which sign of - * infinity to return, and unlike the x=0 case, there's no - * sign of zero available to disambiguate.) - */ - return negadjust / negadjust; - } - } - - /* - * Split the positive domain into various cases. For cases where - * we do the negative-number adjustment the usual way, we'll leave - * the answer in 'g' and drop out of the if statement. - */ - long double g; - - if (exponent >= 11) { - /* - * gamma of any positive value this large overflows, and gamma - * of any negative value underflows. 
- */ - if (!negative) { - long double huge = 0x1p+12288L; - return huge * huge; /* provoke an overflow */ - } else { - long double tiny = 0x1p-12288L; - return tiny * tiny * negadjust; /* underflow, of the right sign */ - } - } else if (exponent >= 3) { - /* Negative-number adjustment happens inside here */ - return tgamma_large(x, negative, negadjust); - } else if (exponent < -113) { - /* Negative-number adjustment happens inside here */ - return tgamma_ultratiny(x, negative, negadjust); - } else if (exponent < -5) { - /* Negative-number adjustment happens inside here */ - return tgamma_tiny(x, negative, negadjust); - } else if (exponent == 0) { - g = tgamma_central(x); - } else if (exponent < 0) { - /* - * For x in [1/32,1) we range-reduce upwards to the interval - * [1,2), using the inverse of the normal recurrence formula: - * gamma(x) = gamma(x+1)/x. - */ - g = tgamma_central(1+x) / x; - } else { - /* - * For x in [2,8) we range-reduce downwards to the interval - * [1,2) by repeated application of the recurrence formula. - * - * Actually multiplying (x-1) by (x-2) by (x-3) and so on - * would introduce multiple ULPs of rounding error. We can get - * better accuracy by writing x = (k+1/2) + t, where k is an - * integer and |t|<1/2, and expanding out the obvious factor - * (x-1)(x-2)...(x-k+1) as a polynomial in t. - */ - long double mult; - int i = x; - if (i == 2) { /* x in [2,3) */ - mult = (x-1); - } else { - long double t = x - (i + 0.5L); - switch (i) { - /* E.g. for x=3.5+t, we want - * (x-1)(x-2) = (2.5+t)(1.5+t) = 3.75 + 4t + t^2 */ - case 3: - mult = 3.75L+t*(4.0L+t); - break; - case 4: - mult = 13.125L+t*(17.75L+t*(7.5L+t)); - break; - case 5: - mult = 59.0625L+t*(93.0L+t*(51.50L+t*(12.0L+t))); - break; - case 6: - mult = 324.84375L+t*(570.5625L+t*(376.250L+t*( - 117.5L+t*(17.5L+t)))); - break; - case 7: - mult = 2111.484375L+t*(4033.5L+t*(3016.1875L+t*( - 1140.0L+t*(231.25L+t*(24.0L+t))))); - break; - } - } - - g = tgamma_central(x - (i-1)) * mult; - } - - if (!negative) { - /* Positive domain: return g unmodified */ - return g; - } else { - /* Negative domain: apply the reflection formula as commented above */ - return 1.0L / (g * x * negadjust); - } -} diff --git a/math/tgamma128.h b/math/tgamma128.h deleted file mode 100644 index ced10c3..0000000 --- a/math/tgamma128.h +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Polynomial coefficients and other constants for tgamma128.c. - * - * Copyright (c) 2006,2009,2023 Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -/* The largest positive value for which 128-bit tgamma does not overflow. 
*/ -static const long double max_x = 0x1.b6e3180cd66a5c4206f128ba77f4p+10L; - -/* Coefficients of the polynomial used in the tgamma_large() subroutine */ -static const long double coeffs_large[] = { - 0x1.8535745aa79569579b9eec0f3bbcp+0L, - 0x1.0378f83c6fb8f0e51269f2b4a973p-3L, - 0x1.59f6a05094f69686c3380f4e2783p-8L, - -0x1.0b291dee952a82764a4859b081a6p-8L, - -0x1.6dd301b2205bf936b5a3eaad0dbbp-12L, - 0x1.387a8b5f38dd77e7f139b1021e86p-10L, - 0x1.bca46637f65b13750c728cc29e40p-14L, - -0x1.d80401c00aef998c9e303151a51cp-11L, - -0x1.49cb6bb09f935a2053ccc2cf3711p-14L, - 0x1.4e950204437dcaf2be77f73a6f45p-10L, - 0x1.cb711a2d65f188bf60110934d6bep-14L, - -0x1.7d7ff4bc95dc7faefc5e767f70f1p-9L, - -0x1.0305ab9760cddb0d833e73766836p-12L, - 0x1.3ef6c84bf1cd5c3f65ac2693bb5bp-7L, - 0x1.bb4144740ad9290123fdcea684aap-11L, - -0x1.72ab4e88272a229bfafd192450f0p-5L, - 0x1.80c70ac6eb3b7a698983d25a62b8p-12L, - 0x1.e222791c6743ce3e3cae220fb236p-3L, - 0x1.1a2dca1c82a9326c52b465f7cb7ap-2L, - -0x1.9d204fa235a42cd901b123d2ad47p+1L, - 0x1.55b56d1158f77ddb1c95fc44ab02p+0L, - 0x1.37f900a11dbd892abd7dde533e2dp+5L, - -0x1.2da49f4188dd89cb958369ef2401p+7L, - 0x1.fdae5ec3ec6eb7dffc09edbe6c14p+7L, - -0x1.61433cebe649098c9611c4c7774ap+7L, -}; - -/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */ -static const long double coeffs_tiny[] = { - 0x1.0000000000000000000000000000p+0L, - 0x1.2788cfc6fb618f49a37c7f0201fep-1L, - -0x1.4fcf4026afa2dceb8490ade22796p-1L, - -0x1.5815e8fa27047c8f42b5d9217244p-5L, - 0x1.5512320b43fbe5dfa771333518f7p-3L, - -0x1.59af103c340927bffdd44f954bfcp-5L, - -0x1.3b4af28483e210479657e5543366p-7L, - 0x1.d919c527f6070bfce9b29c2ace9cp-8L, - -0x1.317112ce35337def3556a18aa178p-10L, - -0x1.c364fe77a6f27677b985b1fa2e1dp-13L, - 0x1.0c8a7a19a3fd40fe1f7e867efe7bp-13L, - -0x1.51cf9f090b5dc398ba86305e3634p-16L, - -0x1.4e80f64c04a339740de06ca9fa4ap-20L, - 0x1.241ddc2aef2ec20e58b08f2fda17p-20L, -}; - -/* The location within the interval [1,2] where gamma has a minimum. - * Specified as the sum of two 128-bit values, for extra precision. */ -static const long double min_x_hi = 0x1.762d86356be3f6e1a9c8865e0a4fp+0L; -static const long double min_x_lo = 0x1.ac54d7d218de21303a7c60f08840p-118L; - -/* The actual minimum value that gamma takes at that location. - * Again specified as the sum of two 128-bit values. 
*/ -static const long double min_y_hi = 0x1.c56dc82a74aee8d8851566d40f32p-1L; -static const long double min_y_lo = 0x1.8ed98685742c353ce55e5794686fp-114L; - -/* Coefficients of the polynomial used in the tgamma_central() subroutine - * for computing gamma on the interval [1,min_x] */ -static const long double coeffs_central_neg[] = { - 0x1.b6c53f7377b83839c8a292e43b69p-2L, - 0x1.0bae9f40c7d09ed76e732045850ap-3L, - 0x1.4981175e14d04c3530e51d01c5fep-3L, - 0x1.79f77aaf032c948af3a9edbd2061p-4L, - 0x1.1e97bd10821095a5b79fbfdfa1a3p-4L, - 0x1.8071ce0935e4dcf0b33b0fbec7c1p-5L, - 0x1.0b44c2f92982f887b55ec36dfdb0p-5L, - 0x1.6df1de1e178ef72ca7bd63d40870p-6L, - 0x1.f63f502bde27e81c0f5e13479b43p-7L, - 0x1.57fd67d901f40ea011353ad89a0ap-7L, - 0x1.d7151376eed187eb753e2273cafcp-8L, - 0x1.427162b5c6ff1d904c71ef53e37cp-8L, - 0x1.b954b8c3a56cf93e49ef6538928ap-9L, - 0x1.2dff2ec26a3ae5cd3aaccae7a09ep-9L, - 0x1.9d35250d9b9378d9b59df734537ap-10L, - 0x1.1b2c0c48b9855a28f6dbd6fdff3cp-10L, - 0x1.7e0db39bb99cdb52b028d9359380p-11L, - 0x1.2164b5e1d364a0b5eaf97c436aa7p-11L, - 0x1.27521cf5fd24dcdf43524e6add11p-13L, - 0x1.06461d62243bf9a826b42349672fp-10L, - -0x1.2b852abead28209b4e0c756dc46ep-9L, - 0x1.be673c11a72c826115ec6d286c14p-8L, - -0x1.fd9ce330c215c31fcd3cb53c42ebp-7L, - 0x1.fa362bd2dc68f41abef2d8600acdp-6L, - -0x1.a21585b2f52f8b23855de8e452edp-5L, - 0x1.1f234431ed032052fc92e64e0493p-4L, - -0x1.40d332476ca0199c60cdae3f9132p-4L, - 0x1.1d45dc665d86012eba2eea199cefp-4L, - -0x1.8491016cdd08dc9be7ade9b5fef3p-5L, - 0x1.7e7e2fbc6d49ad484300d6add324p-6L, - -0x1.e63fe3f874a37276a8d7d8b705ecp-8L, - 0x1.30a2a73944f8c84998314d69c23fp-10L, -}; - -/* Coefficients of the polynomial used in the tgamma_central() subroutine - * for computing gamma on the interval [min_x,2] */ -static const long double coeffs_central_pos[] = { - 0x1.b6c53f7377b83839c8a292e22aa2p-2L, - -0x1.0bae9f40c7d09ed76e72e1c955dep-3L, - 0x1.4981175e14d04c3530ee5e1ecebcp-3L, - -0x1.79f77aaf032c948ac983d77f3e07p-4L, - 0x1.1e97bd10821095ab7dc94936cc11p-4L, - -0x1.8071ce0935e4d7edef8cbf2a1cf1p-5L, - 0x1.0b44c2f929837fafef7b5d9e80f1p-5L, - -0x1.6df1de1e175fe2a51faa25cddbb4p-6L, - 0x1.f63f502be57d11aed2cfe90843ffp-7L, - -0x1.57fd67d852f230015b9f64770273p-7L, - 0x1.d715138adc07e5fce81077070357p-8L, - -0x1.4271618e9fda8992a667adb15f4fp-8L, - 0x1.b954d15d9eb772e80fdd760672d7p-9L, - -0x1.2dfe391241d3cb79c8c15182843dp-9L, - 0x1.9d44396fcd48451c3ba924cee814p-10L, - -0x1.1ac195fb99739e341589e39803e6p-10L, - 0x1.82e46127b68f002770826e25f146p-11L, - -0x1.089dacd90d9f41493119ac178359p-11L, - 0x1.6993c007b20394a057d21f3d37f8p-12L, - -0x1.ec43a709f4446560c099dec8e31bp-13L, - 0x1.4ba36322f4074e9add9450f003cap-13L, - -0x1.b3f83a977965ca1b7937bf5b34cap-14L, - 0x1.10af346abc09cb25a6d9fe810b6ep-14L, - -0x1.38d8ea1188f242f50203edc395bdp-15L, - 0x1.39add987a948ec56f62b721a4475p-16L, - -0x1.02a4e141f286c8a967e2df9bc9adp-17L, - 0x1.433b50af22425f546e87113062d7p-19L, - -0x1.0c7b73cb0013f00aafc103e8e382p-21L, - 0x1.b852de313ec38da2297f6deaa6b4p-25L, -}; - -/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine - */ -static const long double pi = 0x1.921fb54442d18469898cc51701b8p+1L; diff --git a/math/tools/cos.sollya b/math/tools/cos.sollya index 6690adf..bd72d6b 100644 --- a/math/tools/cos.sollya +++ b/math/tools/cos.sollya @@ -1,7 +1,7 @@ // polynomial for approximating cos(x) // // Copyright (c) 2019, Arm Limited. 
-// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT deg = 8; // polynomial degree a = -pi/4; // interval diff --git a/math/tools/exp.sollya b/math/tools/exp.sollya index 0668bdb..b7a462c 100644 --- a/math/tools/exp.sollya +++ b/math/tools/exp.sollya @@ -1,7 +1,7 @@ // polynomial for approximating e^x // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT deg = 5; // poly degree N = 128; // table entries diff --git a/math/tools/exp2.sollya b/math/tools/exp2.sollya index bd0a42d..e760769 100644 --- a/math/tools/exp2.sollya +++ b/math/tools/exp2.sollya @@ -1,7 +1,7 @@ // polynomial for approximating 2^x // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT // exp2f parameters deg = 3; // poly degree diff --git a/math/tools/log.sollya b/math/tools/log.sollya index 5288f55..6df4db4 100644 --- a/math/tools/log.sollya +++ b/math/tools/log.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT deg = 12; // poly degree // |log(1+x)| > 0x1p-4 outside the interval diff --git a/math/tools/log2.sollya b/math/tools/log2.sollya index 85811be..4a364c0 100644 --- a/math/tools/log2.sollya +++ b/math/tools/log2.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log2(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT deg = 11; // poly degree // |log2(1+x)| > 0x1p-4 outside the interval diff --git a/math/tools/log2_abs.sollya b/math/tools/log2_abs.sollya index d018ba0..82c4dac 100644 --- a/math/tools/log2_abs.sollya +++ b/math/tools/log2_abs.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log2(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT deg = 7; // poly degree // interval ~= 1/(2*N), where N is the table entries diff --git a/math/tools/log_abs.sollya b/math/tools/log_abs.sollya index 5f9bfe4..a2ac190 100644 --- a/math/tools/log_abs.sollya +++ b/math/tools/log_abs.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT deg = 6; // poly degree // interval ~= 1/(2*N), where N is the table entries diff --git a/math/tools/plot.py b/math/tools/plot.py index a0fa023..6c8b89f 100755 --- a/math/tools/plot.py +++ b/math/tools/plot.py @@ -3,7 +3,7 @@ # ULP error plot tool. # # Copyright (c) 2019, Arm Limited. -# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +# SPDX-License-Identifier: MIT import numpy as np import matplotlib.pyplot as plt diff --git a/math/tools/remez.jl b/math/tools/remez.jl index 1deab67..2ff436f 100755 --- a/math/tools/remez.jl +++ b/math/tools/remez.jl @@ -4,7 +4,7 @@ # remez.jl - implementation of the Remez algorithm for polynomial approximation # # Copyright (c) 2015-2019, Arm Limited. 
-# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +# SPDX-License-Identifier: MIT import Base.\ diff --git a/math/tools/sin.sollya b/math/tools/sin.sollya index a193000..a6e8511 100644 --- a/math/tools/sin.sollya +++ b/math/tools/sin.sollya @@ -1,7 +1,7 @@ // polynomial for approximating sin(x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT deg = 7; // polynomial degree a = -pi/4; // interval diff --git a/math/tools/tgamma128_gen.jl b/math/tools/tgamma128_gen.jl deleted file mode 100644 index da76e8b..0000000 --- a/math/tools/tgamma128_gen.jl +++ /dev/null @@ -1,212 +0,0 @@ -# -*- julia -*- -# -# Generate tgamma128.h, containing polynomials and constants used by -# tgamma128.c. -# -# Copyright (c) 2006,2009,2023 Arm Limited. -# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - -# This Julia program depends on the 'Remez' and 'SpecialFunctions' -# library packages. To install them, run this at the interactive Julia -# prompt: -# -# import Pkg; Pkg.add(["Remez", "SpecialFunctions"]) -# -# Tested on Julia 1.4.1 (Ubuntu 20.04) and 1.9.0 (22.04). - -import Printf -import Remez -import SpecialFunctions - -# Round a BigFloat to 128-bit long double and format it as a C99 hex -# float literal. -function quadhex(x) - sign = " " - if x < 0 - sign = "-" - x = -x - end - - exponent = BigInt(floor(log2(x))) - exponent = max(exponent, -16382) - @assert(exponent <= 16383) # else overflow - - x /= BigFloat(2)^exponent - @assert(1 <= x < 2) - x *= BigFloat(2)^112 - mantissa = BigInt(round(x)) - - mantstr = string(mantissa, base=16, pad=29) - return Printf.@sprintf("%s0x%s.%sp%+dL", sign, mantstr[1], mantstr[2:end], - exponent) -end - -# Round a BigFloat to 128-bit long double and return it still as a -# BigFloat. -function quadval(x, round=0) - sign = +1 - if x.sign < 0 - sign = -1 - x = -x - end - - exponent = BigInt(floor(log2(x))) - exponent = max(exponent, -16382) - @assert(exponent <= 16383) # else overflow - - x /= BigFloat(2)^exponent - @assert(1 <= x < 2) - x *= BigFloat(2)^112 - if round < 0 - mantissa = floor(x) - elseif round > 0 - mantissa = ceil(x) - else - mantissa = round(x) - end - - return sign * mantissa * BigFloat(2)^(exponent - 112) -end - -# Output an array of BigFloats as a C array declaration. -function dumparray(a, name) - println("static const long double ", name, "[] = {") - for x in N - println(" ", quadhex(x), ",") - end - println("};") -end - -print("/* - * Polynomial coefficients and other constants for tgamma128.c. - * - * Copyright (c) 2006,2009,2023 Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ -") - -Base.MPFR.setprecision(512) - -e = exp(BigFloat(1)) - -print(" -/* The largest positive value for which 128-bit tgamma does not overflow. */ -") -lo = BigFloat("1000") -hi = BigFloat("2000") -while true - global lo - global hi - global max_x - - mid = (lo + hi) / 2 - if mid == lo || mid == hi - max_x = mid - break - end - if SpecialFunctions.logabsgamma(mid)[1] < 16384 * log(BigFloat(2)) - lo = mid - else - hi = mid - end -end -max_x = quadval(max_x, -1) -println("static const long double max_x = ", quadhex(max_x), ";") - -print(" -/* Coefficients of the polynomial used in the tgamma_large() subroutine */ -") -N, D, E, X = Remez.ratfn_minimax( - x -> x==0 ? 
sqrt(BigFloat(2)*pi/e) : - exp(SpecialFunctions.logabsgamma(1/x)[1] + - (1/x-0.5)*(1+log(x))), - (0, 1/BigFloat(8)), - 24, 0, - (x, y) -> 1/y -) -dumparray(N, "coeffs_large") - -print(" -/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */ -") -N, D, E, X = Remez.ratfn_minimax( - x -> x==0 ? 1 : 1/(x*SpecialFunctions.gamma(x)), - (0, 1/BigFloat(32)), - 13, 0, -) -dumparray(N, "coeffs_tiny") - -print(" -/* The location within the interval [1,2] where gamma has a minimum. - * Specified as the sum of two 128-bit values, for extra precision. */ -") -lo = BigFloat("1.4") -hi = BigFloat("1.5") -while true - global lo - global hi - global min_x - - mid = (lo + hi) / 2 - if mid == lo || mid == hi - min_x = mid - break - end - if SpecialFunctions.digamma(mid) < 0 - lo = mid - else - hi = mid - end -end -min_x_hi = quadval(min_x, -1) -println("static const long double min_x_hi = ", quadhex(min_x_hi), ";") -println("static const long double min_x_lo = ", quadhex(min_x - min_x_hi), ";") - -print(" -/* The actual minimum value that gamma takes at that location. - * Again specified as the sum of two 128-bit values. */ -") -min_y = SpecialFunctions.gamma(min_x) -min_y_hi = quadval(min_y, -1) -println("static const long double min_y_hi = ", quadhex(min_y_hi), ";") -println("static const long double min_y_lo = ", quadhex(min_y - min_y_hi), ";") - -function taylor_bodge(x) - # Taylor series generated by Wolfram Alpha for (gamma(min_x+x)-min_y)/x^2. - # Used in the Remez calls below for x values very near the origin, to avoid - # significance loss problems when trying to compute it directly via that - # formula (even in MPFR's extra precision). - return BigFloat("0.428486815855585429730209907810650582960483696962660010556335457558784421896667728014324097132413696263704801646004585959298743677879606168187061990204432200")+x*(-BigFloat("0.130704158939785761928008749242671025181542078105370084716141350308119418619652583986015464395882363802104154017741656168641240436089858504560718773026275797")+x*(BigFloat("0.160890753325112844190519489594363387594505844658437718135952967735294789599989664428071656484587979507034160383271974554122934842441540146372016567834062876")+x*(-BigFloat("0.092277030213334350126864106458600575084335085690780082222880945224248438672595248111704471182201673989215223667543694847795410779036800385804729955729659506")))) -end - -print(" -/* Coefficients of the polynomial used in the tgamma_central() subroutine - * for computing gamma on the interval [1,min_x] */ -") -N, D, E, X = Remez.ratfn_minimax( - x -> x < BigFloat(0x1p-64) ? taylor_bodge(-x) : - (SpecialFunctions.gamma(min_x - x) - min_y) / (x*x), - (0, min_x - 1), - 31, 0, - (x, y) -> x^2, -) -dumparray(N, "coeffs_central_neg") - -print(" -/* Coefficients of the polynomial used in the tgamma_central() subroutine - * for computing gamma on the interval [min_x,2] */ -") -N, D, E, X = Remez.ratfn_minimax( - x -> x < BigFloat(0x1p-64) ? taylor_bodge(x) : - (SpecialFunctions.gamma(min_x + x) - min_y) / (x*x), - (0, 2 - min_x), - 28, 0, - (x, y) -> x^2, -) -dumparray(N, "coeffs_central_pos") - -print(" -/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine - */ -") -println("static const long double pi = ", quadhex(BigFloat(pi)), ";") diff --git a/math/tools/v_exp.sollya b/math/tools/v_exp.sollya index 5fa7de7..c0abb63 100644 --- a/math/tools/v_exp.sollya +++ b/math/tools/v_exp.sollya @@ -1,7 +1,7 @@ // polynomial for approximating e^x // // Copyright (c) 2019, Arm Limited. 
-// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT deg = 4; // poly degree N = 128; // table entries diff --git a/math/tools/v_log.sollya b/math/tools/v_log.sollya index d982524..cc3d2c4 100644 --- a/math/tools/v_log.sollya +++ b/math/tools/v_log.sollya @@ -1,7 +1,7 @@ // polynomial used for __v_log(x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT deg = 6; // poly degree a = -0x1.fc1p-9; diff --git a/math/tools/v_sin.sollya b/math/tools/v_sin.sollya index 63b9d65..65cc995 100644 --- a/math/tools/v_sin.sollya +++ b/math/tools/v_sin.sollya @@ -1,7 +1,7 @@ // polynomial for approximating sin(x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +// SPDX-License-Identifier: MIT deg = 15; // polynomial degree a = -pi/2; // interval diff --git a/math/v_cos.c b/math/v_cos.c new file mode 100644 index 0000000..20ba6bd --- /dev/null +++ b/math/v_cos.c @@ -0,0 +1,87 @@ +/* + * Double-precision vector cos function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +static const double Poly[] = { +/* worst-case error is 3.5 ulp. + abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. */ +-0x1.9f4a9c8b21dc9p-41, + 0x1.60e88a10163f2p-33, +-0x1.ae6361b7254e7p-26, + 0x1.71de382e8d62bp-19, +-0x1.a01a019aeb4ffp-13, + 0x1.111111110b25ep-7, +-0x1.55555555554c3p-3, +}; + +#define C7 v_f64 (Poly[0]) +#define C6 v_f64 (Poly[1]) +#define C5 v_f64 (Poly[2]) +#define C4 v_f64 (Poly[3]) +#define C3 v_f64 (Poly[4]) +#define C2 v_f64 (Poly[5]) +#define C1 v_f64 (Poly[6]) + +#define InvPi v_f64 (0x1.45f306dc9c883p-2) +#define HalfPi v_f64 (0x1.921fb54442d18p+0) +#define Pi1 v_f64 (0x1.921fb54442d18p+1) +#define Pi2 v_f64 (0x1.1a62633145c06p-53) +#define Pi3 v_f64 (0x1.c1cd129024e09p-106) +#define Shift v_f64 (0x1.8p52) +#define RangeVal v_f64 (0x1p23) +#define AbsMask v_u64 (0x7fffffffffffffff) + +VPCS_ATTR +__attribute__ ((noinline)) static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +{ + return v_call_f64 (cos, x, y, cmp); +} + +VPCS_ATTR +v_f64_t +V_NAME(cos) (v_f64_t x) +{ + v_f64_t n, r, r2, y; + v_u64_t odd, cmp; + + r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask); + cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal)); + + /* n = rint((|x|+pi/2)/pi) - 0.5. */ + n = v_fma_f64 (InvPi, r + HalfPi, Shift); + odd = v_as_u64_f64 (n) << 63; + n -= Shift; + n -= v_f64 (0.5); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = v_fma_f64 (-Pi1, n, r); + r = v_fma_f64 (-Pi2, n, r); + r = v_fma_f64 (-Pi3, n, r); + + /* sin(r) poly approx. */ + r2 = r * r; + y = v_fma_f64 (C7, r2, C6); + y = v_fma_f64 (y, r2, C5); + y = v_fma_f64 (y, r2, C4); + y = v_fma_f64 (y, r2, C3); + y = v_fma_f64 (y, r2, C2); + y = v_fma_f64 (y, r2, C1); + y = v_fma_f64 (y * r2, r, r); + + /* sign. */ + y = v_as_f64_u64 (v_as_u64_f64 (y) ^ odd); + + if (unlikely (v_any_u64 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS +#endif diff --git a/math/v_cosf.c b/math/v_cosf.c new file mode 100644 index 0000000..150294b --- /dev/null +++ b/math/v_cosf.c @@ -0,0 +1,76 @@ +/* + * Single-precision vector cos function. + * + * Copyright (c) 2019, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +static const float Poly[] = { + /* 1.886 ulp error */ + 0x1.5b2e76p-19f, + -0x1.9f42eap-13f, + 0x1.110df4p-7f, + -0x1.555548p-3f, +}; +#define Pi1 v_f32 (0x1.921fb6p+1f) +#define Pi2 v_f32 (-0x1.777a5cp-24f) +#define Pi3 v_f32 (-0x1.ee59dap-49f) +#define A3 v_f32 (Poly[3]) +#define A5 v_f32 (Poly[2]) +#define A7 v_f32 (Poly[1]) +#define A9 v_f32 (Poly[0]) +#define RangeVal v_f32 (0x1p20f) +#define InvPi v_f32 (0x1.45f306p-2f) +#define Shift v_f32 (0x1.8p+23f) +#define AbsMask v_u32 (0x7fffffff) +#define HalfPi v_f32 (0x1.921fb6p0f) + +VPCS_ATTR +static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + /* Fall back to scalar code. */ + return v_call_f32 (cosf, x, y, cmp); +} + +VPCS_ATTR +v_f32_t +V_NAME(cosf) (v_f32_t x) +{ + v_f32_t n, r, r2, y; + v_u32_t odd, cmp; + + r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask); + cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal)); + + /* n = rint((|x|+pi/2)/pi) - 0.5 */ + n = v_fma_f32 (InvPi, r + HalfPi, Shift); + odd = v_as_u32_f32 (n) << 31; + n -= Shift; + n -= v_f32 (0.5f); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ + r = v_fma_f32 (-Pi1, n, r); + r = v_fma_f32 (-Pi2, n, r); + r = v_fma_f32 (-Pi3, n, r); + + /* y = sin(r) */ + r2 = r * r; + y = v_fma_f32 (A9, r2, A7); + y = v_fma_f32 (y, r2, A5); + y = v_fma_f32 (y, r2, A3); + y = v_fma_f32 (y * r2, r, r); + + /* sign fix */ + y = v_as_f32_u32 (v_as_u32_f32 (y) ^ odd); + + if (unlikely (v_any_u32 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS +#endif diff --git a/math/v_exp.c b/math/v_exp.c new file mode 100644 index 0000000..e459d53 --- /dev/null +++ b/math/v_exp.c @@ -0,0 +1,94 @@ +/* + * Double-precision vector e^x function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED +#include "v_exp.h" + +#if V_EXP_TABLE_BITS == 7 +/* maxerr: 1.88 +0.5 ulp + rel error: 1.4337*2^-53 + abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */ +#define C1 v_f64 (0x1.ffffffffffd43p-2) +#define C2 v_f64 (0x1.55555c75adbb2p-3) +#define C3 v_f64 (0x1.55555da646206p-5) +#define InvLn2 v_f64 (0x1.71547652b82fep7) /* N/ln2. */ +#define Ln2hi v_f64 (0x1.62e42fefa39efp-8) /* ln2/N. */ +#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-63) +#elif V_EXP_TABLE_BITS == 8 +/* maxerr: 0.54 +0.5 ulp + rel error: 1.4318*2^-58 + abs error: 1.4299*2^-58 in [ -ln2/512, ln2/512 ]. */ +#define C1 v_f64 (0x1.fffffffffffd4p-2) +#define C2 v_f64 (0x1.5555571d6b68cp-3) +#define C3 v_f64 (0x1.5555576a59599p-5) +#define InvLn2 v_f64 (0x1.71547652b82fep8) +#define Ln2hi v_f64 (0x1.62e42fefa39efp-9) +#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-64) +#endif + +#define N (1 << V_EXP_TABLE_BITS) +#define Tab __v_exp_data +#define IndexMask v_u64 (N - 1) +#define Shift v_f64 (0x1.8p+52) +#define Thres v_f64 (704.0) + +VPCS_ATTR +static v_f64_t +specialcase (v_f64_t s, v_f64_t y, v_f64_t n) +{ + v_f64_t absn = v_abs_f64 (n); + + /* 2^(n/N) may overflow, break it up into s1*s2. 
*/ + v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000); + v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b); + v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b); + v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N)); + v_f64_t r1 = s1 * s1; + v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1; + return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0))); +} + +VPCS_ATTR +v_f64_t +V_NAME(exp) (v_f64_t x) +{ + v_f64_t n, r, r2, s, y, z; + v_u64_t cmp, u, e, i; + + cmp = v_cond_u64 (v_abs_f64 (x) > Thres); + + /* n = round(x/(ln2/N)). */ + z = v_fma_f64 (x, InvLn2, Shift); + u = v_as_u64_f64 (z); + n = z - Shift; + + /* r = x - n*ln2/N. */ + r = x; + r = v_fma_f64 (-Ln2hi, n, r); + r = v_fma_f64 (-Ln2lo, n, r); + + e = u << (52 - V_EXP_TABLE_BITS); + i = u & IndexMask; + + /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ + r2 = r * r; + y = v_fma_f64 (C2, r, C1); + y = v_fma_f64 (C3, r2, y); + y = v_fma_f64 (y, r2, r); + + /* s = 2^(n/N). */ + u = v_lookup_u64 (Tab, i); + s = v_as_f64_u64 (u + e); + + if (unlikely (v_any_u64 (cmp))) + return specialcase (s, y, n); + return v_fma_f64 (y, s, s); +} +VPCS_ALIAS +#endif diff --git a/math/v_exp.h b/math/v_exp.h new file mode 100644 index 0000000..305da19 --- /dev/null +++ b/math/v_exp.h @@ -0,0 +1,14 @@ +/* + * Declarations for double-precision e^x vector function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "v_math.h" +#if WANT_VMATH + +#define V_EXP_TABLE_BITS 7 + +extern const u64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN; +#endif diff --git a/math/v_exp2f.c b/math/v_exp2f.c new file mode 100644 index 0000000..e3ea5af --- /dev/null +++ b/math/v_exp2f.c @@ -0,0 +1,78 @@ +/* + * Single-precision vector 2^x function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +static const float Poly[] = { + /* maxerr: 1.962 ulp. */ + 0x1.59977ap-10f, + 0x1.3ce9e4p-7f, + 0x1.c6bd32p-5f, + 0x1.ebf9bcp-3f, + 0x1.62e422p-1f, +}; +#define C0 v_f32 (Poly[0]) +#define C1 v_f32 (Poly[1]) +#define C2 v_f32 (Poly[2]) +#define C3 v_f32 (Poly[3]) +#define C4 v_f32 (Poly[4]) + +#define Shift v_f32 (0x1.8p23f) + +VPCS_ATTR +static v_f32_t +specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) +{ + /* 2^n may overflow, break it up into s1*s2. */ + v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); + v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); + v_f32_t s2 = v_as_f32_u32 (e - b); + v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); + v_u32_t r2 = v_as_u32_f32 (s1 * s1); + v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); + /* Similar to r1 but avoids double rounding in the subnormal range. */ + v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); + return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); +} + +VPCS_ATTR +v_f32_t +V_NAME(exp2f) (v_f32_t x) +{ + v_f32_t n, r, r2, scale, p, q, poly, absn; + v_u32_t cmp, e; + + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. 
*/ +#if 0 + v_f32_t z; + z = x + Shift; + n = z - Shift; + r = x - n; + e = v_as_u32_f32 (z) << 23; +#else + n = v_round_f32 (x); + r = x - n; + e = v_as_u32_s32 (v_round_s32 (x)) << 23; +#endif + scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); + absn = v_abs_f32 (n); + cmp = v_cond_u32 (absn > v_f32 (126.0f)); + r2 = r * r; + p = v_fma_f32 (C0, r, C1); + q = v_fma_f32 (C2, r, C3); + q = v_fma_f32 (p, r2, q); + p = C4 * r; + poly = v_fma_f32 (q, r2, p); + if (unlikely (v_any_u32 (cmp))) + return specialcase (poly, n, e, absn, cmp, scale); + return v_fma_f32 (poly, scale, scale); +} +VPCS_ALIAS +#endif diff --git a/math/aarch64/v_exp2f_1u.c b/math/v_exp2f_1u.c similarity index 43% rename from math/aarch64/v_exp2f_1u.c rename to math/v_exp2f_1u.c index ba6b02f..1caa14d 100644 --- a/math/aarch64/v_exp2f_1u.c +++ b/math/v_exp2f_1u.c @@ -1,12 +1,13 @@ /* * Single-precision vector 2^x function. * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT */ #include "mathlib.h" #include "v_math.h" +#if V_SUPPORTED static const float Poly[] = { /* maxerr: 0.878 ulp. */ @@ -24,49 +25,51 @@ static const float Poly[] = { #define Ln2hi v_f32 (0x1.62e4p-1f) #define Ln2lo v_f32 (0x1.7f7d1cp-20f) -static float32x4_t VPCS_ATTR NOINLINE -specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn) +VPCS_ATTR +static v_f32_t +specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn) { /* 2^n may overflow, break it up into s1*s2. */ - uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000); - float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b); - float32x4_t s2 = vreinterpretq_f32_u32 (e - b); - uint32x4_t cmp = absn > v_f32 (192.0f); - float32x4_t r1 = s1 * s1; - float32x4_t r0 = poly * s1 * s2; - return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) - | (~cmp & vreinterpretq_u32_f32 (r0))); + v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000); + v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); + v_f32_t s2 = v_as_f32_u32 (e - b); + v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f)); + v_f32_t r1 = s1 * s1; + v_f32_t r0 = poly * s1 * s2; + return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0))); } -float32x4_t VPCS_ATTR -_ZGVnN4v_exp2f_1u (float32x4_t x) +VPCS_ATTR +v_f32_t +V_NAME(exp2f_1u) (v_f32_t x) { - float32x4_t n, r, scale, poly, absn; - uint32x4_t cmp, e; + v_f32_t n, r, scale, poly, absn; + v_u32_t cmp, e; /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] x = n + r, with r in [-1/2, 1/2]. 
*/ #if 0 - float32x4_t z; + v_f32_t z; z = x + Shift; n = z - Shift; r = x - n; - e = vreinterpretq_u32_f32 (z) << 23; + e = v_as_u32_f32 (z) << 23; #else - n = vrndaq_f32 (x); + n = v_round_f32 (x); r = x - n; - e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23; + e = v_as_u32_s32 (v_round_s32 (x)) << 23; #endif - scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000)); - absn = vabsq_f32 (n); - cmp = absn > v_f32 (126.0f); - poly = vfmaq_f32 (C1, C0, r); - poly = vfmaq_f32 (C2, poly, r); - poly = vfmaq_f32 (C3, poly, r); - poly = vfmaq_f32 (C4, poly, r); - poly = vfmaq_f32 (C5, poly, r); - poly = vfmaq_f32 (v_f32 (1.0f), poly, r); + scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); + absn = v_abs_f32 (n); + cmp = v_cond_u32 (absn > v_f32 (126.0f)); + poly = v_fma_f32 (C0, r, C1); + poly = v_fma_f32 (poly, r, C2); + poly = v_fma_f32 (poly, r, C3); + poly = v_fma_f32 (poly, r, C4); + poly = v_fma_f32 (poly, r, C5); + poly = v_fma_f32 (poly, r, v_f32 (1.0f)); if (unlikely (v_any_u32 (cmp))) return specialcase (poly, n, e, absn); return scale * poly; } +#endif diff --git a/math/v_exp_data.c b/math/v_exp_data.c new file mode 100644 index 0000000..3653554 --- /dev/null +++ b/math/v_exp_data.c @@ -0,0 +1,403 @@ +/* + * Lookup table for double-precision e^x vector function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "v_exp.h" +#if WANT_VMATH + +#define N (1 << V_EXP_TABLE_BITS) + +/* 2^(j/N), j=0..N. */ +const u64_t __v_exp_data[] = { +#if N == 128 +0x3ff0000000000000, +0x3feff63da9fb3335, +0x3fefec9a3e778061, +0x3fefe315e86e7f85, +0x3fefd9b0d3158574, +0x3fefd06b29ddf6de, +0x3fefc74518759bc8, +0x3fefbe3ecac6f383, +0x3fefb5586cf9890f, +0x3fefac922b7247f7, +0x3fefa3ec32d3d1a2, +0x3fef9b66affed31b, +0x3fef9301d0125b51, +0x3fef8abdc06c31cc, +0x3fef829aaea92de0, +0x3fef7a98c8a58e51, +0x3fef72b83c7d517b, +0x3fef6af9388c8dea, +0x3fef635beb6fcb75, +0x3fef5be084045cd4, +0x3fef54873168b9aa, +0x3fef4d5022fcd91d, +0x3fef463b88628cd6, +0x3fef3f49917ddc96, +0x3fef387a6e756238, +0x3fef31ce4fb2a63f, +0x3fef2b4565e27cdd, +0x3fef24dfe1f56381, +0x3fef1e9df51fdee1, +0x3fef187fd0dad990, +0x3fef1285a6e4030b, +0x3fef0cafa93e2f56, +0x3fef06fe0a31b715, +0x3fef0170fc4cd831, +0x3feefc08b26416ff, +0x3feef6c55f929ff1, +0x3feef1a7373aa9cb, +0x3feeecae6d05d866, +0x3feee7db34e59ff7, +0x3feee32dc313a8e5, +0x3feedea64c123422, +0x3feeda4504ac801c, +0x3feed60a21f72e2a, +0x3feed1f5d950a897, +0x3feece086061892d, +0x3feeca41ed1d0057, +0x3feec6a2b5c13cd0, +0x3feec32af0d7d3de, +0x3feebfdad5362a27, +0x3feebcb299fddd0d, +0x3feeb9b2769d2ca7, +0x3feeb6daa2cf6642, +0x3feeb42b569d4f82, +0x3feeb1a4ca5d920f, +0x3feeaf4736b527da, +0x3feead12d497c7fd, +0x3feeab07dd485429, +0x3feea9268a5946b7, +0x3feea76f15ad2148, +0x3feea5e1b976dc09, +0x3feea47eb03a5585, +0x3feea34634ccc320, +0x3feea23882552225, +0x3feea155d44ca973, +0x3feea09e667f3bcd, +0x3feea012750bdabf, +0x3fee9fb23c651a2f, +0x3fee9f7df9519484, +0x3fee9f75e8ec5f74, +0x3fee9f9a48a58174, +0x3fee9feb564267c9, +0x3feea0694fde5d3f, +0x3feea11473eb0187, +0x3feea1ed0130c132, +0x3feea2f336cf4e62, +0x3feea427543e1a12, +0x3feea589994cce13, +0x3feea71a4623c7ad, +0x3feea8d99b4492ed, +0x3feeaac7d98a6699, +0x3feeace5422aa0db, +0x3feeaf3216b5448c, +0x3feeb1ae99157736, +0x3feeb45b0b91ffc6, +0x3feeb737b0cdc5e5, +0x3feeba44cbc8520f, +0x3feebd829fde4e50, +0x3feec0f170ca07ba, +0x3feec49182a3f090, +0x3feec86319e32323, +0x3feecc667b5de565, +0x3feed09bec4a2d33, +0x3feed503b23e255d, +0x3feed99e1330b358, +0x3feede6b5579fdbf, +0x3feee36bbfd3f37a, +0x3feee89f995ad3ad, 
+0x3feeee07298db666, +0x3feef3a2b84f15fb, +0x3feef9728de5593a, +0x3feeff76f2fb5e47, +0x3fef05b030a1064a, +0x3fef0c1e904bc1d2, +0x3fef12c25bd71e09, +0x3fef199bdd85529c, +0x3fef20ab5fffd07a, +0x3fef27f12e57d14b, +0x3fef2f6d9406e7b5, +0x3fef3720dcef9069, +0x3fef3f0b555dc3fa, +0x3fef472d4a07897c, +0x3fef4f87080d89f2, +0x3fef5818dcfba487, +0x3fef60e316c98398, +0x3fef69e603db3285, +0x3fef7321f301b460, +0x3fef7c97337b9b5f, +0x3fef864614f5a129, +0x3fef902ee78b3ff6, +0x3fef9a51fbc74c83, +0x3fefa4afa2a490da, +0x3fefaf482d8e67f1, +0x3fefba1bee615a27, +0x3fefc52b376bba97, +0x3fefd0765b6e4540, +0x3fefdbfdad9cbe14, +0x3fefe7c1819e90d8, +0x3feff3c22b8f71f1, +#elif N == 256 +0x3ff0000000000000, +0x3feffb1afa5abcbf, +0x3feff63da9fb3335, +0x3feff168143b0281, +0x3fefec9a3e778061, +0x3fefe7d42e11bbcc, +0x3fefe315e86e7f85, +0x3fefde5f72f654b1, +0x3fefd9b0d3158574, +0x3fefd50a0e3c1f89, +0x3fefd06b29ddf6de, +0x3fefcbd42b72a836, +0x3fefc74518759bc8, +0x3fefc2bdf66607e0, +0x3fefbe3ecac6f383, +0x3fefb9c79b1f3919, +0x3fefb5586cf9890f, +0x3fefb0f145e46c85, +0x3fefac922b7247f7, +0x3fefa83b23395dec, +0x3fefa3ec32d3d1a2, +0x3fef9fa55fdfa9c5, +0x3fef9b66affed31b, +0x3fef973028d7233e, +0x3fef9301d0125b51, +0x3fef8edbab5e2ab6, +0x3fef8abdc06c31cc, +0x3fef86a814f204ab, +0x3fef829aaea92de0, +0x3fef7e95934f312e, +0x3fef7a98c8a58e51, +0x3fef76a45471c3c2, +0x3fef72b83c7d517b, +0x3fef6ed48695bbc0, +0x3fef6af9388c8dea, +0x3fef672658375d2f, +0x3fef635beb6fcb75, +0x3fef5f99f8138a1c, +0x3fef5be084045cd4, +0x3fef582f95281c6b, +0x3fef54873168b9aa, +0x3fef50e75eb44027, +0x3fef4d5022fcd91d, +0x3fef49c18438ce4d, +0x3fef463b88628cd6, +0x3fef42be3578a819, +0x3fef3f49917ddc96, +0x3fef3bdda27912d1, +0x3fef387a6e756238, +0x3fef351ffb82140a, +0x3fef31ce4fb2a63f, +0x3fef2e85711ece75, +0x3fef2b4565e27cdd, +0x3fef280e341ddf29, +0x3fef24dfe1f56381, +0x3fef21ba7591bb70, +0x3fef1e9df51fdee1, +0x3fef1b8a66d10f13, +0x3fef187fd0dad990, +0x3fef157e39771b2f, +0x3fef1285a6e4030b, +0x3fef0f961f641589, +0x3fef0cafa93e2f56, +0x3fef09d24abd886b, +0x3fef06fe0a31b715, +0x3fef0432edeeb2fd, +0x3fef0170fc4cd831, +0x3feefeb83ba8ea32, +0x3feefc08b26416ff, +0x3feef96266e3fa2d, +0x3feef6c55f929ff1, +0x3feef431a2de883b, +0x3feef1a7373aa9cb, +0x3feeef26231e754a, +0x3feeecae6d05d866, +0x3feeea401b7140ef, +0x3feee7db34e59ff7, +0x3feee57fbfec6cf4, +0x3feee32dc313a8e5, +0x3feee0e544ede173, +0x3feedea64c123422, +0x3feedc70df1c5175, +0x3feeda4504ac801c, +0x3feed822c367a024, +0x3feed60a21f72e2a, +0x3feed3fb2709468a, +0x3feed1f5d950a897, +0x3feecffa3f84b9d4, +0x3feece086061892d, +0x3feecc2042a7d232, +0x3feeca41ed1d0057, +0x3feec86d668b3237, +0x3feec6a2b5c13cd0, +0x3feec4e1e192aed2, +0x3feec32af0d7d3de, +0x3feec17dea6db7d7, +0x3feebfdad5362a27, +0x3feebe41b817c114, +0x3feebcb299fddd0d, +0x3feebb2d81d8abff, +0x3feeb9b2769d2ca7, +0x3feeb8417f4531ee, +0x3feeb6daa2cf6642, +0x3feeb57de83f4eef, +0x3feeb42b569d4f82, +0x3feeb2e2f4f6ad27, +0x3feeb1a4ca5d920f, +0x3feeb070dde910d2, +0x3feeaf4736b527da, +0x3feeae27dbe2c4cf, +0x3feead12d497c7fd, +0x3feeac0827ff07cc, +0x3feeab07dd485429, +0x3feeaa11fba87a03, +0x3feea9268a5946b7, +0x3feea84590998b93, +0x3feea76f15ad2148, +0x3feea6a320dceb71, +0x3feea5e1b976dc09, +0x3feea52ae6cdf6f4, +0x3feea47eb03a5585, +0x3feea3dd1d1929fd, +0x3feea34634ccc320, +0x3feea2b9febc8fb7, +0x3feea23882552225, +0x3feea1c1c70833f6, +0x3feea155d44ca973, +0x3feea0f4b19e9538, +0x3feea09e667f3bcd, +0x3feea052fa75173e, +0x3feea012750bdabf, +0x3fee9fdcddd47645, +0x3fee9fb23c651a2f, +0x3fee9f9298593ae5, +0x3fee9f7df9519484, +0x3fee9f7466f42e87, +0x3fee9f75e8ec5f74, 
+0x3fee9f8286ead08a, +0x3fee9f9a48a58174, +0x3fee9fbd35d7cbfd, +0x3fee9feb564267c9, +0x3feea024b1ab6e09, +0x3feea0694fde5d3f, +0x3feea0b938ac1cf6, +0x3feea11473eb0187, +0x3feea17b0976cfdb, +0x3feea1ed0130c132, +0x3feea26a62ff86f0, +0x3feea2f336cf4e62, +0x3feea3878491c491, +0x3feea427543e1a12, +0x3feea4d2add106d9, +0x3feea589994cce13, +0x3feea64c1eb941f7, +0x3feea71a4623c7ad, +0x3feea7f4179f5b21, +0x3feea8d99b4492ed, +0x3feea9cad931a436, +0x3feeaac7d98a6699, +0x3feeabd0a478580f, +0x3feeace5422aa0db, +0x3feeae05bad61778, +0x3feeaf3216b5448c, +0x3feeb06a5e0866d9, +0x3feeb1ae99157736, +0x3feeb2fed0282c8a, +0x3feeb45b0b91ffc6, +0x3feeb5c353aa2fe2, +0x3feeb737b0cdc5e5, +0x3feeb8b82b5f98e5, +0x3feeba44cbc8520f, +0x3feebbdd9a7670b3, +0x3feebd829fde4e50, +0x3feebf33e47a22a2, +0x3feec0f170ca07ba, +0x3feec2bb4d53fe0d, +0x3feec49182a3f090, +0x3feec674194bb8d5, +0x3feec86319e32323, +0x3feeca5e8d07f29e, +0x3feecc667b5de565, +0x3feece7aed8eb8bb, +0x3feed09bec4a2d33, +0x3feed2c980460ad8, +0x3feed503b23e255d, +0x3feed74a8af46052, +0x3feed99e1330b358, +0x3feedbfe53c12e59, +0x3feede6b5579fdbf, +0x3feee0e521356eba, +0x3feee36bbfd3f37a, +0x3feee5ff3a3c2774, +0x3feee89f995ad3ad, +0x3feeeb4ce622f2ff, +0x3feeee07298db666, +0x3feef0ce6c9a8952, +0x3feef3a2b84f15fb, +0x3feef68415b749b1, +0x3feef9728de5593a, +0x3feefc6e29f1c52a, +0x3feeff76f2fb5e47, +0x3fef028cf22749e4, +0x3fef05b030a1064a, +0x3fef08e0b79a6f1f, +0x3fef0c1e904bc1d2, +0x3fef0f69c3f3a207, +0x3fef12c25bd71e09, +0x3fef16286141b33d, +0x3fef199bdd85529c, +0x3fef1d1cd9fa652c, +0x3fef20ab5fffd07a, +0x3fef244778fafb22, +0x3fef27f12e57d14b, +0x3fef2ba88988c933, +0x3fef2f6d9406e7b5, +0x3fef33405751c4db, +0x3fef3720dcef9069, +0x3fef3b0f2e6d1675, +0x3fef3f0b555dc3fa, +0x3fef43155b5bab74, +0x3fef472d4a07897c, +0x3fef4b532b08c968, +0x3fef4f87080d89f2, +0x3fef53c8eacaa1d6, +0x3fef5818dcfba487, +0x3fef5c76e862e6d3, +0x3fef60e316c98398, +0x3fef655d71ff6075, +0x3fef69e603db3285, +0x3fef6e7cd63a8315, +0x3fef7321f301b460, +0x3fef77d5641c0658, +0x3fef7c97337b9b5f, +0x3fef81676b197d17, +0x3fef864614f5a129, +0x3fef8b333b16ee12, +0x3fef902ee78b3ff6, +0x3fef953924676d76, +0x3fef9a51fbc74c83, +0x3fef9f7977cdb740, +0x3fefa4afa2a490da, +0x3fefa9f4867cca6e, +0x3fefaf482d8e67f1, +0x3fefb4aaa2188510, +0x3fefba1bee615a27, +0x3fefbf9c1cb6412a, +0x3fefc52b376bba97, +0x3fefcac948dd7274, +0x3fefd0765b6e4540, +0x3fefd632798844f8, +0x3fefdbfdad9cbe14, +0x3fefe1d802243c89, +0x3fefe7c1819e90d8, +0x3fefedba3692d514, +0x3feff3c22b8f71f1, +0x3feff9d96b2a23d9, +#endif +}; +#endif diff --git a/math/v_expf.c b/math/v_expf.c new file mode 100644 index 0000000..d403e00 --- /dev/null +++ b/math/v_expf.c @@ -0,0 +1,83 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +static const float Poly[] = { + /* maxerr: 1.45358 +0.5 ulp. */ + 0x1.0e4020p-7f, + 0x1.573e2ep-5f, + 0x1.555e66p-3f, + 0x1.fffdb6p-2f, + 0x1.ffffecp-1f, +}; +#define C0 v_f32 (Poly[0]) +#define C1 v_f32 (Poly[1]) +#define C2 v_f32 (Poly[2]) +#define C3 v_f32 (Poly[3]) +#define C4 v_f32 (Poly[4]) + +#define Shift v_f32 (0x1.8p23f) +#define InvLn2 v_f32 (0x1.715476p+0f) +#define Ln2hi v_f32 (0x1.62e4p-1f) +#define Ln2lo v_f32 (0x1.7f7d1cp-20f) + +VPCS_ATTR +static v_f32_t +specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) +{ + /* 2^n may overflow, break it up into s1*s2. 
*/ + v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); + v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); + v_f32_t s2 = v_as_f32_u32 (e - b); + v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); + v_u32_t r2 = v_as_u32_f32 (s1 * s1); + v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); + /* Similar to r1 but avoids double rounding in the subnormal range. */ + v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); + return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); +} + +VPCS_ATTR +v_f32_t +V_NAME(expf) (v_f32_t x) +{ + v_f32_t n, r, r2, scale, p, q, poly, absn, z; + v_u32_t cmp, e; + + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ +#if 1 + z = v_fma_f32 (x, InvLn2, Shift); + n = z - Shift; + r = v_fma_f32 (n, -Ln2hi, x); + r = v_fma_f32 (n, -Ln2lo, r); + e = v_as_u32_f32 (z) << 23; +#else + z = x * InvLn2; + n = v_round_f32 (z); + r = v_fma_f32 (n, -Ln2hi, x); + r = v_fma_f32 (n, -Ln2lo, r); + e = v_as_u32_s32 (v_round_s32 (z)) << 23; +#endif + scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); + absn = v_abs_f32 (n); + cmp = v_cond_u32 (absn > v_f32 (126.0f)); + r2 = r * r; + p = v_fma_f32 (C0, r, C1); + q = v_fma_f32 (C2, r, C3); + q = v_fma_f32 (p, r2, q); + p = C4 * r; + poly = v_fma_f32 (q, r2, p); + if (unlikely (v_any_u32 (cmp))) + return specialcase (poly, n, e, absn, cmp, scale); + return v_fma_f32 (poly, scale, scale); +} +VPCS_ALIAS +#endif diff --git a/math/aarch64/v_expf_1u.c b/math/v_expf_1u.c similarity index 39% rename from math/aarch64/v_expf_1u.c rename to math/v_expf_1u.c index 43d03fa..023bd24 100644 --- a/math/aarch64/v_expf_1u.c +++ b/math/v_expf_1u.c @@ -1,12 +1,13 @@ /* * Single-precision vector e^x function. * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT */ #include "mathlib.h" #include "v_math.h" +#if V_SUPPORTED static const float Poly[] = { /* maxerr: 0.36565 +0.5 ulp. */ @@ -27,51 +28,53 @@ static const float Poly[] = { #define Ln2hi v_f32 (0x1.62e4p-1f) #define Ln2lo v_f32 (0x1.7f7d1cp-20f) -static float32x4_t VPCS_ATTR NOINLINE -specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn) +VPCS_ATTR +static v_f32_t +specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn) { /* 2^n may overflow, break it up into s1*s2. */ - uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000); - float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b); - float32x4_t s2 = vreinterpretq_f32_u32 (e - b); - uint32x4_t cmp = absn > v_f32 (192.0f); - float32x4_t r1 = s1 * s1; - float32x4_t r0 = poly * s1 * s2; - return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) - | (~cmp & vreinterpretq_u32_f32 (r0))); + v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000); + v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); + v_f32_t s2 = v_as_f32_u32 (e - b); + v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f)); + v_f32_t r1 = s1 * s1; + v_f32_t r0 = poly * s1 * s2; + return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0))); } -float32x4_t VPCS_ATTR -_ZGVnN4v_expf_1u (float32x4_t x) +VPCS_ATTR +v_f32_t +V_NAME(expf_1u) (v_f32_t x) { - float32x4_t n, r, scale, poly, absn, z; - uint32x4_t cmp, e; + v_f32_t n, r, scale, poly, absn, z; + v_u32_t cmp, e; /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/ #if 1 - z = vfmaq_f32 (Shift, x, InvLn2); + z = v_fma_f32 (x, InvLn2, Shift); n = z - Shift; - r = vfmaq_f32 (x, n, -Ln2hi); - r = vfmaq_f32 (r, n, -Ln2lo); - e = vreinterpretq_u32_f32 (z) << 23; + r = v_fma_f32 (n, -Ln2hi, x); + r = v_fma_f32 (n, -Ln2lo, r); + e = v_as_u32_f32 (z) << 23; #else z = x * InvLn2; - n = vrndaq_f32 (z); - r = vfmaq_f32 (x, n, -Ln2hi); - r = vfmaq_f32 (r, n, -Ln2lo); - e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)) << 23; + n = v_round_f32 (z); + r = v_fma_f32 (n, -Ln2hi, x); + r = v_fma_f32 (n, -Ln2lo, r); + e = v_as_u32_s32 (v_round_s32 (z)) << 23; #endif - scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000)); - absn = vabsq_f32 (n); - cmp = absn > v_f32 (126.0f); - poly = vfmaq_f32 (C1, C0, r); - poly = vfmaq_f32 (C2, poly, r); - poly = vfmaq_f32 (C3, poly, r); - poly = vfmaq_f32 (C4, poly, r); - poly = vfmaq_f32 (v_f32 (1.0f), poly, r); - poly = vfmaq_f32 (v_f32 (1.0f), poly, r); + scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); + absn = v_abs_f32 (n); + cmp = v_cond_u32 (absn > v_f32 (126.0f)); + poly = v_fma_f32 (C0, r, C1); + poly = v_fma_f32 (poly, r, C2); + poly = v_fma_f32 (poly, r, C3); + poly = v_fma_f32 (poly, r, C4); + poly = v_fma_f32 (poly, r, v_f32 (1.0f)); + poly = v_fma_f32 (poly, r, v_f32 (1.0f)); if (unlikely (v_any_u32 (cmp))) return specialcase (poly, n, e, absn); return scale * poly; } +#endif diff --git a/math/v_log.c b/math/v_log.c new file mode 100644 index 0000000..d84c740 --- /dev/null +++ b/math/v_log.c @@ -0,0 +1,104 @@ +/* + * Double-precision vector log(x) function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#include "v_log.h" +#if V_SUPPORTED + +/* Worst-case error: 1.17 + 0.5 ulp. */ + +static const f64_t Poly[] = { + /* rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ + -0x1.ffffffffffff7p-2, + 0x1.55555555170d4p-2, + -0x1.0000000399c27p-2, + 0x1.999b2e90e94cap-3, + -0x1.554e550bd501ep-3, +}; + +#define A0 v_f64 (Poly[0]) +#define A1 v_f64 (Poly[1]) +#define A2 v_f64 (Poly[2]) +#define A3 v_f64 (Poly[3]) +#define A4 v_f64 (Poly[4]) +#define Ln2 v_f64 (0x1.62e42fefa39efp-1) +#define N (1 << V_LOG_TABLE_BITS) +#define OFF v_u64 (0x3fe6900900000000) + +struct entry +{ + v_f64_t invc; + v_f64_t logc; +}; + +static inline struct entry +lookup (v_u64_t i) +{ + struct entry e; +#ifdef SCALAR + e.invc = __v_log_data[i].invc; + e.logc = __v_log_data[i].logc; +#else + e.invc[0] = __v_log_data[i[0]].invc; + e.logc[0] = __v_log_data[i[0]].logc; + e.invc[1] = __v_log_data[i[1]].invc; + e.logc[1] = __v_log_data[i[1]].logc; +#endif + return e; +} + +VPCS_ATTR +__attribute__ ((noinline)) static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +{ + return v_call_f64 (log, x, y, cmp); +} + +VPCS_ATTR +v_f64_t +V_NAME(log) (v_f64_t x) +{ + v_f64_t z, r, r2, p, y, kd, hi; + v_u64_t ix, iz, tmp, top, i, cmp; + v_s64_t k; + struct entry e; + + ix = v_as_u64_f64 (x); + top = ix >> 48; + cmp = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010)); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (52 - V_LOG_TABLE_BITS)) % N; + k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift */ + iz = ix - (tmp & v_u64 (0xfffULL << 52)); + z = v_as_f64_u64 (iz); + e = lookup (i); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. 
*/ + r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); + kd = v_to_f64_s64 (k); + + /* hi = r + log(c) + k*Ln2. */ + hi = v_fma_f64 (kd, Ln2, e.logc + r); + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + r2 = r * r; + y = v_fma_f64 (A3, r, A2); + p = v_fma_f64 (A1, r, A0); + y = v_fma_f64 (A4, r2, y); + y = v_fma_f64 (y, r2, p); + y = v_fma_f64 (y, r2, hi); + + if (unlikely (v_any_u64 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS +#endif diff --git a/math/v_log.h b/math/v_log.h new file mode 100644 index 0000000..bcc2fa6 --- /dev/null +++ b/math/v_log.h @@ -0,0 +1,18 @@ +/* + * Declarations for double-precision log(x) vector function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "v_math.h" +#if WANT_VMATH + +#define V_LOG_TABLE_BITS 7 + +extern const struct v_log_data +{ + f64_t invc; + f64_t logc; +} __v_log_data[1 << V_LOG_TABLE_BITS] HIDDEN; +#endif diff --git a/math/v_log_data.c b/math/v_log_data.c new file mode 100644 index 0000000..97ee5b0 --- /dev/null +++ b/math/v_log_data.c @@ -0,0 +1,158 @@ +/* + * Lookup table for double-precision log(x) vector function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "v_log.h" +#if WANT_VMATH + +#define N (1 << V_LOG_TABLE_BITS) + +/* Algorithm: + + x = 2^k z + log(x) = k ln2 + log(c) + poly(z/c - 1) + +where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,N=128) +and log(c) and 1/c for the ith subinterval comes from a lookup table: + + tab[i].invc = 1/c + tab[i].logc = (double)log(c) + +where c is near the center of the subinterval and is chosen by trying several +floating point invc candidates around 1/center and selecting one for which +the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval +that contains 1 and the previous one got tweaked to avoid cancellation. 
*/ +const struct v_log_data __v_log_data[N] = { +{0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2}, +{0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2}, +{0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2}, +{0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2}, +{0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2}, +{0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2}, +{0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2}, +{0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2}, +{0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2}, +{0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2}, +{0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2}, +{0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2}, +{0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2}, +{0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2}, +{0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2}, +{0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2}, +{0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2}, +{0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2}, +{0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2}, +{0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3}, +{0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3}, +{0x1.446f12b278001p+0, -0x1.e52e160484698p-3}, +{0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3}, +{0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3}, +{0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3}, +{0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3}, +{0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3}, +{0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3}, +{0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3}, +{0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3}, +{0x1.36987540fbf53p+0, -0x1.8be843d796044p-3}, +{0x1.352166b648f61p+0, -0x1.82395ecc477edp-3}, +{0x1.33adddb3eb575p+0, -0x1.7896240966422p-3}, +{0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3}, +{0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3}, +{0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3}, +{0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3}, +{0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3}, +{0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3}, +{0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3}, +{0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3}, +{0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3}, +{0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3}, +{0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3}, +{0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3}, +{0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4}, +{0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4}, +{0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4}, +{0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4}, +{0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4}, +{0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4}, +{0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4}, +{0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4}, +{0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4}, +{0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4}, +{0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4}, +{0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4}, +{0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4}, +{0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4}, +{0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4}, +{0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5}, +{0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5}, +{0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5}, +{0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5}, +{0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5}, +{0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5}, +{0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5}, +{0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5}, +{0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6}, +{0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6}, +{0x1.05193497a7cc5p+0, -0x1.43183683400acp-6}, +{0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6}, +{0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7}, 
+{0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7}, +{0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9}, +{1.0, 0.0}, +{0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8}, +{0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7}, +{0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6}, +{0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6}, +{0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5}, +{0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5}, +{0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5}, +{0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5}, +{0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4}, +{0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4}, +{0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4}, +{0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4}, +{0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4}, +{0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4}, +{0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4}, +{0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4}, +{0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4}, +{0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3}, +{0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3}, +{0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3}, +{0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3}, +{0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3}, +{0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3}, +{0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3}, +{0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3}, +{0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3}, +{0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3}, +{0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3}, +{0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3}, +{0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3}, +{0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3}, +{0x1.9998e1480b618p-1, 0x1.c903161240163p-3}, +{0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3}, +{0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3}, +{0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3}, +{0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3}, +{0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2}, +{0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2}, +{0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2}, +{0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2}, +{0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2}, +{0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2}, +{0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2}, +{0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2}, +{0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2}, +{0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2}, +{0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2}, +{0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2}, +{0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2}, +{0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2}, +{0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2}, +{0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2}, +}; +#endif diff --git a/math/v_logf.c b/math/v_logf.c new file mode 100644 index 0000000..7373192 --- /dev/null +++ b/math/v_logf.c @@ -0,0 +1,73 @@ +/* + * Single-precision vector log function. + * + * Copyright (c) 2019, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +static const float Poly[] = { + /* 3.34 ulp error */ + -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f, + -0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f, +}; +#define P7 v_f32 (Poly[0]) +#define P6 v_f32 (Poly[1]) +#define P5 v_f32 (Poly[2]) +#define P4 v_f32 (Poly[3]) +#define P3 v_f32 (Poly[4]) +#define P2 v_f32 (Poly[5]) +#define P1 v_f32 (Poly[6]) + +#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218 */ +#define Min v_u32 (0x00800000) +#define Max v_u32 (0x7f800000) +#define Mask v_u32 (0x007fffff) +#define Off v_u32 (0x3f2aaaab) /* 0.666667 */ + +VPCS_ATTR +__attribute__ ((noinline)) static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + /* Fall back to scalar code. */ + return v_call_f32 (logf, x, y, cmp); +} + +VPCS_ATTR +v_f32_t +V_NAME(logf) (v_f32_t x) +{ + v_f32_t n, p, q, r, r2, y; + v_u32_t u, cmp; + + u = v_as_u32_f32 (x); + cmp = v_cond_u32 (u - Min >= Max - Min); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3 */ + u -= Off; + n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend */ + u &= Mask; + u += Off; + r = v_as_f32_u32 (u) - v_f32 (1.0f); + + /* y = log(1+r) + n*ln2. */ + r2 = r * r; + /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ + p = v_fma_f32 (P6, r, P5); + q = v_fma_f32 (P4, r, P3); + y = v_fma_f32 (P2, r, P1); + p = v_fma_f32 (P7, r2, p); + q = v_fma_f32 (p, r2, q); + y = v_fma_f32 (q, r2, y); + p = v_fma_f32 (Ln2, n, r); + y = v_fma_f32 (y, r2, p); + + if (unlikely (v_any_u32 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS +#endif diff --git a/math/v_math.h b/math/v_math.h new file mode 100644 index 0000000..f2cc467 --- /dev/null +++ b/math/v_math.h @@ -0,0 +1,641 @@ +/* + * Vector math abstractions. + * + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#ifndef _V_MATH_H +#define _V_MATH_H + +#ifndef WANT_VMATH +/* Enable the build of vector math code. */ +# define WANT_VMATH 1 +#endif +#if WANT_VMATH + +/* The goal of this header is to allow vector and scalar + build of the same algorithm, the provided intrinsic + wrappers are also vector length agnostic so they can + be implemented for SVE too (or other simd architectures) + and then the code should work on those targets too. */ + +#if SCALAR +#define V_NAME(x) __s_##x +#elif VPCS && __aarch64__ +#define V_NAME(x) __vn_##x +#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) +#else +#define V_NAME(x) __v_##x +#endif + +#ifndef VPCS_ATTR +#define VPCS_ATTR +#endif +#ifndef VPCS_ALIAS +#define VPCS_ALIAS +#endif + +#include +#include "math_config.h" + +typedef float f32_t; +typedef uint32_t u32_t; +typedef int32_t s32_t; +typedef double f64_t; +typedef uint64_t u64_t; +typedef int64_t s64_t; + +/* reinterpret as type1 from type2. 
*/ +static inline u32_t +as_u32_f32 (f32_t x) +{ + union { f32_t f; u32_t u; } r = {x}; + return r.u; +} +static inline f32_t +as_f32_u32 (u32_t x) +{ + union { u32_t u; f32_t f; } r = {x}; + return r.f; +} +static inline s32_t +as_s32_u32 (u32_t x) +{ + union { u32_t u; s32_t i; } r = {x}; + return r.i; +} +static inline u32_t +as_u32_s32 (s32_t x) +{ + union { s32_t i; u32_t u; } r = {x}; + return r.u; +} +static inline u64_t +as_u64_f64 (f64_t x) +{ + union { f64_t f; u64_t u; } r = {x}; + return r.u; +} +static inline f64_t +as_f64_u64 (u64_t x) +{ + union { u64_t u; f64_t f; } r = {x}; + return r.f; +} +static inline s64_t +as_s64_u64 (u64_t x) +{ + union { u64_t u; s64_t i; } r = {x}; + return r.i; +} +static inline u64_t +as_u64_s64 (s64_t x) +{ + union { s64_t i; u64_t u; } r = {x}; + return r.u; +} + +#if SCALAR +#define V_SUPPORTED 1 +typedef f32_t v_f32_t; +typedef u32_t v_u32_t; +typedef s32_t v_s32_t; +typedef f64_t v_f64_t; +typedef u64_t v_u64_t; +typedef s64_t v_s64_t; + +static inline int +v_lanes32 (void) +{ + return 1; +} + +static inline v_f32_t +v_f32 (f32_t x) +{ + return x; +} +static inline v_u32_t +v_u32 (u32_t x) +{ + return x; +} +static inline v_s32_t +v_s32 (s32_t x) +{ + return x; +} + +static inline f32_t +v_get_f32 (v_f32_t x, int i) +{ + return x; +} +static inline u32_t +v_get_u32 (v_u32_t x, int i) +{ + return x; +} +static inline s32_t +v_get_s32 (v_s32_t x, int i) +{ + return x; +} + +static inline void +v_set_f32 (v_f32_t *x, int i, f32_t v) +{ + *x = v; +} +static inline void +v_set_u32 (v_u32_t *x, int i, u32_t v) +{ + *x = v; +} +static inline void +v_set_s32 (v_s32_t *x, int i, s32_t v) +{ + *x = v; +} + +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u32 (v_u32_t x) +{ + return x != 0; +} +/* to wrap the result of relational operators. */ +static inline v_u32_t +v_cond_u32 (v_u32_t x) +{ + return x ? -1 : 0; +} +static inline v_f32_t +v_abs_f32 (v_f32_t x) +{ + return __builtin_fabsf (x); +} +static inline v_f32_t +v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) +{ + return __builtin_fmaf (x, y, z); +} +static inline v_f32_t +v_round_f32 (v_f32_t x) +{ + return __builtin_roundf (x); +} +static inline v_s32_t +v_round_s32 (v_f32_t x) +{ + return __builtin_lroundf (x); /* relies on -fno-math-errno. */ +} +/* convert to type1 from type2. */ +static inline v_f32_t +v_to_f32_s32 (v_s32_t x) +{ + return x; +} +static inline v_f32_t +v_to_f32_u32 (v_u32_t x) +{ + return x; +} +/* reinterpret as type1 from type2. 
*/ +static inline v_u32_t +v_as_u32_f32 (v_f32_t x) +{ + union { v_f32_t f; v_u32_t u; } r = {x}; + return r.u; +} +static inline v_f32_t +v_as_f32_u32 (v_u32_t x) +{ + union { v_u32_t u; v_f32_t f; } r = {x}; + return r.f; +} +static inline v_s32_t +v_as_s32_u32 (v_u32_t x) +{ + union { v_u32_t u; v_s32_t i; } r = {x}; + return r.i; +} +static inline v_u32_t +v_as_u32_s32 (v_s32_t x) +{ + union { v_s32_t i; v_u32_t u; } r = {x}; + return r.u; +} +static inline v_f32_t +v_lookup_f32 (const f32_t *tab, v_u32_t idx) +{ + return tab[idx]; +} +static inline v_u32_t +v_lookup_u32 (const u32_t *tab, v_u32_t idx) +{ + return tab[idx]; +} +static inline v_f32_t +v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) +{ + return f (x); +} +static inline v_f32_t +v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, + v_u32_t p) +{ + return f (x1, x2); +} + +static inline int +v_lanes64 (void) +{ + return 1; +} +static inline v_f64_t +v_f64 (f64_t x) +{ + return x; +} +static inline v_u64_t +v_u64 (u64_t x) +{ + return x; +} +static inline v_s64_t +v_s64 (s64_t x) +{ + return x; +} +static inline f64_t +v_get_f64 (v_f64_t x, int i) +{ + return x; +} +static inline void +v_set_f64 (v_f64_t *x, int i, f64_t v) +{ + *x = v; +} +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u64 (v_u64_t x) +{ + return x != 0; +} +/* to wrap the result of relational operators. */ +static inline v_u64_t +v_cond_u64 (v_u64_t x) +{ + return x ? -1 : 0; +} +static inline v_f64_t +v_abs_f64 (v_f64_t x) +{ + return __builtin_fabs (x); +} +static inline v_f64_t +v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) +{ + return __builtin_fma (x, y, z); +} +static inline v_f64_t +v_round_f64 (v_f64_t x) +{ + return __builtin_round (x); +} +static inline v_s64_t +v_round_s64 (v_f64_t x) +{ + return __builtin_lround (x); /* relies on -fno-math-errno. */ +} +/* convert to type1 from type2. */ +static inline v_f64_t +v_to_f64_s64 (v_s64_t x) +{ + return x; +} +static inline v_f64_t +v_to_f64_u64 (v_u64_t x) +{ + return x; +} +/* reinterpret as type1 from type2. 
*/ +static inline v_u64_t +v_as_u64_f64 (v_f64_t x) +{ + union { v_f64_t f; v_u64_t u; } r = {x}; + return r.u; +} +static inline v_f64_t +v_as_f64_u64 (v_u64_t x) +{ + union { v_u64_t u; v_f64_t f; } r = {x}; + return r.f; +} +static inline v_s64_t +v_as_s64_u64 (v_u64_t x) +{ + union { v_u64_t u; v_s64_t i; } r = {x}; + return r.i; +} +static inline v_u64_t +v_as_u64_s64 (v_s64_t x) +{ + union { v_s64_t i; v_u64_t u; } r = {x}; + return r.u; +} +static inline v_f64_t +v_lookup_f64 (const f64_t *tab, v_u64_t idx) +{ + return tab[idx]; +} +static inline v_u64_t +v_lookup_u64 (const u64_t *tab, v_u64_t idx) +{ + return tab[idx]; +} +static inline v_f64_t +v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) +{ + return f (x); +} + +#elif __aarch64__ +#define V_SUPPORTED 1 +#include +typedef float32x4_t v_f32_t; +typedef uint32x4_t v_u32_t; +typedef int32x4_t v_s32_t; +typedef float64x2_t v_f64_t; +typedef uint64x2_t v_u64_t; +typedef int64x2_t v_s64_t; + +static inline int +v_lanes32 (void) +{ + return 4; +} + +static inline v_f32_t +v_f32 (f32_t x) +{ + return (v_f32_t){x, x, x, x}; +} +static inline v_u32_t +v_u32 (u32_t x) +{ + return (v_u32_t){x, x, x, x}; +} +static inline v_s32_t +v_s32 (s32_t x) +{ + return (v_s32_t){x, x, x, x}; +} + +static inline f32_t +v_get_f32 (v_f32_t x, int i) +{ + return x[i]; +} +static inline u32_t +v_get_u32 (v_u32_t x, int i) +{ + return x[i]; +} +static inline s32_t +v_get_s32 (v_s32_t x, int i) +{ + return x[i]; +} + +static inline void +v_set_f32 (v_f32_t *x, int i, f32_t v) +{ + (*x)[i] = v; +} +static inline void +v_set_u32 (v_u32_t *x, int i, u32_t v) +{ + (*x)[i] = v; +} +static inline void +v_set_s32 (v_s32_t *x, int i, s32_t v) +{ + (*x)[i] = v; +} + +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u32 (v_u32_t x) +{ + /* assume elements in x are either 0 or -1u. */ + return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; +} +/* to wrap the result of relational operators. */ +static inline v_u32_t +v_cond_u32 (v_u32_t x) +{ + return x; +} +static inline v_f32_t +v_abs_f32 (v_f32_t x) +{ + return vabsq_f32 (x); +} +static inline v_f32_t +v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) +{ + return vfmaq_f32 (z, x, y); +} +static inline v_f32_t +v_round_f32 (v_f32_t x) +{ + return vrndaq_f32 (x); +} +static inline v_s32_t +v_round_s32 (v_f32_t x) +{ + return vcvtaq_s32_f32 (x); +} +/* convert to type1 from type2. */ +static inline v_f32_t +v_to_f32_s32 (v_s32_t x) +{ + return (v_f32_t){x[0], x[1], x[2], x[3]}; +} +static inline v_f32_t +v_to_f32_u32 (v_u32_t x) +{ + return (v_f32_t){x[0], x[1], x[2], x[3]}; +} +/* reinterpret as type1 from type2. */ +static inline v_u32_t +v_as_u32_f32 (v_f32_t x) +{ + union { v_f32_t f; v_u32_t u; } r = {x}; + return r.u; +} +static inline v_f32_t +v_as_f32_u32 (v_u32_t x) +{ + union { v_u32_t u; v_f32_t f; } r = {x}; + return r.f; +} +static inline v_s32_t +v_as_s32_u32 (v_u32_t x) +{ + union { v_u32_t u; v_s32_t i; } r = {x}; + return r.i; +} +static inline v_u32_t +v_as_u32_s32 (v_s32_t x) +{ + union { v_s32_t i; v_u32_t u; } r = {x}; + return r.u; +} +static inline v_f32_t +v_lookup_f32 (const f32_t *tab, v_u32_t idx) +{ + return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; +} +static inline v_u32_t +v_lookup_u32 (const u32_t *tab, v_u32_t idx) +{ + return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; +} +static inline v_f32_t +v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) +{ + return (v_f32_t){p[0] ? 
f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], + p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; +} +static inline v_f32_t +v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, + v_u32_t p) +{ + return ( + v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1], + p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]}; +} + +static inline int +v_lanes64 (void) +{ + return 2; +} +static inline v_f64_t +v_f64 (f64_t x) +{ + return (v_f64_t){x, x}; +} +static inline v_u64_t +v_u64 (u64_t x) +{ + return (v_u64_t){x, x}; +} +static inline v_s64_t +v_s64 (s64_t x) +{ + return (v_s64_t){x, x}; +} +static inline f64_t +v_get_f64 (v_f64_t x, int i) +{ + return x[i]; +} +static inline void +v_set_f64 (v_f64_t *x, int i, f64_t v) +{ + (*x)[i] = v; +} +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u64 (v_u64_t x) +{ + /* assume elements in x are either 0 or -1u. */ + return vpaddd_u64 (x) != 0; +} +/* to wrap the result of relational operators. */ +static inline v_u64_t +v_cond_u64 (v_u64_t x) +{ + return x; +} +static inline v_f64_t +v_abs_f64 (v_f64_t x) +{ + return vabsq_f64 (x); +} +static inline v_f64_t +v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) +{ + return vfmaq_f64 (z, x, y); +} +static inline v_f64_t +v_round_f64 (v_f64_t x) +{ + return vrndaq_f64 (x); +} +static inline v_s64_t +v_round_s64 (v_f64_t x) +{ + return vcvtaq_s64_f64 (x); +} +/* convert to type1 from type2. */ +static inline v_f64_t +v_to_f64_s64 (v_s64_t x) +{ + return (v_f64_t){x[0], x[1]}; +} +static inline v_f64_t +v_to_f64_u64 (v_u64_t x) +{ + return (v_f64_t){x[0], x[1]}; +} +/* reinterpret as type1 from type2. */ +static inline v_u64_t +v_as_u64_f64 (v_f64_t x) +{ + union { v_f64_t f; v_u64_t u; } r = {x}; + return r.u; +} +static inline v_f64_t +v_as_f64_u64 (v_u64_t x) +{ + union { v_u64_t u; v_f64_t f; } r = {x}; + return r.f; +} +static inline v_s64_t +v_as_s64_u64 (v_u64_t x) +{ + union { v_u64_t u; v_s64_t i; } r = {x}; + return r.i; +} +static inline v_u64_t +v_as_u64_s64 (v_s64_t x) +{ + union { v_s64_t i; v_u64_t u; } r = {x}; + return r.u; +} +static inline v_f64_t +v_lookup_f64 (const f64_t *tab, v_u64_t idx) +{ + return (v_f64_t){tab[idx[0]], tab[idx[1]]}; +} +static inline v_u64_t +v_lookup_u64 (const u64_t *tab, v_u64_t idx) +{ + return (v_u64_t){tab[idx[0]], tab[idx[1]]}; +} +static inline v_f64_t +v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) +{ + return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]}; +} +#endif + +#endif +#endif diff --git a/math/aarch64/v_pow.c b/math/v_pow.c similarity index 35% rename from math/aarch64/v_pow.c rename to math/v_pow.c index 734f166..a209d57 100644 --- a/math/aarch64/v_pow.c +++ b/math/v_pow.c @@ -1,22 +1,27 @@ /* * Double-precision vector pow function. * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT */ #include "mathlib.h" #include "v_math.h" +#if V_SUPPORTED -float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) +VPCS_ATTR +v_f64_t +V_NAME(pow) (v_f64_t x, v_f64_t y) { - float64x2_t z; + v_f64_t z; for (int lane = 0; lane < v_lanes64 (); lane++) { - double sx = x[lane]; - double sy = y[lane]; - double sz = pow (sx, sy); - z[lane] = sz; + f64_t sx = v_get_f64 (x, lane); + f64_t sy = v_get_f64 (y, lane); + f64_t sz = pow (sx, sy); + v_set_f64 (&z, lane, sz); } return z; } +VPCS_ALIAS +#endif diff --git a/math/v_powf.c b/math/v_powf.c new file mode 100644 index 0000000..fb80fa6 --- /dev/null +++ b/math/v_powf.c @@ -0,0 +1,235 @@ +/* + * Single-precision vector powf function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +#define Min v_u32 (0x00800000) +#define Max v_u32 (0x7f800000) +#define SBITS 5 +#define Tlog v__powf_log2_data.tab +#define Texp v__exp2f_data.tab +#define A v__powf_log2_data.poly +#define C v__exp2f_data.poly +#define LOGDEG 4 + +#if LOGDEG == 5 +/* 1.01 ulp */ +#define OFF v_u32 (0x3f330000) +#define TBITS 4 +#elif LOGDEG == 4 +/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2) */ +#define OFF v_u32 (0x3f35d000) +#define TBITS 5 +#endif + +#define V_EXP2F_TABLE_BITS SBITS +#define V_EXP2F_POLY_ORDER 3 +struct v_exp2f_data +{ + uint64_t tab[1 << V_EXP2F_TABLE_BITS]; + double poly[V_EXP2F_POLY_ORDER]; +}; + +#define V_POWF_LOG2_TABLE_BITS TBITS +#define V_POWF_LOG2_POLY_ORDER LOGDEG +#define SCALE ((double) (1 << SBITS)) +struct v_powf_log2_data +{ + struct + { + double invc, logc; + } tab[1 << V_POWF_LOG2_TABLE_BITS]; + double poly[V_POWF_LOG2_POLY_ORDER]; +}; + +static const struct v_powf_log2_data v__powf_log2_data = { +#if LOGDEG == 5 + .tab = { +{ 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 * SCALE }, +{ 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 * SCALE }, +{ 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 * SCALE }, +{ 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 * SCALE }, +{ 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 * SCALE }, +{ 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 * SCALE }, +{ 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 * SCALE }, +{ 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 * SCALE }, +{ 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 * SCALE }, +{ 0x1p+0, 0x0p+0 * SCALE }, +{ 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 * SCALE }, +{ 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 * SCALE }, +{ 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 * SCALE }, +{ 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 * SCALE }, +{ 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 * SCALE }, +{ 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 * SCALE }, + }, +/* rel err: 1.46 * 2^-32 */ + .poly = { +0x1.27616c9496e0bp-2 * SCALE, -0x1.71969a075c67ap-2 * SCALE, +0x1.ec70a6ca7baddp-2 * SCALE, -0x1.7154748bef6c8p-1 * SCALE, +0x1.71547652ab82bp0 * SCALE, + } +#elif LOGDEG == 4 + .tab = { +{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * SCALE}, +{0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * SCALE}, +{0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * SCALE}, +{0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * SCALE}, +{0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * SCALE}, +{0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * SCALE}, +{0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * SCALE}, +{0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * SCALE}, +{0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * SCALE}, +{0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * SCALE}, 
+{0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * SCALE}, +{0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * SCALE}, +{0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * SCALE}, +{0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * SCALE}, +{0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * SCALE}, +{0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * SCALE}, +{0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * SCALE}, +{0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * SCALE}, +{0x1p+0, 0x0p+0 * SCALE}, +{0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * SCALE}, +{0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * SCALE}, +{0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * SCALE}, +{0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * SCALE}, +{0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * SCALE}, +{0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * SCALE}, +{0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * SCALE}, +{0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * SCALE}, +{0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * SCALE}, +{0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * SCALE}, +{0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * SCALE}, +{0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * SCALE}, +{0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * SCALE}, + }, +/* rel err: 1.5 * 2^-30 */ + .poly = { + -0x1.6ff5daa3b3d7cp-2 * SCALE, + 0x1.ec81d03c01aebp-2 * SCALE, + -0x1.71547bb43f101p-1 * SCALE, + 0x1.7154764a815cbp0 * SCALE, + } +#endif +}; + +static const struct v_exp2f_data v__exp2f_data = { + .tab = { +0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51, +0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1, +0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, +0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585, +0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13, +0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, +0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069, +0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540, + }, +/* rel err: 1.69 * 2^-34 */ + .poly = { +0x1.c6af84b912394p-5/SCALE/SCALE/SCALE, 0x1.ebfce50fac4f3p-3/SCALE/SCALE, 0x1.62e42ff0c52d6p-1/SCALE + }, +}; + +VPCS_ATTR +__attribute__ ((noinline)) static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_f32_t ret, v_u32_t cmp) +{ + return v_call2_f32 (powf, x, y, ret, cmp); +} + +VPCS_ATTR +v_f32_t +V_NAME(powf) (v_f32_t x, v_f32_t y) +{ + v_u32_t u, tmp, cmp, i, top, iz; + v_s32_t k; + v_f32_t ret; + + u = v_as_u32_f32 (x); + cmp = v_cond_u32 (u - Min >= Max - Min); + tmp = u - OFF; + i = (tmp >> (23 - TBITS)) % (1 << TBITS); + top = tmp & 0xff800000; + iz = u - top; + k = v_as_s32_u32 (top) >> (23 - SBITS); /* arithmetic shift */ + + for (int lane = 0; lane < v_lanes32 (); lane++) + { + uint32_t si, siz; + int32_t sk; + float sy; + + /* Use double precision for each lane. */ + double invc, logc, z, r, p, y0, logx, ylogx, kd, s; + uint64_t ki, t; + + si = v_get_u32 (i, lane); + siz = v_get_u32 (iz, lane); + sk = v_get_s32 (k, lane); + sy = v_get_f32 (y, lane); + + invc = Tlog[si].invc; + logc = Tlog[si].logc; + z = (double) as_f32_u32 (siz); + + /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */ + r = __builtin_fma (z, invc, -1.0); + y0 = logc + (double) sk; + + /* Polynomial to approximate log1p(r)/ln2. 
*/ +#if LOGDEG == 5 + logx = A[0]; + logx = r * logx + A[1]; + logx = r * logx + A[2]; + logx = r * logx + A[3]; + logx = r * logx + A[4]; + logx = r * logx + y0; +#elif LOGDEG == 4 + logx = A[0]; + logx = r * logx + A[1]; + logx = r * logx + A[2]; + logx = r * logx + A[3]; + logx = r * logx + y0; +#endif + ylogx = sy * logx; + v_set_u32 (&cmp, lane, + (as_u64_f64 (ylogx) >> 47 & 0xffff) + >= as_u64_f64 (126.0 * (1 << SBITS)) >> 47 + ? 1 + : v_get_u32 (cmp, lane)); + + /* N*x = k + r with r in [-1/2, 1/2] */ +#if TOINT_INTRINSICS + kd = roundtoint (ylogx); /* k */ + ki = converttoint (ylogx); +#else +# define SHIFT 0x1.8p52 + kd = eval_as_double (ylogx + SHIFT); + ki = asuint64 (kd); + kd -= SHIFT; +#endif + r = ylogx - kd; + + /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ + t = Texp[ki % (1 << SBITS)]; + t += ki << (52 - SBITS); + s = as_f64_u64 (t); + p = C[0]; + p = __builtin_fma (p, r, C[1]); + p = __builtin_fma (p, r, C[2]); + p = __builtin_fma (p, s * r, s); + + v_set_f32 (&ret, lane, p); + } + if (unlikely (v_any_u32 (cmp))) + return specialcase (x, y, ret, cmp); + return ret; +} +VPCS_ALIAS +#endif diff --git a/math/v_sin.c b/math/v_sin.c new file mode 100644 index 0000000..2b9ed05 --- /dev/null +++ b/math/v_sin.c @@ -0,0 +1,86 @@ +/* + * Double-precision vector sin function. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +static const double Poly[] = { +/* worst-case error is 3.5 ulp. + abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. */ +-0x1.9f4a9c8b21dc9p-41, + 0x1.60e88a10163f2p-33, +-0x1.ae6361b7254e7p-26, + 0x1.71de382e8d62bp-19, +-0x1.a01a019aeb4ffp-13, + 0x1.111111110b25ep-7, +-0x1.55555555554c3p-3, +}; + +#define C7 v_f64 (Poly[0]) +#define C6 v_f64 (Poly[1]) +#define C5 v_f64 (Poly[2]) +#define C4 v_f64 (Poly[3]) +#define C3 v_f64 (Poly[4]) +#define C2 v_f64 (Poly[5]) +#define C1 v_f64 (Poly[6]) + +#define InvPi v_f64 (0x1.45f306dc9c883p-2) +#define Pi1 v_f64 (0x1.921fb54442d18p+1) +#define Pi2 v_f64 (0x1.1a62633145c06p-53) +#define Pi3 v_f64 (0x1.c1cd129024e09p-106) +#define Shift v_f64 (0x1.8p52) +#define RangeVal v_f64 (0x1p23) +#define AbsMask v_u64 (0x7fffffffffffffff) + +VPCS_ATTR +__attribute__ ((noinline)) static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +{ + return v_call_f64 (sin, x, y, cmp); +} + +VPCS_ATTR +v_f64_t +V_NAME(sin) (v_f64_t x) +{ + v_f64_t n, r, r2, y; + v_u64_t sign, odd, cmp; + + r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask); + sign = v_as_u64_f64 (x) & ~AbsMask; + cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal)); + + /* n = rint(|x|/pi). */ + n = v_fma_f64 (InvPi, r, Shift); + odd = v_as_u64_f64 (n) << 63; + n -= Shift; + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = v_fma_f64 (-Pi1, n, r); + r = v_fma_f64 (-Pi2, n, r); + r = v_fma_f64 (-Pi3, n, r); + + /* sin(r) poly approx. */ + r2 = r * r; + y = v_fma_f64 (C7, r2, C6); + y = v_fma_f64 (y, r2, C5); + y = v_fma_f64 (y, r2, C4); + y = v_fma_f64 (y, r2, C3); + y = v_fma_f64 (y, r2, C2); + y = v_fma_f64 (y, r2, C1); + y = v_fma_f64 (y * r2, r, r); + + /* sign. */ + y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign ^ odd); + + if (unlikely (v_any_u64 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS +#endif diff --git a/math/v_sinf.c b/math/v_sinf.c new file mode 100644 index 0000000..e66bfce --- /dev/null +++ b/math/v_sinf.c @@ -0,0 +1,75 @@ +/* + * Single-precision vector sin function. 
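The per-lane body of __v_powf above is a table-driven version of the usual decomposition powf(x, y) = exp2(y * log2(x)), carried out in double precision so the float result can still meet the quoted 2.6 ulp bound. A reduced model of the fast path, leaning on libm for the two sub-steps that the real code replaces with the Tlog/Texp tables and polynomials (illustrative only, and only valid for the inputs the fast path accepts, i.e. positive normal x without overflow in the result):

#include <math.h>

/* Model of the __v_powf fast path: the real code computes log2(x) from the
   Tlog table plus a short polynomial, and exp2(ylogx) from the Texp table
   plus another polynomial; inputs that fail the range checks go to the
   scalar powf fallback instead.  */
static float
powf_model (float x, float y)
{
  double logx = log2 ((double) x);    /* Tlog lookup + polynomial in the code */
  double ylogx = (double) y * logx;   /* magnitude checked against 126 * 2^SBITS */
  return (float) exp2 (ylogx);        /* Texp lookup + polynomial in the code */
}

The SCALE factor baked into the tables keeps ylogx in units of 1/2^SBITS, so rounding it to an integer directly yields the Texp index plus the exponent increment; the Texp entries appear to be asuint64(2^(i/32)) with i << 47 subtracted, which is why the code can simply add ki << (52 - SBITS) and reinterpret the sum as the scale factor s.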
+ * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +static const float Poly[] = { + /* 1.886 ulp error */ + 0x1.5b2e76p-19f, + -0x1.9f42eap-13f, + 0x1.110df4p-7f, + -0x1.555548p-3f, +}; +#define Pi1 v_f32 (0x1.921fb6p+1f) +#define Pi2 v_f32 (-0x1.777a5cp-24f) +#define Pi3 v_f32 (-0x1.ee59dap-49f) +#define A3 v_f32 (Poly[3]) +#define A5 v_f32 (Poly[2]) +#define A7 v_f32 (Poly[1]) +#define A9 v_f32 (Poly[0]) +#define RangeVal v_f32 (0x1p20f) +#define InvPi v_f32 (0x1.45f306p-2f) +#define Shift v_f32 (0x1.8p+23f) +#define AbsMask v_u32 (0x7fffffff) + +VPCS_ATTR +static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + /* Fall back to scalar code. */ + return v_call_f32 (sinf, x, y, cmp); +} + +VPCS_ATTR +v_f32_t +V_NAME(sinf) (v_f32_t x) +{ + v_f32_t n, r, r2, y; + v_u32_t sign, odd, cmp; + + r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask); + sign = v_as_u32_f32 (x) & ~AbsMask; + cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal)); + + /* n = rint(|x|/pi) */ + n = v_fma_f32 (InvPi, r, Shift); + odd = v_as_u32_f32 (n) << 31; + n -= Shift; + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ + r = v_fma_f32 (-Pi1, n, r); + r = v_fma_f32 (-Pi2, n, r); + r = v_fma_f32 (-Pi3, n, r); + + /* y = sin(r) */ + r2 = r * r; + y = v_fma_f32 (A9, r2, A7); + y = v_fma_f32 (y, r2, A5); + y = v_fma_f32 (y, r2, A3); + y = v_fma_f32 (y * r2, r, r); + + /* sign fix */ + y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign ^ odd); + + if (unlikely (v_any_u32 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS +#endif diff --git a/math/vn_cos.c b/math/vn_cos.c new file mode 100644 index 0000000..b57a549 --- /dev/null +++ b/math/vn_cos.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_cos. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_cos, _ZGVnN2v_cos) +#include "v_cos.c" +#endif diff --git a/math/vn_cosf.c b/math/vn_cosf.c new file mode 100644 index 0000000..6321d46 --- /dev/null +++ b/math/vn_cosf.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_cosf. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_cosf, _ZGVnN4v_cosf) +#include "v_cosf.c" +#endif diff --git a/math/vn_exp.c b/math/vn_exp.c new file mode 100644 index 0000000..06e269d --- /dev/null +++ b/math/vn_exp.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_exp. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_exp, _ZGVnN2v_exp) +#include "v_exp.c" +#endif diff --git a/math/vn_exp2f.c b/math/vn_exp2f.c new file mode 100644 index 0000000..db9707e --- /dev/null +++ b/math/vn_exp2f.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_exp2f. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_exp2f, _ZGVnN4v_exp2f) +#include "v_exp2f.c" +#endif diff --git a/math/vn_exp2f_1u.c b/math/vn_exp2f_1u.c new file mode 100644 index 0000000..17bd0ab --- /dev/null +++ b/math/vn_exp2f_1u.c @@ -0,0 +1,11 @@ +/* + * AdvSIMD vector PCS variant of __v_exp2f_1u. + * + * Copyright (c) 2019, Arm Limited. 
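Both __v_sin above and __v_sinf here use the same shape of argument reduction: fold |x| around the nearest multiple of pi, evaluate an odd polynomial on the remainder, and recover the sign from the sign bit of x and the parity of the multiple. A scalar sketch of that reduction (not the project's scalar sin; the constants are the InvPi and Pi1/Pi2/Pi3 values from __v_sin, and libm's sin stands in for the polynomial):

#include <math.h>

static double
sin_model (double x)
{
  if (!(fabs (x) < 0x1p52))                      /* like RangeVal: punt on huge/NaN */
    return sin (x);

  double r = fabs (x);
  double n = round (r * 0x1.45f306dc9c883p-2);   /* n = nearest multiple of pi */

  /* Subtract n*pi in three pieces so the remainder stays accurate even
     when the leading digits cancel.  */
  r = fma (-0x1.921fb54442d18p+1, n, r);         /* r -= n * Pi1 */
  r = fma (-0x1.1a62633145c06p-53, n, r);        /* r -= n * Pi2 */
  r = fma (-0x1.c1cd129024e09p-106, n, r);       /* r -= n * Pi3 */

  double core = sin (r);                         /* odd polynomial in the real code */
  double result = ((long long) n & 1) ? -core : core;  /* parity flips the sign */
  return x < 0 ? -result : result;               /* restore the sign of x */
}

The vector code gets the parity for free from the Shift trick: after adding 0x1.8p52 the integer part of n sits in the low mantissa bits, so shifting the raw representation left by 63 isolates the lowest bit of n as a ready-made sign mask to XOR into the result.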
+ * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#include "v_exp2f_1u.c" +#endif diff --git a/math/vn_expf.c b/math/vn_expf.c new file mode 100644 index 0000000..0652907 --- /dev/null +++ b/math/vn_expf.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_expf. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf) +#include "v_expf.c" +#endif diff --git a/math/vn_expf_1u.c b/math/vn_expf_1u.c new file mode 100644 index 0000000..3be7768 --- /dev/null +++ b/math/vn_expf_1u.c @@ -0,0 +1,11 @@ +/* + * AdvSIMD vector PCS variant of __v_expf_1u. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#include "v_expf_1u.c" +#endif diff --git a/math/vn_log.c b/math/vn_log.c new file mode 100644 index 0000000..b58fe8f --- /dev/null +++ b/math/vn_log.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_log, _ZGVnN2v_log) +#include "v_log.c" +#endif diff --git a/math/vn_logf.c b/math/vn_logf.c new file mode 100644 index 0000000..cc5b8ae --- /dev/null +++ b/math/vn_logf.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_logf. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_logf, _ZGVnN4v_logf) +#include "v_logf.c" +#endif diff --git a/math/vn_pow.c b/math/vn_pow.c new file mode 100644 index 0000000..2609501 --- /dev/null +++ b/math/vn_pow.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_pow. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_pow, _ZGVnN2vv_pow) +#include "v_pow.c" +#endif diff --git a/math/vn_powf.c b/math/vn_powf.c new file mode 100644 index 0000000..095d07e --- /dev/null +++ b/math/vn_powf.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_powf. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_powf, _ZGVnN4vv_powf) +#include "v_powf.c" +#endif diff --git a/math/vn_sin.c b/math/vn_sin.c new file mode 100644 index 0000000..905c796 --- /dev/null +++ b/math/vn_sin.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_sin. + * + * Copyright (c) 2019, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_sin, _ZGVnN2v_sin) +#include "v_sin.c" +#endif diff --git a/math/vn_sinf.c b/math/vn_sinf.c new file mode 100644 index 0000000..1214e1a --- /dev/null +++ b/math/vn_sinf.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_sinf. + * + * Copyright (c) 2019, Arm Limited. 
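Each vn_*.c wrapper above rebuilds the corresponding __v_* routine with the vector PCS and publishes it under the AArch64 vector-function-ABI name via strong_alias; in the _ZGVnN4v_sinf form, 'n' selects AdvSIMD, 'N' means unmasked, '4' is the lane count and 'v' marks one vector argument, so the symbol is what a vectorizing compiler looks for when it turns a call to sinf into a vector call. A sketch of the conventional GNU C alias idiom (the macro body shown is the usual form and an assumption here, not a quote of the repo's headers; my_vn_sinf is an illustrative stand-in):

#include <arm_neon.h>

/* Usual GNU C strong-alias idiom: make 'aliasname' a second symbol for
   'name' with the same type.  */
#define strong_alias(name, aliasname) \
  extern __typeof (name) aliasname __attribute__ ((alias (#name)));

/* Stand-in for __vn_sinf: a 4-lane float routine built with the vector PCS.  */
__attribute__ ((aarch64_vector_pcs)) float32x4_t
my_vn_sinf (float32x4_t x)
{
  return x;   /* placeholder body; the real kernel lives in v_sinf.c */
}

/* Publish it under the vector-ABI name, as vn_sinf.c does for __vn_sinf.  */
strong_alias (my_vn_sinf, _ZGVnN4v_sinf)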
+ * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_sinf, _ZGVnN4v_sinf) +#include "v_sinf.c" +#endif diff --git a/networking/Dir.mk b/networking/Dir.mk index 2589e0a..b496103 100644 --- a/networking/Dir.mk +++ b/networking/Dir.mk @@ -1,7 +1,7 @@ # Makefile fragment - requires GNU make # # Copyright (c) 2019-2020, Arm Limited. -# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +# SPDX-License-Identifier: MIT S := $(srcdir)/networking B := build/networking diff --git a/networking/aarch64/chksum_simd.c b/networking/aarch64/chksum_simd.c index 90c00eb..6d5be58 100644 --- a/networking/aarch64/chksum_simd.c +++ b/networking/aarch64/chksum_simd.c @@ -2,7 +2,7 @@ * AArch64-specific checksum implementation using NEON * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "networking.h" diff --git a/networking/arm/chksum_simd.c b/networking/arm/chksum_simd.c index ae08fe5..7f69adf 100644 --- a/networking/arm/chksum_simd.c +++ b/networking/arm/chksum_simd.c @@ -2,7 +2,7 @@ * Armv7-A specific checksum implementation using NEON * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "networking.h" diff --git a/networking/chksum.c b/networking/chksum.c index 329482f..95ce5ba 100644 --- a/networking/chksum.c +++ b/networking/chksum.c @@ -3,7 +3,7 @@ * This sum is often used as a simple checksum in networking. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include "networking.h" diff --git a/networking/chksum_common.h b/networking/chksum_common.h index 16f0f6c..958c8cc 100644 --- a/networking/chksum_common.h +++ b/networking/chksum_common.h @@ -2,7 +2,7 @@ * Common code for checksum implementations * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #ifndef CHKSUM_COMMON_H diff --git a/networking/include/networking.h b/networking/include/networking.h index 297dd4b..a88feff 100644 --- a/networking/include/networking.h +++ b/networking/include/networking.h @@ -2,7 +2,7 @@ * Public API. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ unsigned short __chksum (const void *, unsigned int); diff --git a/networking/test/chksum.c b/networking/test/chksum.c index 239b5b8..41b9812 100644 --- a/networking/test/chksum.c +++ b/networking/test/chksum.c @@ -2,7 +2,7 @@ * Ones' complement checksum test & benchmark * * Copyright (c) 2016-2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #define _GNU_SOURCE diff --git a/string/Dir.mk b/string/Dir.mk index 40ff5ac..cf3453f 100644 --- a/string/Dir.mk +++ b/string/Dir.mk @@ -1,7 +1,7 @@ # Makefile fragment - requires GNU make # # Copyright (c) 2019-2021, Arm Limited. -# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception +# SPDX-License-Identifier: MIT S := $(srcdir)/string B := build/string diff --git a/string/README.contributors b/string/README.contributors deleted file mode 100644 index 0b4a51b..0000000 --- a/string/README.contributors +++ /dev/null @@ -1,30 +0,0 @@ -STYLE REQUIREMENTS -================== - -1. 
Most code in this sub-directory is expected to be upstreamed into glibc so - the GNU Coding Standard and glibc specific conventions should be followed - to ease upstreaming. - -2. ABI and symbols: the code should be written so it is suitable for inclusion - into a libc with minimal changes. This e.g. means that internal symbols - should be hidden and in the implementation reserved namespace according to - ISO C and POSIX rules. If possible the built shared libraries and static - library archives should be usable to override libc symbols at link time (or - at runtime via LD_PRELOAD). This requires the symbols to follow the glibc ABI - (other than symbol versioning), this cannot be done reliably for static - linking so this is a best effort requirement. - -3. API: include headers should be suitable for benchmarking and testing code - and should not conflict with libc headers. - - -CONTRIBUTION GUIDELINES FOR string SUB-DIRECTORY -================================================ -1. Code: - - The assumptions of the code must be clearly documented. - - - Assembly style should be consistent across different implementations. - - -2. Performance: - - Benchmarking is needed on several microarchitectures. diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S index 207e229..84339f7 100644 --- a/string/aarch64/__mtag_tag_region.S +++ b/string/aarch64/__mtag_tag_region.S @@ -1,8 +1,8 @@ /* * __mtag_tag_region - tag memory * - * Copyright (c) 2021-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2021, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -15,7 +15,7 @@ * The memory region may remain untagged if tagging is not enabled. */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_MEMORY_TAGGING diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S index 44b8e01..f58364c 100644 --- a/string/aarch64/__mtag_tag_zero_region.S +++ b/string/aarch64/__mtag_tag_zero_region.S @@ -1,8 +1,8 @@ /* * __mtag_tag_zero_region - tag memory and fill it with zero bytes * - * Copyright (c) 2021-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2021, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -15,7 +15,7 @@ * The memory region may remain untagged if tagging is not enabled. */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_MEMORY_TAGGING diff --git a/string/aarch64/asmdefs.h b/string/aarch64/asmdefs.h deleted file mode 100644 index 131b95e..0000000 --- a/string/aarch64/asmdefs.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Macros for asm code. AArch64 version. - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#ifndef _ASMDEFS_H -#define _ASMDEFS_H - -/* Branch Target Identitication support. */ -#define BTI_C hint 34 -#define BTI_J hint 36 -/* Return address signing support (pac-ret). */ -#define PACIASP hint 25; .cfi_window_save -#define AUTIASP hint 29; .cfi_window_save - -/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ -#define FEATURE_1_AND 0xc0000000 -#define FEATURE_1_BTI 1 -#define FEATURE_1_PAC 2 - -/* Add a NT_GNU_PROPERTY_TYPE_0 note. 
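The property-note machinery in the asmdefs.h deleted below exists because hand-written assembly does not get the BTI/PAC GNU property note that the compiler emits for C objects; without it, a BTI-enabled link treats the whole output as unprotected. For C code the same configuration is visible through the ACLE feature-test macros, so a translation unit can report what -mbranch-protection= gave it; a small sketch (the macro names are the ACLE ones and are an assumption here, nothing this patch defines):

#include <stdio.h>

int
main (void)
{
#ifdef __ARM_FEATURE_BTI_DEFAULT
  puts ("built with BTI landing pads");
#endif
#ifdef __ARM_FEATURE_PAC_DEFAULT
  printf ("built with return-address signing (config %d)\n",
          __ARM_FEATURE_PAC_DEFAULT);
#endif
  return 0;
}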
*/ -#ifdef __ILP32__ -#define GNU_PROPERTY(type, value) \ - .section .note.gnu.property, "a"; \ - .p2align 2; \ - .word 4; \ - .word 12; \ - .word 5; \ - .asciz "GNU"; \ - .word type; \ - .word 4; \ - .word value; \ - .text -#else -#define GNU_PROPERTY(type, value) \ - .section .note.gnu.property, "a"; \ - .p2align 3; \ - .word 4; \ - .word 16; \ - .word 5; \ - .asciz "GNU"; \ - .word type; \ - .word 4; \ - .word value; \ - .word 0; \ - .text -#endif - -/* If set then the GNU Property Note section will be added to - mark objects to support BTI and PAC-RET. */ -#ifndef WANT_GNU_PROPERTY -#define WANT_GNU_PROPERTY 1 -#endif - -#if WANT_GNU_PROPERTY -/* Add property note with supported features to all asm files. */ -GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) -#endif - -#define ENTRY_ALIGN(name, alignment) \ - .global name; \ - .type name,%function; \ - .align alignment; \ - name: \ - .cfi_startproc; \ - BTI_C; - -#define ENTRY(name) ENTRY_ALIGN(name, 6) - -#define ENTRY_ALIAS(name) \ - .global name; \ - .type name,%function; \ - name: - -#define END(name) \ - .cfi_endproc; \ - .size name, .-name; - -#define L(l) .L ## l - -#ifdef __ILP32__ - /* Sanitize padding bits of pointer arguments as per aapcs64 */ -#define PTR_ARG(n) mov w##n, w##n -#else -#define PTR_ARG(n) -#endif - -#ifdef __ILP32__ - /* Sanitize padding bits of size arguments as per aapcs64 */ -#define SIZE_ARG(n) mov w##n, w##n -#else -#define SIZE_ARG(n) -#endif - -/* Compiler supports SVE instructions */ -#ifndef HAVE_SVE -# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5) -# define HAVE_SVE 1 -# else -# define HAVE_SVE 0 -# endif -#endif - -#endif diff --git a/string/aarch64/check-arch.S b/string/aarch64/check-arch.S index 131b7fa..5a54242 100644 --- a/string/aarch64/check-arch.S +++ b/string/aarch64/check-arch.S @@ -1,8 +1,8 @@ /* * check ARCH setting. * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #if !__aarch64__ @@ -10,4 +10,4 @@ #endif /* Include for GNU property notes. */ -#include "asmdefs.h" +#include "../asmdefs.h" diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S index 948c3cb..c2e967d 100644 --- a/string/aarch64/memchr-mte.S +++ b/string/aarch64/memchr-mte.S @@ -1,8 +1,8 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "asmdefs.h" +#include "../asmdefs.h" #define srcin x0 #define chrin w1 @@ -23,21 +23,25 @@ #define synd x5 #define shift x6 #define tmp x7 +#define wtmp w7 #define vrepchr v0 #define qdata q1 #define vdata v1 #define vhas_chr v2 -#define vend v3 -#define dend d3 +#define vrepmask v3 +#define vend v4 +#define dend d4 /* Core algorithm: - For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits - per byte. We take 4 bits of every comparison byte with shift right and narrow - by 4 instruction. Since the bits in the nibble mask reflect the order in - which things occur in the original string, counting leading zeros identifies - exactly which byte matched. */ + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. 
Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ ENTRY (__memchr_aarch64_mte) PTR_ARG (0) @@ -46,53 +50,55 @@ ENTRY (__memchr_aarch64_mte) cbz cntin, L(nomatch) ld1 {vdata.16b}, [src] dup vrepchr.16b, chrin + mov wtmp, 0xf00f + dup vrepmask.8h, wtmp cmeq vhas_chr.16b, vdata.16b, vrepchr.16b lsl shift, srcin, 2 - shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(start_loop) rbit synd, synd clz synd, synd - cmp cntin, synd, lsr 2 add result, srcin, synd, lsr 2 + cmp cntin, synd, lsr 2 csel result, result, xzr, hi ret - .p2align 3 L(start_loop): sub tmp, src, srcin - add tmp, tmp, 17 + add tmp, tmp, 16 subs cntrem, cntin, tmp - b.lo L(nomatch) + b.ls L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - tbz cntrem, 4, L(loop32_2) - sub src, src, 16 + add tmp, cntrem, 15 + tbnz tmp, 4, L(loop32_2) + .p2align 4 L(loop32): - ldr qdata, [src, 32]! + ldr qdata, [src, 16]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src, 16] - cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + ldr qdata, [src, 16]! subs cntrem, cntrem, 32 - b.lo L(end_2) + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + b.ls L(end) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) -L(end_2): - add src, src, 16 L(end): - shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ - sub cntrem, src, srcin + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend - sub cntrem, cntin, cntrem + add tmp, srcin, cntin + sub cntrem, tmp, src #ifndef __AARCH64EB__ rbit synd, synd #endif diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S index b851cf3..c22e659 100644 --- a/string/aarch64/memchr-sve.S +++ b/string/aarch64/memchr-sve.S @@ -1,11 +1,11 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2018-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S index fe6cfe2..353f0d1 100644 --- a/string/aarch64/memchr.S +++ b/string/aarch64/memchr.S @@ -1,8 +1,8 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2014-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2014-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "asmdefs.h" +#include "../asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S index d52ce45..78c5eca 100644 --- a/string/aarch64/memcmp-sve.S +++ b/string/aarch64/memcmp-sve.S @@ -1,11 +1,11 @@ /* * memcmp - compare memory * - * Copyright (c) 2018-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2021, Arm Limited. 
+ * SPDX-License-Identifier: MIT */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S index 35135e7..3b10266 100644 --- a/string/aarch64/memcmp.S +++ b/string/aarch64/memcmp.S @@ -1,84 +1,103 @@ /* memcmp - compare memory * - * Copyright (c) 2013-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2013-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: * - * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. + * ARMv8-a, AArch64, unaligned accesses. */ -#include "asmdefs.h" +#include "../asmdefs.h" -#define src1 x0 -#define src2 x1 -#define limit x2 -#define result w0 - -#define data1 x3 -#define data1w w3 -#define data2 x4 -#define data2w w4 -#define data3 x5 -#define data3w w5 -#define data4 x6 -#define data4w w6 -#define tmp x6 -#define src1end x7 -#define src2end x8 +/* Parameters and result. */ +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result w0 +/* Internal variables. */ +#define data1 x3 +#define data1w w3 +#define data1h x4 +#define data2 x5 +#define data2w w5 +#define data2h x6 +#define tmp1 x7 +#define tmp2 x8 ENTRY (__memcmp_aarch64) PTR_ARG (0) PTR_ARG (1) SIZE_ARG (2) + subs limit, limit, 8 + b.lo L(less8) - cmp limit, 16 - b.lo L(less16) - ldp data1, data3, [src1] - ldp data2, data4, [src2] - ccmp data1, data2, 0, ne - ccmp data3, data4, 0, eq - b.ne L(return2) - - add src1end, src1, limit - add src2end, src2, limit - cmp limit, 32 - b.ls L(last_bytes) - cmp limit, 160 - b.hs L(loop_align) - sub limit, limit, 32 + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + b.ne L(return) - .p2align 4 -L(loop32): - ldp data1, data3, [src1, 16] - ldp data2, data4, [src2, 16] + subs limit, limit, 8 + b.gt L(more16) + + ldr data1, [src1, limit] + ldr data2, [src2, limit] + b L(return) + +L(more16): + ldr data1, [src1], 8 + ldr data2, [src2], 8 cmp data1, data2 - ccmp data3, data4, 0, eq - b.ne L(return2) - cmp limit, 16 + bne L(return) + + /* Jump directly to comparing the last 16 bytes for 32 byte (or less) + strings. */ + subs limit, limit, 16 b.ls L(last_bytes) - ldp data1, data3, [src1, 32] - ldp data2, data4, [src2, 32] + /* We overlap loads between 0-32 bytes at either side of SRC1 when we + try to align, so limit it only to strings larger than 128 bytes. */ + cmp limit, 96 + b.ls L(loop16) + + /* Align src1 and adjust src2 with bytes not yet done. */ + and tmp1, src1, 15 + add limit, limit, tmp1 + sub src1, src1, tmp1 + sub src2, src2, tmp1 + + /* Loop performing 16 bytes per iteration using aligned src1. + Limit is pre-decremented by 16 and must be larger than zero. + Exit if <= 16 bytes left to do or if the data is not equal. */ + .p2align 4 +L(loop16): + ldp data1, data1h, [src1], 16 + ldp data2, data2h, [src2], 16 + subs limit, limit, 16 + ccmp data1, data2, 0, hi + ccmp data1h, data2h, 0, eq + b.eq L(loop16) + + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h cmp data1, data2 - ccmp data3, data4, 0, eq - b.ne L(return2) - add src1, src1, 32 - add src2, src2, 32 -L(last64): - subs limit, limit, 32 - b.hi L(loop32) + bne L(return) /* Compare last 1-16 bytes using unaligned access. 
*/ L(last_bytes): - ldp data1, data3, [src1end, -16] - ldp data2, data4, [src2end, -16] -L(return2): + add src1, src1, limit + add src2, src2, limit + ldp data1, data1h, [src1] + ldp data2, data2h, [src2] + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h cmp data1, data2 - csel data1, data1, data3, ne - csel data2, data2, data4, ne /* Compare data bytes and set return value to 0, -1 or 1. */ L(return): @@ -86,105 +105,33 @@ L(return): rev data1, data1 rev data2, data2 #endif - cmp data1, data2 + cmp data1, data2 +L(ret_eq): cset result, ne cneg result, result, lo ret .p2align 4 -L(less16): - add src1end, src1, limit - add src2end, src2, limit - tbz limit, 3, L(less8) - ldr data1, [src1] - ldr data2, [src2] - ldr data3, [src1end, -8] - ldr data4, [src2end, -8] - b L(return2) - - .p2align 4 + /* Compare up to 8 bytes. Limit is [-8..-1]. */ L(less8): - tbz limit, 2, L(less4) - ldr data1w, [src1] - ldr data2w, [src2] - ldr data3w, [src1end, -4] - ldr data4w, [src2end, -4] - b L(return2) - -L(less4): - tbz limit, 1, L(less2) - ldrh data1w, [src1] - ldrh data2w, [src2] + adds limit, limit, 4 + b.lo L(less4) + ldr data1w, [src1], 4 + ldr data2w, [src2], 4 cmp data1w, data2w b.ne L(return) -L(less2): - mov result, 0 - tbz limit, 0, L(return_zero) - ldrb data1w, [src1end, -1] - ldrb data2w, [src2end, -1] + sub limit, limit, 4 +L(less4): + adds limit, limit, 4 + beq L(ret_eq) +L(byte_loop): + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + subs limit, limit, 1 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ + b.eq L(byte_loop) sub result, data1w, data2w -L(return_zero): - ret - -L(loop_align): - ldp data1, data3, [src1, 16] - ldp data2, data4, [src2, 16] - cmp data1, data2 - ccmp data3, data4, 0, eq - b.ne L(return2) - - /* Align src2 and adjust src1, src2 and limit. */ - and tmp, src2, 15 - sub tmp, tmp, 16 - sub src2, src2, tmp - add limit, limit, tmp - sub src1, src1, tmp - sub limit, limit, 64 + 16 - - .p2align 4 -L(loop64): - ldr q0, [src1, 16] - ldr q1, [src2, 16] - subs limit, limit, 64 - ldr q2, [src1, 32] - ldr q3, [src2, 32] - eor v0.16b, v0.16b, v1.16b - eor v1.16b, v2.16b, v3.16b - ldr q2, [src1, 48] - ldr q3, [src2, 48] - umaxp v0.16b, v0.16b, v1.16b - ldr q4, [src1, 64]! - ldr q5, [src2, 64]! - eor v1.16b, v2.16b, v3.16b - eor v2.16b, v4.16b, v5.16b - umaxp v1.16b, v1.16b, v2.16b - umaxp v0.16b, v0.16b, v1.16b - umaxp v0.16b, v0.16b, v0.16b - fmov tmp, d0 - ccmp tmp, 0, 0, hi - b.eq L(loop64) - - /* If equal, process last 1-64 bytes using scalar loop. */ - add limit, limit, 64 + 16 - cbz tmp, L(last64) - - /* Determine the 8-byte aligned offset of the first difference. */ -#ifdef __AARCH64EB__ - rev16 tmp, tmp -#endif - rev tmp, tmp - clz tmp, tmp - bic tmp, tmp, 7 - sub tmp, tmp, 48 - ldr data1, [src1, tmp] - ldr data2, [src2, tmp] -#ifndef __AARCH64EB__ - rev data1, data1 - rev data2, data2 -#endif - mov result, 1 - cmp data1, data2 - cneg result, result, lo ret END (__memcmp_aarch64) + diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S index e6527d0..f97f2c3 100644 --- a/string/aarch64/memcpy-advsimd.S +++ b/string/aarch64/memcpy-advsimd.S @@ -1,8 +1,8 @@ /* * memcpy - copy memory area * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2020, Arm Limited. 
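Once the restored memcmp above has found a pair of 8-byte words that differ (or has loaded the final bytes), the rev/cmp/cset/cneg sequence turns them into the usual negative/zero/positive result: byte-swapping makes the first differing byte the most significant one, so an unsigned compare of the swapped words has the right sign. The same computation in C, assuming a little-endian host and the GCC/Clang bswap builtin, to match the #ifndef __AARCH64EB__ path (helper name illustrative):

#include <stdint.h>

/* Final-compare step of the word-at-a-time memcmp: data1/data2 are two
   corresponding 8-byte loads.  */
static int
cmp_words_le (uint64_t data1, uint64_t data2)
{
  data1 = __builtin_bswap64 (data1);   /* first byte becomes most significant */
  data2 = __builtin_bswap64 (data2);
  if (data1 == data2)
    return 0;
  return data1 < data2 ? -1 : 1;       /* cset ne / cneg lo in the assembly */
}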
+ * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * */ -#include "asmdefs.h" +#include "../asmdefs.h" #define dstin x0 #define src x1 diff --git a/string/aarch64/memcpy-mops.S b/string/aarch64/memcpy-mops.S deleted file mode 100644 index b45c314..0000000 --- a/string/aarch64/memcpy-mops.S +++ /dev/null @@ -1,21 +0,0 @@ -/* - * memcpy using MOPS extension. - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "asmdefs.h" - -ENTRY (__memcpy_aarch64_mops) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - - mov x3, x0 - .inst 0x19010443 /* cpyfp [x3]!, [x1]!, x2! */ - .inst 0x19410443 /* cpyfm [x3]!, [x1]!, x2! */ - .inst 0x19810443 /* cpyfe [x3]!, [x1]!, x2! */ - ret - -END (__memcpy_aarch64_mops) diff --git a/string/aarch64/memcpy-sve.S b/string/aarch64/memcpy-sve.S deleted file mode 100644 index e8a946d..0000000 --- a/string/aarch64/memcpy-sve.S +++ /dev/null @@ -1,177 +0,0 @@ -/* - * memcpy - copy memory area - * - * Copyright (c) 2019-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -/* Assumptions: - * - * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses. - * - */ - -#include "asmdefs.h" - -#ifdef HAVE_SVE - -.arch armv8-a+sve - -#define dstin x0 -#define src x1 -#define count x2 -#define dst x3 -#define srcend x4 -#define dstend x5 -#define tmp1 x6 -#define vlen x6 - -#define A_q q0 -#define B_q q1 -#define C_q q2 -#define D_q q3 -#define E_q q4 -#define F_q q5 -#define G_q q6 -#define H_q q7 - -/* This implementation handles overlaps and supports both memcpy and memmove - from a single entry point. It uses unaligned accesses and branchless - sequences to keep the code small, simple and improve performance. - SVE vectors are used to speedup small copies. - - Copies are split into 3 main cases: small copies of up to 32 bytes, medium - copies of up to 128 bytes, and large copies. The overhead of the overlap - check is negligible since it is only required for large copies. - - Large copies use a software pipelined loop processing 64 bytes per iteration. - The source pointer is 16-byte aligned to minimize unaligned accesses. - The loop tail is handled by always copying 64 bytes from the end. -*/ - -ENTRY_ALIAS (__memmove_aarch64_sve) -ENTRY (__memcpy_aarch64_sve) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - - cmp count, 128 - b.hi L(copy_long) - cntb vlen - cmp count, vlen, lsl 1 - b.hi L(copy32_128) - - whilelo p0.b, xzr, count - whilelo p1.b, vlen, count - ld1b z0.b, p0/z, [src, 0, mul vl] - ld1b z1.b, p1/z, [src, 1, mul vl] - st1b z0.b, p0, [dstin, 0, mul vl] - st1b z1.b, p1, [dstin, 1, mul vl] - ret - - /* Medium copies: 33..128 bytes. */ -L(copy32_128): - add srcend, src, count - add dstend, dstin, count - ldp A_q, B_q, [src] - ldp C_q, D_q, [srcend, -32] - cmp count, 64 - b.hi L(copy128) - stp A_q, B_q, [dstin] - stp C_q, D_q, [dstend, -32] - ret - - /* Copy 65..128 bytes. */ -L(copy128): - ldp E_q, F_q, [src, 32] - cmp count, 96 - b.ls L(copy96) - ldp G_q, H_q, [srcend, -64] - stp G_q, H_q, [dstend, -64] -L(copy96): - stp A_q, B_q, [dstin] - stp E_q, F_q, [dstin, 32] - stp C_q, D_q, [dstend, -32] - ret - - /* Copy more than 128 bytes. */ -L(copy_long): - add srcend, src, count - add dstend, dstin, count - - /* Use backwards copy if there is an overlap. */ - sub tmp1, dstin, src - cmp tmp1, count - b.lo L(copy_long_backwards) - - /* Copy 16 bytes and then align src to 16-byte alignment. 
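The single unsigned compare just above ("sub tmp1, dstin, src; cmp tmp1, count; b.lo ...") is how this file, which serves memcpy and memmove from one entry point, decides between a forward and a backward copy: if the destination lies inside the source range, a forward copy would overwrite bytes it has not read yet. In C the same test is one subtraction, relying on unsigned wrap-around to cover the dst < src case (names are illustrative):

#include <stddef.h>
#include <stdint.h>

/* Non-zero when dst is inside [src, src + count), i.e. when only a
   backward (high-to-low) copy is safe.  */
static int
must_copy_backwards (const void *dst, const void *src, size_t count)
{
  return (uintptr_t) dst - (uintptr_t) src < count;
}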
*/ - ldr D_q, [src] - and tmp1, src, 15 - bic src, src, 15 - sub dst, dstin, tmp1 - add count, count, tmp1 /* Count is now 16 too large. */ - ldp A_q, B_q, [src, 16] - str D_q, [dstin] - ldp C_q, D_q, [src, 48] - subs count, count, 128 + 16 /* Test and readjust count. */ - b.ls L(copy64_from_end) -L(loop64): - stp A_q, B_q, [dst, 16] - ldp A_q, B_q, [src, 80] - stp C_q, D_q, [dst, 48] - ldp C_q, D_q, [src, 112] - add src, src, 64 - add dst, dst, 64 - subs count, count, 64 - b.hi L(loop64) - - /* Write the last iteration and copy 64 bytes from the end. */ -L(copy64_from_end): - ldp E_q, F_q, [srcend, -64] - stp A_q, B_q, [dst, 16] - ldp A_q, B_q, [srcend, -32] - stp C_q, D_q, [dst, 48] - stp E_q, F_q, [dstend, -64] - stp A_q, B_q, [dstend, -32] - ret - - /* Large backwards copy for overlapping copies. - Copy 16 bytes and then align srcend to 16-byte alignment. */ -L(copy_long_backwards): - cbz tmp1, L(return) - ldr D_q, [srcend, -16] - and tmp1, srcend, 15 - bic srcend, srcend, 15 - sub count, count, tmp1 - ldp A_q, B_q, [srcend, -32] - str D_q, [dstend, -16] - ldp C_q, D_q, [srcend, -64] - sub dstend, dstend, tmp1 - subs count, count, 128 - b.ls L(copy64_from_start) - -L(loop64_backwards): - str B_q, [dstend, -16] - str A_q, [dstend, -32] - ldp A_q, B_q, [srcend, -96] - str D_q, [dstend, -48] - str C_q, [dstend, -64]! - ldp C_q, D_q, [srcend, -128] - sub srcend, srcend, 64 - subs count, count, 64 - b.hi L(loop64_backwards) - - /* Write the last iteration and copy 64 bytes from the start. */ -L(copy64_from_start): - ldp E_q, F_q, [src, 32] - stp A_q, B_q, [dstend, -32] - ldp A_q, B_q, [src] - stp C_q, D_q, [dstend, -64] - stp E_q, F_q, [dstin, 32] - stp A_q, B_q, [dstin] -L(return): - ret - -END (__memcpy_aarch64_sve) - -#endif diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S index 2b1a592..8a967cd 100644 --- a/string/aarch64/memcpy.S +++ b/string/aarch64/memcpy.S @@ -1,8 +1,8 @@ /* * memcpy - copy memory area * - * Copyright (c) 2012-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2012-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * */ -#include "asmdefs.h" +#include "../asmdefs.h" #define dstin x0 #define src x1 diff --git a/string/aarch64/memmove-mops.S b/string/aarch64/memmove-mops.S deleted file mode 100644 index 6c73017..0000000 --- a/string/aarch64/memmove-mops.S +++ /dev/null @@ -1,21 +0,0 @@ -/* - * memmove using MOPS extension. - * - * Copyright (c) 2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "asmdefs.h" - -ENTRY (__memmove_aarch64_mops) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - - mov x3, x0 - .inst 0x1d010443 /* cpyp [x3]!, [x1]!, x2! */ - .inst 0x1d410443 /* cpym [x3]!, [x1]!, x2! */ - .inst 0x1d810443 /* cpye [x3]!, [x1]!, x2! */ - ret - -END (__memmove_aarch64_mops) diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S index 6418bdf..7b4be84 100644 --- a/string/aarch64/memrchr.S +++ b/string/aarch64/memrchr.S @@ -1,8 +1,8 @@ /* * memrchr - find last character in a memory zone. * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. 
*/ -#include "asmdefs.h" +#include "../asmdefs.h" #define srcin x0 #define chrin w1 @@ -23,6 +23,7 @@ #define synd x5 #define shift x6 #define tmp x7 +#define wtmp w7 #define end x8 #define endm1 x9 @@ -30,16 +31,19 @@ #define qdata q1 #define vdata v1 #define vhas_chr v2 -#define vend v3 -#define dend d3 +#define vrepmask v3 +#define vend v4 +#define dend d4 /* Core algorithm: - For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits - per byte. We take 4 bits of every comparison byte with shift right and narrow - by 4 instruction. Since the bits in the nibble mask reflect the order in - which things occur in the original string, counting leading zeros identifies - exactly which byte matched. */ + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ ENTRY (__memrchr_aarch64) PTR_ARG (0) @@ -49,9 +53,12 @@ ENTRY (__memrchr_aarch64) cbz cntin, L(nomatch) ld1 {vdata.16b}, [src] dup vrepchr.16b, chrin + mov wtmp, 0xf00f + dup vrepmask.8h, wtmp cmeq vhas_chr.16b, vdata.16b, vrepchr.16b neg shift, end, lsl 2 - shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend lsl synd, synd, shift cbz synd, L(start_loop) @@ -62,36 +69,34 @@ ENTRY (__memrchr_aarch64) csel result, result, xzr, hi ret - nop L(start_loop): - subs cntrem, src, srcin + sub tmp, end, src + subs cntrem, cntin, tmp b.ls L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - sub cntrem, cntrem, 1 - tbz cntrem, 4, L(loop32_2) - add src, src, 16 + add tmp, cntrem, 15 + tbnz tmp, 4, L(loop32_2) - .p2align 5 + .p2align 4 L(loop32): - ldr qdata, [src, -32]! + ldr qdata, [src, -16]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src, -16] + ldr qdata, [src, -16]! subs cntrem, cntrem, 32 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - b.lo L(end_2) + b.ls L(end) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) -L(end_2): - sub src, src, 16 L(end): - shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend add tmp, src, 15 diff --git a/string/aarch64/memset-mops.S b/string/aarch64/memset-mops.S deleted file mode 100644 index ec79149..0000000 --- a/string/aarch64/memset-mops.S +++ /dev/null @@ -1,20 +0,0 @@ -/* - * memset using MOPS extension. - * - * Copyright (c) 2023, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#include "asmdefs.h" - -ENTRY (__memset_aarch64_mops) - PTR_ARG (0) - SIZE_ARG (2) - - mov x3, x0 - .inst 0x19c10443 /* setp [x3]!, x2!, x1 */ - .inst 0x19c14443 /* setm [x3]!, x2!, x1 */ - .inst 0x19c18443 /* sete [x3]!, x2!, x1 */ - ret - -END (__memset_aarch64_mops) diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S index 553b0fc..9fcd975 100644 --- a/string/aarch64/memset.S +++ b/string/aarch64/memset.S @@ -1,8 +1,8 @@ /* * memset - fill memory with a constant byte * - * Copyright (c) 2012-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2012-2021, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * */ -#include "asmdefs.h" +#include "../asmdefs.h" #define dstin x0 #define val x1 diff --git a/string/aarch64/stpcpy-mte.S b/string/aarch64/stpcpy-mte.S new file mode 100644 index 0000000..f1c7119 --- /dev/null +++ b/string/aarch64/stpcpy-mte.S @@ -0,0 +1,10 @@ +/* + * stpcpy - copy a string returning pointer to end. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#define BUILD_STPCPY 1 + +#include "strcpy-mte.S" diff --git a/string/aarch64/stpcpy-sve.S b/string/aarch64/stpcpy-sve.S index 5d3f14b..82dd971 100644 --- a/string/aarch64/stpcpy-sve.S +++ b/string/aarch64/stpcpy-sve.S @@ -2,7 +2,7 @@ * stpcpy - copy a string returning pointer to end. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #define BUILD_STPCPY 1 diff --git a/string/aarch64/stpcpy.S b/string/aarch64/stpcpy.S index 155c68d..4f62aa4 100644 --- a/string/aarch64/stpcpy.S +++ b/string/aarch64/stpcpy.S @@ -2,7 +2,7 @@ * stpcpy - copy a string returning pointer to end. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #define BUILD_STPCPY 1 diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S index 6ec08f7..dcb0e46 100644 --- a/string/aarch64/strchr-mte.S +++ b/string/aarch64/strchr-mte.S @@ -1,8 +1,8 @@ /* * strchr - find a character in a string * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "asmdefs.h" +#include "../asmdefs.h" #define srcin x0 #define chrin w1 @@ -19,7 +19,8 @@ #define src x2 #define tmp1 x1 -#define tmp2 x3 +#define wtmp2 w3 +#define tmp3 x3 #define vrepchr v0 #define vdata v1 @@ -27,30 +28,39 @@ #define vhas_nul v2 #define vhas_chr v3 #define vrepmask v4 -#define vend v5 -#define dend d5 +#define vrepmask2 v5 +#define vend v6 +#define dend d6 /* Core algorithm. For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. Bits 0-1 are set if the relevant byte matched the requested - character, bits 2-3 are set if the byte is NUL or matched. Count trailing - zeroes gives the position of the matching byte if it is a multiple of 4. - If it is not a multiple of 4, there was no match. */ + per byte. For even bytes, bits 0-1 are set if the relevant byte matched the + requested character, bits 2-3 are set if the byte is NUL (or matched), and + bits 4-7 are not used and must be zero if none of bits 0-3 are set). Odd + bytes set bits 4-7 so that adjacent bytes can be merged. 
Since the bits + in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ ENTRY (__strchr_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin ld1 {vdata.16b}, [src] - movi vrepmask.16b, 0x33 + mov wtmp2, 0x3003 + dup vrepmask.8h, wtmp2 cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + mov wtmp2, 0xf00f + dup vrepmask2.8h, wtmp2 + bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - lsl tmp2, srcin, 2 - shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ + and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b + lsl tmp3, srcin, 2 + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + fmov tmp1, dend - lsr tmp1, tmp1, tmp2 + lsr tmp1, tmp1, tmp3 cbz tmp1, L(loop) rbit tmp1, tmp1 @@ -64,34 +74,28 @@ ENTRY (__strchr_aarch64_mte) .p2align 4 L(loop): - ldr qdata, [src, 16] - cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b - umaxp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov tmp1, dend - cbnz tmp1, L(end) - ldr qdata, [src, 32]! + ldr qdata, [src, 16]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov tmp1, dend cbz tmp1, L(loop) - sub src, src, 16 -L(end): #ifdef __AARCH64EB__ bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b - shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ + and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ fmov tmp1, dend #else bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ + and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ fmov tmp1, dend rbit tmp1, tmp1 #endif - add src, src, 16 clz tmp1, tmp1 - /* Tmp1 is a multiple of 4 if the target character was found. */ + /* Tmp1 is an even multiple of 2 if the target character was + found first. Otherwise we've found the end of string. */ tst tmp1, 2 add result, src, tmp1, lsr 2 csel result, result, xzr, eq diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S index ff07516..13ba9f4 100644 --- a/string/aarch64/strchr-sve.S +++ b/string/aarch64/strchr-sve.S @@ -1,11 +1,11 @@ /* * strchr/strchrnul - find a character in a string * - * Copyright (c) 2018-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S index 37193bd..1063cbf 100644 --- a/string/aarch64/strchr.S +++ b/string/aarch64/strchr.S @@ -1,8 +1,8 @@ /* * strchr - find a character in a string * - * Copyright (c) 2014-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2014-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "asmdefs.h" +#include "../asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S index 543ee88..1b0d0a6 100644 --- a/string/aarch64/strchrnul-mte.S +++ b/string/aarch64/strchrnul-mte.S @@ -1,8 +1,8 @@ /* * strchrnul - find a character or nul in a string * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "asmdefs.h" +#include "../asmdefs.h" #define srcin x0 #define chrin w1 @@ -20,32 +20,38 @@ #define src x2 #define tmp1 x1 #define tmp2 x3 +#define tmp2w w3 #define vrepchr v0 #define vdata v1 #define qdata q1 #define vhas_nul v2 #define vhas_chr v3 -#define vend v4 -#define dend d4 +#define vrepmask v4 +#define vend v5 +#define dend d5 -/* - Core algorithm: - For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits - per byte. We take 4 bits of every comparison byte with shift right and narrow - by 4 instruction. Since the bits in the nibble mask reflect the order in - which things occur in the original string, counting leading zeros identifies - exactly which byte matched. */ +/* Core algorithm: + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ ENTRY (__strchrnul_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin ld1 {vdata.16b}, [src] + mov tmp2w, 0xf00f + dup vrepmask.8h, tmp2w cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b lsl tmp2, srcin, 2 - shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov tmp1, dend lsr tmp1, tmp1, tmp2 /* Mask padding bits. */ cbz tmp1, L(loop) @@ -57,22 +63,15 @@ ENTRY (__strchrnul_aarch64_mte) .p2align 4 L(loop): - ldr qdata, [src, 16] - cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b - umaxp vend.16b, vhas_chr.16b, vhas_chr.16b - fmov tmp1, dend - cbnz tmp1, L(end) - ldr qdata, [src, 32]! + ldr qdata, [src, 16]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b fmov tmp1, dend cbz tmp1, L(loop) - sub src, src, 16 -L(end): - shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ - add src, src, 16 + + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov tmp1, dend #ifndef __AARCH64EB__ rbit tmp1, tmp1 diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/strchrnul-sve.S index 0005f91..428ff1a 100644 --- a/string/aarch64/strchrnul-sve.S +++ b/string/aarch64/strchrnul-sve.S @@ -2,7 +2,7 @@ * strchrnul - find a character or nul in a string * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #define BUILD_STRCHRNUL diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S index 666e8d0..a4230d9 100644 --- a/string/aarch64/strchrnul.S +++ b/string/aarch64/strchrnul.S @@ -1,8 +1,8 @@ /* * strchrnul - find a character or nul in a string * - * Copyright (c) 2014-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2014-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "asmdefs.h" +#include "../asmdefs.h" /* Arguments and results. 
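The cmeq followed by cmhs in the strchrnul (and strchr) kernels above is a compact way to flag "matches the target character or is NUL" in one extra instruction: after cmeq, matching bytes are 0xff and the rest 0x00, and an unsigned greater-or-equal against the original data stays 0xff for matches while also firing wherever the data byte is zero. The same pair as intrinsics, assuming AArch64 NEON (function and variable names are illustrative):

#include <arm_neon.h>
#include <stdint.h>

/* 0xff in every byte position where data == c or data == 0.  */
static uint8x16_t
match_or_nul (uint8x16_t data, uint8_t c)
{
  uint8x16_t hit = vceqq_u8 (data, vdupq_n_u8 (c));   /* cmeq: 0xff where data == c */
  return vcgeq_u8 (hit, data);                        /* cmhs: also 0xff where data == 0 */
}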
*/ #define srcin x0 diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S new file mode 100644 index 0000000..12d1a6b --- /dev/null +++ b/string/aarch64/strcmp-mte.S @@ -0,0 +1,189 @@ +/* + * strcmp - compare two strings + * + * Copyright (c) 2012-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + + +/* Assumptions: + * + * ARMv8-a, AArch64. + * MTE compatible. + */ + +#include "../asmdefs.h" + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +#define src1 x0 +#define src2 x1 +#define result x0 + +#define data1 x2 +#define data1w w2 +#define data2 x3 +#define data2w w3 +#define has_nul x4 +#define diff x5 +#define off1 x5 +#define syndrome x6 +#define tmp x6 +#define data3 x7 +#define zeroones x8 +#define shift x9 +#define off2 x10 + +/* On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. */ +#ifdef __AARCH64EB__ +# define LS_FW lsl +#else +# define LS_FW lsr +#endif + +/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. + Since carry propagation makes 0x1 bytes before a NUL byte appear + NUL too in big-endian, byte-reverse the data before the NUL check. */ + + +ENTRY (__strcmp_aarch64_mte) + PTR_ARG (0) + PTR_ARG (1) + sub off2, src2, src1 + mov zeroones, REP8_01 + and tmp, src1, 7 + tst off2, 7 + b.ne L(misaligned8) + cbnz tmp, L(mutual_align) + + .p2align 4 + +L(loop_aligned): + ldr data2, [src1, off2] + ldr data1, [src1], 8 +L(start_realigned): +#ifdef __AARCH64EB__ + rev tmp, data1 + sub has_nul, tmp, zeroones + orr tmp, tmp, REP8_7f +#else + sub has_nul, data1, zeroones + orr tmp, data1, REP8_7f +#endif + bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ + ccmp data1, data2, 0, eq + b.eq L(loop_aligned) +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + eor diff, data1, data2 + orr syndrome, diff, has_nul +L(end): +#ifndef __AARCH64EB__ + rev syndrome, syndrome + rev data1, data1 + rev data2, data2 +#endif + clz shift, syndrome + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + lsl data1, data1, shift + lsl data2, data2, shift + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, 56 + sub result, data1, data2, lsr 56 + ret + + .p2align 4 + +L(mutual_align): + /* Sources are mutually aligned, but are not currently at an + alignment boundary. Round down the addresses and then mask off + the bytes that precede the start point. */ + bic src1, src1, 7 + ldr data2, [src1, off2] + ldr data1, [src1], 8 + neg shift, src2, lsl 3 /* Bits to alignment -64. */ + mov tmp, -1 + LS_FW tmp, tmp, shift + orr data1, data1, tmp + orr data2, data2, tmp + b L(start_realigned) + +L(misaligned8): + /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always + checking to make sure that we don't access beyond the end of SRC2. */ + cbz tmp, L(src1_aligned) +L(do_misaligned): + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + cmp data1w, 0 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. 
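The (X - 1) & ~(X | 0x7f) NUL test quoted in the header comment above can be written out in C as follows (illustrative sketch, not part of the patch):

#include <stdint.h>

/* Non-zero iff some byte of x is zero.  Subtracting 0x01 from a zero
   byte borrows and leaves 0xff; masking with ~(x | 0x7f..7f) keeps
   bit 7 only for bytes whose own top bit was clear, so non-zero bytes
   cannot set a bit by themselves.  Borrows can spill into bytes above
   a real NUL, which is why the big-endian paths byte-reverse the data
   before locating the first NUL. */
static int has_nul_byte (uint64_t x)
{
  const uint64_t rep01 = 0x0101010101010101ULL;
  const uint64_t rep7f = 0x7f7f7f7f7f7f7f7fULL;
  return ((x - rep01) & ~(x | rep7f)) != 0;
}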
*/ + b.ne L(done) + tst src1, 7 + b.ne L(do_misaligned) + +L(src1_aligned): + neg shift, src2, lsl 3 + bic src2, src2, 7 + ldr data3, [src2], 8 +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + lsr tmp, zeroones, shift + orr data3, data3, tmp + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + bics has_nul, has_nul, tmp + b.ne L(tail) + + sub off1, src2, src1 + + .p2align 4 + +L(loop_unaligned): + ldr data3, [src1, off1] + ldr data2, [src1, off2] +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + ldr data1, [src1], 8 + bics has_nul, has_nul, tmp + ccmp data1, data2, 0, eq + b.eq L(loop_unaligned) + + lsl tmp, has_nul, shift +#ifdef __AARCH64EB__ + rev tmp, tmp +#endif + eor diff, data1, data2 + orr syndrome, diff, tmp + cbnz syndrome, L(end) +L(tail): + ldr data1, [src1] + neg shift, shift + lsr data2, data3, shift + lsr has_nul, has_nul, shift +#ifdef __AARCH64EB__ + rev data2, data2 + rev has_nul, has_nul +#endif + eor diff, data1, data2 + orr syndrome, diff, has_nul + b L(end) + +L(done): + sub result, data1, data2 + ret + +END (__strcmp_aarch64_mte) + diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S index eaf909a..e6d2da5 100644 --- a/string/aarch64/strcmp-sve.S +++ b/string/aarch64/strcmp-sve.S @@ -1,11 +1,11 @@ /* * __strcmp_aarch64_sve - compare two strings * - * Copyright (c) 2018-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S index 137a9aa..7714ebf 100644 --- a/string/aarch64/strcmp.S +++ b/string/aarch64/strcmp.S @@ -1,184 +1,168 @@ /* * strcmp - compare two strings * - * Copyright (c) 2012-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2012-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ - /* Assumptions: * - * ARMv8-a, AArch64. - * MTE compatible. + * ARMv8-a, AArch64 */ -#include "asmdefs.h" +#include "../asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 +/* Parameters and result. */ #define src1 x0 #define src2 x1 #define result x0 +/* Internal variables. */ #define data1 x2 #define data1w w2 #define data2 x3 #define data2w w3 #define has_nul x4 #define diff x5 -#define off1 x5 #define syndrome x6 -#define tmp x6 -#define data3 x7 -#define zeroones x8 -#define shift x9 -#define off2 x10 - -/* On big-endian early bytes are at MSB and on little-endian LSB. - LS_FW means shifting towards early bytes. */ -#ifdef __AARCH64EB__ -# define LS_FW lsl -#else -# define LS_FW lsr -#endif - -/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. - Since carry propagation makes 0x1 bytes before a NUL byte appear - NUL too in big-endian, byte-reverse the data before the NUL check. */ - +#define tmp1 x7 +#define tmp2 x8 +#define tmp3 x9 +#define zeroones x10 +#define pos x11 + /* Start of performance-critical section -- one 64B cache line. 
*/ ENTRY (__strcmp_aarch64) PTR_ARG (0) PTR_ARG (1) - sub off2, src2, src1 - mov zeroones, REP8_01 - and tmp, src1, 7 - tst off2, 7 + eor tmp1, src1, src2 + mov zeroones, #REP8_01 + tst tmp1, #7 b.ne L(misaligned8) - cbnz tmp, L(mutual_align) - - .p2align 4 - + ands tmp1, src1, #7 + b.ne L(mutual_align) + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. */ L(loop_aligned): - ldr data2, [src1, off2] - ldr data1, [src1], 8 + ldr data1, [src1], #8 + ldr data2, [src2], #8 L(start_realigned): -#ifdef __AARCH64EB__ - rev tmp, data1 - sub has_nul, tmp, zeroones - orr tmp, tmp, REP8_7f -#else - sub has_nul, data1, zeroones - orr tmp, data1, REP8_7f -#endif - bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ - ccmp data1, data2, 0, eq - b.eq L(loop_aligned) -#ifdef __AARCH64EB__ - rev has_nul, has_nul -#endif - eor diff, data1, data2 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ orr syndrome, diff, has_nul + cbz syndrome, L(loop_aligned) + /* End of performance-critical section -- one 64B cache line. */ + L(end): -#ifndef __AARCH64EB__ +#ifndef __AARCH64EB__ rev syndrome, syndrome rev data1, data1 + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + clz pos, syndrome rev data2, data2 -#endif - clz shift, syndrome - /* The most-significant-non-zero bit of the syndrome marks either the - first bit that is different, or the top bit of the first zero byte. + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#else + /* For big-endian we cannot use the trick with the syndrome value + as carry-propagation can corrupt the upper bits if the trailing + bytes in the string contain 0x01. */ + /* However, if there is no NUL byte in the dword, we can generate + the result directly. We can't just subtract the bytes as the + MSB might be significant. */ + cbnz has_nul, 1f + cmp data1, data2 + cset result, ne + cneg result, result, lo + ret +1: + /* Re-compute the NUL-byte detection, using a byte-reversed value. */ + rev tmp3, data1 + sub tmp1, tmp3, zeroones + orr tmp2, tmp3, #REP8_7f + bic has_nul, tmp1, tmp2 + rev has_nul, has_nul + orr syndrome, diff, has_nul + clz pos, syndrome + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. Shifting left now will bring the critical information into the top bits. */ - lsl data1, data1, shift - lsl data2, data2, shift + lsl data1, data1, pos + lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. */ - lsr data1, data1, 56 - sub result, data1, data2, lsr 56 + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 ret - - .p2align 4 +#endif L(mutual_align): /* Sources are mutually aligned, but are not currently at an alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. 
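For the little-endian L(end) path above, the conversion from syndrome to return value can be modelled in C roughly as follows (illustrative only, not part of the patch; __builtin_clzll assumes GCC/Clang, and all three arguments are taken after the rev instructions have byte-reversed them):

#include <stdint.h>

/* data1/data2 hold the first differing dwords and syndrome marks the
   first difference or the top bit of the first NUL byte.  Shifting the
   decisive byte to the top and comparing it zero-extended gives the
   usual strcmp sign convention. */
static int result_from_syndrome (uint64_t data1, uint64_t data2,
                                 uint64_t syndrome)
{
  int shift = __builtin_clzll (syndrome);      /* syndrome != 0 here */
  uint64_t b1 = (data1 << shift) >> 56;
  uint64_t b2 = (data2 << shift) >> 56;
  return (int) b1 - (int) b2;
}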
*/ - bic src1, src1, 7 - ldr data2, [src1, off2] - ldr data1, [src1], 8 - neg shift, src2, lsl 3 /* Bits to alignment -64. */ - mov tmp, -1 - LS_FW tmp, tmp, shift - orr data1, data1, tmp - orr data2, data2, tmp + the bytes that preceed the start point. */ + bic src1, src1, #7 + bic src2, src2, #7 + lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ + ldr data1, [src1], #8 + neg tmp1, tmp1 /* Bits to alignment -64. */ + ldr data2, [src2], #8 + mov tmp2, #~0 +#ifdef __AARCH64EB__ + /* Big-endian. Early bytes are at MSB. */ + lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ +#else + /* Little-endian. Early bytes are at LSB. */ + lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ +#endif + orr data1, data1, tmp2 + orr data2, data2, tmp2 b L(start_realigned) L(misaligned8): /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always - checking to make sure that we don't access beyond the end of SRC2. */ - cbz tmp, L(src1_aligned) + checking to make sure that we don't access beyond page boundary in + SRC2. */ + tst src1, #7 + b.eq L(loop_misaligned) L(do_misaligned): - ldrb data1w, [src1], 1 - ldrb data2w, [src2], 1 - cmp data1w, 0 - ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + cmp data1w, #1 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ b.ne L(done) - tst src1, 7 + tst src1, #7 b.ne L(do_misaligned) -L(src1_aligned): - neg shift, src2, lsl 3 - bic src2, src2, 7 - ldr data3, [src2], 8 -#ifdef __AARCH64EB__ - rev data3, data3 -#endif - lsr tmp, zeroones, shift - orr data3, data3, tmp - sub has_nul, data3, zeroones - orr tmp, data3, REP8_7f - bics has_nul, has_nul, tmp - b.ne L(tail) - - sub off1, src2, src1 - - .p2align 4 - -L(loop_unaligned): - ldr data3, [src1, off1] - ldr data2, [src1, off2] -#ifdef __AARCH64EB__ - rev data3, data3 -#endif - sub has_nul, data3, zeroones - orr tmp, data3, REP8_7f - ldr data1, [src1], 8 - bics has_nul, has_nul, tmp - ccmp data1, data2, 0, eq - b.eq L(loop_unaligned) - - lsl tmp, has_nul, shift -#ifdef __AARCH64EB__ - rev tmp, tmp -#endif - eor diff, data1, data2 - orr syndrome, diff, tmp - cbnz syndrome, L(end) -L(tail): - ldr data1, [src1] - neg shift, shift - lsr data2, data3, shift - lsr has_nul, has_nul, shift -#ifdef __AARCH64EB__ - rev data2, data2 - rev has_nul, has_nul -#endif - eor diff, data1, data2 +L(loop_misaligned): + /* Test if we are within the last dword of the end of a 4K page. If + yes then jump back to the misaligned loop to copy a byte at a time. */ + and tmp1, src2, #0xff8 + eor tmp1, tmp1, #0xff8 + cbz tmp1, L(do_misaligned) + ldr data1, [src1], #8 + ldr data2, [src2], #8 + + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ orr syndrome, diff, has_nul + cbz syndrome, L(loop_misaligned) b L(end) L(done): diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S new file mode 100644 index 0000000..88c222d --- /dev/null +++ b/string/aarch64/strcpy-mte.S @@ -0,0 +1,161 @@ +/* + * strcpy/stpcpy - copy a string returning pointer to start/end. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. 
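The misaligned loop above guards every unaligned SRC2 load with the and/eor/cbz sequence on 0xff8; in C terms the test is simply (illustrative only, not part of the patch):

#include <stdint.h>

/* True when addr falls in the last 8 bytes of a 4 KiB page, in which
   case the next unaligned 8-byte load could touch the following page
   and the code drops back to byte-by-byte comparison. */
static int in_last_dword_of_page (uintptr_t addr)
{
  return (addr & 0xff8) == 0xff8;
}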
+ */ + +#include "../asmdefs.h" + +#define dstin x0 +#define srcin x1 +#define result x0 + +#define src x2 +#define dst x3 +#define len x4 +#define synd x4 +#define tmp x5 +#define wtmp w5 +#define shift x5 +#define data1 x6 +#define dataw1 w6 +#define data2 x7 +#define dataw2 w7 + +#define dataq q0 +#define vdata v0 +#define vhas_nul v1 +#define vrepmask v2 +#define vend v3 +#define dend d3 +#define dataq2 q1 + +#ifdef BUILD_STPCPY +# define STRCPY __stpcpy_aarch64_mte +# define IFSTPCPY(X,...) X,__VA_ARGS__ +#else +# define STRCPY __strcpy_aarch64_mte +# define IFSTPCPY(X,...) +#endif + +/* Core algorithm: + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ + +ENTRY (STRCPY) + PTR_ARG (0) + PTR_ARG (1) + bic src, srcin, 15 + mov wtmp, 0xf00f + ld1 {vdata.16b}, [src] + dup vrepmask.8h, wtmp + cmeq vhas_nul.16b, vdata.16b, 0 + lsl shift, srcin, 2 + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + lsr synd, synd, shift + cbnz synd, L(tail) + + ldr dataq, [src, 16]! + cmeq vhas_nul.16b, vdata.16b, 0 + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(start_loop) + +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + sub tmp, src, srcin + clz len, synd + add len, tmp, len, lsr 2 + tbz len, 4, L(less16) + sub tmp, len, 15 + ldr dataq, [srcin] + ldr dataq2, [srcin, tmp] + str dataq, [dstin] + str dataq2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret + + .p2align 4,,8 +L(tail): + rbit synd, synd + clz len, synd + lsr len, len, 2 + + .p2align 4 +L(less16): + tbz len, 3, L(less8) + sub tmp, len, 7 + ldr data1, [srcin] + ldr data2, [srcin, tmp] + str data1, [dstin] + str data2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret + + .p2align 4 +L(less8): + subs tmp, len, 3 + b.lo L(less4) + ldr dataw1, [srcin] + ldr dataw2, [srcin, tmp] + str dataw1, [dstin] + str dataw2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret + +L(less4): + cbz len, L(zerobyte) + ldrh dataw1, [srcin] + strh dataw1, [dstin] +L(zerobyte): + strb wzr, [dstin, len] + IFSTPCPY (add result, dstin, len) + ret + + .p2align 4 +L(start_loop): + sub len, src, srcin + ldr dataq2, [srcin] + add dst, dstin, len + str dataq2, [dstin] + + .p2align 5 +L(loop): + str dataq, [dst], 16 + ldr dataq, [src, 16]! + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(loop) + + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + fmov synd, dend +#ifndef __AARCH64EB__ + rbit synd, synd +#endif + clz len, synd + lsr len, len, 2 + sub tmp, len, 15 + ldr dataq, [src, tmp] + str dataq, [dst, tmp] + IFSTPCPY (add result, dst, len) + ret + +END (STRCPY) diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S index 00e72dc..f515462 100644 --- a/string/aarch64/strcpy-sve.S +++ b/string/aarch64/strcpy-sve.S @@ -1,11 +1,11 @@ /* * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Copyright (c) 2018-2022, Arm Limited. 
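The L(less16)/L(less8)/L(less4) paths above finish short copies with a pair of overlapping loads and stores once the string length is known. A plain-C sketch of that pattern for the 9-16 byte case (illustrative only, not part of the patch; memcpy stands in for the unaligned LDR/STR pairs):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Copy n bytes, 9 <= n <= 16, with two possibly overlapping 8-byte
   moves instead of a byte loop; smaller copies use the same trick with
   4-, 2- and 1-byte accesses. */
static void copy_9_to_16 (char *dst, const char *src, size_t n)
{
  uint64_t lo, hi;
  memcpy (&lo, src, 8);
  memcpy (&hi, src + n - 8, 8);
  memcpy (dst, &lo, 8);
  memcpy (dst + n - 8, &hi, 8);
}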
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S index 97ae37e..6e9ed42 100644 --- a/string/aarch64/strcpy.S +++ b/string/aarch64/strcpy.S @@ -1,156 +1,311 @@ /* * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2013-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: * - * ARMv8-a, AArch64, Advanced SIMD. - * MTE compatible. + * ARMv8-a, AArch64, unaligned accesses, min page size 4k. */ -#include "asmdefs.h" +#include "../asmdefs.h" +/* To build as stpcpy, define BUILD_STPCPY before compiling this file. + + To test the page crossing code path more thoroughly, compile with + -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower + entry path. This option is not intended for production use. */ + +/* Arguments and results. */ #define dstin x0 #define srcin x1 -#define result x0 +/* Locals and temporaries. */ #define src x2 #define dst x3 -#define len x4 -#define synd x4 -#define tmp x5 -#define shift x5 -#define data1 x6 -#define dataw1 w6 -#define data2 x7 -#define dataw2 w7 - -#define dataq q0 -#define vdata v0 -#define vhas_nul v1 -#define vend v2 -#define dend d2 -#define dataq2 q1 +#define data1 x4 +#define data1w w4 +#define data2 x5 +#define data2w w5 +#define has_nul1 x6 +#define has_nul2 x7 +#define tmp1 x8 +#define tmp2 x9 +#define tmp3 x10 +#define tmp4 x11 +#define zeroones x12 +#define data1a x13 +#define data2a x14 +#define pos x15 +#define len x16 +#define to_align x17 #ifdef BUILD_STPCPY -# define STRCPY __stpcpy_aarch64 -# define IFSTPCPY(X,...) X,__VA_ARGS__ +#define STRCPY __stpcpy_aarch64 #else -# define STRCPY __strcpy_aarch64 -# define IFSTPCPY(X,...) +#define STRCPY __strcpy_aarch64 #endif -/* - Core algorithm: - For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits - per byte. We take 4 bits of every comparison byte with shift right and narrow - by 4 instruction. Since the bits in the nibble mask reflect the order in - which things occur in the original string, counting leading zeros identifies - exactly which byte matched. */ + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. */ + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + + /* AArch64 systems have a minimum page size of 4k. We can do a quick + page size check for crossing this boundary on entry and if we + do not, then we can short-circuit much of the entry code. We + expect early page-crossing strings to be rare (probability of + 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite + predictable, even with random strings. + + We don't bother checking for larger page sizes, the cost of setting + up the correct page size is just not worth the extra gain from + a small reduction in the cases taking the slow path. Note that + we only care about whether the first fetch, which may be + misaligned, crosses a page boundary - after that we move to aligned + fetches for the remainder of the string. 
*/ + +#ifdef STRCPY_TEST_PAGE_CROSS + /* Make everything that isn't Qword aligned look like a page cross. */ +#define MIN_PAGE_P2 4 +#else +#define MIN_PAGE_P2 12 +#endif + +#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) ENTRY (STRCPY) PTR_ARG (0) PTR_ARG (1) - bic src, srcin, 15 - ld1 {vdata.16b}, [src] - cmeq vhas_nul.16b, vdata.16b, 0 - lsl shift, srcin, 2 - shrn vend.8b, vhas_nul.8h, 4 - fmov synd, dend - lsr synd, synd, shift - cbnz synd, L(tail) - - ldr dataq, [src, 16]! - cmeq vhas_nul.16b, vdata.16b, 0 - shrn vend.8b, vhas_nul.8h, 4 - fmov synd, dend - cbz synd, L(start_loop) - -#ifndef __AARCH64EB__ - rbit synd, synd + /* For moderately short strings, the fastest way to do the copy is to + calculate the length of the string in the same way as strlen, then + essentially do a memcpy of the result. This avoids the need for + multiple byte copies and further means that by the time we + reach the bulk copy loop we know we can always use DWord + accesses. We expect __strcpy_aarch64 to rarely be called repeatedly + with the same source string, so branch prediction is likely to + always be difficult - we mitigate against this by preferring + conditional select operations over branches whenever this is + feasible. */ + and tmp2, srcin, #(MIN_PAGE_SIZE - 1) + mov zeroones, #REP8_01 + and to_align, srcin, #15 + cmp tmp2, #(MIN_PAGE_SIZE - 16) + neg tmp1, to_align + /* The first fetch will straddle a (possible) page boundary iff + srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte + aligned string will never fail the page align check, so will + always take the fast path. */ + b.gt L(page_cross) + +L(page_cross_ok): + ldp data1, data2, [srcin] +#ifdef __AARCH64EB__ + /* Because we expect the end to be found within 16 characters + (profiling shows this is the most common case), it's worth + swapping the bytes now to save having to recalculate the + termination syndrome later. We preserve data1 and data2 + so that we can re-use the values later on. */ + rev tmp2, data1 + sub tmp1, tmp2, zeroones + orr tmp2, tmp2, #REP8_7f + bics has_nul1, tmp1, tmp2 + b.ne L(fp_le8) + rev tmp4, data2 + sub tmp3, tmp4, zeroones + orr tmp4, tmp4, #REP8_7f +#else + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + bics has_nul1, tmp1, tmp2 + b.ne L(fp_le8) + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f #endif - sub tmp, src, srcin - clz len, synd - add len, tmp, len, lsr 2 - tbz len, 4, L(less16) - sub tmp, len, 15 - ldr dataq, [srcin] - ldr dataq2, [srcin, tmp] - str dataq, [dstin] - str dataq2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) - ret + bics has_nul2, tmp3, tmp4 + b.eq L(bulk_entry) -L(tail): - rbit synd, synd - clz len, synd - lsr len, len, 2 -L(less16): - tbz len, 3, L(less8) - sub tmp, len, 7 - ldr data1, [srcin] - ldr data2, [srcin, tmp] + /* The string is short (<=16 bytes). We don't know exactly how + short though, yet. Work out the exact length so that we can + quickly select the optimal copy strategy. */ +L(fp_gt8): + rev has_nul2, has_nul2 + clz pos, has_nul2 + mov tmp2, #56 + add dst, dstin, pos, lsr #3 /* Bits to bytes. 
*/ + sub pos, tmp2, pos +#ifdef __AARCH64EB__ + lsr data2, data2, pos +#else + lsl data2, data2, pos +#endif + str data2, [dst, #1] str data1, [dstin] - str data2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) +#ifdef BUILD_STPCPY + add dstin, dst, #8 +#endif ret - .p2align 4 -L(less8): - subs tmp, len, 3 - b.lo L(less4) - ldr dataw1, [srcin] - ldr dataw2, [srcin, tmp] - str dataw1, [dstin] - str dataw2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) +L(fp_le8): + rev has_nul1, has_nul1 + clz pos, has_nul1 + add dst, dstin, pos, lsr #3 /* Bits to bytes. */ + subs tmp2, pos, #24 /* Pos in bits. */ + b.lt L(fp_lt4) +#ifdef __AARCH64EB__ + mov tmp2, #56 + sub pos, tmp2, pos + lsr data2, data1, pos + lsr data1, data1, #32 +#else + lsr data2, data1, tmp2 +#endif + /* 4->7 bytes to copy. */ + str data2w, [dst, #-3] + str data1w, [dstin] +#ifdef BUILD_STPCPY + mov dstin, dst +#endif ret - -L(less4): - cbz len, L(zerobyte) - ldrh dataw1, [srcin] - strh dataw1, [dstin] -L(zerobyte): - strb wzr, [dstin, len] - IFSTPCPY (add result, dstin, len) +L(fp_lt4): + cbz pos, L(fp_lt2) + /* 2->3 bytes to copy. */ +#ifdef __AARCH64EB__ + lsr data1, data1, #48 +#endif + strh data1w, [dstin] + /* Fall-through, one byte (max) to go. */ +L(fp_lt2): + /* Null-terminated string. Last character must be zero! */ + strb wzr, [dst] +#ifdef BUILD_STPCPY + mov dstin, dst +#endif ret - .p2align 4 -L(start_loop): - sub tmp, srcin, dstin - ldr dataq2, [srcin] - sub dst, src, tmp - str dataq2, [dstin] -L(loop): - str dataq, [dst], 32 - ldr dataq, [src, 16] - cmeq vhas_nul.16b, vdata.16b, 0 - umaxp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - cbnz synd, L(loopend) - str dataq, [dst, -16] - ldr dataq, [src, 32]! - cmeq vhas_nul.16b, vdata.16b, 0 - umaxp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - cbz synd, L(loop) - add dst, dst, 16 -L(loopend): - shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ - fmov synd, dend - sub dst, dst, 31 -#ifndef __AARCH64EB__ - rbit synd, synd + .p2align 6 + /* Aligning here ensures that the entry code and main loop all lies + within one 64-byte cache line. */ +L(bulk_entry): + sub to_align, to_align, #16 + stp data1, data2, [dstin] + sub src, srcin, to_align + sub dst, dstin, to_align + b L(entry_no_page_cross) + + /* The inner loop deals with two Dwords at a time. This has a + slightly higher start-up cost, but we should win quite quickly, + especially on cores with a high number of issue slots per + cycle, as we get much better parallelism out of the operations. */ +L(main_loop): + stp data1, data2, [dst], #16 +L(entry_no_page_cross): + ldp data1, data2, [src], #16 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f + bic has_nul1, tmp1, tmp2 + bics has_nul2, tmp3, tmp4 + ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ + b.eq L(main_loop) + + /* Since we know we are copying at least 16 bytes, the fastest way + to deal with the tail is to determine the location of the + trailing NUL, then (re)copy the 16 bytes leading up to that. */ + cmp has_nul1, #0 +#ifdef __AARCH64EB__ + /* For big-endian, carry propagation (if the final byte in the + string is 0x01) means we cannot use has_nul directly. The + easiest way to get the correct byte is to byte-swap the data + and calculate the syndrome a second time. 
*/ + csel data1, data1, data2, ne + rev data1, data1 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + bic has_nul1, tmp1, tmp2 +#else + csel has_nul1, has_nul1, has_nul2, ne +#endif + rev has_nul1, has_nul1 + clz pos, has_nul1 + add tmp1, pos, #72 + add pos, pos, #8 + csel pos, pos, tmp1, ne + add src, src, pos, lsr #3 + add dst, dst, pos, lsr #3 + ldp data1, data2, [src, #-32] + stp data1, data2, [dst, #-16] +#ifdef BUILD_STPCPY + sub dstin, dst, #1 #endif - clz len, synd - lsr len, len, 2 - add dst, dst, len - ldr dataq, [dst, tmp] - str dataq, [dst] - IFSTPCPY (add result, dst, 15) ret +L(page_cross): + bic src, srcin, #15 + /* Start by loading two words at [srcin & ~15], then forcing the + bytes that precede srcin to 0xff. This means they never look + like termination bytes. */ + ldp data1, data2, [src] + lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ + tst to_align, #7 + csetm tmp2, ne +#ifdef __AARCH64EB__ + lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ +#else + lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ +#endif + orr data1, data1, tmp2 + orr data2a, data2, tmp2 + cmp to_align, #8 + csinv data1, data1, xzr, lt + csel data2, data2, data2a, lt + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f + bic has_nul1, tmp1, tmp2 + bics has_nul2, tmp3, tmp4 + ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ + b.eq L(page_cross_ok) + /* We now need to make data1 and data2 look like they've been + loaded directly from srcin. Do a rotate on the 128-bit value. */ + lsl tmp1, to_align, #3 /* Bytes->bits. */ + neg tmp2, to_align, lsl #3 +#ifdef __AARCH64EB__ + lsl data1a, data1, tmp1 + lsr tmp4, data2, tmp2 + lsl data2, data2, tmp1 + orr tmp4, tmp4, data1a + cmp to_align, #8 + csel data1, tmp4, data2, lt + rev tmp2, data1 + rev tmp4, data2 + sub tmp1, tmp2, zeroones + orr tmp2, tmp2, #REP8_7f + sub tmp3, tmp4, zeroones + orr tmp4, tmp4, #REP8_7f +#else + lsr data1a, data1, tmp1 + lsl tmp4, data2, tmp2 + lsr data2, data2, tmp1 + orr tmp4, tmp4, data1a + cmp to_align, #8 + csel data1, tmp4, data2, lt + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f +#endif + bic has_nul1, tmp1, tmp2 + cbnz has_nul1, L(fp_le8) + bic has_nul2, tmp3, tmp4 + b L(fp_gt8) + END (STRCPY) + diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S index 7723579..7cf41d5 100644 --- a/string/aarch64/strlen-mte.S +++ b/string/aarch64/strlen-mte.S @@ -1,8 +1,8 @@ /* * strlen - calculate the length of a string. * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "asmdefs.h" +#include "../asmdefs.h" #define srcin x0 #define result x0 @@ -19,26 +19,35 @@ #define src x1 #define synd x2 #define tmp x3 +#define wtmp w3 #define shift x4 #define data q0 #define vdata v0 #define vhas_nul v1 -#define vend v2 -#define dend d2 +#define vrepmask v2 +#define vend v3 +#define dend d3 /* Core algorithm: - Process the string in 16-byte aligned chunks. Compute a 64-bit mask with - four bits per byte using the shrn instruction. A count trailing zeros then - identifies the first zero byte. */ + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. 
Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ ENTRY (__strlen_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 + mov wtmp, 0xf00f ld1 {vdata.16b}, [src] + dup vrepmask.8h, wtmp cmeq vhas_nul.16b, vdata.16b, 0 lsl shift, srcin, 2 - shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(loop) @@ -50,25 +59,19 @@ ENTRY (__strlen_aarch64_mte) .p2align 5 L(loop): - ldr data, [src, 16] - cmeq vhas_nul.16b, vdata.16b, 0 - umaxp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - cbnz synd, L(loop_end) - ldr data, [src, 32]! + ldr data, [src, 16]! cmeq vhas_nul.16b, vdata.16b, 0 umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop) - sub src, src, 16 -L(loop_end): - shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ + + and vhas_nul.16b, vhas_nul.16b, vrepmask.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ sub result, src, srcin fmov synd, dend #ifndef __AARCH64EB__ rbit synd, synd #endif - add result, result, 16 clz tmp, synd add result, result, tmp, lsr 2 ret diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S index 12ebbdb..2392493 100644 --- a/string/aarch64/strlen-sve.S +++ b/string/aarch64/strlen-sve.S @@ -1,11 +1,11 @@ /* * __strlen_aarch64_sve - compute the length of a string * - * Copyright (c) 2018-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S index 6f6f08f..a1b164a 100644 --- a/string/aarch64/strlen.S +++ b/string/aarch64/strlen.S @@ -1,8 +1,8 @@ /* * strlen - calculate the length of a string. * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * Not MTE compatible. */ -#include "asmdefs.h" +#include "../asmdefs.h" #define srcin x0 #define len x0 @@ -36,7 +36,6 @@ #define tmp x2 #define tmpw w2 #define synd x3 -#define syndw w3 #define shift x4 /* For the first 32 bytes, NUL detection works on the principle that @@ -111,6 +110,7 @@ ENTRY (__strlen_aarch64) add len, len, tmp1, lsr 3 ret + .p2align 3 /* Look for a NUL byte at offset 16..31 in the string. */ L(bytes16_31): ldp data1, data2, [srcin, 16] @@ -138,7 +138,6 @@ L(bytes16_31): add len, len, tmp1, lsr 3 ret - nop L(loop_entry): bic src, srcin, 31 @@ -154,12 +153,18 @@ L(loop): /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */ cmeq maskv.16b, datav1.16b, 0 sub len, src, srcin - cbnz syndw, 1f + tst synd, 0xffffffff + b.ne 1f cmeq maskv.16b, datav2.16b, 0 add len, len, 16 1: /* Generate a bitmask and compute correct byte offset. 
*/ - shrn maskv.8b, maskv.8h, 4 +#ifdef __AARCH64EB__ + bic maskv.8h, 0xf0 +#else + bic maskv.8h, 0x0f, lsl 8 +#endif + umaxp maskv.16b, maskv.16b, maskv.16b fmov synd, maskd #ifndef __AARCH64EB__ rbit synd, synd @@ -168,6 +173,8 @@ L(loop): add len, len, tmp, lsr 2 ret + .p2align 4 + L(page_cross): bic src, srcin, 31 mov tmpw, 0x0c03 diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S new file mode 100644 index 0000000..c9d6fc8 --- /dev/null +++ b/string/aarch64/strncmp-mte.S @@ -0,0 +1,307 @@ +/* + * strncmp - compare two strings + * + * Copyright (c) 2013-2021, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64 + */ + +#include "../asmdefs.h" + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +/* Parameters and result. */ +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result x0 + +/* Internal variables. */ +#define data1 x3 +#define data1w w3 +#define data2 x4 +#define data2w w4 +#define has_nul x5 +#define diff x6 +#define syndrome x7 +#define tmp1 x8 +#define tmp2 x9 +#define tmp3 x10 +#define zeroones x11 +#define pos x12 +#define mask x13 +#define endloop x14 +#define count mask +#define offset pos +#define neg_offset x15 + +/* Define endian dependent shift operations. + On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. + LS_BK means shifting towards later bytes. + */ +#ifdef __AARCH64EB__ +#define LS_FW lsl +#define LS_BK lsr +#else +#define LS_FW lsr +#define LS_BK lsl +#endif + +ENTRY (__strncmp_aarch64_mte) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + cbz limit, L(ret0) + eor tmp1, src1, src2 + mov zeroones, #REP8_01 + tst tmp1, #7 + and count, src1, #7 + b.ne L(misaligned8) + cbnz count, L(mutual_align) + + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. */ + .p2align 4 +L(loop_aligned): + ldr data1, [src1], #8 + ldr data2, [src2], #8 +L(start_realigned): + subs limit, limit, #8 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, hi /* Last Dword or differences. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp endloop, #0, #0, eq + b.eq L(loop_aligned) + /* End of main loop */ + +L(full_check): +#ifndef __AARCH64EB__ + orr syndrome, diff, has_nul + add limit, limit, 8 /* Rewind limit to before last subs. */ +L(syndrome_check): + /* Limit was reached. Check if the NUL byte or the difference + is before the limit. */ + rev syndrome, syndrome + rev data1, data1 + clz pos, syndrome + rev data2, data2 + lsl data1, data1, pos + cmp limit, pos, lsr #3 + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + csel result, result, xzr, hi + ret +#else + /* Not reached the limit, must have found the end or a diff. */ + tbz limit, #63, L(not_limit) + add tmp1, limit, 8 + cbz limit, L(not_limit) + + lsl limit, tmp1, #3 /* Bits -> bytes. */ + mov mask, #~0 + lsr mask, mask, limit + bic data1, data1, mask + bic data2, data2, mask + + /* Make sure that the NUL byte is marked in the syndrome. 
*/ + orr has_nul, has_nul, mask + +L(not_limit): + /* For big-endian we cannot use the trick with the syndrome value + as carry-propagation can corrupt the upper bits if the trailing + bytes in the string contain 0x01. */ + /* However, if there is no NUL byte in the dword, we can generate + the result directly. We can't just subtract the bytes as the + MSB might be significant. */ + cbnz has_nul, 1f + cmp data1, data2 + cset result, ne + cneg result, result, lo + ret +1: + /* Re-compute the NUL-byte detection, using a byte-reversed value. */ + rev tmp3, data1 + sub tmp1, tmp3, zeroones + orr tmp2, tmp3, #REP8_7f + bic has_nul, tmp1, tmp2 + rev has_nul, has_nul + orr syndrome, diff, has_nul + clz pos, syndrome + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ +L(end_quick): + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#endif + +L(mutual_align): + /* Sources are mutually aligned, but are not currently at an + alignment boundary. Round down the addresses and then mask off + the bytes that precede the start point. + We also need to adjust the limit calculations, but without + overflowing if the limit is near ULONG_MAX. */ + bic src1, src1, #7 + bic src2, src2, #7 + ldr data1, [src1], #8 + neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ + ldr data2, [src2], #8 + mov tmp2, #~0 + LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ + /* Adjust the limit and ensure it doesn't overflow. */ + adds limit, limit, count + csinv limit, limit, xzr, lo + orr data1, data1, tmp2 + orr data2, data2, tmp2 + b L(start_realigned) + + .p2align 4 + /* Don't bother with dwords for up to 16 bytes. */ +L(misaligned8): + cmp limit, #16 + b.hs L(try_misaligned_words) + +L(byte_loop): + /* Perhaps we can do better than this. */ + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + subs limit, limit, #1 + ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.eq L(byte_loop) +L(done): + sub result, data1, data2 + ret + /* Align the SRC1 to a dword by doing a bytewise compare and then do + the dword loop. */ +L(try_misaligned_words): + cbz count, L(src1_aligned) + + neg count, count + and count, count, #7 + sub limit, limit, count + +L(page_end_loop): + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + cmp data1w, #1 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.ne L(done) + subs count, count, #1 + b.hi L(page_end_loop) + + /* The following diagram explains the comparison of misaligned strings. + The bytes are shown in natural order. For little-endian, it is + reversed in the registers. The "x" bytes are before the string. + The "|" separates data that is loaded at one time. + src1 | a a a a a a a a | b b b c c c c c | . . . + src2 | x x x x x a a a a a a a a b b b | c c c c c . . . + + After shifting in each step, the data looks like this: + STEP_A STEP_B STEP_C + data1 a a a a a a a a b b b c c c c c b b b c c c c c + data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c + + The bytes with "0" are eliminated from the syndrome via mask. + + Align SRC2 down to 16 bytes. This way we can read 16 bytes at a + time from SRC2. The comparison happens in 3 steps. 
After each step + the loop can exit, or read from SRC1 or SRC2. */ +L(src1_aligned): + /* Calculate offset from 8 byte alignment to string start in bits. No + need to mask offset since shifts are ignoring upper bits. */ + lsl offset, src2, #3 + bic src2, src2, #0xf + mov mask, -1 + neg neg_offset, offset + ldr data1, [src1], #8 + ldp tmp1, tmp2, [src2], #16 + LS_BK mask, mask, neg_offset + and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ + /* Skip the first compare if data in tmp1 is irrelevant. */ + tbnz offset, 6, L(misaligned_mid_loop) + +L(loop_misaligned): + /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ + LS_FW data2, tmp1, offset + LS_BK tmp1, tmp2, neg_offset + subs limit, limit, #8 + orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ + sub has_nul, data1, zeroones + eor diff, data1, data2 /* Non-zero if differences found. */ + orr tmp3, data1, #REP8_7f + csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ + orr tmp3, endloop, has_nul + cbnz tmp3, L(full_check) + + ldr data1, [src1], #8 +L(misaligned_mid_loop): + /* STEP_B: Compare first part of data1 to second part of tmp2. */ + LS_FW data2, tmp2, offset +#ifdef __AARCH64EB__ + /* For big-endian we do a byte reverse to avoid carry-propagation + problem described above. This way we can reuse the has_nul in the + next step and also use syndrome value trick at the end. */ + rev tmp3, data1 + #define data1_fixed tmp3 +#else + #define data1_fixed data1 +#endif + sub has_nul, data1_fixed, zeroones + orr tmp3, data1_fixed, #REP8_7f + eor diff, data2, data1 /* Non-zero if differences found. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + cmp limit, neg_offset, lsr #3 + orr syndrome, diff, has_nul + bic syndrome, syndrome, mask /* Ignore later bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + /* STEP_C: Compare second part of data1 to first part of tmp1. */ + ldp tmp1, tmp2, [src2], #16 + cmp limit, #8 + LS_BK data2, tmp1, neg_offset + eor diff, data2, data1 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + and syndrome, syndrome, mask /* Ignore earlier bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + ldr data1, [src1], #8 + sub limit, limit, #8 + b L(loop_misaligned) + +#ifdef __AARCH64EB__ +L(syndrome_check): + clz pos, syndrome + cmp pos, limit, lsl #3 + b.lo L(end_quick) +#endif + +L(ret0): + mov result, #0 + ret +END(__strncmp_aarch64_mte) + diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S index 6a9e9f7..234190e 100644 --- a/string/aarch64/strncmp-sve.S +++ b/string/aarch64/strncmp-sve.S @@ -1,11 +1,11 @@ /* * strncmp - compare two strings with limit * - * Copyright (c) 2018-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S index 128a10c..738b653 100644 --- a/string/aarch64/strncmp.S +++ b/string/aarch64/strncmp.S @@ -1,20 +1,20 @@ /* * strncmp - compare two strings * - * Copyright (c) 2013-2022, Arm Limited. 
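A scalar sketch of the STEP_A recombination from the diagram above: two aligned dwords loaded from SRC2 are shifted towards and away from the early bytes and merged to reconstruct the unaligned dword that lines up with the SRC1 data (illustrative C only, not part of the patch; little-endian shown, with offset_bits = (src2 & 7) * 8, which is non-zero on this path):

#include <stdint.h>

/* LS_FW is lsr and LS_BK is lsl on little-endian; lo_word and hi_word
   are two consecutive aligned dwords read from SRC2. */
static uint64_t merge_unaligned_dword (uint64_t lo_word, uint64_t hi_word,
                                       unsigned offset_bits)
{
  return (lo_word >> offset_bits) | (hi_word << (64 - offset_bits));
}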
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2013-2021, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: * - * ARMv8-a, AArch64. - * MTE compatible. + * ARMv8-a, AArch64 */ -#include "asmdefs.h" +#include "../asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 /* Parameters and result. */ #define src1 x0 @@ -35,24 +35,10 @@ #define tmp3 x10 #define zeroones x11 #define pos x12 -#define mask x13 -#define endloop x14 +#define limit_wd x13 +#define mask x14 +#define endloop x15 #define count mask -#define offset pos -#define neg_offset x15 - -/* Define endian dependent shift operations. - On big-endian early bytes are at MSB and on little-endian LSB. - LS_FW means shifting towards early bytes. - LS_BK means shifting towards later bytes. - */ -#ifdef __AARCH64EB__ -#define LS_FW lsl -#define LS_BK lsr -#else -#define LS_FW lsr -#define LS_BK lsl -#endif ENTRY (__strncmp_aarch64) PTR_ARG (0) @@ -65,6 +51,9 @@ ENTRY (__strncmp_aarch64) and count, src1, #7 b.ne L(misaligned8) cbnz count, L(mutual_align) + /* Calculate the number of full and partial words -1. */ + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ + lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and @@ -74,52 +63,56 @@ L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 L(start_realigned): - subs limit, limit, #8 + subs limit_wd, limit_wd, #1 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, hi /* Last Dword or differences. */ + csinv endloop, diff, xzr, pl /* Last Dword or differences. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq b.eq L(loop_aligned) /* End of main loop */ -L(full_check): -#ifndef __AARCH64EB__ + /* Not reached the limit, must have found the end or a diff. */ + tbz limit_wd, #63, L(not_limit) + + /* Limit % 8 == 0 => all bytes significant. */ + ands limit, limit, #7 + b.eq L(not_limit) + + lsl limit, limit, #3 /* Bits -> bytes. */ + mov mask, #~0 +#ifdef __AARCH64EB__ + lsr mask, mask, limit +#else + lsl mask, mask, limit +#endif + bic data1, data1, mask + bic data2, data2, mask + + /* Make sure that the NUL byte is marked in the syndrome. */ + orr has_nul, has_nul, mask + +L(not_limit): orr syndrome, diff, has_nul - add limit, limit, 8 /* Rewind limit to before last subs. */ -L(syndrome_check): - /* Limit was reached. Check if the NUL byte or the difference - is before the limit. */ + +#ifndef __AARCH64EB__ rev syndrome, syndrome rev data1, data1 + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ clz pos, syndrome rev data2, data2 lsl data1, data1, pos - cmp limit, pos, lsr #3 lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. */ lsr data1, data1, #56 sub result, data1, data2, lsr #56 - csel result, result, xzr, hi ret #else - /* Not reached the limit, must have found the end or a diff. */ - tbz limit, #63, L(not_limit) - add tmp1, limit, 8 - cbz limit, L(not_limit) - - lsl limit, tmp1, #3 /* Bits -> bytes. 
*/ - mov mask, #~0 - lsr mask, mask, limit - bic data1, data1, mask - bic data2, data2, mask - - /* Make sure that the NUL byte is marked in the syndrome. */ - orr has_nul, has_nul, mask - -L(not_limit): /* For big-endian we cannot use the trick with the syndrome value as carry-propagation can corrupt the upper bits if the trailing bytes in the string contain 0x01. */ @@ -140,11 +133,10 @@ L(not_limit): rev has_nul, has_nul orr syndrome, diff, has_nul clz pos, syndrome - /* The most-significant-non-zero bit of the syndrome marks either the - first bit that is different, or the top bit of the first zero byte. + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. Shifting left now will bring the critical information into the top bits. */ -L(end_quick): lsl data1, data1, pos lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then @@ -166,12 +158,22 @@ L(mutual_align): neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ ldr data2, [src2], #8 mov tmp2, #~0 - LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ - /* Adjust the limit and ensure it doesn't overflow. */ - adds limit, limit, count - csinv limit, limit, xzr, lo + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ +#ifdef __AARCH64EB__ + /* Big-endian. Early bytes are at MSB. */ + lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */ +#else + /* Little-endian. Early bytes are at LSB. */ + lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */ +#endif + and tmp3, limit_wd, #7 + lsr limit_wd, limit_wd, #3 + /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ + add limit, limit, count + add tmp3, tmp3, count orr data1, data1, tmp2 orr data2, data2, tmp2 + add limit_wd, limit_wd, tmp3, lsr #3 b L(start_realigned) .p2align 4 @@ -194,11 +196,13 @@ L(done): /* Align the SRC1 to a dword by doing a bytewise compare and then do the dword loop. */ L(try_misaligned_words): - cbz count, L(src1_aligned) + lsr limit_wd, limit, #3 + cbz count, L(do_misaligned) neg count, count and count, count, #7 sub limit, limit, count + lsr limit_wd, limit, #3 L(page_end_loop): ldrb data1w, [src1], #1 @@ -209,100 +213,48 @@ L(page_end_loop): subs count, count, #1 b.hi L(page_end_loop) - /* The following diagram explains the comparison of misaligned strings. - The bytes are shown in natural order. For little-endian, it is - reversed in the registers. The "x" bytes are before the string. - The "|" separates data that is loaded at one time. - src1 | a a a a a a a a | b b b c c c c c | . . . - src2 | x x x x x a a a a a a a a b b b | c c c c c . . . - - After shifting in each step, the data looks like this: - STEP_A STEP_B STEP_C - data1 a a a a a a a a b b b c c c c c b b b c c c c c - data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c - - The bytes with "0" are eliminated from the syndrome via mask. - - Align SRC2 down to 16 bytes. This way we can read 16 bytes at a - time from SRC2. The comparison happens in 3 steps. After each step - the loop can exit, or read from SRC1 or SRC2. */ -L(src1_aligned): - /* Calculate offset from 8 byte alignment to string start in bits. No - need to mask offset since shifts are ignoring upper bits. */ - lsl offset, src2, #3 - bic src2, src2, #0xf - mov mask, -1 - neg neg_offset, offset - ldr data1, [src1], #8 - ldp tmp1, tmp2, [src2], #16 - LS_BK mask, mask, neg_offset - and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ - /* Skip the first compare if data in tmp1 is irrelevant. 
*/ - tbnz offset, 6, L(misaligned_mid_loop) - +L(do_misaligned): + /* Prepare ourselves for the next page crossing. Unlike the aligned + loop, we fetch 1 less dword because we risk crossing bounds on + SRC2. */ + mov count, #8 + subs limit_wd, limit_wd, #1 + b.lo L(done_loop) L(loop_misaligned): - /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ - LS_FW data2, tmp1, offset - LS_BK tmp1, tmp2, neg_offset - subs limit, limit, #8 - orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ - sub has_nul, data1, zeroones - eor diff, data1, data2 /* Non-zero if differences found. */ - orr tmp3, data1, #REP8_7f - csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ - bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ - orr tmp3, endloop, has_nul - cbnz tmp3, L(full_check) - - ldr data1, [src1], #8 -L(misaligned_mid_loop): - /* STEP_B: Compare first part of data1 to second part of tmp2. */ - LS_FW data2, tmp2, offset -#ifdef __AARCH64EB__ - /* For big-endian we do a byte reverse to avoid carry-propagation - problem described above. This way we can reuse the has_nul in the - next step and also use syndrome value trick at the end. */ - rev tmp3, data1 - #define data1_fixed tmp3 -#else - #define data1_fixed data1 -#endif - sub has_nul, data1_fixed, zeroones - orr tmp3, data1_fixed, #REP8_7f - eor diff, data2, data1 /* Non-zero if differences found. */ - bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ -#ifdef __AARCH64EB__ - rev has_nul, has_nul -#endif - cmp limit, neg_offset, lsr #3 - orr syndrome, diff, has_nul - bic syndrome, syndrome, mask /* Ignore later bytes. */ - csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ - cbnz tmp3, L(syndrome_check) - - /* STEP_C: Compare second part of data1 to first part of tmp1. */ - ldp tmp1, tmp2, [src2], #16 - cmp limit, #8 - LS_BK data2, tmp1, neg_offset - eor diff, data2, data1 /* Non-zero if differences found. */ - orr syndrome, diff, has_nul - and syndrome, syndrome, mask /* Ignore earlier bytes. */ - csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ - cbnz tmp3, L(syndrome_check) + and tmp2, src2, #0xff8 + eor tmp2, tmp2, #0xff8 + cbz tmp2, L(page_end_loop) ldr data1, [src1], #8 - sub limit, limit, #8 - b L(loop_misaligned) + ldr data2, [src2], #8 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp diff, #0, #0, eq + b.ne L(not_limit) + subs limit_wd, limit_wd, #1 + b.pl L(loop_misaligned) -#ifdef __AARCH64EB__ -L(syndrome_check): - clz pos, syndrome - cmp pos, limit, lsl #3 - b.lo L(end_quick) -#endif +L(done_loop): + /* We found a difference or a NULL before the limit was reached. */ + and limit, limit, #7 + cbz limit, L(not_limit) + /* Read the last word. */ + sub src1, src1, 8 + sub src2, src2, 8 + ldr data1, [src1, limit] + ldr data2, [src2, limit] + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp diff, #0, #0, eq + b.ne L(not_limit) L(ret0): mov result, #0 ret -END(__strncmp_aarch64) + +END ( __strncmp_aarch64) diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S index 6c43dc4..5b9ebf7 100644 --- a/string/aarch64/strnlen-sve.S +++ b/string/aarch64/strnlen-sve.S @@ -1,11 +1,11 @@ /* * strnlen - calculate the length of a string with limit. 
* - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2021, Arm Limited. + * SPDX-License-Identifier: MIT */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S index f2090a7..48d2495 100644 --- a/string/aarch64/strnlen.S +++ b/string/aarch64/strnlen.S @@ -1,8 +1,8 @@ /* * strnlen - calculate the length of a string with limit. * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "asmdefs.h" +#include "../asmdefs.h" #define srcin x0 #define cntin x1 @@ -20,30 +20,39 @@ #define src x2 #define synd x3 #define shift x4 +#define wtmp w4 #define tmp x4 #define cntrem x5 #define qdata q0 #define vdata v0 #define vhas_chr v1 -#define vend v2 -#define dend d2 +#define vrepmask v2 +#define vend v3 +#define dend d3 /* Core algorithm: - Process the string in 16-byte aligned chunks. Compute a 64-bit mask with - four bits per byte using the shrn instruction. A count trailing zeros then - identifies the first zero byte. */ + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ ENTRY (__strnlen_aarch64) PTR_ARG (0) SIZE_ARG (1) bic src, srcin, 15 + mov wtmp, 0xf00f cbz cntin, L(nomatch) - ld1 {vdata.16b}, [src] + ld1 {vdata.16b}, [src], 16 + dup vrepmask.8h, wtmp cmeq vhas_chr.16b, vdata.16b, 0 lsl shift, srcin, 2 - shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(start_loop) @@ -55,40 +64,37 @@ L(finish): csel result, cntin, result, ls ret -L(nomatch): - mov result, cntin - ret - L(start_loop): sub tmp, src, srcin - add tmp, tmp, 17 subs cntrem, cntin, tmp - b.lo L(nomatch) + b.ls L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - tbz cntrem, 4, L(loop32_2) - sub src, src, 16 + add tmp, cntrem, 15 + tbnz tmp, 4, L(loop32_2) + .p2align 5 L(loop32): - ldr qdata, [src, 32]! 
+ ldr qdata, [src], 16 cmeq vhas_chr.16b, vdata.16b, 0 umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src, 16] + ldr qdata, [src], 16 subs cntrem, cntrem, 32 cmeq vhas_chr.16b, vdata.16b, 0 - b.lo L(end_2) + b.ls L(end) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) -L(end_2): - add src, src, 16 + L(end): - shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + sub src, src, 16 + mov synd, vend.d[0] sub result, src, srcin - fmov synd, dend #ifndef __AARCH64EB__ rbit synd, synd #endif @@ -98,5 +104,9 @@ L(end): csel result, cntin, result, ls ret +L(nomatch): + mov result, cntin + ret + END (__strnlen_aarch64) diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S index bb61ab9..1e4fb1a 100644 --- a/string/aarch64/strrchr-mte.S +++ b/string/aarch64/strrchr-mte.S @@ -1,8 +1,8 @@ /* * strrchr - find last position of a character in a string. * - * Copyright (c) 2020-2023, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "asmdefs.h" +#include "../asmdefs.h" #define srcin x0 #define chrin w1 @@ -19,6 +19,7 @@ #define src x2 #define tmp x3 +#define wtmp w3 #define synd x3 #define shift x4 #define src_match x4 @@ -30,6 +31,7 @@ #define vhas_nul v2 #define vhas_chr v3 #define vrepmask v4 +#define vrepmask2 v5 #define vend v5 #define dend d5 @@ -45,67 +47,55 @@ ENTRY (__strrchr_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin - movi vrepmask.16b, 0x33 - ld1 {vdata.16b}, [src] + mov wtmp, 0x3003 + dup vrepmask.8h, wtmp + tst srcin, 15 + beq L(loop1) + + ld1 {vdata.16b}, [src], 16 cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + mov wtmp, 0xf00f + dup vrepmask2.8h, wtmp bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - shrn vend.8b, vhas_nul.8h, 4 + and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b + addp vend.16b, vhas_nul.16b, vhas_nul.16b lsl shift, srcin, 2 fmov synd, dend lsr synd, synd, shift lsl synd, synd, shift ands nul_match, synd, 0xcccccccccccccccc bne L(tail) - cbnz synd, L(loop2_start) + cbnz synd, L(loop2) - .p2align 4 + .p2align 5 L(loop1): - ldr q1, [src, 16] - cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b - umaxp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - cbnz synd, L(loop1_end) - ldr q1, [src, 32]! 
+ ld1 {vdata.16b}, [src], 16 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop1) - sub src, src, 16 -L(loop1_end): - add src, src, 16 + cmeq vhas_nul.16b, vdata.16b, 0 -#ifdef __AARCH64EB__ - bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b - shrn vend.8b, vhas_nul.8h, 4 - fmov synd, dend - rbit synd, synd -#else bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - shrn vend.8b, vhas_nul.8h, 4 + bic vhas_nul.8h, 0x0f, lsl 8 + addp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend -#endif ands nul_match, synd, 0xcccccccccccccccc - beq L(loop2_start) + beq L(loop2) + L(tail): sub nul_match, nul_match, 1 and chr_match, synd, 0x3333333333333333 ands chr_match, chr_match, nul_match - add result, src, 15 + sub result, src, 1 clz tmp, chr_match sub result, result, tmp, lsr 2 csel result, result, xzr, ne ret .p2align 4 - nop - nop -L(loop2_start): - add src, src, 16 - bic vrepmask.8h, 0xf0 - L(loop2): cmp synd, 0 csel src_match, src, src_match, ne diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S index 825a738..d36d69a 100644 --- a/string/aarch64/strrchr-sve.S +++ b/string/aarch64/strrchr-sve.S @@ -1,11 +1,11 @@ /* * strrchr - find the last of a character in a string * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2021, Arm Limited. + * SPDX-License-Identifier: MIT */ -#include "asmdefs.h" +#include "../asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S index bf9cb29..56185ff 100644 --- a/string/aarch64/strrchr.S +++ b/string/aarch64/strrchr.S @@ -1,8 +1,8 @@ /* * strrchr - find last position of a character in a string. * - * Copyright (c) 2014-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2014-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "asmdefs.h" +#include "../asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c index e070be5..d5d4ea7 100644 --- a/string/bench/memcpy.c +++ b/string/bench/memcpy.c @@ -1,8 +1,8 @@ /* * memcpy benchmark. * - * Copyright (c) 2020-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT */ #define _GNU_SOURCE @@ -13,15 +13,14 @@ #include "stringlib.h" #include "benchlib.h" -#define ITERS 5000 +#define ITERS 5000 #define ITERS2 20000000 -#define ITERS3 200000 -#define NUM_TESTS 16384 -#define MIN_SIZE 32768 -#define MAX_SIZE (1024 * 1024) +#define ITERS3 500000 +#define MAX_COPIES 8192 +#define SIZE (256*1024) -static uint8_t a[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64))); -static uint8_t b[MAX_SIZE + 4096 + 64] __attribute__((__aligned__(64))); +static uint8_t a[SIZE + 4096] __attribute__((__aligned__(64))); +static uint8_t b[SIZE + 4096] __attribute__((__aligned__(64))); #define F(x) {#x, x}, @@ -31,21 +30,15 @@ static const struct fun void *(*fun)(void *, const void *, size_t); } funtab[] = { + F(memcpy) #if __aarch64__ F(__memcpy_aarch64) # if __ARM_NEON F(__memcpy_aarch64_simd) # endif -# if __ARM_FEATURE_SVE - F(__memcpy_aarch64_sve) -# endif -# if WANT_MOPS - F(__memcpy_aarch64_mops) -# endif #elif __arm__ F(__memcpy_arm) #endif - F(memcpy) #undef F {0, 0} }; @@ -116,7 +109,7 @@ typedef struct uint64_t len : 16; } copy_t; -static copy_t test_arr[NUM_TESTS]; +static copy_t copy[MAX_COPIES]; typedef char *(*proto_t) (char *, const char *, size_t); @@ -147,14 +140,14 @@ init_copies (size_t max_size) size_t total = 0; /* Create a random set of copies with the given size and alignment distributions. */ - for (int i = 0; i < NUM_TESTS; i++) + for (int i = 0; i < MAX_COPIES; i++) { - test_arr[i].dst = (rand32 (0) & (max_size - 1)); - test_arr[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK]; - test_arr[i].src = (rand32 (0) & (max_size - 1)); - test_arr[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK]; - test_arr[i].len = size_arr[rand32 (0) & SIZE_MASK]; - total += test_arr[i].len; + copy[i].dst = (rand32 (0) & (max_size - 1)); + copy[i].dst &= ~dst_align_arr[rand32 (0) & ALIGN_MASK]; + copy[i].src = (rand32 (0) & (max_size - 1)); + copy[i].src &= ~src_align_arr[rand32 (0) & ALIGN_MASK]; + copy[i].len = size_arr[rand32 (0) & SIZE_MASK]; + total += copy[i].len; } return total; @@ -167,27 +160,25 @@ int main (void) memset (a, 1, sizeof (a)); memset (b, 2, sizeof (b)); - printf("Random memcpy (bytes/ns):\n"); + printf("Random memcpy:\n"); for (int f = 0; funtab[f].name != 0; f++) { size_t total = 0; uint64_t tsum = 0; - printf ("%22s ", funtab[f].name); + printf ("%22s (B/ns) ", funtab[f].name); rand32 (0x12345678); - for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) + for (int size = 16384; size <= SIZE; size *= 2) { size_t copy_size = init_copies (size) * ITERS; - for (int c = 0; c < NUM_TESTS; c++) - funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src, - test_arr[c].len); + for (int c = 0; c < MAX_COPIES; c++) + funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len); uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS; i++) - for (int c = 0; c < NUM_TESTS; c++) - funtab[f].fun (b + test_arr[c].dst, a + test_arr[c].src, - test_arr[c].len); + for (int c = 0; c < MAX_COPIES; c++) + funtab[f].fun (b + copy[c].dst, a + copy[c].src, copy[c].len); t = clock_get_ns () - t; total += copy_size; tsum += t; @@ -196,147 +187,74 @@ int main (void) printf( "avg %.2f\n", (double)total / tsum); } - size_t total = 0; - uint64_t tsum = 0; - printf ("%22s ", "memcpy_call"); - rand32 (0x12345678); - - for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) - { - size_t copy_size = init_copies (size) * ITERS; - - for (int c = 0; c < NUM_TESTS; c++) - memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); - - uint64_t t 
= clock_get_ns (); - for (int i = 0; i < ITERS; i++) - for (int c = 0; c < NUM_TESTS; c++) - memcpy (b + test_arr[c].dst, a + test_arr[c].src, test_arr[c].len); - t = clock_get_ns () - t; - total += copy_size; - tsum += t; - printf ("%dK: %.2f ", size / 1024, (double)copy_size / t); - } - printf( "avg %.2f\n", (double)total / tsum); - - - printf ("\nAligned medium memcpy (bytes/ns):\n"); + printf ("\nMedium memcpy:\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s ", funtab[f].name); + printf ("%22s (B/ns) ", funtab[f].name); - for (int size = 8; size <= 512; size *= 2) + for (int size = 16; size <= 512; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS2; i++) funtab[f].fun (b, a, size); t = clock_get_ns () - t; - printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t); } printf ("\n"); } - printf ("%22s ", "memcpy_call"); - for (int size = 8; size <= 512; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS2; i++) - memcpy (b, a, size); - t = clock_get_ns () - t; - printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); - } - printf ("\n"); - - - printf ("\nUnaligned medium memcpy (bytes/ns):\n"); + printf ("\nLarge memcpy:\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s ", funtab[f].name); + printf ("%22s (B/ns) ", funtab[f].name); - for (int size = 8; size <= 512; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS2; i++) - funtab[f].fun (b + 3, a + 1, size); - t = clock_get_ns () - t; - printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); - } - printf ("\n"); - } - - printf ("%22s ", "memcpy_call"); - for (int size = 8; size <= 512; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS2; i++) - memcpy (b + 3, a + 1, size); - t = clock_get_ns () - t; - printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); - } - printf ("\n"); - - - printf ("\nLarge memcpy (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) - { - printf ("%22s ", funtab[f].name); - - for (int size = 1024; size <= 65536; size *= 2) + for (int size = 1024; size <= 32768; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (b, a, size); t = clock_get_ns () - t; - printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); } printf ("\n"); } - printf ("%22s ", "memcpy_call"); - for (int size = 1024; size <= 65536; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS3; i++) - memcpy (b, a, size); - t = clock_get_ns () - t; - printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); - } - printf ("\n"); - - - printf ("\nUnaligned forwards memmove (bytes/ns):\n"); + printf ("\nUnaligned forwards memmove:\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s ", funtab[f].name); + printf ("%22s (B/ns) ", funtab[f].name); - for (int size = 1024; size <= 65536; size *= 2) + for (int size = 1024; size <= 32768; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (a, a + 256 + (i & 31), size); t = clock_get_ns () - t; - printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 
'B' : 'K', (double)size * ITERS3 / t); } printf ("\n"); } - printf ("\nUnaligned backwards memmove (bytes/ns):\n"); + printf ("\nUnaligned backwards memmove:\n"); for (int f = 0; funtab[f].name != 0; f++) { - printf ("%22s ", funtab[f].name); + printf ("%22s (B/ns) ", funtab[f].name); - for (int size = 1024; size <= 65536; size *= 2) + for (int size = 1024; size <= 32768; size *= 2) { uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS3; i++) funtab[f].fun (a + 256 + (i & 31), a, size); t = clock_get_ns () - t; - printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); + printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024, + size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t); } printf ("\n"); } - printf ("\n"); return 0; } diff --git a/string/bench/memset.c b/string/bench/memset.c deleted file mode 100644 index 990e23b..0000000 --- a/string/bench/memset.c +++ /dev/null @@ -1,243 +0,0 @@ -/* - * memset benchmark. - * - * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include "stringlib.h" -#include "benchlib.h" - -#define ITERS 5000 -#define ITERS2 20000000 -#define ITERS3 1000000 -#define NUM_TESTS 16384 -#define MIN_SIZE 32768 -#define MAX_SIZE (1024 * 1024) - -static uint8_t a[MAX_SIZE + 4096] __attribute__((__aligned__(64))); - -#define F(x) {#x, x}, - -static const struct fun -{ - const char *name; - void *(*fun)(void *, int, size_t); -} funtab[] = -{ -#if __aarch64__ - F(__memset_aarch64) -#elif __arm__ - F(__memset_arm) -#endif - F(memset) -#undef F - {0, 0} -}; - -typedef struct { uint32_t offset : 20, len : 12; } memset_test_t; -static memset_test_t test_arr[NUM_TESTS]; - -typedef struct { uint16_t size; uint16_t freq; } freq_data_t; -typedef struct { uint8_t align; uint16_t freq; } align_data_t; - -#define SIZE_NUM 65536 -#define SIZE_MASK (SIZE_NUM-1) -static uint8_t len_arr[SIZE_NUM]; - -/* Frequency data for memset sizes up to 4096 based on SPEC2017. 
*/ -static freq_data_t memset_len_freq[] = -{ -{40,28817}, {32,15336}, { 16,3823}, {296,3545}, { 24,3454}, { 8,1412}, -{292,1202}, { 48, 927}, { 12, 613}, { 11, 539}, {284, 493}, {108, 414}, -{ 88, 380}, { 20, 295}, {312, 271}, { 72, 233}, { 2, 200}, { 4, 192}, -{ 15, 180}, { 14, 174}, { 13, 160}, { 56, 151}, { 36, 144}, { 64, 140}, -{4095,133}, { 10, 130}, { 9, 124}, { 3, 124}, { 28, 120}, { 0, 118}, -{288, 110}, {1152, 96}, {104, 90}, { 1, 86}, {832, 76}, {248, 74}, -{1024, 69}, {120, 64}, {512, 63}, {384, 60}, { 6, 59}, { 80, 54}, -{ 17, 50}, { 7, 49}, {520, 47}, {2048, 39}, {256, 37}, {864, 33}, -{1440, 28}, { 22, 27}, {2056, 24}, {260, 23}, { 68, 23}, { 5, 22}, -{ 18, 21}, {200, 18}, {2120, 18}, { 60, 17}, { 52, 16}, {336, 15}, -{ 44, 13}, {192, 13}, {160, 12}, {2064, 12}, {128, 12}, { 76, 11}, -{164, 11}, {152, 10}, {136, 9}, {488, 7}, { 96, 6}, {560, 6}, -{1016, 6}, {112, 5}, {232, 5}, {168, 5}, {952, 5}, {184, 5}, -{144, 4}, {252, 4}, { 84, 3}, {960, 3}, {3808, 3}, {244, 3}, -{280, 3}, {224, 3}, {156, 3}, {1088, 3}, {440, 3}, {216, 2}, -{304, 2}, { 23, 2}, { 25, 2}, { 26, 2}, {264, 2}, {328, 2}, -{1096, 2}, {240, 2}, {1104, 2}, {704, 2}, {1664, 2}, {360, 2}, -{808, 1}, {544, 1}, {236, 1}, {720, 1}, {368, 1}, {424, 1}, -{640, 1}, {1112, 1}, {552, 1}, {272, 1}, {776, 1}, {376, 1}, -{ 92, 1}, {536, 1}, {824, 1}, {496, 1}, {760, 1}, {792, 1}, -{504, 1}, {344, 1}, {1816, 1}, {880, 1}, {176, 1}, {320, 1}, -{352, 1}, {2008, 1}, {208, 1}, {408, 1}, {228, 1}, {2072, 1}, -{568, 1}, {220, 1}, {616, 1}, {600, 1}, {392, 1}, {696, 1}, -{2144, 1}, {1280, 1}, {2136, 1}, {632, 1}, {584, 1}, {456, 1}, -{472, 1}, {3440, 1}, {2088, 1}, {680, 1}, {2928, 1}, {212, 1}, -{648, 1}, {1752, 1}, {664, 1}, {3512, 1}, {1032, 1}, {528, 1}, -{4072, 1}, {204, 1}, {2880, 1}, {3392, 1}, {712, 1}, { 59, 1}, -{736, 1}, {592, 1}, {2520, 1}, {744, 1}, {196, 1}, {172, 1}, -{728, 1}, {2040, 1}, {1192, 1}, {3600, 1}, {0, 0} -}; - -#define ALIGN_NUM 1024 -#define ALIGN_MASK (ALIGN_NUM-1) -static uint8_t align_arr[ALIGN_NUM]; - -/* Alignment data for memset based on SPEC2017. */ -static align_data_t memset_align_freq[] = -{ - {16, 338}, {8, 307}, {32, 148}, {64, 131}, {4, 72}, {1, 23}, {2, 5}, {0, 0} -}; - -static void -init_memset_distribution (void) -{ - int i, j, freq, size, n; - - for (n = i = 0; (freq = memset_len_freq[i].freq) != 0; i++) - for (j = 0, size = memset_len_freq[i].size; j < freq; j++) - len_arr[n++] = size; - assert (n == SIZE_NUM); - - for (n = i = 0; (freq = memset_align_freq[i].freq) != 0; i++) - for (j = 0, size = memset_align_freq[i].align; j < freq; j++) - align_arr[n++] = size - 1; - assert (n == ALIGN_NUM); -} - -static size_t -init_memset (size_t max_size) -{ - size_t total = 0; - /* Create a random set of memsets with the given size and alignment - distributions. 
*/ - for (int i = 0; i < NUM_TESTS; i++) - { - test_arr[i].offset = (rand32 (0) & (max_size - 1)); - test_arr[i].offset &= ~align_arr[rand32 (0) & ALIGN_MASK]; - test_arr[i].len = len_arr[rand32 (0) & SIZE_MASK]; - total += test_arr[i].len; - } - - return total; -} - - -int main (void) -{ - init_memset_distribution (); - - memset (a, 1, sizeof (a)); - - printf("Random memset (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) - { - size_t total_size = 0; - uint64_t tsum = 0; - printf ("%22s ", funtab[f].name); - rand32 (0x12345678); - - for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) - { - size_t memset_size = init_memset (size) * ITERS; - - for (int c = 0; c < NUM_TESTS; c++) - funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len); - - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS; i++) - for (int c = 0; c < NUM_TESTS; c++) - funtab[f].fun (a + test_arr[c].offset, 0, test_arr[c].len); - t = clock_get_ns () - t; - total_size += memset_size; - tsum += t; - printf ("%dK: %.2f ", size / 1024, (double)memset_size / t); - } - printf( "avg %.2f\n", (double)total_size / tsum); - } - - size_t total_size = 0; - uint64_t tsum = 0; - printf ("%22s ", "memset_call"); - rand32 (0x12345678); - - for (int size = MIN_SIZE; size <= MAX_SIZE; size *= 2) - { - size_t memset_size = init_memset (size) * ITERS; - - for (int c = 0; c < NUM_TESTS; c++) - memset (a + test_arr[c].offset, 0, test_arr[c].len); - - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS; i++) - for (int c = 0; c < NUM_TESTS; c++) - memset (a + test_arr[c].offset, 0, test_arr[c].len); - t = clock_get_ns () - t; - total_size += memset_size; - tsum += t; - printf ("%dK: %.2f ", size / 1024, (double)memset_size / t); - } - printf( "avg %.2f\n", (double)total_size / tsum); - - - printf ("\nMedium memset (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) - { - printf ("%22s ", funtab[f].name); - - for (int size = 8; size <= 512; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS2; i++) - funtab[f].fun (a, 0, size); - t = clock_get_ns () - t; - printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); - } - printf ("\n"); - } - - printf ("%22s ", "memset_call"); - for (int size = 8; size <= 512; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS2; i++) - memset (a, 0, size); - t = clock_get_ns () - t; - printf ("%dB: %.2f ", size, (double)size * ITERS2 / t); - } - - - printf ("\nLarge memset (bytes/ns):\n"); - for (int f = 0; funtab[f].name != 0; f++) - { - printf ("%22s ", funtab[f].name); - - for (int size = 1024; size <= 65536; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS3; i++) - funtab[f].fun (a, 0, size); - t = clock_get_ns () - t; - printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); - } - printf ("\n"); - } - - printf ("%22s ", "memset_call"); - for (int size = 1024; size <= 65536; size *= 2) - { - uint64_t t = clock_get_ns (); - for (int i = 0; i < ITERS3; i++) - memset (a, 0, size); - t = clock_get_ns () - t; - printf ("%dK: %.2f ", size / 1024, (double)size * ITERS3 / t); - } - printf ("\n\n"); - - return 0; -} diff --git a/string/bench/strlen.c b/string/bench/strlen.c index f05d0d5..cc0f04b 100644 --- a/string/bench/strlen.c +++ b/string/bench/strlen.c @@ -1,8 +1,8 @@ /* * strlen benchmark. * - * Copyright (c) 2020-2021, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT */ #define _GNU_SOURCE @@ -13,10 +13,10 @@ #include "stringlib.h" #include "benchlib.h" -#define ITERS 5000 +#define ITERS 2000 #define ITERS2 20000000 #define ITERS3 2000000 -#define NUM_TESTS 16384 +#define NUM_STRLEN 16384 #define MAX_ALIGN 32 #define MAX_STRLEN 256 @@ -49,7 +49,7 @@ static const struct fun }; #undef F -static uint16_t strlen_tests[NUM_TESTS]; +static uint16_t strlen_tests[NUM_STRLEN]; typedef struct { uint16_t size; uint16_t freq; } freq_data_t; typedef struct { uint8_t align; uint16_t freq; } align_data_t; @@ -117,7 +117,7 @@ init_strlen_tests (void) /* Create a random set of strlen input strings using the string length and alignment distributions. */ - for (int n = 0; n < NUM_TESTS; n++) + for (int n = 0; n < NUM_STRLEN; n++) { int align = strlen_align_arr[rand32 (0) & ALIGN_MASK]; int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK]; @@ -141,14 +141,14 @@ int main (void) size_t res = 0, strlen_size = 0, mask = maskv; printf ("%22s ", funtab[f].name); - for (int c = 0; c < NUM_TESTS; c++) + for (int c = 0; c < NUM_STRLEN; c++) strlen_size += funtab[f].fun (a + strlen_tests[c]); strlen_size *= ITERS; /* Measure latency of strlen result with (res & mask). */ uint64_t t = clock_get_ns (); for (int i = 0; i < ITERS; i++) - for (int c = 0; c < NUM_TESTS; c++) + for (int c = 0; c < NUM_STRLEN; c++) res = funtab[f].fun (a + strlen_tests[c] + (res & mask)); t = clock_get_ns () - t; printf ("%.2f\n", (double)strlen_size / t); diff --git a/string/include/benchlib.h b/string/include/benchlib.h index f1bbea3..0f2ce2e 100644 --- a/string/include/benchlib.h +++ b/string/include/benchlib.h @@ -2,7 +2,7 @@ * Benchmark support functions. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/string/include/stringlib.h b/string/include/stringlib.h index 650c52c..378c3cd 100644 --- a/string/include/stringlib.h +++ b/string/include/stringlib.h @@ -1,8 +1,8 @@ /* * Public API. * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2021, Arm Limited. 
+ * SPDX-License-Identifier: MIT */ #include @@ -29,17 +29,19 @@ size_t __strlen_aarch64 (const char *); size_t __strnlen_aarch64 (const char *, size_t); int __strncmp_aarch64 (const char *, const char *, size_t); void * __memchr_aarch64_mte (const void *, int, size_t); +char *__strcpy_aarch64_mte (char *__restrict, const char *__restrict); +char *__stpcpy_aarch64_mte (char *__restrict, const char *__restrict); char *__strchr_aarch64_mte (const char *, int); char * __strchrnul_aarch64_mte (const char *, int ); size_t __strlen_aarch64_mte (const char *); char *__strrchr_aarch64_mte (const char *, int); +int __strcmp_aarch64_mte (const char *, const char *); +int __strncmp_aarch64_mte (const char *, const char *, size_t); #if __ARM_NEON void *__memcpy_aarch64_simd (void *__restrict, const void *__restrict, size_t); void *__memmove_aarch64_simd (void *, const void *, size_t); #endif # if __ARM_FEATURE_SVE -void *__memcpy_aarch64_sve (void *__restrict, const void *__restrict, size_t); -void *__memmove_aarch64_sve (void *__restrict, const void *__restrict, size_t); void *__memchr_aarch64_sve (const void *, int, size_t); int __memcmp_aarch64_sve (const void *, const void *, size_t); char *__strchr_aarch64_sve (const char *, int); @@ -52,11 +54,6 @@ size_t __strlen_aarch64_sve (const char *); size_t __strnlen_aarch64_sve (const char *, size_t); int __strncmp_aarch64_sve (const char *, const char *, size_t); # endif -# if WANT_MOPS -void *__memcpy_aarch64_mops (void *__restrict, const void *__restrict, size_t); -void *__memmove_aarch64_mops (void *__restrict, const void *__restrict, size_t); -void *__memset_aarch64_mops (void *, int, size_t); -# endif # if __ARM_FEATURE_MEMORY_TAGGING void *__mtag_tag_region (void *, size_t); void *__mtag_tag_zero_region (void *, size_t); diff --git a/string/test/__mtag_tag_region.c b/string/test/__mtag_tag_region.c index c45fa66..d8c02d9 100644 --- a/string/test/__mtag_tag_region.c +++ b/string/test/__mtag_tag_region.c @@ -2,7 +2,7 @@ * __mtag_tag_region test. * * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST diff --git a/string/test/__mtag_tag_zero_region.c b/string/test/__mtag_tag_zero_region.c index a4a7861..221c223 100644 --- a/string/test/__mtag_tag_zero_region.c +++ b/string/test/__mtag_tag_zero_region.c @@ -2,7 +2,7 @@ * __mtag_tag_zero_region test. * * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #if __ARM_FEATURE_MEMORY_TAGGING && WANT_MTE_TEST diff --git a/string/test/memchr.c b/string/test/memchr.c index c6a9448..0ff77f5 100644 --- a/string/test/memchr.c +++ b/string/test/memchr.c @@ -2,7 +2,7 @@ * memchr test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/string/test/memcmp.c b/string/test/memcmp.c index f9236b8..7a7cf9c 100644 --- a/string/test/memcmp.c +++ b/string/test/memcmp.c @@ -2,7 +2,7 @@ * memcmp test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/string/test/memcpy.c b/string/test/memcpy.c index 0c2c75a..ce0ceee 100644 --- a/string/test/memcpy.c +++ b/string/test/memcpy.c @@ -1,8 +1,8 @@ /* * memcpy test. * - * Copyright (c) 2019-2022, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #include @@ -28,12 +28,6 @@ static const struct fun # if __ARM_NEON F(__memcpy_aarch64_simd, 1) # endif -# if __ARM_FEATURE_SVE - F(__memcpy_aarch64_sve, 1) -# endif -# if WANT_MOPS - F(__memcpy_aarch64_mops, 1) -# endif #elif __arm__ F(__memcpy_arm, 0) #endif diff --git a/string/test/memmove.c b/string/test/memmove.c index a5149d7..689b68c 100644 --- a/string/test/memmove.c +++ b/string/test/memmove.c @@ -1,8 +1,8 @@ /* * memmove test. * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #include @@ -28,12 +28,6 @@ static const struct fun # if __ARM_NEON F(__memmove_aarch64_simd, 1) # endif -# if __ARM_FEATURE_SVE - F(__memmove_aarch64_sve, 1) -# endif -# if WANT_MOPS - F(__memmove_aarch64_mops, 1) -# endif #endif {0, 0, 0} // clang-format on diff --git a/string/test/memrchr.c b/string/test/memrchr.c index 4171a56..adf96f0 100644 --- a/string/test/memrchr.c +++ b/string/test/memrchr.c @@ -2,7 +2,7 @@ * memchr test. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #ifndef _GNU_SOURCE diff --git a/string/test/memset.c b/string/test/memset.c index 3489e29..f172144 100644 --- a/string/test/memset.c +++ b/string/test/memset.c @@ -2,7 +2,7 @@ * memset test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include @@ -25,9 +25,6 @@ static const struct fun F(memset, 0) #if __aarch64__ F(__memset_aarch64, 1) -# if WANT_MOPS - F(__memset_aarch64_mops, 1) -# endif #elif __arm__ F(__memset_arm, 0) #endif diff --git a/string/test/mte.h b/string/test/mte.h index 40b0ecf..e67cbd9 100644 --- a/string/test/mte.h +++ b/string/test/mte.h @@ -2,7 +2,7 @@ * Memory tagging testing code. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #ifndef __TEST_MTE_H diff --git a/string/test/stpcpy.c b/string/test/stpcpy.c index 0300892..1827e68 100644 --- a/string/test/stpcpy.c +++ b/string/test/stpcpy.c @@ -1,8 +1,8 @@ /* * stpcpy test. * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #ifndef _GNU_SOURCE @@ -28,7 +28,8 @@ static const struct fun // clang-format off F(stpcpy, 0) #if __aarch64__ - F(__stpcpy_aarch64, 1) + F(__stpcpy_aarch64, 0) + F(__stpcpy_aarch64_mte, 1) # if __ARM_FEATURE_SVE F(__stpcpy_aarch64_sve, 1) # endif diff --git a/string/test/strchr.c b/string/test/strchr.c index 66180ac..f3ae982 100644 --- a/string/test/strchr.c +++ b/string/test/strchr.c @@ -2,7 +2,7 @@ * strchr test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/string/test/strchrnul.c b/string/test/strchrnul.c index aad0bf5..6c30ab2 100644 --- a/string/test/strchrnul.c +++ b/string/test/strchrnul.c @@ -2,7 +2,7 @@ * strchrnul test. * * Copyright (c) 2019-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #ifndef _GNU_SOURCE diff --git a/string/test/strcmp.c b/string/test/strcmp.c index 4aa95f4..d57b54e 100644 --- a/string/test/strcmp.c +++ b/string/test/strcmp.c @@ -1,8 +1,8 @@ /* * strcmp test. * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #include @@ -24,7 +24,8 @@ static const struct fun // clang-format off F(strcmp, 0) #if __aarch64__ - F(__strcmp_aarch64, 1) + F(__strcmp_aarch64, 0) + F(__strcmp_aarch64_mte, 1) # if __ARM_FEATURE_SVE F(__strcmp_aarch64_sve, 1) # endif diff --git a/string/test/strcpy.c b/string/test/strcpy.c index af297f9..e84cace 100644 --- a/string/test/strcpy.c +++ b/string/test/strcpy.c @@ -1,8 +1,8 @@ /* * strcpy test. * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #include @@ -24,7 +24,8 @@ static const struct fun // clang-format off F(strcpy, 0) #if __aarch64__ - F(__strcpy_aarch64, 1) + F(__strcpy_aarch64, 0) + F(__strcpy_aarch64_mte, 1) # if __ARM_FEATURE_SVE F(__strcpy_aarch64_sve, 1) # endif diff --git a/string/test/stringtest.h b/string/test/stringtest.h index 6bb7e1f..fe855fc 100644 --- a/string/test/stringtest.h +++ b/string/test/stringtest.h @@ -2,7 +2,7 @@ * Common string test code. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/string/test/strlen.c b/string/test/strlen.c index 47ef3dc..6278380 100644 --- a/string/test/strlen.c +++ b/string/test/strlen.c @@ -1,14 +1,15 @@ /* * strlen test. * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #include #include #include #include +#include #include #include "mte.h" #include "stringlib.h" diff --git a/string/test/strncmp.c b/string/test/strncmp.c index 4bbab6f..018a8a4 100644 --- a/string/test/strncmp.c +++ b/string/test/strncmp.c @@ -1,8 +1,8 @@ /* * strncmp test. * - * Copyright (c) 2019-2022, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * Copyright (c) 2019-2020, Arm Limited. + * SPDX-License-Identifier: MIT */ #include @@ -24,7 +24,8 @@ static const struct fun // clang-format off F(strncmp, 0) #if __aarch64__ - F(__strncmp_aarch64, 1) + F(__strncmp_aarch64, 0) + F(__strncmp_aarch64_mte, 1) # if __ARM_FEATURE_SVE F(__strncmp_aarch64_sve, 1) # endif diff --git a/string/test/strnlen.c b/string/test/strnlen.c index a800fd1..0dea00e 100644 --- a/string/test/strnlen.c +++ b/string/test/strnlen.c @@ -2,7 +2,7 @@ * strnlen test. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #ifndef _GNU_SOURCE diff --git a/string/test/strrchr.c b/string/test/strrchr.c index 580ca49..fedbdc5 100644 --- a/string/test/strrchr.c +++ b/string/test/strrchr.c @@ -2,7 +2,7 @@ * strrchr test. * * Copyright (c) 2019-2021, Arm Limited. 
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #include diff --git a/string/x86_64/check-arch.S b/string/x86_64/check-arch.S index 5afcf7b..26ade0a 100644 --- a/string/x86_64/check-arch.S +++ b/string/x86_64/check-arch.S @@ -2,7 +2,7 @@ * check ARCH setting. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + * SPDX-License-Identifier: MIT */ #if !__x86_64__ -- Gitee
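
Note on the word-at-a-time NUL test restored in the strncmp/strnlen hunks above: the sequence `sub tmp1, data1, zeroones` / `orr tmp2, data1, #REP8_7f` / `bics has_nul, tmp1, tmp2` is the classic "does this 64-bit word contain a zero byte" bit trick. The stand-alone C sketch below models that trick only; it is an illustration, not code from the patch, and the helper name has_nul_byte is invented for the example.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define REP8_01 0x0101010101010101ULL   /* the "zeroones" constant in the asm */
#define REP8_7F 0x7f7f7f7f7f7f7f7fULL   /* the REP8_7f constant in the asm    */
#define REP8_80 0x8080808080808080ULL

/* Non-zero iff the 64-bit word contains at least one 0x00 byte.
   Mirrors the asm: tmp1 = x - REP8_01; tmp2 = x | REP8_7F;
   has_nul = tmp1 & ~tmp2, i.e. (x - 0x01..01) & ~x & 0x80..80.  */
static inline uint64_t
has_nul_byte (uint64_t x)
{
  return (x - REP8_01) & ~x & REP8_80;
}

int
main (void)
{
  uint64_t w1, w2;
  memcpy (&w1, "abcdefgh", 8);     /* no NUL byte in the word   */
  memcpy (&w2, "abc\0efgh", 8);    /* NUL in the fourth byte    */
  printf ("%d %d\n", has_nul_byte (w1) != 0, has_nul_byte (w2) != 0);
  return 0;
}

Compiled on its own, this prints "0 1": the predicate is exact (no false positives), which is why the reverted code can branch on it directly.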
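The strnlen.S hunk also restores a comment describing a 64-bit syndrome with four bits per byte, where counting trailing zeros locates the first matching byte of a 16-byte chunk. Below is a minimal scalar C model of that idea, assuming little-endian byte order (the patch's big-endian paths apply rbit first); the real code builds the syndrome with NEON cmeq plus narrowing, not a byte loop, and the function names here are illustrative only.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Build a 64-bit syndrome for a 16-byte chunk: each byte equal to `c`
   (or equal to NUL, when nul_too is set) contributes an all-ones nibble
   at 4 * byte_index.  16 bytes * 4 bits = 64 bits, so the whole chunk
   fits in one general-purpose register.  */
static uint64_t
make_syndrome (const unsigned char *chunk, unsigned char c, int nul_too)
{
  uint64_t synd = 0;
  for (int i = 0; i < 16; i++)
    if (chunk[i] == c || (nul_too && chunk[i] == 0))
      synd |= 0xfULL << (4 * i);
  return synd;
}

/* Index of the first matching byte, or -1 if none: count trailing zeros
   and divide by four, exactly as the restored comment describes.  */
static int
first_match (uint64_t synd)
{
  return synd ? __builtin_ctzll (synd) / 4 : -1;
}

int
main (void)
{
  unsigned char buf[16];
  memcpy (buf, "hello, syndrome!", 16);
  printf ("first 'o' at byte %d\n", first_match (make_syndrome (buf, 'o', 0)));
  return 0;
}

This prints "first 'o' at byte 4". Using a full nibble per byte is what lets the reverted code merge adjacent bytes with addp and still recover an exact byte index from a single clz/ctz, which is the property the comment in the hunk is pointing at.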