diff --git a/compose.yaml b/compose.yaml index 84481e1af76..2bd38a381e8 100644 --- a/compose.yaml +++ b/compose.yaml @@ -1718,9 +1718,9 @@ services: cache_from: - ${REPO}:amd64-ubuntu-r-valgrind args: - base: wch1/r-debug:latest + base: rhub/valgrind:latest cmake: ${CMAKE} - r_bin: RDvalgrind + r_bin: R tz: ${TZ} environment: <<: [*common, *ccache, *sccache] diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 0ac5e36ea6d..3252e960c3a 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -28,7 +28,7 @@ URL: https://github.com/apache/arrow/, https://arrow.apache.org/docs/r/ BugReports: https://github.com/apache/arrow/issues Encoding: UTF-8 Language: en-US -SystemRequirements: C++17; for AWS S3 support on Linux, libcurl and openssl (optional); +SystemRequirements: C++20; for AWS S3 support on Linux, libcurl and openssl (optional); cmake >= 3.26 (build-time only, and only for full source build) Biarch: true Imports: diff --git a/r/NEWS.md b/r/NEWS.md index 3af9e1185e4..7e4dde0a993 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -18,6 +18,22 @@ --> # arrow 23.0.0 + +## New features + +- `nchar()` now supports `keepNA = FALSE` (@HyukjinKwon, #48665). +- `stringr::str_ilike()` binding for case-insensitive pattern matching (#48262). + +## Minor improvements and fixes + +- Fix slow performance reading files with large number of columns (#48104). +- Fix segfault when calling `concat_tables()` on a `RecordBatch` (#47885). +- Writing partitioned datasets on S3 no longer requires `ListBucket` permissions (@HaochengLIU, #47599). + +## Installation + +- As of version 23.0.0, `arrow` requires C++20 to build from source. This means that you may need a newer compiler than the default on some older systems. See `vignette("install", package = "arrow")` for guidance. 
+ # arrow 22.0.0.1 ## Minor improvements and fixes diff --git a/r/README.md b/r/README.md index 1ab9206f119..bb5d137dc88 100644 --- a/r/README.md +++ b/r/README.md @@ -44,7 +44,7 @@ There are some special cases to note: - On Linux the installation process can sometimes be more involved because CRAN does not host binaries for Linux. For more information please see the [installation guide](https://arrow.apache.org/docs/r/articles/install.html). -- If you are compiling arrow from source, please note that as of version 10.0.0, arrow requires C++17 to build. This has implications on Windows and CentOS 7. For Windows users it means you need to be running an R version of 4.0 or later. On CentOS 7, it means you need to install a newer compiler than the default system compiler gcc. See the [installation details article](https://arrow.apache.org/docs/r/articles/developers/install_details.html) for guidance. +- If you are compiling arrow from source, please note that as of version 23.0.0, arrow requires C++20 to build. This has implications on Windows and CentOS 7. For Windows users it means you need to be running an R version of 4.3 or later (though R 4.2 has incomplete support and might work with special configuration). See the [installation details article](https://arrow.apache.org/docs/r/articles/developers/install_details.html) for guidance. - Development versions of arrow are released nightly. For information on how to install nightly builds please see the [installing nightly builds](https://arrow.apache.org/docs/r/articles/install_nightly.html) article. diff --git a/r/configure b/r/configure index f64a3673f97..9e92eb6b47f 100755 --- a/r/configure +++ b/r/configure @@ -86,10 +86,10 @@ if [ "$ARROW_R_DEV" = "true" ] && [ -f "data-raw/codegen.R" ]; then ${R_HOME}/bin/Rscript data-raw/codegen.R fi -# Arrow requires C++17, so check for it -if [ ! "`${R_HOME}/bin/R CMD config CXX17`" ]; then +# Arrow requires C++20, so check for it +if [ ! 
"`${R_HOME}/bin/R CMD config CXX20`" ]; then echo "------------------------- NOTE ---------------------------" - echo "Cannot install arrow: a C++17 compiler is required." + echo "Cannot install arrow: a C++20 compiler is required." echo "See https://arrow.apache.org/docs/r/articles/install.html" echo "---------------------------------------------------------" exit 1 @@ -260,14 +260,6 @@ set_pkg_vars () { if [ "$ARROW_R_CXXFLAGS" ]; then PKG_CFLAGS="$PKG_CFLAGS $ARROW_R_CXXFLAGS" fi - - # We use expr because the product version returns more than just 10.13 and we want to - # match the substring. However, expr always outputs the number of matched characters - # to stdout, to avoid noise in the log we redirect the output to /dev/null - if [ "$UNAME" = "Darwin" ] && expr $(sw_vers -productVersion) : '10\.13' >/dev/null 2>&1; then - # avoid C++17 availability warnings on macOS < 11 - PKG_CFLAGS="$PKG_CFLAGS -D_LIBCPP_DISABLE_AVAILABILITY" - fi } # If we have pkg-config, it will tell us what libarrow needs @@ -408,11 +400,11 @@ else fi # Test that we can compile something with those flags -CXX17="`${R_HOME}/bin/R CMD config CXX17` -E" -CXX17FLAGS=`"${R_HOME}"/bin/R CMD config CXX17FLAGS` -CXX17STD=`"${R_HOME}"/bin/R CMD config CXX17STD` +CXX20="`${R_HOME}/bin/R CMD config CXX20` -E" +CXX20FLAGS=`"${R_HOME}"/bin/R CMD config CXX20FLAGS` +CXX20STD=`"${R_HOME}"/bin/R CMD config CXX20STD` CPPFLAGS=`"${R_HOME}"/bin/R CMD config CPPFLAGS` -TEST_CMD="${CXX17} ${CPPFLAGS} ${PKG_CFLAGS} ${CXX17FLAGS} ${CXX17STD} -xc++ -" +TEST_CMD="${CXX20} ${CPPFLAGS} ${PKG_CFLAGS} ${CXX20FLAGS} ${CXX20STD} -xc++ -" TEST_ERROR=$(echo "#include $PKG_TEST_HEADER" | ${TEST_CMD} -o /dev/null 2>&1) if [ $? 
-eq 0 ]; then diff --git a/r/configure.win b/r/configure.win index 433ef28439a..16c5ec1bee8 100755 --- a/r/configure.win +++ b/r/configure.win @@ -117,14 +117,6 @@ set_pkg_vars () { if [ "$ARROW_R_CXXFLAGS" ]; then PKG_CFLAGS="$PKG_CFLAGS $ARROW_R_CXXFLAGS" fi - - # We use expr because the product version returns more than just 10.13 and we want to - # match the substring. However, expr always outputs the number of matched characters - # to stdout, to avoid noise in the log we redirect the output to /dev/null - if [ "$UNAME" = "Darwin" ] && expr $(sw_vers -productVersion) : '10\.13' >/dev/null 2>&1; then - # avoid C++17 availability warnings on macOS < 11 - PKG_CFLAGS="$PKG_CFLAGS -D_LIBCPP_DISABLE_AVAILABILITY" - fi } # If we have pkg-config, it will tell us what libarrow needs diff --git a/r/data-raw/codegen.R b/r/data-raw/codegen.R index b7bee39b779..9acfef109c5 100644 --- a/r/data-raw/codegen.R +++ b/r/data-raw/codegen.R @@ -211,9 +211,7 @@ static const R_CallMethodDef CallEntries[] = { R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); - #if defined(HAS_ALTREP) arrow::r::altrep::Init_Altrep_classes(dll); - #endif _arrow_compute__Initialize(); } diff --git a/r/inst/NOTICE.txt b/r/inst/NOTICE.txt index 2089c6fb203..9b98364d2ab 100644 --- a/r/inst/NOTICE.txt +++ b/r/inst/NOTICE.txt @@ -17,9 +17,6 @@ https://github.com/libdynd This product includes software from the LLVM project * distributed under the University of Illinois Open Source -This product includes software from the google-lint project - * Copyright (c) 2009 Google Inc. All rights reserved. 
- This product includes software from the mman-win32 project * Copyright https://code.google.com/p/mman-win32/ * Licensed under the MIT License; diff --git a/r/man/DictionaryType.Rd b/r/man/DictionaryType.Rd index 8c9087f1ab6..cda27978b1b 100644 --- a/r/man/DictionaryType.Rd +++ b/r/man/DictionaryType.Rd @@ -3,13 +3,40 @@ \docType{class} \name{DictionaryType} \alias{DictionaryType} -\title{class DictionaryType} +\title{DictionaryType class} \description{ -class DictionaryType +\code{DictionaryType} is a \link{FixedWidthType} that represents dictionary-encoded data. +Dictionary encoding stores unique values in a dictionary and uses integer-type +indices to reference them, which can be more memory-efficient for data with many +repeated values. } -\section{Methods}{ +\section{R6 Methods}{ +\itemize{ +\item \verb{$ToString()}: Return a string representation of the dictionary type +\item \verb{$code(namespace = FALSE)}: Return R code to create this dictionary type +} +} + +\section{Active bindings}{ -TODO +\itemize{ +\item \verb{$index_type}: The \link{DataType} for the dictionary indices (must be an integer type, +signed or unsigned) +\item \verb{$value_type}: The \link{DataType} for the dictionary values +\item \verb{$name}: The name of the type. +\item \verb{$ordered}: Whether the dictionary is ordered. +} +} + +\section{Factory}{ + + +\code{DictionaryType$create()} takes the following arguments: +\itemize{ +\item \code{index_type}: A \link{DataType} for the indices (default \code{\link[=int32]{int32()}}) +\item \code{value_type}: A \link{DataType} for the values (default \code{\link[=utf8]{utf8()}}) +\item \code{ordered}: Is this an ordered dictionary (default \code{FALSE})? 
+} } diff --git a/r/man/FixedWidthType.Rd b/r/man/FixedWidthType.Rd index ac6723d79db..71d0ab2d276 100644 --- a/r/man/FixedWidthType.Rd +++ b/r/man/FixedWidthType.Rd @@ -5,11 +5,22 @@ \alias{FixedWidthType} \title{FixedWidthType class} \description{ -FixedWidthType class +\code{FixedWidthType} is a base class for data types with a fixed width in bits. +This includes all integer types, floating-point types, \code{Boolean}, +\code{FixedSizeBinary}, temporal types (dates, times, timestamps, durations), +and decimal types. } -\section{Methods}{ +\section{R6 Methods}{ -TODO +\code{FixedWidthType} inherits from \link{DataType}, so it has the same methods. } +\section{Active bindings}{ + +\itemize{ +\item \verb{$bit_width}: The width of the type in bits +} +} + +\keyword{internal} diff --git a/r/man/Message.Rd b/r/man/Message.Rd index fbad235b64f..b8be82bfa4b 100644 --- a/r/man/Message.Rd +++ b/r/man/Message.Rd @@ -5,11 +5,24 @@ \alias{Message} \title{Message class} \description{ -Message class +\code{Message} holds an Arrow IPC message, which includes metadata and +an optional message body. } -\section{Methods}{ +\section{R6 Methods}{ +\itemize{ +\item \verb{$Equals(other)}: Check if this \code{Message} is equal to another \code{Message} +\item \verb{$body_length()}: Return the length of the message body in bytes +\item \verb{$Verify()}: Check if the \code{Message} metadata is valid Flatbuffer format +} +} -TODO +\section{Active bindings}{ + +\itemize{ +\item \verb{$type}: The message type +\item \verb{$metadata}: The message metadata +\item \verb{$body}: The message body as a \link{Buffer} +} } diff --git a/r/man/MessageReader.Rd b/r/man/MessageReader.Rd index 32ca8900b33..4c3bef3fc9f 100644 --- a/r/man/MessageReader.Rd +++ b/r/man/MessageReader.Rd @@ -5,11 +5,22 @@ \alias{MessageReader} \title{MessageReader class} \description{ -MessageReader class +\code{MessageReader} reads \code{Message} objects from an input stream. 
} -\section{Methods}{ +\section{R6 Methods}{ +\itemize{ +\item \verb{$ReadNextMessage()}: Read the next \code{Message} from the stream. Returns \code{NULL} if +there are no more messages. +} +} + +\section{Factory}{ -TODO + +\code{MessageReader$create()} takes the following argument: +\itemize{ +\item \code{stream}: An \link{InputStream} or object coercible to one (e.g., a raw vector) +} } diff --git a/r/src/Makevars.in b/r/src/Makevars.in index af0826faacb..1b7ad08e1cb 100644 --- a/r/src/Makevars.in +++ b/r/src/Makevars.in @@ -25,7 +25,7 @@ PKG_CPPFLAGS=@cflags@ # https://bugs.llvm.org/show_bug.cgi?id=39191 # https://www.mail-archive.com/gcc-bugs@gcc.gnu.org/msg534862.html # PKG_CXXFLAGS=$(CXX_VISIBILITY) -CXX_STD=CXX17 +CXX_STD=CXX20 PKG_LIBS=@libs@ all: $(SHLIB) purify diff --git a/r/src/Makevars.ucrt b/r/src/Makevars.ucrt index a91dedc2d55..b72ed64d98e 100644 --- a/r/src/Makevars.ucrt +++ b/r/src/Makevars.ucrt @@ -19,4 +19,4 @@ CRT=-ucrt include Makevars.win # XXX for some reason, this variable doesn't seem propagated from Makevars.win -CXX_STD=CXX17 +CXX_STD=CXX20 diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index c9ed6b59e8a..385d2ec9ff3 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -24,30 +24,9 @@ #include #include -#include #include -#if defined(HAS_ALTREP) -#if R_VERSION < R_Version(3, 6, 0) - -// workaround because R's not so conveniently uses `class` -// as a variable name, and C++ is not happy about that -// -// SEXP R_new_altrep(R_altrep_class_t class, SEXP data1, SEXP data2); -// -#define class klass - -// Because functions declared in have C linkage -extern "C" { #include -} - -// undo the workaround -#undef class - -#else -#include -#endif #include "./r_task_group.h" @@ -1116,29 +1095,6 @@ std::shared_ptr vec_to_arrow_altrep_bypass(SEXP x) { } // namespace r } // namespace arrow -#else // HAS_ALTREP - -namespace arrow { -namespace r { -namespace altrep { - -// return an altrep R vector that shadows the array if possible -SEXP 
MakeAltrepVector(const std::shared_ptr& chunked_array) { - return R_NilValue; -} - -bool is_arrow_altrep(SEXP) { return false; } - -std::shared_ptr vec_to_arrow_altrep_bypass(SEXP x) { return nullptr; } - -bool is_unmaterialized_arrow_altrep(SEXP) { return false; } - -} // namespace altrep -} // namespace r -} // namespace arrow - -#endif - // [[arrow::export]] bool is_arrow_altrep(cpp11::sexp x) { return arrow::r::altrep::is_arrow_altrep(x); } diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index d27e1b93a93..432b49503e1 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -25,7 +25,6 @@ #include #include -#include #include #include "./extension.h" diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 73bf81f83bb..14c6330074a 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -6259,9 +6259,7 @@ extern "C" void R_init_arrow(DllInfo* dll){ R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); - #if defined(HAS_ALTREP) arrow::r::altrep::Init_Altrep_classes(dll); - #endif _arrow_compute__Initialize(); } diff --git a/r/src/arrow_cpp11.h b/r/src/arrow_cpp11.h index f44fd635fde..6ea11ea008d 100644 --- a/r/src/arrow_cpp11.h +++ b/r/src/arrow_cpp11.h @@ -23,7 +23,6 @@ #undef Free #include -#include #include "./nameof.h" diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index d1882e56daf..524182f3824 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -246,9 +246,7 @@ arrow::Status AddMetadataFromDots(SEXP lst, int num_fields, namespace altrep { -#if defined(HAS_ALTREP) void Init_Altrep_classes(DllInfo* dll); -#endif SEXP MakeAltrepVector(const std::shared_ptr& chunked_array); bool is_arrow_altrep(SEXP x); diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 0777ca8bc72..c8aa903bf06 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -162,12 +162,13 @@ std::shared_ptr make_compute_options( // false means descending, true means ascending // cpp11 
does not support bool here so use int auto orders = cpp11::as_cpp>(options["orders"]); - std::vector keys; + // Use resize + assignment to avoid vector growth operations that trigger + // false positive -Wmaybe-uninitialized warnings in GCC 14 with std::variant + std::vector keys(names.size(), Key("", Order::Ascending)); for (size_t i = 0; i < names.size(); i++) { - keys.push_back( - Key(names[i], (orders[i] > 0) ? Order::Descending : Order::Ascending)); + keys[i] = Key(names[i], (orders[i] > 0) ? Order::Descending : Order::Ascending); } - auto out = std::make_shared(Options(keys)); + auto out = std::make_shared(std::move(keys)); return out; } diff --git a/r/tools/checksums/r-libarrow-darwin-arm64-23.0.0.zip.sha512 b/r/tools/checksums/r-libarrow-darwin-arm64-23.0.0.zip.sha512 new file mode 100644 index 00000000000..a22f6875fbf --- /dev/null +++ b/r/tools/checksums/r-libarrow-darwin-arm64-23.0.0.zip.sha512 @@ -0,0 +1 @@ +2115f7927d024996b2819025294c6ea714ba53fc5347de354b68377129050105863f963df8045fe83a49687896b23334b06bd9bf4846771c9be77aae86dbf0de r-libarrow-darwin-arm64-23.0.0.zip diff --git a/r/tools/checksums/r-libarrow-darwin-x86_64-23.0.0.zip.sha512 b/r/tools/checksums/r-libarrow-darwin-x86_64-23.0.0.zip.sha512 new file mode 100644 index 00000000000..a353523d01c --- /dev/null +++ b/r/tools/checksums/r-libarrow-darwin-x86_64-23.0.0.zip.sha512 @@ -0,0 +1 @@ +b6974eb60abbc96ce15cd3dbb9eac0c17b7786120dff7903ce6cebf3cf2c68e117ec69b6b0ac60c4266744f3d95c2fbb41bdb8050da6a27bc3cc0e7190063fe4 r-libarrow-darwin-x86_64-23.0.0.zip diff --git a/r/tools/checksums/r-libarrow-linux-x86_64-23.0.0.zip.sha512 b/r/tools/checksums/r-libarrow-linux-x86_64-23.0.0.zip.sha512 new file mode 100644 index 00000000000..b87f8d70b53 --- /dev/null +++ b/r/tools/checksums/r-libarrow-linux-x86_64-23.0.0.zip.sha512 @@ -0,0 +1 @@ +7a9e5dfa7dde32834ad31c562a964b2da17ad2d7c0fd867c2bdef2d929dd695868300f84a487e81b06deb998ebfa7ae5e2c194241006dce097b2b938b71d4ffb r-libarrow-linux-x86_64-23.0.0.zip diff 
--git a/r/tools/checksums/r-libarrow-windows-x86_64-23.0.0.zip.sha512 b/r/tools/checksums/r-libarrow-windows-x86_64-23.0.0.zip.sha512 new file mode 100644 index 00000000000..5af2063f5ce --- /dev/null +++ b/r/tools/checksums/r-libarrow-windows-x86_64-23.0.0.zip.sha512 @@ -0,0 +1 @@ +42e316e8b5041b653c63704cc09f05e04675c0be65c1e7ef3bddd70ab85d7373d20cd0590b5e16bcf062b68e065e0840ca7e23e0967570f332de102ecd6c788a r-libarrow-windows-x86_64-23.0.0.zip diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 9d0a2604682..f4ccb4956a8 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -310,11 +310,11 @@ compile_test_program <- function(code) { openssl_dir <- paste0("-I", openssl_root_dir, "/include") } runner <- paste( - R_CMD_config("CXX17"), + R_CMD_config("CXX20"), openssl_dir, R_CMD_config("CPPFLAGS"), - R_CMD_config("CXX17FLAGS"), - R_CMD_config("CXX17STD"), + R_CMD_config("CXX20FLAGS"), + R_CMD_config("CXX20STD"), "-E", "-xc++" ) @@ -565,8 +565,11 @@ build_libarrow <- function(src_dir, dst_dir) { # is found, it will be used by the libarrow build, and this does # not affect how R compiles the arrow bindings. CC = sub("^.*ccache", "", R_CMD_config("CC")), - CXX = paste(sub("^.*ccache", "", R_CMD_config("CXX17")), R_CMD_config("CXX17STD")), - # CXXFLAGS = R_CMD_config("CXX17FLAGS"), # We don't want the same debug symbols + CXX = paste( + sub("^.*ccache", "", R_CMD_config("CXX20")), + R_CMD_config("CXX20STD") + ), + # CXXFLAGS = R_CMD_config("CXX20FLAGS"), # We don't want the same debug symbols LDFLAGS = R_CMD_config("LDFLAGS"), N_JOBS = ncores ) diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index 69780bd64df..d9cdcc3885c 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -23,8 +23,8 @@ but there are a few things to note. ### Compilers -As of version 10.0.0, arrow requires a C++17 compiler to build. -For `gcc`, this generally means version 7 or newer. 
Most contemporary Linux +As of version 23.0.0, arrow requires a C++20 compiler to build. +For `gcc`, this generally means version 10 or newer. Most contemporary Linux distributions have a new enough compiler; however, CentOS 7 is a notable exception, as it ships with gcc 4.8.