Skip to content

[feature] Array of Strings tuple sketch #475

@proost

Description

@proost

Summary

Adding Array of Strings Tuple Sketch.

Design

  • I reuse and extend the existing tuple sketch interfaces as much as possible.
  • I use array<std::string> as container for string collection, following the same approach as the ArrayOfDoubles tuple sketch.
  • Serialization and deserialization follow the existing tuple sketch patterns using default_array_of_strings_serde .
    • One difference from the Java implementation is UTF-8 validation. In the C++ ArrayOfStringsTupleSketch, I added validation to ensure each std::string contains valid UTF-8 encoded text. This difference comes from the fact that Java String is always valid Unicode, while C++ std::string may contain arbitrary bytes.
    • For validation, I use utfcpp (https://github.com/nemtrif/utfcpp). simdutf (https://github.com/simdutf/simdutf) is widely used and C++11 compatible, but it may trigger warnings because it does not strictly follow ISO C++ in all configurations. Since some users build with strict compiler flags and UTF-8 validation is not on the hot path (it runs only during serialization/deserialization), I chose utfcpp even though it may be slightly slower.
namespace datasketches {

// default update policy for an array of strings
template<typename Allocator = std::allocator<std::string>>
class default_array_of_strings_update_policy {
public:
  using array_of_strings = array<std::string, Allocator>;

  explicit default_array_of_strings_update_policy(const Allocator& allocator = Allocator());

  array_of_strings create() const;

  void update(array_of_strings& array, const array_of_strings& input) const;

  void update(array_of_strings& array, const array_of_strings* input) const;

private:
  Allocator allocator_;
};

// serializer/deserializer for an array of strings
// Requirements: all strings must be valid UTF-8 and array size must be <= 127.
template<typename Allocator = std::allocator<std::string>>
struct default_array_of_strings_serde {
  using array_of_strings = array<std::string, Allocator>;
  using summary_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<array_of_strings>;

  explicit default_array_of_strings_serde(const Allocator& allocator = Allocator());

  void serialize(std::ostream& os, const array_of_strings* items, unsigned num) const;
  void deserialize(std::istream& is, array_of_strings* items, unsigned num) const;
  size_t serialize(void* ptr, size_t capacity, const array_of_strings* items, unsigned num) const;
  size_t deserialize(const void* ptr, size_t capacity, array_of_strings* items, unsigned num) const;
  size_t size_of_item(const array_of_strings& item) const;

private:
  Allocator allocator_;
  summary_allocator summary_allocator_;
  static void check_num_nodes(uint8_t num_nodes);
  static uint32_t compute_total_bytes(const array_of_strings& item);
  static void check_utf8(const std::string& value);
};

/**
 * Extended class of compact_tuple_sketch for array of strings
 * Requirements: all strings must be valid UTF-8 and array size must be <= 127.
 */
template<typename Allocator = std::allocator<std::string>>
class compact_array_of_strings_tuple_sketch:
  public compact_tuple_sketch<
    array<std::string, Allocator>,
    typename std::allocator_traits<Allocator>::template rebind_alloc<array<std::string, Allocator>>
  > {
public:
  using array_of_strings = array<std::string, Allocator>;
  using summary_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<array_of_strings>;
  using Base = compact_tuple_sketch<array_of_strings, summary_allocator>;
  using vector_bytes = typename Base::vector_bytes;
  using Base::serialize;

  /**
   * Copy constructor.
   * Constructs a compact sketch from another sketch (update or compact)
   * @param other sketch to be constructed from
   * @param ordered if true make the resulting sketch ordered
   */
  template<typename Sketch>
  compact_array_of_strings_tuple_sketch(const Sketch& sketch, bool ordered = true);

  /**
   * This method deserializes a sketch from a given stream.
   * @param is input stream
   * @param seed the seed for the hash function that was used to create the sketch
   * @param sd instance of a SerDe
   * @param allocator instance of an Allocator
   * @return an instance of the sketch
   */
  template<typename SerDe = default_array_of_strings_serde<Allocator>>
  static compact_array_of_strings_tuple_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED,
      const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());

  /**
   * This method deserializes a sketch from a given array of bytes.
   * @param bytes pointer to the array of bytes
   * @param size the size of the array
   * @param seed the seed for the hash function that was used to create the sketch
   * @param sd instance of a SerDe
   * @param allocator instance of an Allocator
   * @return an instance of the sketch
   */
  template<typename SerDe = default_array_of_strings_serde<Allocator>>
  static compact_array_of_strings_tuple_sketch deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED,
      const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());

private:
  explicit compact_array_of_strings_tuple_sketch(Base&& base);
};

/**
 * Extended class of update_tuple_sketch for array of strings
 */
template<template<typename> class Policy = default_array_of_strings_update_policy,
         typename Allocator = std::allocator<std::string>>
class update_array_of_strings_tuple_sketch:
  public update_tuple_sketch<
    array<std::string, Allocator>,
    array<std::string, Allocator>,
    Policy<Allocator>,
    typename std::allocator_traits<Allocator>::template rebind_alloc<array<std::string, Allocator>>
  > {
public:
  using array_of_strings = array<std::string, Allocator>;
  using summary_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<array_of_strings>;
  using policy_type = Policy<Allocator>;
  using Base = update_tuple_sketch<
    array_of_strings,
    array_of_strings,
    policy_type,
    summary_allocator
  >;
  using resize_factor = typename Base::resize_factor;
  class builder;
  using Base::update;

  /**
   * Updates the sketch with string array for both key and value.
   * @param key the given string array key
   * @param value the given string array value
   */
  void update(const array_of_strings& key, const array_of_strings& value);

  /**
   * Converts this sketch to a compact sketch (ordered or unordered).
   * @param ordered optional flag to specify if an ordered sketch should be produced
   * @return compact array of strings sketch
   */
  compact_array_of_strings_tuple_sketch<Allocator> compact(bool ordered = true) const;

private:
  update_array_of_strings_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta,
      uint64_t seed, const policy_type& policy, const summary_allocator& allocator);

  // Matches Java Util.PRIME for ArrayOfStrings key hashing.
  static constexpr uint64_t STRING_ARR_HASH_SEED = 0x7A3CCA71ULL;

  static uint64_t hash_key(const array_of_strings& key);
};

template<template<typename> class Policy, typename Allocator>
class update_array_of_strings_tuple_sketch<Policy, Allocator>::builder:
  public tuple_base_builder<builder, policy_type, summary_allocator> {
public:
  builder(const policy_type& policy = policy_type(), const summary_allocator& allocator = summary_allocator());

  update_array_of_strings_tuple_sketch build() const;
};

} /* namespace datasketches */

Plan

I will send a PR. But for the compatibility test, after This PR is merged i will send a PR

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions