Skip to content

Deterministic hash #198

@Tom-Kingstone

Description

@Tom-Kingstone

Description:

@tg359 has written a method that can be used to convert a list of values into a hash in Python:

import hashlib
import json
from typing import Any, Union
from ladybug.epw import EPW

# OLD EXT COMFORT METHOD

def _create_hash(
    epw_file: Path,
    ground_material: Union[EnergyMaterial, EnergyMaterialVegetation],
    shade_material: Union[EnergyMaterial, EnergyMaterialVegetation],
) -> str:
    """Return a short hex digest identifying an EPW/material configuration.

    The digest is built from the EPW file name (not its full path or its
    contents) and the ``identifier`` attribute of both materials.

    Args:
        epw_file: Path to the EPW file; only ``epw_file.name`` is used.
        ground_material: Ground material; only ``.identifier`` is used.
        shade_material: Shade material; only ``.identifier`` is used.

    Returns:
        A 20-character hexadecimal string (10 bytes of SHAKE-256 output).
    """
    content = (
        f"{epw_file.name}_{ground_material.identifier}_{shade_material.identifier}"
    )
    hasher = hashlib.shake_256()
    hasher.update(content.encode("utf-8"))
    # SHAKE is an extendable-output function, so the digest size in bytes
    # is requested explicitly: 10 bytes -> 20 hex characters.
    return hasher.hexdigest(10)

# GENERIC METHOD + SUPPORTING METHODS

def deep_sort_dict(obj: dict[Any, Any], sort_lists: bool = True) -> Any:
    """Return a copy of ``obj`` with nested containers in a canonical order.

    Dictionaries are rebuilt with their items in sorted order, recursing into
    the values. When ``sort_lists`` is true, lists and tuples are also
    recursed into and their elements are ordered by their JSON text, so that
    elements of different types can still be compared. Anything else is
    returned unchanged.

    Args:
        obj: The value to canonicalise. Despite the annotation, any object is
            accepted; non-container values are passed straight through.
        sort_lists: Whether lists and tuples are recursed into and sorted.
            When false they are returned as-is (same object, same order).

    Returns:
        A new dict, list or tuple in canonical order, or ``obj`` itself when
        it is not a container handled here.
    """
    if isinstance(obj, dict):
        ordered = {}
        for key, value in sorted(obj.items()):
            ordered[key] = deep_sort_dict(value, sort_lists=sort_lists)
        return ordered

    if not (sort_lists and isinstance(obj, (list, tuple))):
        return obj

    def _canonical_text(item: Any) -> str:
        # JSON text gives a total order over otherwise incomparable elements.
        return json.dumps(item, sort_keys=True, cls=AllPowerfulEncoder)

    ordered_items = sorted(
        (deep_sort_dict(element, sort_lists=sort_lists) for element in obj),
        key=_canonical_text,
    )
    # Preserve the sequence kind: tuples stay tuples, lists stay lists.
    return tuple(ordered_items) if isinstance(obj, tuple) else ordered_items

def deterministic_hash(
    *values: Any,
    length: int = 16,
    algorithm: str = "md5",
    sort_lists: bool = True,
) -> int:
    """Create a deterministic integer hash from one or more values.

    This function produces consistent hash values across Python sessions,
    unlike the built-in hash() function, which salts string hashes per
    process.

    Each value is converted to text, the texts are concatenated without a
    separator, and the UTF-8 bytes of the result are hashed. Dicts and lists
    are first canonicalised with ``deep_sort_dict`` and serialised as JSON;
    every other value (including tuples and sets) is converted with ``str()``.

    Args:
        *values (Any):
            One or more values to hash.
        length (int, optional):
            Number of hexadecimal characters to use from the hash; must be
            at least 1. For fixed-size algorithms a value larger than the
            digest simply uses the whole digest. Defaults to 16 (64 bits).
        algorithm (str, optional):
            Hash algorithm to use. Must be supported by hashlib.
            Common options: 'md5', 'sha1', 'sha256', 'sha512'. The
            variable-length 'shake_128' and 'shake_256' are also supported.
            Defaults to 'md5'.
        sort_lists (bool, optional):
            Whether to sort lists (and tuples nested inside dicts/lists)
            recursively before hashing, so element order does not affect
            the result. Dictionary keys are always sorted. Defaults to True.

    Returns:
        int:
            An integer hash value derived from the input values.

    Raises:
        ValueError:
            If the algorithm is not supported by hashlib, or if ``length``
            is smaller than 1.

    Example:
        >>> hex(deterministic_hash("hello"))
        '0x5d41402abc4b2a76'
        >>> deterministic_hash("hello", "world", 123) == deterministic_hash("hello", "world", 123)
        True
        >>> deterministic_hash("test", length=8) < 16 ** 8
        True

    """

    # validate algorithm
    if algorithm not in hashlib.algorithms_available:
        raise ValueError(
            f"Hash algorithm '{algorithm}' not supported. "
            f"Available: {sorted(hashlib.algorithms_available)}"
        )

    # An empty slice would otherwise fail inside int("", 16) with an
    # unhelpful message, and a negative length would slice from the wrong end.
    if length < 1:
        raise ValueError(f"length must be a positive integer, got {length!r}")

    # canonicalize each value
    canonicalized = []
    for v in values:
        if isinstance(v, (dict, list)):
            v = deep_sort_dict(v, sort_lists=sort_lists)
            v = json.dumps(v, sort_keys=True, cls=AllPowerfulEncoder)
        canonicalized.append(str(v))

    # NOTE(review): values are joined without a separator, so ("ab", "c") and
    # ("a", "bc") hash identically, as do 1 and "1". Left unchanged so that
    # existing hash values stay stable; confirm whether that is acceptable.
    concatenated = "".join(canonicalized)
    hasher = hashlib.new(algorithm)
    hasher.update(concatenated.encode("utf-8"))

    try:
        hex_digest = hasher.hexdigest()
    except TypeError:
        # Extendable-output algorithms (shake_128 / shake_256) need an
        # explicit size in bytes; request enough bytes for `length` hex chars.
        hex_digest = hasher.hexdigest((length + 1) // 2)

    return int(hex_digest[:length], 16)

This would be most useful in toolkits like LadybugTools_Toolkit where ExternalComfort simulations require identifiers to be under 100 characters, but will be placed in this toolkit in case other tools require it.

Metadata

Metadata

Assignees

Labels

type:feature — New capability or enhancement

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions