Fix project isolation: Make loadChatHistory respect active project sessions

- Modified loadChatHistory() to check for active project before fetching all sessions
- When active project exists, use project.sessions instead of fetching from API
- Added detailed console logging to debug session filtering
- This prevents ALL sessions from appearing in every project's sidebar

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
uroma
2026-01-22 14:43:05 +00:00
Unverified
parent b82837aa5f
commit 55aafbae9a
6463 changed files with 1115462 additions and 4486 deletions

View File

@@ -0,0 +1,33 @@
"""
rapid string matching library
"""
from __future__ import annotations
__author__: str = "Max Bachmann"
__license__: str = "MIT"
__version__: str = "3.14.3"
from rapidfuzz import distance, fuzz, process, utils
__all__ = ["distance", "fuzz", "get_include", "process", "utils"]
def get_include():
    """
    Return the directory containing the RapidFuzz \\*.h header files.

    Extension modules that compile against RapidFuzz should call this
    function to locate the include directory.

    Notes
    -----
    When using ``distutils``, for example in ``setup.py``.
    ::

        import rapidfuzz_capi
        ...
        Extension('extension_name', ...
                  include_dirs=[rapidfuzz_capi.get_include()])
        ...
    """
    from pathlib import Path

    # the headers ship alongside this module, so its parent directory is
    # exactly the include directory
    include_dir = Path(__file__).parent
    return str(include_dir)

View File

@@ -0,0 +1,12 @@
from __future__ import annotations

# Type stub for the ``rapidfuzz`` package root: package metadata attributes
# plus the re-exported submodules.
__author__: str
__license__: str
__version__: str

from rapidfuzz import (
    distance as distance,
    fuzz as fuzz,
    process as process,
    utils as utils,
)

View File

@@ -0,0 +1,7 @@
from __future__ import annotations
from pathlib import Path
def get_PyInstaller_tests():
    """Return the directories containing the PyInstaller hook tests for rapidfuzz."""
    test_dir = Path(__file__).parent
    return [str(test_dir)]

View File

@@ -0,0 +1,37 @@
from __future__ import annotations
import subprocess
from PyInstaller import __main__ as pyi_main
# Test out the package by importing it, then running functions from it.
def test_pyi_hooksample(tmp_path):
    """Freeze a tiny rapidfuzz app with PyInstaller, then run the frozen binary.

    The test passes when the generated executable imports rapidfuzz and runs
    its functions without raising (checked via ``subprocess`` exit code).
    """
    app_name = "userapp"
    build_dir = tmp_path / "build"
    dist_dir = tmp_path / "dist"
    script = tmp_path / (app_name + ".py")

    # exercise the package: import it and call into both implementations
    source_lines = [
        "import rapidfuzz",
        "from rapidfuzz.distance import metrics_py",
        "from rapidfuzz.distance import metrics_cpp",
        "rapidfuzz.distance.Levenshtein.distance('test', 'teste')",
        "metrics_py.levenshtein_distance('test', 'teste')",
        "metrics_cpp.levenshtein_distance('test', 'teste')",
    ]
    script.write_text("\n".join(source_lines))

    # keep every generated artifact inside ``tmp_path``
    pyi_main.run(
        [
            "--workpath",
            str(build_dir),
            "--distpath",
            str(dist_dir),
            "--specpath",
            str(tmp_path),
            str(script),
        ]
    )
    subprocess.run([str(dist_dir / app_name / app_name)], check=True)

View File

@@ -0,0 +1,73 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2023 Max Bachmann
from __future__ import annotations
from array import array
from collections.abc import Hashable, Sequence
def conv_sequence(s: Sequence[Hashable]) -> Sequence[Hashable]:
    """Convert a hashable sequence into a form the metrics can compare.

    Strings and unicode arrays become lists of code points; bytes and
    numeric arrays pass through unchanged; any other sequence is mapped
    element-wise to ints (code point for 1-char strings, hash otherwise).
    """
    if isinstance(s, str):
        return [ord(ch) for ch in s]
    if isinstance(s, bytes):
        return s
    if isinstance(s, array):
        # unicode typecodes need the same ord() treatment as str
        if s.typecode in ("u", "w"):
            return [ord(ch) for ch in s]
        return s
    if s is None:
        return s

    converted = []
    for item in s:
        if isinstance(item, str) and len(item) == 1:
            converted.append(ord(item))
        elif isinstance(item, int) and item == -1:
            # hash(-1) == -2 in CPython, so -1 must be passed through as-is
            converted.append(-1)
        else:
            converted.append(hash(item))
    return converted
def conv_sequences(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> tuple[Sequence[Hashable], Sequence[Hashable]]:
    """Convert two sequences for comparison.

    Matched str/str and bytes/bytes pairs are directly comparable and are
    returned unchanged; any other combination is converted element-wise.
    """
    both_str = isinstance(s1, str) and isinstance(s2, str)
    both_bytes = isinstance(s1, bytes) and isinstance(s2, bytes)
    if both_str or both_bytes:
        return s1, s2
    return conv_sequence(s1), conv_sequence(s2)
def common_prefix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> int:
    """Return the length of the common prefix of two sequences."""
    count = 0
    for lhs, rhs in zip(s1, s2):
        if lhs != rhs:
            return count
        count += 1
    return count
def common_suffix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> int:
    """Return the length of the common suffix of two sequences."""
    count = 0
    # walk both sequences back to front until they disagree
    for lhs, rhs in zip(reversed(s1), reversed(s2)):
        if lhs != rhs:
            return count
        count += 1
    return count
def common_affix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> tuple[int, int]:
    """Return ``(prefix_len, suffix_len)`` for two sequences.

    The suffix is measured on the remainders after removing the common
    prefix, so the two spans never overlap.
    """
    prefix = common_prefix(s1, s2)
    suffix = common_suffix(s1[prefix:], s2[prefix:])
    return (prefix, suffix)

View File

@@ -0,0 +1,15 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
__all__ = ["AVX2", "SSE2", "supports"]

# Prefer the compiled feature detector. When the C++ extension is not
# available, fall back to a pure-Python stub that reports no SIMD support.
try:
    from rapidfuzz._feature_detector_cpp import AVX2, SSE2, supports
except ImportError:
    # flag values mirror the constants exported by the C++ extension
    SSE2 = 1
    AVX2 = 2

    def supports(features):
        # pure-Python fallback: no SIMD instruction set is ever reported
        _ = features
        return False

View File

@@ -0,0 +1,85 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
import sys
from math import isnan
from typing import Any, Callable
# Cached reference to ``pandas.NA``; stays None until pandas is detected.
pandas_NA = None


def setupPandas():
    """Look up ``pandas.NA`` once, if pandas has already been imported.

    pandas is never imported here — only ``sys.modules`` is consulted — so
    rapidfuzz does not force the dependency.
    """
    global pandas_NA  # noqa: PLW0603
    if pandas_NA is not None:
        return
    pandas = sys.modules.get("pandas")
    if hasattr(pandas, "NA"):
        pandas_NA = pandas.NA


setupPandas()
class ScorerFlag:
RESULT_F64 = 1 << 5
RESULT_I64 = 1 << 6
RESULT_SIZE_T = 1 << 7
SYMMETRIC = 1 << 11
def _get_scorer_flags_distance(**_kwargs: Any) -> dict[str, Any]:
return {
"optimal_score": 0,
"worst_score": 2**63 - 1,
"flags": ScorerFlag.RESULT_SIZE_T | ScorerFlag.SYMMETRIC,
}
def _get_scorer_flags_similarity(**_kwargs: Any) -> dict[str, Any]:
return {
"optimal_score": 2**63 - 1,
"worst_score": 0,
"flags": ScorerFlag.RESULT_SIZE_T | ScorerFlag.SYMMETRIC,
}
def _get_scorer_flags_normalized_distance(**_kwargs: Any) -> dict[str, Any]:
return {
"optimal_score": 0,
"worst_score": 1,
"flags": ScorerFlag.RESULT_F64 | ScorerFlag.SYMMETRIC,
}
def _get_scorer_flags_normalized_similarity(**_kwargs: Any) -> dict[str, Any]:
return {
"optimal_score": 1,
"worst_score": 0,
"flags": ScorerFlag.RESULT_F64 | ScorerFlag.SYMMETRIC,
}
def is_none(s: Any) -> bool:
    """Return True when *s* is a missing value: None, ``pandas.NA``, or a float NaN."""
    if s is None:
        return True
    if s is pandas_NA:
        return True
    return isinstance(s, float) and isnan(s)
def add_scorer_attrs(func: Any, cached_scorer_call: dict[str, Callable[..., dict[str, Any]]]):
    """Attach the scorer-metadata protocol attributes to *func*.

    ``_RF_ScorerPy`` publishes the scorer flags; ``_RF_OriginalScorer``
    points back at *func* itself so later wrapping can be detected.
    """
    func._RF_ScorerPy = cached_scorer_call
    func._RF_OriginalScorer = func
# Default ``_RF_ScorerPy`` dictionaries attached to the pure-Python scorer
# functions via ``add_scorer_attrs`` — one per scorer result convention.
default_distance_attribute: dict[str, Callable[..., dict[str, Any]]] = {"get_scorer_flags": _get_scorer_flags_distance}
default_similarity_attribute: dict[str, Callable[..., dict[str, Any]]] = {
    "get_scorer_flags": _get_scorer_flags_similarity
}
default_normalized_distance_attribute: dict[str, Callable[..., dict[str, Any]]] = {
    "get_scorer_flags": _get_scorer_flags_normalized_distance
}
default_normalized_similarity_attribute: dict[str, Callable[..., dict[str, Any]]] = {
    "get_scorer_flags": _get_scorer_flags_normalized_similarity
}

View File

@@ -0,0 +1,93 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations
import contextlib
import os
from rapidfuzz._feature_detector import AVX2, SSE2, supports
__all__ = ["distance", "normalized_distance", "normalized_similarity", "similarity"]

# Implementation selection for the DamerauLevenshtein scorers:
#   RAPIDFUZZ_IMPLEMENTATION=cpp    -> require a C++ extension (best SIMD build)
#   RAPIDFUZZ_IMPLEMENTATION=python -> force the pure-Python implementation
#   unset                           -> try the C++ builds, fall back to Python
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")

if _impl == "cpp":
    imported = False
    # try the AVX2 build first when the CPU supports it
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                damerau_levenshtein_distance as distance,
                damerau_levenshtein_normalized_distance as normalized_distance,
                damerau_levenshtein_normalized_similarity as normalized_similarity,
                damerau_levenshtein_similarity as similarity,
            )

            imported = True
    # then the SSE2 build
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                damerau_levenshtein_distance as distance,
                damerau_levenshtein_normalized_distance as normalized_distance,
                damerau_levenshtein_normalized_similarity as normalized_similarity,
                damerau_levenshtein_similarity as similarity,
            )

            imported = True
    # no fallback here: with cpp explicitly requested, a failed import raises
    if not imported:
        from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
            damerau_levenshtein_distance as distance,
            damerau_levenshtein_normalized_distance as normalized_distance,
            damerau_levenshtein_normalized_similarity as normalized_similarity,
            damerau_levenshtein_similarity as similarity,
        )
elif _impl == "python":
    from rapidfuzz.distance.metrics_py import (
        damerau_levenshtein_distance as distance,
        damerau_levenshtein_normalized_distance as normalized_distance,
        damerau_levenshtein_normalized_similarity as normalized_similarity,
        damerau_levenshtein_similarity as similarity,
    )
else:
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                damerau_levenshtein_distance as distance,
                damerau_levenshtein_normalized_distance as normalized_distance,
                damerau_levenshtein_normalized_similarity as normalized_similarity,
                damerau_levenshtein_similarity as similarity,
            )

            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                damerau_levenshtein_distance as distance,
                damerau_levenshtein_normalized_distance as normalized_distance,
                damerau_levenshtein_normalized_similarity as normalized_similarity,
                damerau_levenshtein_similarity as similarity,
            )

            imported = True
    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
                damerau_levenshtein_distance as distance,
                damerau_levenshtein_normalized_distance as normalized_distance,
                damerau_levenshtein_normalized_similarity as normalized_similarity,
                damerau_levenshtein_similarity as similarity,
            )

            imported = True
    # last resort: the pure-Python implementation always works
    if not imported:
        from rapidfuzz.distance.metrics_py import (
            damerau_levenshtein_distance as distance,
            damerau_levenshtein_normalized_distance as normalized_distance,
            damerau_levenshtein_normalized_similarity as normalized_similarity,
            damerau_levenshtein_similarity as similarity,
        )

View File

@@ -0,0 +1,75 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")

# Each scorer has two overloads: without a ``processor`` the inputs must
# already be hashable sequences; with a ``processor`` arbitrary input types
# are accepted and converted to sequences before comparison.
@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...

View File

@@ -0,0 +1,233 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from rapidfuzz._common_py import conv_sequences
from rapidfuzz._utils import is_none, setupPandas
def _damerau_levenshtein_distance_zhao(s1, s2):
    # Unrestricted Damerau-Levenshtein distance using rolling rows.
    # NOTE(review): variable names suggest Zhao & Sahni's formulation —
    # confirm against the referenced paper before renaming anything.
    maxVal = max(len(s1), len(s2)) + 1  # sentinel larger than any real distance
    last_row_id = {}  # last row index (i) where each element of s1 occurred
    last_row_id_get = last_row_id.get  # hoisted bound method for the hot loop
    size = len(s2) + 2
    FR = [maxVal] * size
    R1 = [maxVal] * size  # previous row
    R = list(range(size))  # current row, seeded with 0..size-1
    R[-1] = maxVal  # index -1 acts as a sentinel column
    for i in range(1, len(s1) + 1):
        # reuse the two row buffers instead of allocating new ones
        R, R1 = R1, R
        last_col_id = -1
        last_i2l1 = R[0]
        R[0] = i
        T = maxVal
        for j in range(1, len(s2) + 1):
            # standard Levenshtein recurrence: substitute / insert / delete
            diag = R1[j - 1] + (s1[i - 1] != s2[j - 1])
            left = R[j - 1] + 1
            up = R1[j] + 1
            temp = min(diag, left, up)
            if s1[i - 1] == s2[j - 1]:
                last_col_id = j  # last occurrence of s1_i
                FR[j] = R1[j - 2]  # save H_k-1,j-2
                T = last_i2l1  # save H_i-2,l-1
            else:
                # consider a transposition against the last matching positions
                k = last_row_id_get(s2[j - 1], -1)
                l = last_col_id  # noqa: E741
                if (j - l) == 1:
                    transpose = FR[j] + (i - k)
                    temp = min(temp, transpose)
                elif (i - k) == 1:
                    transpose = T + (j - l)
                    temp = min(temp, transpose)
            last_i2l1 = R[j]
            R[j] = temp
        last_row_id[s1[i - 1]] = i
    return R[len(s2)]
def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the Damerau-Levenshtein distance.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable used to preprocess both strings before comparison.
        Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Maximum distance that is still reported exactly. For any larger
        distance ``score_cutoff + 1`` is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    distance : int
        distance between s1 and s2

    Examples
    --------
    Find the Damerau-Levenshtein distance between two strings:

    >>> from rapidfuzz.distance import DamerauLevenshtein
    >>> DamerauLevenshtein.distance("CA", "ABC")
    2
    """
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    dist = _damerau_levenshtein_distance_zhao(s1, s2)
    if score_cutoff is not None and dist > score_cutoff:
        return score_cutoff + 1
    return dist
def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the Damerau-Levenshtein similarity in the range [max, 0],
    computed as ``max(len1, len2) - distance``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable used to preprocess both strings before comparison.
        Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Minimum similarity that is still reported; anything smaller yields 0.
        Default is None, which deactivates this behaviour.

    Returns
    -------
    similarity : int
        similarity between s1 and s2
    """
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    sim = max(len(s1), len(s2)) - distance(s1, s2)
    if score_cutoff is not None and sim < score_cutoff:
        return 0
    return sim
def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized Damerau-Levenshtein distance in the range [1, 0],
    computed as ``distance / max(len1, len2)``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable used to preprocess both strings before comparison.
        Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Score threshold between 0 and 1.0; for ``norm_dist > score_cutoff``
        1.0 is returned instead. Default is None, which deactivates this
        behaviour.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # missing values (None / pandas.NA / NaN) compare as maximally distant
    if is_none(s1) or is_none(s2):
        return 1.0
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    maximum = max(len(s1), len(s2))
    dist = distance(s1, s2)
    norm_dist = dist / maximum if maximum else 0
    if score_cutoff is not None and norm_dist > score_cutoff:
        return 1
    return norm_dist
def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized Damerau-Levenshtein similarity in the range [0, 1],
    computed as ``1 - normalized_distance``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable used to preprocess both strings before comparison.
        Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Score threshold between 0 and 1.0; for ``norm_sim < score_cutoff``
        0 is returned instead. Default is None, which deactivates this
        behaviour.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # missing values (None / pandas.NA / NaN) have zero similarity
    if is_none(s1) or is_none(s2):
        return 0.0
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    norm_sim = 1.0 - normalized_distance(s1, s2)
    if score_cutoff is not None and norm_sim < score_cutoff:
        return 0
    return norm_sim

View File

@@ -0,0 +1,116 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations
import contextlib
import os
from rapidfuzz._feature_detector import AVX2, SSE2, supports
__all__ = [
    "distance",
    "editops",
    "normalized_distance",
    "normalized_similarity",
    "opcodes",
    "similarity",
]

# Implementation selection for the Hamming scorers:
#   RAPIDFUZZ_IMPLEMENTATION=cpp    -> require a C++ extension (best SIMD build)
#   RAPIDFUZZ_IMPLEMENTATION=python -> force the pure-Python implementation
#   unset                           -> try the C++ builds, fall back to Python
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")

if _impl == "cpp":
    imported = False
    # try the AVX2 build first when the CPU supports it
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                hamming_distance as distance,
                hamming_editops as editops,
                hamming_normalized_distance as normalized_distance,
                hamming_normalized_similarity as normalized_similarity,
                hamming_opcodes as opcodes,
                hamming_similarity as similarity,
            )

            imported = True
    # then the SSE2 build
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                hamming_distance as distance,
                hamming_editops as editops,
                hamming_normalized_distance as normalized_distance,
                hamming_normalized_similarity as normalized_similarity,
                hamming_opcodes as opcodes,
                hamming_similarity as similarity,
            )

            imported = True
    # no fallback here: with cpp explicitly requested, a failed import raises
    if not imported:
        from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
            hamming_distance as distance,
            hamming_editops as editops,
            hamming_normalized_distance as normalized_distance,
            hamming_normalized_similarity as normalized_similarity,
            hamming_opcodes as opcodes,
            hamming_similarity as similarity,
        )
elif _impl == "python":
    from rapidfuzz.distance.metrics_py import (
        hamming_distance as distance,
        hamming_editops as editops,
        hamming_normalized_distance as normalized_distance,
        hamming_normalized_similarity as normalized_similarity,
        hamming_opcodes as opcodes,
        hamming_similarity as similarity,
    )
else:
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                hamming_distance as distance,
                hamming_editops as editops,
                hamming_normalized_distance as normalized_distance,
                hamming_normalized_similarity as normalized_similarity,
                hamming_opcodes as opcodes,
                hamming_similarity as similarity,
            )

            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                hamming_distance as distance,
                hamming_editops as editops,
                hamming_normalized_distance as normalized_distance,
                hamming_normalized_similarity as normalized_similarity,
                hamming_opcodes as opcodes,
                hamming_similarity as similarity,
            )

            imported = True
    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
                hamming_distance as distance,
                hamming_editops as editops,
                hamming_normalized_distance as normalized_distance,
                hamming_normalized_similarity as normalized_similarity,
                hamming_opcodes as opcodes,
                hamming_similarity as similarity,
            )

            imported = True
    # last resort: the pure-Python implementation always works
    if not imported:
        from rapidfuzz.distance.metrics_py import (
            hamming_distance as distance,
            hamming_editops as editops,
            hamming_normalized_distance as normalized_distance,
            hamming_normalized_similarity as normalized_similarity,
            hamming_opcodes as opcodes,
            hamming_similarity as similarity,
        )

View File

@@ -0,0 +1,113 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload
from rapidfuzz.distance import Editops, Opcodes
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")

# Each scorer has two overloads: without a ``processor`` the inputs must
# already be hashable sequences; with a ``processor`` arbitrary input types
# are accepted and converted to sequences before comparison.
@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    pad: bool = True,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    pad: bool = True,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    pad: bool = True,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    pad: bool = True,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    pad: bool = True,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    pad: bool = True,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    pad: bool = True,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    pad: bool = True,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...

# editops/opcodes describe the edit operations; note they take no pad
# parameter in this stub.
@overload
def editops(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
) -> Editops: ...
@overload
def editops(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
) -> Editops: ...
@overload
def opcodes(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
) -> Opcodes: ...
@overload
def opcodes(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
) -> Opcodes: ...

View File

@@ -0,0 +1,322 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from rapidfuzz._common_py import conv_sequences
from rapidfuzz._utils import is_none, setupPandas
from rapidfuzz.distance._initialize_py import Editop, Editops
def distance(
    s1,
    s2,
    *,
    pad=True,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the Hamming distance between two strings: the number of
    positions at which they differ, i.e. the minimum number of
    substitutions required to transform s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    pad : bool, optional
        Whether the shorter string is implicitly padded when the lengths
        differ. If False and the lengths differ, a ValueError is raised.
        Default is True.
    processor: callable, optional
        Optional callable used to preprocess both strings before comparison.
        Default is None, which deactivates this behaviour.
    score_cutoff : int or None, optional
        Maximum distance that is still reported exactly. For any larger
        distance ``score_cutoff + 1`` is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    distance : int
        distance between s1 and s2

    Raises
    ------
    ValueError
        If s1 and s2 have a different length and ``pad`` is False
    """
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    len1, len2 = len(s1), len(s2)
    if not pad and len1 != len2:
        msg = "Sequences are not the same length."
        raise ValueError(msg)
    # every padded position counts as a mismatch; matching positions are
    # subtracted from the worst case max(len1, len2)
    dist = max(len1, len2) - sum(ch1 == ch2 for ch1, ch2 in zip(s1, s2))
    if score_cutoff is not None and dist > score_cutoff:
        return score_cutoff + 1
    return dist
def similarity(
    s1,
    s2,
    *,
    pad=True,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the Hamming similarity between two strings, computed as
    ``max(len1, len2) - distance``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    pad : bool, optional
        Whether the shorter string is implicitly padded when the lengths
        differ. If False and the lengths differ, a ValueError is raised.
        Default is True.
    processor: callable, optional
        Optional callable used to preprocess both strings before comparison.
        Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Minimum similarity that is still reported; anything smaller yields 0.
        Default is None, which deactivates this behaviour.

    Returns
    -------
    similarity : int
        similarity between s1 and s2

    Raises
    ------
    ValueError
        If s1 and s2 have a different length and ``pad`` is False
    """
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    sim = max(len(s1), len(s2)) - distance(s1, s2, pad=pad)
    if score_cutoff is not None and sim < score_cutoff:
        return 0
    return sim
def normalized_distance(
    s1,
    s2,
    *,
    pad=True,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized Hamming distance in the range [1, 0], computed
    as ``distance / max(len1, len2)``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    pad : bool, optional
        Whether the shorter string is implicitly padded when the lengths
        differ. If False and the lengths differ, a ValueError is raised.
        Default is True.
    processor: callable, optional
        Optional callable used to preprocess both strings before comparison.
        Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Score threshold between 0 and 1.0; for ``norm_dist > score_cutoff``
        1.0 is returned instead. Default is None, which deactivates this
        behaviour.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 0 and 1.0

    Raises
    ------
    ValueError
        If s1 and s2 have a different length and ``pad`` is False
    """
    setupPandas()
    # missing values (None / pandas.NA / NaN) compare as maximally distant
    if is_none(s1) or is_none(s2):
        return 1.0
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    maximum = max(len(s1), len(s2))
    dist = distance(s1, s2, pad=pad)
    norm_dist = dist / maximum if maximum else 0
    if score_cutoff is not None and norm_dist > score_cutoff:
        return 1.0
    return norm_dist
def normalized_similarity(
    s1,
    s2,
    *,
    pad=True,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized Hamming similarity in the range [0, 1].
    This is calculated as ``1 - normalized_distance``

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    pad : bool, optional
        should strings be padded if there is a length difference.
        If pad is False and strings have a different length
        a ValueError is thrown instead. Default is True.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_sim < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0

    Raises
    ------
    ValueError
        If s1 and s2 have a different length and ``pad`` is False
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0.0
    # BUG FIX: the inputs were previously run through conv_sequences() here
    # and THEN handed to normalized_distance() together with ``processor``,
    # so a processor expecting the raw inputs received already-converted
    # (hashed) sequences for non-str/bytes inputs. Pass the raw inputs
    # through and let normalized_distance apply processor first, matching
    # every sibling scorer in this module.
    norm_dist = normalized_distance(s1, s2, pad=pad, processor=processor)
    norm_sim = 1 - norm_dist
    return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0.0
def editops(
    s1,
    s2,
    *,
    pad=True,
    processor=None,
):
    """
    Return Editops describing how to turn s1 into s2.

    Positions where the strings differ become ``replace`` operations; any
    trailing surplus of s1 becomes ``delete`` operations and any trailing
    surplus of s2 becomes ``insert`` operations.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    pad : bool, optional
        Whether the shorter string is implicitly padded when the lengths
        differ. If False and the lengths differ, a ValueError is raised.
        Default is True.
    processor: callable, optional
        Optional callable used to preprocess both strings before comparison.
        Default is None, which deactivates this behaviour.

    Returns
    -------
    editops : Editops
        edit operations required to turn s1 into s2
    """
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    if not pad and len(s1) != len(s2):
        msg = "Sequences are not the same length."
        raise ValueError(msg)
    min_len = min(len(s1), len(s2))
    ops_list = [Editop("replace", i, i) for i in range(min_len) if s1[i] != s2[i]]
    ops_list.extend(Editop("delete", i, len(s2)) for i in range(min_len, len(s1)))
    ops_list.extend(Editop("insert", len(s1), i) for i in range(min_len, len(s2)))
    # build the Editops directly to sidestep __init__'s input validation
    ops = Editops.__new__(Editops)
    ops._src_len = len(s1)
    ops._dest_len = len(s2)
    ops._editops = ops_list
    return ops
def opcodes(
    s1,
    s2,
    *,
    pad=True,
    processor=None,
):
    """
    Return Opcodes describing how to turn s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    pad : bool, optional
        Whether the shorter string is implicitly padded when the lengths
        differ. If False and the lengths differ, a ValueError is raised.
        Default is True.
    processor: callable, optional
        Optional callable used to preprocess both strings before comparison.
        Default is None, which deactivates this behaviour.

    Returns
    -------
    opcodes : Opcodes
        edit operations required to turn s1 into s2
    """
    # delegate to editops and convert the result to opcode form
    ops = editops(s1, s2, pad=pad, processor=processor)
    return ops.as_opcodes()

View File

@@ -0,0 +1,116 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations
import contextlib
import os
from rapidfuzz._feature_detector import AVX2, SSE2, supports
__all__ = [
"distance",
"editops",
"normalized_distance",
"normalized_similarity",
"opcodes",
"similarity",
]
# Backend selection for the Indel metric functions listed in __all__.
# The RAPIDFUZZ_IMPLEMENTATION environment variable can force a backend:
#   "cpp"    -> only the compiled C++ extensions are tried
#   "python" -> the pure-Python implementation is used unconditionally
# Otherwise the fastest importable backend wins, in the order
# AVX2 extension -> SSE2 extension -> portable C++ extension -> pure Python.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    if supports(AVX2):
        # prefer the AVX2 build when the CPU supports it
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
                indel_distance as distance,
                indel_editops as editops,
                indel_normalized_distance as normalized_distance,
                indel_normalized_similarity as normalized_similarity,
                indel_opcodes as opcodes,
                indel_similarity as similarity,
            )
            imported = True
    if not imported and supports(SSE2):
        # next-best SIMD variant
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
                indel_distance as distance,
                indel_editops as editops,
                indel_normalized_distance as normalized_distance,
                indel_normalized_similarity as normalized_similarity,
                indel_opcodes as opcodes,
                indel_similarity as similarity,
            )
            imported = True
    if not imported:
        # NOTE: deliberately NOT wrapped in suppress(ImportError): when the
        # user forces "cpp", a missing extension should raise loudly.
        from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
            indel_distance as distance,
            indel_editops as editops,
            indel_normalized_distance as normalized_distance,
            indel_normalized_similarity as normalized_similarity,
            indel_opcodes as opcodes,
            indel_similarity as similarity,
        )
elif _impl == "python":
    # forced pure-Python backend
    from rapidfuzz.distance.metrics_py import (
        indel_distance as distance,
        indel_editops as editops,
        indel_normalized_distance as normalized_distance,
        indel_normalized_similarity as normalized_similarity,
        indel_opcodes as opcodes,
        indel_similarity as similarity,
    )
else:
    # automatic selection: try every C++ variant, fall back to pure Python
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
                indel_distance as distance,
                indel_editops as editops,
                indel_normalized_distance as normalized_distance,
                indel_normalized_similarity as normalized_similarity,
                indel_opcodes as opcodes,
                indel_similarity as similarity,
            )
            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
                indel_distance as distance,
                indel_editops as editops,
                indel_normalized_distance as normalized_distance,
                indel_normalized_similarity as normalized_similarity,
                indel_opcodes as opcodes,
                indel_similarity as similarity,
            )
            imported = True
    if not imported:
        # here the portable C++ extension is also allowed to fail ...
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
                indel_distance as distance,
                indel_editops as editops,
                indel_normalized_distance as normalized_distance,
                indel_normalized_similarity as normalized_similarity,
                indel_opcodes as opcodes,
                indel_similarity as similarity,
            )
            imported = True
    if not imported:
        # ... because the pure-Python implementation is always available
        from rapidfuzz.distance.metrics_py import (
            indel_distance as distance,
            indel_editops as editops,
            indel_normalized_distance as normalized_distance,
            indel_normalized_similarity as normalized_similarity,
            indel_opcodes as opcodes,
            indel_similarity as similarity,
        )

View File

@@ -0,0 +1,105 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload
from rapidfuzz.distance import Editops, Opcodes
# TypeVars describing the raw (unprocessed) argument types that are accepted
# when a ``processor`` callable converts them into comparable sequences.
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")

# Each function below has two overloads:
#   1. no processor       -> arguments must already be sequences of hashables
#   2. processor supplied -> arguments may be anything the processor accepts

# Indel distance: integer count of insertions + deletions.
@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...

# Normalized Indel distance in the range [0, 1].
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...

# Indel similarity: (len1 + len2) - distance, as an integer.
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...

# Normalized Indel similarity in the range [0, 1].
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...

# Edit operations (insert/delete) that transform s1 into s2.
@overload
def editops(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
) -> Editops: ...
@overload
def editops(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
) -> Editops: ...

# Same alignment as editops, encoded as range-based opcodes.
@overload
def opcodes(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
) -> Opcodes: ...
@overload
def opcodes(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
) -> Opcodes: ...

View File

@@ -0,0 +1,358 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from rapidfuzz._common_py import conv_sequences
from rapidfuzz._utils import is_none, setupPandas
from rapidfuzz.distance.LCSseq_py import (
_block_similarity as lcs_seq_block_similarity,
editops as lcs_seq_editops,
opcodes as lcs_seq_opcodes,
similarity as lcs_seq_similarity,
)
def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculate the minimum number of insertions and deletions required to
    change one sequence into the other (Levenshtein distance with a
    substitution weight of 2).

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable used to preprocess both sequences before comparing
        them. Default is None, which disables preprocessing.
    score_cutoff : int, optional
        Maximum distance between s1 and s2 that is considered a result. When
        the distance exceeds score_cutoff, score_cutoff + 1 is returned
        instead. Default is None, which disables the threshold.

    Returns
    -------
    distance : int
        distance between s1 and s2

    Examples
    --------
    >>> from rapidfuzz.distance import Indel
    >>> Indel.distance("lewenstein", "levenshtein")
    3
    """
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    # Indel distance derived from the LCS length:
    # every element outside the LCS must be inserted or deleted.
    total_len = len(s1) + len(s2)
    dist = total_len - 2 * lcs_seq_similarity(s1, s2)
    if score_cutoff is not None and dist > score_cutoff:
        return score_cutoff + 1
    return dist
def _block_distance(
    block,
    s1,
    s2,
    score_cutoff=None,
):
    # Variant of distance() reusing a precomputed character-block mapping.
    total_len = len(s1) + len(s2)
    dist = total_len - 2 * lcs_seq_block_similarity(block, s1, s2)
    if score_cutoff is not None and dist > score_cutoff:
        return score_cutoff + 1
    return dist
def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculate the Indel similarity in the range [max, 0], computed as
    ``(len1 + len2) - distance``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable used to preprocess both sequences before comparing
        them. Default is None, which disables preprocessing.
    score_cutoff : int, optional
        Minimum similarity that is considered a result. When the similarity
        is smaller than score_cutoff, 0 is returned instead. Default is None,
        which disables the threshold.

    Returns
    -------
    similarity : int
        similarity between s1 and s2
    """
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    sim = (len(s1) + len(s2)) - distance(s1, s2)
    if score_cutoff is not None and sim < score_cutoff:
        return 0
    return sim
def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculate a normalized Indel distance in the range [1, 0], computed as
    ``distance / (len1 + len2)``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable used to preprocess both sequences before comparing
        them. Default is None, which disables preprocessing.
    score_cutoff : float, optional
        Score threshold as a float between 0 and 1.0. For
        norm_dist > score_cutoff 1.0 is returned instead. Default is None,
        which disables the threshold.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # None-like inputs (e.g. pandas NA) are treated as maximally distant.
    if is_none(s1) or is_none(s2):
        return 1.0
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    total_len = len(s1) + len(s2)
    dist = distance(s1, s2)
    # two empty sequences are identical -> distance 0
    norm_dist = dist / total_len if total_len else 0
    if score_cutoff is not None and norm_dist > score_cutoff:
        return 1
    return norm_dist
def _block_normalized_distance(
    block,
    s1,
    s2,
    score_cutoff=None,
):
    # Variant of normalized_distance() reusing a precomputed block mapping.
    total_len = len(s1) + len(s2)
    norm_dist = _block_distance(block, s1, s2) / total_len if total_len else 0
    if score_cutoff is not None and norm_dist > score_cutoff:
        return 1
    return norm_dist
def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculate a normalized Indel similarity in the range [0, 1], computed as
    ``1 - normalized_distance``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable used to preprocess both sequences before comparing
        them. Default is None, which disables preprocessing.
    score_cutoff : float, optional
        Score threshold as a float between 0 and 1.0. For
        norm_sim < score_cutoff 0 is returned instead. Default is None,
        which disables the threshold.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0

    Examples
    --------
    >>> from rapidfuzz.distance import Indel
    >>> Indel.normalized_similarity("lewenstein", "levenshtein")
    0.85714285714285
    """
    setupPandas()
    # None-like inputs (e.g. pandas NA) have no similarity
    if is_none(s1) or is_none(s2):
        return 0.0
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    norm_sim = 1.0 - normalized_distance(s1, s2)
    if score_cutoff is not None and norm_sim < score_cutoff:
        return 0
    return norm_sim
def _block_normalized_similarity(
    block,
    s1,
    s2,
    score_cutoff=None,
):
    # Variant of normalized_similarity() reusing a precomputed block mapping.
    norm_sim = 1.0 - _block_normalized_distance(block, s1, s2)
    if score_cutoff is not None and norm_sim < score_cutoff:
        return 0
    return norm_sim
def editops(
    s1,
    s2,
    *,
    processor=None,
):
    """
    Return Editops describing how to turn s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable used to preprocess both sequences before comparing
        them. Default is None, which disables preprocessing.

    Returns
    -------
    editops : Editops
        edit operations required to turn s1 into s2

    Notes
    -----
    The alignment is calculated using an algorithm of Heikki Hyyrö, which is
    described [6]_. It has a time complexity and memory usage of
    ``O([N/64] * M)``.

    References
    ----------
    .. [6] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).
    """
    # The Indel alignment is exactly the LCS alignment (no substitutions),
    # so the LCSseq implementation is reused directly.
    return lcs_seq_editops(s1, s2, processor=processor)
def opcodes(
    s1,
    s2,
    *,
    processor=None,
):
    """
    Return Opcodes describing how to turn s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable used to preprocess both sequences before comparing
        them. Default is None, which disables preprocessing.

    Returns
    -------
    opcodes : Opcodes
        edit operations required to turn s1 into s2

    Notes
    -----
    The alignment is calculated using an algorithm of Heikki Hyyrö, which is
    described [7]_. It has a time complexity and memory usage of
    ``O([N/64] * M)``.

    References
    ----------
    .. [7] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).
    """
    # The Indel alignment is exactly the LCS alignment (no substitutions),
    # so the LCSseq implementation is reused directly.
    return lcs_seq_opcodes(s1, s2, processor=processor)

View File

@@ -0,0 +1,93 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations
import contextlib
import os
from rapidfuzz._feature_detector import AVX2, SSE2, supports
__all__ = ["distance", "normalized_distance", "normalized_similarity", "similarity"]
# Backend selection for the Jaro metric functions listed in __all__.
# RAPIDFUZZ_IMPLEMENTATION may force "cpp" or "python"; otherwise the
# fastest importable backend wins: AVX2 -> SSE2 -> portable C++ -> Python.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    if supports(AVX2):
        # prefer the AVX2 build when the CPU supports it
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
                jaro_distance as distance,
                jaro_normalized_distance as normalized_distance,
                jaro_normalized_similarity as normalized_similarity,
                jaro_similarity as similarity,
            )
            imported = True
    if not imported and supports(SSE2):
        # next-best SIMD variant
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
                jaro_distance as distance,
                jaro_normalized_distance as normalized_distance,
                jaro_normalized_similarity as normalized_similarity,
                jaro_similarity as similarity,
            )
            imported = True
    if not imported:
        # NOTE: not suppressed — with a forced "cpp" backend a missing
        # extension should raise loudly.
        from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
            jaro_distance as distance,
            jaro_normalized_distance as normalized_distance,
            jaro_normalized_similarity as normalized_similarity,
            jaro_similarity as similarity,
        )
elif _impl == "python":
    # forced pure-Python backend
    from rapidfuzz.distance.metrics_py import (
        jaro_distance as distance,
        jaro_normalized_distance as normalized_distance,
        jaro_normalized_similarity as normalized_similarity,
        jaro_similarity as similarity,
    )
else:
    # automatic selection: try every C++ variant, fall back to pure Python
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
                jaro_distance as distance,
                jaro_normalized_distance as normalized_distance,
                jaro_normalized_similarity as normalized_similarity,
                jaro_similarity as similarity,
            )
            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
                jaro_distance as distance,
                jaro_normalized_distance as normalized_distance,
                jaro_normalized_similarity as normalized_similarity,
                jaro_similarity as similarity,
            )
            imported = True
    if not imported:
        # here the portable C++ extension is also allowed to fail ...
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
                jaro_distance as distance,
                jaro_normalized_distance as normalized_distance,
                jaro_normalized_similarity as normalized_similarity,
                jaro_similarity as similarity,
            )
            imported = True
    if not imported:
        # ... because the pure-Python implementation is always available
        from rapidfuzz.distance.metrics_py import (
            jaro_distance as distance,
            jaro_normalized_distance as normalized_distance,
            jaro_normalized_similarity as normalized_similarity,
            jaro_similarity as similarity,
        )

View File

@@ -0,0 +1,75 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload
# TypeVars describing the raw (unprocessed) argument types accepted when a
# ``processor`` callable converts them into comparable sequences.
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")

# Each function has two overloads: one for plain sequences (processor=None)
# and one where a processor converts arbitrary inputs into sequences.
# All Jaro scores are floats in the range [0, 1].

@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...

@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...

@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...

@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...

View File

@@ -0,0 +1,93 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations
import contextlib
import os
from rapidfuzz._feature_detector import AVX2, SSE2, supports
__all__ = ["distance", "normalized_distance", "normalized_similarity", "similarity"]
# Backend selection for the Jaro-Winkler metric functions listed in __all__.
# RAPIDFUZZ_IMPLEMENTATION may force "cpp" or "python"; otherwise the
# fastest importable backend wins: AVX2 -> SSE2 -> portable C++ -> Python.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    if supports(AVX2):
        # prefer the AVX2 build when the CPU supports it
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
                jaro_winkler_distance as distance,
                jaro_winkler_normalized_distance as normalized_distance,
                jaro_winkler_normalized_similarity as normalized_similarity,
                jaro_winkler_similarity as similarity,
            )
            imported = True
    if not imported and supports(SSE2):
        # next-best SIMD variant
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
                jaro_winkler_distance as distance,
                jaro_winkler_normalized_distance as normalized_distance,
                jaro_winkler_normalized_similarity as normalized_similarity,
                jaro_winkler_similarity as similarity,
            )
            imported = True
    if not imported:
        # NOTE: not suppressed — with a forced "cpp" backend a missing
        # extension should raise loudly.
        from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
            jaro_winkler_distance as distance,
            jaro_winkler_normalized_distance as normalized_distance,
            jaro_winkler_normalized_similarity as normalized_similarity,
            jaro_winkler_similarity as similarity,
        )
elif _impl == "python":
    # forced pure-Python backend
    from rapidfuzz.distance.metrics_py import (
        jaro_winkler_distance as distance,
        jaro_winkler_normalized_distance as normalized_distance,
        jaro_winkler_normalized_similarity as normalized_similarity,
        jaro_winkler_similarity as similarity,
    )
else:
    # automatic selection: try every C++ variant, fall back to pure Python
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
                jaro_winkler_distance as distance,
                jaro_winkler_normalized_distance as normalized_distance,
                jaro_winkler_normalized_similarity as normalized_similarity,
                jaro_winkler_similarity as similarity,
            )
            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
                jaro_winkler_distance as distance,
                jaro_winkler_normalized_distance as normalized_distance,
                jaro_winkler_normalized_similarity as normalized_similarity,
                jaro_winkler_similarity as similarity,
            )
            imported = True
    if not imported:
        # here the portable C++ extension is also allowed to fail ...
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
                jaro_winkler_distance as distance,
                jaro_winkler_normalized_distance as normalized_distance,
                jaro_winkler_normalized_similarity as normalized_similarity,
                jaro_winkler_similarity as similarity,
            )
            imported = True
    if not imported:
        # ... because the pure-Python implementation is always available
        from rapidfuzz.distance.metrics_py import (
            jaro_winkler_distance as distance,
            jaro_winkler_normalized_distance as normalized_distance,
            jaro_winkler_normalized_similarity as normalized_similarity,
            jaro_winkler_similarity as similarity,
        )

View File

@@ -0,0 +1,83 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload
# TypeVars describing the raw (unprocessed) argument types accepted when a
# ``processor`` callable converts them into comparable sequences.
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")

# Each function has two overloads: one for plain sequences (processor=None)
# and one where a processor converts arbitrary inputs into sequences.
# prefix_weight is the Winkler prefix bonus weight (default 0.1);
# all scores are floats in the range [0, 1].

@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    prefix_weight: float = 0.1,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    prefix_weight: float = 0.1,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...

@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    prefix_weight: float = 0.1,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    prefix_weight: float = 0.1,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...

@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    prefix_weight: float = 0.1,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    prefix_weight: float = 0.1,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...

@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    prefix_weight: float = 0.1,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    prefix_weight: float = 0.1,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...

View File

@@ -0,0 +1,235 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from rapidfuzz._common_py import conv_sequences
from rapidfuzz._utils import is_none, setupPandas
from rapidfuzz.distance import Jaro_py as Jaro
def similarity(
    s1,
    s2,
    *,
    prefix_weight=0.1,
    processor=None,
    score_cutoff=None,
):
    """
    Calculate the Jaro-Winkler similarity.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    prefix_weight : float, optional
        Weight applied to the common prefix (at most 4 elements) of the two
        sequences. Must be in the range 0.0 - 1.0. Default is 0.1.
    processor : callable, optional
        Optional callable used to preprocess both sequences before comparing
        them. Default is None, which disables preprocessing.
    score_cutoff : float, optional
        Score threshold as a float between 0 and 1.0. For
        ratio < score_cutoff 0 is returned instead. Default is None, which
        disables the threshold.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 1.0

    Raises
    ------
    ValueError
        If prefix_weight is outside the range 0.0 - 1.0.
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0.0
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    if score_cutoff is None:
        score_cutoff = 0
    if prefix_weight > 1.0 or prefix_weight < 0.0:
        msg = "prefix_weight has to be in the range 0.0 - 1.0"
        raise ValueError(msg)
    s1, s2 = conv_sequences(s1, s2)

    # length of the common prefix, capped at 4 elements
    prefix = 0
    for ch1, ch2 in zip(s1[:4], s2[:4]):
        if ch1 != ch2:
            break
        prefix += 1

    # Translate the Jaro-Winkler cutoff into the weakest Jaro cutoff that
    # could still reach it after the prefix bonus (the bonus only applies
    # when the plain Jaro score exceeds 0.7).
    jaro_cutoff = score_cutoff
    if jaro_cutoff > 0.7:
        prefix_sim = prefix * prefix_weight
        if prefix_sim >= 1.0:
            jaro_cutoff = 0.7
        else:
            jaro_cutoff = max(0.7, (prefix_sim - jaro_cutoff) / (prefix_sim - 1.0))

    sim = Jaro.similarity(s1, s2, score_cutoff=jaro_cutoff)
    if sim > 0.7:
        # apply the Winkler prefix bonus, clamped to 1.0
        sim += prefix * prefix_weight * (1.0 - sim)
        sim = min(sim, 1.0)

    return sim if sim >= score_cutoff else 0
def normalized_similarity(
    s1,
    s2,
    *,
    prefix_weight=0.1,
    processor=None,
    score_cutoff=None,
):
    """
    Calculate the normalized Jaro-Winkler similarity.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    prefix_weight : float, optional
        Weight applied to the common prefix of the two sequences.
        Default is 0.1.
    processor : callable, optional
        Optional callable used to preprocess both sequences before comparing
        them. Default is None, which disables preprocessing.
    score_cutoff : float, optional
        Score threshold as a float between 0 and 1.0. For
        ratio < score_cutoff 0 is returned instead. Default is None, which
        disables the threshold.

    Returns
    -------
    normalized similarity : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0

    Raises
    ------
    ValueError
        If prefix_weight is invalid
    """
    # The Jaro-Winkler similarity is already normalized to [0, 1],
    # so this is a plain delegation.
    return similarity(
        s1,
        s2,
        prefix_weight=prefix_weight,
        processor=processor,
        score_cutoff=score_cutoff,
    )
def distance(
    s1,
    s2,
    *,
    prefix_weight=0.1,
    processor=None,
    score_cutoff=None,
):
    """
    Calculate the Jaro-Winkler distance as ``1 - similarity``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    prefix_weight : float, optional
        Weight applied to the common prefix of the two sequences.
        Default is 0.1.
    processor : callable, optional
        Optional callable used to preprocess both sequences before comparing
        them. Default is None, which disables preprocessing.
    score_cutoff : float, optional
        Score threshold as a float between 0 and 1.0. For
        dist > score_cutoff 1.0 is returned instead. Default is None, which
        disables the threshold.

    Returns
    -------
    distance : float
        distance between s1 and s2 as a float between 1.0 and 0.0

    Raises
    ------
    ValueError
        If prefix_weight is invalid
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 1.0
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    # convert the distance cutoff into the equivalent similarity cutoff
    if score_cutoff is None or score_cutoff > 1.0:
        cutoff_similarity = None
    else:
        cutoff_similarity = 1.0 - score_cutoff
    dist = 1.0 - similarity(s1, s2, prefix_weight=prefix_weight, score_cutoff=cutoff_similarity)
    if score_cutoff is not None and dist > score_cutoff:
        return 1.0
    return dist
def normalized_distance(
    s1,
    s2,
    *,
    prefix_weight=0.1,
    processor=None,
    score_cutoff=None,
):
    """
    Calculate the normalized Jaro-Winkler distance.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    prefix_weight : float, optional
        Weight applied to the common prefix of the two sequences.
        Default is 0.1.
    processor : callable, optional
        Optional callable used to preprocess both sequences before comparing
        them. Default is None, which disables preprocessing.
    score_cutoff : float, optional
        Score threshold as a float between 0 and 1.0. For
        dist > score_cutoff 1.0 is returned instead. Default is None, which
        disables the threshold.

    Returns
    -------
    normalized distance : float
        normalized distance between s1 and s2 as a float between 1.0 and 0.0

    Raises
    ------
    ValueError
        If prefix_weight is invalid
    """
    # The Jaro-Winkler distance is already normalized to [0, 1],
    # so this is a plain delegation.
    return distance(
        s1,
        s2,
        prefix_weight=prefix_weight,
        processor=processor,
        score_cutoff=score_cutoff,
    )

View File

@@ -0,0 +1,255 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from rapidfuzz._common_py import conv_sequences
from rapidfuzz._utils import is_none, setupPandas
def _jaro_calculate_similarity(pattern_len, text_len, common_chars, transpositions):
transpositions //= 2
sim = 0.0
sim += common_chars / pattern_len
sim += common_chars / text_len
sim += (common_chars - transpositions) / common_chars
return sim / 3.0
def _jaro_length_filter(pattern_len, text_len, score_cutoff):
    """
    Filter matches below score_cutoff based on the sequence lengths alone.

    Returns True when the best similarity still achievable for these lengths
    (every element of the shorter sequence matching in order) can reach
    score_cutoff.
    """
    if pattern_len == 0 or text_len == 0:
        return False
    best_possible = _jaro_calculate_similarity(pattern_len, text_len, min(pattern_len, text_len), 0)
    return best_possible >= score_cutoff
def _jaro_common_char_filter(pattern_len, text_len, common_chars, score_cutoff):
    """
    Filter matches below score_cutoff based on lengths and common characters.

    Returns True when the similarity achievable with the given number of
    common characters (assuming zero transpositions) can reach score_cutoff.
    """
    if common_chars == 0:
        return False
    best_possible = _jaro_calculate_similarity(pattern_len, text_len, common_chars, 0)
    return best_possible >= score_cutoff
def _jaro_bounds(s1, s2):
"""
find bounds and skip out of bound parts of the sequences
"""
pattern_len = len(s1)
text_len = len(s2)
# since jaro uses a sliding window some parts of T/P might never be in
# range an can be removed ahead of time
bound = 0
if text_len > pattern_len:
bound = text_len // 2 - 1
if text_len > pattern_len + bound:
s2 = s2[: pattern_len + bound]
else:
bound = pattern_len // 2 - 1
if pattern_len > text_len + bound:
s1 = s1[: text_len + bound]
return s1, s2, bound
def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the Jaro similarity in the range [0, 1].

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For ratio < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # None-like inputs (e.g. pandas NA) have no similarity
    if is_none(s1) or is_none(s2):
        return 0.0
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    # two empty sequences are defined as identical
    if not s1 and not s2:
        return 1.0
    if score_cutoff is None:
        score_cutoff = 0
    s1, s2 = conv_sequences(s1, s2)
    pattern_len = len(s1)
    text_len = len(s2)
    # short circuit if score_cutoff can not be reached
    # NOTE: the short-circuit branches return int 0, not 0.0
    if not _jaro_length_filter(pattern_len, text_len, score_cutoff):
        return 0
    if pattern_len == 1 and text_len == 1:
        return float(s1[0] == s2[0])
    # trim parts of the sequences that can never fall inside the
    # sliding match window; ``bound`` is the window half-width
    s1, s2, bound = _jaro_bounds(s1, s2)
    # flags mark elements that have been paired up as matches;
    # they are sized to the ORIGINAL lengths, so trailing trimmed
    # positions simply stay False
    s1_flags = [False] * pattern_len
    s2_flags = [False] * text_len
    # todo use bitparallel implementation
    # looking only within search range, count & flag matched pairs
    # (each s2 element may be claimed by at most one s1 element)
    common_chars = 0
    for i, s1_ch in enumerate(s1):
        low = max(0, i - bound)
        hi = min(i + bound, text_len - 1)
        for j in range(low, hi + 1):
            if not s2_flags[j] and s2[j] == s1_ch:
                s1_flags[i] = s2_flags[j] = True
                common_chars += 1
                break
    # short circuit if score_cutoff can not be reached
    if not _jaro_common_char_filter(pattern_len, text_len, common_chars, score_cutoff):
        return 0
    # todo use bitparallel implementation
    # count transpositions: walk the flagged positions of both sequences in
    # order and count matched pairs whose elements differ (out-of-order
    # matches); each such pair is counted once here and halved later in
    # _jaro_calculate_similarity
    k = trans_count = 0
    for i, s1_f in enumerate(s1_flags):
        if s1_f:
            # find the next flagged position in s2; ``j`` from this loop is
            # intentionally reused after the break
            for j in range(k, text_len):
                if s2_flags[j]:
                    k = j + 1
                    break
            if s1[i] != s2[j]:
                trans_count += 1
    return _jaro_calculate_similarity(pattern_len, text_len, common_chars, trans_count)
def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculate the normalized Jaro similarity.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable used to preprocess both sequences before comparing
        them. Default is None, which disables preprocessing.
    score_cutoff : float, optional
        Score threshold as a float between 0 and 1.0. For
        ratio < score_cutoff 0 is returned instead. Default is None, which
        disables the threshold.

    Returns
    -------
    normalized similarity : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0
    """
    # The Jaro similarity is already normalized to [0, 1], so delegate.
    return similarity(s1, s2, processor=processor, score_cutoff=score_cutoff)
def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the Jaro distance.

    This is calculated as ``1.0 - similarity``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For ratio < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    distance : float
        distance between s1 and s2 as a float between 1.0 and 0.0
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 1.0
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    # translate the distance cutoff into a similarity cutoff for `similarity`;
    # a cutoff > 1.0 can never be exceeded, so it is equivalent to no cutoff
    if score_cutoff is not None and score_cutoff <= 1.0:
        sim_cutoff = 1.0 - score_cutoff
    else:
        sim_cutoff = None
    result = 1.0 - similarity(s1, s2, score_cutoff=sim_cutoff)
    if score_cutoff is not None and result > score_cutoff:
        return 1.0
    return result
def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the normalized Jaro distance.

    The Jaro distance is already a value in the range [0, 1], so this is
    simply an alias for :func:`distance`.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For ratio < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    normalized distance : float
        normalized distance between s1 and s2 as a float between 1.0 and 0.0
    """
    result = distance(s1, s2, processor=processor, score_cutoff=score_cutoff)
    return result

View File

@@ -0,0 +1,116 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
"""Backend dispatcher for the LCSseq metric.

Selects at import time between the SIMD-accelerated C++ extensions
(AVX2, then SSE2, then the generic C++ build) and the pure-Python
fallback, re-exporting the chosen implementation under the public names
listed in ``__all__``.
"""
from __future__ import annotations
import contextlib
import os
from rapidfuzz._feature_detector import AVX2, SSE2, supports
__all__ = [
    "distance",
    "editops",
    "normalized_distance",
    "normalized_similarity",
    "opcodes",
    "similarity",
]
# RAPIDFUZZ_IMPLEMENTATION forces a backend: "cpp" requires the C++
# extensions (no Python fallback), "python" forces the pure-Python module,
# anything else (including unset) tries C++ first and falls back to Python.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    # prefer the widest SIMD build the CPU supports
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
                lcs_seq_distance as distance,
                lcs_seq_editops as editops,
                lcs_seq_normalized_distance as normalized_distance,
                lcs_seq_normalized_similarity as normalized_similarity,
                lcs_seq_opcodes as opcodes,
                lcs_seq_similarity as similarity,
            )
            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
                lcs_seq_distance as distance,
                lcs_seq_editops as editops,
                lcs_seq_normalized_distance as normalized_distance,
                lcs_seq_normalized_similarity as normalized_similarity,
                lcs_seq_opcodes as opcodes,
                lcs_seq_similarity as similarity,
            )
            imported = True
    if not imported:
        # no suppress here: when "cpp" is forced a missing extension is an error
        from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
            lcs_seq_distance as distance,
            lcs_seq_editops as editops,
            lcs_seq_normalized_distance as normalized_distance,
            lcs_seq_normalized_similarity as normalized_similarity,
            lcs_seq_opcodes as opcodes,
            lcs_seq_similarity as similarity,
        )
elif _impl == "python":
    from rapidfuzz.distance.metrics_py import (
        lcs_seq_distance as distance,
        lcs_seq_editops as editops,
        lcs_seq_normalized_distance as normalized_distance,
        lcs_seq_normalized_similarity as normalized_similarity,
        lcs_seq_opcodes as opcodes,
        lcs_seq_similarity as similarity,
    )
else:
    # default: best available C++ build, pure Python as the final fallback
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
                lcs_seq_distance as distance,
                lcs_seq_editops as editops,
                lcs_seq_normalized_distance as normalized_distance,
                lcs_seq_normalized_similarity as normalized_similarity,
                lcs_seq_opcodes as opcodes,
                lcs_seq_similarity as similarity,
            )
            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
                lcs_seq_distance as distance,
                lcs_seq_editops as editops,
                lcs_seq_normalized_distance as normalized_distance,
                lcs_seq_normalized_similarity as normalized_similarity,
                lcs_seq_opcodes as opcodes,
                lcs_seq_similarity as similarity,
            )
            imported = True
    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
                lcs_seq_distance as distance,
                lcs_seq_editops as editops,
                lcs_seq_normalized_distance as normalized_distance,
                lcs_seq_normalized_similarity as normalized_similarity,
                lcs_seq_opcodes as opcodes,
                lcs_seq_similarity as similarity,
            )
            imported = True
    if not imported:
        from rapidfuzz.distance.metrics_py import (
            lcs_seq_distance as distance,
            lcs_seq_editops as editops,
            lcs_seq_normalized_distance as normalized_distance,
            lcs_seq_normalized_similarity as normalized_similarity,
            lcs_seq_opcodes as opcodes,
            lcs_seq_similarity as similarity,
        )

View File

@@ -0,0 +1,105 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
"""Type stubs for the LCSseq (longest common subsequence) metric.

Each public function carries two overloads: one for pre-processed
sequences of hashables (``processor=None``) and one where a ``processor``
callable converts arbitrary inputs into such sequences first.
"""
from __future__ import annotations
from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload
from rapidfuzz.distance import Editops, Opcodes
# input types before preprocessing; the processor maps them to Sequence[Hashable]
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")
@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
# editops/opcodes return alignment information instead of a score,
# so they take no score_cutoff
@overload
def editops(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
) -> Editops: ...
@overload
def editops(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
) -> Editops: ...
@overload
def opcodes(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
) -> Opcodes: ...
@overload
def opcodes(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
) -> Opcodes: ...

View File

@@ -0,0 +1,426 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from rapidfuzz._common_py import common_affix, conv_sequences
from rapidfuzz._utils import is_none, setupPandas
from rapidfuzz.distance._initialize_py import Editop, Editops
def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the length of the longest common subsequence.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Maximum distance between s1 and s2, that is
        considered as a result. If the similarity is smaller than score_cutoff,
        0 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    similarity : int
        similarity between s1 and s2
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    if not s1:
        return 0
    s1, s2 = conv_sequences(s1, s2)
    # bit-parallel state: one bit per character of s1, all bits set initially;
    # each cleared bit later corresponds to one matched subsequence character
    S = (1 << len(s1)) - 1
    # per-character bitmask of the positions where each character occurs in s1
    block = {}
    block_get = block.get
    x = 1
    for ch1 in s1:
        block[ch1] = block_get(ch1, 0) | x
        x <<= 1
    for ch2 in s2:
        Matches = block_get(ch2, 0)
        u = S & Matches
        # carry-propagating update: clears the lowest usable match bit per step
        S = (S + u) | (S - u)
    # calculate the equivalent of popcount(~S) in C. This breaks for len(s1) == 0
    res = bin(S)[-len(s1) :].count("0")
    return res if (score_cutoff is None or res >= score_cutoff) else 0
def _block_similarity(
block,
s1,
s2,
score_cutoff=None,
):
if not s1:
return 0
S = (1 << len(s1)) - 1
block_get = block.get
for ch2 in s2:
Matches = block_get(ch2, 0)
u = S & Matches
S = (S + u) | (S - u)
# calculate the equivalent of popcount(~S) in C. This breaks for len(s1) == 0
res = bin(S)[-len(s1) :].count("0")
return res if (score_cutoff is None or res >= score_cutoff) else 0
def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the LCS distance in the range [0, max].

    This is calculated as ``max(len1, len2) - similarity``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Maximum distance between s1 and s2, that is
        considered as a result. If the distance is bigger than score_cutoff,
        score_cutoff + 1 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    distance : int
        distance between s1 and s2

    Examples
    --------
    Find the LCS distance between two strings:

    >>> from rapidfuzz.distance import LCSseq
    >>> LCSseq.distance("lewenstein", "levenshtein")
    2

    Setting a maximum distance allows the implementation to select
    a more efficient implementation:

    >>> LCSseq.distance("lewenstein", "levenshtein", score_cutoff=1)
    2
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    longer = max(len(s1), len(s2))
    result = longer - similarity(s1, s2)
    if score_cutoff is not None and result > score_cutoff:
        return score_cutoff + 1
    return result
def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized LCS distance in the range [1, 0].

    This is calculated as ``distance / max(len1, len2)``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_dist > score_cutoff 1.0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 1.0
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    maximum = max(len(s1), len(s2))
    # BUGFIX: the previous `if not s1 or not s2: return 0` treated an empty
    # vs. non-empty pair as identical; an empty sequence is maximally distant
    # from a non-empty one (norm_dist == 1.0). Only guard the division when
    # both sequences are empty, in which case they are identical.
    norm_dist = distance(s1, s2) / maximum if maximum else 0.0
    return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1.0
def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized LCS similarity in the range [0, 1].

    This is calculated as ``1 - normalized_distance``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_sim < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0

    Examples
    --------
    Find the normalized LCS similarity between two strings:

    >>> from rapidfuzz.distance import LCSseq
    >>> LCSseq.normalized_similarity("lewenstein", "levenshtein")
    0.8181818181818181

    Setting a score_cutoff allows the implementation to select
    a more efficient implementation:

    >>> LCSseq.normalized_similarity("lewenstein", "levenshtein", score_cutoff=0.9)
    0.0

    When a different processor is used s1 and s2 do not have to be strings

    >>> LCSseq.normalized_similarity(["lewenstein"], ["levenshtein"], processor=lambda s: s[0])
    0.8181818181818181
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0.0
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    result = 1.0 - normalized_distance(s1, s2)
    if score_cutoff is not None and result < score_cutoff:
        return 0
    return result
def _matrix(s1, s2):
if not s1:
return (0, [])
S = (1 << len(s1)) - 1
block = {}
block_get = block.get
x = 1
for ch1 in s1:
block[ch1] = block_get(ch1, 0) | x
x <<= 1
matrix = []
for ch2 in s2:
Matches = block_get(ch2, 0)
u = S & Matches
S = (S + u) | (S - u)
matrix.append(S)
# calculate the equivalent of popcount(~S) in C. This breaks for len(s1) == 0
sim = bin(S)[-len(s1) :].count("0")
return (sim, matrix)
def editops(
    s1,
    s2,
    *,
    processor=None,
):
    """
    Return Editops describing how to turn s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.

    Returns
    -------
    editops : Editops
        edit operations required to turn s1 into s2

    Notes
    -----
    The alignment is calculated using an algorithm of Heikki Hyyrö, which is
    described in [6]_. It has a time complexity and memory usage of ``O([N/64] * M)``.

    References
    ----------
    .. [6] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).

    Examples
    --------
    >>> from rapidfuzz.distance import LCSseq
    >>> for tag, src_pos, dest_pos in LCSseq.editops("qabxcd", "abycdf"):
    ...     print(("%7s s1[%d] s2[%d]" % (tag, src_pos, dest_pos)))
     delete s1[0] s2[0]
     delete s1[3] s2[2]
     insert s1[4] s2[2]
     insert s1[6] s2[5]
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    # a common prefix/suffix needs no edit operations; strip it and add
    # prefix_len back to the reported positions later
    prefix_len, suffix_len = common_affix(s1, s2)
    s1 = s1[prefix_len : len(s1) - suffix_len]
    s2 = s2[prefix_len : len(s2) - suffix_len]
    sim, matrix = _matrix(s1, s2)
    editops = Editops([], 0, 0)
    editops._src_len = len(s1) + prefix_len + suffix_len
    editops._dest_len = len(s2) + prefix_len + suffix_len
    # LCS-based edit distance: every non-matched character of s1 is a delete,
    # every non-matched character of s2 is an insert
    dist = len(s1) + len(s2) - 2 * sim
    if dist == 0:
        return editops
    # the traceback walks from the bottom-right corner backwards, filling
    # the operation list from the end (dist is used as a write cursor)
    editop_list = [None] * dist
    col = len(s1)
    row = len(s2)
    while row != 0 and col != 0:
        # deletion
        if matrix[row - 1] & (1 << (col - 1)):
            dist -= 1
            col -= 1
            editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)
        else:
            row -= 1
            # insertion
            if row and not (matrix[row - 1] & (1 << (col - 1))):
                dist -= 1
                editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)
            # match
            else:
                col -= 1
    # remaining characters at the start of s1 are deletions
    while col != 0:
        dist -= 1
        col -= 1
        editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)
    # remaining characters at the start of s2 are insertions
    while row != 0:
        dist -= 1
        row -= 1
        editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)
    editops._editops = editop_list
    return editops
def opcodes(
    s1,
    s2,
    *,
    processor=None,
):
    """
    Return Opcodes describing how to turn s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.

    Returns
    -------
    opcodes : Opcodes
        edit operations required to turn s1 into s2

    Notes
    -----
    The alignment is calculated using an algorithm of Heikki Hyyrö, which is
    described in [7]_. It has a time complexity and memory usage of ``O([N/64] * M)``.

    References
    ----------
    .. [7] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).

    Examples
    --------
    >>> from rapidfuzz.distance import LCSseq
    >>> a = "qabxcd"
    >>> b = "abycdf"
    >>> for tag, i1, i2, j1, j2 in LCSseq.opcodes(a, b):
    ...    print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
    ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))
     delete a[0:1] (q) b[0:0] ()
      equal a[1:3] (ab) b[0:2] (ab)
     delete a[3:4] (x) b[2:2] ()
     insert a[4:4] () b[2:3] (y)
      equal a[4:6] (cd) b[3:5] (cd)
     insert a[6:6] () b[5:6] (f)
    """
    # derive the opcode view from the editops alignment
    ops = editops(s1, s2, processor=processor)
    return ops.as_opcodes()

View File

@@ -0,0 +1,116 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
"""Backend dispatcher for the Levenshtein metric.

Selects at import time between the SIMD-accelerated C++ extensions
(AVX2, then SSE2, then the generic C++ build) and the pure-Python
fallback, re-exporting the chosen implementation under the public names
listed in ``__all__``.
"""
from __future__ import annotations
import contextlib
import os
from rapidfuzz._feature_detector import AVX2, SSE2, supports
__all__ = [
    "distance",
    "editops",
    "normalized_distance",
    "normalized_similarity",
    "opcodes",
    "similarity",
]
# RAPIDFUZZ_IMPLEMENTATION forces a backend: "cpp" requires the C++
# extensions (no Python fallback), "python" forces the pure-Python module,
# anything else (including unset) tries C++ first and falls back to Python.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    # prefer the widest SIMD build the CPU supports
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
                levenshtein_distance as distance,
                levenshtein_editops as editops,
                levenshtein_normalized_distance as normalized_distance,
                levenshtein_normalized_similarity as normalized_similarity,
                levenshtein_opcodes as opcodes,
                levenshtein_similarity as similarity,
            )
            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
                levenshtein_distance as distance,
                levenshtein_editops as editops,
                levenshtein_normalized_distance as normalized_distance,
                levenshtein_normalized_similarity as normalized_similarity,
                levenshtein_opcodes as opcodes,
                levenshtein_similarity as similarity,
            )
            imported = True
    if not imported:
        # no suppress here: when "cpp" is forced a missing extension is an error
        from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
            levenshtein_distance as distance,
            levenshtein_editops as editops,
            levenshtein_normalized_distance as normalized_distance,
            levenshtein_normalized_similarity as normalized_similarity,
            levenshtein_opcodes as opcodes,
            levenshtein_similarity as similarity,
        )
elif _impl == "python":
    from rapidfuzz.distance.metrics_py import (
        levenshtein_distance as distance,
        levenshtein_editops as editops,
        levenshtein_normalized_distance as normalized_distance,
        levenshtein_normalized_similarity as normalized_similarity,
        levenshtein_opcodes as opcodes,
        levenshtein_similarity as similarity,
    )
else:
    # default: best available C++ build, pure Python as the final fallback
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
                levenshtein_distance as distance,
                levenshtein_editops as editops,
                levenshtein_normalized_distance as normalized_distance,
                levenshtein_normalized_similarity as normalized_similarity,
                levenshtein_opcodes as opcodes,
                levenshtein_similarity as similarity,
            )
            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
                levenshtein_distance as distance,
                levenshtein_editops as editops,
                levenshtein_normalized_distance as normalized_distance,
                levenshtein_normalized_similarity as normalized_similarity,
                levenshtein_opcodes as opcodes,
                levenshtein_similarity as similarity,
            )
            imported = True
    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
                levenshtein_distance as distance,
                levenshtein_editops as editops,
                levenshtein_normalized_distance as normalized_distance,
                levenshtein_normalized_similarity as normalized_similarity,
                levenshtein_opcodes as opcodes,
                levenshtein_similarity as similarity,
            )
            imported = True
    if not imported:
        from rapidfuzz.distance.metrics_py import (
            levenshtein_distance as distance,
            levenshtein_editops as editops,
            levenshtein_normalized_distance as normalized_distance,
            levenshtein_normalized_similarity as normalized_similarity,
            levenshtein_opcodes as opcodes,
            levenshtein_similarity as similarity,
        )

View File

@@ -0,0 +1,131 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
"""
The Levenshtein (edit) distance is a string metric to measure the
difference between two strings/sequences s1 and s2.
It's defined as the minimum number of insertions, deletions or
substitutions required to transform s1 into s2.

Each public function carries two overloads: one for pre-processed
sequences of hashables (``processor=None``) and one where a ``processor``
callable converts arbitrary inputs into such sequences first.
"""
from __future__ import annotations
from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload
from rapidfuzz.distance import Editops, Opcodes
# input types before preprocessing; the processor maps them to Sequence[Hashable]
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")
@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    weights: tuple[int, int, int] | None = (1, 1, 1),
    processor: None = None,
    score_cutoff: int | None = None,
    score_hint: int | None = None,
) -> int: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    weights: tuple[int, int, int] | None = (1, 1, 1),
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
    score_hint: int | None = None,
) -> int: ...
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    weights: tuple[int, int, int] | None = (1, 1, 1),
    processor: None = None,
    score_cutoff: float | None = 0,
    score_hint: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    weights: tuple[int, int, int] | None = (1, 1, 1),
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
    score_hint: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    weights: tuple[int, int, int] | None = (1, 1, 1),
    processor: None = None,
    score_cutoff: int | None = None,
    score_hint: int | None = None,
) -> int: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    weights: tuple[int, int, int] | None = (1, 1, 1),
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
    score_hint: int | None = None,
) -> int: ...
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    weights: tuple[int, int, int] | None = (1, 1, 1),
    processor: None = None,
    score_cutoff: float | None = 0,
    score_hint: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    weights: tuple[int, int, int] | None = (1, 1, 1),
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
    score_hint: float | None = 0,
) -> float: ...
# editops/opcodes return alignment information instead of a score, so they
# take no score_cutoff/weights; score_hint can still guide implementation choice
@overload
def editops(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_hint: int | None = None,
) -> Editops: ...
@overload
def editops(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_hint: int | None = None,
) -> Editops: ...
@overload
def opcodes(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_hint: int | None = None,
) -> Opcodes: ...
@overload
def opcodes(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_hint: int | None = None,
) -> Opcodes: ...

View File

@@ -0,0 +1,571 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from rapidfuzz._common_py import common_affix, conv_sequences
from rapidfuzz._utils import is_none, setupPandas
from rapidfuzz.distance import Indel_py as Indel
from rapidfuzz.distance._initialize_py import Editop, Editops
def _levenshtein_maximum(s1, s2, weights):
len1 = len(s1)
len2 = len(s2)
insert, delete, replace = weights
max_dist = len1 * delete + len2 * insert
if len1 >= len2:
max_dist = min(max_dist, len2 * replace + (len1 - len2) * delete)
else:
max_dist = min(max_dist, len1 * replace + (len2 - len1) * insert)
return max_dist
def _uniform_generic(s1, s2, weights):
len1 = len(s1)
insert, delete, replace = weights
cache = list(range(0, (len1 + 1) * delete, delete))
for ch2 in s2:
temp = cache[0]
cache[0] += insert
for i in range(len1):
x = temp
if s1[i] != ch2:
x = min(cache[i] + delete, cache[i + 1] + insert, temp + replace)
temp = cache[i + 1]
cache[i + 1] = x
return cache[-1]
def _uniform_distance(s1, s2):
if not s1:
return len(s2)
VP = (1 << len(s1)) - 1
VN = 0
currDist = len(s1)
mask = 1 << (len(s1) - 1)
block = {}
block_get = block.get
x = 1
for ch1 in s1:
block[ch1] = block_get(ch1, 0) | x
x <<= 1
for ch2 in s2:
# Step 1: Computing D0
PM_j = block_get(ch2, 0)
X = PM_j
D0 = (((X & VP) + VP) ^ VP) | X | VN
# Step 2: Computing HP and HN
HP = VN | ~(D0 | VP)
HN = D0 & VP
# Step 3: Computing the value D[m,j]
currDist += (HP & mask) != 0
currDist -= (HN & mask) != 0
# Step 4: Computing Vp and VN
HP = (HP << 1) | 1
HN = HN << 1
VP = HN | ~(D0 | HP)
VN = HP & D0
return currDist
def distance(
    s1,
    s2,
    *,
    weights=(1, 1, 1),
    processor=None,
    score_cutoff=None,
    score_hint=None,
):
    """
    Calculates the minimum number of insertions, deletions, and substitutions
    required to change one sequence into the other according to Levenshtein with custom
    costs for insertion, deletion and substitution.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    weights : tuple[int, int, int] or None, optional
        The weights for the three operations in the form
        (insertion, deletion, substitution). Default is (1, 1, 1),
        which gives all three operations a weight of 1.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Maximum distance between s1 and s2, that is
        considered as a result. If the distance is bigger than score_cutoff,
        score_cutoff + 1 is returned instead. Default is None, which deactivates
        this behaviour.
    score_hint : int, optional
        Expected distance between s1 and s2. This is used to select a
        faster implementation. Default is None, which deactivates this behaviour.

    Returns
    -------
    distance : int
        distance between s1 and s2

    Raises
    ------
    ValueError
        If unsupported weights are provided a ValueError is thrown

    Examples
    --------
    Find the Levenshtein distance between two strings:

    >>> from rapidfuzz.distance import Levenshtein
    >>> Levenshtein.distance("lewenstein", "levenshtein")
    2

    Setting a maximum distance allows the implementation to select
    a more efficient implementation:

    >>> Levenshtein.distance("lewenstein", "levenshtein", score_cutoff=1)
    2

    It is possible to select different weights by passing a `weight`
    tuple.

    >>> Levenshtein.distance("lewenstein", "levenshtein", weights=(1,1,2))
    3
    """
    # accepted for API compatibility with the C++ backend; unused here
    _ = score_hint
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    # dispatch to the fastest implementation for the given weights
    if weights is None or weights == (1, 1, 1):
        result = _uniform_distance(s1, s2)
    elif weights == (1, 1, 2):
        # substitution twice the indel cost == Indel distance
        result = Indel.distance(s1, s2)
    else:
        result = _uniform_generic(s1, s2, weights)
    if score_cutoff is not None and result > score_cutoff:
        return score_cutoff + 1
    return result
def similarity(
    s1,
    s2,
    *,
    weights=(1, 1, 1),
    processor=None,
    score_cutoff=None,
    score_hint=None,
):
    """
    Calculates the levenshtein similarity in the range [max, 0] using custom
    costs for insertion, deletion and substitution.

    This is calculated as ``max - distance``, where max is the maximal possible
    Levenshtein distance given the lengths of the sequences s1/s2 and the weights.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    weights : tuple[int, int, int] or None, optional
        The weights for the three operations in the form
        (insertion, deletion, substitution). Default is (1, 1, 1),
        which gives all three operations a weight of 1.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Maximum distance between s1 and s2, that is
        considered as a result. If the similarity is smaller than score_cutoff,
        0 is returned instead. Default is None, which deactivates
        this behaviour.
    score_hint : int, optional
        Expected similarity between s1 and s2. This is used to select a
        faster implementation. Default is None, which deactivates this behaviour.

    Returns
    -------
    similarity : int
        similarity between s1 and s2

    Raises
    ------
    ValueError
        If unsupported weights are provided a ValueError is thrown
    """
    # accepted for API compatibility with the C++ backend; unused here
    _ = score_hint
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    weights = weights or (1, 1, 1)
    upper_bound = _levenshtein_maximum(s1, s2, weights)
    result = upper_bound - distance(s1, s2, weights=weights)
    if score_cutoff is not None and result < score_cutoff:
        return 0
    return result
def normalized_distance(
    s1,
    s2,
    *,
    weights=(1, 1, 1),
    processor=None,
    score_cutoff=None,
    score_hint=None,
):
    """
    Calculates a normalized levenshtein distance in the range [1, 0] using custom
    costs for insertion, deletion and substitution.

    This is calculated as ``distance / max``, where max is the maximal possible
    Levenshtein distance given the lengths of the sequences s1/s2 and the weights.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    weights : tuple[int, int, int] or None, optional
        The weights for the three operations in the form
        (insertion, deletion, substitution). Default is (1, 1, 1),
        which gives all three operations a weight of 1.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_dist > score_cutoff 1.0 is returned instead. Default is None,
        which deactivates this behaviour.
    score_hint : float, optional
        Expected normalized distance between s1 and s2. This is used to select a
        faster implementation. Default is None, which deactivates this behaviour.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 1.0 and 0.0

    Raises
    ------
    ValueError
        If unsupported weights are provided a ValueError is thrown
    """
    # accepted for API compatibility with the C++ backend; unused here
    _ = score_hint
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 1.0
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    weights = weights or (1, 1, 1)
    upper_bound = _levenshtein_maximum(s1, s2, weights)
    raw_dist = distance(s1, s2, weights=weights)
    # two empty sequences are identical; guard the division by zero
    result = raw_dist / upper_bound if upper_bound else 0
    if score_cutoff is not None and result > score_cutoff:
        return 1
    return result
def normalized_similarity(
    s1,
    s2,
    *,
    weights=(1, 1, 1),
    processor=None,
    score_cutoff=None,
    score_hint=None,
):
    """
    Calculates a normalized levenshtein similarity in the range [0, 1] using custom
    costs for insertion, deletion and substitution.

    This is calculated as ``1 - normalized_distance``

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    weights : tuple[int, int, int] or None, optional
        The weights for the three operations in the form
        (insertion, deletion, substitution). Default is (1, 1, 1),
        which gives all three operations a weight of 1.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_sim < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.
    score_hint : int, optional
        Expected normalized similarity between s1 and s2. This is used to select a
        faster implementation. Default is None, which deactivates this behaviour.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0

    Raises
    ------
    ValueError
        If unsupported weights are provided a ValueError is thrown

    Examples
    --------
    Find the normalized Levenshtein similarity between two strings:

    >>> from rapidfuzz.distance import Levenshtein
    >>> Levenshtein.normalized_similarity("lewenstein", "levenshtein")
    0.81818181818181

    Setting a score_cutoff allows the implementation to select
    a more efficient implementation:

    >>> Levenshtein.normalized_similarity("lewenstein", "levenshtein", score_cutoff=0.85)
    0.0

    It is possible to select different weights by passing a `weight`
    tuple.

    >>> Levenshtein.normalized_similarity("lewenstein", "levenshtein", weights=(1,1,2))
    0.85714285714285

    When a different processor is used s1 and s2 do not have to be strings

    >>> Levenshtein.normalized_similarity(["lewenstein"], ["levenshtein"], processor=lambda s: s[0])
    0.81818181818181
    """
    # score_hint is only meaningful for the C++ backend; ignored here.
    _ = score_hint
    setupPandas()
    # None/NaN inputs are defined to have zero similarity.
    if is_none(s1) or is_none(s2):
        return 0.0
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    weights = weights or (1, 1, 1)
    norm_dist = normalized_distance(s1, s2, weights=weights)
    norm_sim = 1.0 - norm_dist
    # Return a float on every path (previously returned int 0 on the
    # cutoff branch, contradicting the documented float return type).
    return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0.0
def _matrix(s1, s2):
if not s1:
return (len(s2), [], [])
VP = (1 << len(s1)) - 1
VN = 0
currDist = len(s1)
mask = 1 << (len(s1) - 1)
block = {}
block_get = block.get
x = 1
for ch1 in s1:
block[ch1] = block_get(ch1, 0) | x
x <<= 1
matrix_VP = []
matrix_VN = []
for ch2 in s2:
# Step 1: Computing D0
PM_j = block_get(ch2, 0)
X = PM_j
D0 = (((X & VP) + VP) ^ VP) | X | VN
# Step 2: Computing HP and HN
HP = VN | ~(D0 | VP)
HN = D0 & VP
# Step 3: Computing the value D[m,j]
currDist += (HP & mask) != 0
currDist -= (HN & mask) != 0
# Step 4: Computing Vp and VN
HP = (HP << 1) | 1
HN = HN << 1
VP = HN | ~(D0 | HP)
VN = HP & D0
matrix_VP.append(VP)
matrix_VN.append(VN)
return (currDist, matrix_VP, matrix_VN)
def editops(
    s1,
    s2,
    *,
    processor=None,
    score_hint=None,
):
    """
    Return Editops describing how to turn s1 into s2.
    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_hint : int, optional
        Expected distance between s1 and s2. This is used to select a
        faster implementation. Default is None, which deactivates this behaviour.
    Returns
    -------
    editops : Editops
        edit operations required to turn s1 into s2
    Notes
    -----
    The alignment is calculated using an algorithm of Heikki Hyyrö, which is
    described [8]_. It has a time complexity and memory usage of ``O([N/64] * M)``.
    References
    ----------
    .. [8] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).
    Examples
    --------
    >>> from rapidfuzz.distance import Levenshtein
    >>> for tag, src_pos, dest_pos in Levenshtein.editops("qabxcd", "abycdf"):
    ...    print(("%7s s1[%d] s2[%d]" % (tag, src_pos, dest_pos)))
     delete s1[1] s2[0]
    replace s1[3] s2[2]
     insert s1[6] s2[5]
    """
    # score_hint only matters for the C++ backend; accepted for API parity.
    _ = score_hint
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    # Strip the common prefix/suffix first: they contain no edits, and the
    # trimmed offsets are added back when emitting the Editop positions.
    prefix_len, suffix_len = common_affix(s1, s2)
    s1 = s1[prefix_len : len(s1) - suffix_len]
    s2 = s2[prefix_len : len(s2) - suffix_len]
    # Run the bit-parallel pass, keeping the per-column VP/VN vectors so the
    # alignment can be traced back below.
    dist, VP, VN = _matrix(s1, s2)
    editops = Editops([], 0, 0)
    editops._src_len = len(s1) + prefix_len + suffix_len
    editops._dest_len = len(s2) + prefix_len + suffix_len
    if dist == 0:
        return editops
    # Backtrack from the bottom-right cell; ops are written back-to-front
    # into a pre-sized list so the result ends up in forward order.
    editop_list = [None] * dist
    col = len(s1)
    row = len(s2)
    while row != 0 and col != 0:
        # deletion
        if VP[row - 1] & (1 << (col - 1)):
            dist -= 1
            col -= 1
            editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)
        else:
            row -= 1
            # insertion
            if row and (VN[row - 1] & (1 << (col - 1))):
                dist -= 1
                editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)
            else:
                col -= 1
                # replace (Matches are not recorded)
                if s1[col] != s2[row]:
                    dist -= 1
                    editop_list[dist] = Editop("replace", col + prefix_len, row + prefix_len)
    # Anything left in s1 must be deleted, anything left in s2 inserted.
    while col != 0:
        dist -= 1
        col -= 1
        editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)
    while row != 0:
        dist -= 1
        row -= 1
        editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)
    editops._editops = editop_list
    return editops
def opcodes(
    s1,
    s2,
    *,
    processor=None,
    score_hint=None,
):
    """
    Return Opcodes describing how to turn s1 into s2.

    This is a thin convenience wrapper: the alignment is computed by
    :func:`editops` and converted into opcode form.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_hint : int, optional
        Expected distance between s1 and s2. This is used to select a
        faster implementation. Default is None, which deactivates this behaviour.

    Returns
    -------
    opcodes : Opcodes
        edit operations required to turn s1 into s2

    Notes
    -----
    The alignment is calculated using an algorithm of Heikki Hyyrö, which is
    described [9]_. It has a time complexity and memory usage of ``O([N/64] * M)``.

    References
    ----------
    .. [9] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).

    Examples
    --------
    >>> from rapidfuzz.distance import Levenshtein
    >>> a = "qabxcd"
    >>> b = "abycdf"
    >>> for tag, i1, i2, j1, j2 in Levenshtein.opcodes("qabxcd", "abycdf"):
    ...    print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
    ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))
     delete a[0:1] (q) b[0:0] ()
      equal a[1:3] (ab) b[0:2] (ab)
    replace a[3:4] (x) b[2:3] (y)
      equal a[4:6] (cd) b[3:5] (cd)
     insert a[6:6] () b[5:6] (f)
    """
    ops = editops(s1, s2, processor=processor, score_hint=score_hint)
    return ops.as_opcodes()

View File

@@ -0,0 +1,93 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations
import contextlib
import os
from rapidfuzz._feature_detector import AVX2, SSE2, supports
__all__ = ["distance", "normalized_distance", "normalized_similarity", "similarity"]
# Backend selection for the OSA metrics.  RAPIDFUZZ_IMPLEMENTATION can force a
# backend: "cpp" requires a C++ extension (a missing portable build raises
# ImportError), "python" forces the pure-Python implementation.  When unset,
# the fastest importable C++ build is used (AVX2, then SSE2, then portable),
# falling back to pure Python when no extension is available.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    if supports(AVX2):
        # Prefer the AVX2-optimized extension when the CPU supports it.
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
                osa_distance as distance,
                osa_normalized_distance as normalized_distance,
                osa_normalized_similarity as normalized_similarity,
                osa_similarity as similarity,
            )
            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
                osa_distance as distance,
                osa_normalized_distance as normalized_distance,
                osa_normalized_similarity as normalized_similarity,
                osa_similarity as similarity,
            )
            imported = True
    if not imported:
        # No suppression here: with _impl == "cpp" a missing portable C++
        # build is a hard error.
        from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
            osa_distance as distance,
            osa_normalized_distance as normalized_distance,
            osa_normalized_similarity as normalized_similarity,
            osa_similarity as similarity,
        )
elif _impl == "python":
    from rapidfuzz.distance.metrics_py import (
        osa_distance as distance,
        osa_normalized_distance as normalized_distance,
        osa_normalized_similarity as normalized_similarity,
        osa_similarity as similarity,
    )
else:
    # Auto-detection: try AVX2 -> SSE2 -> portable C++ -> pure Python.
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
                osa_distance as distance,
                osa_normalized_distance as normalized_distance,
                osa_normalized_similarity as normalized_similarity,
                osa_similarity as similarity,
            )
            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
                osa_distance as distance,
                osa_normalized_distance as normalized_distance,
                osa_normalized_similarity as normalized_similarity,
                osa_similarity as similarity,
            )
            imported = True
    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
                osa_distance as distance,
                osa_normalized_distance as normalized_distance,
                osa_normalized_similarity as normalized_similarity,
                osa_similarity as similarity,
            )
            imported = True
    if not imported:
        # Pure-Python fallback always succeeds.
        from rapidfuzz.distance.metrics_py import (
            osa_distance as distance,
            osa_normalized_distance as normalized_distance,
            osa_normalized_similarity as normalized_similarity,
            osa_similarity as similarity,
        )

View File

@@ -0,0 +1,75 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
# Typing overloads for this metric module.  Each public function is declared
# twice: once for inputs that are already sequences of hashables (processor
# may be omitted) and once for arbitrary inputs, where a mandatory
# `processor` callable converts them into such sequences.
from __future__ import annotations
from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload
# Placeholder types for raw (unprocessed) caller inputs.
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")
# distance: absolute edit distance (int result, int cutoff).
@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
# normalized_distance: distance scaled into [0, 1] (float result, float cutoff).
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
# similarity: absolute similarity (int result, int cutoff).
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
# normalized_similarity: similarity scaled into [0, 1] (float result, float cutoff).
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...

View File

@@ -0,0 +1,232 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from rapidfuzz._common_py import conv_sequences
from rapidfuzz._utils import is_none, setupPandas
def _osa_distance_hyrroe2003(s1, s2):
if not s1:
return len(s2)
VP = (1 << len(s1)) - 1
VN = 0
D0 = 0
PM_j_old = 0
currDist = len(s1)
mask = 1 << (len(s1) - 1)
block = {}
block_get = block.get
x = 1
for ch1 in s1:
block[ch1] = block_get(ch1, 0) | x
x <<= 1
for ch2 in s2:
# Step 1: Computing D0
PM_j = block_get(ch2, 0)
TR = (((~D0) & PM_j) << 1) & PM_j_old
D0 = (((PM_j & VP) + VP) ^ VP) | PM_j | VN
D0 = D0 | TR
# Step 2: Computing HP and HN
HP = VN | ~(D0 | VP)
HN = D0 & VP
# Step 3: Computing the value D[m,j]
currDist += (HP & mask) != 0
currDist -= (HN & mask) != 0
# Step 4: Computing Vp and VN
HP = (HP << 1) | 1
HN = HN << 1
VP = HN | ~(D0 | HP)
VN = HP & D0
PM_j_old = PM_j
return currDist
def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the optimal string alignment (OSA) distance.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Maximum distance between s1 and s2, that is
        considered as a result. If the distance is bigger than score_cutoff,
        score_cutoff + 1 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    distance : int
        distance between s1 and s2

    Examples
    --------
    Find the OSA distance between two strings:

    >>> from rapidfuzz.distance import OSA
    >>> OSA.distance("CA", "AC")
    1
    >>> OSA.distance("CA", "ABC")
    3
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    dist = _osa_distance_hyrroe2003(s1, s2)
    # Distances above the cutoff are clamped to score_cutoff + 1.
    return dist if (score_cutoff is None or dist <= score_cutoff) else score_cutoff + 1
def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the optimal string alignment (OSA) similarity in the range [max, 0].

    This is calculated as ``max(len1, len2) - distance``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Minimum similarity that is required. If the similarity is smaller
        than score_cutoff, 0 is returned instead. Default is None, which
        deactivates this behaviour.

    Returns
    -------
    similarity : int
        similarity between s1 and s2
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    longest = max(len(s1), len(s2))
    sim = longest - distance(s1, s2)
    if score_cutoff is not None and sim < score_cutoff:
        return 0
    return sim
def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized optimal string alignment (OSA) distance in the range [1, 0].

    This is calculated as ``distance / max(len1, len2)``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_dist > score_cutoff 1.0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # None/NaN inputs are defined to be maximally distant.
    if is_none(s1) or is_none(s2):
        return 1.0
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    maximum = max(len(s1), len(s2))
    dist = distance(s1, s2)
    # maximum == 0 only when both sequences are empty -> identical -> 0.0.
    norm_dist = dist / maximum if maximum else 0.0
    # Return a float on every path, as documented (previously int 0/1 on
    # fallback branches).
    return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1.0
def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized optimal string alignment (OSA) similarity in the range [0, 1].

    This is calculated as ``1 - normalized_distance``

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_sim < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # None/NaN inputs are defined to have zero similarity.
    if is_none(s1) or is_none(s2):
        return 0.0
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    # processor was already applied above, so it is not forwarded.
    norm_dist = normalized_distance(s1, s2)
    norm_sim = 1.0 - norm_dist
    # Return a float on every path, as documented (previously int 0 on the
    # cutoff branch).
    return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0.0

View File

@@ -0,0 +1,93 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations
import contextlib
import os
from rapidfuzz._feature_detector import AVX2, SSE2, supports
__all__ = ["distance", "normalized_distance", "normalized_similarity", "similarity"]
# Backend selection for the Postfix metrics.  RAPIDFUZZ_IMPLEMENTATION can
# force a backend: "cpp" requires a C++ extension (a missing portable build
# raises ImportError), "python" forces the pure-Python implementation.  When
# unset, the fastest importable C++ build is used (AVX2, then SSE2, then
# portable), falling back to pure Python when no extension is available.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    if supports(AVX2):
        # Prefer the AVX2-optimized extension when the CPU supports it.
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
                postfix_distance as distance,
                postfix_normalized_distance as normalized_distance,
                postfix_normalized_similarity as normalized_similarity,
                postfix_similarity as similarity,
            )
            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
                postfix_distance as distance,
                postfix_normalized_distance as normalized_distance,
                postfix_normalized_similarity as normalized_similarity,
                postfix_similarity as similarity,
            )
            imported = True
    if not imported:
        # No suppression here: with _impl == "cpp" a missing portable C++
        # build is a hard error.
        from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
            postfix_distance as distance,
            postfix_normalized_distance as normalized_distance,
            postfix_normalized_similarity as normalized_similarity,
            postfix_similarity as similarity,
        )
elif _impl == "python":
    from rapidfuzz.distance.metrics_py import (
        postfix_distance as distance,
        postfix_normalized_distance as normalized_distance,
        postfix_normalized_similarity as normalized_similarity,
        postfix_similarity as similarity,
    )
else:
    # Auto-detection: try AVX2 -> SSE2 -> portable C++ -> pure Python.
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
                postfix_distance as distance,
                postfix_normalized_distance as normalized_distance,
                postfix_normalized_similarity as normalized_similarity,
                postfix_similarity as similarity,
            )
            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
                postfix_distance as distance,
                postfix_normalized_distance as normalized_distance,
                postfix_normalized_similarity as normalized_similarity,
                postfix_similarity as similarity,
            )
            imported = True
    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
                postfix_distance as distance,
                postfix_normalized_distance as normalized_distance,
                postfix_normalized_similarity as normalized_similarity,
                postfix_similarity as similarity,
            )
            imported = True
    if not imported:
        # Pure-Python fallback always succeeds.
        from rapidfuzz.distance.metrics_py import (
            postfix_distance as distance,
            postfix_normalized_distance as normalized_distance,
            postfix_normalized_similarity as normalized_similarity,
            postfix_similarity as similarity,
        )

View File

@@ -0,0 +1,75 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
# Typing overloads for this metric module.  Each public function is declared
# twice: once for inputs that are already sequences of hashables (processor
# may be omitted) and once for arbitrary inputs, where a mandatory
# `processor` callable converts them into such sequences.
from __future__ import annotations
from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload
# Placeholder types for raw (unprocessed) caller inputs.
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")
# distance: absolute edit distance (int result, int cutoff).
@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
# normalized_distance: distance scaled into [0, 1] (float result, float cutoff).
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
# similarity: absolute similarity (int result, int cutoff).
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
# normalized_similarity: similarity scaled into [0, 1] (float result, float cutoff).
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...

View File

@@ -0,0 +1,182 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from rapidfuzz._common_py import conv_sequences
from rapidfuzz._utils import is_none, setupPandas
def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the postfix distance between two strings.

    This is ``max(len1, len2) - similarity``, i.e. the number of characters
    outside the common suffix of the two strings.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int or None, optional
        Maximum distance between s1 and s2, that is
        considered as a result. If the distance is bigger than score_cutoff,
        score_cutoff + 1 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    distance : int
        distance between s1 and s2
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    longest = max(len(s1), len(s2))
    dist = longest - similarity(s1, s2)
    if score_cutoff is not None and dist > score_cutoff:
        return score_cutoff + 1
    return dist
def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the postfix similarity between two strings.

    This is the length of the common suffix of the two strings, which equals
    ``max(len1, len2) - distance``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Minimum similarity that is required. If the similarity is smaller
        than score_cutoff, 0 is returned instead. Default is None, which
        deactivates this behaviour.

    Returns
    -------
    similarity : int
        similarity between s1 and s2
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    sim = 0
    # Walk both sequences from the end until the first mismatch.
    for ch1, ch2 in zip(reversed(s1), reversed(s2)):
        if ch1 != ch2:
            break
        sim += 1
    return sim if (score_cutoff is None or sim >= score_cutoff) else 0
def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized postfix distance in the range [1, 0].

    This is calculated as ``distance / max(len1, len2)``,
    i.e. ``1 - normalized_similarity``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_dist > score_cutoff 1.0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # None/NaN inputs are defined to be maximally distant.
    if is_none(s1) or is_none(s2):
        return 1.0
    norm_sim = normalized_similarity(s1, s2, processor=processor)
    norm_dist = 1.0 - norm_sim
    return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1.0
def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized postfix similarity in the range [0, 1].

    This is calculated as ``1 - normalized_distance``

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_sim < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # None/NaN inputs are defined to have zero similarity.
    if is_none(s1) or is_none(s2):
        return 0.0
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    maximum = max(len(s1), len(s2))
    sim = similarity(s1, s2)
    # Two empty sequences are identical -> similarity 1.0.
    norm_sim = sim / maximum if maximum else 1.0
    return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0.0

View File

@@ -0,0 +1,93 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations
import contextlib
import os
from rapidfuzz._feature_detector import AVX2, SSE2, supports
__all__ = ["distance", "normalized_distance", "normalized_similarity", "similarity"]
# Backend selection for the Prefix metrics.  RAPIDFUZZ_IMPLEMENTATION can
# force a backend: "cpp" requires a C++ extension (a missing portable build
# raises ImportError), "python" forces the pure-Python implementation.  When
# unset, the fastest importable C++ build is used (AVX2, then SSE2, then
# portable), falling back to pure Python when no extension is available.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    if supports(AVX2):
        # Prefer the AVX2-optimized extension when the CPU supports it.
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
                prefix_distance as distance,
                prefix_normalized_distance as normalized_distance,
                prefix_normalized_similarity as normalized_similarity,
                prefix_similarity as similarity,
            )
            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
                prefix_distance as distance,
                prefix_normalized_distance as normalized_distance,
                prefix_normalized_similarity as normalized_similarity,
                prefix_similarity as similarity,
            )
            imported = True
    if not imported:
        # No suppression here: with _impl == "cpp" a missing portable C++
        # build is a hard error.
        from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
            prefix_distance as distance,
            prefix_normalized_distance as normalized_distance,
            prefix_normalized_similarity as normalized_similarity,
            prefix_similarity as similarity,
        )
elif _impl == "python":
    from rapidfuzz.distance.metrics_py import (
        prefix_distance as distance,
        prefix_normalized_distance as normalized_distance,
        prefix_normalized_similarity as normalized_similarity,
        prefix_similarity as similarity,
    )
else:
    # Auto-detection: try AVX2 -> SSE2 -> portable C++ -> pure Python.
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
                prefix_distance as distance,
                prefix_normalized_distance as normalized_distance,
                prefix_normalized_similarity as normalized_similarity,
                prefix_similarity as similarity,
            )
            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
                prefix_distance as distance,
                prefix_normalized_distance as normalized_distance,
                prefix_normalized_similarity as normalized_similarity,
                prefix_similarity as similarity,
            )
            imported = True
    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
                prefix_distance as distance,
                prefix_normalized_distance as normalized_distance,
                prefix_normalized_similarity as normalized_similarity,
                prefix_similarity as similarity,
            )
            imported = True
    if not imported:
        # Pure-Python fallback always succeeds.
        from rapidfuzz.distance.metrics_py import (
            prefix_distance as distance,
            prefix_normalized_distance as normalized_distance,
            prefix_normalized_similarity as normalized_similarity,
            prefix_similarity as similarity,
        )

View File

@@ -0,0 +1,75 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
# Typing overloads for this metric module.  Each public function is declared
# twice: once for inputs that are already sequences of hashables (processor
# may be omitted) and once for arbitrary inputs, where a mandatory
# `processor` callable converts them into such sequences.
from __future__ import annotations
from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload
# Placeholder types for raw (unprocessed) caller inputs.
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")
# distance: absolute edit distance (int result, int cutoff).
@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
# normalized_distance: distance scaled into [0, 1] (float result, float cutoff).
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
# similarity: absolute similarity (int result, int cutoff).
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
# normalized_similarity: similarity scaled into [0, 1] (float result, float cutoff).
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...

View File

@@ -0,0 +1,182 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from rapidfuzz._common_py import conv_sequences
from rapidfuzz._utils import is_none, setupPandas
def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the Prefix distance between two strings.

    This is ``max(len1, len2) - similarity``, i.e. the number of characters
    outside the common prefix of the two strings.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int or None, optional
        Maximum distance between s1 and s2, that is
        considered as a result. If the distance is bigger than score_cutoff,
        score_cutoff + 1 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    distance : int
        distance between s1 and s2
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    longest = max(len(s1), len(s2))
    dist = longest - similarity(s1, s2)
    if score_cutoff is not None and dist > score_cutoff:
        return score_cutoff + 1
    return dist
def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the prefix similarity between two strings.
    This is the length of the common prefix, i.e.
    ``max(len1, len2) - distance``.
    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Minimum similarity between s1 and s2, that is
        considered as a result. If the similarity is smaller than score_cutoff,
        0 is returned instead. Default is None, which deactivates
        this behaviour.
    Returns
    -------
    similarity : int
        similarity between s1 and s2
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    # count matching leading elements until the first mismatch
    sim = 0
    for ch1, ch2 in zip(s1, s2):
        if ch1 != ch2:
            break
        sim += 1
    return sim if (score_cutoff is None or sim >= score_cutoff) else 0
def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized prefix distance in the range [0, 1.0],
    where 1.0 means maximally different.
    This is calculated as ``distance / max(len1, len2)``.
    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_dist > score_cutoff 1.0 is returned instead. Default is None,
        which deactivates this behaviour.
    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # None / NaN inputs are treated as maximally different
    if is_none(s1) or is_none(s2):
        return 1.0
    norm_sim = normalized_similarity(s1, s2, processor=processor)
    norm_dist = 1.0 - norm_sim
    return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1.0
def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized prefix similarity in the range [0, 1].
    This is calculated as ``1 - normalized_distance``
    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_sim < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.
    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # None / NaN inputs are treated as maximally different
    if is_none(s1) or is_none(s2):
        return 0.0
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    maximum = max(len(s1), len(s2))
    sim = similarity(s1, s2)
    # two empty strings are considered identical
    norm_sim = sim / maximum if maximum else 1.0
    return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0.0

View File

@@ -0,0 +1,37 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from . import (
OSA,
DamerauLevenshtein,
Hamming,
Indel,
Jaro,
JaroWinkler,
LCSseq,
Levenshtein,
Postfix,
Prefix,
)
from ._initialize import Editop, Editops, MatchingBlock, Opcode, Opcodes, ScoreAlignment
__all__ = [
"OSA",
"DamerauLevenshtein",
"Editop",
"Editops",
"Hamming",
"Indel",
"Jaro",
"JaroWinkler",
"LCSseq",
"Levenshtein",
"MatchingBlock",
"Opcode",
"Opcodes",
"Postfix",
"Prefix",
"ScoreAlignment",
]

View File

@@ -0,0 +1,25 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from . import (
OSA as OSA,
DamerauLevenshtein as DamerauLevenshtein,
Hamming as Hamming,
Indel as Indel,
Jaro as Jaro,
JaroWinkler as JaroWinkler,
LCSseq as LCSseq,
Levenshtein as Levenshtein,
Postfix as Postfix,
Prefix as Prefix,
)
from ._initialize import (
Editop as Editop,
Editops as Editops,
MatchingBlock as MatchingBlock,
Opcode as Opcode,
Opcodes as Opcodes,
ScoreAlignment as ScoreAlignment,
)

View File

@@ -0,0 +1,109 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations
import contextlib
import os
from rapidfuzz._feature_detector import AVX2, SSE2, supports
__all__ = ["Editop", "Editops", "MatchingBlock", "Opcode", "Opcodes", "ScoreAlignment"]
# Backend selection via the RAPIDFUZZ_IMPLEMENTATION environment variable:
#   "cpp"    -> force the C++ extension (final import is NOT suppressed, so a
#               missing extension raises ImportError)
#   "python" -> force the pure-Python implementation
#   unset    -> best available: AVX2 build, then SSE2, then generic C++,
#               falling back to pure Python
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    # prefer the most specialized SIMD build that this CPU supports
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance._initialize_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                Editop,
                Editops,
                MatchingBlock,
                Opcode,
                Opcodes,
                ScoreAlignment,
            )
            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance._initialize_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                Editop,
                Editops,
                MatchingBlock,
                Opcode,
                Opcodes,
                ScoreAlignment,
            )
            imported = True
    if not imported:
        # deliberately not suppressed: forcing "cpp" must fail loudly when the
        # extension is unavailable
        from rapidfuzz.distance._initialize_cpp import (  # pyright: ignore[reportMissingImports]
            Editop,
            Editops,
            MatchingBlock,
            Opcode,
            Opcodes,
            ScoreAlignment,
        )
elif _impl == "python":
    from rapidfuzz.distance._initialize_py import (
        Editop,
        Editops,
        MatchingBlock,
        Opcode,
        Opcodes,
        ScoreAlignment,
    )
else:
    # automatic selection: same ladder as "cpp", but every C++ import failure
    # is tolerated and the pure-Python implementation is the final fallback
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance._initialize_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                Editop,
                Editops,
                MatchingBlock,
                Opcode,
                Opcodes,
                ScoreAlignment,
            )
            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance._initialize_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                Editop,
                Editops,
                MatchingBlock,
                Opcode,
                Opcodes,
                ScoreAlignment,
            )
            imported = True
    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance._initialize_cpp import (  # pyright: ignore[reportMissingImports]
                Editop,
                Editops,
                MatchingBlock,
                Opcode,
                Opcodes,
                ScoreAlignment,
            )
            imported = True
    if not imported:
        from rapidfuzz.distance._initialize_py import (
            Editop,
            Editops,
            MatchingBlock,
            Opcode,
            Opcodes,
            ScoreAlignment,
        )

View File

@@ -0,0 +1,133 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from collections.abc import Iterator
_AnyOpList = list[Editop | tuple[str, int, int]] | list[Opcode | tuple[str, int, int, int, int]]
class MatchingBlock:
    """Type stub: tuple-like triple (a, b, size) describing a matching subsequence."""
    a: int
    b: int
    size: int
    def __init__(self, a: int, b: int, size: int): ...
    def __len__(self) -> int: ...
    def __eq__(self, other: object) -> bool: ...
    def __getitem__(self, i: int) -> int: ...
    def __iter__(self) -> Iterator[int]: ...
    def __repr__(self) -> str: ...
class Editop:
    """Type stub: tuple-like triple (tag, src_pos, dest_pos) describing one edit operation."""
    tag: str
    src_pos: int
    dest_pos: int
    def __init__(self, tag: str, src_pos: int, dest_pos: int): ...
    def __len__(self) -> int: ...
    def __eq__(self, other: object) -> bool: ...
    def __getitem__(self, i: int) -> int | str: ...
    def __iter__(self) -> Iterator[int | str]: ...
    def __repr__(self) -> str: ...
class Editops:
    """Type stub: list-like sequence of Editop objects turning s1 into s2."""
    _src_len: int
    _dest_len: int
    _editops: list[Editop]
    def __init__(
        self,
        editops: _AnyOpList | None = None,
        src_len: int = 0,
        dest_len: int = 0,
    ): ...
    @classmethod
    def from_opcodes(cls, opcodes: Opcodes) -> Editops: ...
    # was missing from the stub although the implementation provides it
    def as_opcodes(self) -> Opcodes: ...
    def as_matching_blocks(self) -> list[MatchingBlock]: ...
    def as_list(self) -> list[Editop]: ...
    def copy(self) -> Editops: ...
    def inverse(self) -> Editops: ...
    # the implementation returns a new Editops, not None
    def remove_subsequence(self, subsequence: Editops) -> Editops: ...
    def apply(self, source_string: str, destination_string: str) -> str: ...
    @property
    def src_len(self) -> int: ...
    @src_len.setter
    def src_len(self, value: int) -> None: ...
    @property
    def dest_len(self) -> int: ...
    @dest_len.setter
    def dest_len(self, value: int) -> None: ...
    def __eq__(self, other: object) -> bool: ...
    def __len__(self) -> int: ...
    def __delitem__(self, key: int | slice) -> None: ...
    def __getitem__(self, key: int | slice) -> Editops | Editop: ...
    def __iter__(self) -> Iterator[Editop]: ...
    def __repr__(self) -> str: ...
class Opcode:
    """Type stub: tuple-like 5-tuple (tag, src_start, src_end, dest_start, dest_end),
    compatible with difflib's SequenceMatcher opcodes."""
    tag: str
    src_start: int
    src_end: int
    dest_start: int
    dest_end: int
    def __init__(self, tag: str, src_start: int, src_end: int, dest_start: int, dest_end: int): ...
    def __len__(self) -> int: ...
    def __eq__(self, other: object) -> bool: ...
    def __getitem__(self, i: int) -> int | str: ...
    def __iter__(self) -> Iterator[int | str]: ...
class Opcodes:
    """Type stub: list-like sequence of Opcode objects turning s1 into s2."""
    _src_len: int
    _dest_len: int
    _opcodes: list[Opcode]
    def __init__(
        self,
        opcodes: _AnyOpList | None = None,
        src_len: int = 0,
        dest_len: int = 0,
    ): ...
    @classmethod
    def from_editops(cls, editops: Editops) -> Opcodes: ...
    def as_editops(self) -> Editops: ...
    def as_matching_blocks(self) -> list[MatchingBlock]: ...
    def as_list(self) -> list[Opcode]: ...
    def copy(self) -> Opcodes: ...
    def inverse(self) -> Opcodes: ...
    def apply(self, source_string: str, destination_string: str) -> str: ...
    @property
    def src_len(self) -> int: ...
    @src_len.setter
    def src_len(self, value: int) -> None: ...
    @property
    def dest_len(self) -> int: ...
    @dest_len.setter
    def dest_len(self, value: int) -> None: ...
    def __eq__(self, other: object) -> bool: ...
    def __len__(self) -> int: ...
    # unlike Editops, only integer indexing is supported (no slices)
    def __getitem__(self, key: int) -> Opcode: ...
    def __iter__(self) -> Iterator[Opcode]: ...
    def __repr__(self) -> str: ...
class ScoreAlignment:
    """Type stub: 5-tuple-like object locating the scored substrings:
    the score was computed between src[src_start:src_end] and
    dest[dest_start:dest_end]."""
    score: int | float
    src_start: int
    src_end: int
    dest_start: int
    dest_end: int
    def __init__(
        self,
        score: int | float,
        src_start: int,
        src_end: int,
        dest_start: int,
        dest_end: int,
    ): ...
    def __len__(self) -> int: ...
    def __eq__(self, other: object) -> bool: ...
    def __getitem__(self, i: int) -> int | float: ...
    def __iter__(self) -> Iterator[int | float]: ...
    def __repr__(self) -> str: ...

View File

@@ -0,0 +1,884 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
def _list_to_editops(
ops,
src_len,
dest_len,
):
if not ops:
return []
if len(ops[0]) == 5:
return Opcodes(ops, src_len, dest_len).as_editops()._editops
blocks = []
for op in ops:
edit_type, src_pos, dest_pos = op
if src_pos > src_len or dest_pos > dest_len:
msg = "List of edit operations invalid"
raise ValueError(msg)
if src_pos == src_len and edit_type != "insert":
msg = "List of edit operations invalid"
raise ValueError(msg)
if dest_pos == dest_len and edit_type != "delete":
msg = "List of edit operations invalid"
raise ValueError(msg)
# keep operations are not relevant in editops
if edit_type == "equal":
continue
blocks.append(Editop(edit_type, src_pos, dest_pos))
# validate order of editops
for i in range(len(blocks) - 1):
if blocks[i + 1].src_pos < blocks[i].src_pos or blocks[i + 1].dest_pos < blocks[i].dest_pos:
msg = "List of edit operations out of order"
raise ValueError(msg)
if blocks[i + 1].src_pos == blocks[i].src_pos and blocks[i + 1].dest_pos == blocks[i].dest_pos:
msg = "Duplicated edit operation"
raise ValueError(msg)
return blocks
def _list_to_opcodes(
    ops,
    src_len,
    dest_len,
):
    """Validate a list of (tag, src_start, src_end, dest_start, dest_end)
    tuples and convert it to ``Opcode`` objects. Editop-style 3-tuples are
    round-tripped through ``Editops``. Raises ``ValueError`` when the opcodes
    are inconsistent, do not span the complete strings, or are not continuous.
    """
    if not ops or len(ops[0]) == 3:
        return Editops(ops, src_len, dest_len).as_opcodes()._opcodes
    blocks = []
    for op in ops:
        edit_type, src_start, src_end, dest_start, dest_end = op
        # ranges must stay inside the strings and must not be reversed
        if src_end > src_len or dest_end > dest_len:
            msg = "List of edit operations invalid"
            raise ValueError(msg)
        if src_end < src_start or dest_end < dest_start:
            msg = "List of edit operations invalid"
            raise ValueError(msg)
        # 'equal'/'replace' need non-empty source/destination ranges of equal length
        if edit_type in {"equal", "replace"} and (src_end - src_start != dest_end - dest_start or src_start == src_end):
            msg = "List of edit operations invalid"
            raise ValueError(msg)
        # 'insert' has an empty source range, 'delete' an empty destination range
        if edit_type == "insert" and (src_start != src_end or dest_start == dest_end):
            msg = "List of edit operations invalid"
            raise ValueError(msg)
        if edit_type == "delete" and (src_start == src_end or dest_start != dest_end):
            msg = "List of edit operations invalid"
            raise ValueError(msg)
        # merge similar adjacent blocks
        if blocks and (
            blocks[-1].tag == edit_type and blocks[-1].src_end == src_start and blocks[-1].dest_end == dest_start
        ):
            blocks[-1].src_end = src_end
            blocks[-1].dest_end = dest_end
            continue
        blocks.append(Opcode(edit_type, src_start, src_end, dest_start, dest_end))
    # check if edit operations span the complete string
    if blocks[0].src_start != 0 or blocks[0].dest_start != 0:
        msg = "List of edit operations does not start at position 0"
        raise ValueError(msg)
    if blocks[-1].src_end != src_len or blocks[-1].dest_end != dest_len:
        msg = "List of edit operations does not end at the string ends"
        raise ValueError(msg)
    for i in range(len(blocks) - 1):
        if blocks[i + 1].src_start != blocks[i].src_end or blocks[i + 1].dest_start != blocks[i].dest_end:
            msg = "List of edit operations is not continuous"
            raise ValueError(msg)
    return blocks
class MatchingBlock:
    """Tuple-like triple (a, b, size): ``size`` elements match starting at
    position ``a`` in the first sequence and ``b`` in the second."""

    def __init__(self, a, b, size):
        self.a = a
        self.b = b
        self.size = size

    def __len__(self):
        # behaves like a 3-tuple
        return 3

    def __eq__(self, other):
        # equal to any 3-element sequence holding the same values
        try:
            return len(other) == 3 and bool(other[0] == self.a and other[1] == self.b and other[2] == self.size)
        except TypeError:
            return False

    def __getitem__(self, i):
        # tuple-style indexing, positive and negative
        index_map = {
            0: self.a, -3: self.a,
            1: self.b, -2: self.b,
            2: self.size, -1: self.size,
        }
        try:
            return index_map[i]
        except KeyError:
            msg = "MatchingBlock index out of range"
            raise IndexError(msg) from None

    def __iter__(self):
        yield self.a
        yield self.b
        yield self.size

    def __repr__(self):
        return f"MatchingBlock(a={self.a}, b={self.b}, size={self.size})"
class Editop:
    """Tuple-like triple (tag, src_pos, dest_pos) describing one edit operation.

    Tags:
      'replace' - src[src_pos] should be replaced by dest[dest_pos]
      'delete'  - src[src_pos] should be deleted
      'insert'  - dest[dest_pos] should be inserted at src[src_pos]
    """

    def __init__(self, tag, src_pos, dest_pos):
        self.tag = tag
        self.src_pos = src_pos
        self.dest_pos = dest_pos

    def __len__(self):
        # behaves like a 3-tuple
        return 3

    def __eq__(self, other):
        # equal to any 3-element sequence holding the same values
        try:
            return len(other) == 3 and bool(
                other[0] == self.tag and other[1] == self.src_pos and other[2] == self.dest_pos
            )
        except TypeError:
            return False

    def __getitem__(self, i):
        # tuple-style indexing, positive and negative
        fields = {
            0: self.tag, -3: self.tag,
            1: self.src_pos, -2: self.src_pos,
            2: self.dest_pos, -1: self.dest_pos,
        }
        try:
            return fields[i]
        except KeyError:
            msg = "Editop index out of range"
            raise IndexError(msg) from None

    def __iter__(self):
        yield self.tag
        yield self.src_pos
        yield self.dest_pos

    def __repr__(self):
        return f"Editop(tag={self.tag!r}, src_pos={self.src_pos}, dest_pos={self.dest_pos})"
class Editops:
    """
    List like object of Editops describing how to turn s1 into s2.
    """

    def __init__(
        self,
        editops=None,
        src_len=0,
        dest_len=0,
    ):
        # _list_to_editops validates the input (3- or 5-tuples) and converts
        # it into Editop objects, dropping 'equal' entries
        self._src_len = src_len
        self._dest_len = dest_len
        self._editops = _list_to_editops(editops, src_len, dest_len)

    @classmethod
    def from_opcodes(cls, opcodes):
        """
        Create Editops from Opcodes

        Parameters
        ----------
        opcodes : Opcodes
            opcodes to convert to editops

        Returns
        -------
        editops : Editops
            Opcodes converted to Editops
        """
        return opcodes.as_editops()

    def as_opcodes(self):
        """
        Convert to Opcodes

        Returns
        -------
        opcodes : Opcodes
            Editops converted to Opcodes
        """
        x = Opcodes.__new__(Opcodes)
        x._src_len = self._src_len
        x._dest_len = self._dest_len
        blocks = []
        src_pos = 0
        dest_pos = 0
        i = 0
        while i < len(self._editops):
            # emit an 'equal' opcode for the gap before the next editop
            if src_pos < self._editops[i].src_pos or dest_pos < self._editops[i].dest_pos:
                blocks.append(
                    Opcode(
                        "equal",
                        src_pos,
                        self._editops[i].src_pos,
                        dest_pos,
                        self._editops[i].dest_pos,
                    )
                )
                src_pos = self._editops[i].src_pos
                dest_pos = self._editops[i].dest_pos
            # fold a run of adjacent editops with the same tag into one opcode
            src_begin = src_pos
            dest_begin = dest_pos
            tag = self._editops[i].tag
            while (
                i < len(self._editops)
                and self._editops[i].tag == tag
                and src_pos == self._editops[i].src_pos
                and dest_pos == self._editops[i].dest_pos
            ):
                if tag == "replace":
                    src_pos += 1
                    dest_pos += 1
                elif tag == "insert":
                    dest_pos += 1
                elif tag == "delete":
                    src_pos += 1
                i += 1
            blocks.append(Opcode(tag, src_begin, src_pos, dest_begin, dest_pos))
        # trailing 'equal' opcode up to the string ends
        if src_pos < self.src_len or dest_pos < self.dest_len:
            blocks.append(Opcode("equal", src_pos, self.src_len, dest_pos, self.dest_len))
        x._opcodes = blocks
        return x

    def as_matching_blocks(self):
        """
        Convert to matching blocks

        Returns
        -------
        matching blocks : list[MatchingBlock]
            Editops converted to matching blocks
        """
        blocks = []
        src_pos = 0
        dest_pos = 0
        for op in self:
            # positions between the previous and the current editop match
            if src_pos < op.src_pos or dest_pos < op.dest_pos:
                length = min(op.src_pos - src_pos, op.dest_pos - dest_pos)
                if length > 0:
                    blocks.append(MatchingBlock(src_pos, dest_pos, length))
                src_pos = op.src_pos
                dest_pos = op.dest_pos
            if op.tag == "replace":
                src_pos += 1
                dest_pos += 1
            elif op.tag == "delete":
                src_pos += 1
            elif op.tag == "insert":
                dest_pos += 1
        # trailing match after the last editop
        if src_pos < self.src_len or dest_pos < self.dest_len:
            length = min(self.src_len - src_pos, self.dest_len - dest_pos)
            if length > 0:
                blocks.append(MatchingBlock(src_pos, dest_pos, length))
        # sentinel block, mirroring difflib's get_matching_blocks
        blocks.append(MatchingBlock(self.src_len, self.dest_len, 0))
        return blocks

    def as_list(self):
        """
        Convert Editops to a list of tuples.
        This is the equivalent of ``[x for x in editops]``
        """
        return [tuple(op) for op in self._editops]

    def copy(self):
        """
        performs copy of Editops
        """
        x = Editops.__new__(Editops)
        x._src_len = self._src_len
        x._dest_len = self._dest_len
        x._editops = self._editops[::]
        return x

    def inverse(self):
        """
        Invert Editops, so it describes how to transform the destination string to
        the source string.

        Returns
        -------
        editops : Editops
            inverted Editops

        Examples
        --------
        >>> from rapidfuzz.distance import Levenshtein
        >>> Levenshtein.editops('spam', 'park')
        [Editop(tag=delete, src_pos=0, dest_pos=0),
         Editop(tag=replace, src_pos=3, dest_pos=2),
         Editop(tag=insert, src_pos=4, dest_pos=3)]
        >>> Levenshtein.editops('spam', 'park').inverse()
        [Editop(tag=insert, src_pos=0, dest_pos=0),
         Editop(tag=replace, src_pos=2, dest_pos=3),
         Editop(tag=delete, src_pos=3, dest_pos=4)]
        """
        blocks = []
        for op in self:
            # swap insert/delete and swap the positions
            tag = op.tag
            if tag == "delete":
                tag = "insert"
            elif tag == "insert":
                tag = "delete"
            blocks.append(Editop(tag, op.dest_pos, op.src_pos))
        x = Editops.__new__(Editops)
        x._src_len = self.dest_len
        x._dest_len = self.src_len
        x._editops = blocks
        return x

    def remove_subsequence(self, subsequence):
        """
        remove a subsequence

        Parameters
        ----------
        subsequence : Editops
            subsequence to remove (has to be a subset of editops)

        Returns
        -------
        sequence : Editops
            a copy of the editops without the subsequence
        """
        result = Editops.__new__(Editops)
        result._src_len = self._src_len
        result._dest_len = self._dest_len
        if len(subsequence) > len(self):
            msg = "subsequence is not a subsequence"
            raise ValueError(msg)
        result._editops = [None] * (len(self) - len(subsequence))
        # offset to correct removed edit operation
        offset = 0
        op_pos = 0
        result_pos = 0
        for sop in subsequence:
            while op_pos != len(self) and sop != self._editops[op_pos]:
                # NOTE: the previous implementation assigned via
                # ``result[result_pos] = ...``, but Editops defines no
                # __setitem__, so this raised TypeError; it also shifted
                # src_pos in place, mutating ops still owned by `self`.
                # Build a fresh op of the same type instead.
                op = self._editops[op_pos]
                result._editops[result_pos] = type(op)(op.tag, op.src_pos + offset, op.dest_pos)
                result_pos += 1
                op_pos += 1
            # element of subsequence not part of the sequence
            if op_pos == len(self):
                msg = "subsequence is not a subsequence"
                raise ValueError(msg)
            if sop.tag == "insert":
                offset += 1
            elif sop.tag == "delete":
                offset -= 1
            op_pos += 1
        # add remaining elements
        while op_pos != len(self):
            op = self._editops[op_pos]
            result._editops[result_pos] = type(op)(op.tag, op.src_pos + offset, op.dest_pos)
            result_pos += 1
            op_pos += 1
        return result

    def apply(self, source_string, destination_string):
        """
        apply editops to source_string

        Parameters
        ----------
        source_string : str | bytes
            string to apply editops to
        destination_string : str | bytes
            string to use for replacements / insertions into source_string

        Returns
        -------
        mod_string : str
            modified source_string
        """
        res_str = ""
        src_pos = 0
        for op in self._editops:
            # matches between last and current editop
            while src_pos < op.src_pos:
                res_str += source_string[src_pos]
                src_pos += 1
            if op.tag == "replace":
                res_str += destination_string[op.dest_pos]
                src_pos += 1
            elif op.tag == "insert":
                res_str += destination_string[op.dest_pos]
            elif op.tag == "delete":
                src_pos += 1
        # matches after the last editop
        while src_pos < len(source_string):
            res_str += source_string[src_pos]
            src_pos += 1
        return res_str

    @property
    def src_len(self):
        return self._src_len

    @src_len.setter
    def src_len(self, value):
        self._src_len = value

    @property
    def dest_len(self):
        return self._dest_len

    @dest_len.setter
    def dest_len(self, value):
        self._dest_len = value

    def __eq__(self, other):
        if not isinstance(other, Editops):
            return False
        return self.dest_len == other.dest_len and self.src_len == other.src_len and self._editops == other._editops

    def __len__(self):
        return len(self._editops)

    def __delitem__(self, key):
        del self._editops[key]

    def __getitem__(self, key):
        # int -> single Editop; slice (non-negative step) -> new Editops
        if isinstance(key, int):
            return self._editops[key]
        start, stop, step = key.indices(len(self._editops))
        if step < 0:
            msg = "step sizes below 0 lead to an invalid order of editops"
            raise ValueError(msg)
        x = Editops.__new__(Editops)
        x._src_len = self._src_len
        x._dest_len = self._dest_len
        x._editops = self._editops[start:stop:step]
        return x

    def __iter__(self):
        yield from self._editops

    def __repr__(self):
        return (
            "Editops([" + ", ".join(repr(op) for op in self) + f"], src_len={self.src_len}, dest_len={self.dest_len})"
        )
class Opcode:
    """Tuple-like 5-tuple (tag, src_start, src_end, dest_start, dest_end)
    describing one ranged edit operation.

    Tags:
      'replace' - src[src_start:src_end] should be replaced by
                  dest[dest_start:dest_end]
      'delete'  - src[src_start:src_end] should be deleted
                  (dest_start == dest_end)
      'insert'  - dest[dest_start:dest_end] should be inserted at
                  src[src_start:src_start] (src_start == src_end)
      'equal'   - src[src_start:src_end] == dest[dest_start:dest_end]

    Note
    ----
    Opcode is compatible with the tuples returned by difflib's
    SequenceMatcher to make them interoperable.
    """

    def __init__(self, tag, src_start, src_end, dest_start, dest_end):
        self.tag = tag
        self.src_start = src_start
        self.src_end = src_end
        self.dest_start = dest_start
        self.dest_end = dest_end

    def __len__(self):
        # behaves like a 5-tuple
        return 5

    def __eq__(self, other):
        # equal to any 5-element sequence holding the same values
        try:
            return len(other) == 5 and bool(
                other[0] == self.tag
                and other[1] == self.src_start
                and other[2] == self.src_end
                and other[3] == self.dest_start
                and other[4] == self.dest_end
            )
        except TypeError:
            return False

    def __getitem__(self, i):
        # tuple-style indexing, positive and negative
        fields = {
            0: self.tag, -5: self.tag,
            1: self.src_start, -4: self.src_start,
            2: self.src_end, -3: self.src_end,
            3: self.dest_start, -2: self.dest_start,
            4: self.dest_end, -1: self.dest_end,
        }
        try:
            return fields[i]
        except KeyError:
            msg = "Opcode index out of range"
            raise IndexError(msg) from None

    def __iter__(self):
        yield self.tag
        yield self.src_start
        yield self.src_end
        yield self.dest_start
        yield self.dest_end

    def __repr__(self):
        return (
            f"Opcode(tag={self.tag!r}, src_start={self.src_start}, src_end={self.src_end}, "
            f"dest_start={self.dest_start}, dest_end={self.dest_end})"
        )
class Opcodes:
    """
    List like object of Opcodes describing how to turn s1 into s2.
    The first Opcode has src_start == dest_start == 0, and remaining tuples
    have src_start == the src_end from the tuple preceding it,
    and likewise for dest_start == the previous dest_end.
    """
    def __init__(
        self,
        opcodes=None,
        src_len=0,
        dest_len=0,
    ):
        # _list_to_opcodes validates the input (3- or 5-tuples), converts it
        # into Opcode objects and merges adjacent blocks with the same tag
        self._src_len = src_len
        self._dest_len = dest_len
        self._opcodes = _list_to_opcodes(opcodes, src_len, dest_len)
    @classmethod
    def from_editops(cls, editops):
        """
        Create Opcodes from Editops
        Parameters
        ----------
        editops : Editops
            editops to convert to opcodes
        Returns
        -------
        opcodes : Opcodes
            Editops converted to Opcodes
        """
        return editops.as_opcodes()
    def as_editops(self):
        """
        Convert Opcodes to Editops
        Returns
        -------
        editops : Editops
            Opcodes converted to Editops
        """
        x = Editops.__new__(Editops)
        x._src_len = self._src_len
        x._dest_len = self._dest_len
        blocks = []
        # expand each ranged opcode into one editop per affected position;
        # 'equal' ranges produce no editops
        for op in self:
            if op.tag == "replace":
                for j in range(op.src_end - op.src_start):
                    blocks.append(Editop("replace", op.src_start + j, op.dest_start + j))
            elif op.tag == "insert":
                for j in range(op.dest_end - op.dest_start):
                    blocks.append(Editop("insert", op.src_start, op.dest_start + j))
            elif op.tag == "delete":
                for j in range(op.src_end - op.src_start):
                    blocks.append(Editop("delete", op.src_start + j, op.dest_start))
        x._editops = blocks
        return x
    def as_matching_blocks(self):
        """
        Convert to matching blocks
        Returns
        -------
        matching blocks : list[MatchingBlock]
            Opcodes converted to matching blocks
        """
        blocks = []
        for op in self:
            # only 'equal' ranges contribute matching blocks
            if op.tag == "equal":
                length = min(op.src_end - op.src_start, op.dest_end - op.dest_start)
                if length > 0:
                    blocks.append(MatchingBlock(op.src_start, op.dest_start, length))
        # sentinel block, mirroring difflib's get_matching_blocks
        blocks.append(MatchingBlock(self.src_len, self.dest_len, 0))
        return blocks
    def as_list(self):
        """
        Convert Opcodes to a list of tuples, which is compatible
        with the opcodes of difflibs SequenceMatcher.
        This is the equivalent of ``[x for x in opcodes]``
        """
        return [tuple(op) for op in self._opcodes]
    def copy(self):
        """
        performs copy of Opcodes
        """
        x = Opcodes.__new__(Opcodes)
        x._src_len = self._src_len
        x._dest_len = self._dest_len
        x._opcodes = self._opcodes[::]
        return x
    def inverse(self):
        """
        Invert Opcodes, so it describes how to transform the destination string to
        the source string.
        Returns
        -------
        opcodes : Opcodes
            inverted Opcodes
        Examples
        --------
        >>> from rapidfuzz.distance import Levenshtein
        >>> Levenshtein.opcodes('spam', 'park')
        [Opcode(tag=delete, src_start=0, src_end=1, dest_start=0, dest_end=0),
        Opcode(tag=equal, src_start=1, src_end=3, dest_start=0, dest_end=2),
        Opcode(tag=replace, src_start=3, src_end=4, dest_start=2, dest_end=3),
        Opcode(tag=insert, src_start=4, src_end=4, dest_start=3, dest_end=4)]
        >>> Levenshtein.opcodes('spam', 'park').inverse()
        [Opcode(tag=insert, src_start=0, src_end=0, dest_start=0, dest_end=1),
        Opcode(tag=equal, src_start=0, src_end=2, dest_start=1, dest_end=3),
        Opcode(tag=replace, src_start=2, src_end=3, dest_start=3, dest_end=4),
        Opcode(tag=delete, src_start=3, src_end=4, dest_start=4, dest_end=4)]
        """
        blocks = []
        for op in self:
            # swap insert/delete and swap the source/destination ranges
            tag = op.tag
            if tag == "delete":
                tag = "insert"
            elif tag == "insert":
                tag = "delete"
            blocks.append(Opcode(tag, op.dest_start, op.dest_end, op.src_start, op.src_end))
        x = Opcodes.__new__(Opcodes)
        x._src_len = self.dest_len
        x._dest_len = self.src_len
        x._opcodes = blocks
        return x
    def apply(self, source_string, destination_string):
        """
        apply opcodes to source_string
        Parameters
        ----------
        source_string : str | bytes
            string to apply opcodes to
        destination_string : str | bytes
            string to use for replacements / insertions into source_string
        Returns
        -------
        mod_string : str
            modified source_string
        """
        res_str = ""
        for op in self._opcodes:
            # 'delete' ranges are simply not copied
            if op.tag == "equal":
                res_str += source_string[op.src_start : op.src_end]
            elif op.tag in {"replace", "insert"}:
                res_str += destination_string[op.dest_start : op.dest_end]
        return res_str
    @property
    def src_len(self):
        return self._src_len
    @src_len.setter
    def src_len(self, value):
        self._src_len = value
    @property
    def dest_len(self):
        return self._dest_len
    @dest_len.setter
    def dest_len(self, value):
        self._dest_len = value
    def __eq__(self, other):
        if not isinstance(other, Opcodes):
            return False
        return self.dest_len == other.dest_len and self.src_len == other.src_len and self._opcodes == other._opcodes
    def __len__(self):
        return len(self._opcodes)
    def __getitem__(self, key):
        # only integer indexing is supported (no slices, unlike Editops)
        if isinstance(key, int):
            return self._opcodes[key]
        msg = "Expected index"
        raise TypeError(msg)
    def __iter__(self):
        yield from self._opcodes
    def __repr__(self):
        return (
            "Opcodes([" + ", ".join(repr(op) for op in self) + f"], src_len={self.src_len}, dest_len={self.dest_len})"
        )
class ScoreAlignment:
    """
    Tuple like object describing the position of the compared strings in
    src and dest.
    It indicates that the score has been calculated between
    src[src_start:src_end] and dest[dest_start:dest_end]
    """

    def __init__(
        self,
        score,
        src_start,
        src_end,
        dest_start,
        dest_end,
    ):
        self.score = score
        self.src_start = src_start
        self.src_end = src_end
        self.dest_start = dest_start
        self.dest_end = dest_end

    def __len__(self):
        # behaves like a 5-tuple
        return 5

    def __eq__(self, other):
        # equal to any 5-element sequence holding the same values
        try:
            if len(other) != 5:
                return False
            return bool(
                other[0] == self.score
                and other[1] == self.src_start
                and other[2] == self.src_end
                and other[3] == self.dest_start
                and other[4] == self.dest_end
            )
        except TypeError:
            return False

    def __getitem__(self, i):
        # tuple-style indexing, positive and negative
        if i in {0, -5}:
            return self.score
        if i in {1, -4}:
            return self.src_start
        if i in {2, -3}:
            return self.src_end
        if i in {3, -2}:
            return self.dest_start
        if i in {4, -1}:
            return self.dest_end
        # fixed copy-paste: this previously reported "Opcode index out of range"
        msg = "ScoreAlignment index out of range"
        raise IndexError(msg)

    def __iter__(self):
        for i in range(5):
            yield self[i]

    def __repr__(self):
        return (
            f"ScoreAlignment(score={self.score}, src_start={self.src_start}, "
            f"src_end={self.src_end}, dest_start={self.dest_start}, dest_end={self.dest_end})"
        )

View File

@@ -0,0 +1,299 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from typing import Any, Callable
from rapidfuzz._utils import (
ScorerFlag,
add_scorer_attrs,
default_distance_attribute as dist_attr,
default_normalized_distance_attribute as norm_dist_attr,
default_normalized_similarity_attribute as norm_sim_attr,
default_similarity_attribute as sim_attr,
)
# DamerauLevenshtein
from rapidfuzz.distance.DamerauLevenshtein_py import (
distance as damerau_levenshtein_distance,
normalized_distance as damerau_levenshtein_normalized_distance,
normalized_similarity as damerau_levenshtein_normalized_similarity,
similarity as damerau_levenshtein_similarity,
)
# Hamming
from rapidfuzz.distance.Hamming_py import (
distance as hamming_distance,
editops as hamming_editops,
normalized_distance as hamming_normalized_distance,
normalized_similarity as hamming_normalized_similarity,
opcodes as hamming_opcodes,
similarity as hamming_similarity,
)
# Indel
from rapidfuzz.distance.Indel_py import (
distance as indel_distance,
editops as indel_editops,
normalized_distance as indel_normalized_distance,
normalized_similarity as indel_normalized_similarity,
opcodes as indel_opcodes,
similarity as indel_similarity,
)
# Jaro
from rapidfuzz.distance.Jaro_py import (
distance as jaro_distance,
normalized_distance as jaro_normalized_distance,
normalized_similarity as jaro_normalized_similarity,
similarity as jaro_similarity,
)
# JaroWinkler
from rapidfuzz.distance.JaroWinkler_py import (
distance as jaro_winkler_distance,
normalized_distance as jaro_winkler_normalized_distance,
normalized_similarity as jaro_winkler_normalized_similarity,
similarity as jaro_winkler_similarity,
)
# LCSseq
from rapidfuzz.distance.LCSseq_py import (
distance as lcs_seq_distance,
editops as lcs_seq_editops,
normalized_distance as lcs_seq_normalized_distance,
normalized_similarity as lcs_seq_normalized_similarity,
opcodes as lcs_seq_opcodes,
similarity as lcs_seq_similarity,
)
# Levenshtein
from rapidfuzz.distance.Levenshtein_py import (
distance as levenshtein_distance,
editops as levenshtein_editops,
normalized_distance as levenshtein_normalized_distance,
normalized_similarity as levenshtein_normalized_similarity,
opcodes as levenshtein_opcodes,
similarity as levenshtein_similarity,
)
# OSA
from rapidfuzz.distance.OSA_py import (
distance as osa_distance,
normalized_distance as osa_normalized_distance,
normalized_similarity as osa_normalized_similarity,
similarity as osa_similarity,
)
# Postfix
from rapidfuzz.distance.Postfix_py import (
distance as postfix_distance,
normalized_distance as postfix_normalized_distance,
normalized_similarity as postfix_normalized_similarity,
similarity as postfix_similarity,
)
# Prefix
from rapidfuzz.distance.Prefix_py import (
distance as prefix_distance,
normalized_distance as prefix_normalized_distance,
normalized_similarity as prefix_normalized_similarity,
similarity as prefix_similarity,
)
# Public names exported by this module; extended below as each metric's
# scorers get their capability attributes attached via add_scorer_attrs.
__all__ = []
# OSA (optimal string alignment) scorers.
add_scorer_attrs(osa_distance, dist_attr)
add_scorer_attrs(osa_similarity, sim_attr)
add_scorer_attrs(osa_normalized_distance, norm_dist_attr)
add_scorer_attrs(osa_normalized_similarity, norm_sim_attr)
__all__ += [
    "osa_distance",
    "osa_normalized_distance",
    "osa_normalized_similarity",
    "osa_similarity",
]
# Prefix scorers.
add_scorer_attrs(prefix_distance, dist_attr)
add_scorer_attrs(prefix_similarity, sim_attr)
add_scorer_attrs(prefix_normalized_distance, norm_dist_attr)
add_scorer_attrs(prefix_normalized_similarity, norm_sim_attr)
__all__ += [
    "prefix_distance",
    "prefix_normalized_distance",
    "prefix_normalized_similarity",
    "prefix_similarity",
]
# Postfix scorers.
add_scorer_attrs(postfix_distance, dist_attr)
add_scorer_attrs(postfix_similarity, sim_attr)
add_scorer_attrs(postfix_normalized_distance, norm_dist_attr)
add_scorer_attrs(postfix_normalized_similarity, norm_sim_attr)
__all__ += [
    "postfix_distance",
    "postfix_normalized_distance",
    "postfix_normalized_similarity",
    "postfix_similarity",
]
# Jaro: note that the plain distance/similarity reuse the norm_* attribute
# sets (Jaro scores appear to be inherently normalized — the registration
# deliberately differs from the metrics above).
add_scorer_attrs(jaro_distance, norm_dist_attr)
add_scorer_attrs(jaro_similarity, norm_sim_attr)
add_scorer_attrs(jaro_normalized_distance, norm_dist_attr)
add_scorer_attrs(jaro_normalized_similarity, norm_sim_attr)
__all__ += [
    "jaro_distance",
    "jaro_normalized_distance",
    "jaro_normalized_similarity",
    "jaro_similarity",
]
# JaroWinkler: same normalized registration as Jaro.
add_scorer_attrs(jaro_winkler_distance, norm_dist_attr)
add_scorer_attrs(jaro_winkler_similarity, norm_sim_attr)
add_scorer_attrs(jaro_winkler_normalized_distance, norm_dist_attr)
add_scorer_attrs(jaro_winkler_normalized_similarity, norm_sim_attr)
__all__ += [
    "jaro_winkler_distance",
    "jaro_winkler_normalized_distance",
    "jaro_winkler_normalized_similarity",
    "jaro_winkler_similarity",
]
# Damerau-Levenshtein scorers.
add_scorer_attrs(damerau_levenshtein_distance, dist_attr)
add_scorer_attrs(damerau_levenshtein_similarity, sim_attr)
add_scorer_attrs(damerau_levenshtein_normalized_distance, norm_dist_attr)
add_scorer_attrs(damerau_levenshtein_normalized_similarity, norm_sim_attr)
__all__ += [
    "damerau_levenshtein_distance",
    "damerau_levenshtein_normalized_distance",
    "damerau_levenshtein_normalized_similarity",
    "damerau_levenshtein_similarity",
]
def _get_scorer_flags_levenshtein_distance(weights: tuple[int, int, int] | None = (1, 1, 1)) -> dict[str, Any]:
    """Return the scorer flags for the weighted Levenshtein distance.

    The distance is a non-negative integer (best 0, worst the maximum
    size_t), and it is symmetric whenever insertion and deletion carry the
    same weight.
    """
    symmetric = weights is None or weights[0] == weights[1]
    flags = ScorerFlag.RESULT_SIZE_T
    if symmetric:
        flags |= ScorerFlag.SYMMETRIC
    return {"optimal_score": 0, "worst_score": 2**63 - 1, "flags": flags}
def _get_scorer_flags_levenshtein_similarity(weights: tuple[int, int, int] | None = (1, 1, 1)) -> dict[str, Any]:
    """Return the scorer flags for the weighted Levenshtein similarity.

    Mirror image of the distance flags: higher is better, worst score 0,
    symmetric when insertion and deletion weights match.
    """
    symmetric = weights is None or weights[0] == weights[1]
    flags = ScorerFlag.RESULT_SIZE_T
    if symmetric:
        flags |= ScorerFlag.SYMMETRIC
    return {"optimal_score": 2**63 - 1, "worst_score": 0, "flags": flags}
def _get_scorer_flags_levenshtein_normalized_distance(
    weights: tuple[int, int, int] | None = (1, 1, 1)
) -> dict[str, Any]:
    """Return the scorer flags for the normalized Levenshtein distance (float in [0, 1])."""
    symmetric = weights is None or weights[0] == weights[1]
    flags = ScorerFlag.RESULT_F64
    if symmetric:
        flags |= ScorerFlag.SYMMETRIC
    return {"optimal_score": 0, "worst_score": 1, "flags": flags}
def _get_scorer_flags_levenshtein_normalized_similarity(
    weights: tuple[int, int, int] | None = (1, 1, 1)
) -> dict[str, Any]:
    """Return the scorer flags for the normalized Levenshtein similarity (float in [0, 1])."""
    symmetric = weights is None or weights[0] == weights[1]
    flags = ScorerFlag.RESULT_F64
    if symmetric:
        flags |= ScorerFlag.SYMMETRIC
    return {"optimal_score": 1, "worst_score": 0, "flags": flags}
# Levenshtein scorers accept a `weights` keyword, so their scorer flags are
# computed per call by the helpers above instead of being static dicts.
levenshtein_dist_attr: dict[str, Callable[..., dict[str, Any]]] = {
    "get_scorer_flags": _get_scorer_flags_levenshtein_distance
}
levenshtein_sim_attr: dict[str, Callable[..., dict[str, Any]]] = {
    "get_scorer_flags": _get_scorer_flags_levenshtein_similarity
}
levenshtein_norm_dist_attr: dict[str, Callable[..., dict[str, Any]]] = {
    "get_scorer_flags": _get_scorer_flags_levenshtein_normalized_distance
}
levenshtein_norm_sim_attr: dict[str, Callable[..., dict[str, Any]]] = {
    "get_scorer_flags": _get_scorer_flags_levenshtein_normalized_similarity
}
add_scorer_attrs(levenshtein_distance, levenshtein_dist_attr)
add_scorer_attrs(levenshtein_similarity, levenshtein_sim_attr)
add_scorer_attrs(levenshtein_normalized_distance, levenshtein_norm_dist_attr)
add_scorer_attrs(levenshtein_normalized_similarity, levenshtein_norm_sim_attr)
__all__ += [
    "levenshtein_distance",
    "levenshtein_editops",
    "levenshtein_normalized_distance",
    "levenshtein_normalized_similarity",
    "levenshtein_opcodes",
    "levenshtein_similarity",
]
# LCSseq (longest common subsequence) scorers.
add_scorer_attrs(lcs_seq_distance, dist_attr)
add_scorer_attrs(lcs_seq_similarity, sim_attr)
add_scorer_attrs(lcs_seq_normalized_distance, norm_dist_attr)
add_scorer_attrs(lcs_seq_normalized_similarity, norm_sim_attr)
__all__ += [
    "lcs_seq_distance",
    "lcs_seq_editops",
    "lcs_seq_normalized_distance",
    "lcs_seq_normalized_similarity",
    "lcs_seq_opcodes",
    "lcs_seq_similarity",
]
# Indel (insertions/deletions only) scorers.
add_scorer_attrs(indel_distance, dist_attr)
add_scorer_attrs(indel_similarity, sim_attr)
add_scorer_attrs(indel_normalized_distance, norm_dist_attr)
add_scorer_attrs(indel_normalized_similarity, norm_sim_attr)
__all__ += [
    "indel_distance",
    "indel_editops",
    "indel_normalized_distance",
    "indel_normalized_similarity",
    "indel_opcodes",
    "indel_similarity",
]
# Hamming scorers.
add_scorer_attrs(hamming_distance, dist_attr)
add_scorer_attrs(hamming_similarity, sim_attr)
add_scorer_attrs(hamming_normalized_distance, norm_dist_attr)
add_scorer_attrs(hamming_normalized_similarity, norm_sim_attr)
__all__ += [
    "hamming_distance",
    "hamming_editops",
    "hamming_normalized_distance",
    "hamming_normalized_similarity",
    "hamming_opcodes",
    "hamming_similarity",
]

View File

@@ -0,0 +1,161 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations
import contextlib
import os
from rapidfuzz._feature_detector import AVX2, SSE2, supports
__all__ = [
    "QRatio",
    "WRatio",
    "partial_ratio",
    "partial_ratio_alignment",
    "partial_token_ratio",
    "partial_token_set_ratio",
    "partial_token_sort_ratio",
    "ratio",
    "token_ratio",
    "token_set_ratio",
    "token_sort_ratio",
]
# Backend selection. The RAPIDFUZZ_IMPLEMENTATION environment variable may
# force "cpp" (C++ extension) or "python" (pure Python). Otherwise the best
# available backend is chosen with the fallback chain
# AVX2 C++ -> SSE2 C++ -> generic C++ -> pure Python.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    # C++ backend explicitly requested: try the SIMD builds first. The final
    # import is intentionally NOT wrapped in suppress(ImportError), so a
    # missing C++ extension fails loudly instead of degrading silently.
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.fuzz_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                QRatio,
                WRatio,
                partial_ratio,
                partial_ratio_alignment,
                partial_token_ratio,
                partial_token_set_ratio,
                partial_token_sort_ratio,
                ratio,
                token_ratio,
                token_set_ratio,
                token_sort_ratio,
            )
            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.fuzz_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                QRatio,
                WRatio,
                partial_ratio,
                partial_ratio_alignment,
                partial_token_ratio,
                partial_token_set_ratio,
                partial_token_sort_ratio,
                ratio,
                token_ratio,
                token_set_ratio,
                token_sort_ratio,
            )
            imported = True
    if not imported:
        from rapidfuzz.fuzz_cpp import (  # pyright: ignore[reportMissingImports]
            QRatio,
            WRatio,
            partial_ratio,
            partial_ratio_alignment,
            partial_token_ratio,
            partial_token_set_ratio,
            partial_token_sort_ratio,
            ratio,
            token_ratio,
            token_set_ratio,
            token_sort_ratio,
        )
elif _impl == "python":
    # Pure-Python backend explicitly requested.
    from rapidfuzz.fuzz_py import (
        QRatio,
        WRatio,
        partial_ratio,
        partial_ratio_alignment,
        partial_token_ratio,
        partial_token_set_ratio,
        partial_token_sort_ratio,
        ratio,
        token_ratio,
        token_set_ratio,
        token_sort_ratio,
    )
else:
    # No explicit choice: pick the fastest available backend, falling back to
    # the pure-Python implementation when no C++ extension can be imported.
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.fuzz_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                QRatio,
                WRatio,
                partial_ratio,
                partial_ratio_alignment,
                partial_token_ratio,
                partial_token_set_ratio,
                partial_token_sort_ratio,
                ratio,
                token_ratio,
                token_set_ratio,
                token_sort_ratio,
            )
            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.fuzz_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                QRatio,
                WRatio,
                partial_ratio,
                partial_ratio_alignment,
                partial_token_ratio,
                partial_token_set_ratio,
                partial_token_sort_ratio,
                ratio,
                token_ratio,
                token_set_ratio,
                token_sort_ratio,
            )
            imported = True
    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.fuzz_cpp import (  # pyright: ignore[reportMissingImports]
                QRatio,
                WRatio,
                partial_ratio,
                partial_ratio_alignment,
                partial_token_ratio,
                partial_token_set_ratio,
                partial_token_sort_ratio,
                ratio,
                token_ratio,
                token_set_ratio,
                token_sort_ratio,
            )
            imported = True
    if not imported:
        from rapidfuzz.fuzz_py import (
            QRatio,
            WRatio,
            partial_ratio,
            partial_ratio_alignment,
            partial_token_ratio,
            partial_token_set_ratio,
            partial_token_sort_ratio,
            ratio,
            token_ratio,
            token_set_ratio,
            token_sort_ratio,
        )

View File

@@ -0,0 +1,189 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2021 Max Bachmann
from __future__ import annotations
from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload
from rapidfuzz.distance import ScoreAlignment
# Type variables for not-yet-preprocessed inputs. Every scorer below has two
# overloads: one taking plain hashable sequences (processor=None) and one
# taking arbitrary values together with a mandatory `processor` that converts
# them into sequences.
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")
# ratio: normalized Indel similarity as a float in [0, 100].
@overload
def ratio(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def ratio(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
# partial_ratio: best fuzz.ratio over alignments of the shorter string
# inside the longer one.
@overload
def partial_ratio(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def partial_ratio(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
# partial_ratio_alignment: like partial_ratio, but returns the ScoreAlignment
# describing the best alignment, or None when below score_cutoff.
@overload
def partial_ratio_alignment(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> ScoreAlignment | None: ...
@overload
def partial_ratio_alignment(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> ScoreAlignment | None: ...
# token_sort_ratio: fuzz.ratio after sorting the words of both strings.
@overload
def token_sort_ratio(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def token_sort_ratio(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
# token_set_ratio: compares unique/common word sets of the two strings.
@overload
def token_set_ratio(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def token_set_ratio(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
# token_ratio: maximum of token_set_ratio and token_sort_ratio.
@overload
def token_ratio(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def token_ratio(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
# partial_token_sort_ratio: partial_ratio after sorting the words.
@overload
def partial_token_sort_ratio(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def partial_token_sort_ratio(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
# partial_token_set_ratio: partial_ratio over the unique word sets.
@overload
def partial_token_set_ratio(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def partial_token_set_ratio(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
# partial_token_ratio: maximum of partial_token_set_ratio and
# partial_token_sort_ratio.
@overload
def partial_token_ratio(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def partial_token_ratio(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
# WRatio: weighted combination of the other ratio algorithms.
@overload
def WRatio(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def WRatio(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
# QRatio: quick ratio based on fuzz.ratio (returns 0 for empty strings).
@overload
def QRatio(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def QRatio(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...

View File

@@ -0,0 +1,877 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from math import ceil
from rapidfuzz._common_py import conv_sequences
from rapidfuzz._utils import ScorerFlag, add_scorer_attrs, is_none, setupPandas
from rapidfuzz.distance import ScoreAlignment
from rapidfuzz.distance.Indel_py import (
_block_normalized_similarity as indel_block_normalized_similarity,
distance as indel_distance,
normalized_similarity as indel_normalized_similarity,
)
def get_scorer_flags_fuzz(**_kwargs):
    """Return the scorer metadata shared by every fuzz ratio.

    All fuzz scorers yield symmetric float scores where 100 is a perfect
    match and 0 is the worst result; keyword arguments are ignored.
    """
    flags = ScorerFlag.RESULT_F64 | ScorerFlag.SYMMETRIC
    return {"optimal_score": 100, "worst_score": 0, "flags": flags}
# Scorer attribute set for the fuzz ratios; presumably attached to each
# scorer via add_scorer_attrs (imported above) — confirm against module tail.
fuzz_attribute = {"get_scorer_flags": get_scorer_flags_fuzz}
def _norm_distance(dist, lensum, score_cutoff):
score = (100 - 100 * dist / lensum) if lensum else 100
return score if score >= score_cutoff else 0
def _split_sequence(seq):
if isinstance(seq, (str, bytes)):
return seq.split()
splitted_seq = [[]]
for x in seq:
ch = x if isinstance(x, str) else chr(x)
if ch.isspace():
splitted_seq.append([])
else:
splitted_seq[-1].append(x)
return [tuple(x) for x in splitted_seq if x]
def _join_splitted_sequence(seq_list):
if not seq_list:
return ""
if isinstance(next(iter(seq_list)), str):
return " ".join(seq_list)
if isinstance(next(iter(seq_list)), bytes):
return b" ".join(seq_list)
joined = []
for seq in seq_list:
joined += seq
joined += [ord(" ")]
return joined[:-1]
def ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the normalized Indel similarity.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    See Also
    --------
    rapidfuzz.distance.Indel.normalized_similarity : Normalized Indel similarity

    Notes
    -----
    .. image:: img/ratio.svg

    Examples
    --------
    >>> fuzz.ratio("this is a test", "this is a test!")
    96.55171966552734
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0
    # indel_normalized_similarity works on a 0..1 scale, so the cutoff is
    # rescaled going in and the score coming back out.
    cutoff = score_cutoff / 100 if score_cutoff is not None else None
    return 100 * indel_normalized_similarity(s1, s2, processor=processor, score_cutoff=cutoff)
def _partial_ratio_impl(s1, s2, score_cutoff):
    """
    implementation of partial_ratio. This assumes len(s1) <= len(s2).

    ``score_cutoff`` is on the 0..1 scale here (callers divide by 100); the
    returned ScoreAlignment carries a score on the 0..100 scale.
    """
    s1_char_set = set(s1)
    len1 = len(s1)
    len2 = len(s2)
    # default alignment: s1 against the first len1 elements of s2, score 0
    res = ScoreAlignment(0, 0, len1, 0, len1)
    # bit-parallel pattern map: for each character of s1, a bitmask with one
    # bit set per position where that character occurs
    block = {}
    block_get = block.get
    x = 1
    for ch1 in s1:
        block[ch1] = block_get(ch1, 0) | x
        x <<= 1
    # phase 1: windows hanging off the left edge of s2 (prefixes s2[:i]);
    # only windows whose last character also occurs in s1 can improve the score
    for i in range(1, len1):
        substr_last = s2[i - 1]
        if substr_last not in s1_char_set:
            continue
        # todo cache map
        ls_ratio = indel_block_normalized_similarity(block, s1, s2[:i], score_cutoff=score_cutoff)
        if ls_ratio > res.score:
            # tighten the cutoff so later windows must beat the best so far
            res.score = score_cutoff = ls_ratio
            res.dest_start = 0
            res.dest_end = i
            if res.score == 1:
                # perfect match: rescale to 0..100 and exit early
                res.score = 100
                return res
    # phase 2: full-length windows s2[i:i+len1] fully inside s2
    for i in range(len2 - len1):
        substr_last = s2[i + len1 - 1]
        if substr_last not in s1_char_set:
            continue
        # todo cache map
        ls_ratio = indel_block_normalized_similarity(block, s1, s2[i : i + len1], score_cutoff=score_cutoff)
        if ls_ratio > res.score:
            res.score = score_cutoff = ls_ratio
            res.dest_start = i
            res.dest_end = i + len1
            if res.score == 1:
                res.score = 100
                return res
    # phase 3: windows hanging off the right edge of s2 (suffixes s2[i:])
    for i in range(len2 - len1, len2):
        substr_first = s2[i]
        if substr_first not in s1_char_set:
            continue
        # todo cache map
        ls_ratio = indel_block_normalized_similarity(block, s1, s2[i:], score_cutoff=score_cutoff)
        if ls_ratio > res.score:
            res.score = score_cutoff = ls_ratio
            res.dest_start = i
            res.dest_end = len2
            if res.score == 1:
                res.score = 100
                return res
    # rescale the best fractional score to 0..100
    res.score *= 100
    return res
def partial_ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Searches for the optimal alignment of the shorter string in the
    longer string and returns the fuzz.ratio for this alignment.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    Depending on the length of the needle (shorter string) different
    implementations are used to improve the performance.

    short needle (length ≤ 64):
        The fuzz.ratio is calculated for all alignments that could result in
        an optimal alignment, so the optimal alignment is guaranteed to be
        found. fuzz.ratio runs in ``O(N)`` for short needles, giving a worst
        case performance of ``O(NM)``.

    .. image:: img/partial_ratio_short_needle.svg

    long needle (length > 64):
        A FuzzyWuzzy-like implementation is used which only considers
        alignments starting at one of the longest common substrings. Worst
        case performance is ``O(N[N/64]M)``, but most alignments can usually
        be skipped. This is faster, but only finds one of the best alignments
        and not necessarily the optimal one.

    .. image:: img/partial_ratio_long_needle.svg

    Examples
    --------
    >>> fuzz.partial_ratio("this is a test", "this is a test!")
    100.0
    """
    # delegate to partial_ratio_alignment and keep only the score
    res = partial_ratio_alignment(s1, s2, processor=processor, score_cutoff=score_cutoff)
    return 0 if res is None else res.score
def partial_ratio_alignment(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Searches for the optimal alignment of the shorter string in the
    longer string and returns the fuzz.ratio and the corresponding
    alignment.

    Parameters
    ----------
    s1 : str | bytes
        First string to compare.
    s2 : str | bytes
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff None is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    alignment : ScoreAlignment, optional
        alignment between s1 and s2 with the score as a float between 0 and 100

    Examples
    --------
    >>> s1 = "a certain string"
    >>> s2 = "cetain"
    >>> res = fuzz.partial_ratio_alignment(s1, s2)
    >>> res
    ScoreAlignment(score=83.33333333333334, src_start=2, src_end=8, dest_start=0, dest_end=6)

    Using the alignment information it is possible to calculate the same fuzz.ratio

    >>> fuzz.ratio(s1[res.src_start:res.src_end], s2[res.dest_start:res.dest_end])
    83.33333333333334
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return None
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    if score_cutoff is None:
        score_cutoff = 0
    # two empty sequences are a perfect, empty alignment
    if not s1 and not s2:
        return ScoreAlignment(100.0, 0, 0, 0, 0)
    s1, s2 = conv_sequences(s1, s2)
    len1 = len(s1)
    len2 = len(s2)
    # _partial_ratio_impl requires the shorter sequence as its first argument
    if len1 <= len2:
        shorter = s1
        longer = s2
    else:
        shorter = s2
        longer = s1
    # _partial_ratio_impl takes the cutoff on a 0..1 scale
    res = _partial_ratio_impl(shorter, longer, score_cutoff / 100)
    # for equal-length inputs the alignment is not symmetric, so try the
    # reversed argument order as well unless the score is already perfect
    if res.score != 100 and len1 == len2:
        score_cutoff = max(score_cutoff, res.score)
        res2 = _partial_ratio_impl(longer, shorter, score_cutoff / 100)
        if res2.score > res.score:
            # swap src/dest spans back into s1/s2 order
            res = ScoreAlignment(res2.score, res2.dest_start, res2.dest_end, res2.src_start, res2.src_end)
    if res.score < score_cutoff:
        return None
    if len1 <= len2:
        return res
    # s1 was the longer input: swap spans so src refers to s1 again
    return ScoreAlignment(res.score, res.dest_start, res.dest_end, res.src_start, res.src_end)
def token_sort_ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Sorts the words in the strings and calculates the fuzz.ratio between them

    Parameters
    ----------
    s1 : str
        First string to compare.
    s2 : str
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    .. image:: img/token_sort_ratio.svg

    Examples
    --------
    >>> fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
    100.0
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    # word order becomes irrelevant once both token lists are sorted
    return ratio(
        _join_splitted_sequence(sorted(_split_sequence(s1))),
        _join_splitted_sequence(sorted(_split_sequence(s2))),
        score_cutoff=score_cutoff,
    )
def token_set_ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Compares the words in the strings based on unique and common words between them
    using fuzz.ratio

    Parameters
    ----------
    s1 : str
        First string to compare.
    s2 : str
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    .. image:: img/token_set_ratio.svg

    Examples
    --------
    >>> fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
    83.8709716796875
    >>> fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
    100.0
    # Returns 100.0 if one string is a subset of the other, regardless of extra content in the longer string
    >>> fuzz.token_set_ratio("fuzzy was a bear but not a dog", "fuzzy was a bear")
    100.0
    # Score is reduced only when there is explicit disagreement in the two strings
    >>> fuzz.token_set_ratio("fuzzy was a bear but not a dog", "fuzzy was a bear but not a cat")
    92.3076923076923
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    if score_cutoff is None:
        score_cutoff = 0
    s1, s2 = conv_sequences(s1, s2)
    tokens_a = set(_split_sequence(s1))
    tokens_b = set(_split_sequence(s2))
    # in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
    # see https://github.com/rapidfuzz/RapidFuzz/issues/110
    if not tokens_a or not tokens_b:
        return 0
    intersect = tokens_a.intersection(tokens_b)
    diff_ab = tokens_a.difference(tokens_b)
    diff_ba = tokens_b.difference(tokens_a)
    # one sentence is part of the other one
    if intersect and (not diff_ab or not diff_ba):
        return 100
    diff_ab_joined = _join_splitted_sequence(sorted(diff_ab))
    diff_ba_joined = _join_splitted_sequence(sorted(diff_ba))
    ab_len = len(diff_ab_joined)
    ba_len = len(diff_ba_joined)
    # todo is length sum without joining faster?
    sect_len = len(_join_splitted_sequence(intersect))
    # string length sect+ab <-> sect and sect+ba <-> sect
    # (sect_len != 0) accounts for the separating space after the section
    sect_ab_len = sect_len + (sect_len != 0) + ab_len
    sect_ba_len = sect_len + (sect_len != 0) + ba_len
    result = 0.0
    # maximum Indel distance that can still reach score_cutoff
    cutoff_distance = ceil((sect_ab_len + sect_ba_len) * (1 - score_cutoff / 100))
    dist = indel_distance(diff_ab_joined, diff_ba_joined, score_cutoff=cutoff_distance)
    if dist <= cutoff_distance:
        result = _norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff)
    # exit early since the other ratios are 0
    if not sect_len:
        return result
    # levenshtein distance sect+ab <-> sect and sect+ba <-> sect
    # since only sect is similar in them the distance can be calculated based on
    # the length difference
    sect_ab_dist = (sect_len != 0) + ab_len
    sect_ab_ratio = _norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff)
    sect_ba_dist = (sect_len != 0) + ba_len
    sect_ba_ratio = _norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff)
    return max(result, sect_ab_ratio, sect_ba_ratio)
def token_ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Helper method that returns the maximum of fuzz.token_set_ratio and
    fuzz.token_sort_ratio (faster than manually executing the two functions)

    Parameters
    ----------
    s1 : str
        First string to compare.
    s2 : str
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    .. image:: img/token_ratio.svg
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    # todo write combined implementation
    # inputs are already preprocessed above, so pass processor=None down
    set_score = token_set_ratio(s1, s2, processor=None, score_cutoff=score_cutoff)
    sort_score = token_sort_ratio(s1, s2, processor=None, score_cutoff=score_cutoff)
    return max(set_score, sort_score)
def partial_token_sort_ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    sorts the words in the strings and calculates the fuzz.partial_ratio between them

    Parameters
    ----------
    s1 : str
        First string to compare.
    s2 : str
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    .. image:: img/partial_token_sort_ratio.svg
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    # sort the words of both inputs, then align the shorter inside the longer
    return partial_ratio(
        _join_splitted_sequence(sorted(_split_sequence(s1))),
        _join_splitted_sequence(sorted(_split_sequence(s2))),
        score_cutoff=score_cutoff,
    )
def partial_token_set_ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Compares the words in the strings based on unique and common words between them
    using fuzz.partial_ratio

    Parameters
    ----------
    s1 : str
        First string to compare.
    s2 : str
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    .. image:: img/partial_token_set_ratio.svg
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    s1, s2 = conv_sequences(s1, s2)
    words1 = set(_split_sequence(s1))
    words2 = set(_split_sequence(s2))
    # in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
    # see https://github.com/rapidfuzz/RapidFuzz/issues/110
    if not words1 or not words2:
        return 0
    # a single shared word already yields a perfect partial alignment
    if words1.intersection(words2):
        return 100
    return partial_ratio(
        _join_splitted_sequence(sorted(words1.difference(words2))),
        _join_splitted_sequence(sorted(words2.difference(words1))),
        score_cutoff=score_cutoff,
    )
def partial_token_ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Helper method that returns the maximum of fuzz.partial_token_set_ratio and
    fuzz.partial_token_sort_ratio (faster than manually executing the two functions)

    Parameters
    ----------
    s1 : str
        First string to compare.
    s2 : str
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    .. image:: img/partial_token_ratio.svg
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    if score_cutoff is None:
        score_cutoff = 0
    s1, s2 = conv_sequences(s1, s2)
    tokens_split_a = _split_sequence(s1)
    tokens_split_b = _split_sequence(s2)
    tokens_a = set(tokens_split_a)
    tokens_b = set(tokens_split_b)
    # exit early when there is a common word in both sequences
    # (the set variant would be a perfect match in that case)
    if tokens_a.intersection(tokens_b):
        return 100
    diff_ab = tokens_a.difference(tokens_b)
    diff_ba = tokens_b.difference(tokens_a)
    # sort variant: partial_ratio over the sorted full token lists
    result = partial_ratio(
        _join_splitted_sequence(sorted(tokens_split_a)),
        _join_splitted_sequence(sorted(tokens_split_b)),
        score_cutoff=score_cutoff,
    )
    # do not calculate the same partial_ratio twice
    # (no duplicates and no common words means both variants see the same input)
    if len(tokens_split_a) == len(diff_ab) and len(tokens_split_b) == len(diff_ba):
        return result
    # tighten the cutoff so the set variant must beat the sort variant
    score_cutoff = max(score_cutoff, result)
    return max(
        result,
        partial_ratio(
            _join_splitted_sequence(sorted(diff_ab)),
            _join_splitted_sequence(sorted(diff_ba)),
            score_cutoff=score_cutoff,
        ),
    )
def WRatio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a weighted ratio based on the other ratio algorithms.

    Parameters
    ----------
    s1 : str
        First string to compare.
    s2 : str
        Second string to compare.
    processor : callable, optional
        Optional callable used to preprocess both strings before comparing
        them. Default is None (no preprocessing).
    score_cutoff : float, optional
        Score threshold between 0 and 100. For ratio < score_cutoff 0 is
        returned instead. Default is 0 (threshold disabled).

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    .. image:: img/WRatio.svg
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0

    # weight applied to the token based ratios
    UNBASE_SCALE = 0.95

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    # FuzzyWuzzy compatibility: empty inputs score 0
    # see https://github.com/rapidfuzz/RapidFuzz/issues/110
    if not s1 or not s2:
        return 0

    cutoff = 0 if score_cutoff is None else score_cutoff

    shorter, longer = sorted((len(s1), len(s2)))
    len_ratio = longer / shorter

    best = ratio(s1, s2, score_cutoff=cutoff)

    # similar lengths: only the full token based ratios are considered
    if len_ratio < 1.5:
        cutoff = max(cutoff, best) / UNBASE_SCALE
        return max(
            best,
            token_ratio(s1, s2, score_cutoff=cutoff, processor=None) * UNBASE_SCALE,
        )

    # very different lengths are penalized more heavily
    PARTIAL_SCALE = 0.9 if len_ratio <= 8.0 else 0.6

    cutoff = max(cutoff, best) / PARTIAL_SCALE
    best = max(best, partial_ratio(s1, s2, score_cutoff=cutoff) * PARTIAL_SCALE)

    cutoff = max(cutoff, best) / UNBASE_SCALE
    return max(
        best,
        partial_token_ratio(s1, s2, score_cutoff=cutoff, processor=None) * UNBASE_SCALE * PARTIAL_SCALE,
    )
def QRatio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a quick ratio between two strings using fuzz.ratio.

    Since v3.0 this behaves similar to fuzz.ratio with the exception that this
    returns 0 when comparing two empty strings.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable used to preprocess both strings before comparing
        them. Default is None (no preprocessing).
    score_cutoff : float, optional
        Score threshold between 0 and 100. For ratio < score_cutoff 0 is
        returned instead. Default is 0 (threshold disabled).

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Examples
    --------
    >>> fuzz.QRatio("this is a test", "this is a test!")
    96.55171966552734
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0

    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    # FuzzyWuzzy compatibility: empty inputs score 0
    # see https://github.com/rapidfuzz/RapidFuzz/issues/110
    if not s1 or not s2:
        return 0

    return ratio(s1, s2, score_cutoff=score_cutoff)
# Attach shared scorer metadata to every public fuzz scorer.
# NOTE(review): presumably this metadata is what process.* uses to detect
# worst/optimal scores for these scorers — confirm against add_scorer_attrs.
add_scorer_attrs(ratio, fuzz_attribute)
add_scorer_attrs(partial_ratio, fuzz_attribute)
add_scorer_attrs(token_sort_ratio, fuzz_attribute)
add_scorer_attrs(token_set_ratio, fuzz_attribute)
add_scorer_attrs(token_ratio, fuzz_attribute)
add_scorer_attrs(partial_token_sort_ratio, fuzz_attribute)
add_scorer_attrs(partial_token_set_ratio, fuzz_attribute)
add_scorer_attrs(partial_token_ratio, fuzz_attribute)
add_scorer_attrs(WRatio, fuzz_attribute)
add_scorer_attrs(QRatio, fuzz_attribute)

View File

@@ -0,0 +1,95 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations
import contextlib
import os
from rapidfuzz._feature_detector import AVX2, SSE2, supports
__all__ = ["cdist", "cpdist", "extract", "extractOne", "extract_iter"]

# Backend selection. RAPIDFUZZ_IMPLEMENTATION can force a backend:
#   "cpp"    -> require one of the C++ extension builds (raise if unavailable)
#   "python" -> always use the pure python implementation
# Anything else (the default) tries the C++ builds and silently falls back
# to the pure python implementation when none can be imported.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")

if _impl == "cpp":
    imported = False
    # prefer the AVX2 build, then SSE2, then the generic C++ build
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.process_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                cdist,
                cpdist,
                extract,
                extract_iter,
                extractOne,
            )

            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.process_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                cdist,
                cpdist,
                extract,
                extract_iter,
                extractOne,
            )

            imported = True
    if not imported:
        # no ImportError suppression here: a forced cpp backend must fail loudly
        from rapidfuzz.process_cpp import (  # pyright: ignore[reportMissingImports]
            cdist,
            cpdist,
            extract,
            extract_iter,
            extractOne,
        )
elif _impl == "python":
    from rapidfuzz.process_py import cdist, cpdist, extract, extract_iter, extractOne
else:
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.process_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                cdist,
                cpdist,
                extract,
                extract_iter,
                extractOne,
            )

            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.process_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                cdist,
                cpdist,
                extract,
                extract_iter,
                extractOne,
            )

            imported = True
    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.process_cpp import (  # pyright: ignore[reportMissingImports]
                cdist,
                cpdist,
                extract,
                extract_iter,
                extractOne,
            )

            imported = True
    if not imported:
        # last resort: the pure python implementation is always available
        from rapidfuzz.process_py import (
            cdist,
            cpdist,
            extract,
            extract_iter,
            extractOne,
        )

View File

@@ -0,0 +1,430 @@
from __future__ import annotations
from collections.abc import Collection, Generator, Hashable, Iterable, Mapping, Sequence
from typing import (
Any,
Callable,
Protocol,
TypeVar,
overload,
)
from rapidfuzz.fuzz import WRatio, ratio
# Type variables shared by all overloads in this stub file.
# _StringType*    : comparable sequences (already preprocessed)
# _UnprocessedType*: raw inputs that a `processor` callable turns into sequences
# _KeyType        : key type when `choices` is a mapping
# _ResultType     : scorer result, constrained to int (distances) or float (ratios)
_StringType = Sequence[Hashable]
_StringType1 = TypeVar("_StringType1", bound=Sequence[Hashable])
_StringType2 = TypeVar("_StringType2", bound=Sequence[Hashable])
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")
_KeyType = TypeVar("_KeyType")
_ResultType = TypeVar("_ResultType", int, float)
# contravariant/covariant variants are required for the _Scorer protocol below
_StringType1_contra = TypeVar("_StringType1_contra", contravariant=True, bound=Sequence[Hashable])
_StringType2_contra = TypeVar("_StringType2_contra", contravariant=True, bound=Sequence[Hashable])
_ResultType_contra = TypeVar("_ResultType_contra", int, float, contravariant=True)
_ResultType_co = TypeVar("_ResultType_co", int, float, covariant=True)


class _Scorer(Protocol[_StringType1_contra, _StringType2_contra, _ResultType_contra, _ResultType_co]):
    """Structural type of a scorer: ``scorer(s1, s2, *, score_cutoff=...)``."""

    def __call__(
        self, __s1: _StringType1_contra, __s2: _StringType2_contra, *, score_cutoff: _ResultType_contra | None
    ) -> _ResultType_co: ...
# mypy wants defaults to be valid for every possible parameterization of a generic function
# so add separate overloads for the default version
# The eight overloads below cover the cross product of:
#   default scorer (float results) vs custom scorer (generic result type)
#   no processor vs processor callable
#   mapping choices (key result) vs iterable choices (index result)
@overload
def extractOne(
    query: Sequence[Hashable] | None,
    choices: Mapping[_KeyType, _StringType2 | None],
    *,
    scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
    processor: None = None,
    score_cutoff: float | None = None,
    score_hint: float | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> tuple[_StringType2, float, _KeyType] | None: ...
@overload
def extractOne(
    query: Sequence[Hashable] | None,
    choices: Iterable[_StringType2 | None],
    *,
    scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
    processor: None = None,
    score_cutoff: float | None = None,
    score_hint: float | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> tuple[_StringType2, float, int] | None: ...
# default scorer + processor: raw inputs, processor maps them to sequences
@overload
def extractOne(
    query: _UnprocessedType1 | None,
    choices: Mapping[_KeyType, _UnprocessedType2 | None],
    *,
    scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = None,
    score_hint: float | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> tuple[_UnprocessedType2, float, _KeyType] | None: ...
@overload
def extractOne(
    query: _UnprocessedType1 | None,
    choices: Iterable[_UnprocessedType2 | None],
    *,
    scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = None,
    score_hint: float | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> tuple[_UnprocessedType2, float, int] | None: ...
# custom scorer, no processor: result type follows the scorer's result type
@overload
def extractOne(
    query: _StringType1 | None,
    choices: Mapping[_KeyType, _StringType2 | None],
    *,
    scorer: _Scorer[_StringType1, _StringType2, _ResultType, _ResultType],
    processor: None = None,
    score_cutoff: _ResultType | None = None,
    score_hint: _ResultType | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> tuple[_StringType2, _ResultType, _KeyType] | None: ...
@overload
def extractOne(
    query: _StringType1 | None,
    choices: Iterable[_StringType2 | None],
    *,
    scorer: _Scorer[_StringType1, _StringType2, _ResultType, _ResultType],
    processor: None = None,
    score_cutoff: _ResultType | None = None,
    score_hint: _ResultType | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> tuple[_StringType2, _ResultType, int] | None: ...
# custom scorer + processor
@overload
def extractOne(
    query: _UnprocessedType1 | None,
    choices: Mapping[_KeyType, _UnprocessedType2 | None],
    *,
    scorer: _Scorer[_StringType1, _StringType1, _ResultType, _ResultType],
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], _StringType1],
    score_cutoff: _ResultType | None = None,
    score_hint: _ResultType | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> tuple[_UnprocessedType2, _ResultType, _KeyType] | None: ...
@overload
def extractOne(
    query: _UnprocessedType1 | None,
    choices: Iterable[_UnprocessedType2 | None],
    *,
    scorer: _Scorer[_StringType1, _StringType1, _ResultType, _ResultType],
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], _StringType1],
    score_cutoff: _ResultType | None = None,
    score_hint: _ResultType | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> tuple[_UnprocessedType2, _ResultType, int] | None: ...
# mypy wants defaults to be valid for every possible parameterization of a generic function
# so add separate overloads for the default version
# Same overload matrix as extractOne, plus the `limit` parameter; returns a
# list of (choice, score, key/index) tuples instead of a single tuple.
@overload
def extract(
    query: Sequence[Hashable] | None,
    choices: Mapping[_KeyType, _StringType2 | None],
    *,
    scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
    processor: None = None,
    limit: int | None = 5,
    score_cutoff: float | None = None,
    score_hint: float | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> list[tuple[_StringType2, float, _KeyType]]: ...
@overload
def extract(
    query: Sequence[Hashable] | None,
    choices: Iterable[_StringType2 | None],
    *,
    scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
    processor: None = None,
    limit: int | None = 5,
    score_cutoff: float | None = None,
    score_hint: float | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> list[tuple[_StringType2, float, int]]: ...
# default scorer + processor
@overload
def extract(
    query: _UnprocessedType1 | None,
    choices: Mapping[_KeyType, _UnprocessedType2 | None],
    *,
    scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    limit: int | None = 5,
    score_cutoff: float | None = None,
    score_hint: float | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> list[tuple[_UnprocessedType2, float, _KeyType]]: ...
@overload
def extract(
    query: _UnprocessedType1 | None,
    choices: Iterable[_UnprocessedType2 | None],
    *,
    scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    limit: int | None = 5,
    score_cutoff: float | None = None,
    score_hint: float | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> list[tuple[_UnprocessedType2, float, int]]: ...
# custom scorer, no processor
@overload
def extract(
    query: _StringType1 | None,
    choices: Mapping[_KeyType, _StringType2 | None],
    *,
    scorer: _Scorer[_StringType1, _StringType2, _ResultType, _ResultType],
    processor: None = None,
    limit: int | None = 5,
    score_cutoff: _ResultType | None = None,
    score_hint: _ResultType | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> list[tuple[_StringType2, _ResultType, _KeyType]]: ...
@overload
def extract(
    query: _StringType1 | None,
    choices: Collection[_StringType2 | None],
    *,
    scorer: _Scorer[_StringType1, _StringType2, _ResultType, _ResultType],
    processor: None = None,
    limit: int | None = 5,
    score_cutoff: _ResultType | None = None,
    score_hint: _ResultType | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> list[tuple[_StringType2, _ResultType, int]]: ...
# custom scorer + processor
@overload
def extract(
    query: _UnprocessedType1 | None,
    choices: Mapping[_KeyType, _UnprocessedType2 | None],
    *,
    scorer: _Scorer[_StringType1, _StringType1, _ResultType, _ResultType],
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], _StringType1],
    limit: int | None = 5,
    score_cutoff: _ResultType | None = None,
    score_hint: _ResultType | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> list[tuple[_UnprocessedType2, _ResultType, _KeyType]]: ...
@overload
def extract(
    query: _UnprocessedType1 | None,
    choices: Collection[_UnprocessedType2 | None],
    *,
    scorer: _Scorer[_StringType1, _StringType1, _ResultType, _ResultType],
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], _StringType1],
    limit: int | None = 5,
    score_cutoff: _ResultType | None = None,
    score_hint: _ResultType | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> list[tuple[_UnprocessedType2, _ResultType, int]]: ...
# mypy wants defaults to be valid for every possible parameterization of a generic function
# so add separate overloads for the default version
# Same overload matrix as extractOne, but yielding matches lazily as a Generator.
@overload
def extract_iter(
    query: Sequence[Hashable] | None,
    choices: Mapping[_KeyType, _StringType2 | None],
    *,
    scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
    processor: None = None,
    score_cutoff: float | None = None,
    score_hint: float | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> Generator[tuple[_StringType2, float, _KeyType], None, None]: ...
@overload
def extract_iter(
    query: Sequence[Hashable] | None,
    choices: Iterable[_StringType2 | None],
    *,
    scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
    processor: None = None,
    score_cutoff: float | None = None,
    score_hint: float | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> Generator[tuple[_StringType2, float, int], None, None]: ...
# default scorer + processor
@overload
def extract_iter(
    query: _UnprocessedType1 | None,
    choices: Mapping[_KeyType, _UnprocessedType2 | None],
    *,
    scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = None,
    score_hint: float | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> Generator[tuple[_UnprocessedType2, float, _KeyType], None, None]: ...
@overload
def extract_iter(
    query: _UnprocessedType1 | None,
    choices: Iterable[_UnprocessedType2 | None],
    *,
    scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = None,
    score_hint: float | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> Generator[tuple[_UnprocessedType2, float, int], None, None]: ...
# custom scorer, no processor
@overload
def extract_iter(
    query: _StringType1 | None,
    choices: Mapping[_KeyType, _StringType2 | None],
    *,
    scorer: _Scorer[_StringType1, _StringType2, _ResultType, _ResultType],
    processor: None = None,
    score_cutoff: _ResultType | None = None,
    score_hint: _ResultType | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> Generator[tuple[_StringType2, _ResultType, _KeyType], None, None]: ...
@overload
def extract_iter(
    query: _StringType1 | None,
    choices: Iterable[_StringType2 | None],
    *,
    scorer: _Scorer[_StringType1, _StringType2, _ResultType, _ResultType],
    processor: None = None,
    score_cutoff: _ResultType | None = None,
    score_hint: _ResultType | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> Generator[tuple[_StringType2, _ResultType, int], None, None]: ...
# custom scorer + processor
@overload
def extract_iter(
    query: _UnprocessedType1 | None,
    choices: Mapping[_KeyType, _UnprocessedType2 | None],
    *,
    scorer: _Scorer[_StringType1, _StringType1, _ResultType, _ResultType],
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], _StringType1],
    score_cutoff: _ResultType | None = None,
    score_hint: _ResultType | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> Generator[tuple[_UnprocessedType2, _ResultType, _KeyType], None, None]: ...
@overload
def extract_iter(
    query: _UnprocessedType1 | None,
    choices: Iterable[_UnprocessedType2 | None],
    *,
    scorer: _Scorer[_StringType1, _StringType1, _ResultType, _ResultType],
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], _StringType1],
    score_cutoff: _ResultType | None = None,
    score_hint: _ResultType | None = None,
    scorer_kwargs: dict[str, Any] | None = None,
) -> Generator[tuple[_UnprocessedType2, _ResultType, int], None, None]: ...
# cdist/cpdist return numpy arrays, so their stubs only exist when
# numpy (numpy.typing) is importable.
try:
    import numpy.typing as npt

    # cdist: full pairwise score matrix between all queries and all choices.
    @overload
    def cdist(
        queries: Iterable[Sequence[Hashable] | None],
        choices: Iterable[Sequence[Hashable] | None],
        *,
        scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = ratio,
        processor: None = None,
        score_cutoff: float | None = None,
        score_hint: float | None = None,
        score_multiplier: float = 1,
        dtype: npt.DTypeLike | None = None,
        workers: int = 1,
        scorer_kwargs: dict[str, Any] | None = None,
    ) -> npt.NDArray[Any]: ...
    @overload
    def cdist(
        queries: Iterable[_UnprocessedType1 | None],
        choices: Iterable[_UnprocessedType2 | None],
        *,
        scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = ratio,
        processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
        score_cutoff: float | None = None,
        score_hint: float | None = None,
        score_multiplier: float = 1,
        dtype: npt.DTypeLike | None = None,
        workers: int = 1,
        scorer_kwargs: dict[str, Any] | None = None,
    ) -> npt.NDArray[Any]: ...
    @overload
    def cdist(
        queries: Iterable[_StringType1 | None],
        choices: Iterable[_StringType2 | None],
        *,
        scorer: _Scorer[_StringType1, _StringType2, _ResultType, _ResultType],
        processor: None = None,
        score_cutoff: _ResultType | None = None,
        score_hint: _ResultType | None = None,
        score_multiplier: _ResultType = 1,
        dtype: npt.DTypeLike | None = None,
        workers: int = 1,
        scorer_kwargs: dict[str, Any] | None = None,
    ) -> npt.NDArray[Any]: ...
    @overload
    def cdist(
        queries: Iterable[_UnprocessedType1 | None],
        choices: Iterable[_UnprocessedType2 | None],
        *,
        scorer: _Scorer[_StringType1, _StringType1, _ResultType, _ResultType],
        processor: Callable[[_UnprocessedType1 | _UnprocessedType2], _StringType1],
        score_cutoff: _ResultType | None = None,
        score_hint: _ResultType | None = None,
        score_multiplier: _ResultType = 1,
        dtype: npt.DTypeLike | None = None,
        workers: int = 1,
        scorer_kwargs: dict[str, Any] | None = None,
    ) -> npt.NDArray[Any]: ...

    # cpdist: element-wise scores for already paired queries/choices.
    @overload
    def cpdist(
        queries: Iterable[Sequence[Hashable] | None],
        choices: Iterable[Sequence[Hashable] | None],
        *,
        scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = ratio,
        processor: None = None,
        score_cutoff: float | None = None,
        score_hint: float | None = None,
        score_multiplier: float = 1,
        dtype: npt.DTypeLike | None = None,
        workers: int = 1,
        scorer_kwargs: dict[str, Any] | None = None,
    ) -> npt.NDArray[Any]: ...
    @overload
    def cpdist(
        queries: Iterable[_UnprocessedType1 | None],
        choices: Iterable[_UnprocessedType2 | None],
        *,
        scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = ratio,
        processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
        score_cutoff: float | None = None,
        score_hint: float | None = None,
        score_multiplier: float = 1,
        dtype: npt.DTypeLike | None = None,
        workers: int = 1,
        scorer_kwargs: dict[str, Any] | None = None,
    ) -> npt.NDArray[Any]: ...
    @overload
    def cpdist(
        queries: Iterable[_StringType1 | None],
        choices: Iterable[_StringType2 | None],
        *,
        scorer: _Scorer[_StringType1, _StringType2, _ResultType, _ResultType],
        processor: None = None,
        score_cutoff: _ResultType | None = None,
        score_hint: _ResultType | None = None,
        score_multiplier: _ResultType = 1,
        dtype: npt.DTypeLike | None = None,
        workers: int = 1,
        scorer_kwargs: dict[str, Any] | None = None,
    ) -> npt.NDArray[Any]: ...
    @overload
    def cpdist(
        queries: Iterable[_UnprocessedType1 | None],
        choices: Iterable[_UnprocessedType2 | None],
        *,
        scorer: _Scorer[_StringType1, _StringType1, _ResultType, _ResultType],
        processor: Callable[[_UnprocessedType1 | _UnprocessedType2], _StringType1],
        score_cutoff: _ResultType | None = None,
        score_hint: _ResultType | None = None,
        score_multiplier: _ResultType = 1,
        dtype: npt.DTypeLike | None = None,
        workers: int = 1,
        scorer_kwargs: dict[str, Any] | None = None,
    ) -> npt.NDArray[Any]: ...
except ImportError:
    pass

View File

@@ -0,0 +1,125 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
from rapidfuzz.fuzz import ratio
from rapidfuzz.process_cpp_impl import (
FLOAT32 as _FLOAT32,
FLOAT64 as _FLOAT64,
INT8 as _INT8,
INT16 as _INT16,
INT32 as _INT32,
INT64 as _INT64,
UINT8 as _UINT8,
UINT16 as _UINT16,
UINT32 as _UINT32,
UINT64 as _UINT64,
cdist as _cdist,
cpdist as _cpdist,
extract,
extract_iter,
extractOne,
)
__all__ = ["cdist", "cpdist", "extract", "extractOne", "extract_iter"]
def _dtype_to_type_num(dtype):
import numpy as np
if dtype is None:
return None
dtype = np.dtype(dtype)
if dtype == np.int32:
return _INT32
if dtype == np.int8:
return _INT8
if dtype == np.int16:
return _INT16
if dtype == np.int64:
return _INT64
if dtype == np.uint8:
return _UINT8
if dtype == np.uint16:
return _UINT16
if dtype == np.uint32:
return _UINT32
if dtype == np.uint64:
return _UINT64
if dtype == np.float32:
return _FLOAT32
if dtype == np.float64:
return _FLOAT64
msg = f"unsupported dtype: {dtype}"
raise TypeError(msg)
def cdist(
    queries,
    choices,
    *,
    scorer=ratio,
    processor=None,
    score_cutoff=None,
    score_hint=None,
    score_multiplier=1,
    dtype=None,
    workers=1,
    **kwargs,
):
    # Thin wrapper around the C++ implementation: translate the numpy dtype
    # into the internal type constant and hand the result back as an ndarray.
    import numpy as np

    matrix = _cdist(
        queries,
        choices,
        scorer=scorer,
        processor=processor,
        score_cutoff=score_cutoff,
        score_hint=score_hint,
        score_multiplier=score_multiplier,
        dtype=_dtype_to_type_num(dtype),
        workers=workers,
        **kwargs,
    )
    return np.asarray(matrix)


# reuse the docstring of the C++ implementation
cdist.__doc__ = _cdist.__doc__
def cpdist(
    queries,
    choices,
    *,
    scorer=ratio,
    processor=None,
    score_cutoff=None,
    score_hint=None,
    score_multiplier=1,
    dtype=None,
    workers=1,
    **kwargs,
):
    # Thin wrapper around the C++ implementation: translate the numpy dtype
    # into the internal type constant and hand the result back as an ndarray.
    import numpy as np

    return np.asarray(
        _cpdist(
            queries,
            choices,
            scorer=scorer,
            processor=processor,
            score_cutoff=score_cutoff,
            score_hint=score_hint,
            score_multiplier=score_multiplier,
            dtype=_dtype_to_type_num(dtype),
            workers=workers,
            **kwargs,
        )
    )


# reuse the docstring of the C++ implementation
cpdist.__doc__ = _cpdist.__doc__

View File

@@ -0,0 +1,679 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
import heapq
from rapidfuzz._utils import ScorerFlag, is_none, setupPandas
from rapidfuzz.fuzz import WRatio, ratio
# Public API of the pure python backend. Includes "cpdist", which this module
# defines and which the other backends' __all__ lists also export.
__all__ = ["cdist", "cpdist", "extract", "extractOne", "extract_iter"]
def _get_scorer_flags_py(scorer, scorer_kwargs):
params = getattr(scorer, "_RF_ScorerPy", None)
if params is not None:
flags = params["get_scorer_flags"](**scorer_kwargs)
return (flags["worst_score"], flags["optimal_score"])
return (0, 100)
def extract_iter(
    query,
    choices,
    *,
    scorer=WRatio,
    processor=None,
    score_cutoff=None,
    score_hint=None,
    scorer_kwargs=None,
):
    """
    Compare *query* against every choice and yield each match that passes
    ``score_cutoff``, in input order.

    Parameters
    ----------
    query : Sequence[Hashable]
        string we want to find
    choices : Iterable[Sequence[Hashable]] | Mapping[Sequence[Hashable]]
        list of all strings the query should be compared with or dict with a
        mapping {<result>: <string to compare>}
    scorer : Callable, optional
        Callable used to calculate the matching score between the query and
        each choice. May be any scorer included in RapidFuzz (edit distance or
        normalized edit distance) or a custom function returning a normalized
        edit distance. fuzz.WRatio is used by default.
    processor : Callable, optional
        Callable used to preprocess the strings before comparing them.
        Default is None (no preprocessing).
    score_cutoff : Any, optional
        Score threshold. For edit distances, matches with
        ``distance > score_cutoff`` are skipped; for normalized edit
        distances, matches with ``similarity < score_cutoff`` are skipped.
        Default is None (threshold disabled).
    score_hint : Any, optional
        Expected score, used by some backends to select a faster
        implementation. Accepted for API compatibility; unused here.
    scorer_kwargs : dict[str, Any], optional
        Any other named parameters passed through to the scorer, e.g.
        weights for ``Levenshtein.distance``.

    Yields
    ------
    tuple[Sequence[Hashable], Any, Any]
        ``(choice, score, key)`` per accepted match, where ``choice`` is the
        compared value, ``score`` the scorer result, and ``key`` the mapping
        key (dict / pandas Series choices) or the list index (iterable
        choices).
    """
    _ = score_hint  # unused in the pure python backend
    scorer_kwargs = scorer_kwargs or {}
    worst_score, optimal_score = _get_scorer_flags_py(scorer, scorer_kwargs)
    # similarity scorers have optimal > worst; distance scorers the opposite
    higher_is_better = optimal_score > worst_score

    setupPandas()
    if is_none(query):
        return

    if score_cutoff is None:
        score_cutoff = worst_score

    if processor is not None:
        query = processor(query)

    pairs = choices.items() if hasattr(choices, "items") else enumerate(choices)
    for key, choice in pairs:
        if is_none(choice):
            continue

        compare_to = choice if processor is None else processor(choice)
        score = scorer(query, compare_to, score_cutoff=score_cutoff, **scorer_kwargs)

        accepted = score >= score_cutoff if higher_is_better else score <= score_cutoff
        if accepted:
            yield (choice, score, key)
def extractOne(
    query,
    choices,
    *,
    scorer=WRatio,
    processor=None,
    score_cutoff=None,
    score_hint=None,
    scorer_kwargs=None,
):
    """
    Find the best match in a list of choices. When multiple elements have the
    same similarity, the first element is returned.

    Parameters
    ----------
    query : Sequence[Hashable]
        string we want to find
    choices : Iterable[Sequence[Hashable]] | Mapping[Sequence[Hashable]]
        list of all strings the query should be compared with or dict with a
        mapping {<result>: <string to compare>}
    scorer : Callable, optional
        Callable used to calculate the matching score between the query and
        each choice. May be any scorer included in RapidFuzz (edit distance or
        normalized edit distance) or a custom function returning a normalized
        edit distance. fuzz.WRatio is used by default.
    processor : Callable, optional
        Callable used to preprocess the strings before comparing them.
        Default is None (no preprocessing).
    score_cutoff : Any, optional
        Score threshold. For edit distances, matches with
        ``distance > score_cutoff`` are ignored; for normalized edit
        distances, matches with ``similarity < score_cutoff`` are ignored.
        Default is None (threshold disabled).
    score_hint : Any, optional
        Expected score, used by some backends to select a faster
        implementation. Accepted for API compatibility; unused here.
    scorer_kwargs : dict[str, Any], optional
        Any other named parameters passed through to the scorer, e.g.
        weights for ``Levenshtein.distance``.

    Returns
    -------
    tuple[Sequence[Hashable], Any, Any]
        ``(choice, score, key)`` of the best match, where ``choice`` is the
        compared value, ``score`` the scorer result, and ``key`` the mapping
        key (dict / pandas Series choices) or the list index (iterable
        choices).
    None
        When no choice passes ``score_cutoff``.

    Examples
    --------
    >>> from rapidfuzz.process import extractOne
    >>> from rapidfuzz.fuzz import ratio
    >>> extractOne("abcd", ["abce"], scorer=ratio)
    ("abce", 75.0, 0)
    >>> extractOne("abcd", {"key": "abce"}, scorer=ratio)
    ("abce", 75.0, "key")
    >>> extractOne("abcd", ["abce"], scorer=ratio, score_cutoff=80)
    None
    """
    _ = score_hint  # unused in the pure python backend
    scorer_kwargs = scorer_kwargs or {}
    worst_score, optimal_score = _get_scorer_flags_py(scorer, scorer_kwargs)
    # similarity scorers have optimal > worst; distance scorers the opposite
    higher_is_better = optimal_score > worst_score

    setupPandas()
    if is_none(query):
        return None

    if score_cutoff is None:
        score_cutoff = worst_score

    if processor is not None:
        query = processor(query)

    best = None
    pairs = choices.items() if hasattr(choices, "items") else enumerate(choices)
    for key, choice in pairs:
        if is_none(choice):
            continue

        compare_to = choice if processor is None else processor(choice)
        score = scorer(query, compare_to, score_cutoff=score_cutoff, **scorer_kwargs)

        # a strict improvement tightens the cutoff, so ties keep the first hit
        if higher_is_better:
            improved = score >= score_cutoff and (best is None or score > best[1])
        else:
            improved = score <= score_cutoff and (best is None or score < best[1])

        if improved:
            score_cutoff = score
            best = (choice, score, key)

        if score == optimal_score:
            break

    return best
def extract(
    query,
    choices,
    *,
    scorer=WRatio,
    processor=None,
    limit=5,
    score_cutoff=None,
    score_hint=None,
    scorer_kwargs=None,
):
    """
    Return a list of the best matches for ``query`` among ``choices``,
    sorted from best to worst. Matches with equal score keep the order
    of their index in ``choices``.

    Parameters
    ----------
    query : Sequence[Hashable]
        string we want to find
    choices : Collection[Sequence[Hashable]] | Mapping[Sequence[Hashable]]
        list of all strings the query should be compared with or dict with a mapping
        {<result>: <string to compare>}
    scorer : Callable, optional
        scorer used to rate how well each choice matches the query. May be any
        scorer shipped with RapidFuzz (calculating either an edit distance or a
        normalized edit distance) or a custom function returning a normalized
        edit distance. Defaults to fuzz.WRatio.
    processor : Callable, optional
        preprocessing function applied to the strings before comparing them.
        Defaults to None, which deactivates this behaviour.
    limit : int, optional
        maximum number of results to return; pass None to return every match.
        Defaults to 5.
    score_cutoff : Any, optional
        score threshold. When an edit distance is used, matches with
        ``distance > score_cutoff`` are dropped; when a normalized edit
        distance is used, matches with ``similarity < score_cutoff`` are
        dropped. Defaults to None, which deactivates this behaviour.
    score_hint : Any, optional
        expected score passed to the scorer, used to select a faster
        implementation. Defaults to None, which deactivates this behaviour.
    scorer_kwargs : dict[str, Any], optional
        additional keyword arguments forwarded to the scorer. This can be used
        to pass e.g. weights to `Levenshtein.distance`.

    Returns
    -------
    list[tuple[Sequence[Hashable], Any, Any]]
        A list of ``(choice, score, key)`` triples:

        * ``choice`` is the matched entry from ``choices``.
        * ``score`` is either an edit distance (0 for a perfect match; only
          choices with a ``distance <= score_cutoff`` are returned, e.g. for
          `Levenshtein.distance`) or a normalized similarity between 0 and 100
          (100 for a perfect match; only choices with a
          ``similarity >= score_cutoff`` are returned, e.g. for
          `Levenshtein.normalized_similarity`). Scorers not provided by
          RapidFuzz only support normalized edit distances.
        * ``key`` is the index of the choice for list-like ``choices`` and its
          key for mappings (dict, pandas Series).

        The list is sorted best match first (highest similarity or smallest
        distance, depending on the scorer).
    """
    kwargs = scorer_kwargs or {}
    worst, optimal = _get_scorer_flags_py(scorer, kwargs)
    # True for similarity scorers (bigger score == better match),
    # False for distance scorers (smaller score == better match).
    higher_is_better = optimal > worst

    # A single result is exactly what extractOne computes, so delegate.
    if limit == 1:
        best = extractOne(
            query,
            choices,
            processor=processor,
            scorer=scorer,
            score_cutoff=score_cutoff,
            score_hint=score_hint,
            scorer_kwargs=kwargs,
        )
        return [best] if best is not None else []

    matches = extract_iter(
        query,
        choices,
        processor=processor,
        scorer=scorer,
        score_cutoff=score_cutoff,
        score_hint=score_hint,
        scorer_kwargs=kwargs,
    )

    def score_of(match):
        # sort key: the score component of a (choice, score, key) triple
        return match[1]

    if limit is None:
        # No limit: fully sort all matches, best first.
        return sorted(matches, key=score_of, reverse=higher_is_better)

    # Bounded limit: heap selection avoids sorting the full result set.
    if higher_is_better:
        return heapq.nlargest(limit, matches, key=score_of)
    return heapq.nsmallest(limit, matches, key=score_of)
def _dtype_to_type_num(
dtype,
scorer,
scorer_kwargs,
):
import numpy as np
if dtype is not None:
return np.dtype(dtype)
params = getattr(scorer, "_RF_ScorerPy", None)
if params is not None:
flags = params["get_scorer_flags"](**scorer_kwargs)
if flags["flags"] & ScorerFlag.RESULT_I64:
return np.int32
if flags["flags"] & ScorerFlag.RESULT_SIZE_T:
return np.uint32
return np.float32
return np.float32
def _is_symmetric(scorer, scorer_kwargs):
params = getattr(scorer, "_RF_ScorerPy", None)
if params is not None:
flags = params["get_scorer_flags"](**scorer_kwargs)
if flags["flags"] & ScorerFlag.SYMMETRIC:
return True
return False
def cdist(
    queries,
    choices,
    *,
    scorer=ratio,
    processor=None,
    score_cutoff=None,
    score_hint=None,
    score_multiplier=1,
    dtype=None,
    workers=1,
    scorer_kwargs=None,
):
    """
    Compute distance/similarity between each pair of the two collections of inputs.

    Parameters
    ----------
    queries : Collection[Sequence[Hashable]]
        list of all strings the queries
    choices : Collection[Sequence[Hashable]]
        list of all strings the query should be compared
    scorer : Callable, optional
        Optional callable that is used to calculate the matching score between
        the query and each choice. This can be any of the scorers included in RapidFuzz
        (both scorers that calculate the edit distance or the normalized edit distance), or
        a custom function, which returns a normalized edit distance.
        fuzz.ratio is used by default.
    processor : Callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : Any, optional
        Optional argument for a score threshold to be passed to the scorer.
        Default is None, which deactivates this behaviour.
    score_hint : Any, optional
        Optional argument for an expected score to be passed to the scorer.
        This is used to select a faster implementation. Default is None,
        which deactivates this behaviour.
    score_multiplier: Any, optional
        Optional argument to multiply the calculated score with. This is applied as the final step,
        so e.g. score_cutoff is applied on the unmodified score. This is mostly useful to map from
        a floating point range to an integer to reduce the memory usage. Default is 1,
        which deactivates this behaviour.
    dtype : data-type, optional
        The desired data-type for the result array. Depending on the scorer type the following
        dtypes are supported:

        - similarity:
          - np.float32, np.float64
          - np.uint8 -> stores fixed point representation of the result scaled to a range 0-100
        - distance:
          - np.int8, np.int16, np.int32, np.int64

        If not given, then the type will be np.float32 for similarities and np.int32 for distances.
    workers : int, optional
        The calculation is subdivided into workers sections and evaluated in parallel.
        Supply -1 to use all available CPU cores.
        This argument is only available for scorers using the RapidFuzz C-API so far, since it
        releases the Python GIL.
    scorer_kwargs : dict[str, Any], optional
        any other named parameters are passed to the scorer. This can be used to pass
        e.g. weights to `Levenshtein.distance`

    Returns
    -------
    ndarray
        Returns a matrix of dtype with the distance/similarity between each pair
        of the two collections of inputs.
    """
    import numpy as np

    # workers/score_hint are accepted for API compatibility with the C++
    # implementation but have no effect in this pure-Python fallback.
    _ = workers, score_hint
    scorer_kwargs = scorer_kwargs or {}
    dtype = _dtype_to_type_num(dtype, scorer, scorer_kwargs)
    results = np.zeros((len(queries), len(choices)), dtype=dtype)

    setupPandas()
    # Preprocess every choice exactly once up front. Entries recognized by
    # is_none() are kept unchanged instead of being passed to the processor.
    if processor is None:
        proc_choices = list(choices)
    else:
        proc_choices = [x if is_none(x) else processor(x) for x in choices]

    if queries is choices and _is_symmetric(scorer, scorer_kwargs):
        # Fast path: identical input collections with a symmetric scorer only
        # need the diagonal plus the upper triangle; each off-diagonal score
        # is mirrored across the diagonal.
        for i, proc_query in enumerate(proc_choices):
            score = scorer(proc_query, proc_query, score_cutoff=score_cutoff, **scorer_kwargs) * score_multiplier
            if np.issubdtype(dtype, np.integer):
                # integral result arrays store the rounded score
                score = round(score)
            results[i, i] = score
            for j in range(i + 1, len(proc_choices)):
                score = (
                    scorer(
                        proc_query,
                        proc_choices[j],
                        score_cutoff=score_cutoff,
                        **scorer_kwargs,
                    )
                    * score_multiplier
                )
                if np.issubdtype(dtype, np.integer):
                    score = round(score)
                results[i, j] = results[j, i] = score
    else:
        for i, query in enumerate(queries):
            # NOTE(review): None-like queries/choices are handed to the scorer
            # unprocessed rather than skipped — presumably the scorer handles
            # them itself; confirm against the C++ implementation.
            proc_query = processor(query) if (processor and not is_none(query)) else query
            for j, choice in enumerate(proc_choices):
                score = (
                    scorer(
                        proc_query,
                        choice,
                        score_cutoff=score_cutoff,
                        **scorer_kwargs,
                    )
                    * score_multiplier
                )
                if np.issubdtype(dtype, np.integer):
                    score = round(score)
                results[i, j] = score
    return results
def cpdist(
    queries,
    choices,
    *,
    scorer=ratio,
    processor=None,
    score_cutoff=None,
    score_hint=None,
    score_multiplier=1,
    dtype=None,
    workers=1,
    scorer_kwargs=None,
):
    """
    Compute the distance/similarity of each query with its corresponding
    choice, i.e. ``queries[i]`` is only compared with ``choices[i]``.

    Parameters
    ----------
    queries : Collection[Sequence[Hashable]]
        strings used to compute the distance/similarity.
    choices : Collection[Sequence[Hashable]]
        strings the queries should be compared with; must have the same
        length as ``queries``.
    scorer : Callable, optional
        scorer used to rate how well each choice matches its query. May be any
        scorer shipped with RapidFuzz (calculating either an edit distance or
        a normalized edit distance) or a custom function returning a
        normalized edit distance. Defaults to fuzz.ratio.
    processor : Callable, optional
        preprocessing function applied to the strings before comparing them.
        Defaults to None, which deactivates this behaviour.
    score_cutoff : Any, optional
        score threshold passed through to the scorer. Defaults to None, which
        deactivates this behaviour.
    score_hint : Any, optional
        expected score passed to the scorer, used to select a faster
        implementation. Defaults to None, which deactivates this behaviour.
    score_multiplier: Any, optional
        factor the calculated score is multiplied with as the final step, so
        e.g. score_cutoff is applied on the unmodified score. Mostly useful to
        map a floating point range to an integer to reduce memory usage.
        Defaults to 1, which deactivates this behaviour.
    dtype : data-type, optional
        desired data-type of the result array. Depending on the scorer type
        the following dtypes are supported:

        - similarity:
          - np.float32, np.float64
          - np.uint8 -> stores fixed point representation of the result scaled to a range 0-100
        - distance:
          - np.int8, np.int16, np.int32, np.int64

        If not given, np.float32 is used for similarities and np.int32 for distances.
    workers : int, optional
        number of parallel sections the calculation is subdivided into.
        Supply -1 to use all available CPU cores. Only available for scorers
        using the RapidFuzz C-API so far, since it releases the Python GIL.
    scorer_kwargs : dict[str, Any], optional
        additional keyword arguments forwarded to the scorer. This can be used
        to pass e.g. weights to `Levenshtein.distance`.

    Returns
    -------
    ndarray
        matrix of size (n x 1) of dtype with the distance/similarity of each
        corresponding pair of the two collections of inputs.

    Raises
    ------
    ValueError
        If ``queries`` and ``choices`` have different lengths.
    """
    import numpy as np

    count = len(queries)
    if count != len(choices):
        raise ValueError("Length of queries and choices must be the same!")

    # workers/score_hint are accepted for API compatibility with the C++
    # implementation but have no effect in this pure-Python fallback.
    _ = workers, score_hint
    kwargs = scorer_kwargs or {}
    out_dtype = _dtype_to_type_num(dtype, scorer, kwargs)
    scores = np.zeros((count,), dtype=out_dtype)
    # integral result arrays store the rounded score
    integral = np.issubdtype(out_dtype, np.integer)

    setupPandas()
    for idx, (raw_query, raw_choice) in enumerate(zip(queries, choices)):
        # None-like entries bypass the processor and reach the scorer as-is.
        left = processor(raw_query) if (processor and not is_none(raw_query)) else raw_query
        right = processor(raw_choice) if (processor and not is_none(raw_choice)) else raw_choice
        value = (
            scorer(
                left,
                right,
                score_cutoff=score_cutoff,
                **kwargs,
            )
            * score_multiplier
        )
        scores[idx] = round(value) if integral else value
    return scores

View File

@@ -0,0 +1,65 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations

import contextlib
import os

from rapidfuzz._feature_detector import AVX2, SSE2, supports

__all__ = ["default_process"]

# The RAPIDFUZZ_IMPLEMENTATION environment variable lets the user force a
# backend:
#   "cpp"    -> require a C++ extension module (raises ImportError when even
#               the generic C++ module is missing)
#   "python" -> always use the pure-Python implementation
#   unset    -> try the C++ modules and silently fall back to pure Python
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")

if _impl == "cpp":
    imported = False
    # Prefer the most specialized SIMD build the CPU supports: AVX2 first,
    # then SSE2, then the generic C++ module.
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.utils_cpp_avx2 import (
                default_process,  # pyright: ignore[reportMissingImports]
            )

            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.utils_cpp_sse2 import (
                default_process,  # pyright: ignore[reportMissingImports]
            )

            imported = True
    if not imported:
        # Deliberately not suppressed: the user explicitly requested the C++
        # backend, so a missing generic module should raise ImportError.
        from rapidfuzz.utils_cpp import (
            default_process,  # pyright: ignore[reportMissingImports]
        )
elif _impl == "python":
    from rapidfuzz.utils_py import default_process
else:
    # Auto-detection: same SIMD preference order as the "cpp" branch, but
    # every C++ import failure is tolerated.
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.utils_cpp_avx2 import (
                default_process,  # pyright: ignore[reportMissingImports]
            )

            imported = True
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.utils_cpp_sse2 import (
                default_process,  # pyright: ignore[reportMissingImports]
            )

            imported = True
    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.utils_cpp import (
                default_process,  # pyright: ignore[reportMissingImports]
            )

            imported = True
    if not imported:
        # Last resort: the pure-Python implementation always works.
        from rapidfuzz.utils_py import default_process

View File

@@ -0,0 +1,11 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann

# Type stubs for rapidfuzz.utils: the actual implementation is selected at
# runtime (SIMD-specific C++ builds or the pure-Python fallback), so the
# public signatures are declared here.
from __future__ import annotations

from collections.abc import Hashable, Sequence
from typing import TypeVar

# default_process returns the same sequence type it receives.
# NOTE(review): the pure-Python fallback only handles str; presumably the C++
# backends accept arbitrary hashable sequences — confirm before relying on it.
_StringType = TypeVar("_StringType", bound=Sequence[Hashable])

def default_process(sentence: _StringType) -> _StringType: ...

View File

@@ -0,0 +1,32 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations
import re
# Matches every character outside the Unicode word class [a-zA-Z0-9_].
_alnum_regex = re.compile(r"(?ui)\W")


def default_process(sentence: str) -> str:
    """
    Preprocess a string by:

    * replacing every character that is not alphanumeric or underscore
      with a space
    * trimming leading/trailing whitespace
    * converting all characters to lower case

    Parameters
    ----------
    sentence : str
        String to preprocess

    Returns
    -------
    processed_string : str
        processed string
    """
    without_symbols = _alnum_regex.sub(" ", sentence)
    return without_symbols.strip().lower()