Fix project isolation: Make loadChatHistory respect active project sessions
- Modified loadChatHistory() to check for active project before fetching all sessions - When active project exists, use project.sessions instead of fetching from API - Added detailed console logging to debug session filtering - This prevents ALL sessions from appearing in every project's sidebar Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations

import contextlib
import os

from rapidfuzz._feature_detector import AVX2, SSE2, supports

__all__ = ["distance", "normalized_distance", "normalized_similarity", "similarity"]

# Select the backend at import time.  RAPIDFUZZ_IMPLEMENTATION may force
# "cpp" (raise if no C++ extension can be loaded) or "python" (pure Python);
# when unset, try the C++ backends and silently fall back to pure Python.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    if supports(AVX2):
        # Prefer the AVX2-optimized extension when the CPU supports it.
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                damerau_levenshtein_distance as distance,
                damerau_levenshtein_normalized_distance as normalized_distance,
                damerau_levenshtein_normalized_similarity as normalized_similarity,
                damerau_levenshtein_similarity as similarity,
            )

            # Only reached when the import above succeeded.
            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                damerau_levenshtein_distance as distance,
                damerau_levenshtein_normalized_distance as normalized_distance,
                damerau_levenshtein_normalized_similarity as normalized_similarity,
                damerau_levenshtein_similarity as similarity,
            )

            imported = True

    if not imported:
        # No ImportError suppression here: "cpp" was explicitly requested,
        # so failing to load any C++ backend should raise.
        from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
            damerau_levenshtein_distance as distance,
            damerau_levenshtein_normalized_distance as normalized_distance,
            damerau_levenshtein_normalized_similarity as normalized_similarity,
            damerau_levenshtein_similarity as similarity,
        )
elif _impl == "python":
    from rapidfuzz.distance.metrics_py import (
        damerau_levenshtein_distance as distance,
        damerau_levenshtein_normalized_distance as normalized_distance,
        damerau_levenshtein_normalized_similarity as normalized_similarity,
        damerau_levenshtein_similarity as similarity,
    )
else:
    # Auto-detection: try AVX2, then SSE2, then the generic C++ backend,
    # and finally fall back to the pure-Python implementation.
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                damerau_levenshtein_distance as distance,
                damerau_levenshtein_normalized_distance as normalized_distance,
                damerau_levenshtein_normalized_similarity as normalized_similarity,
                damerau_levenshtein_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                damerau_levenshtein_distance as distance,
                damerau_levenshtein_normalized_distance as normalized_distance,
                damerau_levenshtein_normalized_similarity as normalized_similarity,
                damerau_levenshtein_similarity as similarity,
            )

            imported = True

    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
                damerau_levenshtein_distance as distance,
                damerau_levenshtein_normalized_distance as normalized_distance,
                damerau_levenshtein_normalized_similarity as normalized_similarity,
                damerau_levenshtein_similarity as similarity,
            )

            imported = True

    if not imported:
        # Last resort: pure Python.  Not suppressed — if this fails there is
        # no usable implementation at all.
        from rapidfuzz.distance.metrics_py import (
            damerau_levenshtein_distance as distance,
            damerau_levenshtein_normalized_distance as normalized_distance,
            damerau_levenshtein_normalized_similarity as normalized_similarity,
            damerau_levenshtein_similarity as similarity,
        )
|
||||
@@ -0,0 +1,75 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann

from __future__ import annotations

from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload

# Placeholder types for raw inputs that *processor* converts to sequences of
# hashables before comparison.
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")

# Each function has two overloads: without a processor the inputs must
# already be sequences of hashables; with a processor arbitrary input types
# are accepted and converted first.
@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
|
||||
@@ -0,0 +1,233 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
|
||||
|
||||
def _damerau_levenshtein_distance_zhao(s1, s2):
    """
    Compute the (unrestricted) Damerau-Levenshtein distance between two
    sequences using the linear-space row-based algorithm of Zhao & Sahni.

    Only the current row ``R``, the previous row ``R1`` and the transposition
    helpers ``FR``/``T`` are kept instead of the full DP matrix.
    """
    # Sentinel larger than any reachable distance; initialises cells that
    # must never win a min().
    maxVal = max(len(s1), len(s2)) + 1
    # last_row_id[ch] = last row index i with s1[i-1] == ch.
    last_row_id = {}
    last_row_id_get = last_row_id.get  # hoisted bound method for the hot loop
    size = len(s2) + 2
    FR = [maxVal] * size
    R1 = [maxVal] * size
    R = list(range(size))
    R[-1] = maxVal

    for i in range(1, len(s1) + 1):
        # Reuse the previous "current" row as the new "previous" row.
        R, R1 = R1, R
        last_col_id = -1  # last column j with s1[i-1] == s2[j-1] in this row
        last_i2l1 = R[0]
        R[0] = i
        T = maxVal

        for j in range(1, len(s2) + 1):
            # Classic Levenshtein candidates: substitution, insertion, deletion.
            diag = R1[j - 1] + (s1[i - 1] != s2[j - 1])
            left = R[j - 1] + 1
            up = R1[j] + 1
            temp = min(diag, left, up)

            if s1[i - 1] == s2[j - 1]:
                last_col_id = j  # last occurrence of s1_i
                FR[j] = R1[j - 2]  # save H_k-1,j-2
                T = last_i2l1  # save H_i-2,l-1
            else:
                k = last_row_id_get(s2[j - 1], -1)
                l = last_col_id  # noqa: E741

                # Transposition candidates: moving a matching character across
                # the gap since its last occurrence.
                if (j - l) == 1:
                    transpose = FR[j] + (i - k)
                    temp = min(temp, transpose)
                elif (i - k) == 1:
                    transpose = T + (j - l)
                    temp = min(temp, transpose)

            last_i2l1 = R[j]
            R[j] = temp

        last_row_id[s1[i - 1]] = i

    return R[len(s2)]
|
||||
|
||||
|
||||
def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the Damerau-Levenshtein distance.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Maximum distance between s1 and s2, that is
        considered as a result. If the distance is bigger than score_cutoff,
        score_cutoff + 1 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    distance : int
        distance between s1 and s2

    Examples
    --------
    Find the Damerau-Levenshtein distance between two strings:

    >>> from rapidfuzz.distance import DamerauLevenshtein
    >>> DamerauLevenshtein.distance("CA", "ABC")
    2
    """
    # Apply the optional preprocessing step before conversion.
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    result = _damerau_levenshtein_distance_zhao(s1, s2)

    # Clamp to score_cutoff + 1 when a cutoff is set and exceeded.
    if score_cutoff is not None and result > score_cutoff:
        return score_cutoff + 1
    return result
|
||||
|
||||
|
||||
def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the Damerau-Levenshtein similarity in the range [max, 0].

    This is calculated as ``max(len1, len2) - distance``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Maximum distance between s1 and s2, that is
        considered as a result. If the similarity is smaller than score_cutoff,
        0 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    similarity : int
        similarity between s1 and s2
    """
    # Apply the optional preprocessing step before conversion.
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    longest = max(len(s1), len(s2))
    sim = longest - distance(s1, s2)

    # A similarity below the cutoff collapses to 0.
    if score_cutoff is not None and sim < score_cutoff:
        return 0
    return sim
|
||||
|
||||
|
||||
def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized Damerau-Levenshtein distance in the range [1, 0].

    This is calculated as ``distance / max(len1, len2)``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_dist > score_cutoff 1.0 is returned instead. Default is 1.0,
        which deactivates this behaviour.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # None/NaN inputs are treated as maximally distant.
    if is_none(s1) or is_none(s2):
        return 1.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    maximum = max(len(s1), len(s2))
    dist = distance(s1, s2)
    # Two empty sequences are identical: avoid division by zero.
    norm_dist = dist / maximum if maximum else 0.0
    # Return a float on every path (previously the cutoff path returned the
    # int `1`, inconsistent with the documented float return type and with
    # the Hamming module, which returns 1.0).
    return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1.0
|
||||
|
||||
|
||||
def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized Damerau-Levenshtein similarity in the range [0, 1].

    This is calculated as ``1 - normalized_distance``

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_sim < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # None/NaN inputs are treated as completely dissimilar.
    if is_none(s1) or is_none(s2):
        return 0.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    norm_dist = normalized_distance(s1, s2)
    norm_sim = 1.0 - norm_dist
    # Return a float on every path (previously the cutoff path returned the
    # int `0`, inconsistent with the documented float return type and with
    # the Hamming module, which returns 0.0).
    return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0.0
|
||||
116
.venv/lib/python3.11/site-packages/rapidfuzz/distance/Hamming.py
Normal file
116
.venv/lib/python3.11/site-packages/rapidfuzz/distance/Hamming.py
Normal file
@@ -0,0 +1,116 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations

import contextlib
import os

from rapidfuzz._feature_detector import AVX2, SSE2, supports

__all__ = [
    "distance",
    "editops",
    "normalized_distance",
    "normalized_similarity",
    "opcodes",
    "similarity",
]

# Select the backend at import time.  RAPIDFUZZ_IMPLEMENTATION may force
# "cpp" (raise if no C++ extension can be loaded) or "python" (pure Python);
# when unset, try the C++ backends and silently fall back to pure Python.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    if supports(AVX2):
        # Prefer the AVX2-optimized extension when the CPU supports it.
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                hamming_distance as distance,
                hamming_editops as editops,
                hamming_normalized_distance as normalized_distance,
                hamming_normalized_similarity as normalized_similarity,
                hamming_opcodes as opcodes,
                hamming_similarity as similarity,
            )

            # Only reached when the import above succeeded.
            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                hamming_distance as distance,
                hamming_editops as editops,
                hamming_normalized_distance as normalized_distance,
                hamming_normalized_similarity as normalized_similarity,
                hamming_opcodes as opcodes,
                hamming_similarity as similarity,
            )

            imported = True

    if not imported:
        # No ImportError suppression here: "cpp" was explicitly requested,
        # so failing to load any C++ backend should raise.
        from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
            hamming_distance as distance,
            hamming_editops as editops,
            hamming_normalized_distance as normalized_distance,
            hamming_normalized_similarity as normalized_similarity,
            hamming_opcodes as opcodes,
            hamming_similarity as similarity,
        )
elif _impl == "python":
    from rapidfuzz.distance.metrics_py import (
        hamming_distance as distance,
        hamming_editops as editops,
        hamming_normalized_distance as normalized_distance,
        hamming_normalized_similarity as normalized_similarity,
        hamming_opcodes as opcodes,
        hamming_similarity as similarity,
    )
else:
    # Auto-detection: try AVX2, then SSE2, then the generic C++ backend,
    # and finally fall back to the pure-Python implementation.
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                hamming_distance as distance,
                hamming_editops as editops,
                hamming_normalized_distance as normalized_distance,
                hamming_normalized_similarity as normalized_similarity,
                hamming_opcodes as opcodes,
                hamming_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                hamming_distance as distance,
                hamming_editops as editops,
                hamming_normalized_distance as normalized_distance,
                hamming_normalized_similarity as normalized_similarity,
                hamming_opcodes as opcodes,
                hamming_similarity as similarity,
            )

            imported = True

    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
                hamming_distance as distance,
                hamming_editops as editops,
                hamming_normalized_distance as normalized_distance,
                hamming_normalized_similarity as normalized_similarity,
                hamming_opcodes as opcodes,
                hamming_similarity as similarity,
            )

            imported = True

    if not imported:
        # Last resort: pure Python.  Not suppressed — if this fails there is
        # no usable implementation at all.
        from rapidfuzz.distance.metrics_py import (
            hamming_distance as distance,
            hamming_editops as editops,
            hamming_normalized_distance as normalized_distance,
            hamming_normalized_similarity as normalized_similarity,
            hamming_opcodes as opcodes,
            hamming_similarity as similarity,
        )
|
||||
@@ -0,0 +1,113 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann

from __future__ import annotations

from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload

from rapidfuzz.distance import Editops, Opcodes

# Placeholder types for raw inputs that *processor* converts to sequences of
# hashables before comparison.
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")

# Each function has two overloads: without a processor the inputs must
# already be sequences of hashables; with a processor arbitrary input types
# are accepted and converted first.
@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    pad: bool = True,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    pad: bool = True,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    pad: bool = True,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    pad: bool = True,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    pad: bool = True,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    pad: bool = True,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    pad: bool = True,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    pad: bool = True,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...

# `pad` added to editops/opcodes to match the runtime implementation, which
# accepts `pad=True` as a keyword-only parameter; without it type checkers
# reject valid calls such as editops(a, b, pad=False).
@overload
def editops(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    pad: bool = True,
    processor: None = None,
) -> Editops: ...
@overload
def editops(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    pad: bool = True,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
) -> Editops: ...
@overload
def opcodes(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    pad: bool = True,
    processor: None = None,
) -> Opcodes: ...
@overload
def opcodes(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    pad: bool = True,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
) -> Opcodes: ...
|
||||
@@ -0,0 +1,322 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
from rapidfuzz.distance._initialize_py import Editop, Editops
|
||||
|
||||
|
||||
def distance(
    s1,
    s2,
    *,
    pad=True,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the Hamming distance between two strings.
    The hamming distance is defined as the number of positions
    where the two strings differ. It describes the minimum
    amount of substitutions required to transform s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    pad : bool, optional
        should strings be padded if there is a length difference.
        If pad is False and strings have a different length
        a ValueError is thrown instead. Defaults is True.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int or None, optional
        Maximum distance between s1 and s2, that is
        considered as a result. If the distance is bigger than score_cutoff,
        score_cutoff + 1 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    distance : int
        distance between s1 and s2

    Raises
    ------
    ValueError
        If s1 and s2 have a different length
    """
    # Apply the optional preprocessing step before conversion.
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    s1, s2 = conv_sequences(s1, s2)

    if not pad and len(s1) != len(s2):
        msg = "Sequences are not the same length."
        raise ValueError(msg)

    # Positions beyond the shorter string always count as mismatches, so start
    # from the longer length and subtract one for every matching position.
    mismatches = max(len(s1), len(s2)) - sum(a == b for a, b in zip(s1, s2))

    # Clamp to score_cutoff + 1 when a cutoff is set and exceeded.
    if score_cutoff is not None and mismatches > score_cutoff:
        return score_cutoff + 1
    return mismatches
|
||||
|
||||
|
||||
def similarity(
    s1,
    s2,
    *,
    pad=True,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the Hamming similarity between two strings.

    This is calculated as ``len1 - distance``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    pad : bool, optional
        should strings be padded if there is a length difference.
        If pad is False and strings have a different length
        a ValueError is thrown instead. Defaults is True.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Maximum distance between s1 and s2, that is
        considered as a result. If the similarity is smaller than score_cutoff,
        0 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    distance : int
        distance between s1 and s2

    Raises
    ------
    ValueError
        If s1 and s2 have a different length
    """
    # Apply the optional preprocessing step before conversion.
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    longest = max(len(s1), len(s2))
    sim = longest - distance(s1, s2, pad=pad)

    # A similarity below the cutoff collapses to 0.
    if score_cutoff is not None and sim < score_cutoff:
        return 0
    return sim
|
||||
|
||||
|
||||
def normalized_distance(
    s1,
    s2,
    *,
    pad=True,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized Hamming distance in the range [1, 0].

    This is calculated as ``distance / max(len1, len2)``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    pad : bool, optional
        should strings be padded if there is a length difference.
        If pad is False and strings have a different length
        a ValueError is thrown instead. Defaults is True.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_dist > score_cutoff 1.0 is returned instead. Default is 1.0,
        which deactivates this behaviour.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 0 and 1.0

    Raises
    ------
    ValueError
        If s1 and s2 have a different length
    """
    setupPandas()
    # None/NaN inputs are treated as maximally distant.
    if is_none(s1) or is_none(s2):
        return 1.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    maximum = max(len(s1), len(s2))
    dist = distance(s1, s2, pad=pad)
    # Two empty sequences are identical: avoid division by zero.
    norm_dist = dist / maximum if maximum else 0

    return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1.0
|
||||
|
||||
|
||||
def normalized_similarity(
    s1,
    s2,
    *,
    pad=True,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized Hamming similarity in the range [0, 1].

    This is calculated as ``1 - normalized_distance``

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    pad : bool, optional
        should strings be padded if there is a length difference.
        If pad is False and strings have a different length
        a ValueError is thrown instead. Defaults is True.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_sim < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0

    Raises
    ------
    ValueError
        If s1 and s2 have a different length
    """
    setupPandas()
    # None/NaN inputs are treated as completely dissimilar.
    if is_none(s1) or is_none(s2):
        return 0.0

    # NOTE(review): conv_sequences runs *before* normalized_distance applies
    # `processor`, so the processor receives the converted sequences.  Confirm
    # this ordering is intentional — the DamerauLevenshtein module applies the
    # processor before conversion.
    s1, s2 = conv_sequences(s1, s2)
    norm_dist = normalized_distance(s1, s2, pad=pad, processor=processor)
    norm_sim = 1 - norm_dist

    return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0.0
|
||||
|
||||
|
||||
def editops(
    s1,
    s2,
    *,
    pad=True,
    processor=None,
):
    """
    Return Editops describing how to turn s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    pad : bool, optional
        should strings be padded if there is a length difference.
        If pad is False and strings have a different length
        a ValueError is thrown instead. Defaults is True.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.

    Returns
    -------
    editops : Editops
        edit operations required to turn s1 into s2

    Raises
    ------
    ValueError
        If pad is False and s1 and s2 have a different length
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)

    if not pad and len(s1) != len(s2):
        msg = "Sequences are not the same length."
        raise ValueError(msg)

    ops_list = []
    min_len = min(len(s1), len(s2))
    # Differing positions inside the common prefix become replacements.
    for i in range(min_len):
        if s1[i] != s2[i]:
            ops_list.append(Editop("replace", i, i))

    # Trailing characters only present in s1 are deleted ...
    for i in range(min_len, len(s1)):
        ops_list.append(Editop("delete", i, len(s2)))

    # ... and trailing characters only present in s2 are inserted.
    for i in range(min_len, len(s2)):
        ops_list.append(Editop("insert", len(s1), i))

    # sidestep input validation
    ops = Editops.__new__(Editops)
    ops._src_len = len(s1)
    ops._dest_len = len(s2)
    ops._editops = ops_list
    return ops
|
||||
|
||||
|
||||
def opcodes(
    s1,
    s2,
    *,
    pad=True,
    processor=None,
):
    """
    Return Opcodes describing how to turn s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    pad : bool, optional
        should strings be padded if there is a length difference.
        If pad is False and strings have a different length
        a ValueError is thrown instead. Defaults is True.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.

    Returns
    -------
    opcodes : Opcodes
        edit operations required to turn s1 into s2
    """
    # Opcodes are derived directly from the editops representation.
    return editops(s1, s2, pad=pad, processor=processor).as_opcodes()
|
||||
116
.venv/lib/python3.11/site-packages/rapidfuzz/distance/Indel.py
Normal file
116
.venv/lib/python3.11/site-packages/rapidfuzz/distance/Indel.py
Normal file
@@ -0,0 +1,116 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations

import contextlib
import os

from rapidfuzz._feature_detector import AVX2, SSE2, supports

__all__ = [
    "distance",
    "editops",
    "normalized_distance",
    "normalized_similarity",
    "opcodes",
    "similarity",
]

# Backend selection for the Indel metric functions.
# RAPIDFUZZ_IMPLEMENTATION may force "cpp" or "python"; any other value
# (including unset) triggers automatic selection: best available SIMD C++
# build (AVX2, then SSE2), then the generic C++ build, then pure Python.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    # C++ backend explicitly requested: prefer the AVX2 build, then SSE2.
    # The final generic C++ import is deliberately NOT wrapped in
    # suppress(), so a missing native extension raises ImportError.
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                indel_distance as distance,
                indel_editops as editops,
                indel_normalized_distance as normalized_distance,
                indel_normalized_similarity as normalized_similarity,
                indel_opcodes as opcodes,
                indel_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                indel_distance as distance,
                indel_editops as editops,
                indel_normalized_distance as normalized_distance,
                indel_normalized_similarity as normalized_similarity,
                indel_opcodes as opcodes,
                indel_similarity as similarity,
            )

            imported = True

    if not imported:
        from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
            indel_distance as distance,
            indel_editops as editops,
            indel_normalized_distance as normalized_distance,
            indel_normalized_similarity as normalized_similarity,
            indel_opcodes as opcodes,
            indel_similarity as similarity,
        )
elif _impl == "python":
    # Pure-Python backend explicitly requested.
    from rapidfuzz.distance.metrics_py import (
        indel_distance as distance,
        indel_editops as editops,
        indel_normalized_distance as normalized_distance,
        indel_normalized_similarity as normalized_similarity,
        indel_opcodes as opcodes,
        indel_similarity as similarity,
    )
else:
    # Automatic selection: try each C++ build in order of preference and
    # fall back to the pure-Python implementation if none can be imported.
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                indel_distance as distance,
                indel_editops as editops,
                indel_normalized_distance as normalized_distance,
                indel_normalized_similarity as normalized_similarity,
                indel_opcodes as opcodes,
                indel_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                indel_distance as distance,
                indel_editops as editops,
                indel_normalized_distance as normalized_distance,
                indel_normalized_similarity as normalized_similarity,
                indel_opcodes as opcodes,
                indel_similarity as similarity,
            )

            imported = True

    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
                indel_distance as distance,
                indel_editops as editops,
                indel_normalized_distance as normalized_distance,
                indel_normalized_similarity as normalized_similarity,
                indel_opcodes as opcodes,
                indel_similarity as similarity,
            )

            imported = True

    if not imported:
        # Last resort: pure-Python implementation (always available).
        from rapidfuzz.distance.metrics_py import (
            indel_distance as distance,
            indel_editops as editops,
            indel_normalized_distance as normalized_distance,
            indel_normalized_similarity as normalized_similarity,
            indel_opcodes as opcodes,
            indel_similarity as similarity,
        )
|
||||
105
.venv/lib/python3.11/site-packages/rapidfuzz/distance/Indel.pyi
Normal file
105
.venv/lib/python3.11/site-packages/rapidfuzz/distance/Indel.pyi
Normal file
@@ -0,0 +1,105 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann

# Typing stubs for the Indel distance metrics. Each function is overloaded:
# without a processor the inputs must already be sequences of hashables;
# with a processor, arbitrary inputs are accepted and converted first.

from __future__ import annotations

from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload

from rapidfuzz.distance import Editops, Opcodes

_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")

@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def editops(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
) -> Editops: ...
@overload
def editops(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
) -> Editops: ...
@overload
def opcodes(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
) -> Opcodes: ...
@overload
def opcodes(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
) -> Opcodes: ...
|
||||
@@ -0,0 +1,358 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
from rapidfuzz.distance.LCSseq_py import (
|
||||
_block_similarity as lcs_seq_block_similarity,
|
||||
editops as lcs_seq_editops,
|
||||
opcodes as lcs_seq_opcodes,
|
||||
similarity as lcs_seq_similarity,
|
||||
)
|
||||
|
||||
|
||||
def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Compute the Indel distance between two sequences: the minimum number
    of insertions and deletions needed to turn one into the other. This
    equals the Levenshtein distance with substitutions weighted 2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First sequence to compare.
    s2 : Sequence[Hashable]
        Second sequence to compare.
    processor : callable, optional
        Optional callable applied to both inputs before comparison.
        Default is None, which disables preprocessing.
    score_cutoff : int, optional
        Maximum distance of interest. When the real distance exceeds the
        cutoff, ``score_cutoff + 1`` is returned instead. Default is
        None, which disables the cutoff.

    Returns
    -------
    distance : int
        Indel distance between s1 and s2.

    Examples
    --------
    Find the Indel distance between two strings:

    >>> from rapidfuzz.distance import Indel
    >>> Indel.distance("lewenstein", "levenshtein")
    3

    Setting a maximum distance allows the implementation to select
    a more efficient implementation:

    >>> Indel.distance("lewenstein", "levenshtein", score_cutoff=1)
    2

    """
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    # distance = total length minus twice the longest common subsequence
    total_len = len(s1) + len(s2)
    dist = total_len - 2 * lcs_seq_similarity(s1, s2)
    if score_cutoff is not None and dist > score_cutoff:
        return score_cutoff + 1
    return dist
|
||||
|
||||
|
||||
def _block_distance(
    block,
    s1,
    s2,
    score_cutoff=None,
):
    # Same computation as distance(), but reuses a precomputed character
    # block for the LCS similarity step.
    total_len = len(s1) + len(s2)
    dist = total_len - 2 * lcs_seq_block_similarity(block, s1, s2)
    if score_cutoff is not None and dist > score_cutoff:
        return score_cutoff + 1
    return dist
|
||||
|
||||
|
||||
def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Compute the Indel similarity in the range [max, 0], calculated as
    ``(len(s1) + len(s2)) - distance``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First sequence to compare.
    s2 : Sequence[Hashable]
        Second sequence to compare.
    processor : callable, optional
        Optional callable applied to both inputs before comparison.
        Default is None, which disables preprocessing.
    score_cutoff : int, optional
        Minimum similarity of interest. When the real similarity is
        smaller than the cutoff, 0 is returned instead. Default is None,
        which disables the cutoff.

    Returns
    -------
    similarity : int
        similarity between s1 and s2
    """
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    # similarity is the complement of the distance w.r.t. the maximum.
    sim = (len(s1) + len(s2)) - distance(s1, s2)
    if score_cutoff is not None and sim < score_cutoff:
        return 0
    return sim
|
||||
|
||||
|
||||
def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized Indel distance in the range [0, 1.0].

    This is calculated as ``distance / (len1 + len2)``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_dist > score_cutoff 1.0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # None-like inputs (e.g. pandas NA) are treated as maximally distant.
    if is_none(s1) or is_none(s2):
        return 1.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    maximum = len(s1) + len(s2)
    dist = distance(s1, s2)
    # Two empty sequences are identical: define their distance as 0.0.
    # Return floats consistently (the documented return type is float).
    norm_dist = dist / maximum if maximum else 0.0
    return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1.0
|
||||
|
||||
|
||||
def _block_normalized_distance(
    block,
    s1,
    s2,
    score_cutoff=None,
):
    # Scale the block-based distance by the maximum possible distance.
    total_len = len(s1) + len(s2)
    dist = _block_distance(block, s1, s2)
    norm_dist = dist / total_len if total_len else 0
    if score_cutoff is not None and norm_dist > score_cutoff:
        return 1
    return norm_dist
|
||||
|
||||
|
||||
def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized indel similarity in the range [0, 1].

    This is calculated as ``1 - normalized_distance``

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_sim < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0

    Examples
    --------
    Find the normalized Indel similarity between two strings:

    >>> from rapidfuzz.distance import Indel
    >>> Indel.normalized_similarity("lewenstein", "levenshtein")
    0.8571428571428572

    Setting a score_cutoff allows the implementation to select
    a more efficient implementation:

    >>> Indel.normalized_similarity("lewenstein", "levenshtein", score_cutoff=0.9)
    0.0

    When a different processor is used s1 and s2 do not have to be strings

    >>> Indel.normalized_similarity(["lewenstein"], ["levenshtein"], processor=lambda s: s[0])
    0.8571428571428572
    """
    setupPandas()
    # None-like inputs (e.g. pandas NA) have zero similarity.
    if is_none(s1) or is_none(s2):
        return 0.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    norm_dist = normalized_distance(s1, s2)
    norm_sim = 1.0 - norm_dist
    # Return 0.0 (not int 0) on cutoff: the documented return type is float.
    return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0.0
|
||||
|
||||
|
||||
def _block_normalized_similarity(
    block,
    s1,
    s2,
    score_cutoff=None,
):
    # Complement of the block-based normalized distance.
    norm_sim = 1.0 - _block_normalized_distance(block, s1, s2)
    if score_cutoff is not None and norm_sim < score_cutoff:
        return 0
    return norm_sim
|
||||
|
||||
|
||||
def editops(
    s1,
    s2,
    *,
    processor=None,
):
    """
    Compute Editops describing how to transform s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First sequence to compare.
    s2 : Sequence[Hashable]
        Second sequence to compare.
    processor : callable, optional
        Optional callable applied to both inputs before comparison.
        Default is None, which disables preprocessing.

    Returns
    -------
    editops : Editops
        Edit operations required to turn s1 into s2.

    Notes
    -----
    The alignment is calculated using an algorithm of Heikki Hyyrö, which is
    described [6]_. It has a time complexity and memory usage of ``O([N/64] * M)``.

    References
    ----------
    .. [6] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).

    Examples
    --------
    >>> from rapidfuzz.distance import Indel
    >>> for tag, src_pos, dest_pos in Indel.editops("qabxcd", "abycdf"):
    ...    print(("%7s s1[%d] s2[%d]" % (tag, src_pos, dest_pos)))
     delete s1[0] s2[0]
     delete s1[3] s2[2]
     insert s1[4] s2[2]
     insert s1[6] s2[5]
    """
    # Indel edit operations are exactly the LCS-based edit operations,
    # since neither metric permits substitutions.
    return lcs_seq_editops(s1, s2, processor=processor)
|
||||
|
||||
|
||||
def opcodes(
    s1,
    s2,
    *,
    processor=None,
):
    """
    Compute Opcodes describing how to transform s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First sequence to compare.
    s2 : Sequence[Hashable]
        Second sequence to compare.
    processor : callable, optional
        Optional callable applied to both inputs before comparison.
        Default is None, which disables preprocessing.

    Returns
    -------
    opcodes : Opcodes
        Edit operations required to turn s1 into s2.

    Notes
    -----
    The alignment is calculated using an algorithm of Heikki Hyyrö, which is
    described [7]_. It has a time complexity and memory usage of ``O([N/64] * M)``.

    References
    ----------
    .. [7] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).

    Examples
    --------
    >>> from rapidfuzz.distance import Indel

    >>> a = "qabxcd"
    >>> b = "abycdf"
    >>> for tag, i1, i2, j1, j2 in Indel.opcodes(a, b):
    ...    print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
    ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))
     delete a[0:1] (q) b[0:0] ()
      equal a[1:3] (ab) b[0:2] (ab)
     delete a[3:4] (x) b[2:2] ()
     insert a[4:4] () b[2:3] (y)
      equal a[4:6] (cd) b[3:5] (cd)
     insert a[6:6] () b[5:6] (f)
    """
    # Indel opcodes coincide with the LCS opcodes: no substitutions exist
    # in either metric.
    return lcs_seq_opcodes(s1, s2, processor=processor)
|
||||
@@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations

import contextlib
import os

from rapidfuzz._feature_detector import AVX2, SSE2, supports

__all__ = ["distance", "normalized_distance", "normalized_similarity", "similarity"]

# Backend selection for the Jaro metric functions.
# RAPIDFUZZ_IMPLEMENTATION may force "cpp" or "python"; any other value
# (including unset) triggers automatic selection: best available SIMD C++
# build (AVX2, then SSE2), then the generic C++ build, then pure Python.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    # C++ backend explicitly requested: prefer the AVX2 build, then SSE2.
    # The final generic C++ import is deliberately NOT wrapped in
    # suppress(), so a missing native extension raises ImportError.
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                jaro_distance as distance,
                jaro_normalized_distance as normalized_distance,
                jaro_normalized_similarity as normalized_similarity,
                jaro_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                jaro_distance as distance,
                jaro_normalized_distance as normalized_distance,
                jaro_normalized_similarity as normalized_similarity,
                jaro_similarity as similarity,
            )

            imported = True

    if not imported:
        from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
            jaro_distance as distance,
            jaro_normalized_distance as normalized_distance,
            jaro_normalized_similarity as normalized_similarity,
            jaro_similarity as similarity,
        )
elif _impl == "python":
    # Pure-Python backend explicitly requested.
    from rapidfuzz.distance.metrics_py import (
        jaro_distance as distance,
        jaro_normalized_distance as normalized_distance,
        jaro_normalized_similarity as normalized_similarity,
        jaro_similarity as similarity,
    )
else:
    # Automatic selection: try each C++ build in order of preference and
    # fall back to the pure-Python implementation if none can be imported.
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                jaro_distance as distance,
                jaro_normalized_distance as normalized_distance,
                jaro_normalized_similarity as normalized_similarity,
                jaro_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                jaro_distance as distance,
                jaro_normalized_distance as normalized_distance,
                jaro_normalized_similarity as normalized_similarity,
                jaro_similarity as similarity,
            )

            imported = True

    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
                jaro_distance as distance,
                jaro_normalized_distance as normalized_distance,
                jaro_normalized_similarity as normalized_similarity,
                jaro_similarity as similarity,
            )

            imported = True

    if not imported:
        # Last resort: pure-Python implementation (always available).
        from rapidfuzz.distance.metrics_py import (
            jaro_distance as distance,
            jaro_normalized_distance as normalized_distance,
            jaro_normalized_similarity as normalized_similarity,
            jaro_similarity as similarity,
        )
|
||||
@@ -0,0 +1,75 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann

# Typing stubs for the Jaro similarity metrics. Each function is overloaded:
# without a processor the inputs must already be sequences of hashables;
# with a processor, arbitrary inputs are accepted and converted first.

from __future__ import annotations

from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload

_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")

@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
|
||||
@@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations

import contextlib
import os

from rapidfuzz._feature_detector import AVX2, SSE2, supports

__all__ = ["distance", "normalized_distance", "normalized_similarity", "similarity"]

# Backend selection for the Jaro-Winkler metric functions.
# RAPIDFUZZ_IMPLEMENTATION may force "cpp" or "python"; any other value
# (including unset) triggers automatic selection: best available SIMD C++
# build (AVX2, then SSE2), then the generic C++ build, then pure Python.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    # C++ backend explicitly requested: prefer the AVX2 build, then SSE2.
    # The final generic C++ import is deliberately NOT wrapped in
    # suppress(), so a missing native extension raises ImportError.
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                jaro_winkler_distance as distance,
                jaro_winkler_normalized_distance as normalized_distance,
                jaro_winkler_normalized_similarity as normalized_similarity,
                jaro_winkler_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                jaro_winkler_distance as distance,
                jaro_winkler_normalized_distance as normalized_distance,
                jaro_winkler_normalized_similarity as normalized_similarity,
                jaro_winkler_similarity as similarity,
            )

            imported = True

    if not imported:
        from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
            jaro_winkler_distance as distance,
            jaro_winkler_normalized_distance as normalized_distance,
            jaro_winkler_normalized_similarity as normalized_similarity,
            jaro_winkler_similarity as similarity,
        )
elif _impl == "python":
    # Pure-Python backend explicitly requested.
    from rapidfuzz.distance.metrics_py import (
        jaro_winkler_distance as distance,
        jaro_winkler_normalized_distance as normalized_distance,
        jaro_winkler_normalized_similarity as normalized_similarity,
        jaro_winkler_similarity as similarity,
    )
else:
    # Automatic selection: try each C++ build in order of preference and
    # fall back to the pure-Python implementation if none can be imported.
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                jaro_winkler_distance as distance,
                jaro_winkler_normalized_distance as normalized_distance,
                jaro_winkler_normalized_similarity as normalized_similarity,
                jaro_winkler_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                jaro_winkler_distance as distance,
                jaro_winkler_normalized_distance as normalized_distance,
                jaro_winkler_normalized_similarity as normalized_similarity,
                jaro_winkler_similarity as similarity,
            )

            imported = True

    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
                jaro_winkler_distance as distance,
                jaro_winkler_normalized_distance as normalized_distance,
                jaro_winkler_normalized_similarity as normalized_similarity,
                jaro_winkler_similarity as similarity,
            )

            imported = True

    if not imported:
        # Last resort: pure-Python implementation (always available).
        from rapidfuzz.distance.metrics_py import (
            jaro_winkler_distance as distance,
            jaro_winkler_normalized_distance as normalized_distance,
            jaro_winkler_normalized_similarity as normalized_similarity,
            jaro_winkler_similarity as similarity,
        )
|
||||
@@ -0,0 +1,83 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann

# Typing stubs for the Jaro-Winkler metrics. Each function is overloaded:
# without a processor the inputs must already be sequences of hashables;
# with a processor, arbitrary inputs are accepted and converted first.
# prefix_weight controls the boost applied for a common prefix.

from __future__ import annotations

from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload

_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")

@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    prefix_weight: float = 0.1,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    prefix_weight: float = 0.1,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    prefix_weight: float = 0.1,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    prefix_weight: float = 0.1,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    prefix_weight: float = 0.1,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    prefix_weight: float = 0.1,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    prefix_weight: float = 0.1,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    prefix_weight: float = 0.1,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
|
||||
@@ -0,0 +1,235 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
from rapidfuzz.distance import Jaro_py as Jaro
|
||||
|
||||
|
||||
def similarity(
    s1,
    s2,
    *,
    prefix_weight=0.1,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the jaro winkler similarity

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    prefix_weight : float, optional
        Weight used for the common prefix of the two strings.
        Has to be between 0 and 0.25. Default is 0.1.
        (NOTE: the implementation only validates the range 0.0 - 1.0.)
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For ratio < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 1.0

    Raises
    ------
    ValueError
        If prefix_weight is invalid
    """
    setupPandas()
    # None-like inputs (e.g. pandas NA) have zero similarity.
    if is_none(s1) or is_none(s2):
        return 0.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    if score_cutoff is None:
        score_cutoff = 0

    if prefix_weight > 1.0 or prefix_weight < 0.0:
        msg = "prefix_weight has to be in the range 0.0 - 1.0"
        raise ValueError(msg)

    s1, s2 = conv_sequences(s1, s2)
    P_len = len(s1)
    T_len = len(s2)
    min_len = min(P_len, T_len)
    # Count the common prefix, capped at 4 characters (standard
    # Jaro-Winkler definition).
    prefix = 0
    max_prefix = min(min_len, 4)

    for _ in range(max_prefix):
        if s1[prefix] != s2[prefix]:
            break
        prefix += 1

    # Translate the Jaro-Winkler cutoff into an equivalent Jaro cutoff so
    # the underlying Jaro computation can bail out early. The Winkler
    # boost is Sim_jw = Sim_jaro + prefix_sim * (1 - Sim_jaro); inverting
    # gives Sim_jaro = (Sim_jw - prefix_sim) / (1 - prefix_sim). The boost
    # only applies above 0.7, hence the clamping to 0.7.
    jaro_score_cutoff = score_cutoff
    if jaro_score_cutoff > 0.7:
        prefix_sim = prefix * prefix_weight

        if prefix_sim >= 1.0:
            jaro_score_cutoff = 0.7
        else:
            jaro_score_cutoff = max(0.7, (prefix_sim - jaro_score_cutoff) / (prefix_sim - 1.0))

    Sim = Jaro.similarity(s1, s2, score_cutoff=jaro_score_cutoff)
    # Apply the Winkler prefix boost only above the 0.7 threshold and
    # clamp the boosted score to 1.0.
    if Sim > 0.7:
        Sim += prefix * prefix_weight * (1.0 - Sim)
        Sim = min(Sim, 1.0)

    return Sim if Sim >= score_cutoff else 0
|
||||
|
||||
|
||||
def normalized_similarity(
    s1,
    s2,
    *,
    prefix_weight=0.1,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the normalized jaro winkler similarity.

    The Jaro-Winkler similarity is already normalized to the range [0, 1],
    so this simply delegates to :func:`similarity`.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    prefix_weight : float, optional
        Weight used for the common prefix of the two strings.
        Has to be between 0 and 0.25. Default is 0.1.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For ratio < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    normalized similarity : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0

    Raises
    ------
    ValueError
        If prefix_weight is invalid
    """
    # Plain pass-through: normalization is a no-op for this metric.
    options = {
        "prefix_weight": prefix_weight,
        "processor": processor,
        "score_cutoff": score_cutoff,
    }
    return similarity(s1, s2, **options)
|
||||
|
||||
|
||||
def distance(
    s1,
    s2,
    *,
    prefix_weight=0.1,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the jaro winkler distance.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    prefix_weight : float, optional
        Weight used for the common prefix of the two strings.
        Has to be between 0 and 0.25. Default is 0.1.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For ratio < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    distance : float
        distance between s1 and s2 as a float between 1.0 and 0.0

    Raises
    ------
    ValueError
        If prefix_weight is invalid
    """
    setupPandas()
    # None-like inputs (None / NaN) are defined as maximally distant.
    if is_none(s1) or is_none(s2):
        return 1.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    # Translate the distance cutoff into a similarity cutoff so the
    # underlying similarity computation can short-circuit early.
    # Cutoffs above 1.0 can never filter anything, so they are dropped.
    if score_cutoff is None or score_cutoff > 1.0:
        sim_cutoff = None
    else:
        sim_cutoff = 1.0 - score_cutoff

    dist = 1.0 - similarity(s1, s2, prefix_weight=prefix_weight, score_cutoff=sim_cutoff)
    if score_cutoff is not None and dist > score_cutoff:
        return 1.0
    return dist
|
||||
|
||||
|
||||
def normalized_distance(
    s1,
    s2,
    *,
    prefix_weight=0.1,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the normalized jaro winkler distance.

    The Jaro-Winkler distance is already normalized to the range [0, 1],
    so this simply delegates to :func:`distance`.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    prefix_weight : float, optional
        Weight used for the common prefix of the two strings.
        Has to be between 0 and 0.25. Default is 0.1.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For ratio < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    normalized distance : float
        normalized distance between s1 and s2 as a float between 1.0 and 0.0

    Raises
    ------
    ValueError
        If prefix_weight is invalid
    """
    # Plain pass-through: normalization is a no-op for this metric.
    options = {
        "prefix_weight": prefix_weight,
        "processor": processor,
        "score_cutoff": score_cutoff,
    }
    return distance(s1, s2, **options)
|
||||
255
.venv/lib/python3.11/site-packages/rapidfuzz/distance/Jaro_py.py
Normal file
255
.venv/lib/python3.11/site-packages/rapidfuzz/distance/Jaro_py.py
Normal file
@@ -0,0 +1,255 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
|
||||
|
||||
def _jaro_calculate_similarity(pattern_len, text_len, common_chars, transpositions):
|
||||
transpositions //= 2
|
||||
sim = 0.0
|
||||
sim += common_chars / pattern_len
|
||||
sim += common_chars / text_len
|
||||
sim += (common_chars - transpositions) / common_chars
|
||||
return sim / 3.0
|
||||
|
||||
|
||||
def _jaro_length_filter(pattern_len, text_len, score_cutoff):
    """Cheap pre-filter: can ``score_cutoff`` be reached at all given only
    the two lengths?

    Assumes the best case — every character of the shorter string matches
    in order — and rejects the pair when even that falls below the cutoff.
    """
    # An empty string can never reach any cutoff (and would divide by zero).
    if pattern_len == 0 or text_len == 0:
        return False

    best = _jaro_calculate_similarity(pattern_len, text_len, min(pattern_len, text_len), 0)
    return best >= score_cutoff
|
||||
|
||||
|
||||
def _jaro_common_char_filter(pattern_len, text_len, common_chars, score_cutoff):
    """Second pre-filter: can ``score_cutoff`` still be reached once the
    number of common characters is known?

    Assumes the best case of zero transpositions among the matches.
    """
    # Without a single matching character the similarity is 0.
    if common_chars == 0:
        return False

    best = _jaro_calculate_similarity(pattern_len, text_len, common_chars, 0)
    return best >= score_cutoff
|
||||
|
||||
|
||||
def _jaro_bounds(s1, s2):
|
||||
"""
|
||||
find bounds and skip out of bound parts of the sequences
|
||||
"""
|
||||
pattern_len = len(s1)
|
||||
text_len = len(s2)
|
||||
|
||||
# since jaro uses a sliding window some parts of T/P might never be in
|
||||
# range an can be removed ahead of time
|
||||
bound = 0
|
||||
if text_len > pattern_len:
|
||||
bound = text_len // 2 - 1
|
||||
if text_len > pattern_len + bound:
|
||||
s2 = s2[: pattern_len + bound]
|
||||
else:
|
||||
bound = pattern_len // 2 - 1
|
||||
if pattern_len > text_len + bound:
|
||||
s1 = s1[: text_len + bound]
|
||||
return s1, s2, bound
|
||||
|
||||
|
||||
def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the jaro similarity

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For ratio < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # None-like inputs (None / NaN) are defined as completely dissimilar.
    if is_none(s1) or is_none(s2):
        return 0.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    # Two empty sequences are identical by convention.
    if not s1 and not s2:
        return 1.0

    if score_cutoff is None:
        score_cutoff = 0

    s1, s2 = conv_sequences(s1, s2)
    pattern_len = len(s1)
    text_len = len(s2)

    # short circuit if score_cutoff can not be reached
    if not _jaro_length_filter(pattern_len, text_len, score_cutoff):
        return 0

    # Single characters either match exactly (1.0) or not at all (0.0).
    if pattern_len == 1 and text_len == 1:
        return float(s1[0] == s2[0])

    # Trim parts of the sequences the sliding window can never reach.
    s1, s2, bound = _jaro_bounds(s1, s2)

    # Flags marking which positions have been matched so far; each position
    # of s2 may be consumed by at most one character of s1.
    s1_flags = [False] * pattern_len
    s2_flags = [False] * text_len

    # todo use bitparallel implementation
    # looking only within search range, count & flag matched pairs
    common_chars = 0
    for i, s1_ch in enumerate(s1):
        low = max(0, i - bound)
        hi = min(i + bound, text_len - 1)
        for j in range(low, hi + 1):
            if not s2_flags[j] and s2[j] == s1_ch:
                s1_flags[i] = s2_flags[j] = True
                common_chars += 1
                break

    # short circuit if score_cutoff can not be reached
    if not _jaro_common_char_filter(pattern_len, text_len, common_chars, score_cutoff):
        return 0

    # todo use bitparallel implementation
    # count transpositions: walk the matched positions of both sequences in
    # order; every pair of matched characters that differ counts once
    # (the final formula halves this raw count).
    k = trans_count = 0
    for i, s1_f in enumerate(s1_flags):
        if s1_f:
            for j in range(k, text_len):
                if s2_flags[j]:
                    k = j + 1
                    break
            if s1[i] != s2[j]:
                trans_count += 1

    return _jaro_calculate_similarity(pattern_len, text_len, common_chars, trans_count)
|
||||
|
||||
|
||||
def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the normalized jaro similarity.

    The Jaro similarity is already normalized to the range [0, 1], so this
    simply delegates to :func:`similarity`.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For ratio < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    normalized similarity : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0
    """
    return similarity(
        s1,
        s2,
        processor=processor,
        score_cutoff=score_cutoff,
    )
|
||||
|
||||
|
||||
def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the jaro distance.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For ratio < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    distance : float
        distance between s1 and s2 as a float between 1.0 and 0.0
    """
    setupPandas()
    # None-like inputs (None / NaN) are defined as maximally distant.
    if is_none(s1) or is_none(s2):
        return 1.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    # Translate the distance cutoff into a similarity cutoff so the
    # underlying similarity computation can short-circuit early.
    # Cutoffs above 1.0 can never filter anything, so they are dropped.
    if score_cutoff is None or score_cutoff > 1.0:
        sim_cutoff = None
    else:
        sim_cutoff = 1.0 - score_cutoff

    dist = 1.0 - similarity(s1, s2, score_cutoff=sim_cutoff)
    if score_cutoff is not None and dist > score_cutoff:
        return 1.0
    return dist
|
||||
|
||||
|
||||
def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the normalized jaro distance.

    The Jaro distance is already normalized to the range [0, 1], so this
    simply delegates to :func:`distance`.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For ratio < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    normalized distance : float
        normalized distance between s1 and s2 as a float between 1.0 and 0.0
    """
    return distance(
        s1,
        s2,
        processor=processor,
        score_cutoff=score_cutoff,
    )
|
||||
116
.venv/lib/python3.11/site-packages/rapidfuzz/distance/LCSseq.py
Normal file
116
.venv/lib/python3.11/site-packages/rapidfuzz/distance/LCSseq.py
Normal file
@@ -0,0 +1,116 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations

import contextlib
import os

from rapidfuzz._feature_detector import AVX2, SSE2, supports

__all__ = [
    "distance",
    "editops",
    "normalized_distance",
    "normalized_similarity",
    "opcodes",
    "similarity",
]

# Select the LCSseq backend at import time.
# RAPIDFUZZ_IMPLEMENTATION forces a backend: "cpp" requires a compiled
# extension (the final non-suppressed import raises ImportError when none is
# available), "python" forces the pure-Python fallback. When unset, the best
# available backend is chosen: AVX2 build -> SSE2 build -> generic C++ ->
# pure Python.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    if supports(AVX2):
        # Prefer the AVX2-specialized build when the CPU supports it.
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                lcs_seq_distance as distance,
                lcs_seq_editops as editops,
                lcs_seq_normalized_distance as normalized_distance,
                lcs_seq_normalized_similarity as normalized_similarity,
                lcs_seq_opcodes as opcodes,
                lcs_seq_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                lcs_seq_distance as distance,
                lcs_seq_editops as editops,
                lcs_seq_normalized_distance as normalized_distance,
                lcs_seq_normalized_similarity as normalized_similarity,
                lcs_seq_opcodes as opcodes,
                lcs_seq_similarity as similarity,
            )

            imported = True

    if not imported:
        # Deliberately NOT suppressed: "cpp" was requested explicitly, so a
        # missing compiled extension must raise.
        from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
            lcs_seq_distance as distance,
            lcs_seq_editops as editops,
            lcs_seq_normalized_distance as normalized_distance,
            lcs_seq_normalized_similarity as normalized_similarity,
            lcs_seq_opcodes as opcodes,
            lcs_seq_similarity as similarity,
        )
elif _impl == "python":
    from rapidfuzz.distance.metrics_py import (
        lcs_seq_distance as distance,
        lcs_seq_editops as editops,
        lcs_seq_normalized_distance as normalized_distance,
        lcs_seq_normalized_similarity as normalized_similarity,
        lcs_seq_opcodes as opcodes,
        lcs_seq_similarity as similarity,
    )
else:
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                lcs_seq_distance as distance,
                lcs_seq_editops as editops,
                lcs_seq_normalized_distance as normalized_distance,
                lcs_seq_normalized_similarity as normalized_similarity,
                lcs_seq_opcodes as opcodes,
                lcs_seq_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                lcs_seq_distance as distance,
                lcs_seq_editops as editops,
                lcs_seq_normalized_distance as normalized_distance,
                lcs_seq_normalized_similarity as normalized_similarity,
                lcs_seq_opcodes as opcodes,
                lcs_seq_similarity as similarity,
            )

            imported = True

    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
                lcs_seq_distance as distance,
                lcs_seq_editops as editops,
                lcs_seq_normalized_distance as normalized_distance,
                lcs_seq_normalized_similarity as normalized_similarity,
                lcs_seq_opcodes as opcodes,
                lcs_seq_similarity as similarity,
            )

            imported = True

    # Last resort: pure-Python implementation, always available.
    if not imported:
        from rapidfuzz.distance.metrics_py import (
            lcs_seq_distance as distance,
            lcs_seq_editops as editops,
            lcs_seq_normalized_distance as normalized_distance,
            lcs_seq_normalized_similarity as normalized_similarity,
            lcs_seq_opcodes as opcodes,
            lcs_seq_similarity as similarity,
        )
|
||||
105
.venv/lib/python3.11/site-packages/rapidfuzz/distance/LCSseq.pyi
Normal file
105
.venv/lib/python3.11/site-packages/rapidfuzz/distance/LCSseq.pyi
Normal file
@@ -0,0 +1,105 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
# Type stubs for rapidfuzz.distance.LCSseq. Each function has two overloads:
# one for sequences that are compared directly (processor=None), and one for
# arbitrary values that a ``processor`` callable first converts into a
# hashable sequence.

from __future__ import annotations

from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload

from rapidfuzz.distance import Editops, Opcodes

# Placeholder types for values that still need preprocessing by ``processor``.
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")

@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def editops(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
) -> Editops: ...
@overload
def editops(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
) -> Editops: ...
@overload
def opcodes(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
) -> Opcodes: ...
@overload
def opcodes(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
) -> Opcodes: ...
|
||||
@@ -0,0 +1,426 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import common_affix, conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
from rapidfuzz.distance._initialize_py import Editop, Editops
|
||||
|
||||
|
||||
def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the length of the longest common subsequence

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Maximum distance between s1 and s2, that is
        considered as a result. If the similarity is smaller than score_cutoff,
        0 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    similarity : int
        similarity between s1 and s2
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    if not s1:
        return 0

    s1, s2 = conv_sequences(s1, s2)
    # Bit-parallel LCS: S keeps one bit per position of s1; a 0 bit marks a
    # position already consumed by the common subsequence.
    S = (1 << len(s1)) - 1
    # block maps each element of s1 to the bitmask of its positions in s1.
    block = {}
    block_get = block.get
    x = 1
    for ch1 in s1:
        block[ch1] = block_get(ch1, 0) | x
        x <<= 1

    # One add-with-carry style update per element of s2.
    for ch2 in s2:
        Matches = block_get(ch2, 0)
        u = S & Matches
        S = (S + u) | (S - u)

    # calculate the equivalent of popcount(~S) in C. This breaks for len(s1) == 0
    res = bin(S)[-len(s1) :].count("0")
    return res if (score_cutoff is None or res >= score_cutoff) else 0
|
||||
|
||||
|
||||
def _block_similarity(
|
||||
block,
|
||||
s1,
|
||||
s2,
|
||||
score_cutoff=None,
|
||||
):
|
||||
if not s1:
|
||||
return 0
|
||||
|
||||
S = (1 << len(s1)) - 1
|
||||
block_get = block.get
|
||||
|
||||
for ch2 in s2:
|
||||
Matches = block_get(ch2, 0)
|
||||
u = S & Matches
|
||||
S = (S + u) | (S - u)
|
||||
|
||||
# calculate the equivalent of popcount(~S) in C. This breaks for len(s1) == 0
|
||||
res = bin(S)[-len(s1) :].count("0")
|
||||
return res if (score_cutoff is None or res >= score_cutoff) else 0
|
||||
|
||||
|
||||
def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the LCS distance in the range [0, max].

    This is calculated as ``max(len1, len2) - similarity``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Maximum distance between s1 and s2, that is
        considered as a result. If the distance is bigger than score_cutoff,
        score_cutoff + 1 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    distance : int
        distance between s1 and s2

    Examples
    --------
    Find the LCS distance between two strings:

    >>> from rapidfuzz.distance import LCSseq
    >>> LCSseq.distance("lewenstein", "levenshtein")
    2

    Setting a maximum distance allows the implementation to select
    a more efficient implementation:

    >>> LCSseq.distance("lewenstein", "levenshtein", score_cutoff=1)
    2

    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    # Every position not covered by the LCS in the longer sequence needs an
    # insertion or deletion.
    longest = max(len(s1), len(s2))
    dist = longest - similarity(s1, s2)
    if score_cutoff is not None and dist > score_cutoff:
        return score_cutoff + 1
    return dist
|
||||
|
||||
|
||||
def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized LCS distance in the range [0, 1].

    This is calculated as ``distance / max(len1, len2)``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_dist > score_cutoff 1.0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 1.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    maximum = max(len(s1), len(s2))
    # BUG FIX: the previous guard (``if not s1 or not s2: return 0``) reported
    # an empty vs. non-empty pair as identical (norm_dist 0). An empty string
    # compared to a non-empty one is maximally distant (distance == maximum,
    # so norm_dist == 1.0). Only the both-empty case is identical, and that is
    # exactly the case that would otherwise divide by zero.
    if maximum == 0:
        return 0.0

    norm_dist = distance(s1, s2) / maximum
    return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1.0
|
||||
|
||||
|
||||
def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized LCS similarity in the range [0, 1].

    This is calculated as ``1 - normalized_distance``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_sim < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0

    Examples
    --------
    Find the normalized LCS similarity between two strings:

    >>> from rapidfuzz.distance import LCSseq
    >>> LCSseq.normalized_similarity("lewenstein", "levenshtein")
    0.8181818181818181

    Setting a score_cutoff allows the implementation to select
    a more efficient implementation:

    >>> LCSseq.normalized_similarity("lewenstein", "levenshtein", score_cutoff=0.9)
    0.0

    When a different processor is used s1 and s2 do not have to be strings

    >>> LCSseq.normalized_similarity(["lewenstein"], ["levenshtein"], processor=lambda s: s[0])
    0.8181818181818181
    """
    setupPandas()
    # None-like inputs (None / NaN) are defined as completely dissimilar.
    if is_none(s1) or is_none(s2):
        return 0.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    norm_sim = 1.0 - normalized_distance(s1, s2)
    if score_cutoff is not None and norm_sim < score_cutoff:
        return 0
    return norm_sim
|
||||
|
||||
|
||||
def _matrix(s1, s2):
|
||||
if not s1:
|
||||
return (0, [])
|
||||
|
||||
S = (1 << len(s1)) - 1
|
||||
block = {}
|
||||
block_get = block.get
|
||||
x = 1
|
||||
for ch1 in s1:
|
||||
block[ch1] = block_get(ch1, 0) | x
|
||||
x <<= 1
|
||||
|
||||
matrix = []
|
||||
for ch2 in s2:
|
||||
Matches = block_get(ch2, 0)
|
||||
u = S & Matches
|
||||
S = (S + u) | (S - u)
|
||||
matrix.append(S)
|
||||
|
||||
# calculate the equivalent of popcount(~S) in C. This breaks for len(s1) == 0
|
||||
sim = bin(S)[-len(s1) :].count("0")
|
||||
return (sim, matrix)
|
||||
|
||||
|
||||
def editops(
    s1,
    s2,
    *,
    processor=None,
):
    """
    Return Editops describing how to turn s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.

    Returns
    -------
    editops : Editops
        edit operations required to turn s1 into s2

    Notes
    -----
    The alignment is calculated using an algorithm of Heikki Hyyrö, which is
    described in [6]_. It has a time complexity and memory usage of ``O([N/64] * M)``.

    References
    ----------
    .. [6] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).

    Examples
    --------
    >>> from rapidfuzz.distance import LCSseq
    >>> for tag, src_pos, dest_pos in LCSseq.editops("qabxcd", "abycdf"):
    ...    print(("%7s s1[%d] s2[%d]" % (tag, src_pos, dest_pos)))
     delete s1[0] s2[0]
     delete s1[3] s2[2]
     insert s1[4] s2[2]
     insert s1[6] s2[5]
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    # A shared prefix/suffix needs no edits; strip it and re-add the prefix
    # offset to every op position afterwards.
    prefix_len, suffix_len = common_affix(s1, s2)
    s1 = s1[prefix_len : len(s1) - suffix_len]
    s2 = s2[prefix_len : len(s2) - suffix_len]
    # matrix holds the bit-parallel state after each row; it is walked
    # backwards below to recover the alignment.
    sim, matrix = _matrix(s1, s2)

    editops = Editops([], 0, 0)
    editops._src_len = len(s1) + prefix_len + suffix_len
    editops._dest_len = len(s2) + prefix_len + suffix_len

    # LCS edit distance: every position not in the common subsequence costs
    # one insertion or deletion (no substitutions).
    dist = len(s1) + len(s2) - 2 * sim
    if dist == 0:
        return editops

    # Backtrace from the bottom-right corner, filling the op list from the
    # end (dist doubles as the write index).
    editop_list = [None] * dist
    col = len(s1)
    row = len(s2)
    while row != 0 and col != 0:
        # deletion
        if matrix[row - 1] & (1 << (col - 1)):
            dist -= 1
            col -= 1
            editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)
        else:
            row -= 1

            # insertion
            if row and not (matrix[row - 1] & (1 << (col - 1))):
                dist -= 1
                editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)
            # match
            else:
                col -= 1

    # Drain whatever remains of either sequence once the other is exhausted.
    while col != 0:
        dist -= 1
        col -= 1
        editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)

    while row != 0:
        dist -= 1
        row -= 1
        editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)

    editops._editops = editop_list
    return editops
|
||||
|
||||
|
||||
def opcodes(
    s1,
    s2,
    *,
    processor=None,
):
    """
    Return Opcodes describing how to turn s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.

    Returns
    -------
    opcodes : Opcodes
        edit operations required to turn s1 into s2

    Notes
    -----
    The alignment is calculated using an algorithm of Heikki Hyyrö, which is
    described in [7]_. It has a time complexity and memory usage of ``O([N/64] * M)``.

    References
    ----------
    .. [7] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).

    Examples
    --------
    >>> from rapidfuzz.distance import LCSseq

    >>> a = "qabxcd"
    >>> b = "abycdf"
    >>> for tag, i1, i2, j1, j2 in LCSseq.opcodes(a, b):
    ...    print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
    ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))
     delete a[0:1] (q) b[0:0] ()
      equal a[1:3] (ab) b[0:2] (ab)
     delete a[3:4] (x) b[2:2] ()
     insert a[4:4] () b[2:3] (y)
      equal a[4:6] (cd) b[3:5] (cd)
     insert a[6:6] () b[5:6] (f)
    """
    # Compute the fine-grained edit operations first, then convert them to
    # the coarser difflib-style opcode representation.
    ops = editops(s1, s2, processor=processor)
    return ops.as_opcodes()
|
||||
@@ -0,0 +1,116 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations

import contextlib
import os

from rapidfuzz._feature_detector import AVX2, SSE2, supports

# Public Levenshtein API re-exported from whichever backend is selected below.
__all__ = [
    "distance",
    "editops",
    "normalized_distance",
    "normalized_similarity",
    "opcodes",
    "similarity",
]

# Backend selection via the RAPIDFUZZ_IMPLEMENTATION environment variable:
#   "cpp"    -> require a compiled backend (best available SIMD variant)
#   "python" -> force the pure Python implementation
#   unset    -> try compiled backends, silently fall back to pure Python
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    # Prefer the AVX2 build when the CPU supports it.
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                levenshtein_distance as distance,
                levenshtein_editops as editops,
                levenshtein_normalized_distance as normalized_distance,
                levenshtein_normalized_similarity as normalized_similarity,
                levenshtein_opcodes as opcodes,
                levenshtein_similarity as similarity,
            )

            imported = True

    # Fall back to the SSE2 build.
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                levenshtein_distance as distance,
                levenshtein_editops as editops,
                levenshtein_normalized_distance as normalized_distance,
                levenshtein_normalized_similarity as normalized_similarity,
                levenshtein_opcodes as opcodes,
                levenshtein_similarity as similarity,
            )

            imported = True

    # Generic compiled build. ImportError is deliberately NOT suppressed here:
    # the user explicitly requested "cpp", so a missing backend should raise.
    if not imported:
        from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
            levenshtein_distance as distance,
            levenshtein_editops as editops,
            levenshtein_normalized_distance as normalized_distance,
            levenshtein_normalized_similarity as normalized_similarity,
            levenshtein_opcodes as opcodes,
            levenshtein_similarity as similarity,
        )
elif _impl == "python":
    from rapidfuzz.distance.metrics_py import (
        levenshtein_distance as distance,
        levenshtein_editops as editops,
        levenshtein_normalized_distance as normalized_distance,
        levenshtein_normalized_similarity as normalized_similarity,
        levenshtein_opcodes as opcodes,
        levenshtein_similarity as similarity,
    )
else:
    # Automatic selection: best compiled variant first, pure Python last.
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                levenshtein_distance as distance,
                levenshtein_editops as editops,
                levenshtein_normalized_distance as normalized_distance,
                levenshtein_normalized_similarity as normalized_similarity,
                levenshtein_opcodes as opcodes,
                levenshtein_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                levenshtein_distance as distance,
                levenshtein_editops as editops,
                levenshtein_normalized_distance as normalized_distance,
                levenshtein_normalized_similarity as normalized_similarity,
                levenshtein_opcodes as opcodes,
                levenshtein_similarity as similarity,
            )

            imported = True

    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
                levenshtein_distance as distance,
                levenshtein_editops as editops,
                levenshtein_normalized_distance as normalized_distance,
                levenshtein_normalized_similarity as normalized_similarity,
                levenshtein_opcodes as opcodes,
                levenshtein_similarity as similarity,
            )

            imported = True

    # Last resort: the pure Python implementation, which is always available.
    if not imported:
        from rapidfuzz.distance.metrics_py import (
            levenshtein_distance as distance,
            levenshtein_editops as editops,
            levenshtein_normalized_distance as normalized_distance,
            levenshtein_normalized_similarity as normalized_similarity,
            levenshtein_opcodes as opcodes,
            levenshtein_similarity as similarity,
        )
|
||||
@@ -0,0 +1,131 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
"""
The Levenshtein (edit) distance is a string metric to measure the
difference between two strings/sequences s1 and s2.
It's defined as the minimum number of insertions, deletions or
substitutions required to transform s1 into s2.
"""

from __future__ import annotations

from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload

from rapidfuzz.distance import Editops, Opcodes

# Arbitrary input types accepted when a `processor` converts them into
# comparable sequences.
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")

# Each public function is declared twice: once for ready-to-compare
# sequences (processor=None) and once for arbitrary inputs paired with a
# mandatory `processor` mapping them to Sequence[Hashable].
@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    weights: tuple[int, int, int] | None = (1, 1, 1),
    processor: None = None,
    score_cutoff: int | None = None,
    score_hint: int | None = None,
) -> int: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    weights: tuple[int, int, int] | None = (1, 1, 1),
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
    score_hint: int | None = None,
) -> int: ...
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    weights: tuple[int, int, int] | None = (1, 1, 1),
    processor: None = None,
    score_cutoff: float | None = 0,
    score_hint: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    weights: tuple[int, int, int] | None = (1, 1, 1),
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
    score_hint: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    weights: tuple[int, int, int] | None = (1, 1, 1),
    processor: None = None,
    score_cutoff: int | None = None,
    score_hint: int | None = None,
) -> int: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    weights: tuple[int, int, int] | None = (1, 1, 1),
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
    score_hint: int | None = None,
) -> int: ...
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    weights: tuple[int, int, int] | None = (1, 1, 1),
    processor: None = None,
    score_cutoff: float | None = 0,
    score_hint: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    weights: tuple[int, int, int] | None = (1, 1, 1),
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
    score_hint: float | None = 0,
) -> float: ...
@overload
def editops(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_hint: int | None = None,
) -> Editops: ...
@overload
def editops(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_hint: int | None = None,
) -> Editops: ...
@overload
def opcodes(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_hint: int | None = None,
) -> Opcodes: ...
@overload
def opcodes(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_hint: int | None = None,
) -> Opcodes: ...
|
||||
@@ -0,0 +1,571 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import common_affix, conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
from rapidfuzz.distance import Indel_py as Indel
|
||||
from rapidfuzz.distance._initialize_py import Editop, Editops
|
||||
|
||||
|
||||
def _levenshtein_maximum(s1, s2, weights):
|
||||
len1 = len(s1)
|
||||
len2 = len(s2)
|
||||
insert, delete, replace = weights
|
||||
|
||||
max_dist = len1 * delete + len2 * insert
|
||||
|
||||
if len1 >= len2:
|
||||
max_dist = min(max_dist, len2 * replace + (len1 - len2) * delete)
|
||||
else:
|
||||
max_dist = min(max_dist, len1 * replace + (len2 - len1) * insert)
|
||||
|
||||
return max_dist
|
||||
|
||||
|
||||
def _uniform_generic(s1, s2, weights):
|
||||
len1 = len(s1)
|
||||
insert, delete, replace = weights
|
||||
cache = list(range(0, (len1 + 1) * delete, delete))
|
||||
|
||||
for ch2 in s2:
|
||||
temp = cache[0]
|
||||
cache[0] += insert
|
||||
for i in range(len1):
|
||||
x = temp
|
||||
if s1[i] != ch2:
|
||||
x = min(cache[i] + delete, cache[i + 1] + insert, temp + replace)
|
||||
temp = cache[i + 1]
|
||||
cache[i + 1] = x
|
||||
|
||||
return cache[-1]
|
||||
|
||||
|
||||
def _uniform_distance(s1, s2):
|
||||
if not s1:
|
||||
return len(s2)
|
||||
|
||||
VP = (1 << len(s1)) - 1
|
||||
VN = 0
|
||||
currDist = len(s1)
|
||||
mask = 1 << (len(s1) - 1)
|
||||
|
||||
block = {}
|
||||
block_get = block.get
|
||||
x = 1
|
||||
for ch1 in s1:
|
||||
block[ch1] = block_get(ch1, 0) | x
|
||||
x <<= 1
|
||||
|
||||
for ch2 in s2:
|
||||
# Step 1: Computing D0
|
||||
PM_j = block_get(ch2, 0)
|
||||
X = PM_j
|
||||
D0 = (((X & VP) + VP) ^ VP) | X | VN
|
||||
# Step 2: Computing HP and HN
|
||||
HP = VN | ~(D0 | VP)
|
||||
HN = D0 & VP
|
||||
# Step 3: Computing the value D[m,j]
|
||||
currDist += (HP & mask) != 0
|
||||
currDist -= (HN & mask) != 0
|
||||
# Step 4: Computing Vp and VN
|
||||
HP = (HP << 1) | 1
|
||||
HN = HN << 1
|
||||
VP = HN | ~(D0 | HP)
|
||||
VN = HP & D0
|
||||
|
||||
return currDist
|
||||
|
||||
|
||||
def distance(
    s1,
    s2,
    *,
    weights=(1, 1, 1),
    processor=None,
    score_cutoff=None,
    score_hint=None,
):
    """
    Compute the weighted Levenshtein distance between two sequences.

    The distance is the minimum total cost of insertions, deletions and
    substitutions needed to transform s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    weights : tuple[int, int, int] or None, optional
        Costs in the form (insertion, deletion, substitution). None is
        treated like the default (1, 1, 1).
    processor : callable, optional
        Optional callable used to preprocess both strings before comparing
        them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Maximum distance that is still reported exactly; for larger
        distances ``score_cutoff + 1`` is returned instead. Default is None,
        which deactivates this behaviour.
    score_hint : int, optional
        Expected distance between s1 and s2. Accepted for API compatibility;
        the pure Python backend ignores it.

    Returns
    -------
    distance : int
        distance between s1 and s2

    Raises
    ------
    ValueError
        If unsupported weights are provided a ValueError is thrown

    Examples
    --------
    >>> from rapidfuzz.distance import Levenshtein
    >>> Levenshtein.distance("lewenstein", "levenshtein")
    2
    >>> Levenshtein.distance("lewenstein", "levenshtein", score_cutoff=1)
    2
    >>> Levenshtein.distance("lewenstein", "levenshtein", weights=(1,1,2))
    3
    """
    _ = score_hint  # unused by the pure Python backend
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    s1, s2 = conv_sequences(s1, s2)

    # Dispatch to the cheapest algorithm for the given weight triple.
    if weights is None or weights == (1, 1, 1):
        result = _uniform_distance(s1, s2)  # bit-parallel unit-cost path
    elif weights == (1, 1, 2):
        # Substitution costing 2 is exactly the Indel (insert/delete) metric.
        result = Indel.distance(s1, s2)
    else:
        result = _uniform_generic(s1, s2, weights)  # generic O(N*M) DP

    if score_cutoff is not None and result > score_cutoff:
        return score_cutoff + 1
    return result
|
||||
|
||||
|
||||
def similarity(
    s1,
    s2,
    *,
    weights=(1, 1, 1),
    processor=None,
    score_cutoff=None,
    score_hint=None,
):
    """
    Compute the weighted Levenshtein similarity in the range [max, 0].

    This equals ``max - distance`` where ``max`` is the largest distance
    possible for sequences of these lengths under the given weights.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    weights : tuple[int, int, int] or None, optional
        Costs in the form (insertion, deletion, substitution). None is
        treated like the default (1, 1, 1).
    processor : callable, optional
        Optional callable used to preprocess both strings before comparing
        them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Minimum similarity that is still reported exactly; smaller results
        are reported as 0. Default is None, which deactivates this behaviour.
    score_hint : int, optional
        Expected similarity. Accepted for API compatibility; the pure
        Python backend ignores it.

    Returns
    -------
    similarity : int
        similarity between s1 and s2

    Raises
    ------
    ValueError
        If unsupported weights are provided a ValueError is thrown
    """
    _ = score_hint  # unused by the pure Python backend
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    weights = weights or (1, 1, 1)
    best = _levenshtein_maximum(s1, s2, weights)
    sim = best - distance(s1, s2, weights=weights)
    if score_cutoff is not None and sim < score_cutoff:
        return 0
    return sim
|
||||
|
||||
|
||||
def normalized_distance(
    s1,
    s2,
    *,
    weights=(1, 1, 1),
    processor=None,
    score_cutoff=None,
    score_hint=None,
):
    """
    Compute a normalized weighted Levenshtein distance in the range [1, 0].

    This equals ``distance / max`` where ``max`` is the largest distance
    possible for sequences of these lengths under the given weights.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    weights : tuple[int, int, int] or None, optional
        Costs in the form (insertion, deletion, substitution). None is
        treated like the default (1, 1, 1).
    processor : callable, optional
        Optional callable used to preprocess both strings before comparing
        them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional score threshold as a float between 0 and 1.0. For
        norm_dist > score_cutoff, 1.0 is returned instead. Default is None,
        which deactivates this behaviour.
    score_hint : float, optional
        Expected normalized distance. Accepted for API compatibility; the
        pure Python backend ignores it.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 1.0 and 0.0

    Raises
    ------
    ValueError
        If unsupported weights are provided a ValueError is thrown
    """
    _ = score_hint  # unused by the pure Python backend
    setupPandas()
    # None/NaN inputs compare as maximally distant.
    if is_none(s1) or is_none(s2):
        return 1.0

    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    weights = weights or (1, 1, 1)
    maximum = _levenshtein_maximum(s1, s2, weights)
    # Two empty sequences give maximum == 0 -> define the distance as 0.
    norm_dist = distance(s1, s2, weights=weights) / maximum if maximum else 0
    if score_cutoff is not None and norm_dist > score_cutoff:
        return 1
    return norm_dist
|
||||
|
||||
|
||||
def normalized_similarity(
    s1,
    s2,
    *,
    weights=(1, 1, 1),
    processor=None,
    score_cutoff=None,
    score_hint=None,
):
    """
    Compute a normalized weighted Levenshtein similarity in the range [0, 1].

    This equals ``1 - normalized_distance``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    weights : tuple[int, int, int] or None, optional
        Costs in the form (insertion, deletion, substitution). None is
        treated like the default (1, 1, 1).
    processor : callable, optional
        Optional callable used to preprocess both strings before comparing
        them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional score threshold as a float between 0 and 1.0. For
        norm_sim < score_cutoff, 0 is returned instead. Default is None,
        which deactivates this behaviour.
    score_hint : int, optional
        Expected normalized similarity. Accepted for API compatibility; the
        pure Python backend ignores it.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0

    Raises
    ------
    ValueError
        If unsupported weights are provided a ValueError is thrown

    Examples
    --------
    >>> from rapidfuzz.distance import Levenshtein
    >>> Levenshtein.normalized_similarity("lewenstein", "levenshtein")
    0.81818181818181
    >>> Levenshtein.normalized_similarity("lewenstein", "levenshtein", score_cutoff=0.85)
    0.0
    >>> Levenshtein.normalized_similarity("lewenstein", "levenshtein", weights=(1,1,2))
    0.85714285714285
    >>> Levenshtein.normalized_similarity(["lewenstein"], ["levenshtein"], processor=lambda s: s[0])
    0.81818181818181
    """
    _ = score_hint  # unused by the pure Python backend
    setupPandas()
    # None/NaN inputs compare as maximally dissimilar.
    if is_none(s1) or is_none(s2):
        return 0.0

    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    weights = weights or (1, 1, 1)
    norm_sim = 1.0 - normalized_distance(s1, s2, weights=weights)
    if score_cutoff is not None and norm_sim < score_cutoff:
        return 0
    return norm_sim
|
||||
|
||||
|
||||
def _matrix(s1, s2):
|
||||
if not s1:
|
||||
return (len(s2), [], [])
|
||||
|
||||
VP = (1 << len(s1)) - 1
|
||||
VN = 0
|
||||
currDist = len(s1)
|
||||
mask = 1 << (len(s1) - 1)
|
||||
|
||||
block = {}
|
||||
block_get = block.get
|
||||
x = 1
|
||||
for ch1 in s1:
|
||||
block[ch1] = block_get(ch1, 0) | x
|
||||
x <<= 1
|
||||
|
||||
matrix_VP = []
|
||||
matrix_VN = []
|
||||
for ch2 in s2:
|
||||
# Step 1: Computing D0
|
||||
PM_j = block_get(ch2, 0)
|
||||
X = PM_j
|
||||
D0 = (((X & VP) + VP) ^ VP) | X | VN
|
||||
# Step 2: Computing HP and HN
|
||||
HP = VN | ~(D0 | VP)
|
||||
HN = D0 & VP
|
||||
# Step 3: Computing the value D[m,j]
|
||||
currDist += (HP & mask) != 0
|
||||
currDist -= (HN & mask) != 0
|
||||
# Step 4: Computing Vp and VN
|
||||
HP = (HP << 1) | 1
|
||||
HN = HN << 1
|
||||
VP = HN | ~(D0 | HP)
|
||||
VN = HP & D0
|
||||
|
||||
matrix_VP.append(VP)
|
||||
matrix_VN.append(VN)
|
||||
|
||||
return (currDist, matrix_VP, matrix_VN)
|
||||
|
||||
|
||||
def editops(
    s1,
    s2,
    *,
    processor=None,
    score_hint=None,
):
    """
    Return Editops describing how to turn s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_hint : int, optional
        Expected distance between s1 and s2. This is used to select a
        faster implementation. Default is None, which deactivates this behaviour.

    Returns
    -------
    editops : Editops
        edit operations required to turn s1 into s2

    Notes
    -----
    The alignment is calculated using an algorithm of Heikki Hyyrö, which is
    described in [8]_. It has a time complexity and memory usage of ``O([N/64] * M)``.

    References
    ----------
    .. [8] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).

    Examples
    --------
    >>> from rapidfuzz.distance import Levenshtein
    >>> for tag, src_pos, dest_pos in Levenshtein.editops("qabxcd", "abycdf"):
    ...    print(("%7s s1[%d] s2[%d]" % (tag, src_pos, dest_pos)))
     delete s1[1] s2[0]
    replace s1[3] s2[2]
     insert s1[6] s2[5]
    """
    _ = score_hint  # unused by the pure Python backend
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    # Strip the common prefix/suffix; only the differing middle is aligned.
    prefix_len, suffix_len = common_affix(s1, s2)
    s1 = s1[prefix_len : len(s1) - suffix_len]
    s2 = s2[prefix_len : len(s2) - suffix_len]
    # Forward pass: distance plus the per-row VP/VN delta vectors needed for
    # the traceback below.
    dist, VP, VN = _matrix(s1, s2)

    editops = Editops([], 0, 0)
    # Lengths refer to the original (unstripped) sequences.
    editops._src_len = len(s1) + prefix_len + suffix_len
    editops._dest_len = len(s2) + prefix_len + suffix_len

    if dist == 0:
        return editops

    # Traceback from the bottom-right corner of the implicit DP matrix,
    # filling the operation list from the back (dist counts down to 0).
    editop_list = [None] * dist
    col = len(s1)
    row = len(s2)
    while row != 0 and col != 0:
        # deletion
        if VP[row - 1] & (1 << (col - 1)):
            dist -= 1
            col -= 1
            editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)
        else:
            row -= 1

            # insertion
            if row and (VN[row - 1] & (1 << (col - 1))):
                dist -= 1
                editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)
            else:
                col -= 1

                # replace (Matches are not recorded)
                if s1[col] != s2[row]:
                    dist -= 1
                    editop_list[dist] = Editop("replace", col + prefix_len, row + prefix_len)

    # Remaining leading characters of s1 must be deleted.
    while col != 0:
        dist -= 1
        col -= 1
        editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)

    # Remaining leading characters of s2 must be inserted.
    while row != 0:
        dist -= 1
        row -= 1
        editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)

    editops._editops = editop_list
    return editops
|
||||
|
||||
|
||||
def opcodes(
    s1,
    s2,
    *,
    processor=None,
    score_hint=None,
):
    """
    Return Opcodes describing how to turn s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_hint : int, optional
        Expected distance between s1 and s2. This is used to select a
        faster implementation. Default is None, which deactivates this behaviour.

    Returns
    -------
    opcodes : Opcodes
        edit operations required to turn s1 into s2

    Notes
    -----
    The alignment is calculated using an algorithm of Heikki Hyyrö, which is
    described in [9]_. It has a time complexity and memory usage of ``O([N/64] * M)``.

    References
    ----------
    .. [9] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).

    Examples
    --------
    >>> from rapidfuzz.distance import Levenshtein

    >>> a = "qabxcd"
    >>> b = "abycdf"
    >>> for tag, i1, i2, j1, j2 in Levenshtein.opcodes(a, b):
    ...    print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
    ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))
      delete a[0:1] (q) b[0:0] ()
       equal a[1:3] (ab) b[0:2] (ab)
    replace a[3:4] (x) b[2:3] (y)
       equal a[4:6] (cd) b[3:5] (cd)
      insert a[6:6] () b[5:6] (f)
    """
    # Opcodes are just a different view of the editops alignment.
    return editops(s1, s2, processor=processor, score_hint=score_hint).as_opcodes()
|
||||
93
.venv/lib/python3.11/site-packages/rapidfuzz/distance/OSA.py
Normal file
93
.venv/lib/python3.11/site-packages/rapidfuzz/distance/OSA.py
Normal file
@@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations

import contextlib
import os

from rapidfuzz._feature_detector import AVX2, SSE2, supports

# Public OSA (Optimal String Alignment) API re-exported from the selected backend.
__all__ = ["distance", "normalized_distance", "normalized_similarity", "similarity"]

# Backend selection via the RAPIDFUZZ_IMPLEMENTATION environment variable:
#   "cpp"    -> require a compiled backend (best available SIMD variant)
#   "python" -> force the pure Python implementation
#   unset    -> try compiled backends, silently fall back to pure Python
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    # Prefer the AVX2 build when the CPU supports it.
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                osa_distance as distance,
                osa_normalized_distance as normalized_distance,
                osa_normalized_similarity as normalized_similarity,
                osa_similarity as similarity,
            )

            imported = True

    # Fall back to the SSE2 build.
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                osa_distance as distance,
                osa_normalized_distance as normalized_distance,
                osa_normalized_similarity as normalized_similarity,
                osa_similarity as similarity,
            )

            imported = True

    # Generic compiled build. ImportError is deliberately NOT suppressed here:
    # the user explicitly requested "cpp", so a missing backend should raise.
    if not imported:
        from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
            osa_distance as distance,
            osa_normalized_distance as normalized_distance,
            osa_normalized_similarity as normalized_similarity,
            osa_similarity as similarity,
        )
elif _impl == "python":
    from rapidfuzz.distance.metrics_py import (
        osa_distance as distance,
        osa_normalized_distance as normalized_distance,
        osa_normalized_similarity as normalized_similarity,
        osa_similarity as similarity,
    )
else:
    # Automatic selection: best compiled variant first, pure Python last.
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                osa_distance as distance,
                osa_normalized_distance as normalized_distance,
                osa_normalized_similarity as normalized_similarity,
                osa_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                osa_distance as distance,
                osa_normalized_distance as normalized_distance,
                osa_normalized_similarity as normalized_similarity,
                osa_similarity as similarity,
            )

            imported = True

    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
                osa_distance as distance,
                osa_normalized_distance as normalized_distance,
                osa_normalized_similarity as normalized_similarity,
                osa_similarity as similarity,
            )

            imported = True

    # Last resort: the pure Python implementation, which is always available.
    if not imported:
        from rapidfuzz.distance.metrics_py import (
            osa_distance as distance,
            osa_normalized_distance as normalized_distance,
            osa_normalized_similarity as normalized_similarity,
            osa_similarity as similarity,
        )
|
||||
@@ -0,0 +1,75 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
# Type stubs for the OSA (Optimal String Alignment) metric.

from __future__ import annotations

from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload

# Arbitrary input types accepted when a `processor` converts them into
# comparable sequences.
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")

# Each public function is declared twice: once for ready-to-compare
# sequences (processor=None) and once for arbitrary inputs paired with a
# mandatory `processor` mapping them to Sequence[Hashable].
@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
|
||||
232
.venv/lib/python3.11/site-packages/rapidfuzz/distance/OSA_py.py
Normal file
232
.venv/lib/python3.11/site-packages/rapidfuzz/distance/OSA_py.py
Normal file
@@ -0,0 +1,232 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
|
||||
|
||||
def _osa_distance_hyrroe2003(s1, s2):
|
||||
if not s1:
|
||||
return len(s2)
|
||||
|
||||
VP = (1 << len(s1)) - 1
|
||||
VN = 0
|
||||
D0 = 0
|
||||
PM_j_old = 0
|
||||
currDist = len(s1)
|
||||
mask = 1 << (len(s1) - 1)
|
||||
|
||||
block = {}
|
||||
block_get = block.get
|
||||
x = 1
|
||||
for ch1 in s1:
|
||||
block[ch1] = block_get(ch1, 0) | x
|
||||
x <<= 1
|
||||
|
||||
for ch2 in s2:
|
||||
# Step 1: Computing D0
|
||||
PM_j = block_get(ch2, 0)
|
||||
TR = (((~D0) & PM_j) << 1) & PM_j_old
|
||||
D0 = (((PM_j & VP) + VP) ^ VP) | PM_j | VN
|
||||
D0 = D0 | TR
|
||||
|
||||
# Step 2: Computing HP and HN
|
||||
HP = VN | ~(D0 | VP)
|
||||
HN = D0 & VP
|
||||
|
||||
# Step 3: Computing the value D[m,j]
|
||||
currDist += (HP & mask) != 0
|
||||
currDist -= (HN & mask) != 0
|
||||
|
||||
# Step 4: Computing Vp and VN
|
||||
HP = (HP << 1) | 1
|
||||
HN = HN << 1
|
||||
VP = HN | ~(D0 | HP)
|
||||
VN = HP & D0
|
||||
PM_j_old = PM_j
|
||||
|
||||
return currDist
|
||||
|
||||
|
||||
def distance(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
):
|
||||
"""
|
||||
Calculates the optimal string alignment (OSA) distance.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : int, optional
|
||||
Maximum distance between s1 and s2, that is
|
||||
considered as a result. If the distance is bigger than score_cutoff,
|
||||
score_cutoff + 1 is returned instead. Default is None, which deactivates
|
||||
this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
distance : int
|
||||
distance between s1 and s2
|
||||
|
||||
Examples
|
||||
--------
|
||||
Find the OSA distance between two strings:
|
||||
|
||||
>>> from rapidfuzz.distance import OSA
|
||||
>>> OSA.distance("CA", "AC")
|
||||
2
|
||||
>>> OSA.distance("CA", "ABC")
|
||||
3
|
||||
"""
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
dist = _osa_distance_hyrroe2003(s1, s2)
|
||||
return dist if (score_cutoff is None or dist <= score_cutoff) else score_cutoff + 1
|
||||
|
||||
|
||||
def similarity(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
):
|
||||
"""
|
||||
Calculates the optimal string alignment (OSA) similarity in the range [max, 0].
|
||||
|
||||
This is calculated as ``max(len1, len2) - distance``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : int, optional
|
||||
Maximum distance between s1 and s2, that is
|
||||
considered as a result. If the similarity is smaller than score_cutoff,
|
||||
0 is returned instead. Default is None, which deactivates
|
||||
this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
similarity : int
|
||||
similarity between s1 and s2
|
||||
"""
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
maximum = max(len(s1), len(s2))
|
||||
dist = distance(s1, s2)
|
||||
sim = maximum - dist
|
||||
return sim if (score_cutoff is None or sim >= score_cutoff) else 0
|
||||
|
||||
|
||||
def normalized_distance(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
):
|
||||
"""
|
||||
Calculates a normalized optimal string alignment (OSA) similarity in the range [1, 0].
|
||||
|
||||
This is calculated as ``distance / max(len1, len2)``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : float, optional
|
||||
Optional argument for a score threshold as a float between 0 and 1.0.
|
||||
For norm_dist > score_cutoff 1.0 is returned instead. Default is 1.0,
|
||||
which deactivates this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
norm_dist : float
|
||||
normalized distance between s1 and s2 as a float between 0 and 1.0
|
||||
"""
|
||||
setupPandas()
|
||||
if is_none(s1) or is_none(s2):
|
||||
return 1.0
|
||||
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
maximum = max(len(s1), len(s2))
|
||||
dist = distance(s1, s2)
|
||||
norm_dist = dist / maximum if maximum else 0
|
||||
return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1
|
||||
|
||||
|
||||
def normalized_similarity(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
):
|
||||
"""
|
||||
Calculates a normalized optimal string alignment (OSA) similarity in the range [0, 1].
|
||||
|
||||
This is calculated as ``1 - normalized_distance``
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : float, optional
|
||||
Optional argument for a score threshold as a float between 0 and 1.0.
|
||||
For norm_sim < score_cutoff 0 is returned instead. Default is 0,
|
||||
which deactivates this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
norm_sim : float
|
||||
normalized similarity between s1 and s2 as a float between 0 and 1.0
|
||||
"""
|
||||
setupPandas()
|
||||
if is_none(s1) or is_none(s2):
|
||||
return 0.0
|
||||
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
norm_dist = normalized_distance(s1, s2)
|
||||
norm_sim = 1.0 - norm_dist
|
||||
return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0
|
||||
@@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2025 Max Bachmann
|
||||
# This file is generated by tools/generate_python.py
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import os
|
||||
|
||||
from rapidfuzz._feature_detector import AVX2, SSE2, supports
|
||||
|
||||
__all__ = ["distance", "normalized_distance", "normalized_similarity", "similarity"]
|
||||
|
||||
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
|
||||
if _impl == "cpp":
|
||||
imported = False
|
||||
if supports(AVX2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
|
||||
postfix_distance as distance,
|
||||
postfix_normalized_distance as normalized_distance,
|
||||
postfix_normalized_similarity as normalized_similarity,
|
||||
postfix_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported and supports(SSE2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
|
||||
postfix_distance as distance,
|
||||
postfix_normalized_distance as normalized_distance,
|
||||
postfix_normalized_similarity as normalized_similarity,
|
||||
postfix_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported:
|
||||
from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
|
||||
postfix_distance as distance,
|
||||
postfix_normalized_distance as normalized_distance,
|
||||
postfix_normalized_similarity as normalized_similarity,
|
||||
postfix_similarity as similarity,
|
||||
)
|
||||
elif _impl == "python":
|
||||
from rapidfuzz.distance.metrics_py import (
|
||||
postfix_distance as distance,
|
||||
postfix_normalized_distance as normalized_distance,
|
||||
postfix_normalized_similarity as normalized_similarity,
|
||||
postfix_similarity as similarity,
|
||||
)
|
||||
else:
|
||||
imported = False
|
||||
if supports(AVX2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
|
||||
postfix_distance as distance,
|
||||
postfix_normalized_distance as normalized_distance,
|
||||
postfix_normalized_similarity as normalized_similarity,
|
||||
postfix_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported and supports(SSE2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
|
||||
postfix_distance as distance,
|
||||
postfix_normalized_distance as normalized_distance,
|
||||
postfix_normalized_similarity as normalized_similarity,
|
||||
postfix_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported:
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
|
||||
postfix_distance as distance,
|
||||
postfix_normalized_distance as normalized_distance,
|
||||
postfix_normalized_similarity as normalized_similarity,
|
||||
postfix_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported:
|
||||
from rapidfuzz.distance.metrics_py import (
|
||||
postfix_distance as distance,
|
||||
postfix_normalized_distance as normalized_distance,
|
||||
postfix_normalized_similarity as normalized_similarity,
|
||||
postfix_similarity as similarity,
|
||||
)
|
||||
@@ -0,0 +1,75 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Hashable, Sequence
|
||||
from typing import Callable, TypeVar, overload
|
||||
|
||||
_UnprocessedType1 = TypeVar("_UnprocessedType1")
|
||||
_UnprocessedType2 = TypeVar("_UnprocessedType2")
|
||||
|
||||
@overload
|
||||
def distance(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
processor: None = None,
|
||||
score_cutoff: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def distance(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def normalized_distance(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
processor: None = None,
|
||||
score_cutoff: float | None = 0,
|
||||
) -> float: ...
|
||||
@overload
|
||||
def normalized_distance(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: float | None = 0,
|
||||
) -> float: ...
|
||||
@overload
|
||||
def similarity(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
processor: None = None,
|
||||
score_cutoff: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def similarity(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def normalized_similarity(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
processor: None = None,
|
||||
score_cutoff: float | None = 0,
|
||||
) -> float: ...
|
||||
@overload
|
||||
def normalized_similarity(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: float | None = 0,
|
||||
) -> float: ...
|
||||
@@ -0,0 +1,182 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
|
||||
|
||||
def distance(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
):
|
||||
"""
|
||||
Calculates the postfix distance between two strings.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : int or None, optional
|
||||
Maximum distance between s1 and s2, that is
|
||||
considered as a result. If the distance is bigger than score_cutoff,
|
||||
score_cutoff + 1 is returned instead. Default is None, which deactivates
|
||||
this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
distance : int
|
||||
distance between s1 and s2
|
||||
"""
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
maximum = max(len(s1), len(s2))
|
||||
sim = similarity(s1, s2)
|
||||
dist = maximum - sim
|
||||
|
||||
return dist if (score_cutoff is None or dist <= score_cutoff) else score_cutoff + 1
|
||||
|
||||
|
||||
def similarity(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
):
|
||||
"""
|
||||
Calculates the postfix similarity between two strings.
|
||||
|
||||
This is calculated as ``len1 - distance``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : int, optional
|
||||
Maximum distance between s1 and s2, that is
|
||||
considered as a result. If the similarity is smaller than score_cutoff,
|
||||
0 is returned instead. Default is None, which deactivates
|
||||
this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
distance : int
|
||||
distance between s1 and s2
|
||||
"""
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
sim = 0
|
||||
for ch1, ch2 in zip(reversed(s1), reversed(s2)):
|
||||
if ch1 != ch2:
|
||||
break
|
||||
sim += 1
|
||||
|
||||
return sim if (score_cutoff is None or sim >= score_cutoff) else 0
|
||||
|
||||
|
||||
def normalized_distance(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
):
|
||||
"""
|
||||
Calculates a normalized postfix similarity in the range [1, 0].
|
||||
|
||||
This is calculated as ``distance / (len1 + len2)``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : float, optional
|
||||
Optional argument for a score threshold as a float between 0 and 1.0.
|
||||
For norm_dist > score_cutoff 1.0 is returned instead. Default is 1.0,
|
||||
which deactivates this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
norm_dist : float
|
||||
normalized distance between s1 and s2 as a float between 0 and 1.0
|
||||
"""
|
||||
setupPandas()
|
||||
if is_none(s1) or is_none(s2):
|
||||
return 1.0
|
||||
|
||||
norm_sim = normalized_similarity(s1, s2, processor=processor)
|
||||
norm_dist = 1.0 - norm_sim
|
||||
|
||||
return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1.0
|
||||
|
||||
|
||||
def normalized_similarity(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
):
|
||||
"""
|
||||
Calculates a normalized postfix similarity in the range [0, 1].
|
||||
|
||||
This is calculated as ``1 - normalized_distance``
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : float, optional
|
||||
Optional argument for a score threshold as a float between 0 and 1.0.
|
||||
For norm_sim < score_cutoff 0 is returned instead. Default is 0,
|
||||
which deactivates this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
norm_sim : float
|
||||
normalized similarity between s1 and s2 as a float between 0 and 1.0
|
||||
"""
|
||||
setupPandas()
|
||||
if is_none(s1) or is_none(s2):
|
||||
return 0.0
|
||||
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
maximum = max(len(s1), len(s2))
|
||||
sim = similarity(s1, s2)
|
||||
norm_sim = sim / maximum if maximum else 1.0
|
||||
|
||||
return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0.0
|
||||
@@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2025 Max Bachmann
|
||||
# This file is generated by tools/generate_python.py
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import os
|
||||
|
||||
from rapidfuzz._feature_detector import AVX2, SSE2, supports
|
||||
|
||||
__all__ = ["distance", "normalized_distance", "normalized_similarity", "similarity"]
|
||||
|
||||
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
|
||||
if _impl == "cpp":
|
||||
imported = False
|
||||
if supports(AVX2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
|
||||
prefix_distance as distance,
|
||||
prefix_normalized_distance as normalized_distance,
|
||||
prefix_normalized_similarity as normalized_similarity,
|
||||
prefix_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported and supports(SSE2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
|
||||
prefix_distance as distance,
|
||||
prefix_normalized_distance as normalized_distance,
|
||||
prefix_normalized_similarity as normalized_similarity,
|
||||
prefix_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported:
|
||||
from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
|
||||
prefix_distance as distance,
|
||||
prefix_normalized_distance as normalized_distance,
|
||||
prefix_normalized_similarity as normalized_similarity,
|
||||
prefix_similarity as similarity,
|
||||
)
|
||||
elif _impl == "python":
|
||||
from rapidfuzz.distance.metrics_py import (
|
||||
prefix_distance as distance,
|
||||
prefix_normalized_distance as normalized_distance,
|
||||
prefix_normalized_similarity as normalized_similarity,
|
||||
prefix_similarity as similarity,
|
||||
)
|
||||
else:
|
||||
imported = False
|
||||
if supports(AVX2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
|
||||
prefix_distance as distance,
|
||||
prefix_normalized_distance as normalized_distance,
|
||||
prefix_normalized_similarity as normalized_similarity,
|
||||
prefix_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported and supports(SSE2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
|
||||
prefix_distance as distance,
|
||||
prefix_normalized_distance as normalized_distance,
|
||||
prefix_normalized_similarity as normalized_similarity,
|
||||
prefix_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported:
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
|
||||
prefix_distance as distance,
|
||||
prefix_normalized_distance as normalized_distance,
|
||||
prefix_normalized_similarity as normalized_similarity,
|
||||
prefix_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported:
|
||||
from rapidfuzz.distance.metrics_py import (
|
||||
prefix_distance as distance,
|
||||
prefix_normalized_distance as normalized_distance,
|
||||
prefix_normalized_similarity as normalized_similarity,
|
||||
prefix_similarity as similarity,
|
||||
)
|
||||
@@ -0,0 +1,75 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Hashable, Sequence
|
||||
from typing import Callable, TypeVar, overload
|
||||
|
||||
_UnprocessedType1 = TypeVar("_UnprocessedType1")
|
||||
_UnprocessedType2 = TypeVar("_UnprocessedType2")
|
||||
|
||||
@overload
|
||||
def distance(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
processor: None = None,
|
||||
score_cutoff: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def distance(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def normalized_distance(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
processor: None = None,
|
||||
score_cutoff: float | None = 0,
|
||||
) -> float: ...
|
||||
@overload
|
||||
def normalized_distance(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: float | None = 0,
|
||||
) -> float: ...
|
||||
@overload
|
||||
def similarity(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
processor: None = None,
|
||||
score_cutoff: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def similarity(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def normalized_similarity(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
processor: None = None,
|
||||
score_cutoff: float | None = 0,
|
||||
) -> float: ...
|
||||
@overload
|
||||
def normalized_similarity(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: float | None = 0,
|
||||
) -> float: ...
|
||||
@@ -0,0 +1,182 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
|
||||
|
||||
def distance(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
):
|
||||
"""
|
||||
Calculates the Prefix distance between two strings.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : int or None, optional
|
||||
Maximum distance between s1 and s2, that is
|
||||
considered as a result. If the distance is bigger than score_cutoff,
|
||||
score_cutoff + 1 is returned instead. Default is None, which deactivates
|
||||
this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
distance : int
|
||||
distance between s1 and s2
|
||||
"""
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
maximum = max(len(s1), len(s2))
|
||||
sim = similarity(s1, s2)
|
||||
dist = maximum - sim
|
||||
|
||||
return dist if (score_cutoff is None or dist <= score_cutoff) else score_cutoff + 1
|
||||
|
||||
|
||||
def similarity(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
):
|
||||
"""
|
||||
Calculates the prefix similarity between two strings.
|
||||
|
||||
This is calculated as ``len1 - distance``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : int, optional
|
||||
Maximum distance between s1 and s2, that is
|
||||
considered as a result. If the similarity is smaller than score_cutoff,
|
||||
0 is returned instead. Default is None, which deactivates
|
||||
this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
distance : int
|
||||
distance between s1 and s2
|
||||
"""
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
sim = 0
|
||||
for ch1, ch2 in zip(s1, s2):
|
||||
if ch1 != ch2:
|
||||
break
|
||||
sim += 1
|
||||
|
||||
return sim if (score_cutoff is None or sim >= score_cutoff) else 0
|
||||
|
||||
|
||||
def normalized_distance(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
):
|
||||
"""
|
||||
Calculates a normalized prefix similarity in the range [1, 0].
|
||||
|
||||
This is calculated as ``distance / (len1 + len2)``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : float, optional
|
||||
Optional argument for a score threshold as a float between 0 and 1.0.
|
||||
For norm_dist > score_cutoff 1.0 is returned instead. Default is 1.0,
|
||||
which deactivates this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
norm_dist : float
|
||||
normalized distance between s1 and s2 as a float between 0 and 1.0
|
||||
"""
|
||||
setupPandas()
|
||||
if is_none(s1) or is_none(s2):
|
||||
return 1.0
|
||||
|
||||
norm_sim = normalized_similarity(s1, s2, processor=processor)
|
||||
norm_dist = 1.0 - norm_sim
|
||||
|
||||
return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1.0
|
||||
|
||||
|
||||
def normalized_similarity(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
):
|
||||
"""
|
||||
Calculates a normalized prefix similarity in the range [0, 1].
|
||||
|
||||
This is calculated as ``1 - normalized_distance``
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : float, optional
|
||||
Optional argument for a score threshold as a float between 0 and 1.0.
|
||||
For norm_sim < score_cutoff 0 is returned instead. Default is 0,
|
||||
which deactivates this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
norm_sim : float
|
||||
normalized similarity between s1 and s2 as a float between 0 and 1.0
|
||||
"""
|
||||
setupPandas()
|
||||
if is_none(s1) or is_none(s2):
|
||||
return 0.0
|
||||
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
maximum = max(len(s1), len(s2))
|
||||
sim = similarity(s1, s2)
|
||||
norm_sim = sim / maximum if maximum else 1.0
|
||||
|
||||
return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0.0
|
||||
@@ -0,0 +1,37 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from . import (
|
||||
OSA,
|
||||
DamerauLevenshtein,
|
||||
Hamming,
|
||||
Indel,
|
||||
Jaro,
|
||||
JaroWinkler,
|
||||
LCSseq,
|
||||
Levenshtein,
|
||||
Postfix,
|
||||
Prefix,
|
||||
)
|
||||
from ._initialize import Editop, Editops, MatchingBlock, Opcode, Opcodes, ScoreAlignment
|
||||
|
||||
__all__ = [
|
||||
"OSA",
|
||||
"DamerauLevenshtein",
|
||||
"Editop",
|
||||
"Editops",
|
||||
"Hamming",
|
||||
"Indel",
|
||||
"Jaro",
|
||||
"JaroWinkler",
|
||||
"LCSseq",
|
||||
"Levenshtein",
|
||||
"MatchingBlock",
|
||||
"Opcode",
|
||||
"Opcodes",
|
||||
"Postfix",
|
||||
"Prefix",
|
||||
"ScoreAlignment",
|
||||
]
|
||||
@@ -0,0 +1,25 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from . import (
|
||||
OSA as OSA,
|
||||
DamerauLevenshtein as DamerauLevenshtein,
|
||||
Hamming as Hamming,
|
||||
Indel as Indel,
|
||||
Jaro as Jaro,
|
||||
JaroWinkler as JaroWinkler,
|
||||
LCSseq as LCSseq,
|
||||
Levenshtein as Levenshtein,
|
||||
Postfix as Postfix,
|
||||
Prefix as Prefix,
|
||||
)
|
||||
from ._initialize import (
|
||||
Editop as Editop,
|
||||
Editops as Editops,
|
||||
MatchingBlock as MatchingBlock,
|
||||
Opcode as Opcode,
|
||||
Opcodes as Opcodes,
|
||||
ScoreAlignment as ScoreAlignment,
|
||||
)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,109 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2025 Max Bachmann
|
||||
# This file is generated by tools/generate_python.py
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import os
|
||||
|
||||
from rapidfuzz._feature_detector import AVX2, SSE2, supports
|
||||
|
||||
__all__ = ["Editop", "Editops", "MatchingBlock", "Opcode", "Opcodes", "ScoreAlignment"]
|
||||
|
||||
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
|
||||
if _impl == "cpp":
|
||||
imported = False
|
||||
if supports(AVX2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance._initialize_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
|
||||
Editop,
|
||||
Editops,
|
||||
MatchingBlock,
|
||||
Opcode,
|
||||
Opcodes,
|
||||
ScoreAlignment,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported and supports(SSE2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance._initialize_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
|
||||
Editop,
|
||||
Editops,
|
||||
MatchingBlock,
|
||||
Opcode,
|
||||
Opcodes,
|
||||
ScoreAlignment,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported:
|
||||
from rapidfuzz.distance._initialize_cpp import ( # pyright: ignore[reportMissingImports]
|
||||
Editop,
|
||||
Editops,
|
||||
MatchingBlock,
|
||||
Opcode,
|
||||
Opcodes,
|
||||
ScoreAlignment,
|
||||
)
|
||||
elif _impl == "python":
|
||||
from rapidfuzz.distance._initialize_py import (
|
||||
Editop,
|
||||
Editops,
|
||||
MatchingBlock,
|
||||
Opcode,
|
||||
Opcodes,
|
||||
ScoreAlignment,
|
||||
)
|
||||
else:
|
||||
imported = False
|
||||
if supports(AVX2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance._initialize_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
|
||||
Editop,
|
||||
Editops,
|
||||
MatchingBlock,
|
||||
Opcode,
|
||||
Opcodes,
|
||||
ScoreAlignment,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported and supports(SSE2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance._initialize_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
|
||||
Editop,
|
||||
Editops,
|
||||
MatchingBlock,
|
||||
Opcode,
|
||||
Opcodes,
|
||||
ScoreAlignment,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported:
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance._initialize_cpp import ( # pyright: ignore[reportMissingImports]
|
||||
Editop,
|
||||
Editops,
|
||||
MatchingBlock,
|
||||
Opcode,
|
||||
Opcodes,
|
||||
ScoreAlignment,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported:
|
||||
from rapidfuzz.distance._initialize_py import (
|
||||
Editop,
|
||||
Editops,
|
||||
MatchingBlock,
|
||||
Opcode,
|
||||
Opcodes,
|
||||
ScoreAlignment,
|
||||
)
|
||||
@@ -0,0 +1,133 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Iterator
|
||||
|
||||
_AnyOpList = list[Editop | tuple[str, int, int]] | list[Opcode | tuple[str, int, int, int, int]]
|
||||
|
||||
class MatchingBlock:
|
||||
a: int
|
||||
b: int
|
||||
size: int
|
||||
|
||||
def __init__(self, a: int, b: int, size: int): ...
|
||||
def __len__(self) -> int: ...
|
||||
def __eq__(self, other: object) -> bool: ...
|
||||
def __getitem__(self, i: int) -> int: ...
|
||||
def __iter__(self) -> Iterator[int]: ...
|
||||
def __repr__(self) -> str: ...
|
||||
|
||||
class Editop:
|
||||
tag: str
|
||||
src_pos: int
|
||||
dest_pos: int
|
||||
|
||||
def __init__(self, tag: str, src_pos: int, dest_pos: int): ...
|
||||
def __len__(self) -> int: ...
|
||||
def __eq__(self, other: object) -> bool: ...
|
||||
def __getitem__(self, i: int) -> int | str: ...
|
||||
def __iter__(self) -> Iterator[int | str]: ...
|
||||
def __repr__(self) -> str: ...
|
||||
|
||||
class Editops:
|
||||
_src_len: int
|
||||
_dest_len: int
|
||||
_editops: list[Editop]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
editops: _AnyOpList | None = None,
|
||||
src_len: int = 0,
|
||||
dest_len: int = 0,
|
||||
): ...
|
||||
@classmethod
|
||||
def from_opcodes(cls, opcodes: Opcodes) -> Editops: ...
|
||||
def as_matching_blocks(self) -> list[MatchingBlock]: ...
|
||||
def as_list(self) -> list[Editop]: ...
|
||||
def copy(self) -> Editops: ...
|
||||
def inverse(self) -> Editops: ...
|
||||
def remove_subsequence(self, subsequence: Editops) -> None: ...
|
||||
def apply(self, source_string: str, destination_string: str) -> str: ...
|
||||
@property
|
||||
def src_len(self) -> int: ...
|
||||
@src_len.setter
|
||||
def src_len(self, value: int) -> None: ...
|
||||
@property
|
||||
def dest_len(self) -> int: ...
|
||||
@dest_len.setter
|
||||
def dest_len(self, value: int) -> None: ...
|
||||
def __eq__(self, other: object) -> bool: ...
|
||||
def __len__(self) -> int: ...
|
||||
def __delitem__(self, key: int | slice) -> None: ...
|
||||
def __getitem__(self, key: int | slice) -> Editops | Editop: ...
|
||||
def __iter__(self) -> Iterator[Editop]: ...
|
||||
def __repr__(self) -> str: ...
|
||||
|
||||
class Opcode:
|
||||
tag: str
|
||||
src_start: int
|
||||
src_end: int
|
||||
dest_start: int
|
||||
dest_end: int
|
||||
|
||||
def __init__(self, tag: str, src_start: int, src_end: int, dest_start: int, dest_end: int): ...
|
||||
def __len__(self) -> int: ...
|
||||
def __eq__(self, other: object) -> bool: ...
|
||||
def __getitem__(self, i: int) -> int | str: ...
|
||||
def __iter__(self) -> Iterator[int | str]: ...
|
||||
|
||||
class Opcodes:
|
||||
_src_len: int
|
||||
_dest_len: int
|
||||
_opcodes: list[Opcode]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
opcodes: _AnyOpList | None = None,
|
||||
src_len: int = 0,
|
||||
dest_len: int = 0,
|
||||
): ...
|
||||
@classmethod
|
||||
def from_editops(cls, editops: Editops) -> Opcodes: ...
|
||||
def as_editops(self) -> Editops: ...
|
||||
def as_matching_blocks(self) -> list[MatchingBlock]: ...
|
||||
def as_list(self) -> list[Opcode]: ...
|
||||
def copy(self) -> Opcodes: ...
|
||||
def inverse(self) -> Opcodes: ...
|
||||
def apply(self, source_string: str, destination_string: str) -> str: ...
|
||||
@property
|
||||
def src_len(self) -> int: ...
|
||||
@src_len.setter
|
||||
def src_len(self, value: int) -> None: ...
|
||||
@property
|
||||
def dest_len(self) -> int: ...
|
||||
@dest_len.setter
|
||||
def dest_len(self, value: int) -> None: ...
|
||||
def __eq__(self, other: object) -> bool: ...
|
||||
def __len__(self) -> int: ...
|
||||
def __getitem__(self, key: int) -> Opcode: ...
|
||||
def __iter__(self) -> Iterator[Opcode]: ...
|
||||
def __repr__(self) -> str: ...
|
||||
|
||||
class ScoreAlignment:
|
||||
score: int | float
|
||||
src_start: int
|
||||
src_end: int
|
||||
dest_start: int
|
||||
dest_end: int
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
score: int | float,
|
||||
src_start: int,
|
||||
src_end: int,
|
||||
dest_start: int,
|
||||
dest_end: int,
|
||||
): ...
|
||||
def __len__(self) -> int: ...
|
||||
def __eq__(self, other: object) -> bool: ...
|
||||
def __getitem__(self, i: int) -> int | float: ...
|
||||
def __iter__(self) -> Iterator[int | float]: ...
|
||||
def __repr__(self) -> str: ...
|
||||
Binary file not shown.
@@ -0,0 +1,884 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann
from __future__ import annotations

import copy
|
||||
|
||||
|
||||
def _list_to_editops(
|
||||
ops,
|
||||
src_len,
|
||||
dest_len,
|
||||
):
|
||||
if not ops:
|
||||
return []
|
||||
|
||||
if len(ops[0]) == 5:
|
||||
return Opcodes(ops, src_len, dest_len).as_editops()._editops
|
||||
|
||||
blocks = []
|
||||
for op in ops:
|
||||
edit_type, src_pos, dest_pos = op
|
||||
|
||||
if src_pos > src_len or dest_pos > dest_len:
|
||||
msg = "List of edit operations invalid"
|
||||
raise ValueError(msg)
|
||||
|
||||
if src_pos == src_len and edit_type != "insert":
|
||||
msg = "List of edit operations invalid"
|
||||
raise ValueError(msg)
|
||||
if dest_pos == dest_len and edit_type != "delete":
|
||||
msg = "List of edit operations invalid"
|
||||
raise ValueError(msg)
|
||||
|
||||
# keep operations are not relevant in editops
|
||||
if edit_type == "equal":
|
||||
continue
|
||||
|
||||
blocks.append(Editop(edit_type, src_pos, dest_pos))
|
||||
|
||||
# validate order of editops
|
||||
for i in range(len(blocks) - 1):
|
||||
if blocks[i + 1].src_pos < blocks[i].src_pos or blocks[i + 1].dest_pos < blocks[i].dest_pos:
|
||||
msg = "List of edit operations out of order"
|
||||
raise ValueError(msg)
|
||||
if blocks[i + 1].src_pos == blocks[i].src_pos and blocks[i + 1].dest_pos == blocks[i].dest_pos:
|
||||
msg = "Duplicated edit operation"
|
||||
raise ValueError(msg)
|
||||
|
||||
return blocks
|
||||
|
||||
|
||||
def _list_to_opcodes(
|
||||
ops,
|
||||
src_len,
|
||||
dest_len,
|
||||
):
|
||||
if not ops or len(ops[0]) == 3:
|
||||
return Editops(ops, src_len, dest_len).as_opcodes()._opcodes
|
||||
|
||||
blocks = []
|
||||
for op in ops:
|
||||
edit_type, src_start, src_end, dest_start, dest_end = op
|
||||
|
||||
if src_end > src_len or dest_end > dest_len:
|
||||
msg = "List of edit operations invalid"
|
||||
raise ValueError(msg)
|
||||
if src_end < src_start or dest_end < dest_start:
|
||||
msg = "List of edit operations invalid"
|
||||
raise ValueError(msg)
|
||||
|
||||
if edit_type in {"equal", "replace"} and (src_end - src_start != dest_end - dest_start or src_start == src_end):
|
||||
msg = "List of edit operations invalid"
|
||||
raise ValueError(msg)
|
||||
if edit_type == "insert" and (src_start != src_end or dest_start == dest_end):
|
||||
msg = "List of edit operations invalid"
|
||||
raise ValueError(msg)
|
||||
if edit_type == "delete" and (src_start == src_end or dest_start != dest_end):
|
||||
msg = "List of edit operations invalid"
|
||||
raise ValueError(msg)
|
||||
|
||||
# merge similar adjacent blocks
|
||||
if blocks and (
|
||||
blocks[-1].tag == edit_type and blocks[-1].src_end == src_start and blocks[-1].dest_end == dest_start
|
||||
):
|
||||
blocks[-1].src_end = src_end
|
||||
blocks[-1].dest_end = dest_end
|
||||
continue
|
||||
|
||||
blocks.append(Opcode(edit_type, src_start, src_end, dest_start, dest_end))
|
||||
|
||||
# check if edit operations span the complete string
|
||||
if blocks[0].src_start != 0 or blocks[0].dest_start != 0:
|
||||
msg = "List of edit operations does not start at position 0"
|
||||
raise ValueError(msg)
|
||||
if blocks[-1].src_end != src_len or blocks[-1].dest_end != dest_len:
|
||||
msg = "List of edit operations does not end at the string ends"
|
||||
raise ValueError(msg)
|
||||
for i in range(len(blocks) - 1):
|
||||
if blocks[i + 1].src_start != blocks[i].src_end or blocks[i + 1].dest_start != blocks[i].dest_end:
|
||||
msg = "List of edit operations is not continuous"
|
||||
raise ValueError(msg)
|
||||
|
||||
return blocks
|
||||
|
||||
|
||||
class MatchingBlock:
|
||||
"""
|
||||
Triple describing matching subsequences
|
||||
"""
|
||||
|
||||
def __init__(self, a, b, size):
|
||||
self.a = a
|
||||
self.b = b
|
||||
self.size = size
|
||||
|
||||
def __len__(self):
|
||||
return 3
|
||||
|
||||
def __eq__(self, other):
|
||||
try:
|
||||
if len(other) != 3:
|
||||
return False
|
||||
|
||||
return bool(other[0] == self.a and other[1] == self.b and other[2] == self.size)
|
||||
except TypeError:
|
||||
return False
|
||||
|
||||
def __getitem__(self, i):
|
||||
if i in {0, -3}:
|
||||
return self.a
|
||||
if i in {1, -2}:
|
||||
return self.b
|
||||
if i in {2, -1}:
|
||||
return self.size
|
||||
|
||||
msg = "MatchingBlock index out of range"
|
||||
raise IndexError(msg)
|
||||
|
||||
def __iter__(self):
|
||||
for i in range(3):
|
||||
yield self[i]
|
||||
|
||||
def __repr__(self):
|
||||
return f"MatchingBlock(a={self.a}, b={self.b}, size={self.size})"
|
||||
|
||||
|
||||
class Editop:
|
||||
"""
|
||||
Tuple like object describing an edit operation.
|
||||
It is in the form (tag, src_pos, dest_pos)
|
||||
|
||||
The tags are strings, with these meanings:
|
||||
|
||||
+-----------+---------------------------------------------------+
|
||||
| tag | explanation |
|
||||
+===========+===================================================+
|
||||
| 'replace' | src[src_pos] should be replaced by dest[dest_pos] |
|
||||
+-----------+---------------------------------------------------+
|
||||
| 'delete' | src[src_pos] should be deleted |
|
||||
+-----------+---------------------------------------------------+
|
||||
| 'insert' | dest[dest_pos] should be inserted at src[src_pos] |
|
||||
+-----------+---------------------------------------------------+
|
||||
"""
|
||||
|
||||
def __init__(self, tag, src_pos, dest_pos):
|
||||
self.tag = tag
|
||||
self.src_pos = src_pos
|
||||
self.dest_pos = dest_pos
|
||||
|
||||
def __len__(self):
|
||||
return 3
|
||||
|
||||
def __eq__(self, other):
|
||||
try:
|
||||
if len(other) != 3:
|
||||
return False
|
||||
|
||||
return bool(other[0] == self.tag and other[1] == self.src_pos and other[2] == self.dest_pos)
|
||||
except TypeError:
|
||||
return False
|
||||
|
||||
def __getitem__(self, i):
|
||||
if i in {0, -3}:
|
||||
return self.tag
|
||||
if i in {1, -2}:
|
||||
return self.src_pos
|
||||
if i in {2, -1}:
|
||||
return self.dest_pos
|
||||
|
||||
msg = "Editop index out of range"
|
||||
raise IndexError(msg)
|
||||
|
||||
def __iter__(self):
|
||||
for i in range(3):
|
||||
yield self[i]
|
||||
|
||||
def __repr__(self):
|
||||
return f"Editop(tag={self.tag!r}, src_pos={self.src_pos}, dest_pos={self.dest_pos})"
|
||||
|
||||
|
||||
class Editops:
|
||||
"""
|
||||
List like object of Editops describing how to turn s1 into s2.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
editops=None,
|
||||
src_len=0,
|
||||
dest_len=0,
|
||||
):
|
||||
self._src_len = src_len
|
||||
self._dest_len = dest_len
|
||||
self._editops = _list_to_editops(editops, src_len, dest_len)
|
||||
|
||||
@classmethod
|
||||
def from_opcodes(cls, opcodes):
|
||||
"""
|
||||
Create Editops from Opcodes
|
||||
|
||||
Parameters
|
||||
----------
|
||||
opcodes : Opcodes
|
||||
opcodes to convert to editops
|
||||
|
||||
Returns
|
||||
-------
|
||||
editops : Editops
|
||||
Opcodes converted to Editops
|
||||
"""
|
||||
return opcodes.as_editops()
|
||||
|
||||
def as_opcodes(self):
|
||||
"""
|
||||
Convert to Opcodes
|
||||
|
||||
Returns
|
||||
-------
|
||||
opcodes : Opcodes
|
||||
Editops converted to Opcodes
|
||||
"""
|
||||
x = Opcodes.__new__(Opcodes)
|
||||
x._src_len = self._src_len
|
||||
x._dest_len = self._dest_len
|
||||
blocks = []
|
||||
src_pos = 0
|
||||
dest_pos = 0
|
||||
i = 0
|
||||
while i < len(self._editops):
|
||||
if src_pos < self._editops[i].src_pos or dest_pos < self._editops[i].dest_pos:
|
||||
blocks.append(
|
||||
Opcode(
|
||||
"equal",
|
||||
src_pos,
|
||||
self._editops[i].src_pos,
|
||||
dest_pos,
|
||||
self._editops[i].dest_pos,
|
||||
)
|
||||
)
|
||||
src_pos = self._editops[i].src_pos
|
||||
dest_pos = self._editops[i].dest_pos
|
||||
|
||||
src_begin = src_pos
|
||||
dest_begin = dest_pos
|
||||
tag = self._editops[i].tag
|
||||
while (
|
||||
i < len(self._editops)
|
||||
and self._editops[i].tag == tag
|
||||
and src_pos == self._editops[i].src_pos
|
||||
and dest_pos == self._editops[i].dest_pos
|
||||
):
|
||||
if tag == "replace":
|
||||
src_pos += 1
|
||||
dest_pos += 1
|
||||
elif tag == "insert":
|
||||
dest_pos += 1
|
||||
elif tag == "delete":
|
||||
src_pos += 1
|
||||
|
||||
i += 1
|
||||
|
||||
blocks.append(Opcode(tag, src_begin, src_pos, dest_begin, dest_pos))
|
||||
|
||||
if src_pos < self.src_len or dest_pos < self.dest_len:
|
||||
blocks.append(Opcode("equal", src_pos, self.src_len, dest_pos, self.dest_len))
|
||||
|
||||
x._opcodes = blocks
|
||||
return x
|
||||
|
||||
def as_matching_blocks(self):
|
||||
"""
|
||||
Convert to matching blocks
|
||||
|
||||
Returns
|
||||
-------
|
||||
matching blocks : list[MatchingBlock]
|
||||
Editops converted to matching blocks
|
||||
"""
|
||||
blocks = []
|
||||
src_pos = 0
|
||||
dest_pos = 0
|
||||
for op in self:
|
||||
if src_pos < op.src_pos or dest_pos < op.dest_pos:
|
||||
length = min(op.src_pos - src_pos, op.dest_pos - dest_pos)
|
||||
if length > 0:
|
||||
blocks.append(MatchingBlock(src_pos, dest_pos, length))
|
||||
src_pos = op.src_pos
|
||||
dest_pos = op.dest_pos
|
||||
|
||||
if op.tag == "replace":
|
||||
src_pos += 1
|
||||
dest_pos += 1
|
||||
elif op.tag == "delete":
|
||||
src_pos += 1
|
||||
elif op.tag == "insert":
|
||||
dest_pos += 1
|
||||
|
||||
if src_pos < self.src_len or dest_pos < self.dest_len:
|
||||
length = min(self.src_len - src_pos, self.dest_len - dest_pos)
|
||||
if length > 0:
|
||||
blocks.append(MatchingBlock(src_pos, dest_pos, length))
|
||||
|
||||
blocks.append(MatchingBlock(self.src_len, self.dest_len, 0))
|
||||
return blocks
|
||||
|
||||
def as_list(self):
|
||||
"""
|
||||
Convert Editops to a list of tuples.
|
||||
|
||||
This is the equivalent of ``[x for x in editops]``
|
||||
"""
|
||||
return [tuple(op) for op in self._editops]
|
||||
|
||||
def copy(self):
|
||||
"""
|
||||
performs copy of Editops
|
||||
"""
|
||||
x = Editops.__new__(Editops)
|
||||
x._src_len = self._src_len
|
||||
x._dest_len = self._dest_len
|
||||
x._editops = self._editops[::]
|
||||
return x
|
||||
|
||||
def inverse(self):
|
||||
"""
|
||||
Invert Editops, so it describes how to transform the destination string to
|
||||
the source string.
|
||||
|
||||
Returns
|
||||
-------
|
||||
editops : Editops
|
||||
inverted Editops
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from rapidfuzz.distance import Levenshtein
|
||||
>>> Levenshtein.editops('spam', 'park')
|
||||
[Editop(tag=delete, src_pos=0, dest_pos=0),
|
||||
Editop(tag=replace, src_pos=3, dest_pos=2),
|
||||
Editop(tag=insert, src_pos=4, dest_pos=3)]
|
||||
|
||||
>>> Levenshtein.editops('spam', 'park').inverse()
|
||||
[Editop(tag=insert, src_pos=0, dest_pos=0),
|
||||
Editop(tag=replace, src_pos=2, dest_pos=3),
|
||||
Editop(tag=delete, src_pos=3, dest_pos=4)]
|
||||
"""
|
||||
blocks = []
|
||||
for op in self:
|
||||
tag = op.tag
|
||||
if tag == "delete":
|
||||
tag = "insert"
|
||||
elif tag == "insert":
|
||||
tag = "delete"
|
||||
|
||||
blocks.append(Editop(tag, op.dest_pos, op.src_pos))
|
||||
|
||||
x = Editops.__new__(Editops)
|
||||
x._src_len = self.dest_len
|
||||
x._dest_len = self.src_len
|
||||
x._editops = blocks
|
||||
return x
|
||||
|
||||
def remove_subsequence(self, subsequence):
|
||||
"""
|
||||
remove a subsequence
|
||||
|
||||
Parameters
|
||||
----------
|
||||
subsequence : Editops
|
||||
subsequence to remove (has to be a subset of editops)
|
||||
|
||||
Returns
|
||||
-------
|
||||
sequence : Editops
|
||||
a copy of the editops without the subsequence
|
||||
"""
|
||||
result = Editops.__new__(Editops)
|
||||
result._src_len = self._src_len
|
||||
result._dest_len = self._dest_len
|
||||
|
||||
if len(subsequence) > len(self):
|
||||
msg = "subsequence is not a subsequence"
|
||||
raise ValueError(msg)
|
||||
|
||||
result._editops = [None] * (len(self) - len(subsequence))
|
||||
|
||||
# offset to correct removed edit operation
|
||||
offset = 0
|
||||
op_pos = 0
|
||||
result_pos = 0
|
||||
|
||||
for sop in subsequence:
|
||||
while op_pos != len(self) and sop != self._editops[op_pos]:
|
||||
result[result_pos] = self._editops[op_pos]
|
||||
result[result_pos].src_pos += offset
|
||||
result_pos += 1
|
||||
op_pos += 1
|
||||
|
||||
# element of subsequence not part of the sequence
|
||||
if op_pos == len(self):
|
||||
msg = "subsequence is not a subsequence"
|
||||
raise ValueError(msg)
|
||||
|
||||
if sop.tag == "insert":
|
||||
offset += 1
|
||||
elif sop.tag == "delete":
|
||||
offset -= 1
|
||||
|
||||
op_pos += 1
|
||||
|
||||
# add remaining elements
|
||||
while op_pos != len(self):
|
||||
result[result_pos] = self._editops[op_pos]
|
||||
result[result_pos].src_pos += offset
|
||||
result_pos += 1
|
||||
op_pos += 1
|
||||
|
||||
return result
|
||||
|
||||
def apply(self, source_string, destination_string):
|
||||
"""
|
||||
apply editops to source_string
|
||||
|
||||
Parameters
|
||||
----------
|
||||
source_string : str | bytes
|
||||
string to apply editops to
|
||||
destination_string : str | bytes
|
||||
string to use for replacements / insertions into source_string
|
||||
|
||||
Returns
|
||||
-------
|
||||
mod_string : str
|
||||
modified source_string
|
||||
|
||||
"""
|
||||
res_str = ""
|
||||
src_pos = 0
|
||||
|
||||
for op in self._editops:
|
||||
# matches between last and current editop
|
||||
while src_pos < op.src_pos:
|
||||
res_str += source_string[src_pos]
|
||||
src_pos += 1
|
||||
|
||||
if op.tag == "replace":
|
||||
res_str += destination_string[op.dest_pos]
|
||||
src_pos += 1
|
||||
elif op.tag == "insert":
|
||||
res_str += destination_string[op.dest_pos]
|
||||
elif op.tag == "delete":
|
||||
src_pos += 1
|
||||
|
||||
# matches after the last editop
|
||||
while src_pos < len(source_string):
|
||||
res_str += source_string[src_pos]
|
||||
src_pos += 1
|
||||
|
||||
return res_str
|
||||
|
||||
@property
|
||||
def src_len(self):
|
||||
return self._src_len
|
||||
|
||||
@src_len.setter
|
||||
def src_len(self, value):
|
||||
self._src_len = value
|
||||
|
||||
@property
|
||||
def dest_len(self):
|
||||
return self._dest_len
|
||||
|
||||
@dest_len.setter
|
||||
def dest_len(self, value):
|
||||
self._dest_len = value
|
||||
|
||||
def __eq__(self, other):
|
||||
if not isinstance(other, Editops):
|
||||
return False
|
||||
|
||||
return self.dest_len == other.dest_len and self.src_len == other.src_len and self._editops == other._editops
|
||||
|
||||
def __len__(self):
|
||||
return len(self._editops)
|
||||
|
||||
def __delitem__(self, key):
|
||||
del self._editops[key]
|
||||
|
||||
def __getitem__(self, key):
|
||||
if isinstance(key, int):
|
||||
return self._editops[key]
|
||||
|
||||
start, stop, step = key.indices(len(self._editops))
|
||||
if step < 0:
|
||||
msg = "step sizes below 0 lead to an invalid order of editops"
|
||||
raise ValueError(msg)
|
||||
|
||||
x = Editops.__new__(Editops)
|
||||
x._src_len = self._src_len
|
||||
x._dest_len = self._dest_len
|
||||
x._editops = self._editops[start:stop:step]
|
||||
return x
|
||||
|
||||
def __iter__(self):
|
||||
yield from self._editops
|
||||
|
||||
def __repr__(self):
|
||||
return (
|
||||
"Editops([" + ", ".join(repr(op) for op in self) + f"], src_len={self.src_len}, dest_len={self.dest_len})"
|
||||
)
|
||||
|
||||
|
||||
class Opcode:
|
||||
"""
|
||||
Tuple like object describing an edit operation.
|
||||
It is in the form (tag, src_start, src_end, dest_start, dest_end)
|
||||
|
||||
The tags are strings, with these meanings:
|
||||
|
||||
+-----------+-----------------------------------------------------+
|
||||
| tag | explanation |
|
||||
+===========+=====================================================+
|
||||
| 'replace' | src[src_start:src_end] should be |
|
||||
| | replaced by dest[dest_start:dest_end] |
|
||||
+-----------+-----------------------------------------------------+
|
||||
| 'delete' | src[src_start:src_end] should be deleted. |
|
||||
| | Note that dest_start==dest_end in this case. |
|
||||
+-----------+-----------------------------------------------------+
|
||||
| 'insert' | dest[dest_start:dest_end] should be inserted |
|
||||
| | at src[src_start:src_start]. |
|
||||
| | Note that src_start==src_end in this case. |
|
||||
+-----------+-----------------------------------------------------+
|
||||
| 'equal' | src[src_start:src_end] == dest[dest_start:dest_end] |
|
||||
+-----------+-----------------------------------------------------+
|
||||
|
||||
Note
|
||||
----
|
||||
Opcode is compatible with the tuples returned by difflib's SequenceMatcher to make them
|
||||
interoperable
|
||||
"""
|
||||
|
||||
def __init__(self, tag, src_start, src_end, dest_start, dest_end):
|
||||
self.tag = tag
|
||||
self.src_start = src_start
|
||||
self.src_end = src_end
|
||||
self.dest_start = dest_start
|
||||
self.dest_end = dest_end
|
||||
|
||||
def __len__(self):
|
||||
return 5
|
||||
|
||||
def __eq__(self, other):
|
||||
try:
|
||||
if len(other) != 5:
|
||||
return False
|
||||
|
||||
return bool(
|
||||
other[0] == self.tag
|
||||
and other[1] == self.src_start
|
||||
and other[2] == self.src_end
|
||||
and other[3] == self.dest_start
|
||||
and other[4] == self.dest_end
|
||||
)
|
||||
except TypeError:
|
||||
return False
|
||||
|
||||
def __getitem__(self, i):
|
||||
if i in {0, -5}:
|
||||
return self.tag
|
||||
if i in {1, -4}:
|
||||
return self.src_start
|
||||
if i in {2, -3}:
|
||||
return self.src_end
|
||||
if i in {3, -2}:
|
||||
return self.dest_start
|
||||
if i in {4, -1}:
|
||||
return self.dest_end
|
||||
|
||||
msg = "Opcode index out of range"
|
||||
raise IndexError(msg)
|
||||
|
||||
def __iter__(self):
|
||||
for i in range(5):
|
||||
yield self[i]
|
||||
|
||||
def __repr__(self):
|
||||
return (
|
||||
f"Opcode(tag={self.tag!r}, src_start={self.src_start}, src_end={self.src_end}, "
|
||||
f"dest_start={self.dest_start}, dest_end={self.dest_end})"
|
||||
)
|
||||
|
||||
|
||||
class Opcodes:
|
||||
"""
|
||||
List like object of Opcodes describing how to turn s1 into s2.
|
||||
The first Opcode has src_start == dest_start == 0, and remaining tuples
|
||||
have src_start == the src_end from the tuple preceding it,
|
||||
and likewise for dest_start == the previous dest_end.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
opcodes=None,
|
||||
src_len=0,
|
||||
dest_len=0,
|
||||
):
|
||||
self._src_len = src_len
|
||||
self._dest_len = dest_len
|
||||
self._opcodes = _list_to_opcodes(opcodes, src_len, dest_len)
|
||||
|
||||
@classmethod
|
||||
def from_editops(cls, editops):
|
||||
"""
|
||||
Create Opcodes from Editops
|
||||
|
||||
Parameters
|
||||
----------
|
||||
editops : Editops
|
||||
editops to convert to opcodes
|
||||
|
||||
Returns
|
||||
-------
|
||||
opcodes : Opcodes
|
||||
Editops converted to Opcodes
|
||||
"""
|
||||
return editops.as_opcodes()
|
||||
|
||||
def as_editops(self):
|
||||
"""
|
||||
Convert Opcodes to Editops
|
||||
|
||||
Returns
|
||||
-------
|
||||
editops : Editops
|
||||
Opcodes converted to Editops
|
||||
"""
|
||||
x = Editops.__new__(Editops)
|
||||
x._src_len = self._src_len
|
||||
x._dest_len = self._dest_len
|
||||
blocks = []
|
||||
for op in self:
|
||||
if op.tag == "replace":
|
||||
for j in range(op.src_end - op.src_start):
|
||||
blocks.append(Editop("replace", op.src_start + j, op.dest_start + j))
|
||||
elif op.tag == "insert":
|
||||
for j in range(op.dest_end - op.dest_start):
|
||||
blocks.append(Editop("insert", op.src_start, op.dest_start + j))
|
||||
elif op.tag == "delete":
|
||||
for j in range(op.src_end - op.src_start):
|
||||
blocks.append(Editop("delete", op.src_start + j, op.dest_start))
|
||||
|
||||
x._editops = blocks
|
||||
return x
|
||||
|
||||
def as_matching_blocks(self):
|
||||
"""
|
||||
Convert to matching blocks
|
||||
|
||||
Returns
|
||||
-------
|
||||
matching blocks : list[MatchingBlock]
|
||||
Opcodes converted to matching blocks
|
||||
"""
|
||||
blocks = []
|
||||
for op in self:
|
||||
if op.tag == "equal":
|
||||
length = min(op.src_end - op.src_start, op.dest_end - op.dest_start)
|
||||
if length > 0:
|
||||
blocks.append(MatchingBlock(op.src_start, op.dest_start, length))
|
||||
|
||||
blocks.append(MatchingBlock(self.src_len, self.dest_len, 0))
|
||||
return blocks
|
||||
|
||||
def as_list(self):
|
||||
"""
|
||||
Convert Opcodes to a list of tuples, which is compatible
|
||||
with the opcodes of difflibs SequenceMatcher.
|
||||
|
||||
This is the equivalent of ``[x for x in opcodes]``
|
||||
"""
|
||||
return [tuple(op) for op in self._opcodes]
|
||||
|
||||
def copy(self):
|
||||
"""
|
||||
performs copy of Opcodes
|
||||
"""
|
||||
x = Opcodes.__new__(Opcodes)
|
||||
x._src_len = self._src_len
|
||||
x._dest_len = self._dest_len
|
||||
x._opcodes = self._opcodes[::]
|
||||
return x
|
||||
|
||||
def inverse(self):
|
||||
"""
|
||||
Invert Opcodes, so it describes how to transform the destination string to
|
||||
the source string.
|
||||
|
||||
Returns
|
||||
-------
|
||||
opcodes : Opcodes
|
||||
inverted Opcodes
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from rapidfuzz.distance import Levenshtein
|
||||
>>> Levenshtein.opcodes('spam', 'park')
|
||||
[Opcode(tag=delete, src_start=0, src_end=1, dest_start=0, dest_end=0),
|
||||
Opcode(tag=equal, src_start=1, src_end=3, dest_start=0, dest_end=2),
|
||||
Opcode(tag=replace, src_start=3, src_end=4, dest_start=2, dest_end=3),
|
||||
Opcode(tag=insert, src_start=4, src_end=4, dest_start=3, dest_end=4)]
|
||||
|
||||
>>> Levenshtein.opcodes('spam', 'park').inverse()
|
||||
[Opcode(tag=insert, src_start=0, src_end=0, dest_start=0, dest_end=1),
|
||||
Opcode(tag=equal, src_start=0, src_end=2, dest_start=1, dest_end=3),
|
||||
Opcode(tag=replace, src_start=2, src_end=3, dest_start=3, dest_end=4),
|
||||
Opcode(tag=delete, src_start=3, src_end=4, dest_start=4, dest_end=4)]
|
||||
"""
|
||||
blocks = []
|
||||
for op in self:
|
||||
tag = op.tag
|
||||
if tag == "delete":
|
||||
tag = "insert"
|
||||
elif tag == "insert":
|
||||
tag = "delete"
|
||||
|
||||
blocks.append(Opcode(tag, op.dest_start, op.dest_end, op.src_start, op.src_end))
|
||||
|
||||
x = Opcodes.__new__(Opcodes)
|
||||
x._src_len = self.dest_len
|
||||
x._dest_len = self.src_len
|
||||
x._opcodes = blocks
|
||||
return x
|
||||
|
||||
def apply(self, source_string, destination_string):
|
||||
"""
|
||||
apply opcodes to source_string
|
||||
|
||||
Parameters
|
||||
----------
|
||||
source_string : str | bytes
|
||||
string to apply opcodes to
|
||||
destination_string : str | bytes
|
||||
string to use for replacements / insertions into source_string
|
||||
|
||||
Returns
|
||||
-------
|
||||
mod_string : str
|
||||
modified source_string
|
||||
|
||||
"""
|
||||
res_str = ""
|
||||
|
||||
for op in self._opcodes:
|
||||
if op.tag == "equal":
|
||||
res_str += source_string[op.src_start : op.src_end]
|
||||
elif op.tag in {"replace", "insert"}:
|
||||
res_str += destination_string[op.dest_start : op.dest_end]
|
||||
|
||||
return res_str
|
||||
|
||||
@property
|
||||
def src_len(self):
|
||||
return self._src_len
|
||||
|
||||
@src_len.setter
|
||||
def src_len(self, value):
|
||||
self._src_len = value
|
||||
|
||||
@property
|
||||
def dest_len(self):
|
||||
return self._dest_len
|
||||
|
||||
@dest_len.setter
|
||||
def dest_len(self, value):
|
||||
self._dest_len = value
|
||||
|
||||
def __eq__(self, other):
|
||||
if not isinstance(other, Opcodes):
|
||||
return False
|
||||
|
||||
return self.dest_len == other.dest_len and self.src_len == other.src_len and self._opcodes == other._opcodes
|
||||
|
||||
def __len__(self):
|
||||
return len(self._opcodes)
|
||||
|
||||
def __getitem__(self, key):
|
||||
if isinstance(key, int):
|
||||
return self._opcodes[key]
|
||||
|
||||
msg = "Expected index"
|
||||
raise TypeError(msg)
|
||||
|
||||
def __iter__(self):
    # Generator over the stored opcode entries, in order.
    for entry in self._opcodes:
        yield entry
|
||||
|
||||
def __repr__(self):
    # Eval-style representation listing every opcode plus both lengths.
    body = ", ".join(repr(op) for op in self)
    return f"Opcodes([{body}], src_len={self.src_len}, dest_len={self.dest_len})"
|
||||
|
||||
|
||||
class ScoreAlignment:
    """
    Tuple like object describing the position of the compared strings in
    src and dest.

    It indicates that the score has been calculated between
    src[src_start:src_end] and dest[dest_start:dest_end]
    """

    def __init__(
        self,
        score,
        src_start,
        src_end,
        dest_start,
        dest_end,
    ):
        self.score = score
        self.src_start = src_start
        self.src_end = src_end
        self.dest_start = dest_start
        self.dest_end = dest_end

    def __len__(self):
        # Fixed arity: behaves like a 5-tuple.
        return 5

    def __eq__(self, other):
        # Compare against any 5-element indexable object; objects that do
        # not support len()/indexing compare unequal via the TypeError path.
        try:
            if len(other) != 5:
                return False

            mine = (
                self.score,
                self.src_start,
                self.src_end,
                self.dest_start,
                self.dest_end,
            )
            # Keep ``other`` on the left of each comparison and short-circuit
            # in index order, exactly like a chained ``and``.
            return bool(all(other[pos] == val for pos, val in enumerate(mine)))
        except TypeError:
            return False

    def __getitem__(self, i):
        # Accept positions 0..4 and their negative counterparts -5..-1.
        lookup = {
            0: "score", -5: "score",
            1: "src_start", -4: "src_start",
            2: "src_end", -3: "src_end",
            3: "dest_start", -2: "dest_start",
            4: "dest_end", -1: "dest_end",
        }
        name = lookup.get(i)
        if name is None:
            msg = "Opcode index out of range"
            raise IndexError(msg)
        return getattr(self, name)

    def __iter__(self):
        # Yield the fields in tuple order.
        for pos in range(5):
            yield self[pos]

    def __repr__(self):
        fields = (
            f"score={self.score}, src_start={self.src_start}, "
            f"src_end={self.src_end}, dest_start={self.dest_start}, dest_end={self.dest_end}"
        )
        return f"ScoreAlignment({fields})"
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,299 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Callable
|
||||
|
||||
from rapidfuzz._utils import (
|
||||
ScorerFlag,
|
||||
add_scorer_attrs,
|
||||
default_distance_attribute as dist_attr,
|
||||
default_normalized_distance_attribute as norm_dist_attr,
|
||||
default_normalized_similarity_attribute as norm_sim_attr,
|
||||
default_similarity_attribute as sim_attr,
|
||||
)
|
||||
|
||||
# DamerauLevenshtein
|
||||
from rapidfuzz.distance.DamerauLevenshtein_py import (
|
||||
distance as damerau_levenshtein_distance,
|
||||
normalized_distance as damerau_levenshtein_normalized_distance,
|
||||
normalized_similarity as damerau_levenshtein_normalized_similarity,
|
||||
similarity as damerau_levenshtein_similarity,
|
||||
)
|
||||
|
||||
# Hamming
|
||||
from rapidfuzz.distance.Hamming_py import (
|
||||
distance as hamming_distance,
|
||||
editops as hamming_editops,
|
||||
normalized_distance as hamming_normalized_distance,
|
||||
normalized_similarity as hamming_normalized_similarity,
|
||||
opcodes as hamming_opcodes,
|
||||
similarity as hamming_similarity,
|
||||
)
|
||||
|
||||
# Indel
|
||||
from rapidfuzz.distance.Indel_py import (
|
||||
distance as indel_distance,
|
||||
editops as indel_editops,
|
||||
normalized_distance as indel_normalized_distance,
|
||||
normalized_similarity as indel_normalized_similarity,
|
||||
opcodes as indel_opcodes,
|
||||
similarity as indel_similarity,
|
||||
)
|
||||
|
||||
# Jaro
|
||||
from rapidfuzz.distance.Jaro_py import (
|
||||
distance as jaro_distance,
|
||||
normalized_distance as jaro_normalized_distance,
|
||||
normalized_similarity as jaro_normalized_similarity,
|
||||
similarity as jaro_similarity,
|
||||
)
|
||||
|
||||
# JaroWinkler
|
||||
from rapidfuzz.distance.JaroWinkler_py import (
|
||||
distance as jaro_winkler_distance,
|
||||
normalized_distance as jaro_winkler_normalized_distance,
|
||||
normalized_similarity as jaro_winkler_normalized_similarity,
|
||||
similarity as jaro_winkler_similarity,
|
||||
)
|
||||
|
||||
# LCSseq
|
||||
from rapidfuzz.distance.LCSseq_py import (
|
||||
distance as lcs_seq_distance,
|
||||
editops as lcs_seq_editops,
|
||||
normalized_distance as lcs_seq_normalized_distance,
|
||||
normalized_similarity as lcs_seq_normalized_similarity,
|
||||
opcodes as lcs_seq_opcodes,
|
||||
similarity as lcs_seq_similarity,
|
||||
)
|
||||
|
||||
# Levenshtein
|
||||
from rapidfuzz.distance.Levenshtein_py import (
|
||||
distance as levenshtein_distance,
|
||||
editops as levenshtein_editops,
|
||||
normalized_distance as levenshtein_normalized_distance,
|
||||
normalized_similarity as levenshtein_normalized_similarity,
|
||||
opcodes as levenshtein_opcodes,
|
||||
similarity as levenshtein_similarity,
|
||||
)
|
||||
|
||||
# OSA
|
||||
from rapidfuzz.distance.OSA_py import (
|
||||
distance as osa_distance,
|
||||
normalized_distance as osa_normalized_distance,
|
||||
normalized_similarity as osa_normalized_similarity,
|
||||
similarity as osa_similarity,
|
||||
)
|
||||
|
||||
# Postfix
|
||||
from rapidfuzz.distance.Postfix_py import (
|
||||
distance as postfix_distance,
|
||||
normalized_distance as postfix_normalized_distance,
|
||||
normalized_similarity as postfix_normalized_similarity,
|
||||
similarity as postfix_similarity,
|
||||
)
|
||||
|
||||
# Prefix
|
||||
from rapidfuzz.distance.Prefix_py import (
|
||||
distance as prefix_distance,
|
||||
normalized_distance as prefix_normalized_distance,
|
||||
normalized_similarity as prefix_normalized_similarity,
|
||||
similarity as prefix_similarity,
|
||||
)
|
||||
|
||||
__all__ = []

# Attach the stock scorer metadata to every pure-Python metric that uses the
# default attribute sets, then export them.  NOTE: the jaro / jaro_winkler
# scorers carry the *normalized* attribute sets even for their plain
# distance/similarity variants (their scores are already in [0, 1]).
for _metric, _attrs in (
    (osa_distance, dist_attr),
    (osa_similarity, sim_attr),
    (osa_normalized_distance, norm_dist_attr),
    (osa_normalized_similarity, norm_sim_attr),
    (prefix_distance, dist_attr),
    (prefix_similarity, sim_attr),
    (prefix_normalized_distance, norm_dist_attr),
    (prefix_normalized_similarity, norm_sim_attr),
    (postfix_distance, dist_attr),
    (postfix_similarity, sim_attr),
    (postfix_normalized_distance, norm_dist_attr),
    (postfix_normalized_similarity, norm_sim_attr),
    (jaro_distance, norm_dist_attr),
    (jaro_similarity, norm_sim_attr),
    (jaro_normalized_distance, norm_dist_attr),
    (jaro_normalized_similarity, norm_sim_attr),
    (jaro_winkler_distance, norm_dist_attr),
    (jaro_winkler_similarity, norm_sim_attr),
    (jaro_winkler_normalized_distance, norm_dist_attr),
    (jaro_winkler_normalized_similarity, norm_sim_attr),
    (damerau_levenshtein_distance, dist_attr),
    (damerau_levenshtein_similarity, sim_attr),
    (damerau_levenshtein_normalized_distance, norm_dist_attr),
    (damerau_levenshtein_normalized_similarity, norm_sim_attr),
):
    add_scorer_attrs(_metric, _attrs)
del _metric, _attrs

# Same names, same order as the per-family incremental additions.
__all__ += [
    "osa_distance",
    "osa_normalized_distance",
    "osa_normalized_similarity",
    "osa_similarity",
    "prefix_distance",
    "prefix_normalized_distance",
    "prefix_normalized_similarity",
    "prefix_similarity",
    "postfix_distance",
    "postfix_normalized_distance",
    "postfix_normalized_similarity",
    "postfix_similarity",
    "jaro_distance",
    "jaro_normalized_distance",
    "jaro_normalized_similarity",
    "jaro_similarity",
    "jaro_winkler_distance",
    "jaro_winkler_normalized_distance",
    "jaro_winkler_normalized_similarity",
    "jaro_winkler_similarity",
    "damerau_levenshtein_distance",
    "damerau_levenshtein_normalized_distance",
    "damerau_levenshtein_normalized_similarity",
    "damerau_levenshtein_similarity",
]
|
||||
|
||||
|
||||
def _get_scorer_flags_levenshtein_distance(weights: tuple[int, int, int] | None = (1, 1, 1)) -> dict[str, Any]:
    """Return scorer flags for levenshtein_distance.

    Best score is 0, worst is 2**63 - 1 (the result is a size_t).  The
    scorer is symmetric only when weights[0] == weights[1] — presumably the
    insertion/deletion pair per rapidfuzz's weight convention.
    """
    is_symmetric = weights is None or weights[0] == weights[1]
    base = ScorerFlag.RESULT_SIZE_T
    return {
        "optimal_score": 0,
        "worst_score": 2**63 - 1,
        "flags": base | ScorerFlag.SYMMETRIC if is_symmetric else base,
    }
|
||||
|
||||
|
||||
def _get_scorer_flags_levenshtein_similarity(weights: tuple[int, int, int] | None = (1, 1, 1)) -> dict[str, Any]:
    """Return scorer flags for levenshtein_similarity.

    Mirror image of the distance flags: best score is 2**63 - 1, worst is 0.
    Symmetric only when weights[0] == weights[1].
    """
    is_symmetric = weights is None or weights[0] == weights[1]
    base = ScorerFlag.RESULT_SIZE_T
    return {
        "optimal_score": 2**63 - 1,
        "worst_score": 0,
        "flags": base | ScorerFlag.SYMMETRIC if is_symmetric else base,
    }
|
||||
|
||||
|
||||
def _get_scorer_flags_levenshtein_normalized_distance(
    weights: tuple[int, int, int] | None = (1, 1, 1)
) -> dict[str, Any]:
    """Return scorer flags for levenshtein_normalized_distance.

    The normalized result is a float in [0, 1]; 0 is optimal.  Symmetric
    only when weights[0] == weights[1].
    """
    is_symmetric = weights is None or weights[0] == weights[1]
    base = ScorerFlag.RESULT_F64
    return {
        "optimal_score": 0,
        "worst_score": 1,
        "flags": base | ScorerFlag.SYMMETRIC if is_symmetric else base,
    }
|
||||
|
||||
|
||||
def _get_scorer_flags_levenshtein_normalized_similarity(
    weights: tuple[int, int, int] | None = (1, 1, 1)
) -> dict[str, Any]:
    """Return scorer flags for levenshtein_normalized_similarity.

    The normalized result is a float in [0, 1]; 1 is optimal.  Symmetric
    only when weights[0] == weights[1].
    """
    is_symmetric = weights is None or weights[0] == weights[1]
    base = ScorerFlag.RESULT_F64
    return {
        "optimal_score": 1,
        "worst_score": 0,
        "flags": base | ScorerFlag.SYMMETRIC if is_symmetric else base,
    }
|
||||
|
||||
|
||||
# Levenshtein takes configurable weights, so its scorer flags are computed
# from the ``weights`` argument instead of using the stock attribute sets.
levenshtein_dist_attr: dict[str, Callable[..., dict[str, Any]]] = {
    "get_scorer_flags": _get_scorer_flags_levenshtein_distance
}
levenshtein_sim_attr: dict[str, Callable[..., dict[str, Any]]] = {
    "get_scorer_flags": _get_scorer_flags_levenshtein_similarity
}
levenshtein_norm_dist_attr: dict[str, Callable[..., dict[str, Any]]] = {
    "get_scorer_flags": _get_scorer_flags_levenshtein_normalized_distance
}
levenshtein_norm_sim_attr: dict[str, Callable[..., dict[str, Any]]] = {
    "get_scorer_flags": _get_scorer_flags_levenshtein_normalized_similarity
}

# Attach metadata for the remaining metric families and export them.
for _metric, _attrs in (
    (levenshtein_distance, levenshtein_dist_attr),
    (levenshtein_similarity, levenshtein_sim_attr),
    (levenshtein_normalized_distance, levenshtein_norm_dist_attr),
    (levenshtein_normalized_similarity, levenshtein_norm_sim_attr),
    (lcs_seq_distance, dist_attr),
    (lcs_seq_similarity, sim_attr),
    (lcs_seq_normalized_distance, norm_dist_attr),
    (lcs_seq_normalized_similarity, norm_sim_attr),
    (indel_distance, dist_attr),
    (indel_similarity, sim_attr),
    (indel_normalized_distance, norm_dist_attr),
    (indel_normalized_similarity, norm_sim_attr),
    (hamming_distance, dist_attr),
    (hamming_similarity, sim_attr),
    (hamming_normalized_distance, norm_dist_attr),
    (hamming_normalized_similarity, norm_sim_attr),
):
    add_scorer_attrs(_metric, _attrs)
del _metric, _attrs

# Same names, same order as the per-family incremental additions.
__all__ += [
    "levenshtein_distance",
    "levenshtein_editops",
    "levenshtein_normalized_distance",
    "levenshtein_normalized_similarity",
    "levenshtein_opcodes",
    "levenshtein_similarity",
    "lcs_seq_distance",
    "lcs_seq_editops",
    "lcs_seq_normalized_distance",
    "lcs_seq_normalized_similarity",
    "lcs_seq_opcodes",
    "lcs_seq_similarity",
    "indel_distance",
    "indel_editops",
    "indel_normalized_distance",
    "indel_normalized_similarity",
    "indel_opcodes",
    "indel_similarity",
    "hamming_distance",
    "hamming_editops",
    "hamming_normalized_distance",
    "hamming_normalized_similarity",
    "hamming_opcodes",
    "hamming_similarity",
]
|
||||
Reference in New Issue
Block a user